19 files changed, 5208 insertions, 0 deletions
diff --git a/lib/librte_acl/Makefile b/lib/librte_acl/Makefile
new file mode 100644
index 00000000..2e394c97
--- /dev/null
+++ b/lib/librte_acl/Makefile
@@ -0,0 +1,96 @@
+#   BSD LICENSE
+#
+#   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+#   All rights reserved.
+#
+#   Redistribution and use in source and binary forms, with or without
+#   modification, are permitted provided that the following conditions
+#   are met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in
+#       the documentation and/or other materials provided with the
+#       distribution.
+#     * Neither the name of Intel Corporation nor the names of its
+#       contributors may be used to endorse or promote products derived
+#       from this software without specific prior written permission.
+#
+#   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_acl.a
+
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
+
+EXPORT_MAP := rte_acl_version.map
+
+LIBABIVER := 2
+
+# all source are stored in SRCS-y
+SRCS-$(CONFIG_RTE_LIBRTE_ACL) += tb_mem.c
+
+SRCS-$(CONFIG_RTE_LIBRTE_ACL) += rte_acl.c
+SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_bld.c
+SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_gen.c
+SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_scalar.c
+
+ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM) $(CONFIG_RTE_ARCH_ARM64)),)
+SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_neon.c
+CFLAGS_acl_run_neon.o += -flax-vector-conversions -Wno-maybe-uninitialized
+else
+SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_sse.c
+#check if flag for SSE4.1 is already on, if not set it up manually
+	ifeq ($(findstring RTE_MACHINE_CPUFLAG_SSE4_1,$(CFLAGS)),)
+		CFLAGS_acl_run_sse.o += -msse4.1
+	endif
+endif
+
+#
+# If the compiler supports AVX2 instructions,
+# then add support for AVX2 classify method.
+#
+
+#check if flag for AVX2 is already on, if not set it up manually
+ifeq ($(findstring RTE_MACHINE_CPUFLAG_AVX2,$(CFLAGS)),RTE_MACHINE_CPUFLAG_AVX2)
+	CC_AVX2_SUPPORT=1
+else
+	CC_AVX2_SUPPORT=\
+	$(shell $(CC) -march=core-avx2 -dM -E - </dev/null 2>&1 | \
+	grep -q AVX2 && echo 1)
+	ifeq ($(CC_AVX2_SUPPORT), 1)
+		ifeq ($(CC), icc)
+		CFLAGS_acl_run_avx2.o += -march=core-avx2
+		else
+		CFLAGS_acl_run_avx2.o += -mavx2
+		endif
+	endif
+endif
+
+ifeq ($(CC_AVX2_SUPPORT), 1)
+	SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_avx2.c
+	CFLAGS_rte_acl.o += -DCC_AVX2_SUPPORT
+endif
+
+# install this header file
+SYMLINK-$(CONFIG_RTE_LIBRTE_ACL)-include := rte_acl_osdep.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_ACL)-include += rte_acl.h
+
+# this lib needs eal
+DEPDIRS-$(CONFIG_RTE_LIBRTE_ACL) += lib/librte_eal
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h
new file mode 100644
index 00000000..09d67841
--- /dev/null
+++ b/lib/librte_acl/acl.h
@@ -0,0 +1,241 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_ACL_H_
+#define	_ACL_H_
+
+#ifdef __cplusplus
+extern"C" {
+#endif /* __cplusplus */
+
+#define RTE_ACL_QUAD_MAX	5
+#define RTE_ACL_QUAD_SIZE	4
+#define RTE_ACL_QUAD_SINGLE	UINT64_C(0x7f7f7f7f00000000)
+
+#define RTE_ACL_SINGLE_TRIE_SIZE	2000
+
+#define RTE_ACL_DFA_MAX		UINT8_MAX
+#define RTE_ACL_DFA_SIZE	(UINT8_MAX + 1)
+
+#define	RTE_ACL_DFA_GR64_SIZE	64
+#define	RTE_ACL_DFA_GR64_NUM	(RTE_ACL_DFA_SIZE / RTE_ACL_DFA_GR64_SIZE)
+#define	RTE_ACL_DFA_GR64_BIT	\
+	(CHAR_BIT * sizeof(uint32_t) / RTE_ACL_DFA_GR64_NUM)
+
+typedef int bits_t;
+
+#define	RTE_ACL_BIT_SET_SIZE	((UINT8_MAX + 1) / (sizeof(bits_t) * CHAR_BIT))
+
+struct rte_acl_bitset {
+	bits_t             bits[RTE_ACL_BIT_SET_SIZE];
+};
+
+#define	RTE_ACL_NODE_DFA	(0 << RTE_ACL_TYPE_SHIFT)
+#define	RTE_ACL_NODE_SINGLE	(1U << RTE_ACL_TYPE_SHIFT)
+#define	RTE_ACL_NODE_QRANGE	(3U << RTE_ACL_TYPE_SHIFT)
+#define	RTE_ACL_NODE_MATCH	(4U << RTE_ACL_TYPE_SHIFT)
+#define	RTE_ACL_NODE_TYPE	(7U << RTE_ACL_TYPE_SHIFT)
+#define	RTE_ACL_NODE_UNDEFINED	UINT32_MAX
+
+/*
+ * ACL RT structure is a set of multibit tries (with stride == 8)
+ * represented by an array of transitions. The next position is calculated
+ * based on the current position and the input byte.
+ * Each transition is 64 bit value with the following format:
+ * | node_type_specific : 32 | node_type : 3 | node_addr : 29 |
+ * For all node types except RTE_ACL_NODE_MATCH, node_addr is an index
+ * to the start of the node in the transtions array.
+ * Few different node types are used:
+ * RTE_ACL_NODE_MATCH:
+ * node_addr value is and index into an array that contains the return value
+ * and its priority for each category.
+ * Upper 32 bits of the transition value are not used for that node type.
+ * RTE_ACL_NODE_QRANGE:
+ * that node consist of up to 5 transitions.
+ * Upper 32 bits are interpreted as 4 signed character values which
+ * are ordered from smallest(INT8_MIN) to largest (INT8_MAX).
+ * These values define 5 ranges:
+ * INT8_MIN <= range[0]  <= ((int8_t *)&transition)[4]
+ * ((int8_t *)&transition)[4] < range[1] <= ((int8_t *)&transition)[5]
+ * ((int8_t *)&transition)[5] < range[2] <= ((int8_t *)&transition)[6]
+ * ((int8_t *)&transition)[6] < range[3] <= ((int8_t *)&transition)[7]
+ * ((int8_t *)&transition)[7] < range[4] <= INT8_MAX
+ * So for input byte value within range[i] i-th transition within that node
+ * will be used.
+ * RTE_ACL_NODE_SINGLE:
+ * always transitions to the same node regardless of the input value.
+ * RTE_ACL_NODE_DFA:
+ * that node consits of up to 256 transitions.
+ * In attempt to conserve space all transitions are divided into 4 consecutive
+ * groups, by 64 transitions per group:
+ * group64[i] contains transitions[i * 64, .. i * 64 + 63].
+ * Upper 32 bits are interpreted as 4 unsigned character values one per group,
+ * which contain index to the start of the given group within the node.
+ * So to calculate transition index within the node for given input byte value:
+ * input_byte - ((uint8_t *)&transition)[4 + input_byte / 64].
+ */
+
+/*
+ * Structure of a node is a set of ptrs and each ptr has a bit map
+ * of values associated with this transition.
+ */
+struct rte_acl_ptr_set {
+	struct rte_acl_bitset values;	/* input values associated with ptr */
+	struct rte_acl_node  *ptr;	/* transition to next node */
+};
+
+struct rte_acl_classifier_results {
+	int results[RTE_ACL_MAX_CATEGORIES];
+};
+
+struct rte_acl_match_results {
+	uint32_t results[RTE_ACL_MAX_CATEGORIES];
+	int32_t priority[RTE_ACL_MAX_CATEGORIES];
+};
+
+struct rte_acl_node {
+	uint64_t node_index;  /* index for this node */
+	uint32_t level;       /* level 0-n in the trie */
+	uint32_t ref_count;   /* ref count for this node */
+	struct rte_acl_bitset  values;
+	/* set of all values that map to another node
+	 * (union of bits in each transition.
+	 */
+	uint32_t                num_ptrs; /* number of ptr_set in use */
+	uint32_t                max_ptrs; /* number of allocated ptr_set */
+	uint32_t                min_add;  /* number of ptr_set per allocation */
+	struct rte_acl_ptr_set *ptrs;     /* transitions array for this node */
+	int32_t                 match_flag;
+	int32_t                 match_index; /* index to match data */
+	uint32_t                node_type;
+	int32_t                 fanout;
+	/* number of ranges (transitions w/ consecutive bits) */
+	int32_t                 id;
+	struct rte_acl_match_results *mrt; /* only valid when match_flag != 0 */
+	union {
+		char            transitions[RTE_ACL_QUAD_SIZE];
+		/* boundaries for ranged node */
+		uint8_t         dfa_gr64[RTE_ACL_DFA_GR64_NUM];
+	};
+	struct rte_acl_node     *next;
+	/* free list link or pointer to duplicate node during merge */
+	struct rte_acl_node     *prev;
+	/* points to node from which this node was duplicated */
+};
+
+/*
+ * Types of tries used to generate runtime structure(s)
+ */
+enum {
+	RTE_ACL_FULL_TRIE = 0,
+	RTE_ACL_NOSRC_TRIE = 1,
+	RTE_ACL_NODST_TRIE = 2,
+	RTE_ACL_NOPORTS_TRIE = 4,
+	RTE_ACL_NOVLAN_TRIE = 8,
+	RTE_ACL_UNUSED_TRIE = 0x80000000
+};
+
+
+/** MAX number of tries per one ACL context.*/
+#define RTE_ACL_MAX_TRIES	8
+
+/** Max number of characters in PM name.*/
+#define RTE_ACL_NAMESIZE	32
+
+
+struct rte_acl_trie {
+	uint32_t        type;
+	uint32_t        count;
+	uint32_t        root_index;
+	const uint32_t *data_index;
+	uint32_t        num_data_indexes;
+};
+
+struct rte_acl_bld_trie {
+	struct rte_acl_node *trie;
+};
+
+struct rte_acl_ctx {
+	char                name[RTE_ACL_NAMESIZE];
+	/** Name of the ACL context. */
+	int32_t             socket_id;
+	/** Socket ID to allocate memory from. */
+	enum rte_acl_classify_alg alg;
+	void               *rules;
+	uint32_t            max_rules;
+	uint32_t            rule_sz;
+	uint32_t            num_rules;
+	uint32_t            num_categories;
+	uint32_t            num_tries;
+	uint32_t            match_index;
+	uint64_t            no_match;
+	uint64_t            idle;
+	uint64_t           *trans_table;
+	uint32_t           *data_indexes;
+	struct rte_acl_trie trie[RTE_ACL_MAX_TRIES];
+	void               *mem;
+	size_t              mem_sz;
+	struct rte_acl_config config; /* copy of build config. */
+};
+
+int rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,
+	struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries,
+	uint32_t num_categories, uint32_t data_index_sz, size_t max_size);
+
+typedef int (*rte_acl_classify_t)
+(const struct rte_acl_ctx *, const uint8_t **, uint32_t *, uint32_t, uint32_t);
+
+/*
+ * Different implementations of ACL classify.
+ */
+int
+rte_acl_classify_scalar(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	uint32_t *results, uint32_t num, uint32_t categories);
+
+int
+rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	uint32_t *results, uint32_t num, uint32_t categories);
+
+int
+rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	uint32_t *results, uint32_t num, uint32_t categories);
+
+int
+rte_acl_classify_neon(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	uint32_t *results, uint32_t num, uint32_t categories);
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _ACL_H_ */
diff --git a/lib/librte_acl/acl_bld.c b/lib/librte_acl/acl_bld.c
new file mode 100644
index 00000000..9e5ad1c6
--- /dev/null
+++ b/lib/librte_acl/acl_bld.c
@@ -0,0 +1,1598 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rte_acl.h>
+#include "tb_mem.h"
+#include "acl.h"
+
+#define	ACL_POOL_ALIGN		8
+#define	ACL_POOL_ALLOC_MIN	0x800000
+
+/* number of pointers per alloc */
+#define ACL_PTR_ALLOC	32
+
+/* macros for dividing rule sets heuristics */
+#define NODE_MAX	0x4000
+#define NODE_MIN	0x800
+
+/* TALLY are statistics per field */
+enum {
+	TALLY_0 = 0,        /* number of rules that are 0% or more wild. */
+	TALLY_25,	    /* number of rules that are 25% or more wild. */
+	TALLY_50,
+	TALLY_75,
+	TALLY_100,
+	TALLY_DEACTIVATED, /* deactivated fields (100% wild in all rules). */
+	TALLY_DEPTH,
+	/* number of rules that are 100% wild for this field and higher. */
+	TALLY_NUM
+};
+
+static const uint32_t wild_limits[TALLY_DEACTIVATED] = {0, 25, 50, 75, 100};
+
+enum {
+	ACL_INTERSECT_NONE = 0,
+	ACL_INTERSECT_A = 1,    /* set A is a superset of A and B intersect */
+	ACL_INTERSECT_B = 2,    /* set B is a superset of A and B intersect */
+	ACL_INTERSECT = 4,	/* sets A and B intersect */
+};
+
+enum {
+	ACL_PRIORITY_EQUAL = 0,
+	ACL_PRIORITY_NODE_A = 1,
+	ACL_PRIORITY_NODE_B = 2,
+	ACL_PRIORITY_MIXED = 3
+};
+
+
+struct acl_mem_block {
+	uint32_t block_size;
+	void     *mem_ptr;
+};
+
+#define	MEM_BLOCK_NUM	16
+
+/* Single ACL rule, build representation.*/
+struct rte_acl_build_rule {
+	struct rte_acl_build_rule   *next;
+	struct rte_acl_config       *config;
+	/**< configuration for each field in the rule. */
+	const struct rte_acl_rule   *f;
+	uint32_t                    *wildness;
+};
+
+/* Context for build phase */
+struct acl_build_context {
+	const struct rte_acl_ctx *acx;
+	struct rte_acl_build_rule *build_rules;
+	struct rte_acl_config     cfg;
+	int32_t                   node_max;
+	int32_t                   cur_node_max;
+	uint32_t                  node;
+	uint32_t                  num_nodes;
+	uint32_t                  category_mask;
+	uint32_t                  num_rules;
+	uint32_t                  node_id;
+	uint32_t                  src_mask;
+	uint32_t                  num_build_rules;
+	uint32_t                  num_tries;
+	struct tb_mem_pool        pool;
+	struct rte_acl_trie       tries[RTE_ACL_MAX_TRIES];
+	struct rte_acl_bld_trie   bld_tries[RTE_ACL_MAX_TRIES];
+	uint32_t            data_indexes[RTE_ACL_MAX_TRIES][RTE_ACL_MAX_FIELDS];
+
+	/* memory free lists for nodes and blocks used for node ptrs */
+	struct acl_mem_block      blocks[MEM_BLOCK_NUM];
+	struct rte_acl_node       *node_free_list;
+};
+
+static int acl_merge_trie(struct acl_build_context *context,
+	struct rte_acl_node *node_a, struct rte_acl_node *node_b,
+	uint32_t level, struct rte_acl_node **node_c);
+
+static void
+acl_deref_ptr(struct acl_build_context *context,
+	struct rte_acl_node *node, int index);
+
+static void *
+acl_build_alloc(struct acl_build_context *context, size_t n, size_t s)
+{
+	uint32_t m;
+	void *p;
+	size_t alloc_size = n * s;
+
+	/*
+	 * look for memory in free lists
+	 */
+	for (m = 0; m < RTE_DIM(context->blocks); m++) {
+		if (context->blocks[m].block_size ==
+		   alloc_size && context->blocks[m].mem_ptr != NULL) {
+			p = context->blocks[m].mem_ptr;
+			context->blocks[m].mem_ptr = *((void **)p);
+			memset(p, 0, alloc_size);
+			return p;
+		}
+	}
+
+	/*
+	 * return allocation from memory pool
+	 */
+	p = tb_alloc(&context->pool, alloc_size);
+	return p;
+}
+
+/*
+ * Free memory blocks (kept in context for reuse).
+ */
+static void
+acl_build_free(struct acl_build_context *context, size_t s, void *p)
+{
+	uint32_t n;
+
+	for (n = 0; n < RTE_DIM(context->blocks); n++) {
+		if (context->blocks[n].block_size == s) {
+			*((void **)p) = context->blocks[n].mem_ptr;
+			context->blocks[n].mem_ptr = p;
+			return;
+		}
+	}
+	for (n = 0; n < RTE_DIM(context->blocks); n++) {
+		if (context->blocks[n].block_size == 0) {
+			context->blocks[n].block_size = s;
+			*((void **)p) = NULL;
+			context->blocks[n].mem_ptr = p;
+			return;
+		}
+	}
+}
+
+/*
+ * Allocate and initialize a new node.
+ */
+static struct rte_acl_node *
+acl_alloc_node(struct acl_build_context *context, int level)
+{
+	struct rte_acl_node *node;
+
+	if (context->node_free_list != NULL) {
+		node = context->node_free_list;
+		context->node_free_list = node->next;
+		memset(node, 0, sizeof(struct rte_acl_node));
+	} else {
+		node = acl_build_alloc(context, sizeof(struct rte_acl_node), 1);
+	}
+
+	if (node != NULL) {
+		node->num_ptrs = 0;
+		node->level = level;
+		node->node_type = RTE_ACL_NODE_UNDEFINED;
+		node->node_index = RTE_ACL_NODE_UNDEFINED;
+		context->num_nodes++;
+		node->id = context->node_id++;
+	}
+	return node;
+}
+
+/*
+ * Dereference all nodes to which this node points
+ */
+static void
+acl_free_node(struct acl_build_context *context,
+	struct rte_acl_node *node)
+{
+	uint32_t n;
+
+	if (node->prev != NULL)
+		node->prev->next = NULL;
+	for (n = 0; n < node->num_ptrs; n++)
+		acl_deref_ptr(context, node, n);
+
+	/* free mrt if this is a match node */
+	if (node->mrt != NULL) {
+		acl_build_free(context, sizeof(struct rte_acl_match_results),
+			node->mrt);
+		node->mrt = NULL;
+	}
+
+	/* free transitions to other nodes */
+	if (node->ptrs != NULL) {
+		acl_build_free(context,
+			node->max_ptrs * sizeof(struct rte_acl_ptr_set),
+			node->ptrs);
+		node->ptrs = NULL;
+	}
+
+	/* put it on the free list */
+	context->num_nodes--;
+	node->next = context->node_free_list;
+	context->node_free_list = node;
+}
+
+
+/*
+ * Include src bitset in dst bitset
+ */
+static void
+acl_include(struct rte_acl_bitset *dst, struct rte_acl_bitset *src, bits_t mask)
+{
+	uint32_t n;
+
+	for (n = 0; n < RTE_ACL_BIT_SET_SIZE; n++)
+		dst->bits[n] = (dst->bits[n] & mask) | src->bits[n];
+}
+
+/*
+ * Set dst to bits of src1 that are not in src2
+ */
+static int
+acl_exclude(struct rte_acl_bitset *dst,
+	struct rte_acl_bitset *src1,
+	struct rte_acl_bitset *src2)
+{
+	uint32_t n;
+	bits_t all_bits = 0;
+
+	for (n = 0; n < RTE_ACL_BIT_SET_SIZE; n++) {
+		dst->bits[n] = src1->bits[n] & ~src2->bits[n];
+		all_bits |= dst->bits[n];
+	}
+	return all_bits != 0;
+}
+
+/*
+ * Add a pointer (ptr) to a node.
+ */
+static int
+acl_add_ptr(struct acl_build_context *context,
+	struct rte_acl_node *node,
+	struct rte_acl_node *ptr,
+	struct rte_acl_bitset *bits)
+{
+	uint32_t n, num_ptrs;
+	struct rte_acl_ptr_set *ptrs = NULL;
+
+	/*
+	 * If there's already a pointer to the same node, just add to the bitset
+	 */
+	for (n = 0; n < node->num_ptrs; n++) {
+		if (node->ptrs[n].ptr != NULL) {
+			if (node->ptrs[n].ptr == ptr) {
+				acl_include(&node->ptrs[n].values, bits, -1);
+				acl_include(&node->values, bits, -1);
+				return 0;
+			}
+		}
+	}
+
+	/* if there's no room for another pointer, make room */
+	if (node->num_ptrs >= node->max_ptrs) {
+		/* add room for more pointers */
+		num_ptrs = node->max_ptrs + ACL_PTR_ALLOC;
+		ptrs = acl_build_alloc(context, num_ptrs, sizeof(*ptrs));
+
+		/* copy current points to new memory allocation */
+		if (node->ptrs != NULL) {
+			memcpy(ptrs, node->ptrs,
+				node->num_ptrs * sizeof(*ptrs));
+			acl_build_free(context, node->max_ptrs * sizeof(*ptrs),
+				node->ptrs);
+		}
+		node->ptrs = ptrs;
+		node->max_ptrs = num_ptrs;
+	}
+
+	/* Find available ptr and add a new pointer to this node */
+	for (n = node->min_add; n < node->max_ptrs; n++) {
+		if (node->ptrs[n].ptr == NULL) {
+			node->ptrs[n].ptr = ptr;
+			acl_include(&node->ptrs[n].values, bits, 0);
+			acl_include(&node->values, bits, -1);
+			if (ptr != NULL)
+				ptr->ref_count++;
+			if (node->num_ptrs <= n)
+				node->num_ptrs = n + 1;
+			return 0;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Add a pointer for a range of values
+ */
+static int
+acl_add_ptr_range(struct acl_build_context *context,
+	struct rte_acl_node *root,
+	struct rte_acl_node *node,
+	uint8_t low,
+	uint8_t high)
+{
+	uint32_t n;
+	struct rte_acl_bitset bitset;
+
+	/* clear the bitset values */
+	for (n = 0; n < RTE_ACL_BIT_SET_SIZE; n++)
+		bitset.bits[n] = 0;
+
+	/* for each bit in range, add bit to set */
+	for (n = 0; n < UINT8_MAX + 1; n++)
+		if (n >= low && n <= high)
+			bitset.bits[n / (sizeof(bits_t) * 8)] |=
+				1 << (n % (sizeof(bits_t) * 8));
+
+	return acl_add_ptr(context, root, node, &bitset);
+}
+
+/*
+ * Generate a bitset from a byte value and mask.
+ */
+static int
+acl_gen_mask(struct rte_acl_bitset *bitset, uint32_t value, uint32_t mask)
+{
+	int range = 0;
+	uint32_t n;
+
+	/* clear the bitset values */
+	for (n = 0; n < RTE_ACL_BIT_SET_SIZE; n++)
+		bitset->bits[n] = 0;
+
+	/* for each bit in value/mask, add bit to set */
+	for (n = 0; n < UINT8_MAX + 1; n++) {
+		if ((n & mask) == value) {
+			range++;
+			bitset->bits[n / (sizeof(bits_t) * 8)] |=
+				1 << (n % (sizeof(bits_t) * 8));
+		}
+	}
+	return range;
+}
+
+/*
+ * Determine how A and B intersect.
+ * Determine if A and/or B are supersets of the intersection.
+ */
+static int
+acl_intersect_type(const struct rte_acl_bitset *a_bits,
+	const struct rte_acl_bitset *b_bits,
+	struct rte_acl_bitset *intersect)
+{
+	uint32_t n;
+	bits_t intersect_bits = 0;
+	bits_t a_superset = 0;
+	bits_t b_superset = 0;
+
+	/*
+	 * calculate and store intersection and check if A and/or B have
+	 * bits outside the intersection (superset)
+	 */
+	for (n = 0; n < RTE_ACL_BIT_SET_SIZE; n++) {
+		intersect->bits[n] = a_bits->bits[n] & b_bits->bits[n];
+		a_superset |= a_bits->bits[n] ^ intersect->bits[n];
+		b_superset |= b_bits->bits[n] ^ intersect->bits[n];
+		intersect_bits |= intersect->bits[n];
+	}
+
+	n = (intersect_bits == 0 ? ACL_INTERSECT_NONE : ACL_INTERSECT) |
+		(b_superset == 0 ? 0 : ACL_INTERSECT_B) |
+		(a_superset == 0 ? 0 : ACL_INTERSECT_A);
+
+	return n;
+}
+
+/*
+ * Duplicate a node
+ */
+static struct rte_acl_node *
+acl_dup_node(struct acl_build_context *context, struct rte_acl_node *node)
+{
+	uint32_t n;
+	struct rte_acl_node *next;
+
+	next = acl_alloc_node(context, node->level);
+
+	/* allocate the pointers */
+	if (node->num_ptrs > 0) {
+		next->ptrs = acl_build_alloc(context,
+			node->max_ptrs,
+			sizeof(struct rte_acl_ptr_set));
+		next->max_ptrs = node->max_ptrs;
+	}
+
+	/* copy over the pointers */
+	for (n = 0; n < node->num_ptrs; n++) {
+		if (node->ptrs[n].ptr != NULL) {
+			next->ptrs[n].ptr = node->ptrs[n].ptr;
+			next->ptrs[n].ptr->ref_count++;
+			acl_include(&next->ptrs[n].values,
+				&node->ptrs[n].values, -1);
+		}
+	}
+
+	next->num_ptrs = node->num_ptrs;
+
+	/* copy over node's match results */
+	if (node->match_flag == 0)
+		next->match_flag = 0;
+	else {
+		next->match_flag = -1;
+		next->mrt = acl_build_alloc(context, 1, sizeof(*next->mrt));
+		memcpy(next->mrt, node->mrt, sizeof(*next->mrt));
+	}
+
+	/* copy over node's bitset */
+	acl_include(&next->values, &node->values, -1);
+
+	node->next = next;
+	next->prev = node;
+
+	return next;
+}
+
+/*
+ * Dereference a pointer from a node
+ */
+static void
+acl_deref_ptr(struct acl_build_context *context,
+	struct rte_acl_node *node, int index)
+{
+	struct rte_acl_node *ref_node;
+
+	/* De-reference the node at the specified pointer */
+	if (node != NULL && node->ptrs[index].ptr != NULL) {
+		ref_node = node->ptrs[index].ptr;
+		ref_node->ref_count--;
+		if (ref_node->ref_count == 0)
+			acl_free_node(context, ref_node);
+	}
+}
+
+/*
+ * acl_exclude rte_acl_bitset from src and copy remaining pointer to dst
+ */
+static int
+acl_copy_ptr(struct acl_build_context *context,
+	struct rte_acl_node *dst,
+	struct rte_acl_node *src,
+	int index,
+	struct rte_acl_bitset *b_bits)
+{
+	int rc;
+	struct rte_acl_bitset bits;
+
+	if (b_bits != NULL)
+		if (!acl_exclude(&bits, &src->ptrs[index].values, b_bits))
+			return 0;
+
+	rc = acl_add_ptr(context, dst, src->ptrs[index].ptr, &bits);
+	if (rc < 0)
+		return rc;
+	return 1;
+}
+
+/*
+ * Fill in gaps in ptrs list with the ptr at the end of the list
+ */
+static void
+acl_compact_node_ptrs(struct rte_acl_node *node_a)
+{
+	uint32_t n;
+	int min_add = node_a->min_add;
+
+	while (node_a->num_ptrs > 0  &&
+			node_a->ptrs[node_a->num_ptrs - 1].ptr == NULL)
+		node_a->num_ptrs--;
+
+	for (n = min_add; n + 1 < node_a->num_ptrs; n++) {
+
+		/* if this entry is empty */
+		if (node_a->ptrs[n].ptr == NULL) {
+
+			/* move the last pointer to this entry */
+			acl_include(&node_a->ptrs[n].values,
+				&node_a->ptrs[node_a->num_ptrs - 1].values,
+				0);
+			node_a->ptrs[n].ptr =
+				node_a->ptrs[node_a->num_ptrs - 1].ptr;
+
+			/*
+			 * mark the end as empty and adjust the number
+			 * of used pointer enum_tries
+			 */
+			node_a->ptrs[node_a->num_ptrs - 1].ptr = NULL;
+			while (node_a->num_ptrs > 0  &&
+				node_a->ptrs[node_a->num_ptrs - 1].ptr == NULL)
+				node_a->num_ptrs--;
+		}
+	}
+}
+
+static int
+acl_resolve_leaf(struct acl_build_context *context,
+	struct rte_acl_node *node_a,
+	struct rte_acl_node *node_b,
+	struct rte_acl_node **node_c)
+{
+	uint32_t n;
+	int combined_priority = ACL_PRIORITY_EQUAL;
+
+	for (n = 0; n < context->cfg.num_categories; n++) {
+		if (node_a->mrt->priority[n] != node_b->mrt->priority[n]) {
+			combined_priority |= (node_a->mrt->priority[n] >
+				node_b->mrt->priority[n]) ?
+				ACL_PRIORITY_NODE_A : ACL_PRIORITY_NODE_B;
+		}
+	}
+
+	/*
+	 * if node a is higher or equal priority for all categories,
+	 * then return node_a.
+	 */
+	if (combined_priority == ACL_PRIORITY_NODE_A ||
+			combined_priority == ACL_PRIORITY_EQUAL) {
+		*node_c = node_a;
+		return 0;
+	}
+
+	/*
+	 * if node b is higher or equal priority for all categories,
+	 * then return node_b.
+	 */
+	if (combined_priority == ACL_PRIORITY_NODE_B) {
+		*node_c = node_b;
+		return 0;
+	}
+
+	/*
+	 * mixed priorities - create a new node with the highest priority
+	 * for each category.
+	 */
+
+	/* force new duplication. */
+	node_a->next = NULL;
+
+	*node_c = acl_dup_node(context, node_a);
+	for (n = 0; n < context->cfg.num_categories; n++) {
+		if ((*node_c)->mrt->priority[n] < node_b->mrt->priority[n]) {
+			(*node_c)->mrt->priority[n] = node_b->mrt->priority[n];
+			(*node_c)->mrt->results[n] = node_b->mrt->results[n];
+		}
+	}
+	return 0;
+}
+
+/*
+ * Merge nodes A and B together,
+ *   returns a node that is the path for the intersection
+ *
+ * If match node (leaf on trie)
+ *	For each category
+ *		return node = highest priority result
+ *
+ * Create C as a duplicate of A to point to child intersections
+ * If any pointers in C intersect with any in B
+ *	For each intersection
+ *		merge children
+ *		remove intersection from C pointer
+ *		add a pointer from C to child intersection node
+ * Compact the pointers in A and B
+ * Copy any B pointers that are outside of the intersection to C
+ * If C has no references to the B trie
+ *   free C and return A
+ * Else If C has no references to the A trie
+ *   free C and return B
+ * Else
+ *   return C
+ */
+static int
+acl_merge_trie(struct acl_build_context *context,
+	struct rte_acl_node *node_a, struct rte_acl_node *node_b,
+	uint32_t level, struct rte_acl_node **return_c)
+{
+	uint32_t n, m, ptrs_c, ptrs_b;
+	uint32_t min_add_c, min_add_b;
+	int node_intersect_type;
+	struct rte_acl_bitset node_intersect;
+	struct rte_acl_node *node_c;
+	struct rte_acl_node *node_a_next;
+	int node_b_refs;
+	int node_a_refs;
+
+	node_c = node_a;
+	node_a_next = node_a->next;
+	min_add_c = 0;
+	min_add_b = 0;
+	node_a_refs = node_a->num_ptrs;
+	node_b_refs = 0;
+	node_intersect_type = 0;
+
+	/* Resolve leaf nodes (matches) */
+	if (node_a->match_flag != 0) {
+		acl_resolve_leaf(context, node_a, node_b, return_c);
+		return 0;
+	}
+
+	/*
+	 * Create node C as a copy of node A, and do: C = merge(A,B);
+	 * If node A can be used instead (A==C), then later we'll
+	 * destroy C and return A.
+	 */
+	if (level > 0)
+		node_c = acl_dup_node(context, node_a);
+
+	/*
+	 * If the two node transitions intersect then merge the transitions.
+	 * Check intersection for entire node (all pointers)
+	 */
+	node_intersect_type = acl_intersect_type(&node_c->values,
+		&node_b->values,
+		&node_intersect);
+
+	if (node_intersect_type & ACL_INTERSECT) {
+
+		min_add_b = node_b->min_add;
+		node_b->min_add = node_b->num_ptrs;
+		ptrs_b = node_b->num_ptrs;
+
+		min_add_c = node_c->min_add;
+		node_c->min_add = node_c->num_ptrs;
+		ptrs_c = node_c->num_ptrs;
+
+		for (n = 0; n < ptrs_c; n++) {
+			if (node_c->ptrs[n].ptr == NULL) {
+				node_a_refs--;
+				continue;
+			}
+			node_c->ptrs[n].ptr->next = NULL;
+			for (m = 0; m < ptrs_b; m++) {
+
+				struct rte_acl_bitset child_intersect;
+				int child_intersect_type;
+				struct rte_acl_node *child_node_c = NULL;
+
+				if (node_b->ptrs[m].ptr == NULL ||
+						node_c->ptrs[n].ptr ==
+						node_b->ptrs[m].ptr)
+						continue;
+
+				child_intersect_type = acl_intersect_type(
+					&node_c->ptrs[n].values,
+					&node_b->ptrs[m].values,
+					&child_intersect);
+
+				if ((child_intersect_type & ACL_INTERSECT) !=
+						0) {
+					if (acl_merge_trie(context,
+							node_c->ptrs[n].ptr,
+							node_b->ptrs[m].ptr,
+							level + 1,
+							&child_node_c))
+						return 1;
+
+					if (child_node_c != NULL &&
+							child_node_c !=
+							node_c->ptrs[n].ptr) {
+
+						node_b_refs++;
+
+						/*
+						 * Added link from C to
+						 * child_C for all transitions
+						 * in the intersection.
+						 */
+						acl_add_ptr(context, node_c,
+							child_node_c,
+							&child_intersect);
+
+						/*
+						 * inc refs if pointer is not
+						 * to node b.
+						 */
+						node_a_refs += (child_node_c !=
+							node_b->ptrs[m].ptr);
+
+						/*
+						 * Remove intersection from C
+						 * pointer.
+						 */
+						if (!acl_exclude(
+							&node_c->ptrs[n].values,
+							&node_c->ptrs[n].values,
+							&child_intersect)) {
+							acl_deref_ptr(context,
+								node_c, n);
+							node_c->ptrs[n].ptr =
+								NULL;
+							node_a_refs--;
+						}
+					}
+				}
+			}
+		}
+
+		/* Compact pointers */
+		node_c->min_add = min_add_c;
+		acl_compact_node_ptrs(node_c);
+		node_b->min_add = min_add_b;
+		acl_compact_node_ptrs(node_b);
+	}
+
+	/*
+	 *  Copy pointers outside of the intersection from B to C
+	 */
+	if ((node_intersect_type & ACL_INTERSECT_B) != 0) {
+		node_b_refs++;
+		for (m = 0; m < node_b->num_ptrs; m++)
+			if (node_b->ptrs[m].ptr != NULL)
+				acl_copy_ptr(context, node_c,
+					node_b, m, &node_intersect);
+	}
+
+	/*
+	 * Free node C if top of trie is contained in A or B
+	 *  if node C is a duplicate of node A &&
+	 *     node C was not an existing duplicate
+	 */
+	if (node_c != node_a && node_c != node_a_next) {
+
+		/*
+		 * if the intersection has no references to the
+		 * B side, then it is contained in A
+		 */
+		if (node_b_refs == 0) {
+			acl_free_node(context, node_c);
+			node_c = node_a;
+		} else {
+			/*
+			 * if the intersection has no references to the
+			 * A side, then it is contained in B.
+			 */
+			if (node_a_refs == 0) {
+				acl_free_node(context, node_c);
+				node_c = node_b;
+			}
+		}
+	}
+
+	if (return_c != NULL)
+		*return_c = node_c;
+
+	if (level == 0)
+		acl_free_node(context, node_b);
+
+	return 0;
+}
+
+/*
+ * Reset current runtime fields before next build:
+ *  - free allocated RT memory.
+ *  - reset all RT related fields to zero.
+ */
+static void
+acl_build_reset(struct rte_acl_ctx *ctx)
+{
+	rte_free(ctx->mem);
+	memset(&ctx->num_categories, 0,
+		sizeof(*ctx) - offsetof(struct rte_acl_ctx, num_categories));
+}
+
+static void
+acl_gen_range(struct acl_build_context *context,
+	const uint8_t *hi, const uint8_t *lo, int size, int level,
+	struct rte_acl_node *root, struct rte_acl_node *end)
+{
+	struct rte_acl_node *node, *prev;
+	uint32_t n;
+
+	prev = root;
+	for (n = size - 1; n > 0; n--) {
+		node = acl_alloc_node(context, level++);
+		acl_add_ptr_range(context, prev, node, lo[n], hi[n]);
+		prev = node;
+	}
+	acl_add_ptr_range(context, prev, end, lo[0], hi[0]);
+}
+
+static struct rte_acl_node *
+acl_gen_range_trie(struct acl_build_context *context,
+	const void *min, const void *max,
+	int size, int level, struct rte_acl_node **pend)
+{
+	int32_t n;
+	struct rte_acl_node *root;
+	const uint8_t *lo = (const uint8_t *)min;
+	const uint8_t *hi = (const uint8_t *)max;
+
+	*pend = acl_alloc_node(context, level+size);
+	root = acl_alloc_node(context, level++);
+
+	if (lo[size - 1] == hi[size - 1]) {
+		acl_gen_range(context, hi, lo, size, level, root, *pend);
+	} else {
+		uint8_t limit_lo[64];
+		uint8_t limit_hi[64];
+		uint8_t hi_ff = UINT8_MAX;
+		uint8_t lo_00 = 0;
+
+		memset(limit_lo, 0, RTE_DIM(limit_lo));
+		memset(limit_hi, UINT8_MAX, RTE_DIM(limit_hi));
+
+		for (n = size - 2; n >= 0; n--) {
+			hi_ff = (uint8_t)(hi_ff & hi[n]);
+			lo_00 = (uint8_t)(lo_00 | lo[n]);
+		}
+
+		if (hi_ff != UINT8_MAX) {
+			limit_lo[size - 1] = hi[size - 1];
+			acl_gen_range(context, hi, limit_lo, size, level,
+				root, *pend);
+		}
+
+		if (lo_00 != 0) {
+			limit_hi[size - 1] = lo[size - 1];
+			acl_gen_range(context, limit_hi, lo, size, level,
+				root, *pend);
+		}
+
+		if (hi[size - 1] - lo[size - 1] > 1 ||
+				lo_00 == 0 ||
+				hi_ff == UINT8_MAX) {
+			limit_lo[size-1] = (uint8_t)(lo[size-1] + (lo_00 != 0));
+			limit_hi[size-1] = (uint8_t)(hi[size-1] -
+				(hi_ff != UINT8_MAX));
+			acl_gen_range(context, limit_hi, limit_lo, size,
+				level, root, *pend);
+		}
+	}
+	return root;
+}
+
+static struct rte_acl_node *
+acl_gen_mask_trie(struct acl_build_context *context,
+	const void *value, const void *mask,
+	int size, int level, struct rte_acl_node **pend)
+{
+	int32_t n;
+	struct rte_acl_node *root;
+	struct rte_acl_node *node, *prev;
+	struct rte_acl_bitset bits;
+	const uint8_t *val = (const uint8_t *)value;
+	const uint8_t *msk = (const uint8_t *)mask;
+
+	root = acl_alloc_node(context, level++);
+	prev = root;
+
+	for (n = size - 1; n >= 0; n--) {
+		node = acl_alloc_node(context, level++);
+		acl_gen_mask(&bits, val[n] & msk[n], msk[n]);
+		acl_add_ptr(context, prev, node, &bits);
+		prev = node;
+	}
+
+	*pend = prev;
+	return root;
+}
+
+static struct rte_acl_node *
+build_trie(struct acl_build_context *context, struct rte_acl_build_rule *head,
+	struct rte_acl_build_rule **last, uint32_t *count)
+{
+	uint32_t n, m;
+	int field_index, node_count;
+	struct rte_acl_node *trie;
+	struct rte_acl_build_rule *prev, *rule;
+	struct rte_acl_node *end, *merge, *root, *end_prev;
+	const struct rte_acl_field *fld;
+
+	prev = head;
+	rule = head;
+	*last = prev;
+
+	trie = acl_alloc_node(context, 0);
+
+	while (rule != NULL) {
+
+		root = acl_alloc_node(context, 0);
+
+		root->ref_count = 1;
+		end = root;
+
+		for (n = 0; n < rule->config->num_fields; n++) {
+
+			field_index = rule->config->defs[n].field_index;
+			fld = rule->f->field + field_index;
+			end_prev = end;
+
+			/* build a mini-trie for this field */
+			switch (rule->config->defs[n].type) {
+
+			case RTE_ACL_FIELD_TYPE_BITMASK:
+				merge = acl_gen_mask_trie(context,
+					&fld->value,
+					&fld->mask_range,
+					rule->config->defs[n].size,
+					end->level + 1,
+					&end);
+				break;
+
+			case RTE_ACL_FIELD_TYPE_MASK:
+			{
+				/*
+				 * set msb for the size of the field and
+				 * all higher bits.
+				 */
+				uint64_t mask;
+				mask = RTE_ACL_MASKLEN_TO_BITMASK(
+					fld->mask_range.u32,
+					rule->config->defs[n].size);
+
+				/* gen a mini-trie for this field */
+				merge = acl_gen_mask_trie(context,
+					&fld->value,
+					(char *)&mask,
+					rule->config->defs[n].size,
+					end->level + 1,
+					&end);
+			}
+			break;
+
+			case RTE_ACL_FIELD_TYPE_RANGE:
+				merge = acl_gen_range_trie(context,
+					&rule->f->field[field_index].value,
+					&rule->f->field[field_index].mask_range,
+					rule->config->defs[n].size,
+					end->level + 1,
+					&end);
+				break;
+
+			default:
+				RTE_LOG(ERR, ACL,
+					"Error in rule[%u] type - %hhu\n",
+					rule->f->data.userdata,
+					rule->config->defs[n].type);
+				return NULL;
+			}
+
+			/* merge this field on to the end of the rule */
+			if (acl_merge_trie(context, end_prev, merge, 0,
+					NULL) != 0) {
+				return NULL;
+			}
+		}
+
+		end->match_flag = ++context->num_build_rules;
+
+		/*
+		 * Setup the results for this rule.
+		 * The result and priority of each category.
+		 */
+		if (end->mrt == NULL)
+			end->mrt = acl_build_alloc(context, 1,
+				sizeof(*end->mrt));
+
+		for (m = context->cfg.num_categories; 0 != m--; ) {
+			if (rule->f->data.category_mask & (1 << m)) {
+				end->mrt->results[m] = rule->f->data.userdata;
+				end->mrt->priority[m] = rule->f->data.priority;
+			} else {
+				end->mrt->results[m] = 0;
+				end->mrt->priority[m] = 0;
+			}
+		}
+
+		node_count = context->num_nodes;
+		(*count)++;
+
+		/* merge this rule into the trie */
+		if (acl_merge_trie(context, trie, root, 0, NULL))
+			return NULL;
+
+		node_count = context->num_nodes - node_count;
+		if (node_count > context->cur_node_max) {
+			*last = prev;
+			return trie;
+		}
+
+		prev = rule;
+		rule = rule->next;
+	}
+
+	*last = NULL;
+	return trie;
+}
+
+static void
+acl_calc_wildness(struct rte_acl_build_rule *head,
+	const struct rte_acl_config *config)
+{
+	uint32_t n;
+	struct rte_acl_build_rule *rule;
+
+	for (rule = head; rule != NULL; rule = rule->next) {
+
+		for (n = 0; n < config->num_fields; n++) {
+
+			double wild = 0;
+			uint32_t bit_len = CHAR_BIT * config->defs[n].size;
+			uint64_t msk_val = RTE_LEN2MASK(bit_len,
+				typeof(msk_val));
+			double size = bit_len;
+			int field_index = config->defs[n].field_index;
+			const struct rte_acl_field *fld = rule->f->field +
+				field_index;
+
+			switch (rule->config->defs[n].type) {
+			case RTE_ACL_FIELD_TYPE_BITMASK:
+				wild = (size - __builtin_popcountll(
+					fld->mask_range.u64 & msk_val)) /
+					size;
+				break;
+
+			case RTE_ACL_FIELD_TYPE_MASK:
+				wild = (size - fld->mask_range.u32) / size;
+				break;
+
+			case RTE_ACL_FIELD_TYPE_RANGE:
+				wild = (fld->mask_range.u64 & msk_val) -
+					(fld->value.u64 & msk_val);
+				wild = wild / msk_val;
+				break;
+			}
+
+			rule->wildness[field_index] = (uint32_t)(wild * 100);
+		}
+	}
+}
+
+static void
+acl_rule_stats(struct rte_acl_build_rule *head, struct rte_acl_config *config)
+{
+	struct rte_acl_build_rule *rule;
+	uint32_t n, m, fields_deactivated = 0;
+	uint32_t start = 0, deactivate = 0;
+	int tally[RTE_ACL_MAX_LEVELS][TALLY_NUM];
+
+	memset(tally, 0, sizeof(tally));
+
+	for (rule = head; rule != NULL; rule = rule->next) {
+
+		for (n = 0; n < config->num_fields; n++) {
+			uint32_t field_index = config->defs[n].field_index;
+
+			tally[n][TALLY_0]++;
+			for (m = 1; m < RTE_DIM(wild_limits); m++) {
+				if (rule->wildness[field_index] >=
+						wild_limits[m])
+					tally[n][m]++;
+			}
+		}
+
+		for (n = config->num_fields - 1; n > 0; n--) {
+			uint32_t field_index = config->defs[n].field_index;
+
+			if (rule->wildness[field_index] == 100)
+				tally[n][TALLY_DEPTH]++;
+			else
+				break;
+		}
+	}
+
+	/*
+	 * Look for any field that is always wild and drop it from the config
+	 * Only deactivate if all fields for a given input loop are deactivated.
+	 */
+	for (n = 1; n < config->num_fields; n++) {
+		if (config->defs[n].input_index !=
+				config->defs[n - 1].input_index) {
+			for (m = start; m < n; m++)
+				tally[m][TALLY_DEACTIVATED] = deactivate;
+			fields_deactivated += deactivate;
+			start = n;
+			deactivate = 1;
+		}
+
+		/* if the field is not always completely wild */
+		if (tally[n][TALLY_100] != tally[n][TALLY_0])
+			deactivate = 0;
+	}
+
+	for (m = start; m < n; m++)
+		tally[m][TALLY_DEACTIVATED] = deactivate;
+
+	fields_deactivated += deactivate;
+
+	/* remove deactivated fields */
+	if (fields_deactivated) {
+		uint32_t k, l = 0;
+
+		for (k = 0; k < config->num_fields; k++) {
+			if (tally[k][TALLY_DEACTIVATED] == 0) {
+				memmove(&tally[l][0], &tally[k][0],
+					TALLY_NUM * sizeof(tally[0][0]));
+				memmove(&config->defs[l++],
+					&config->defs[k],
+					sizeof(struct rte_acl_field_def));
+			}
+		}
+		config->num_fields = l;
+	}
+}
+
+static int
+rule_cmp_wildness(struct rte_acl_build_rule *r1, struct rte_acl_build_rule *r2)
+{
+	uint32_t n;
+
+	for (n = 1; n < r1->config->num_fields; n++) {
+		int field_index = r1->config->defs[n].field_index;
+
+		if (r1->wildness[field_index] != r2->wildness[field_index])
+			return r1->wildness[field_index] -
+				r2->wildness[field_index];
+	}
+	return 0;
+}
+
+/*
+ * Split the rte_acl_build_rule list into two lists.
+ */
+static void
+rule_list_split(struct rte_acl_build_rule *source,
+	struct rte_acl_build_rule **list_a,
+	struct rte_acl_build_rule **list_b)
+{
+	struct rte_acl_build_rule *fast;
+	struct rte_acl_build_rule *slow;
+
+	if (source == NULL || source->next == NULL) {
+		/* length < 2 cases */
+		*list_a = source;
+		*list_b = NULL;
+	} else {
+		slow = source;
+		fast = source->next;
+		/* Advance 'fast' two nodes, and advance 'slow' one node */
+		while (fast != NULL) {
+			fast = fast->next;
+			if (fast != NULL) {
+				slow = slow->next;
+				fast = fast->next;
+			}
+		}
+		/* 'slow' is before the midpoint in the list, so split it in two
+		   at that point. */
+		*list_a = source;
+		*list_b = slow->next;
+		slow->next = NULL;
+	}
+}
+
+/*
+ * Merge two sorted lists.
+ */
+static struct rte_acl_build_rule *
+rule_list_sorted_merge(struct rte_acl_build_rule *a,
+	struct rte_acl_build_rule *b)
+{
+	struct rte_acl_build_rule *result = NULL;
+	struct rte_acl_build_rule **last_next = &result;
+
+	while (1) {
+		if (a == NULL) {
+			*last_next = b;
+			break;
+		} else if (b == NULL) {
+			*last_next = a;
+			break;
+		}
+		if (rule_cmp_wildness(a, b) >= 0) {
+			*last_next = a;
+			last_next = &a->next;
+			a = a->next;
+		} else {
+			*last_next = b;
+			last_next = &b->next;
+			b = b->next;
+		}
+	}
+	return result;
+}
+
+/*
+ * Sort list of rules based on the rules wildness.
+ * Use recursive mergesort algorithm.
+ */
+static struct rte_acl_build_rule *
+sort_rules(struct rte_acl_build_rule *head)
+{
+	struct rte_acl_build_rule *a;
+	struct rte_acl_build_rule *b;
+
+	/* Base case -- length 0 or 1 */
+	if (head == NULL || head->next == NULL)
+		return head;
+
+	/* Split head into 'a' and 'b' sublists */
+	rule_list_split(head, &a, &b);
+
+	/* Recursively sort the sublists */
+	a = sort_rules(a);
+	b = sort_rules(b);
+
+	/* answer = merge the two sorted lists together */
+	return rule_list_sorted_merge(a, b);
+}
+
+static uint32_t
+acl_build_index(const struct rte_acl_config *config, uint32_t *data_index)
+{
+	uint32_t n, m;
+	int32_t last_header;
+
+	m = 0;
+	last_header = -1;
+
+	for (n = 0; n < config->num_fields; n++) {
+		if (last_header != config->defs[n].input_index) {
+			last_header = config->defs[n].input_index;
+			data_index[m++] = config->defs[n].offset;
+		}
+	}
+
+	return m;
+}
+
+static struct rte_acl_build_rule *
+build_one_trie(struct acl_build_context *context,
+	struct rte_acl_build_rule *rule_sets[RTE_ACL_MAX_TRIES],
+	uint32_t n, int32_t node_max)
+{
+	struct rte_acl_build_rule *last;
+	struct rte_acl_config *config;
+
+	config = rule_sets[n]->config;
+
+	acl_rule_stats(rule_sets[n], config);
+	rule_sets[n] = sort_rules(rule_sets[n]);
+
+	context->tries[n].type = RTE_ACL_FULL_TRIE;
+	context->tries[n].count = 0;
+
+	context->tries[n].num_data_indexes = acl_build_index(config,
+		context->data_indexes[n]);
+	context->tries[n].data_index = context->data_indexes[n];
+
+	context->cur_node_max = node_max;
+
+	context->bld_tries[n].trie = build_trie(context, rule_sets[n],
+		&last, &context->tries[n].count);
+
+	return last;
+}
+
+static int
+acl_build_tries(struct acl_build_context *context,
+	struct rte_acl_build_rule *head)
+{
+	uint32_t n, num_tries;
+	struct rte_acl_config *config;
+	struct rte_acl_build_rule *last;
+	struct rte_acl_build_rule *rule_sets[RTE_ACL_MAX_TRIES];
+
+	config = head->config;
+	rule_sets[0] = head;
+
+	/* initialize tries */
+	for (n = 0; n < RTE_DIM(context->tries); n++) {
+		context->tries[n].type = RTE_ACL_UNUSED_TRIE;
+		context->bld_tries[n].trie = NULL;
+		context->tries[n].count = 0;
+	}
+
+	context->tries[0].type = RTE_ACL_FULL_TRIE;
+
+	/* calc wildness of each field of each rule */
+	acl_calc_wildness(head, config);
+
+	for (n = 0;; n = num_tries) {
+
+		num_tries = n + 1;
+
+		last = build_one_trie(context, rule_sets, n, context->node_max);
+		if (context->bld_tries[n].trie == NULL) {
+			RTE_LOG(ERR, ACL, "Build of %u-th trie failed\n", n);
+			return -ENOMEM;
+		}
+
+		/* Build of the last trie completed. */
+		if (last == NULL)
+			break;
+
+		if (num_tries == RTE_DIM(context->tries)) {
+			RTE_LOG(ERR, ACL,
+				"Exceeded max number of tries: %u\n",
+				num_tries);
+			return -ENOMEM;
+		}
+
+		/* Trie is getting too big, split remaining rule set. */
+		rule_sets[num_tries] = last->next;
+		last->next = NULL;
+		acl_free_node(context, context->bld_tries[n].trie);
+
+		/* Create a new copy of config for remaining rules. */
+		config = acl_build_alloc(context, 1, sizeof(*config));
+		memcpy(config, rule_sets[n]->config, sizeof(*config));
+
+		/* Make remaining rules use new config. */
+		for (head = rule_sets[num_tries]; head != NULL;
+				head = head->next)
+			head->config = config;
+
+		/*
+		 * Rebuild the trie for the reduced rule-set.
+		 * Don't try to split it any further.
+		 */
+		last = build_one_trie(context, rule_sets, n, INT32_MAX);
+		if (context->bld_tries[n].trie == NULL || last != NULL) {
+			RTE_LOG(ERR, ACL, "Build of %u-th trie failed\n", n);
+			return -ENOMEM;
+		}
+
+	}
+
+	context->num_tries = num_tries;
+	return 0;
+}
+
+static void
+acl_build_log(const struct acl_build_context *ctx)
+{
+	uint32_t n;
+
+	RTE_LOG(DEBUG, ACL, "Build phase for ACL \"%s\":\n"
+		"node limit for tree split: %u\n"
+		"nodes created: %u\n"
+		"memory consumed: %zu\n",
+		ctx->acx->name,
+		ctx->node_max,
+		ctx->num_nodes,
+		ctx->pool.alloc);
+
+	for (n = 0; n < RTE_DIM(ctx->tries); n++) {
+		if (ctx->tries[n].count != 0)
+			RTE_LOG(DEBUG, ACL,
+				"trie %u: number of rules: %u, indexes: %u\n",
+				n, ctx->tries[n].count,
+				ctx->tries[n].num_data_indexes);
+	}
+}
+
+static int
+acl_build_rules(struct acl_build_context *bcx)
+{
+	struct rte_acl_build_rule *br, *head;
+	const struct rte_acl_rule *rule;
+	uint32_t *wp;
+	uint32_t fn, i, n, num;
+	size_t ofs, sz;
+
+	fn = bcx->cfg.num_fields;
+	n = bcx->acx->num_rules;
+	ofs = n * sizeof(*br);
+	sz = ofs + n * fn * sizeof(*wp);
+
+	br = tb_alloc(&bcx->pool, sz);
+
+	wp = (uint32_t *)((uintptr_t)br + ofs);
+	num = 0;
+	head = NULL;
+
+	for (i = 0; i != n; i++) {
+		rule = (const struct rte_acl_rule *)
+			((uintptr_t)bcx->acx->rules + bcx->acx->rule_sz * i);
+		if ((rule->data.category_mask & bcx->category_mask) != 0) {
+			br[num].next = head;
+			br[num].config = &bcx->cfg;
+			br[num].f = rule;
+			br[num].wildness = wp;
+			wp += fn;
+			head = br + num;
+			num++;
+		}
+	}
+
+	bcx->num_rules = num;
+	bcx->build_rules = head;
+
+	return 0;
+}
+
+/*
+ * Copy data_indexes for each trie into RT location.
+ */
+static void
+acl_set_data_indexes(struct rte_acl_ctx *ctx)
+{
+	uint32_t i, n, ofs;
+
+	ofs = 0;
+	for (i = 0; i != ctx->num_tries; i++) {
+		n = ctx->trie[i].num_data_indexes;
+		memcpy(ctx->data_indexes + ofs, ctx->trie[i].data_index,
+			n * sizeof(ctx->data_indexes[0]));
+		ctx->trie[i].data_index = ctx->data_indexes + ofs;
+		ofs += RTE_ACL_MAX_FIELDS;
+	}
+}
+
+/*
+ * Internal routine, performs 'build' phase of trie generation:
+ * - setups build context.
+ * - analizes given set of rules.
+ * - builds internal tree(s).
+ */
+static int
+acl_bld(struct acl_build_context *bcx, struct rte_acl_ctx *ctx,
+	const struct rte_acl_config *cfg, uint32_t node_max)
+{
+	int32_t rc;
+
+	/* setup build context. */
+	memset(bcx, 0, sizeof(*bcx));
+	bcx->acx = ctx;
+	bcx->pool.alignment = ACL_POOL_ALIGN;
+	bcx->pool.min_alloc = ACL_POOL_ALLOC_MIN;
+	bcx->cfg = *cfg;
+	bcx->category_mask = RTE_LEN2MASK(bcx->cfg.num_categories,
+		typeof(bcx->category_mask));
+	bcx->node_max = node_max;
+
+	rc = sigsetjmp(bcx->pool.fail, 0);
+
+	/* build phase runs out of memory. */
+	if (rc != 0) {
+		RTE_LOG(ERR, ACL,
+			"ACL context: %s, %s() failed with error code: %d\n",
+			bcx->acx->name, __func__, rc);
+		return rc;
+	}
+
+	/* Create a build rules copy. */
+	rc = acl_build_rules(bcx);
+	if (rc != 0)
+		return rc;
+
+	/* No rules to build for that context+config */
+	if (bcx->build_rules == NULL) {
+		rc = -EINVAL;
+	} else {
+		/* build internal trie representation. */
+		rc = acl_build_tries(bcx, bcx->build_rules);
+	}
+	return rc;
+}
+
+/*
+ * Check that parameters for acl_build() are valid.
+ */
+static int
+acl_check_bld_param(struct rte_acl_ctx *ctx, const struct rte_acl_config *cfg)
+{
+	static const size_t field_sizes[] = {
+		sizeof(uint8_t), sizeof(uint16_t),
+		sizeof(uint32_t), sizeof(uint64_t),
+	};
+
+	uint32_t i, j;
+
+	if (ctx == NULL || cfg == NULL || cfg->num_categories == 0 ||
+			cfg->num_categories > RTE_ACL_MAX_CATEGORIES ||
+			cfg->num_fields == 0 ||
+			cfg->num_fields > RTE_ACL_MAX_FIELDS)
+		return -EINVAL;
+
+	for (i = 0; i != cfg->num_fields; i++) {
+		if (cfg->defs[i].type > RTE_ACL_FIELD_TYPE_BITMASK) {
+			RTE_LOG(ERR, ACL,
+			"ACL context: %s, invalid type: %hhu for %u-th field\n",
+			ctx->name, cfg->defs[i].type, i);
+			return -EINVAL;
+		}
+		for (j = 0;
+				j != RTE_DIM(field_sizes) &&
+				cfg->defs[i].size != field_sizes[j];
+				j++)
+			;
+
+		if (j == RTE_DIM(field_sizes)) {
+			RTE_LOG(ERR, ACL,
+			"ACL context: %s, invalid size: %hhu for %u-th field\n",
+			ctx->name, cfg->defs[i].size, i);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+int
+rte_acl_build(struct rte_acl_ctx *ctx, const struct rte_acl_config *cfg)
+{
+	int32_t rc;
+	uint32_t n;
+	size_t max_size;
+	struct acl_build_context bcx;
+
+	rc = acl_check_bld_param(ctx, cfg);
+	if (rc != 0)
+		return rc;
+
+	acl_build_reset(ctx);
+
+	if (cfg->max_size == 0) {
+		n = NODE_MIN;
+		max_size = SIZE_MAX;
+	} else {
+		n = NODE_MAX;
+		max_size = cfg->max_size;
+	}
+
+	for (rc = -ERANGE; n >= NODE_MIN && rc == -ERANGE; n /= 2) {
+
+		/* perform build phase. */
+		rc = acl_bld(&bcx, ctx, cfg, n);
+
+		if (rc == 0) {
+			/* allocate and fill run-time  structures. */
+			rc = rte_acl_gen(ctx, bcx.tries, bcx.bld_tries,
+				bcx.num_tries, bcx.cfg.num_categories,
+				RTE_ACL_MAX_FIELDS * RTE_DIM(bcx.tries) *
+				sizeof(ctx->data_indexes[0]), max_size);
+			if (rc == 0) {
+				/* set data indexes. */
+				acl_set_data_indexes(ctx);
+
+				/* copy in build config. */
+				ctx->config = *cfg;
+			}
+		}
+
+		acl_build_log(&bcx);
+
+		/* cleanup after build. */
+		tb_free_pool(&bcx.pool);
+	}
+
+	return rc;
+}
diff --git a/lib/librte_acl/acl_gen.c b/lib/librte_acl/acl_gen.c
new file mode 100644
index 00000000..ea557ab9
--- /dev/null
+++ b/lib/librte_acl/acl_gen.c
@@ -0,0 +1,561 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rte_acl.h>
+#include "acl.h"
+
+#define	QRANGE_MIN	((uint8_t)INT8_MIN)
+
+#define	RTE_ACL_VERIFY(exp)	do {                                          \
+	if (!(exp))                                                           \
+		rte_panic("line %d\tassert \"" #exp "\" failed\n", __LINE__); \
+} while (0)
+
+struct acl_node_counters {
+	int32_t match;
+	int32_t match_used;
+	int32_t single;
+	int32_t quad;
+	int32_t quad_vectors;
+	int32_t dfa;
+	int32_t dfa_gr64;
+};
+
+struct rte_acl_indices {
+	int32_t dfa_index;
+	int32_t quad_index;
+	int32_t single_index;
+	int32_t match_index;
+	int32_t match_start;
+};
+
+static void
+acl_gen_log_stats(const struct rte_acl_ctx *ctx,
+	const struct acl_node_counters *counts,
+	const struct rte_acl_indices *indices,
+	size_t max_size)
+{
+	RTE_LOG(DEBUG, ACL, "Gen phase for ACL \"%s\":\n"
+		"runtime memory footprint on socket %d:\n"
+		"single nodes/bytes used: %d/%zu\n"
+		"quad nodes/vectors/bytes used: %d/%d/%zu\n"
+		"DFA nodes/group64/bytes used: %d/%d/%zu\n"
+		"match nodes/bytes used: %d/%zu\n"
+		"total: %zu bytes\n"
+		"max limit: %zu bytes\n",
+		ctx->name, ctx->socket_id,
+		counts->single, counts->single * sizeof(uint64_t),
+		counts->quad, counts->quad_vectors,
+		(indices->quad_index - indices->dfa_index) * sizeof(uint64_t),
+		counts->dfa, counts->dfa_gr64,
+		indices->dfa_index * sizeof(uint64_t),
+		counts->match,
+		counts->match * sizeof(struct rte_acl_match_results),
+		ctx->mem_sz,
+		max_size);
+}
+
+static uint64_t
+acl_dfa_gen_idx(const struct rte_acl_node *node, uint32_t index)
+{
+	uint64_t idx;
+	uint32_t i;
+
+	idx = 0;
+	for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) {
+		RTE_ACL_VERIFY(node->dfa_gr64[i] < RTE_ACL_DFA_GR64_NUM);
+		RTE_ACL_VERIFY(node->dfa_gr64[i] < node->fanout);
+		idx |= (i - node->dfa_gr64[i]) <<
+			(6 + RTE_ACL_DFA_GR64_BIT * i);
+	}
+
+	return idx << (CHAR_BIT * sizeof(index)) | index | node->node_type;
+}
+
+static void
+acl_dfa_fill_gr64(const struct rte_acl_node *node,
+	const uint64_t src[RTE_ACL_DFA_SIZE], uint64_t dst[RTE_ACL_DFA_SIZE])
+{
+	uint32_t i;
+
+	for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) {
+		memcpy(dst + node->dfa_gr64[i] * RTE_ACL_DFA_GR64_SIZE,
+			src + i * RTE_ACL_DFA_GR64_SIZE,
+			RTE_ACL_DFA_GR64_SIZE * sizeof(dst[0]));
+	}
+}
+
+static uint32_t
+acl_dfa_count_gr64(const uint64_t array_ptr[RTE_ACL_DFA_SIZE],
+	uint8_t gr64[RTE_ACL_DFA_GR64_NUM])
+{
+	uint32_t i, j, k;
+
+	k = 0;
+	for (i = 0; i != RTE_ACL_DFA_GR64_NUM; i++) {
+		gr64[i] = i;
+		for (j = 0; j != i; j++) {
+			if (memcmp(array_ptr + i * RTE_ACL_DFA_GR64_SIZE,
+					array_ptr + j * RTE_ACL_DFA_GR64_SIZE,
+					RTE_ACL_DFA_GR64_SIZE *
+					sizeof(array_ptr[0])) == 0)
+				break;
+		}
+		gr64[i] = (j != i) ? gr64[j] : k++;
+	}
+
+	return k;
+}
+
+static uint32_t
+acl_node_fill_dfa(const struct rte_acl_node *node,
+	uint64_t dfa[RTE_ACL_DFA_SIZE], uint64_t no_match, int32_t resolved)
+{
+	uint32_t n, x;
+	uint32_t ranges, last_bit;
+	struct rte_acl_node *child;
+	struct rte_acl_bitset *bits;
+
+	ranges = 0;
+	last_bit = 0;
+
+	for (n = 0; n < RTE_ACL_DFA_SIZE; n++)
+		dfa[n] = no_match;
+
+	for (x = 0; x < node->num_ptrs; x++) {
+
+		child = node->ptrs[x].ptr;
+		if (child == NULL)
+			continue;
+
+		bits = &node->ptrs[x].values;
+		for (n = 0; n < RTE_ACL_DFA_SIZE; n++) {
+
+			if (bits->bits[n / (sizeof(bits_t) * CHAR_BIT)] &
+				(1 << (n % (sizeof(bits_t) * CHAR_BIT)))) {
+
+				dfa[n] = resolved ? child->node_index : x;
+				ranges += (last_bit == 0);
+				last_bit = 1;
+			} else {
+				last_bit = 0;
+			}
+		}
+	}
+
+	return ranges;
+}
+
+/*
+*  Counts the number of groups of sequential bits that are
+*  either 0 or 1, as specified by the zero_one parameter. This is used to
+*  calculate the number of ranges in a node to see if it fits in a quad range
+*  node.
+*/
+static int
+acl_count_sequential_groups(struct rte_acl_bitset *bits, int zero_one)
+{
+	int n, ranges, last_bit;
+
+	ranges = 0;
+	last_bit = zero_one ^ 1;
+
+	for (n = QRANGE_MIN; n < UINT8_MAX + 1; n++) {
+		if (bits->bits[n / (sizeof(bits_t) * 8)] &
+				(1 << (n % (sizeof(bits_t) * 8)))) {
+			if (zero_one == 1 && last_bit != 1)
+				ranges++;
+			last_bit = 1;
+		} else {
+			if (zero_one == 0 && last_bit != 0)
+				ranges++;
+			last_bit = 0;
+		}
+	}
+	for (n = 0; n < QRANGE_MIN; n++) {
+		if (bits->bits[n / (sizeof(bits_t) * 8)] &
+				(1 << (n % (sizeof(bits_t) * 8)))) {
+			if (zero_one == 1 && last_bit != 1)
+				ranges++;
+			last_bit = 1;
+		} else {
+			if (zero_one == 0 && last_bit != 0)
+				ranges++;
+			last_bit = 0;
+		}
+	}
+
+	return ranges;
+}
+
+/*
+ * Count number of ranges spanned by the node's pointers
+ */
+static int
+acl_count_fanout(struct rte_acl_node *node)
+{
+	uint32_t n;
+	int ranges;
+
+	if (node->fanout != 0)
+		return node->fanout;
+
+	ranges = acl_count_sequential_groups(&node->values, 0);
+
+	for (n = 0; n < node->num_ptrs; n++) {
+		if (node->ptrs[n].ptr != NULL)
+			ranges += acl_count_sequential_groups(
+				&node->ptrs[n].values, 1);
+	}
+
+	node->fanout = ranges;
+	return node->fanout;
+}
+
+/*
+ * Determine the type of nodes and count each type
+ */
+static void
+acl_count_trie_types(struct acl_node_counters *counts,
+	struct rte_acl_node *node, uint64_t no_match, int force_dfa)
+{
+	uint32_t n;
+	int num_ptrs;
+	uint64_t dfa[RTE_ACL_DFA_SIZE];
+
+	/* skip if this node has been counted */
+	if (node->node_type != (uint32_t)RTE_ACL_NODE_UNDEFINED)
+		return;
+
+	if (node->match_flag != 0 || node->num_ptrs == 0) {
+		counts->match++;
+		node->node_type = RTE_ACL_NODE_MATCH;
+		return;
+	}
+
+	num_ptrs = acl_count_fanout(node);
+
+	/* Force type to dfa */
+	if (force_dfa)
+		num_ptrs = RTE_ACL_DFA_SIZE;
+
+	/* determine node type based on number of ranges */
+	if (num_ptrs == 1) {
+		counts->single++;
+		node->node_type = RTE_ACL_NODE_SINGLE;
+	} else if (num_ptrs <= RTE_ACL_QUAD_MAX) {
+		counts->quad++;
+		counts->quad_vectors += node->fanout;
+		node->node_type = RTE_ACL_NODE_QRANGE;
+	} else {
+		counts->dfa++;
+		node->node_type = RTE_ACL_NODE_DFA;
+		if (force_dfa != 0) {
+			/* always expand to a max number of nodes. */
+			for (n = 0; n != RTE_DIM(node->dfa_gr64); n++)
+				node->dfa_gr64[n] = n;
+			node->fanout = n;
+		} else {
+			acl_node_fill_dfa(node, dfa, no_match, 0);
+			node->fanout = acl_dfa_count_gr64(dfa, node->dfa_gr64);
+		}
+		counts->dfa_gr64 += node->fanout;
+	}
+
+	/*
+	 * recursively count the types of all children
+	 */
+	for (n = 0; n < node->num_ptrs; n++) {
+		if (node->ptrs[n].ptr != NULL)
+			acl_count_trie_types(counts, node->ptrs[n].ptr,
+				no_match, 0);
+	}
+}
+
+static void
+acl_add_ptrs(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match,
+	int resolved)
+{
+	uint32_t x;
+	int32_t m;
+	uint64_t *node_a, index, dfa[RTE_ACL_DFA_SIZE];
+
+	acl_node_fill_dfa(node, dfa, no_match, resolved);
+
+	/*
+	 * Rather than going from 0 to 256, the range count and
+	 * the layout are from 80-ff then 0-7f due to signed compare
+	 * for SSE (cmpgt).
+	 */
+	if (node->node_type == RTE_ACL_NODE_QRANGE) {
+
+		m = 0;
+		node_a = node_array;
+		index = dfa[QRANGE_MIN];
+		*node_a++ = index;
+
+		for (x = QRANGE_MIN + 1; x < UINT8_MAX + 1; x++) {
+			if (dfa[x] != index) {
+				index = dfa[x];
+				*node_a++ = index;
+				node->transitions[m++] = (uint8_t)(x - 1);
+			}
+		}
+
+		for (x = 0; x < INT8_MAX + 1; x++) {
+			if (dfa[x] != index) {
+				index = dfa[x];
+				*node_a++ = index;
+				node->transitions[m++] = (uint8_t)(x - 1);
+			}
+		}
+
+		/* fill unused locations with max value - nothing is greater */
+		for (; m < RTE_ACL_QUAD_SIZE; m++)
+			node->transitions[m] = INT8_MAX;
+
+		RTE_ACL_VERIFY(m <= RTE_ACL_QUAD_SIZE);
+
+	} else if (node->node_type == RTE_ACL_NODE_DFA && resolved) {
+		acl_dfa_fill_gr64(node, dfa, node_array);
+	}
+}
+
+/*
+ * Routine that allocates space for this node and recursively calls
+ * to allocate space for each child. Once all the children are allocated,
+ * then resolve all transitions for this node.
+ */
+static void
+acl_gen_node(struct rte_acl_node *node, uint64_t *node_array,
+	uint64_t no_match, struct rte_acl_indices *index, int num_categories)
+{
+	uint32_t n, sz, *qtrp;
+	uint64_t *array_ptr;
+	struct rte_acl_match_results *match;
+
+	if (node->node_index != RTE_ACL_NODE_UNDEFINED)
+		return;
+
+	array_ptr = NULL;
+
+	switch (node->node_type) {
+	case RTE_ACL_NODE_DFA:
+		array_ptr = &node_array[index->dfa_index];
+		node->node_index = acl_dfa_gen_idx(node, index->dfa_index);
+		sz = node->fanout * RTE_ACL_DFA_GR64_SIZE;
+		index->dfa_index += sz;
+		for (n = 0; n < sz; n++)
+			array_ptr[n] = no_match;
+		break;
+	case RTE_ACL_NODE_SINGLE:
+		node->node_index = RTE_ACL_QUAD_SINGLE | index->single_index |
+			node->node_type;
+		array_ptr = &node_array[index->single_index];
+		index->single_index += 1;
+		array_ptr[0] = no_match;
+		break;
+	case RTE_ACL_NODE_QRANGE:
+		array_ptr = &node_array[index->quad_index];
+		acl_add_ptrs(node, array_ptr, no_match, 0);
+		qtrp = (uint32_t *)node->transitions;
+		node->node_index = qtrp[0];
+		node->node_index <<= sizeof(index->quad_index) * CHAR_BIT;
+		node->node_index |= index->quad_index | node->node_type;
+		index->quad_index += node->fanout;
+		break;
+	case RTE_ACL_NODE_MATCH:
+		match = ((struct rte_acl_match_results *)
+			(node_array + index->match_start));
+		for (n = 0; n != RTE_DIM(match->results); n++)
+			RTE_ACL_VERIFY(match->results[0] == 0);
+		memcpy(match + index->match_index, node->mrt,
+			sizeof(*node->mrt));
+		node->node_index = index->match_index | node->node_type;
+		index->match_index += 1;
+		break;
+	case RTE_ACL_NODE_UNDEFINED:
+		RTE_ACL_VERIFY(node->node_type !=
+			(uint32_t)RTE_ACL_NODE_UNDEFINED);
+		break;
+	}
+
+	/* recursively allocate space for all children */
+	for (n = 0; n < node->num_ptrs; n++) {
+		if (node->ptrs[n].ptr != NULL)
+			acl_gen_node(node->ptrs[n].ptr,
+				node_array,
+				no_match,
+				index,
+				num_categories);
+	}
+
+	/* All children are resolved, resolve this node's pointers */
+	switch (node->node_type) {
+	case RTE_ACL_NODE_DFA:
+		acl_add_ptrs(node, array_ptr, no_match, 1);
+		break;
+	case RTE_ACL_NODE_SINGLE:
+		for (n = 0; n < node->num_ptrs; n++) {
+			if (node->ptrs[n].ptr != NULL)
+				array_ptr[0] = node->ptrs[n].ptr->node_index;
+		}
+		break;
+	case RTE_ACL_NODE_QRANGE:
+		acl_add_ptrs(node, array_ptr, no_match, 1);
+		break;
+	case RTE_ACL_NODE_MATCH:
+		break;
+	case RTE_ACL_NODE_UNDEFINED:
+		RTE_ACL_VERIFY(node->node_type !=
+			(uint32_t)RTE_ACL_NODE_UNDEFINED);
+		break;
+	}
+}
+
+static void
+acl_calc_counts_indices(struct acl_node_counters *counts,
+	struct rte_acl_indices *indices,
+	struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries,
+	uint64_t no_match)
+{
+	uint32_t n;
+
+	memset(indices, 0, sizeof(*indices));
+	memset(counts, 0, sizeof(*counts));
+
+	/* Get stats on nodes */
+	for (n = 0; n < num_tries; n++) {
+		acl_count_trie_types(counts, node_bld_trie[n].trie,
+			no_match, 1);
+	}
+
+	indices->dfa_index = RTE_ACL_DFA_SIZE + 1;
+	indices->quad_index = indices->dfa_index +
+		counts->dfa_gr64 * RTE_ACL_DFA_GR64_SIZE;
+	indices->single_index = indices->quad_index + counts->quad_vectors;
+	indices->match_start = indices->single_index + counts->single + 1;
+	indices->match_start = RTE_ALIGN(indices->match_start,
+		(XMM_SIZE / sizeof(uint64_t)));
+	indices->match_index = 1;
+}
+
+/*
+ * Generate the runtime structure using build structure
+ */
+int
+rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,
+	struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries,
+	uint32_t num_categories, uint32_t data_index_sz, size_t max_size)
+{
+	void *mem;
+	size_t total_size;
+	uint64_t *node_array, no_match;
+	uint32_t n, match_index;
+	struct rte_acl_match_results *match;
+	struct acl_node_counters counts;
+	struct rte_acl_indices indices;
+
+	no_match = RTE_ACL_NODE_MATCH;
+
+	/* Fill counts and indices arrays from the nodes. */
+	acl_calc_counts_indices(&counts, &indices,
+		node_bld_trie, num_tries, no_match);
+
+	/* Allocate runtime memory (align to cache boundary) */
+	total_size = RTE_ALIGN(data_index_sz, RTE_CACHE_LINE_SIZE) +
+		indices.match_start * sizeof(uint64_t) +
+		(counts.match + 1) * sizeof(struct rte_acl_match_results) +
+		XMM_SIZE;
+
+	if (total_size > max_size) {
+		RTE_LOG(DEBUG, ACL,
+			"Gen phase for ACL ctx \"%s\" exceeds max_size limit, "
+			"bytes required: %zu, allowed: %zu\n",
+			ctx->name, total_size, max_size);
+		return -ERANGE;
+	}
+
+	mem = rte_zmalloc_socket(ctx->name, total_size, RTE_CACHE_LINE_SIZE,
+			ctx->socket_id);
+	if (mem == NULL) {
+		RTE_LOG(ERR, ACL,
+			"allocation of %zu bytes on socket %d for %s failed\n",
+			total_size, ctx->socket_id, ctx->name);
+		return -ENOMEM;
+	}
+
+	/* Fill the runtime structure */
+	match_index = indices.match_start;
+	node_array = (uint64_t *)((uintptr_t)mem +
+		RTE_ALIGN(data_index_sz, RTE_CACHE_LINE_SIZE));
+
+	/*
+	 * Setup the NOMATCH node (a SINGLE at the
+	 * highest index, that points to itself)
+	 */
+
+	node_array[RTE_ACL_DFA_SIZE] = RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE;
+
+	for (n = 0; n < RTE_ACL_DFA_SIZE; n++)
+		node_array[n] = no_match;
+
+	/* NOMATCH result at index 0 */
+	match = ((struct rte_acl_match_results *)(node_array + match_index));
+	memset(match, 0, sizeof(*match));
+
+	for (n = 0; n < num_tries; n++) {
+
+		acl_gen_node(node_bld_trie[n].trie, node_array, no_match,
+			&indices, num_categories);
+
+		if (node_bld_trie[n].trie->node_index == no_match)
+			trie[n].root_index = 0;
+		else
+			trie[n].root_index = node_bld_trie[n].trie->node_index;
+	}
+
+	ctx->mem = mem;
+	ctx->mem_sz = total_size;
+	ctx->data_indexes = mem;
+	ctx->num_tries = num_tries;
+	ctx->num_categories = num_categories;
+	ctx->match_index = match_index;
+	ctx->no_match = no_match;
+	ctx->idle = node_array[RTE_ACL_DFA_SIZE];
+	ctx->trans_table = node_array;
+	memcpy(ctx->trie, trie, sizeof(ctx->trie));
+
+	acl_gen_log_stats(ctx, &counts, &indices, max_size);
+	return 0;
+}
diff --git a/lib/librte_acl/acl_run.h b/lib/librte_acl/acl_run.h
new file mode 100644
index 00000000..b2fc42c6
--- /dev/null
+++ b/lib/librte_acl/acl_run.h
@@ -0,0 +1,263 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_ACL_RUN_H_
+#define	_ACL_RUN_H_
+
+#include <rte_acl.h>
+#include "acl.h"
+
+#define MAX_SEARCHES_AVX16	16
+#define MAX_SEARCHES_SSE8	8
+#define MAX_SEARCHES_SSE4	4
+#define MAX_SEARCHES_SCALAR	2
+
+#define GET_NEXT_4BYTES(prm, idx)	\
+	(*((const int32_t *)((prm)[(idx)].data + *(prm)[idx].data_index++)))
+
+
+#define RTE_ACL_NODE_INDEX	((uint32_t)~RTE_ACL_NODE_TYPE)
+
+#define	SCALAR_QRANGE_MULT	0x01010101
+#define	SCALAR_QRANGE_MASK	0x7f7f7f7f
+#define	SCALAR_QRANGE_MIN	0x80808080
+
+/*
+ * Structure to manage N parallel trie traversals.
+ * The runtime trie traversal routines can process 8, 4, or 2 tries
+ * in parallel. Each packet may require multiple trie traversals (up to 4).
+ * This structure is used to fill the slots (0 to n-1) for parallel processing
+ * with the trie traversals needed for each packet.
+ */
+struct acl_flow_data {
+	uint32_t            num_packets;
+	/* number of packets processed */
+	uint32_t            started;
+	/* number of trie traversals in progress */
+	uint32_t            trie;
+	/* current trie index (0 to N-1) */
+	uint32_t            cmplt_size;
+	uint32_t            total_packets;
+	uint32_t            categories;
+	/* number of result categories per packet. */
+	/* maximum number of packets to process */
+	const uint64_t     *trans;
+	const uint8_t     **data;
+	uint32_t           *results;
+	struct completion  *last_cmplt;
+	struct completion  *cmplt_array;
+};
+
+/*
+ * Structure to maintain running results for
+ * a single packet (up to 4 tries).
+ */
+struct completion {
+	uint32_t *results;                          /* running results. */
+	int32_t   priority[RTE_ACL_MAX_CATEGORIES]; /* running priorities. */
+	uint32_t  count;                            /* num of remaining tries */
+	/* true for allocated struct */
+} __attribute__((aligned(XMM_SIZE)));
+
+/*
+ * One parms structure for each slot in the search engine.
+ */
+struct parms {
+	const uint8_t              *data;
+	/* input data for this packet */
+	const uint32_t             *data_index;
+	/* data indirection for this trie */
+	struct completion          *cmplt;
+	/* completion data for this packet */
+};
+
+/*
+ * Define an global idle node for unused engine slots
+ */
+static const uint32_t idle[UINT8_MAX + 1];
+
+/*
+ * Allocate a completion structure to manage the tries for a packet.
+ */
+static inline struct completion *
+alloc_completion(struct completion *p, uint32_t size, uint32_t tries,
+	uint32_t *results)
+{
+	uint32_t n;
+
+	for (n = 0; n < size; n++) {
+
+		if (p[n].count == 0) {
+
+			/* mark as allocated and set number of tries. */
+			p[n].count = tries;
+			p[n].results = results;
+			return &(p[n]);
+		}
+	}
+
+	/* should never get here */
+	return NULL;
+}
+
+/*
+ * Resolve priority for a single result trie.
+ */
+static inline void
+resolve_single_priority(uint64_t transition, int n,
+	const struct rte_acl_ctx *ctx, struct parms *parms,
+	const struct rte_acl_match_results *p)
+{
+	if (parms[n].cmplt->count == ctx->num_tries ||
+			parms[n].cmplt->priority[0] <=
+			p[transition].priority[0]) {
+
+		parms[n].cmplt->priority[0] = p[transition].priority[0];
+		parms[n].cmplt->results[0] = p[transition].results[0];
+	}
+}
+
+/*
+ * Routine to fill a slot in the parallel trie traversal array (parms) from
+ * the list of packets (flows).
+ */
+static inline uint64_t
+acl_start_next_trie(struct acl_flow_data *flows, struct parms *parms, int n,
+	const struct rte_acl_ctx *ctx)
+{
+	uint64_t transition;
+
+	/* if there are any more packets to process */
+	if (flows->num_packets < flows->total_packets) {
+		parms[n].data = flows->data[flows->num_packets];
+		parms[n].data_index = ctx->trie[flows->trie].data_index;
+
+		/* if this is the first trie for this packet */
+		if (flows->trie == 0) {
+			flows->last_cmplt = alloc_completion(flows->cmplt_array,
+				flows->cmplt_size, ctx->num_tries,
+				flows->results +
+				flows->num_packets * flows->categories);
+		}
+
+		/* set completion parameters and starting index for this slot */
+		parms[n].cmplt = flows->last_cmplt;
+		transition =
+			flows->trans[parms[n].data[*parms[n].data_index++] +
+			ctx->trie[flows->trie].root_index];
+
+		/*
+		 * if this is the last trie for this packet,
+		 * then setup next packet.
+		 */
+		flows->trie++;
+		if (flows->trie >= ctx->num_tries) {
+			flows->trie = 0;
+			flows->num_packets++;
+		}
+
+		/* keep track of number of active trie traversals */
+		flows->started++;
+
+	/* no more tries to process, set slot to an idle position */
+	} else {
+		transition = ctx->idle;
+		parms[n].data = (const uint8_t *)idle;
+		parms[n].data_index = idle;
+	}
+	return transition;
+}
+
+static inline void
+acl_set_flow(struct acl_flow_data *flows, struct completion *cmplt,
+	uint32_t cmplt_size, const uint8_t **data, uint32_t *results,
+	uint32_t data_num, uint32_t categories, const uint64_t *trans)
+{
+	flows->num_packets = 0;
+	flows->started = 0;
+	flows->trie = 0;
+	flows->last_cmplt = NULL;
+	flows->cmplt_array = cmplt;
+	flows->total_packets = data_num;
+	flows->categories = categories;
+	flows->cmplt_size = cmplt_size;
+	flows->data = data;
+	flows->results = results;
+	flows->trans = trans;
+}
+
+typedef void (*resolve_priority_t)
+(uint64_t transition, int n, const struct rte_acl_ctx *ctx,
+	struct parms *parms, const struct rte_acl_match_results *p,
+	uint32_t categories);
+
+/*
+ * Detect matches. If a match node transition is found, then this trie
+ * traversal is complete and fill the slot with the next trie
+ * to be processed.
+ */
+static inline uint64_t
+acl_match_check(uint64_t transition, int slot,
+	const struct rte_acl_ctx *ctx, struct parms *parms,
+	struct acl_flow_data *flows, resolve_priority_t resolve_priority)
+{
+	const struct rte_acl_match_results *p;
+
+	p = (const struct rte_acl_match_results *)
+		(flows->trans + ctx->match_index);
+
+	if (transition & RTE_ACL_NODE_MATCH) {
+
+		/* Remove flags from index and decrement active traversals */
+		transition &= RTE_ACL_NODE_INDEX;
+		flows->started--;
+
+		/* Resolve priorities for this trie and running results */
+		if (flows->categories == 1)
+			resolve_single_priority(transition, slot, ctx,
+				parms, p);
+		else
+			resolve_priority(transition, slot, ctx, parms,
+				p, flows->categories);
+
+		/* Count down completed tries for this search request */
+		parms[slot].cmplt->count--;
+
+		/* Fill the slot with the next trie or idle trie */
+		transition = acl_start_next_trie(flows, parms, slot, ctx);
+	}
+
+	return transition;
+}
+
+#endif /* _ACL_RUN_H_ */
diff --git a/lib/librte_acl/acl_run_avx2.c b/lib/librte_acl/acl_run_avx2.c
new file mode 100644
index 00000000..79ebbd6c
--- /dev/null
+++ b/lib/librte_acl/acl_run_avx2.c
@@ -0,0 +1,54 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include "acl_run_avx2.h"
+
+/*
+ * Note, that to be able to use AVX2 classify method,
+ * both compiler and target cpu have to support AVX2 instructions.
+ */
+int
+rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	uint32_t *results, uint32_t num, uint32_t categories)
+{
+	if (likely(num >= MAX_SEARCHES_AVX16))
+		return search_avx2x16(ctx, data, results, num, categories);
+	else if (num >= MAX_SEARCHES_SSE8)
+		return search_sse_8(ctx, data, results, num, categories);
+	else if (num >= MAX_SEARCHES_SSE4)
+		return search_sse_4(ctx, data, results, num, categories);
+	else
+		return rte_acl_classify_scalar(ctx, data, results, num,
+			categories);
+}
diff --git a/lib/librte_acl/acl_run_avx2.h b/lib/librte_acl/acl_run_avx2.h
new file mode 100644
index 00000000..b01a46a5
--- /dev/null
+++ b/lib/librte_acl/acl_run_avx2.h
@@ -0,0 +1,284 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "acl_run_sse.h"
+
+static const rte_ymm_t ymm_match_mask = {
+	.u32 = {
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+	},
+};
+
+static const rte_ymm_t ymm_index_mask = {
+	.u32 = {
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+	},
+};
+
+static const rte_ymm_t ymm_shuffle_input = {
+	.u32 = {
+		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
+		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
+	},
+};
+
+static const rte_ymm_t ymm_ones_16 = {
+	.u16 = {
+		1, 1, 1, 1, 1, 1, 1, 1,
+		1, 1, 1, 1, 1, 1, 1, 1,
+	},
+};
+
+static const rte_ymm_t ymm_range_base = {
+	.u32 = {
+		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
+		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
+	},
+};
+
+/*
+ * Process 8 transitions in parallel.
+ * tr_lo contains low 32 bits for 8 transition.
+ * tr_hi contains high 32 bits for 8 transition.
+ * next_input contains up to 4 input bytes for 8 flows.
+ */
+static inline __attribute__((always_inline)) ymm_t
+transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
+{
+	const int32_t *tr;
+	ymm_t addr;
+
+	tr = (const int32_t *)(uintptr_t)trans;
+
+	/* Calculate the address (array index) for all 8 transitions. */
+	ACL_TR_CALC_ADDR(mm256, 256, addr, ymm_index_mask.y, next_input,
+		ymm_shuffle_input.y, ymm_ones_16.y, ymm_range_base.y,
+		*tr_lo, *tr_hi);
+
+	/* load lower 32 bits of 8 transactions at once. */
+	*tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));
+
+	next_input = _mm256_srli_epi32(next_input, CHAR_BIT);
+
+	/* load high 32 bits of 8 transactions at once. */
+	*tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0]));
+
+	return next_input;
+}
+
+/*
+ * Process matches for  8 flows.
+ * tr_lo contains low 32 bits for 8 transition.
+ * tr_hi contains high 32 bits for 8 transition.
+ */
+static inline void
+acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
+	struct parms *parms, struct acl_flow_data *flows, uint32_t slot,
+	ymm_t matches, ymm_t *tr_lo, ymm_t *tr_hi)
+{
+	ymm_t t0, t1;
+	ymm_t lo, hi;
+	xmm_t l0, l1;
+	uint32_t i;
+	uint64_t tr[MAX_SEARCHES_SSE8];
+
+	l1 = _mm256_extracti128_si256(*tr_lo, 1);
+	l0 = _mm256_castsi256_si128(*tr_lo);
+
+	for (i = 0; i != RTE_DIM(tr) / 2; i++) {
+
+		/*
+		 * Extract low 32bits of each transition.
+		 * That's enough to process the match.
+		 */
+		tr[i] = (uint32_t)_mm_cvtsi128_si32(l0);
+		tr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1);
+
+		l0 = _mm_srli_si128(l0, sizeof(uint32_t));
+		l1 = _mm_srli_si128(l1, sizeof(uint32_t));
+
+		tr[i] = acl_match_check(tr[i], slot + i,
+			ctx, parms, flows, resolve_priority_sse);
+		tr[i + 4] = acl_match_check(tr[i + 4], slot + i + 4,
+			ctx, parms, flows, resolve_priority_sse);
+	}
+
+	/* Collect new transitions into 2 YMM registers. */
+	t0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]);
+	t1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]);
+
+	/* For each transition: put low 32 into tr_lo and high 32 into tr_hi */
+	ACL_TR_HILO(mm256, __m256, t0, t1, lo, hi);
+
+	/* Keep transitions wth NOMATCH intact. */
+	*tr_lo = _mm256_blendv_epi8(*tr_lo, lo, matches);
+	*tr_hi = _mm256_blendv_epi8(*tr_hi, hi, matches);
+}
+
+static inline void
+acl_match_check_avx2x8(const struct rte_acl_ctx *ctx, struct parms *parms,
+	struct acl_flow_data *flows, uint32_t slot,
+	ymm_t *tr_lo, ymm_t *tr_hi, ymm_t match_mask)
+{
+	uint32_t msk;
+	ymm_t matches, temp;
+
+	/* test for match node */
+	temp = _mm256_and_si256(match_mask, *tr_lo);
+	matches = _mm256_cmpeq_epi32(temp, match_mask);
+	msk = _mm256_movemask_epi8(matches);
+
+	while (msk != 0) {
+
+		acl_process_matches_avx2x8(ctx, parms, flows, slot,
+			matches, tr_lo, tr_hi);
+		temp = _mm256_and_si256(match_mask, *tr_lo);
+		matches = _mm256_cmpeq_epi32(temp, match_mask);
+		msk = _mm256_movemask_epi8(matches);
+	}
+}
+
+/*
+ * Execute trie traversal for up to 16 flows in parallel.
+ */
+static inline int
+search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	uint32_t *results, uint32_t total_packets, uint32_t categories)
+{
+	uint32_t n;
+	struct acl_flow_data flows;
+	uint64_t index_array[MAX_SEARCHES_AVX16];
+	struct completion cmplt[MAX_SEARCHES_AVX16];
+	struct parms parms[MAX_SEARCHES_AVX16];
+	ymm_t input[2], tr_lo[2], tr_hi[2];
+	ymm_t t0, t1;
+
+	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
+		total_packets, categories, ctx->trans_table);
+
+	for (n = 0; n < RTE_DIM(cmplt); n++) {
+		cmplt[n].count = 0;
+		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
+	}
+
+	t0 = _mm256_set_epi64x(index_array[5], index_array[4],
+		index_array[1], index_array[0]);
+	t1 = _mm256_set_epi64x(index_array[7], index_array[6],
+		index_array[3], index_array[2]);
+
+	ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[0], tr_hi[0]);
+
+	t0 = _mm256_set_epi64x(index_array[13], index_array[12],
+		index_array[9], index_array[8]);
+	t1 = _mm256_set_epi64x(index_array[15], index_array[14],
+		index_array[11], index_array[10]);
+
+	ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[1], tr_hi[1]);
+
+	 /* Check for any matches. */
+	acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0],
+		ymm_match_mask.y);
+	acl_match_check_avx2x8(ctx, parms, &flows, 8, &tr_lo[1], &tr_hi[1],
+		ymm_match_mask.y);
+
+	while (flows.started > 0) {
+
+		uint32_t in[MAX_SEARCHES_SSE8];
+
+		/* Gather 4 bytes of input data for first 8 flows. */
+		in[0] = GET_NEXT_4BYTES(parms, 0);
+		in[4] = GET_NEXT_4BYTES(parms, 4);
+		in[1] = GET_NEXT_4BYTES(parms, 1);
+		in[5] = GET_NEXT_4BYTES(parms, 5);
+		in[2] = GET_NEXT_4BYTES(parms, 2);
+		in[6] = GET_NEXT_4BYTES(parms, 6);
+		in[3] = GET_NEXT_4BYTES(parms, 3);
+		in[7] = GET_NEXT_4BYTES(parms, 7);
+		input[0] = _mm256_set_epi32(in[7], in[6], in[5], in[4],
+			in[3], in[2], in[1], in[0]);
+
+		/* Gather 4 bytes of input data for last 8 flows. */
+		in[0] = GET_NEXT_4BYTES(parms, 8);
+		in[4] = GET_NEXT_4BYTES(parms, 12);
+		in[1] = GET_NEXT_4BYTES(parms, 9);
+		in[5] = GET_NEXT_4BYTES(parms, 13);
+		in[2] = GET_NEXT_4BYTES(parms, 10);
+		in[6] = GET_NEXT_4BYTES(parms, 14);
+		in[3] = GET_NEXT_4BYTES(parms, 11);
+		in[7] = GET_NEXT_4BYTES(parms, 15);
+		input[1] = _mm256_set_epi32(in[7], in[6], in[5], in[4],
+			in[3], in[2], in[1], in[0]);
+
+		input[0] = transition8(input[0], flows.trans,
+			&tr_lo[0], &tr_hi[0]);
+		input[1] = transition8(input[1], flows.trans,
+			&tr_lo[1], &tr_hi[1]);
+
+		input[0] = transition8(input[0], flows.trans,
+			&tr_lo[0], &tr_hi[0]);
+		input[1] = transition8(input[1], flows.trans,
+			&tr_lo[1], &tr_hi[1]);
+
+		input[0] = transition8(input[0], flows.trans,
+			&tr_lo[0], &tr_hi[0]);
+		input[1] = transition8(input[1], flows.trans,
+			&tr_lo[1], &tr_hi[1]);
+
+		input[0] = transition8(input[0], flows.trans,
+			&tr_lo[0], &tr_hi[0]);
+		input[1] = transition8(input[1], flows.trans,
+			&tr_lo[1], &tr_hi[1]);
+
+		 /* Check for any matches. */
+		acl_match_check_avx2x8(ctx, parms, &flows, 0,
+			&tr_lo[0], &tr_hi[0], ymm_match_mask.y);
+		acl_match_check_avx2x8(ctx, parms, &flows, 8,
+			&tr_lo[1], &tr_hi[1], ymm_match_mask.y);
+	}
+
+	return 0;
+}
diff --git a/lib/librte_acl/acl_run_neon.c b/lib/librte_acl/acl_run_neon.c
new file mode 100644
index 00000000..b0144514
--- /dev/null
+++ b/lib/librte_acl/acl_run_neon.c
@@ -0,0 +1,46 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2015.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "acl_run_neon.h"
+
+int
+rte_acl_classify_neon(const struct rte_acl_ctx *ctx, const uint8_t **data,
+		      uint32_t *results, uint32_t num, uint32_t categories)
+{
+	if (likely(num >= 8))
+		return search_neon_8(ctx, data, results, num, categories);
+	else if (num >= 4)
+		return search_neon_4(ctx, data, results, num, categories);
+	else
+		return rte_acl_classify_scalar(ctx, data, results, num,
+			categories);
+}
diff --git a/lib/librte_acl/acl_run_neon.h b/lib/librte_acl/acl_run_neon.h
new file mode 100644
index 00000000..d233ff00
--- /dev/null
+++ b/lib/librte_acl/acl_run_neon.h
@@ -0,0 +1,289 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2015.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "acl_run.h"
+#include "acl_vect.h"
+
+struct _neon_acl_const {
+	rte_xmm_t xmm_shuffle_input;
+	rte_xmm_t xmm_index_mask;
+	rte_xmm_t range_base;
+} neon_acl_const  __attribute__((aligned(RTE_CACHE_LINE_SIZE))) = {
+	{
+		.u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c}
+	},
+	{
+		.u32 = {RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX}
+	},
+	{
+		.u32 = {0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c}
+	},
+};
+
+/*
+ * Resolve priority for multiple results (neon version).
+ * This consists comparing the priority of the current traversal with the
+ * running set of results for the packet.
+ * For each result, keep a running array of the result (rule number) and
+ * its priority for each category.
+ */
+static inline void
+resolve_priority_neon(uint64_t transition, int n, const struct rte_acl_ctx *ctx,
+		      struct parms *parms,
+		      const struct rte_acl_match_results *p,
+		      uint32_t categories)
+{
+	uint32_t x;
+	int32x4_t results, priority, results1, priority1;
+	uint32x4_t selector;
+	int32_t *saved_results, *saved_priority;
+
+	for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) {
+		saved_results = (int32_t *)(&parms[n].cmplt->results[x]);
+		saved_priority = (int32_t *)(&parms[n].cmplt->priority[x]);
+
+		/* get results and priorities for completed trie */
+		results = vld1q_s32(
+			(const int32_t *)&p[transition].results[x]);
+		priority = vld1q_s32(
+			(const int32_t *)&p[transition].priority[x]);
+
+		/* if this is not the first completed trie */
+		if (parms[n].cmplt->count != ctx->num_tries) {
+			/* get running best results and their priorities */
+			results1 = vld1q_s32(saved_results);
+			priority1 = vld1q_s32(saved_priority);
+
+			/* select results that are highest priority */
+			selector = vcgtq_s32(priority1, priority);
+			results = vbslq_s32(selector, results1, results);
+			priority = vbslq_s32(selector, priority1, priority);
+		}
+
+		/* save running best results and their priorities */
+		vst1q_s32(saved_results, results);
+		vst1q_s32(saved_priority, priority);
+	}
+}
+
+/*
+ * Check for any match in 4 transitions
+ */
+static inline __attribute__((always_inline)) uint32_t
+check_any_match_x4(uint64_t val[])
+{
+	return (val[0] | val[1] | val[2] | val[3]) & RTE_ACL_NODE_MATCH;
+}
+
+static inline __attribute__((always_inline)) void
+acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
+		   struct acl_flow_data *flows, uint64_t transitions[])
+{
+	while (check_any_match_x4(transitions)) {
+		transitions[0] = acl_match_check(transitions[0], slot, ctx,
+			parms, flows, resolve_priority_neon);
+		transitions[1] = acl_match_check(transitions[1], slot + 1, ctx,
+			parms, flows, resolve_priority_neon);
+		transitions[2] = acl_match_check(transitions[2], slot + 2, ctx,
+			parms, flows, resolve_priority_neon);
+		transitions[3] = acl_match_check(transitions[3], slot + 3, ctx,
+			parms, flows, resolve_priority_neon);
+	}
+}
+
+/*
+ * Process 4 transitions (in 2 NEON Q registers) in parallel
+ */
+static inline __attribute__((always_inline)) int32x4_t
+transition4(int32x4_t next_input, const uint64_t *trans, uint64_t transitions[])
+{
+	int32x4x2_t tr_hi_lo;
+	int32x4_t t, in, r;
+	uint32x4_t index_msk, node_type, addr;
+	uint32x4_t dfa_msk, mask, quad_ofs, dfa_ofs;
+
+	/* Move low 32 into tr_hi_lo.val[0] and high 32 into tr_hi_lo.val[1] */
+	tr_hi_lo = vld2q_s32((const int32_t *)transitions);
+
+	/* Calculate the address (array index) for all 4 transitions. */
+
+	index_msk = vld1q_u32((const uint32_t *)&neon_acl_const.xmm_index_mask);
+
+	/* Calc node type and node addr */
+	node_type = vbicq_s32(tr_hi_lo.val[0], index_msk);
+	addr = vandq_s32(tr_hi_lo.val[0], index_msk);
+
+	/* t = 0 */
+	t = veorq_s32(node_type, node_type);
+
+	/* mask for DFA type(0) nodes */
+	dfa_msk = vceqq_u32(node_type, t);
+
+	mask = vld1q_s32((const int32_t *)&neon_acl_const.xmm_shuffle_input);
+	in = vqtbl1q_u8((uint8x16_t)next_input, (uint8x16_t)mask);
+
+	/* DFA calculations. */
+	r = vshrq_n_u32(in, 30); /* div by 64 */
+	mask = vld1q_s32((const int32_t *)&neon_acl_const.range_base);
+	r = vaddq_u8(r, mask);
+	t = vshrq_n_u32(in, 24);
+	r = vqtbl1q_u8((uint8x16_t)tr_hi_lo.val[1], (uint8x16_t)r);
+	dfa_ofs = vsubq_s32(t, r);
+
+	/* QUAD/SINGLE calculations. */
+	t = vcgtq_s8(in, tr_hi_lo.val[1]);
+	t = vabsq_s8(t);
+	t = vpaddlq_u8(t);
+	quad_ofs = vpaddlq_u16(t);
+
+	/* blend DFA and QUAD/SINGLE. */
+	t = vbslq_u8(dfa_msk, dfa_ofs, quad_ofs);
+
+	/* calculate address for next transitions */
+	addr = vaddq_u32(addr, t);
+
+	/* Fill next transitions */
+	transitions[0] = trans[vgetq_lane_u32(addr, 0)];
+	transitions[1] = trans[vgetq_lane_u32(addr, 1)];
+	transitions[2] = trans[vgetq_lane_u32(addr, 2)];
+	transitions[3] = trans[vgetq_lane_u32(addr, 3)];
+
+	return vshrq_n_u32(next_input, CHAR_BIT);
+}
+
+/*
+ * Execute trie traversal with 8 traversals in parallel
+ */
+static inline int
+search_neon_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	      uint32_t *results, uint32_t total_packets, uint32_t categories)
+{
+	int n;
+	struct acl_flow_data flows;
+	uint64_t index_array[8];
+	struct completion cmplt[8];
+	struct parms parms[8];
+	int32x4_t input0, input1;
+
+	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
+		     total_packets, categories, ctx->trans_table);
+
+	for (n = 0; n < 8; n++) {
+		cmplt[n].count = 0;
+		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
+	}
+
+	 /* Check for any matches. */
+	acl_match_check_x4(0, ctx, parms, &flows, &index_array[0]);
+	acl_match_check_x4(4, ctx, parms, &flows, &index_array[4]);
+
+	while (flows.started > 0) {
+		/* Gather 4 bytes of input data for each stream. */
+		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input0, 0);
+		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 4), input1, 0);
+
+		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input0, 1);
+		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 5), input1, 1);
+
+		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input0, 2);
+		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 6), input1, 2);
+
+		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input0, 3);
+		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 7), input1, 3);
+
+		/* Process the 4 bytes of input on each stream. */
+
+		input0 = transition4(input0, flows.trans, &index_array[0]);
+		input1 = transition4(input1, flows.trans, &index_array[4]);
+
+		input0 = transition4(input0, flows.trans, &index_array[0]);
+		input1 = transition4(input1, flows.trans, &index_array[4]);
+
+		input0 = transition4(input0, flows.trans, &index_array[0]);
+		input1 = transition4(input1, flows.trans, &index_array[4]);
+
+		input0 = transition4(input0, flows.trans, &index_array[0]);
+		input1 = transition4(input1, flows.trans, &index_array[4]);
+
+		 /* Check for any matches. */
+		acl_match_check_x4(0, ctx, parms, &flows, &index_array[0]);
+		acl_match_check_x4(4, ctx, parms, &flows, &index_array[4]);
+	}
+
+	return 0;
+}
+
+/*
+ * Execute trie traversal with 4 traversals in parallel
+ */
+static inline int
+search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	      uint32_t *results, int total_packets, uint32_t categories)
+{
+	int n;
+	struct acl_flow_data flows;
+	uint64_t index_array[4];
+	struct completion cmplt[4];
+	struct parms parms[4];
+	int32x4_t input;
+
+	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
+		     total_packets, categories, ctx->trans_table);
+
+	for (n = 0; n < 4; n++) {
+		cmplt[n].count = 0;
+		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
+	}
+
+	/* Check for any matches. */
+	acl_match_check_x4(0, ctx, parms, &flows, index_array);
+
+	while (flows.started > 0) {
+		/* Gather 4 bytes of input data for each stream. */
+		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input, 0);
+		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input, 1);
+		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input, 2);
+		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input, 3);
+
+		/* Process the 4 bytes of input on each stream. */
+		input = transition4(input, flows.trans, index_array);
+		input = transition4(input, flows.trans, index_array);
+		input = transition4(input, flows.trans, index_array);
+		input = transition4(input, flows.trans, index_array);
+
+		/* Check for any matches. */
+		acl_match_check_x4(0, ctx, parms, &flows, index_array);
+	}
+
+	return 0;
+}
diff --git a/lib/librte_acl/acl_run_scalar.c b/lib/librte_acl/acl_run_scalar.c
new file mode 100644
index 00000000..5be216c6
--- /dev/null
+++ b/lib/librte_acl/acl_run_scalar.c
@@ -0,0 +1,192 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "acl_run.h"
+
+/*
+ * Resolve priority for multiple results (scalar version).
+ * This consists comparing the priority of the current traversal with the
+ * running set of results for the packet.
+ * For each result, keep a running array of the result (rule number) and
+ * its priority for each category.
+ */
+static inline void
+resolve_priority_scalar(uint64_t transition, int n,
+	const struct rte_acl_ctx *ctx, struct parms *parms,
+	const struct rte_acl_match_results *p, uint32_t categories)
+{
+	uint32_t i;
+	int32_t *saved_priority;
+	uint32_t *saved_results;
+	const int32_t *priority;
+	const uint32_t *results;
+
+	saved_results = parms[n].cmplt->results;
+	saved_priority = parms[n].cmplt->priority;
+
+	/* results and priorities for completed trie */
+	results = p[transition].results;
+	priority = p[transition].priority;
+
+	/* if this is not the first completed trie */
+	if (parms[n].cmplt->count != ctx->num_tries) {
+		for (i = 0; i < categories; i += RTE_ACL_RESULTS_MULTIPLIER) {
+
+			if (saved_priority[i] <= priority[i]) {
+				saved_priority[i] = priority[i];
+				saved_results[i] = results[i];
+			}
+			if (saved_priority[i + 1] <= priority[i + 1]) {
+				saved_priority[i + 1] = priority[i + 1];
+				saved_results[i + 1] = results[i + 1];
+			}
+			if (saved_priority[i + 2] <= priority[i + 2]) {
+				saved_priority[i + 2] = priority[i + 2];
+				saved_results[i + 2] = results[i + 2];
+			}
+			if (saved_priority[i + 3] <= priority[i + 3]) {
+				saved_priority[i + 3] = priority[i + 3];
+				saved_results[i + 3] = results[i + 3];
+			}
+		}
+	} else {
+		for (i = 0; i < categories; i += RTE_ACL_RESULTS_MULTIPLIER) {
+			saved_priority[i] = priority[i];
+			saved_priority[i + 1] = priority[i + 1];
+			saved_priority[i + 2] = priority[i + 2];
+			saved_priority[i + 3] = priority[i + 3];
+
+			saved_results[i] = results[i];
+			saved_results[i + 1] = results[i + 1];
+			saved_results[i + 2] = results[i + 2];
+			saved_results[i + 3] = results[i + 3];
+		}
+	}
+}
+
+static inline uint32_t
+scan_forward(uint32_t input, uint32_t max)
+{
+	return (input == 0) ? max : rte_bsf32(input);
+}
+
+static inline uint64_t
+scalar_transition(const uint64_t *trans_table, uint64_t transition,
+	uint8_t input)
+{
+	uint32_t addr, index, ranges, x, a, b, c;
+
+	/* break transition into component parts */
+	ranges = transition >> (sizeof(index) * CHAR_BIT);
+	index = transition & ~RTE_ACL_NODE_INDEX;
+	addr = transition ^ index;
+
+	if (index != RTE_ACL_NODE_DFA) {
+		/* calc address for a QRANGE/SINGLE node */
+		c = (uint32_t)input * SCALAR_QRANGE_MULT;
+		a = ranges | SCALAR_QRANGE_MIN;
+		a -= (c & SCALAR_QRANGE_MASK);
+		b = c & SCALAR_QRANGE_MIN;
+		a &= SCALAR_QRANGE_MIN;
+		a ^= (ranges ^ b) & (a ^ b);
+		x = scan_forward(a, 32) >> 3;
+	} else {
+		/* calc address for a DFA node */
+		x = ranges >> (input /
+			RTE_ACL_DFA_GR64_SIZE * RTE_ACL_DFA_GR64_BIT);
+		x &= UINT8_MAX;
+		x = input - x;
+	}
+
+	addr += x;
+
+	/* pickup next transition */
+	transition = *(trans_table + addr);
+	return transition;
+}
+
+int
+rte_acl_classify_scalar(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	uint32_t *results, uint32_t num, uint32_t categories)
+{
+	int n;
+	uint64_t transition0, transition1;
+	uint32_t input0, input1;
+	struct acl_flow_data flows;
+	uint64_t index_array[MAX_SEARCHES_SCALAR];
+	struct completion cmplt[MAX_SEARCHES_SCALAR];
+	struct parms parms[MAX_SEARCHES_SCALAR];
+
+	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, num,
+		categories, ctx->trans_table);
+
+	for (n = 0; n < MAX_SEARCHES_SCALAR; n++) {
+		cmplt[n].count = 0;
+		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
+	}
+
+	transition0 = index_array[0];
+	transition1 = index_array[1];
+
+	while ((transition0 | transition1) & RTE_ACL_NODE_MATCH) {
+		transition0 = acl_match_check(transition0,
+			0, ctx, parms, &flows, resolve_priority_scalar);
+		transition1 = acl_match_check(transition1,
+			1, ctx, parms, &flows, resolve_priority_scalar);
+	}
+
+	while (flows.started > 0) {
+
+		input0 = GET_NEXT_4BYTES(parms, 0);
+		input1 = GET_NEXT_4BYTES(parms, 1);
+
+		for (n = 0; n < 4; n++) {
+
+			transition0 = scalar_transition(flows.trans,
+				transition0, (uint8_t)input0);
+			input0 >>= CHAR_BIT;
+
+			transition1 = scalar_transition(flows.trans,
+				transition1, (uint8_t)input1);
+			input1 >>= CHAR_BIT;
+		}
+
+		while ((transition0 | transition1) & RTE_ACL_NODE_MATCH) {
+			transition0 = acl_match_check(transition0,
+				0, ctx, parms, &flows, resolve_priority_scalar);
+			transition1 = acl_match_check(transition1,
+				1, ctx, parms, &flows, resolve_priority_scalar);
+		}
+	}
+	return 0;
+}
diff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c
new file mode 100644
index 00000000..a5a7d361
--- /dev/null
+++ b/lib/librte_acl/acl_run_sse.c
@@ -0,0 +1,47 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "acl_run_sse.h"
+
+int
+rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	uint32_t *results, uint32_t num, uint32_t categories)
+{
+	if (likely(num >= MAX_SEARCHES_SSE8))
+		return search_sse_8(ctx, data, results, num, categories);
+	else if (num >= MAX_SEARCHES_SSE4)
+		return search_sse_4(ctx, data, results, num, categories);
+	else
+		return rte_acl_classify_scalar(ctx, data, results, num,
+			categories);
+}
diff --git a/lib/librte_acl/acl_run_sse.h b/lib/librte_acl/acl_run_sse.h
new file mode 100644
index 00000000..ad40a674
--- /dev/null
+++ b/lib/librte_acl/acl_run_sse.h
@@ -0,0 +1,357 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "acl_run.h"
+#include "acl_vect.h"
+
+enum {
+	SHUFFLE32_SLOT1 = 0xe5,
+	SHUFFLE32_SLOT2 = 0xe6,
+	SHUFFLE32_SLOT3 = 0xe7,
+	SHUFFLE32_SWAP64 = 0x4e,
+};
+
+static const rte_xmm_t xmm_shuffle_input = {
+	.u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c},
+};
+
+static const rte_xmm_t xmm_ones_16 = {
+	.u16 = {1, 1, 1, 1, 1, 1, 1, 1},
+};
+
+static const rte_xmm_t xmm_match_mask = {
+	.u32 = {
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+	},
+};
+
+static const rte_xmm_t xmm_index_mask = {
+	.u32 = {
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+	},
+};
+
+static const rte_xmm_t xmm_range_base = {
+	.u32 = {
+		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
+	},
+};
+
+/*
+ * Resolve priority for multiple results (sse version).
+ * This consists comparing the priority of the current traversal with the
+ * running set of results for the packet.
+ * For each result, keep a running array of the result (rule number) and
+ * its priority for each category.
+ */
+static inline void
+resolve_priority_sse(uint64_t transition, int n, const struct rte_acl_ctx *ctx,
+	struct parms *parms, const struct rte_acl_match_results *p,
+	uint32_t categories)
+{
+	uint32_t x;
+	xmm_t results, priority, results1, priority1, selector;
+	xmm_t *saved_results, *saved_priority;
+
+	for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) {
+
+		saved_results = (xmm_t *)(&parms[n].cmplt->results[x]);
+		saved_priority =
+			(xmm_t *)(&parms[n].cmplt->priority[x]);
+
+		/* get results and priorities for completed trie */
+		results = _mm_loadu_si128(
+			(const xmm_t *)&p[transition].results[x]);
+		priority = _mm_loadu_si128(
+			(const xmm_t *)&p[transition].priority[x]);
+
+		/* if this is not the first completed trie */
+		if (parms[n].cmplt->count != ctx->num_tries) {
+
+			/* get running best results and their priorities */
+			results1 = _mm_loadu_si128(saved_results);
+			priority1 = _mm_loadu_si128(saved_priority);
+
+			/* select results that are highest priority */
+			selector = _mm_cmpgt_epi32(priority1, priority);
+			results = _mm_blendv_epi8(results, results1, selector);
+			priority = _mm_blendv_epi8(priority, priority1,
+				selector);
+		}
+
+		/* save running best results and their priorities */
+		_mm_storeu_si128(saved_results, results);
+		_mm_storeu_si128(saved_priority, priority);
+	}
+}
+
+/*
+ * Extract transitions from an XMM register and check for any matches
+ */
+static void
+acl_process_matches(xmm_t *indices, int slot, const struct rte_acl_ctx *ctx,
+	struct parms *parms, struct acl_flow_data *flows)
+{
+	uint64_t transition1, transition2;
+
+	/* extract transition from low 64 bits. */
+	transition1 = _mm_cvtsi128_si64(*indices);
+
+	/* extract transition from high 64 bits. */
+	*indices = _mm_shuffle_epi32(*indices, SHUFFLE32_SWAP64);
+	transition2 = _mm_cvtsi128_si64(*indices);
+
+	transition1 = acl_match_check(transition1, slot, ctx,
+		parms, flows, resolve_priority_sse);
+	transition2 = acl_match_check(transition2, slot + 1, ctx,
+		parms, flows, resolve_priority_sse);
+
+	/* update indices with new transitions. */
+	*indices = _mm_set_epi64x(transition2, transition1);
+}
+
+/*
+ * Check for any match in 4 transitions (contained in 2 SSE registers)
+ */
+static inline __attribute__((always_inline)) void
+acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
+	struct acl_flow_data *flows, xmm_t *indices1, xmm_t *indices2,
+	xmm_t match_mask)
+{
+	xmm_t temp;
+
+	/* put low 32 bits of each transition into one register */
+	temp = (xmm_t)_mm_shuffle_ps((__m128)*indices1, (__m128)*indices2,
+		0x88);
+	/* test for match node */
+	temp = _mm_and_si128(match_mask, temp);
+
+	while (!_mm_testz_si128(temp, temp)) {
+		acl_process_matches(indices1, slot, ctx, parms, flows);
+		acl_process_matches(indices2, slot + 2, ctx, parms, flows);
+
+		temp = (xmm_t)_mm_shuffle_ps((__m128)*indices1,
+					(__m128)*indices2,
+					0x88);
+		temp = _mm_and_si128(match_mask, temp);
+	}
+}
+
+/*
+ * Process 4 transitions (in 2 XMM registers) in parallel
+ */
+static inline __attribute__((always_inline)) xmm_t
+transition4(xmm_t next_input, const uint64_t *trans,
+	xmm_t *indices1, xmm_t *indices2)
+{
+	xmm_t addr, tr_lo, tr_hi;
+	uint64_t trans0, trans2;
+
+	/* Shuffle low 32 into tr_lo and high 32 into tr_hi */
+	ACL_TR_HILO(mm, __m128, *indices1, *indices2, tr_lo, tr_hi);
+
+	 /* Calculate the address (array index) for all 4 transitions. */
+	ACL_TR_CALC_ADDR(mm, 128, addr, xmm_index_mask.x, next_input,
+		xmm_shuffle_input.x, xmm_ones_16.x, xmm_range_base.x,
+		tr_lo, tr_hi);
+
+	 /* Gather 64 bit transitions and pack back into 2 registers. */
+
+	trans0 = trans[_mm_cvtsi128_si32(addr)];
+
+	/* get slot 2 */
+
+	/* {x0, x1, x2, x3} -> {x2, x1, x2, x3} */
+	addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT2);
+	trans2 = trans[_mm_cvtsi128_si32(addr)];
+
+	/* get slot 1 */
+
+	/* {x2, x1, x2, x3} -> {x1, x1, x2, x3} */
+	addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT1);
+	*indices1 = _mm_set_epi64x(trans[_mm_cvtsi128_si32(addr)], trans0);
+
+	/* get slot 3 */
+
+	/* {x1, x1, x2, x3} -> {x3, x1, x2, x3} */
+	addr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT3);
+	*indices2 = _mm_set_epi64x(trans[_mm_cvtsi128_si32(addr)], trans2);
+
+	return _mm_srli_epi32(next_input, CHAR_BIT);
+}
+
+/*
+ * Execute trie traversal with 8 traversals in parallel
+ */
+static inline int
+search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	uint32_t *results, uint32_t total_packets, uint32_t categories)
+{
+	int n;
+	struct acl_flow_data flows;
+	uint64_t index_array[MAX_SEARCHES_SSE8];
+	struct completion cmplt[MAX_SEARCHES_SSE8];
+	struct parms parms[MAX_SEARCHES_SSE8];
+	xmm_t input0, input1;
+	xmm_t indices1, indices2, indices3, indices4;
+
+	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
+		total_packets, categories, ctx->trans_table);
+
+	for (n = 0; n < MAX_SEARCHES_SSE8; n++) {
+		cmplt[n].count = 0;
+		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
+	}
+
+	/*
+	 * indices1 contains index_array[0,1]
+	 * indices2 contains index_array[2,3]
+	 * indices3 contains index_array[4,5]
+	 * indices4 contains index_array[6,7]
+	 */
+
+	indices1 = _mm_loadu_si128((xmm_t *) &index_array[0]);
+	indices2 = _mm_loadu_si128((xmm_t *) &index_array[2]);
+
+	indices3 = _mm_loadu_si128((xmm_t *) &index_array[4]);
+	indices4 = _mm_loadu_si128((xmm_t *) &index_array[6]);
+
+	 /* Check for any matches. */
+	acl_match_check_x4(0, ctx, parms, &flows,
+		&indices1, &indices2, xmm_match_mask.x);
+	acl_match_check_x4(4, ctx, parms, &flows,
+		&indices3, &indices4, xmm_match_mask.x);
+
+	while (flows.started > 0) {
+
+		/* Gather 4 bytes of input data for each stream. */
+		input0 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0));
+		input1 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 4));
+
+		input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 1), 1);
+		input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 5), 1);
+
+		input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 2), 2);
+		input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 6), 2);
+
+		input0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 3), 3);
+		input1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 7), 3);
+
+		 /* Process the 4 bytes of input on each stream. */
+
+		input0 = transition4(input0, flows.trans,
+			&indices1, &indices2);
+		input1 = transition4(input1, flows.trans,
+			&indices3, &indices4);
+
+		input0 = transition4(input0, flows.trans,
+			&indices1, &indices2);
+		input1 = transition4(input1, flows.trans,
+			&indices3, &indices4);
+
+		input0 = transition4(input0, flows.trans,
+			&indices1, &indices2);
+		input1 = transition4(input1, flows.trans,
+			&indices3, &indices4);
+
+		input0 = transition4(input0, flows.trans,
+			&indices1, &indices2);
+		input1 = transition4(input1, flows.trans,
+			&indices3, &indices4);
+
+		 /* Check for any matches. */
+		acl_match_check_x4(0, ctx, parms, &flows,
+			&indices1, &indices2, xmm_match_mask.x);
+		acl_match_check_x4(4, ctx, parms, &flows,
+			&indices3, &indices4, xmm_match_mask.x);
+	}
+
+	return 0;
+}
+
+/*
+ * Execute trie traversal with 4 traversals in parallel
+ */
+static inline int
+search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	 uint32_t *results, int total_packets, uint32_t categories)
+{
+	int n;
+	struct acl_flow_data flows;
+	uint64_t index_array[MAX_SEARCHES_SSE4];
+	struct completion cmplt[MAX_SEARCHES_SSE4];
+	struct parms parms[MAX_SEARCHES_SSE4];
+	xmm_t input, indices1, indices2;
+
+	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
+		total_packets, categories, ctx->trans_table);
+
+	for (n = 0; n < MAX_SEARCHES_SSE4; n++) {
+		cmplt[n].count = 0;
+		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
+	}
+
+	indices1 = _mm_loadu_si128((xmm_t *) &index_array[0]);
+	indices2 = _mm_loadu_si128((xmm_t *) &index_array[2]);
+
+	/* Check for any matches. */
+	acl_match_check_x4(0, ctx, parms, &flows,
+		&indices1, &indices2, xmm_match_mask.x);
+
+	while (flows.started > 0) {
+
+		/* Gather 4 bytes of input data for each stream. */
+		input = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0));
+		input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 1), 1);
+		input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 2), 2);
+		input = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 3), 3);
+
+		/* Process the 4 bytes of input on each stream. */
+		input = transition4(input, flows.trans, &indices1, &indices2);
+		input = transition4(input, flows.trans, &indices1, &indices2);
+		input = transition4(input, flows.trans, &indices1, &indices2);
+		input = transition4(input, flows.trans, &indices1, &indices2);
+
+		/* Check for any matches. */
+		acl_match_check_x4(0, ctx, parms, &flows,
+			&indices1, &indices2, xmm_match_mask.x);
+	}
+
+	return 0;
+}
diff --git a/lib/librte_acl/acl_vect.h b/lib/librte_acl/acl_vect.h
new file mode 100644
index 00000000..6cc19997
--- /dev/null
+++ b/lib/librte_acl/acl_vect.h
@@ -0,0 +1,116 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_ACL_VECT_H_
+#define _RTE_ACL_VECT_H_
+
+/**
+ * @file
+ *
+ * RTE ACL SSE/AVX related header.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+ * Takes 2 SIMD registers containing N transitions eachi (tr0, tr1).
+ * Shuffles it into different representation:
+ * lo - contains low 32 bits of given N transitions.
+ * hi - contains high 32 bits of given N transitions.
+ */
+#define	ACL_TR_HILO(P, TC, tr0, tr1, lo, hi)                        do { \
+	lo = (typeof(lo))_##P##_shuffle_ps((TC)(tr0), (TC)(tr1), 0x88);  \
+	hi = (typeof(hi))_##P##_shuffle_ps((TC)(tr0), (TC)(tr1), 0xdd);  \
+} while (0)
+
+
+/*
+ * Calculate the address of the next transition for
+ * all types of nodes. Note that only DFA nodes and range
+ * nodes actually transition to another node. Match
+ * nodes not supposed to be encountered here.
+ * For quad range nodes:
+ * Calculate number of range boundaries that are less than the
+ * input value. Range boundaries for each node are in signed 8 bit,
+ * ordered from -128 to 127.
+ * This is effectively a popcnt of bytes that are greater than the
+ * input byte.
+ * Single nodes are processed in the same ways as quad range nodes.
+*/
+#define ACL_TR_CALC_ADDR(P, S,					\
+	addr, index_mask, next_input, shuffle_input,		\
+	ones_16, range_base, tr_lo, tr_hi)               do {	\
+								\
+	typeof(addr) in, node_type, r, t;			\
+	typeof(addr) dfa_msk, dfa_ofs, quad_ofs;		\
+								\
+	t = _##P##_xor_si##S(index_mask, index_mask);		\
+	in = _##P##_shuffle_epi8(next_input, shuffle_input);	\
+								\
+	/* Calc node type and node addr */			\
+	node_type = _##P##_andnot_si##S(index_mask, tr_lo);	\
+	addr = _##P##_and_si##S(index_mask, tr_lo);		\
+								\
+	/* mask for DFA type(0) nodes */			\
+	dfa_msk = _##P##_cmpeq_epi32(node_type, t);		\
+								\
+	/* DFA calculations. */					\
+	r = _##P##_srli_epi32(in, 30);				\
+	r = _##P##_add_epi8(r, range_base);			\
+	t = _##P##_srli_epi32(in, 24);				\
+	r = _##P##_shuffle_epi8(tr_hi, r);			\
+								\
+	dfa_ofs = _##P##_sub_epi32(t, r);			\
+								\
+	/* QUAD/SINGLE caluclations. */				\
+	t = _##P##_cmpgt_epi8(in, tr_hi);			\
+	t = _##P##_sign_epi8(t, t);				\
+	t = _##P##_maddubs_epi16(t, t);				\
+	quad_ofs = _##P##_madd_epi16(t, ones_16);		\
+								\
+	/* blend DFA and QUAD/SINGLE. */			\
+	t = _##P##_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);	\
+								\
+	/* calculate address for next transitions. */		\
+	addr = _##P##_add_epi32(addr, t);			\
+} while (0)
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_ACL_VECT_H_ */
diff --git a/lib/librte_acl/rte_acl.c b/lib/librte_acl/rte_acl.c
new file mode 100644
index 00000000..4ba9786b
--- /dev/null
+++ b/lib/librte_acl/rte_acl.c
@@ -0,0 +1,393 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rte_acl.h>
+#include "acl.h"
+
+TAILQ_HEAD(rte_acl_list, rte_tailq_entry);
+
+static struct rte_tailq_elem rte_acl_tailq = {
+	.name = "RTE_ACL",
+};
+EAL_REGISTER_TAILQ(rte_acl_tailq)
+
+/*
+ * If the compiler doesn't support AVX2 instructions,
+ * then the dummy one would be used instead for AVX2 classify method.
+ */
+int __attribute__ ((weak))
+rte_acl_classify_avx2(__rte_unused const struct rte_acl_ctx *ctx,
+	__rte_unused const uint8_t **data,
+	__rte_unused uint32_t *results,
+	__rte_unused uint32_t num,
+	__rte_unused uint32_t categories)
+{
+	return -ENOTSUP;
+}
+
+int __attribute__ ((weak))
+rte_acl_classify_sse(__rte_unused const struct rte_acl_ctx *ctx,
+	__rte_unused const uint8_t **data,
+	__rte_unused uint32_t *results,
+	__rte_unused uint32_t num,
+	__rte_unused uint32_t categories)
+{
+	return -ENOTSUP;
+}
+
+int __attribute__ ((weak))
+rte_acl_classify_neon(__rte_unused const struct rte_acl_ctx *ctx,
+	__rte_unused const uint8_t **data,
+	__rte_unused uint32_t *results,
+	__rte_unused uint32_t num,
+	__rte_unused uint32_t categories)
+{
+	return -ENOTSUP;
+}
+
+static const rte_acl_classify_t classify_fns[] = {
+	[RTE_ACL_CLASSIFY_DEFAULT] = rte_acl_classify_scalar,
+	[RTE_ACL_CLASSIFY_SCALAR] = rte_acl_classify_scalar,
+	[RTE_ACL_CLASSIFY_SSE] = rte_acl_classify_sse,
+	[RTE_ACL_CLASSIFY_AVX2] = rte_acl_classify_avx2,
+	[RTE_ACL_CLASSIFY_NEON] = rte_acl_classify_neon,
+};
+
+/* by default, use always available scalar code path. */
+static enum rte_acl_classify_alg rte_acl_default_classify =
+	RTE_ACL_CLASSIFY_SCALAR;
+
+static void
+rte_acl_set_default_classify(enum rte_acl_classify_alg alg)
+{
+	rte_acl_default_classify = alg;
+}
+
+extern int
+rte_acl_set_ctx_classify(struct rte_acl_ctx *ctx, enum rte_acl_classify_alg alg)
+{
+	if (ctx == NULL || (uint32_t)alg >= RTE_DIM(classify_fns))
+		return -EINVAL;
+
+	ctx->alg = alg;
+	return 0;
+}
+
+/*
+ * Select highest available classify method as default one.
+ * Note that CLASSIFY_AVX2 should be set as a default only
+ * if both conditions are met:
+ * at build time compiler supports AVX2 and target cpu supports AVX2.
+ */
+static void __attribute__((constructor))
+rte_acl_init(void)
+{
+	enum rte_acl_classify_alg alg = RTE_ACL_CLASSIFY_DEFAULT;
+
+#if defined(RTE_ARCH_ARM64)
+	alg =  RTE_ACL_CLASSIFY_NEON;
+#elif defined(RTE_ARCH_ARM)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+		alg =  RTE_ACL_CLASSIFY_NEON;
+#else
+#ifdef CC_AVX2_SUPPORT
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2))
+		alg = RTE_ACL_CLASSIFY_AVX2;
+	else if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_1))
+#else
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_1))
+#endif
+		alg = RTE_ACL_CLASSIFY_SSE;
+
+#endif
+	rte_acl_set_default_classify(alg);
+}
+
+int
+rte_acl_classify_alg(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	uint32_t *results, uint32_t num, uint32_t categories,
+	enum rte_acl_classify_alg alg)
+{
+	if (categories != 1 &&
+			((RTE_ACL_RESULTS_MULTIPLIER - 1) & categories) != 0)
+		return -EINVAL;
+
+	return classify_fns[alg](ctx, data, results, num, categories);
+}
+
+int
+rte_acl_classify(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	uint32_t *results, uint32_t num, uint32_t categories)
+{
+	return rte_acl_classify_alg(ctx, data, results, num, categories,
+		ctx->alg);
+}
+
+struct rte_acl_ctx *
+rte_acl_find_existing(const char *name)
+{
+	struct rte_acl_ctx *ctx = NULL;
+	struct rte_acl_list *acl_list;
+	struct rte_tailq_entry *te;
+
+	acl_list = RTE_TAILQ_CAST(rte_acl_tailq.head, rte_acl_list);
+
+	rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK);
+	TAILQ_FOREACH(te, acl_list, next) {
+		ctx = (struct rte_acl_ctx *) te->data;
+		if (strncmp(name, ctx->name, sizeof(ctx->name)) == 0)
+			break;
+	}
+	rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK);
+
+	if (te == NULL) {
+		rte_errno = ENOENT;
+		return NULL;
+	}
+	return ctx;
+}
+
+void
+rte_acl_free(struct rte_acl_ctx *ctx)
+{
+	struct rte_acl_list *acl_list;
+	struct rte_tailq_entry *te;
+
+	if (ctx == NULL)
+		return;
+
+	acl_list = RTE_TAILQ_CAST(rte_acl_tailq.head, rte_acl_list);
+
+	rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
+
+	/* find our tailq entry */
+	TAILQ_FOREACH(te, acl_list, next) {
+		if (te->data == (void *) ctx)
+			break;
+	}
+	if (te == NULL) {
+		rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
+		return;
+	}
+
+	TAILQ_REMOVE(acl_list, te, next);
+
+	rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
+
+	rte_free(ctx->mem);
+	rte_free(ctx);
+	rte_free(te);
+}
+
+struct rte_acl_ctx *
+rte_acl_create(const struct rte_acl_param *param)
+{
+	size_t sz;
+	struct rte_acl_ctx *ctx;
+	struct rte_acl_list *acl_list;
+	struct rte_tailq_entry *te;
+	char name[sizeof(ctx->name)];
+
+	acl_list = RTE_TAILQ_CAST(rte_acl_tailq.head, rte_acl_list);
+
+	/* check that input parameters are valid. */
+	if (param == NULL || param->name == NULL) {
+		rte_errno = EINVAL;
+		return NULL;
+	}
+
+	snprintf(name, sizeof(name), "ACL_%s", param->name);
+
+	/* calculate amount of memory required for pattern set. */
+	sz = sizeof(*ctx) + param->max_rule_num * param->rule_size;
+
+	/* get EAL TAILQ lock. */
+	rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
+
+	/* if we already have one with that name */
+	TAILQ_FOREACH(te, acl_list, next) {
+		ctx = (struct rte_acl_ctx *) te->data;
+		if (strncmp(param->name, ctx->name, sizeof(ctx->name)) == 0)
+			break;
+	}
+
+	/* if ACL with such name doesn't exist, then create a new one. */
+	if (te == NULL) {
+		ctx = NULL;
+		te = rte_zmalloc("ACL_TAILQ_ENTRY", sizeof(*te), 0);
+
+		if (te == NULL) {
+			RTE_LOG(ERR, ACL, "Cannot allocate tailq entry!\n");
+			goto exit;
+		}
+
+		ctx = rte_zmalloc_socket(name, sz, RTE_CACHE_LINE_SIZE, param->socket_id);
+
+		if (ctx == NULL) {
+			RTE_LOG(ERR, ACL,
+				"allocation of %zu bytes on socket %d for %s failed\n",
+				sz, param->socket_id, name);
+			rte_free(te);
+			goto exit;
+		}
+		/* init new allocated context. */
+		ctx->rules = ctx + 1;
+		ctx->max_rules = param->max_rule_num;
+		ctx->rule_sz = param->rule_size;
+		ctx->socket_id = param->socket_id;
+		ctx->alg = rte_acl_default_classify;
+		snprintf(ctx->name, sizeof(ctx->name), "%s", param->name);
+
+		te->data = (void *) ctx;
+
+		TAILQ_INSERT_TAIL(acl_list, te, next);
+	}
+
+exit:
+	rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);
+	return ctx;
+}
+
+static int
+acl_add_rules(struct rte_acl_ctx *ctx, const void *rules, uint32_t num)
+{
+	uint8_t *pos;
+
+	if (num + ctx->num_rules > ctx->max_rules)
+		return -ENOMEM;
+
+	pos = ctx->rules;
+	pos += ctx->rule_sz * ctx->num_rules;
+	memcpy(pos, rules, num * ctx->rule_sz);
+	ctx->num_rules += num;
+
+	return 0;
+}
+
+static int
+acl_check_rule(const struct rte_acl_rule_data *rd)
+{
+	if ((RTE_LEN2MASK(RTE_ACL_MAX_CATEGORIES, typeof(rd->category_mask)) &
+			rd->category_mask) == 0 ||
+			rd->priority > RTE_ACL_MAX_PRIORITY ||
+			rd->priority < RTE_ACL_MIN_PRIORITY ||
+			rd->userdata == RTE_ACL_INVALID_USERDATA)
+		return -EINVAL;
+	return 0;
+}
+
+int
+rte_acl_add_rules(struct rte_acl_ctx *ctx, const struct rte_acl_rule *rules,
+	uint32_t num)
+{
+	const struct rte_acl_rule *rv;
+	uint32_t i;
+	int32_t rc;
+
+	if (ctx == NULL || rules == NULL || 0 == ctx->rule_sz)
+		return -EINVAL;
+
+	for (i = 0; i != num; i++) {
+		rv = (const struct rte_acl_rule *)
+			((uintptr_t)rules + i * ctx->rule_sz);
+		rc = acl_check_rule(&rv->data);
+		if (rc != 0) {
+			RTE_LOG(ERR, ACL, "%s(%s): rule #%u is invalid\n",
+				__func__, ctx->name, i + 1);
+			return rc;
+		}
+	}
+
+	return acl_add_rules(ctx, rules, num);
+}
+
+/*
+ * Reset all rules.
+ * Note that RT structures are not affected.
+ */
+void
+rte_acl_reset_rules(struct rte_acl_ctx *ctx)
+{
+	if (ctx != NULL)
+		ctx->num_rules = 0;
+}
+
+/*
+ * Reset all rules and destroys RT structures.
+ */
+void
+rte_acl_reset(struct rte_acl_ctx *ctx)
+{
+	if (ctx != NULL) {
+		rte_acl_reset_rules(ctx);
+		rte_acl_build(ctx, &ctx->config);
+	}
+}
+
+/*
+ * Dump ACL context to the stdout.
+ */
+void
+rte_acl_dump(const struct rte_acl_ctx *ctx)
+{
+	if (!ctx)
+		return;
+	printf("acl context <%s>@%p\n", ctx->name, ctx);
+	printf("  socket_id=%"PRId32"\n", ctx->socket_id);
+	printf("  alg=%"PRId32"\n", ctx->alg);
+	printf("  max_rules=%"PRIu32"\n", ctx->max_rules);
+	printf("  rule_size=%"PRIu32"\n", ctx->rule_sz);
+	printf("  num_rules=%"PRIu32"\n", ctx->num_rules);
+	printf("  num_categories=%"PRIu32"\n", ctx->num_categories);
+	printf("  num_tries=%"PRIu32"\n", ctx->num_tries);
+}
+
+/*
+ * Dump all ACL contexts to the stdout.
+ */
+void
+rte_acl_list_dump(void)
+{
+	struct rte_acl_ctx *ctx;
+	struct rte_acl_list *acl_list;
+	struct rte_tailq_entry *te;
+
+	acl_list = RTE_TAILQ_CAST(rte_acl_tailq.head, rte_acl_list);
+
+	rte_rwlock_read_lock(RTE_EAL_TAILQ_RWLOCK);
+	TAILQ_FOREACH(te, acl_list, next) {
+		ctx = (struct rte_acl_ctx *) te->data;
+		rte_acl_dump(ctx);
+	}
+	rte_rwlock_read_unlock(RTE_EAL_TAILQ_RWLOCK);
+}
diff --git a/lib/librte_acl/rte_acl.h b/lib/librte_acl/rte_acl.h
new file mode 100644
index 00000000..0979a098
--- /dev/null
+++ b/lib/librte_acl/rte_acl.h
@@ -0,0 +1,388 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_ACL_H_
+#define _RTE_ACL_H_
+
+/**
+ * @file
+ *
+ * RTE Classifier.
+ */
+
+#include <rte_acl_osdep.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	RTE_ACL_MAX_CATEGORIES	16
+
+#define	RTE_ACL_RESULTS_MULTIPLIER	(XMM_SIZE / sizeof(uint32_t))
+
+#define RTE_ACL_MAX_LEVELS 64
+#define RTE_ACL_MAX_FIELDS 64
+
+union rte_acl_field_types {
+	uint8_t  u8;
+	uint16_t u16;
+	uint32_t u32;
+	uint64_t u64;
+};
+
+enum {
+	RTE_ACL_FIELD_TYPE_MASK = 0,
+	RTE_ACL_FIELD_TYPE_RANGE,
+	RTE_ACL_FIELD_TYPE_BITMASK
+};
+
+/**
+ * ACL Field definition.
+ * Each field in the ACL rule has an associate definition.
+ * It defines the type of field, its size, its offset in the input buffer,
+ * the field index, and the input index.
+ * For performance reasons, the inner loop of the search function is unrolled
+ * to process four input bytes at a time. This requires the input to be grouped
+ * into sets of 4 consecutive bytes. The loop processes the first input byte as
+ * part of the setup and then subsequent bytes must be in groups of 4
+ * consecutive bytes.
+ */
+struct rte_acl_field_def {
+	uint8_t  type;        /**< type - RTE_ACL_FIELD_TYPE_*. */
+	uint8_t	 size;        /**< size of field 1,2,4, or 8. */
+	uint8_t	 field_index; /**< index of field inside the rule. */
+	uint8_t  input_index; /**< 0-N input index. */
+	uint32_t offset;      /**< offset to start of field. */
+};
+
+/**
+ * ACL build configuration.
+ * Defines the fields of an ACL trie and number of categories to build with.
+ */
+struct rte_acl_config {
+	uint32_t num_categories; /**< Number of categories to build with. */
+	uint32_t num_fields;     /**< Number of field definitions. */
+	struct rte_acl_field_def defs[RTE_ACL_MAX_FIELDS];
+	/**< array of field definitions. */
+	size_t max_size;
+	/**< max memory limit for internal run-time structures. */
+};
+
+/**
+ * Defines the value of a field for a rule.
+ */
+struct rte_acl_field {
+	union rte_acl_field_types value;
+	/**< a 1,2,4, or 8 byte value of the field. */
+	union rte_acl_field_types mask_range;
+	/**<
+	 * depending on field type:
+	 * mask -> 1.2.3.4/32 value=0x1020304, mask_range=32,
+	 * range -> 0 : 65535 value=0, mask_range=65535,
+	 * bitmask -> 0x06/0xff value=6, mask_range=0xff.
+	 */
+};
+
+enum {
+	RTE_ACL_TYPE_SHIFT = 29,
+	RTE_ACL_MAX_INDEX = RTE_LEN2MASK(RTE_ACL_TYPE_SHIFT, uint32_t),
+	RTE_ACL_MAX_PRIORITY = RTE_ACL_MAX_INDEX,
+	RTE_ACL_MIN_PRIORITY = 0,
+};
+
+#define	RTE_ACL_INVALID_USERDATA	0
+
+#define	RTE_ACL_MASKLEN_TO_BITMASK(v, s)	\
+((v) == 0 ? (v) : (typeof(v))((uint64_t)-1 << ((s) * CHAR_BIT - (v))))
+
+/**
+ * Miscellaneous data for ACL rule.
+ */
+struct rte_acl_rule_data {
+	uint32_t category_mask; /**< Mask of categories for that rule. */
+	int32_t  priority;      /**< Priority for that rule. */
+	uint32_t userdata;      /**< Associated with the rule user data. */
+};
+
+/**
+ * Defines single ACL rule.
+ * data - miscellaneous data for the rule.
+ * field[] - value and mask or range for each field.
+ */
+#define	RTE_ACL_RULE_DEF(name, fld_num)	struct name {\
+	struct rte_acl_rule_data data;               \
+	struct rte_acl_field field[fld_num];         \
+}
+
+RTE_ACL_RULE_DEF(rte_acl_rule, 0);
+
+#define	RTE_ACL_RULE_SZ(fld_num)	\
+	(sizeof(struct rte_acl_rule) + sizeof(struct rte_acl_field) * (fld_num))
+
+
+/** Max number of characters in name.*/
+#define	RTE_ACL_NAMESIZE		32
+
+/**
+ * Parameters used when creating the ACL context.
+ */
+struct rte_acl_param {
+	const char *name;         /**< Name of the ACL context. */
+	int         socket_id;    /**< Socket ID to allocate memory for. */
+	uint32_t    rule_size;    /**< Size of each rule. */
+	uint32_t    max_rule_num; /**< Maximum number of rules. */
+};
+
+
+/**
+ * Create a new ACL context.
+ *
+ * @param param
+ *   Parameters used to create and initialise the ACL context.
+ * @return
+ *   Pointer to ACL context structure that is used in future ACL
+ *   operations, or NULL on error, with error code set in rte_errno.
+ *   Possible rte_errno errors include:
+ *   - EINVAL - invalid parameter passed to function
+ */
+struct rte_acl_ctx *
+rte_acl_create(const struct rte_acl_param *param);
+
+/**
+ * Find an existing ACL context object and return a pointer to it.
+ *
+ * @param name
+ *   Name of the ACL context as passed to rte_acl_create()
+ * @return
+ *   Pointer to ACL context or NULL if object not found
+ *   with rte_errno set appropriately. Possible rte_errno values include:
+ *    - ENOENT - value not available for return
+ */
+struct rte_acl_ctx *
+rte_acl_find_existing(const char *name);
+
+/**
+ * De-allocate all memory used by ACL context.
+ *
+ * @param ctx
+ *   ACL context to free
+ */
+void
+rte_acl_free(struct rte_acl_ctx *ctx);
+
+/**
+ * Add rules to an existing ACL context.
+ * This function is not multi-thread safe.
+ *
+ * @param ctx
+ *   ACL context to add patterns to.
+ * @param rules
+ *   Array of rules to add to the ACL context.
+ *   Note that all fields in rte_acl_rule structures are expected
+ *   to be in host byte order.
+ *   Each rule expected to be in the same format and not exceed size
+ *   specified at ACL context creation time.
+ * @param num
+ *   Number of elements in the input array of rules.
+ * @return
+ *   - -ENOMEM if there is no space in the ACL context for these rules.
+ *   - -EINVAL if the parameters are invalid.
+ *   - Zero if operation completed successfully.
+ */
+int
+rte_acl_add_rules(struct rte_acl_ctx *ctx, const struct rte_acl_rule *rules,
+	uint32_t num);
+
+/**
+ * Delete all rules from the ACL context.
+ * This function is not multi-thread safe.
+ * Note that internal run-time structures are not affected.
+ *
+ * @param ctx
+ *   ACL context to delete rules from.
+ */
+void
+rte_acl_reset_rules(struct rte_acl_ctx *ctx);
+
+/**
+ * Analyze set of rules and build required internal run-time structures.
+ * This function is not multi-thread safe.
+ *
+ * @param ctx
+ *   ACL context to build.
+ * @param cfg
+ *   Pointer to struct rte_acl_config - defines build parameters.
+ * @return
+ *   - -ENOMEM if couldn't allocate enough memory.
+ *   - -EINVAL if the parameters are invalid.
+ *   - Negative error code if operation failed.
+ *   - Zero if operation completed successfully.
+ */
+int
+rte_acl_build(struct rte_acl_ctx *ctx, const struct rte_acl_config *cfg);
+
+/**
+ * Delete all rules from the ACL context and
+ * destroy all internal run-time structures.
+ * This function is not multi-thread safe.
+ *
+ * @param ctx
+ *   ACL context to reset.
+ */
+void
+rte_acl_reset(struct rte_acl_ctx *ctx);
+
+/**
+ *  Available implementations of ACL classify.
+ */
+enum rte_acl_classify_alg {
+	RTE_ACL_CLASSIFY_DEFAULT = 0,
+	RTE_ACL_CLASSIFY_SCALAR = 1,  /**< generic implementation. */
+	RTE_ACL_CLASSIFY_SSE = 2,     /**< requires SSE4.1 support. */
+	RTE_ACL_CLASSIFY_AVX2 = 3,    /**< requires AVX2 support. */
+	RTE_ACL_CLASSIFY_NEON = 4,    /**< requires NEON support. */
+	RTE_ACL_CLASSIFY_NUM          /* should always be the last one. */
+};
+
+/**
+ * Perform search for a matching ACL rule for each input data buffer.
+ * Each input data buffer can have up to *categories* matches.
+ * That implies that results array should be big enough to hold
+ * (categories * num) elements.
+ * Also categories parameter should be either one or multiple of
+ * RTE_ACL_RESULTS_MULTIPLIER and can't be bigger than RTE_ACL_MAX_CATEGORIES.
+ * If more than one rule is applicable for given input buffer and
+ * given category, then rule with highest priority will be returned as a match.
+ * Note, that it is a caller's responsibility to ensure that input parameters
+ * are valid and point to correct memory locations.
+ *
+ * @param ctx
+ *   ACL context to search with.
+ * @param data
+ *   Array of pointers to input data buffers to perform search.
+ *   Note that all fields in input data buffers supposed to be in network
+ *   byte order (MSB).
+ * @param results
+ *   Array of search results, *categories* results per each input data buffer.
+ * @param num
+ *   Number of elements in the input data buffers array.
+ * @param categories
+ *   Number of maximum possible matches for each input buffer, one possible
+ *   match per category.
+ * @return
+ *   zero on successful completion.
+ *   -EINVAL for incorrect arguments.
+ */
+extern int
+rte_acl_classify(const struct rte_acl_ctx *ctx,
+		 const uint8_t **data,
+		 uint32_t *results, uint32_t num,
+		 uint32_t categories);
+
+/**
+ * Perform search using specified algorithm for a matching ACL rule for
+ * each input data buffer.
+ * Each input data buffer can have up to *categories* matches.
+ * That implies that results array should be big enough to hold
+ * (categories * num) elements.
+ * Also categories parameter should be either one or multiple of
+ * RTE_ACL_RESULTS_MULTIPLIER and can't be bigger than RTE_ACL_MAX_CATEGORIES.
+ * If more than one rule is applicable for given input buffer and
+ * given category, then rule with highest priority will be returned as a match.
+ * Note, that it is a caller's responsibility to ensure that input parameters
+ * are valid and point to correct memory locations.
+ *
+ * @param ctx
+ *   ACL context to search with.
+ * @param data
+ *   Array of pointers to input data buffers to perform search.
+ *   Note that all fields in input data buffers supposed to be in network
+ *   byte order (MSB).
+ * @param results
+ *   Array of search results, *categories* results per each input data buffer.
+ * @param num
+ *   Number of elements in the input data buffers array.
+ * @param categories
+ *   Number of maximum possible matches for each input buffer, one possible
+ *   match per category.
+ * @param alg
+ *   Algorithm to be used for the search.
+ *   It is the caller responsibility to ensure that the value refers to the
+ *   existing algorithm, and that it could be run on the given CPU.
+ * @return
+ *   zero on successful completion.
+ *   -EINVAL for incorrect arguments.
+ */
+extern int
+rte_acl_classify_alg(const struct rte_acl_ctx *ctx,
+		 const uint8_t **data,
+		 uint32_t *results, uint32_t num,
+		 uint32_t categories,
+		 enum rte_acl_classify_alg alg);
+
+/*
+ * Override the default classifier function for a given ACL context.
+ * @param ctx
+ *   ACL context to change classify function for.
+ * @param alg
+ *   New default classify algorithm for given ACL context.
+ *   It is the caller responsibility to ensure that the value refers to the
+ *   existing algorithm, and that it could be run on the given CPU.
+ * @return
+ *   - -EINVAL if the parameters are invalid.
+ *   - Zero if operation completed successfully.
+ */
+extern int
+rte_acl_set_ctx_classify(struct rte_acl_ctx *ctx,
+	enum rte_acl_classify_alg alg);
+
+/**
+ * Dump an ACL context structure to the console.
+ *
+ * @param ctx
+ *   ACL context to dump.
+ */
+void
+rte_acl_dump(const struct rte_acl_ctx *ctx);
+
+/**
+ * Dump all ACL context structures to the console.
+ */
+void
+rte_acl_list_dump(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_ACL_H_ */
diff --git a/lib/librte_acl/rte_acl_osdep.h b/lib/librte_acl/rte_acl_osdep.h
new file mode 100644
index 00000000..41f7e3d4
--- /dev/null
+++ b/lib/librte_acl/rte_acl_osdep.h
@@ -0,0 +1,80 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_ACL_OSDEP_H_
+#define _RTE_ACL_OSDEP_H_
+
+/**
+ * @file
+ *
+ * RTE ACL DPDK/OS dependent file.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <sys/queue.h>
+
+/*
+ * Common defines.
+ */
+
+#define DIM(x) RTE_DIM(x)
+
+#include <rte_common.h>
+#include <rte_vect.h>
+#include <rte_memory.h>
+#include <rte_log.h>
+#include <rte_memcpy.h>
+#include <rte_prefetch.h>
+#include <rte_byteorder.h>
+#include <rte_branch_prediction.h>
+#include <rte_memzone.h>
+#include <rte_malloc.h>
+#include <rte_eal.h>
+#include <rte_eal_memconfig.h>
+#include <rte_per_lcore.h>
+#include <rte_errno.h>
+#include <rte_string_fns.h>
+#include <rte_cpuflags.h>
+#include <rte_log.h>
+#include <rte_debug.h>
+
+#endif /* _RTE_ACL_OSDEP_H_ */
diff --git a/lib/librte_acl/rte_acl_version.map b/lib/librte_acl/rte_acl_version.map
new file mode 100644
index 00000000..b09370a1
--- /dev/null
+++ b/lib/librte_acl/rte_acl_version.map
@@ -0,0 +1,19 @@
+DPDK_2.0 {
+	global:
+
+	rte_acl_add_rules;
+	rte_acl_build;
+	rte_acl_classify;
+	rte_acl_classify_alg;
+	rte_acl_classify_scalar;
+	rte_acl_create;
+	rte_acl_dump;
+	rte_acl_find_existing;
+	rte_acl_free;
+	rte_acl_list_dump;
+	rte_acl_reset;
+	rte_acl_reset_rules;
+	rte_acl_set_ctx_classify;
+
+	local: *;
+};
diff --git a/lib/librte_acl/tb_mem.c b/lib/librte_acl/tb_mem.c
new file mode 100644
index 00000000..157e6080
--- /dev/null
+++ b/lib/librte_acl/tb_mem.c
@@ -0,0 +1,108 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "tb_mem.h"
+
+/*
+ *  Memory management routines for temporary memory.
+ *  That memory is used only during build phase and is released after
+ *  build is finished.
+ *  Note, that tb_pool/tb_alloc() are not supposed to return NULL.
+ *  Instead, in the case of failure to allocate memory,
+ *  it would do siglongjmp(pool->fail).
+ *  It is responsibility of the caller to save the proper context/environment,
+ *  in the pool->fail before calling tb_alloc() for the given pool first time.
+ */
+
+static struct tb_mem_block *
+tb_pool(struct tb_mem_pool *pool, size_t sz)
+{
+	struct tb_mem_block *block;
+	uint8_t *ptr;
+	size_t size;
+
+	size = sz + pool->alignment - 1;
+	block = calloc(1, size + sizeof(*pool->block));
+	if (block == NULL) {
+		RTE_LOG(ERR, MALLOC, "%s(%zu)\n failed, currently allocated "
+			"by pool: %zu bytes\n", __func__, sz, pool->alloc);
+		siglongjmp(pool->fail, -ENOMEM);
+		return NULL;
+	}
+
+	block->pool = pool;
+
+	block->next = pool->block;
+	pool->block = block;
+
+	pool->alloc += size;
+
+	ptr = (uint8_t *)(block + 1);
+	block->mem = RTE_PTR_ALIGN_CEIL(ptr, pool->alignment);
+	block->size = size - (block->mem - ptr);
+
+	return block;
+}
+
+void *
+tb_alloc(struct tb_mem_pool *pool, size_t size)
+{
+	struct tb_mem_block *block;
+	void *ptr;
+	size_t new_sz;
+
+	size = RTE_ALIGN_CEIL(size, pool->alignment);
+
+	block = pool->block;
+	if (block == NULL || block->size < size) {
+		new_sz = (size > pool->min_alloc) ? size : pool->min_alloc;
+		block = tb_pool(pool, new_sz);
+	}
+	ptr = block->mem;
+	block->size -= size;
+	block->mem += size;
+	return ptr;
+}
+
+void
+tb_free_pool(struct tb_mem_pool *pool)
+{
+	struct tb_mem_block *next, *block;
+
+	for (block = pool->block; block != NULL; block = next) {
+		next = block->next;
+		free(block);
+	}
+	pool->block = NULL;
+	pool->alloc = 0;
+}
diff --git a/lib/librte_acl/tb_mem.h b/lib/librte_acl/tb_mem.h
new file mode 100644
index 00000000..ca7af966
--- /dev/null
+++ b/lib/librte_acl/tb_mem.h
@@ -0,0 +1,76 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TB_MEM_H_
+#define _TB_MEM_H_
+
+/**
+ * @file
+ *
+ * RTE ACL temporary (build phase) memory management.
+ * Contains structures and functions to manage temporary (used by build only)
+ * memory. Memory allocated in large blocks to speed 'free' when trie is
+ * destructed (finish of build phase).
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_acl_osdep.h>
+#include <setjmp.h>
+
+struct tb_mem_block {
+	struct tb_mem_block *next;
+	struct tb_mem_pool  *pool;
+	size_t               size;
+	uint8_t             *mem;
+};
+
+struct tb_mem_pool {
+	struct tb_mem_block *block;
+	size_t               alignment;
+	size_t               min_alloc;
+	size_t               alloc;
+	/* jump target in case of memory allocation failure. */
+	sigjmp_buf           fail;
+};
+
+void *tb_alloc(struct tb_mem_pool *pool, size_t size);
+void tb_free_pool(struct tb_mem_pool *pool);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TB_MEM_H_ */