add CLIB_HAVE_VEC128 with NEON intrinsics (VPP-1127)

Enable CLIB_HAVE_VEC128 if both __aarch64__ and __ARM_NEON are defined, i.e. armv8 only, not armv7. Add more NEON compare intrinsics wrappers. I only add simple intrinsics wrappers. More complex ones can be added later as they are needed, with performance tests on the corresponding feature to back them up. Remove the wrongly added 128-bit definitions that were defined on both armv7 and armv8 without concern for the presence of NEON instructions. Notable corresponding code activations: * MHEAP_FLAG_SMALL_OBJECT_CACHE in mheap.c * ip4 fib mtrie leaves access * enable ixge plugin compilation for aarch64 (conf still disables it by default) Change-Id: I99953823627bdff6f222d232c78aa7b655aaf77a Signed-off-by: Gabriel Ganne <gabriel.ganne@enea.com>
author: Gabriel Ganne <gabriel.ganne@enea.com> 2017-12-05 17:33:37 +0100
committer: Damjan Marion <dmarion.lists@gmail.com> 2018-02-08 20:32:31 +0000
commit: b81831d122d59b55c7d42b39e4fd9e94b7d40aca (patch)
tree: 2427cad202283105a19387ab5f249f7be07acc97 /src/vppinfra
parent: c43b3f986476ffb4506b7115898e809a6e34f601 (diff)
2 files changed, 64 insertions, 8 deletions
diff --git a/src/vppinfra/vector.h b/src/vppinfra/vector.h
index 5da1c19037b..e786275f5d4 100644
--- a/src/vppinfra/vector.h
+++ b/src/vppinfra/vector.h
@@ -46,6 +46,10 @@
 #define CLIB_HAVE_VEC64
 #endif
 
+#if defined (__aarch64__) && defined(__ARM_NEON)
+#define CLIB_HAVE_VEC128
+#endif
+
 #if defined (__SSE2__) && __GNUC__ >= 4
 #define CLIB_HAVE_VEC128
 #endif
@@ -69,14 +73,6 @@
 
 #define _vector_size(n) __attribute__ ((vector_size (n)))
 
-#if defined (__aarch64__) || defined (__arm__)
-typedef unsigned int u32x4 _vector_size (16);
-typedef u8 u8x16 _vector_size (16);
-typedef u16 u16x8 _vector_size (16);
-typedef u32 u32x4 _vector_size (16);
-typedef u64 u64x2 _vector_size (16);
-#endif
-
 #ifdef CLIB_HAVE_VEC64
 /* Signed 64 bit. */
 typedef char i8x8 _vector_size (8);
diff --git a/src/vppinfra/vector_neon.h b/src/vppinfra/vector_neon.h
index cea5275949f..6357d68fabb 100644
--- a/src/vppinfra/vector_neon.h
+++ b/src/vppinfra/vector_neon.h
@@ -31,8 +31,68 @@
 #define u16x8_sub_saturate(a,b) vsubq_u16(a,b)
 #define i16x8_sub_saturate(a,b) vsubq_s16(a,b)
 
+
+/* Compare operations. */
+#define u8x16_is_equal(a,b) vceqq_u8(a,b)
+#define i8x16_is_equal(a,b) vceqq_s8(a,b)
 #define u16x8_is_equal(a,b) vceqq_u16(a,b)
 #define i16x8_is_equal(a,b) vceqq_i16(a,b)
+#define u32x4_is_equal(a,b) vceqq_u32(a,b)
+#define i32x4_is_equal(a,b) vceqq_s32(a,b)
+#define i8x16_is_greater(a,b) vcgtq_s8(a,b)
+#define i16x8_is_greater(a,b) vcgtq_s16(a,b)	/* signed 16-bit lanes, matching the i16x8 operand type */
+#define i32x4_is_greater(a,b) vcgtq_s32(a,b)
+
+always_inline u8x16
+u8x16_is_zero (u8x16 x)
+{
+  u8x16 zero = { 0 };	/* all-zero vector used as the comparison operand */
+  return u8x16_is_equal (x, zero);	/* per-lane compare mask of x against zero */
+}
+
+always_inline u16x8
+u16x8_is_zero (u16x8 x)
+{
+  u16x8 zero = { 0 };	/* all-zero vector used as the comparison operand */
+  return u16x8_is_equal (x, zero);	/* per-lane compare mask of x against zero */
+}
+
+always_inline u32x4
+u32x4_is_zero (u32x4 x)
+{
+  u32x4 zero = { 0 };	/* all-zero vector used as the comparison operand */
+  return u32x4_is_equal (x, zero);	/* per-lane compare mask of x against zero */
+}
+
+/* Converts all ones/zeros compare mask to bitmap: result bit i is the top bit (0x80) of byte lane i of x. */
+always_inline u32
+u8x16_compare_byte_mask (u8x16 x)
+{
+  static int8_t const __attribute__ ((aligned (16))) xr[8] =	/* per-lane shift counts fed to vshl_u8 */
+  {
+  -7, -6, -5, -4, -3, -2, -1, 0};	/* negative count shifts right, so lane i keeps 0x80 >> (7 - i) */
+  uint8x8_t mask_and = vdup_n_u8 (0x80);	/* selects the top bit of every lane */
+  int8x8_t mask_shift = vld1_s8 (xr);
+
+  uint8x8_t lo = vget_low_u8 (x);	/* byte lanes 0..7 */
+  uint8x8_t hi = vget_high_u8 (x);	/* byte lanes 8..15 */
+
+  lo = vand_u8 (lo, mask_and);
+  lo = vshl_u8 (lo, mask_shift);	/* lane i now holds either 0 or (1 << i) */
+
+  hi = vand_u8 (hi, mask_and);
+  hi = vshl_u8 (hi, mask_shift);
+
+  lo = vpadd_u8 (lo, lo);	/* three pairwise adds sum all 8 lanes into lane 0 */
+  lo = vpadd_u8 (lo, lo);
+  lo = vpadd_u8 (lo, lo);
+
+  hi = vpadd_u8 (hi, hi);
+  hi = vpadd_u8 (hi, hi);
+  hi = vpadd_u8 (hi, hi);
+
+  return ((hi[0] << 8) | (lo[0] & 0xff));	/* bits 8..15 from the high half, bits 0..7 from the low half */
+}
 
 always_inline u32
 u16x8_zero_byte_mask (u16x8 input)
author	Gabriel Ganne <gabriel.ganne@enea.com>	2017-12-05 17:33:37 +0100
committer	Damjan Marion <dmarion.lists@gmail.com>	2018-02-08 20:32:31 +0000
commit	b81831d122d59b55c7d42b39e4fd9e94b7d40aca (patch)
tree	2427cad202283105a19387ab5f249f7be07acc97 /src/vppinfra
parent	c43b3f986476ffb4506b7115898e809a6e34f601 (diff)