diff options
-rw-r--r-- | plugins/lb-plugin/lb/lbhash.h | 2 | ||||
-rw-r--r-- | vlib/vlib/buffer.h | 2 | ||||
-rw-r--r-- | vlib/vlib/node.c | 6 | ||||
-rw-r--r-- | vppinfra/Makefile.am | 1 | ||||
-rw-r--r-- | vppinfra/vppinfra/cpu.c | 4 | ||||
-rw-r--r-- | vppinfra/vppinfra/cpu.h | 14 | ||||
-rw-r--r-- | vppinfra/vppinfra/vector.h | 10 | ||||
-rw-r--r-- | vppinfra/vppinfra/vector_neon.h | 71 |
8 files changed, 100 insertions, 10 deletions
diff --git a/plugins/lb-plugin/lb/lbhash.h b/plugins/lb-plugin/lb/lbhash.h index d47b49828fa..ca3cc143dc2 100644 --- a/plugins/lb-plugin/lb/lbhash.h +++ b/plugins/lb-plugin/lb/lbhash.h @@ -31,7 +31,9 @@ #include <vnet/vnet.h> +#if defined (__SSE4_2__) #include <immintrin.h> +#endif /* * @brief Number of entries per bucket. diff --git a/vlib/vlib/buffer.h b/vlib/vlib/buffer.h index 4ede684aa4b..5f1e62f08c9 100644 --- a/vlib/vlib/buffer.h +++ b/vlib/vlib/buffer.h @@ -56,7 +56,7 @@ #define VLIB_BUFFER_PRE_DATA_SIZE __PRE_DATA_SIZE #endif -#ifdef CLIB_HAVE_VEC128 +#if defined (CLIB_HAVE_VEC128) || defined (__aarch64__) typedef u8x16 vlib_copy_unit_t; #else typedef u64 vlib_copy_unit_t; diff --git a/vlib/vlib/node.c b/vlib/vlib/node.c index 23f7ea0239e..c419a13a487 100644 --- a/vlib/vlib/node.c +++ b/vlib/vlib/node.c @@ -433,9 +433,9 @@ register_node (vlib_main_t * vm, vlib_node_registration_t * r) for (i = 0; i < vec_len (rt->errors); i++) rt->errors[i] = vlib_error_set (n->index, i); - STATIC_ASSERT (sizeof (vlib_node_runtime_t) == 2 * CLIB_CACHE_LINE_BYTES, - "Size of vlib_node_runtime_t must be equal to 2 cachelines"); - ASSERT (vec_len (n->runtime_data) <= sizeof (vlib_node_runtime_t) - + STATIC_ASSERT_SIZEOF (vlib_node_runtime_t, 128); + ASSERT (vec_len (n->runtime_data) <= + sizeof (vlib_node_runtime_t) - STRUCT_OFFSET_OF (vlib_node_runtime_t, runtime_data)); if (vec_len (n->runtime_data) > 0) diff --git a/vppinfra/Makefile.am b/vppinfra/Makefile.am index d0a023e25d3..6183cd7a74e 100644 --- a/vppinfra/Makefile.am +++ b/vppinfra/Makefile.am @@ -214,6 +214,7 @@ nobase_include_HEADERS = \ vppinfra/vector_altivec.h \ vppinfra/vector_funcs.h \ vppinfra/vector_iwmmxt.h \ + vppinfra/vector_neon.h \ vppinfra/vector_sse2.h \ vppinfra/valgrind.h \ vppinfra/vm_unix.h \ diff --git a/vppinfra/vppinfra/cpu.c b/vppinfra/vppinfra/cpu.c index 70b39214817..a26d5c9ae10 100644 --- a/vppinfra/vppinfra/cpu.c +++ b/vppinfra/vppinfra/cpu.c @@ -101,6 +101,8 @@ format_cpu_model_name (u8 * 
s, va_list * args) vec_free (name); return s; +#elif defined(__aarch64__) + return format (s, "armv8"); #else /* ! __x86_64__ */ return format (s, "unknown"); #endif @@ -109,7 +111,7 @@ format_cpu_model_name (u8 * s, va_list * args) u8 * format_cpu_flags (u8 * s, va_list * args) { -#if __x86_64__ +#if defined(__x86_64__) #define _(flag, func, reg, bit) \ if (clib_cpu_supports_ ## flag()) \ s = format (s, #flag " "); diff --git a/vppinfra/vppinfra/cpu.h b/vppinfra/vppinfra/cpu.h index 7d63f85b506..9c149f3fa2a 100644 --- a/vppinfra/vppinfra/cpu.h +++ b/vppinfra/vppinfra/cpu.h @@ -51,8 +51,6 @@ return & fn; \ } -#if __x86_64__ -#include "cpuid.h" #define foreach_x86_64_flags \ _ (sse3, 1, ecx, 0) \ @@ -66,6 +64,9 @@ _ (aes, 1, ecx, 25) \ _ (sha, 7, ebx, 29) \ _ (invariant_tsc, 0x80000007, edx, 8) +#if defined(__x86_64__) +#include "cpuid.h" + static inline int clib_get_cpuid (const u32 lev, u32 * eax, u32 * ebx, u32 * ecx, u32 * edx) { @@ -90,13 +91,18 @@ clib_cpu_supports_ ## flag() \ } foreach_x86_64_flags #undef _ +#else + +#define _(flag, func, reg, bit) \ +static inline int clib_cpu_supports_ ## flag() { return 0; } +foreach_x86_64_flags +#undef _ +#endif #endif format_function_t format_cpu_uarch; format_function_t format_cpu_model_name; format_function_t format_cpu_flags; -#endif - /* * fd.io coding-style-patch-verification: ON * diff --git a/vppinfra/vppinfra/vector.h b/vppinfra/vppinfra/vector.h index 58fc808abcc..491e7cfe547 100644 --- a/vppinfra/vppinfra/vector.h +++ b/vppinfra/vppinfra/vector.h @@ -42,7 +42,7 @@ /* Vector types. 
*/ -#if defined (__MMX__) || defined (__IWMMXT__) +#if defined (__MMX__) || defined (__IWMMXT__) || defined (__aarch64__) #define CLIB_HAVE_VEC64 #endif @@ -63,6 +63,10 @@ #if defined (__aarch64__) || defined (__arm__) typedef unsigned int u32x4 _vector_size (16); +typedef u8 u8x16 _vector_size (16); +typedef u16 u16x8 _vector_size (16); +typedef u32 u32x4 _vector_size (16); +typedef u64 u64x2 _vector_size (16); #endif #ifdef CLIB_HAVE_VEC64 @@ -245,6 +249,10 @@ _(i64, 2); #include <vppinfra/vector_iwmmxt.h> #endif +#if defined (__aarch64__) +#include <vppinfra/vector_neon.h> +#endif + #if (defined(CLIB_HAVE_VEC128) || defined(CLIB_HAVE_VEC64)) #include <vppinfra/vector_funcs.h> #endif diff --git a/vppinfra/vppinfra/vector_neon.h b/vppinfra/vppinfra/vector_neon.h new file mode 100644 index 00000000000..cea5275949f --- /dev/null +++ b/vppinfra/vppinfra/vector_neon.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_vector_neon_h +#define included_vector_neon_h +#include <arm_neon.h> + +/* Splats. 
*/ + +#define u8x16_splat(i) vdupq_n_u8(i) +#define u16x8_splat(i) vdupq_n_u16(i) +#define i16x8_splat(i) vdupq_n_s16(i) +#define u32x4_splat(i) vdupq_n_u32(i) +#define i32x4_splat(i) vdupq_n_s32(i) + +/* Arithmetic */ +#define u16x8_add(a,b) vaddq_u16(a,b) +#define i16x8_add(a,b) vaddq_s16(a,b) +#define u16x8_sub_saturate(a,b) vqsubq_u16(a,b) +#define i16x8_sub_saturate(a,b) vqsubq_s16(a,b) + +#define u16x8_is_equal(a,b) vceqq_u16(a,b) +#define i16x8_is_equal(a,b) vceqq_s16(a,b) + +always_inline u32 +u16x8_zero_byte_mask (u16x8 input) +{ + u8x16 vall_one = vdupq_n_u8 (0x0); + u8x16 res_values = { 0x01, 0x02, 0x04, 0x08, + 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, + 0x10, 0x20, 0x40, 0x80 + }; + + /* input --> [0x80, 0x40, 0x01, 0xf0, ... ] */ + u8x16 test_result = + vreinterpretq_u8_u16 (vceqq_u16 (input, vreinterpretq_u16_u8 (vall_one))); + u8x16 before_merge = vminq_u8 (test_result, res_values); + /*before_merge--> [0x80, 0x00, 0x00, 0x10, ... ] */ + /* u8x16 --> [a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p] */ + /* pair add until we have 2 uint64_t */ + u16x8 merge1 = vpaddlq_u8 (before_merge); + /* u16x8--> [a+b,c+d, e+f,g+h, i+j,k+l, m+n,o+p] */ + u32x4 merge2 = vpaddlq_u16 (merge1); + /* u32x4--> [a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p] */ + u64x2 merge3 = vpaddlq_u32 (merge2); + /* u64x2--> [a+b+c+d+e+f+g+h, i+j+k+l+m+n+o+p] */ + return (u32) (vgetq_lane_u64 (merge3, 1) << 8) + vgetq_lane_u64 (merge3, 0); +} + +#endif /* included_vector_neon_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ |