summaryrefslogtreecommitdiffstats
path: root/src/vppinfra
diff options
context:
space:
mode:
Diffstat (limited to 'src/vppinfra')
-rw-r--r--src/vppinfra/string.h76
-rw-r--r--src/vppinfra/vector_avx2.h13
-rw-r--r--src/vppinfra/vector_sse42.h12
3 files changed, 101 insertions, 0 deletions
diff --git a/src/vppinfra/string.h b/src/vppinfra/string.h
index d9cd8fe1af9..4755a9868d6 100644
--- a/src/vppinfra/string.h
+++ b/src/vppinfra/string.h
@@ -214,6 +214,82 @@ memset_s_inline (void *s, rsize_t smax, int c, rsize_t n)
#define clib_memset(s,c,n) memset_s_inline(s,n,c,n)
static_always_inline void
+clib_memcpy_le (u8 * dst, u8 * src, u8 len, u8 max_len)
+{
+#if defined (CLIB_HAVE_VEC256)
+ u8x32 s, d;
+ u8x32 mask = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+ 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ };
+ u8x32 lv = u8x32_splat (len);
+ u8x32 add = u8x32_splat (32);
+
+ s = u8x32_load_unaligned (src);
+ d = u8x32_load_unaligned (dst);
+ d = u8x32_blend (d, s, u8x32_is_greater (lv, mask));
+ u8x32_store_unaligned (d, dst);
+
+ if (max_len <= 32)
+ return;
+
+ mask += add;
+ s = u8x32_load_unaligned (src + 32);
+ d = u8x32_load_unaligned (dst + 32);
+ d = u8x32_blend (d, s, u8x32_is_greater (lv, mask));
+ u8x32_store_unaligned (d, dst + 32);
+
+#elif defined (CLIB_HAVE_VEC128) && !defined (__aarch64__)
+ u8x16 s, d;
+ u8x16 mask = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+ u8x16 lv = u8x16_splat (len);
+ u8x16 add = u8x16_splat (16);
+
+ s = u8x16_load_unaligned (src);
+ d = u8x16_load_unaligned (dst);
+ d = u8x16_blend (d, s, u8x16_is_greater (lv, mask));
+ u8x16_store_unaligned (d, dst);
+
+ if (max_len <= 16)
+ return;
+
+ mask += add;
+ s = u8x16_load_unaligned (src + 16);
+ d = u8x16_load_unaligned (dst + 16);
+ d = u8x16_blend (d, s, u8x16_is_greater (lv, mask));
+ u8x16_store_unaligned (d, dst + 16);
+
+ if (max_len <= 32)
+ return;
+
+ mask += add;
+ s = u8x16_load_unaligned (src + 32);
+ d = u8x16_load_unaligned (dst + 32);
+ d = u8x16_blend (d, s, u8x16_is_greater (lv, mask));
+ u8x16_store_unaligned (d, dst + 32);
+
+ mask += add;
+ s = u8x16_load_unaligned (src + 48);
+ d = u8x16_load_unaligned (dst + 48);
+ d = u8x16_blend (d, s, u8x16_is_greater (lv, mask));
+ u8x16_store_unaligned (d, dst + 48);
+#else
+ clib_memcpy_fast (dst, src, len);
+#endif
+}
+
+static_always_inline void
+clib_memcpy_le64 (u8 * dst, u8 * src, u8 len)
+{
+ clib_memcpy_le (dst, src, len, 64);
+}
+
+static_always_inline void
+clib_memcpy_le32 (u8 * dst, u8 * src, u8 len)
+{
+ clib_memcpy_le (dst, src, len, 32);
+}
+
+static_always_inline void
clib_memset_u64 (void *p, u64 val, uword count)
{
u64 *ptr = p;
diff --git a/src/vppinfra/vector_avx2.h b/src/vppinfra/vector_avx2.h
index 51625618823..b9d6549da99 100644
--- a/src/vppinfra/vector_avx2.h
+++ b/src/vppinfra/vector_avx2.h
@@ -247,6 +247,19 @@ u32x8_scatter_one (u32x8 r, int index, void *p)
*(u32 *) p = r[index];
}
+static_always_inline u8x32
+u8x32_is_greater (u8x32 v1, u8x32 v2)
+{
+ return (u8x32) _mm256_cmpgt_epi8 ((__m256i) v1, (__m256i) v2);
+}
+
+static_always_inline u8x32
+u8x32_blend (u8x32 v1, u8x32 v2, u8x32 mask)
+{
+ return (u8x32) _mm256_blendv_epi8 ((__m256i) v1, (__m256i) v2,
+ (__m256i) mask);
+}
+
#endif /* included_vector_avx2_h */
/*
diff --git a/src/vppinfra/vector_sse42.h b/src/vppinfra/vector_sse42.h
index 5d6a47d3915..ee5b4dcc7df 100644
--- a/src/vppinfra/vector_sse42.h
+++ b/src/vppinfra/vector_sse42.h
@@ -691,6 +691,18 @@ u32x4_scatter_one (u32x4 r, int index, void *p)
*(u32 *) p = r[index];
}
+static_always_inline u8x16
+u8x16_is_greater (u8x16 v1, u8x16 v2)
+{
+ return (u8x16) _mm_cmpgt_epi8 ((__m128i) v1, (__m128i) v2);
+}
+
+static_always_inline u8x16
+u8x16_blend (u8x16 v1, u8x16 v2, u8x16 mask)
+{
+ return (u8x16) _mm_blendv_epi8 ((__m128i) v1, (__m128i) v2, (__m128i) mask);
+}
+
#endif /* included_vector_sse2_h */