author     Damjan Marion <damarion@cisco.com>       2021-10-12 20:30:02 +0200
committer  Florin Coras <florin.coras@gmail.com>    2021-11-10 16:45:23 +0000
commit     56f54af21d18f9fdd471b81db77a3942b0aa4d9c (patch)
tree       80c5e9681dc209cdbb3c54d7205bc07ad4379f69 /src/vppinfra
parent     904638f4625c82d166d67870f9cf8088dd29a8b2 (diff)
vppinfra: new memcpy for x86_64
Change-Id: I5a5055580479960ac53e3f989aa188faf57fb05d
Type: improvement
Signed-off-by: Damjan Marion <damarion@cisco.com>
Diffstat (limited to 'src/vppinfra')
-rw-r--r--  src/vppinfra/CMakeLists.txt                    5
-rw-r--r--  src/vppinfra/hash.h                           15
-rw-r--r--  src/vppinfra/memcpy_avx2.h                   249
-rw-r--r--  src/vppinfra/memcpy_avx512.h                 285
-rw-r--r--  src/vppinfra/memcpy_sse3.h                   368
-rw-r--r--  src/vppinfra/memcpy_x86_64.h                 611
-rw-r--r--  src/vppinfra/string.h                         32
-rw-r--r--  src/vppinfra/types.h                          12
-rw-r--r--  src/vppinfra/vector.h                          5
-rw-r--r--  src/vppinfra/vector/test/memcpy_x86_64.c     142
10 files changed, 786 insertions(+), 938 deletions(-)
diff --git a/src/vppinfra/CMakeLists.txt b/src/vppinfra/CMakeLists.txt
index 11d4a5d539b..9f407a10a22 100644
--- a/src/vppinfra/CMakeLists.txt
+++ b/src/vppinfra/CMakeLists.txt
@@ -150,9 +150,7 @@ set(VPPINFRA_HEADERS
maplog.h
math.h
memcpy.h
- memcpy_avx2.h
- memcpy_avx512.h
- memcpy_sse3.h
+ memcpy_x86_64.h
mem.h
mhash.h
mpcap.h
@@ -278,6 +276,7 @@ set(test_files
vector/test/count_equal.c
vector/test/index_to_ptr.c
vector/test/mask_compare.c
+ vector/test/memcpy_x86_64.c
)
add_vpp_executable(test_vector_funcs
diff --git a/src/vppinfra/hash.h b/src/vppinfra/hash.h
index e4a65d21e65..7d4ed04dc4d 100644
--- a/src/vppinfra/hash.h
+++ b/src/vppinfra/hash.h
@@ -278,9 +278,20 @@ uword hash_bytes (void *v);
always_inline void
hash_set_mem_alloc (uword ** h, const void *key, uword v)
{
+ int objsize = __builtin_object_size (key, 0);
size_t ksz = hash_header (*h)->user;
- void *copy = clib_mem_alloc (ksz);
- clib_memcpy_fast (copy, key, ksz);
+ void *copy;
+ if (objsize > 0)
+ {
+ ASSERT (objsize == ksz);
+ copy = clib_mem_alloc (objsize);
+ clib_memcpy_fast (copy, key, objsize);
+ }
+ else
+ {
+ copy = clib_mem_alloc (ksz);
+ clib_memcpy_fast (copy, key, ksz);
+ }
hash_set_mem (*h, copy, v);
}
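
For context on the hash.h hunk above: __builtin_object_size (key, 0) evaluates at compile time to the size of the object key points to when the pointer's origin is visible (hash_set_mem_alloc is always_inline, so this happens at each call site), and to (size_t) -1 otherwise, so the new branch only fires when the key length is a compile-time constant and can be sanity-checked against the hash's configured key size. Below is a minimal sketch of that behaviour, assuming GCC/Clang semantics; key6_t and objsize_of are illustrative names, not part of the patch.

#include <stdio.h>

typedef struct { unsigned char b[6]; } key6_t;

__attribute__ ((noinline)) static long
objsize_of (const void *p)
{
  /* the origin of p is not visible here, so the builtin folds to -1 */
  return (long) __builtin_object_size (p, 0);
}

int
main (void)
{
  key6_t k = { { 1, 2, 3, 4, 5, 6 } };

  /* evaluated where the object is visible: folds to sizeof (key6_t), i.e. 6 */
  printf ("%ld\n", (long) __builtin_object_size (&k, 0));

  /* evaluated behind an opaque pointer: folds to -1 (size unknown) */
  printf ("%ld\n", objsize_of (&k));
  return 0;
}
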
diff --git a/src/vppinfra/memcpy_avx2.h b/src/vppinfra/memcpy_avx2.h
deleted file mode 100644
index ac29d2590a7..00000000000
--- a/src/vppinfra/memcpy_avx2.h
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- * Copyright (c) 2016 Cisco and/or its affiliates.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*-
- * BSD LICENSE
- *
- * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef included_clib_memcpy_avx2_h
-#define included_clib_memcpy_avx2_h
-
-#include <stdint.h>
-#include <x86intrin.h>
-#include <vppinfra/warnings.h>
-
-/* *INDENT-OFF* */
-WARN_OFF (stringop-overflow)
-/* *INDENT-ON* */
-
-static inline void
-clib_mov16 (u8 * dst, const u8 * src)
-{
- __m128i xmm0;
-
- xmm0 = _mm_loadu_si128 ((const __m128i *) src);
- _mm_storeu_si128 ((__m128i *) dst, xmm0);
-}
-
-static inline void
-clib_mov32 (u8 * dst, const u8 * src)
-{
- __m256i ymm0;
-
- ymm0 = _mm256_loadu_si256 ((const __m256i *) src);
- _mm256_storeu_si256 ((__m256i *) dst, ymm0);
-}
-
-static inline void
-clib_mov64 (u8 * dst, const u8 * src)
-{
- clib_mov32 ((u8 *) dst + 0 * 32, (const u8 *) src + 0 * 32);
- clib_mov32 ((u8 *) dst + 1 * 32, (const u8 *) src + 1 * 32);
-}
-
-static inline void
-clib_mov128 (u8 * dst, const u8 * src)
-{
- clib_mov64 ((u8 *) dst + 0 * 64, (const u8 *) src + 0 * 64);
- clib_mov64 ((u8 *) dst + 1 * 64, (const u8 *) src + 1 * 64);
-}
-
-static inline void
-clib_mov128blocks (u8 * dst, const u8 * src, size_t n)
-{
- __m256i ymm0, ymm1, ymm2, ymm3;
-
- while (n >= 128)
- {
- ymm0 =
- _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 0 * 32));
- n -= 128;
- ymm1 =
- _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 1 * 32));
- ymm2 =
- _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 2 * 32));
- ymm3 =
- _mm256_loadu_si256 ((const __m256i *) ((const u8 *) src + 3 * 32));
- src = (const u8 *) src + 128;
- _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 0 * 32), ymm0);
- _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 1 * 32), ymm1);
- _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 2 * 32), ymm2);
- _mm256_storeu_si256 ((__m256i *) ((u8 *) dst + 3 * 32), ymm3);
- dst = (u8 *) dst + 128;
- }
-}
-
-static inline void *
-clib_memcpy_fast_avx2 (void *dst, const void *src, size_t n)
-{
- uword dstu = (uword) dst;
- uword srcu = (uword) src;
- void *ret = dst;
- size_t dstofss;
- size_t bits;
-
- /**
- * Copy less than 16 bytes
- */
- if (n < 16)
- {
- if (n & 0x01)
- {
- *(u8 *) dstu = *(const u8 *) srcu;
- srcu = (uword) ((const u8 *) srcu + 1);
- dstu = (uword) ((u8 *) dstu + 1);
- }
- if (n & 0x02)
- {
- *(u16u *) dstu = *(const u16u *) srcu;
- srcu = (uword) ((const u16u *) srcu + 1);
- dstu = (uword) ((u16u *) dstu + 1);
- }
- if (n & 0x04)
- {
- *(u32u *) dstu = *(const u32u *) srcu;
- srcu = (uword) ((const u32u *) srcu + 1);
- dstu = (uword) ((u32u *) dstu + 1);
- }
- if (n & 0x08)
- {
- *(u64u *) dstu = *(const u64u *) srcu;
- }
- return ret;
- }
-
- /**
- * Fast way when copy size doesn't exceed 512 bytes
- */
- if (n <= 32)
- {
- clib_mov16 ((u8 *) dst, (const u8 *) src);
- clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
- return ret;
- }
- if (n <= 48)
- {
- clib_mov16 ((u8 *) dst, (const u8 *) src);
- clib_mov16 ((u8 *) dst + 16, (const u8 *) src + 16);
- clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
- return ret;
- }
- if (n <= 64)
- {
- clib_mov32 ((u8 *) dst, (const u8 *) src);
- clib_mov32 ((u8 *) dst - 32 + n, (const u8 *) src - 32 + n);
- return ret;
- }
- if (n <= 256)
- {
- if (n >= 128)
- {
- n -= 128;
- clib_mov128 ((u8 *) dst, (const u8 *) src);
- src = (const u8 *) src + 128;
- dst = (u8 *) dst + 128;
- }
- COPY_BLOCK_128_BACK31:
- if (n >= 64)
- {
- n -= 64;
- clib_mov64 ((u8 *) dst, (const u8 *) src);
- src = (const u8 *) src + 64;
- dst = (u8 *) dst + 64;
- }
- if (n > 32)
- {
- clib_mov32 ((u8 *) dst, (const u8 *) src);
- clib_mov32 ((u8 *) dst - 32 + n, (const u8 *) src - 32 + n);
- return ret;
- }
- if (n > 0)
- {
- clib_mov32 ((u8 *) dst - 32 + n, (const u8 *) src - 32 + n);
- }
- return ret;
- }
-
- /**
- * Make store aligned when copy size exceeds 256 bytes
- */
- dstofss = (uword) dst & 0x1F;
- if (dstofss > 0)
- {
- dstofss = 32 - dstofss;
- n -= dstofss;
- clib_mov32 ((u8 *) dst, (const u8 *) src);
- src = (const u8 *) src + dstofss;
- dst = (u8 *) dst + dstofss;
- }
-
- /**
- * Copy 128-byte blocks.
- */
- clib_mov128blocks ((u8 *) dst, (const u8 *) src, n);
- bits = n;
- n = n & 127;
- bits -= n;
- src = (const u8 *) src + bits;
- dst = (u8 *) dst + bits;
-
- /**
- * Copy whatever left
- */
- goto COPY_BLOCK_128_BACK31;
-}
-
-/* *INDENT-OFF* */
-WARN_ON (stringop-overflow)
-/* *INDENT-ON* */
-
-#endif /* included_clib_memcpy_avx2_h */
-
-
-/*
- * fd.io coding-style-patch-verification: ON
- *
- * Local Variables:
- * eval: (c-set-style "gnu")
- * End:
- */
diff --git a/src/vppinfra/memcpy_avx512.h b/src/vppinfra/memcpy_avx512.h
deleted file mode 100644
index 2025070272e..00000000000
--- a/src/vppinfra/memcpy_avx512.h
+++ /dev/null
@@ -1,285 +0,0 @@
-/*
- * Copyright (c) 2016 Cisco and/or its affiliates.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*-
- * BSD LICENSE
- *
- * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef included_clib_memcpy_avx512_h
-#define included_clib_memcpy_avx512_h
-
-#include <stdint.h>
-#include <x86intrin.h>
-#include <vppinfra/warnings.h>
-
-/* *INDENT-OFF* */
-WARN_OFF (stringop-overflow)
-/* *INDENT-ON* */
-
-static inline void
-clib_mov16 (u8 * dst, const u8 * src)
-{
- __m128i xmm0;
-
- xmm0 = _mm_loadu_si128 ((const __m128i *) src);
- _mm_storeu_si128 ((__m128i *) dst, xmm0);
-}
-
-static inline void
-clib_mov32 (u8 * dst, const u8 * src)
-{
- __m256i ymm0;
-
- ymm0 = _mm256_loadu_si256 ((const __m256i *) src);
- _mm256_storeu_si256 ((__m256i *) dst, ymm0);
-}
-
-static inline void
-clib_mov64 (u8 * dst, const u8 * src)
-{
- __m512i zmm0;
-
- zmm0 = _mm512_loadu_si512 ((const void *) src);
- _mm512_storeu_si512 ((void *) dst, zmm0);
-}
-
-static inline void
-clib_mov128 (u8 * dst, const u8 * src)
-{
- clib_mov64 (dst + 0 * 64, src + 0 * 64);
- clib_mov64 (dst + 1 * 64, src + 1 * 64);
-}
-
-static inline void
-clib_mov256 (u8 * dst, const u8 * src)
-{
- clib_mov128 (dst + 0 * 128, src + 0 * 128);
- clib_mov128 (dst + 1 * 128, src + 1 * 128);
-}
-
-static inline void
-clib_mov128blocks (u8 * dst, const u8 * src, size_t n)
-{
- __m512i zmm0, zmm1;
-
- while (n >= 128)
- {
- zmm0 = _mm512_loadu_si512 ((const void *) (src + 0 * 64));
- n -= 128;
- zmm1 = _mm512_loadu_si512 ((const void *) (src + 1 * 64));
- src = src + 128;
- _mm512_storeu_si512 ((void *) (dst + 0 * 64), zmm0);
- _mm512_storeu_si512 ((void *) (dst + 1 * 64), zmm1);
- dst = dst + 128;
- }
-}
-
-static inline void
-clib_mov512blocks (u8 * dst, const u8 * src, size_t n)
-{
- __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
-
- while (n >= 512)
- {
- zmm0 = _mm512_loadu_si512 ((const void *) (src + 0 * 64));
- n -= 512;
- zmm1 = _mm512_loadu_si512 ((const void *) (src + 1 * 64));
- zmm2 = _mm512_loadu_si512 ((const void *) (src + 2 * 64));
- zmm3 = _mm512_loadu_si512 ((const void *) (src + 3 * 64));
- zmm4 = _mm512_loadu_si512 ((const void *) (src + 4 * 64));
- zmm5 = _mm512_loadu_si512 ((const void *) (src + 5 * 64));
- zmm6 = _mm512_loadu_si512 ((const void *) (src + 6 * 64));
- zmm7 = _mm512_loadu_si512 ((const void *) (src + 7 * 64));
- src = src + 512;
- _mm512_storeu_si512 ((void *) (dst + 0 * 64), zmm0);
- _mm512_storeu_si512 ((void *) (dst + 1 * 64), zmm1);
- _mm512_storeu_si512 ((void *) (dst + 2 * 64), zmm2);
- _mm512_storeu_si512 ((void *) (dst + 3 * 64), zmm3);
- _mm512_storeu_si512 ((void *) (dst + 4 * 64), zmm4);
- _mm512_storeu_si512 ((void *) (dst + 5 * 64), zmm5);
- _mm512_storeu_si512 ((void *) (dst + 6 * 64), zmm6);
- _mm512_storeu_si512 ((void *) (dst + 7 * 64), zmm7);
- dst = dst + 512;
- }
-}
-
-static inline void *
-clib_memcpy_fast_avx512 (void *dst, const void *src, size_t n)
-{
- uword dstu = (uword) dst;
- uword srcu = (uword) src;
- void *ret = dst;
- size_t dstofss;
- size_t bits;
-
- /**
- * Copy less than 16 bytes
- */
- if (n < 16)
- {
- if (n & 0x01)
- {
- *(u8 *) dstu = *(const u8 *) srcu;
- srcu = (uword) ((const u8 *) srcu + 1);
- dstu = (uword) ((u8 *) dstu + 1);
- }
- if (n & 0x02)
- {
- *(u16u *) dstu = *(const u16u *) srcu;
- srcu = (uword) ((const u16u *) srcu + 1);
- dstu = (uword) ((u16u *) dstu + 1);
- }
- if (n & 0x04)
- {
- *(u32u *) dstu = *(const u32u *) srcu;
- srcu = (uword) ((const u32u *) srcu + 1);
- dstu = (uword) ((u32u *) dstu + 1);
- }
- if (n & 0x08)
- *(u64u *) dstu = *(const u64u *) srcu;
- return ret;
- }
-
- /**
- * Fast way when copy size doesn't exceed 512 bytes
- */
- if (n <= 32)
- {
- clib_mov16 ((u8 *) dst, (const u8 *) src);
- clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
- return ret;
- }
- if (n <= 64)
- {
- clib_mov32 ((u8 *) dst, (const u8 *) src);
- clib_mov32 ((u8 *) dst - 32 + n, (const u8 *) src - 32 + n);
- return ret;
- }
- if (n <= 512)
- {
- if (n >= 256)
- {
- n -= 256;
- clib_mov256 ((u8 *) dst, (const u8 *) src);
- src = (const u8 *) src + 256;
- dst = (u8 *) dst + 256;
- }
- if (n >= 128)
- {
- n -= 128;
- clib_mov128 ((u8 *) dst, (const u8 *) src);
- src = (const u8 *) src + 128;
- dst = (u8 *) dst + 128;
- }
- COPY_BLOCK_128_BACK63:
- if (n > 64)
- {
- clib_mov64 ((u8 *) dst, (const u8 *) src);
- clib_mov64 ((u8 *) dst - 64 + n, (const u8 *) src - 64 + n);
- return ret;
- }
- if (n > 0)
- clib_mov64 ((u8 *) dst - 64 + n, (const u8 *) src - 64 + n);
- return ret;
- }
-
- /**
- * Make store aligned when copy size exceeds 512 bytes
- */
- dstofss = (uword) dst & 0x3F;
- if (dstofss > 0)
- {
- dstofss = 64 - dstofss;
- n -= dstofss;
- clib_mov64 ((u8 *) dst, (const u8 *) src);
- src = (const u8 *) src + dstofss;
- dst = (u8 *) dst + dstofss;
- }
-
- /**
- * Copy 512-byte blocks.
- * Use copy block function for better instruction order control,
- * which is important when load is unaligned.
- */
- clib_mov512blocks ((u8 *) dst, (const u8 *) src, n);
- bits = n;
- n = n & 511;
- bits -= n;
- src = (const u8 *) src + bits;
- dst = (u8 *) dst + bits;
-
- /**
- * Copy 128-byte blocks.
- * Use copy block function for better instruction order control,
- * which is important when load is unaligned.
- */
- if (n >= 128)
- {
- clib_mov128blocks ((u8 *) dst, (const u8 *) src, n);
- bits = n;
- n = n & 127;
- bits -= n;
- src = (const u8 *) src + bits;
- dst = (u8 *) dst + bits;
- }
-
- /**
- * Copy whatever left
- */
- goto COPY_BLOCK_128_BACK63;
-}
-
-/* *INDENT-OFF* */
-WARN_ON (stringop-overflow)
-/* *INDENT-ON* */
-
-#endif /* included_clib_memcpy_avx512_h */
-
-
-/*
- * fd.io coding-style-patch-verification: ON
- *
- * Local Variables:
- * eval: (c-set-style "gnu")
- * End:
- */
diff --git a/src/vppinfra/memcpy_sse3.h b/src/vppinfra/memcpy_sse3.h
deleted file mode 100644
index 2ad3648a52a..00000000000
--- a/src/vppinfra/memcpy_sse3.h
+++ /dev/null
@@ -1,368 +0,0 @@
-/*
- * Copyright (c) 2016 Cisco and/or its affiliates.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*-
- * BSD LICENSE
- *
- * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef included_clib_memcpy_sse3_h
-#define included_clib_memcpy_sse3_h
-
-#include <stdint.h>
-#include <x86intrin.h>
-#include <vppinfra/warnings.h>
-
-/* *INDENT-OFF* */
-WARN_OFF (stringop-overflow)
-/* *INDENT-ON* */
-
-static inline void
-clib_mov16 (u8 * dst, const u8 * src)
-{
- __m128i xmm0;
-
- xmm0 = _mm_loadu_si128 ((const __m128i *) src);
- _mm_storeu_si128 ((__m128i *) dst, xmm0);
-}
-
-static inline void
-clib_mov32 (u8 * dst, const u8 * src)
-{
- clib_mov16 ((u8 *) dst + 0 * 16, (const u8 *) src + 0 * 16);
- clib_mov16 ((u8 *) dst + 1 * 16, (const u8 *) src + 1 * 16);
-}
-
-static inline void
-clib_mov64 (u8 * dst, const u8 * src)
-{
- clib_mov32 ((u8 *) dst + 0 * 32, (const u8 *) src + 0 * 32);
- clib_mov32 ((u8 *) dst + 1 * 32, (const u8 *) src + 1 * 32);
-}
-
-static inline void
-clib_mov128 (u8 * dst, const u8 * src)
-{
- clib_mov64 ((u8 *) dst + 0 * 64, (const u8 *) src + 0 * 64);
- clib_mov64 ((u8 *) dst + 1 * 64, (const u8 *) src + 1 * 64);
-}
-
-static inline void
-clib_mov256 (u8 * dst, const u8 * src)
-{
- clib_mov128 ((u8 *) dst + 0 * 128, (const u8 *) src + 0 * 128);
- clib_mov128 ((u8 *) dst + 1 * 128, (const u8 *) src + 1 * 128);
-}
-
-/**
- * Macro for copying unaligned block from one location to another with constant load offset,
- * 47 bytes leftover maximum,
- * locations should not overlap.
- * Requirements:
- * - Store is aligned
- * - Load offset is <offset>, which must be immediate value within [1, 15]
- * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
- * - <dst>, <src>, <len> must be variables
- * - __m128i <xmm0> ~ <xmm8> must be pre-defined
- */
-#define CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, offset) \
-({ \
- int tmp; \
- while (len >= 128 + 16 - offset) { \
- xmm0 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 0 * 16)); \
- len -= 128; \
- xmm1 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 1 * 16)); \
- xmm2 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 2 * 16)); \
- xmm3 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 3 * 16)); \
- xmm4 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 4 * 16)); \
- xmm5 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 5 * 16)); \
- xmm6 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 6 * 16)); \
- xmm7 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 7 * 16)); \
- xmm8 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 8 * 16)); \
- src = (const u8 *)src + 128; \
- _mm_storeu_si128((__m128i *)((u8 *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
- _mm_storeu_si128((__m128i *)((u8 *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
- _mm_storeu_si128((__m128i *)((u8 *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
- _mm_storeu_si128((__m128i *)((u8 *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
- _mm_storeu_si128((__m128i *)((u8 *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
- _mm_storeu_si128((__m128i *)((u8 *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
- _mm_storeu_si128((__m128i *)((u8 *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
- _mm_storeu_si128((__m128i *)((u8 *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
- dst = (u8 *)dst + 128; \
- } \
- tmp = len; \
- len = ((len - 16 + offset) & 127) + 16 - offset; \
- tmp -= len; \
- src = (const u8 *)src + tmp; \
- dst = (u8 *)dst + tmp; \
- if (len >= 32 + 16 - offset) { \
- while (len >= 32 + 16 - offset) { \
- xmm0 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 0 * 16)); \
- len -= 32; \
- xmm1 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 1 * 16)); \
- xmm2 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 2 * 16)); \
- src = (const u8 *)src + 32; \
- _mm_storeu_si128((__m128i *)((u8 *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
- _mm_storeu_si128((__m128i *)((u8 *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
- dst = (u8 *)dst + 32; \
- } \
- tmp = len; \
- len = ((len - 16 + offset) & 31) + 16 - offset; \
- tmp -= len; \
- src = (const u8 *)src + tmp; \
- dst = (u8 *)dst + tmp; \
- } \
-})
-
-/**
- * Macro for copying unaligned block from one location to another,
- * 47 bytes leftover maximum,
- * locations should not overlap.
- * Use switch here because the aligning instruction requires immediate value for shift count.
- * Requirements:
- * - Store is aligned
- * - Load offset is <offset>, which must be within [1, 15]
- * - For <src>, make sure <offset> bit backwards & <16 - offset> bit forwards are available for loading
- * - <dst>, <src>, <len> must be variables
- * - __m128i <xmm0> ~ <xmm8> used in CLIB_MVUNALIGN_LEFT47_IMM must be pre-defined
- */
-#define CLIB_MVUNALIGN_LEFT47(dst, src, len, offset) \
-({ \
- switch (offset) { \
- case 0x01: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x01); break; \
- case 0x02: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x02); break; \
- case 0x03: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x03); break; \
- case 0x04: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x04); break; \
- case 0x05: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x05); break; \
- case 0x06: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x06); break; \
- case 0x07: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x07); break; \
- case 0x08: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x08); break; \
- case 0x09: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x09); break; \
- case 0x0A: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x0A); break; \
- case 0x0B: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x0B); break; \
- case 0x0C: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x0C); break; \
- case 0x0D: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x0D); break; \
- case 0x0E: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x0E); break; \
- case 0x0F: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, n, 0x0F); break; \
- default:; \
- } \
-})
-
-static inline void *
-clib_memcpy_fast_sse3 (void *dst, const void *src, size_t n)
-{
- __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
- uword dstu = (uword) dst;
- uword srcu = (uword) src;
- void *ret = dst;
- size_t dstofss;
- size_t srcofs;
-
- /**
- * Copy less than 16 bytes
- */
- if (n < 16)
- {
- if (n & 0x01)
- {
- *(u8 *) dstu = *(const u8 *) srcu;
- srcu = (uword) ((const u8 *) srcu + 1);
- dstu = (uword) ((u8 *) dstu + 1);
- }
- if (n & 0x02)
- {
- *(u16u *) dstu = *(const u16u *) srcu;
- srcu = (uword) ((const u16u *) srcu + 1);
- dstu = (uword) ((u16u *) dstu + 1);
- }
- if (n & 0x04)
- {
- *(u32u *) dstu = *(const u32u *) srcu;
- srcu = (uword) ((const u32u *) srcu + 1);
- dstu = (uword) ((u32u *) dstu + 1);
- }
- if (n & 0x08)
- {
- *(u64u *) dstu = *(const u64u *) srcu;
- }
- return ret;
- }
-
- /**
- * Fast way when copy size doesn't exceed 512 bytes
- */
- if (n <= 32)
- {
- clib_mov16 ((u8 *) dst, (const u8 *) src);
- clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
- return ret;
- }
- if (n <= 48)
- {
- clib_mov32 ((u8 *) dst, (const u8 *) src);
- clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
- return ret;
- }
- if (n <= 64)
- {
- clib_mov32 ((u8 *) dst, (const u8 *) src);
- clib_mov16 ((u8 *) dst + 32, (const u8 *) src + 32);
- clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
- return ret;
- }
- if (n <= 128)
- {
- goto COPY_BLOCK_128_BACK15;
- }
- if (n <= 512)
- {
- if (n >= 256)
- {
- n -= 256;
- clib_mov128 ((u8 *) dst, (const u8 *) src);
- clib_mov128 ((u8 *) dst + 128, (const u8 *) src + 128);
- src = (const u8 *) src + 256;
- dst = (u8 *) dst + 256;
- }
- COPY_BLOCK_255_BACK15:
- if (n >= 128)
- {
- n -= 128;
- clib_mov128 ((u8 *) dst, (const u8 *) src);
- src = (const u8 *) src + 128;
- dst = (u8 *) dst + 128;
- }
- COPY_BLOCK_128_BACK15:
- if (n >= 64)
- {
- n -= 64;
- clib_mov64 ((u8 *) dst, (const u8 *) src);
- src = (const u8 *) src + 64;
- dst = (u8 *) dst + 64;
- }
- COPY_BLOCK_64_BACK15:
- if (n >= 32)
- {
- n -= 32;
- clib_mov32 ((u8 *) dst, (const u8 *) src);
- src = (const u8 *) src + 32;
- dst = (u8 *) dst + 32;
- }
- if (n > 16)
- {
- clib_mov16 ((u8 *) dst, (const u8 *) src);
- clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
- return ret;
- }
- if (n > 0)
- {
- clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
- }
- return ret;
- }
-
- /**
- * Make store aligned when copy size exceeds 512 bytes,
- * and make sure the first 15 bytes are copied, because
- * unaligned copy functions require up to 15 bytes
- * backwards access.
- */
- dstofss = (uword) dst & 0x0F;
- if (dstofss > 0)
- {
- dstofss = 16 - dstofss + 16;
- n -= dstofss;
- clib_mov32 ((u8 *) dst, (const u8 *) src);
- src = (const u8 *) src + dstofss;
- dst = (u8 *) dst + dstofss;
- }
- srcofs = ((uword) src & 0x0F);
-
- /**
- * For aligned copy
- */
- if (srcofs == 0)
- {
- /**
- * Copy 256-byte blocks
- */
- for (; n >= 256; n -= 256)
- {
- clib_mov256 ((u8 *) dst, (const u8 *) src);
- dst = (u8 *) dst + 256;
- src = (const u8 *) src + 256;
- }
-
- /**
- * Copy whatever left
- */
- goto COPY_BLOCK_255_BACK15;
- }
-
- /**
- * For copy with unaligned load
- */
- CLIB_MVUNALIGN_LEFT47 (dst, src, n, srcofs);
-
- /**
- * Copy whatever left
- */
- goto COPY_BLOCK_64_BACK15;
-}
-
-/* *INDENT-OFF* */
-WARN_ON (stringop-overflow)
-/* *INDENT-ON* */
-
-#undef CLIB_MVUNALIGN_LEFT47_IMM
-#undef CLIB_MVUNALIGN_LEFT47
-
-#endif /* included_clib_memcpy_sse3_h */
-
-
-/*
- * fd.io coding-style-patch-verification: ON
- *
- * Local Variables:
- * eval: (c-set-style "gnu")
- * End:
- */
diff --git a/src/vppinfra/memcpy_x86_64.h b/src/vppinfra/memcpy_x86_64.h
new file mode 100644
index 00000000000..9662ab4e7ef
--- /dev/null
+++ b/src/vppinfra/memcpy_x86_64.h
@@ -0,0 +1,611 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Damjan Marion
+ */
+
+#ifndef included_clib_memcpy_x86_64_h
+#define included_clib_memcpy_x86_64_h
+#ifdef __x86_64__
+
+#include <vppinfra/clib.h>
+#include <vppinfra/warnings.h>
+#include <stdio.h>
+
+/* clang-format off */
+WARN_OFF (stringop-overflow)
+/* clang-format on */
+
+static_always_inline void
+clib_memcpy1 (void *d, void *s)
+{
+ *(u8 *) d = *(u8 *) s;
+}
+
+static_always_inline void
+clib_memcpy2 (void *d, void *s)
+{
+ *(u16u *) d = *(u16u *) s;
+}
+
+static_always_inline void
+clib_memcpy4 (void *d, void *s)
+{
+ *(u32u *) d = *(u32u *) s;
+}
+
+static_always_inline void
+clib_memcpy8 (void *d, void *s)
+{
+ *(u64u *) d = *(u64u *) s;
+}
+
+#ifdef CLIB_HAVE_VEC128
+static_always_inline void
+clib_memcpy16 (void *d, void *s)
+{
+ *(u8x16u *) d = *(u8x16u *) s;
+}
+#endif
+
+#ifdef CLIB_HAVE_VEC256
+static_always_inline void
+clib_memcpy32 (void *d, void *s)
+{
+ *(u8x32u *) d = *(u8x32u *) s;
+}
+#endif
+
+#ifdef CLIB_HAVE_VEC512
+static_always_inline void
+clib_memcpy64 (void *d, void *s)
+{
+ *(u8x64u *) d = *(u8x64u *) s;
+}
+#endif
+
+static_always_inline void
+clib_memcpy_const_le32 (u8 *dst, u8 *src, size_t n)
+{
+ switch (n)
+ {
+ case 1:
+ clib_memcpy1 (dst, src);
+ break;
+ case 2:
+ clib_memcpy2 (dst, src);
+ break;
+ case 3:
+ clib_memcpy2 (dst, src);
+ clib_memcpy1 (dst + 2, src + 2);
+ break;
+ case 4:
+ clib_memcpy4 (dst, src);
+ break;
+ case 5:
+ clib_memcpy4 (dst, src);
+ clib_memcpy1 (dst + 4, src + 4);
+ break;
+ case 6:
+ clib_memcpy4 (dst, src);
+ clib_memcpy2 (dst + 4, src + 4);
+ break;
+ case 7:
+ clib_memcpy4 (dst, src);
+ clib_memcpy4 (dst + 3, src + 3);
+ break;
+ case 8:
+ clib_memcpy8 (dst, src);
+ break;
+ case 9:
+ clib_memcpy8 (dst, src);
+ clib_memcpy1 (dst + 8, src + 8);
+ break;
+ case 10:
+ clib_memcpy8 (dst, src);
+ clib_memcpy2 (dst + 8, src + 8);
+ break;
+ case 11:
+ case 12:
+ clib_memcpy8 (dst, src);
+ clib_memcpy4 (dst + n - 4, src + n - 4);
+ break;
+ case 13:
+ case 14:
+ case 15:
+ clib_memcpy8 (dst, src);
+ clib_memcpy8 (dst + n - 8, src + n - 8);
+ break;
+ case 16:
+ clib_memcpy16 (dst, src);
+ break;
+ case 17:
+ clib_memcpy16 (dst, src);
+ clib_memcpy1 (dst + 16, src + 16);
+ break;
+ case 18:
+ clib_memcpy16 (dst, src);
+ clib_memcpy2 (dst + 16, src + 16);
+ break;
+ case 20:
+ clib_memcpy16 (dst, src);
+ clib_memcpy4 (dst + 16, src + 16);
+ break;
+ case 24:
+ clib_memcpy16 (dst, src);
+ clib_memcpy8 (dst + 16, src + 16);
+ break;
+ default:
+ clib_memcpy16 (dst, src);
+ clib_memcpy16 (dst + n - 16, src + n - 16);
+ break;
+ }
+}
+
+static_always_inline void
+clib_memcpy_const_le64 (u8 *dst, u8 *src, size_t n)
+{
+ if (n < 32)
+ {
+ clib_memcpy_const_le32 (dst, src, n);
+ return;
+ }
+
+#if defined(CLIB_HAVE_VEC256)
+ switch (n)
+ {
+ case 32:
+ clib_memcpy32 (dst, src);
+ break;
+ case 33:
+ clib_memcpy32 (dst, src);
+ clib_memcpy1 (dst + 32, src + 32);
+ break;
+ case 34:
+ clib_memcpy32 (dst, src);
+ clib_memcpy2 (dst + 32, src + 32);
+ break;
+ case 36:
+ clib_memcpy32 (dst, src);
+ clib_memcpy4 (dst + 32, src + 32);
+ break;
+ case 40:
+ clib_memcpy32 (dst, src);
+ clib_memcpy8 (dst + 32, src + 32);
+ break;
+ case 48:
+ clib_memcpy32 (dst, src);
+ clib_memcpy16 (dst + 32, src + 32);
+ break;
+ default:
+ clib_memcpy32 (dst, src);
+ clib_memcpy32 (dst + n - 32, src + n - 32);
+ break;
+ }
+#else
+ while (n > 31)
+ {
+ clib_memcpy16 (dst, src);
+ clib_memcpy16 (dst + 16, src + 16);
+ dst += 32;
+ src += 32;
+ n -= 32;
+ }
+ clib_memcpy_const_le32 (dst, src, n);
+#endif
+}
+
+static_always_inline void
+clib_memcpy_x86_64_const (u8 *dst, u8 *src, size_t n)
+{
+#if defined(CLIB_HAVE_VEC512)
+ while (n > 128)
+ {
+ clib_memcpy64 (dst, src);
+ dst += 64;
+ src += 64;
+ n -= 64;
+ }
+
+ if (n < 64)
+ {
+ clib_memcpy_const_le64 (dst, src, n);
+ return;
+ }
+
+ switch (n)
+ {
+ case 64:
+ clib_memcpy64 (dst, src);
+ break;
+ case 65:
+ clib_memcpy64 (dst, src);
+ clib_memcpy1 (dst + 64, src + 64);
+ break;
+ case 66:
+ clib_memcpy64 (dst, src);
+ clib_memcpy2 (dst + 64, src + 64);
+ break;
+ case 68:
+ clib_memcpy64 (dst, src);
+ clib_memcpy4 (dst + 64, src + 64);
+ break;
+ case 72:
+ clib_memcpy64 (dst, src);
+ clib_memcpy8 (dst + 64, src + 64);
+ break;
+ case 80:
+ clib_memcpy64 (dst, src);
+ clib_memcpy16 (dst + 64, src + 64);
+ break;
+ case 96:
+ clib_memcpy64 (dst, src);
+ clib_memcpy32 (dst + 64, src + 64);
+ break;
+ default:
+ clib_memcpy64 (dst, src);
+ clib_memcpy64 (dst + n - 64, src + n - 64);
+ break;
+ }
+#elif defined(CLIB_HAVE_VEC256)
+ while (n > 64)
+ {
+ clib_memcpy32 (dst, src);
+ dst += 32;
+ src += 32;
+ n -= 32;
+ }
+ clib_memcpy_const_le64 (dst, src, n);
+#else
+ while (n > 32)
+ {
+ clib_memcpy16 (dst, src);
+ dst += 16;
+ src += 16;
+ n -= 16;
+ }
+ clib_memcpy_const_le32 (dst, src, n);
+#endif
+}
+
+static_always_inline void *
+clib_memcpy_x86_64 (void *restrict dst, const void *restrict src, size_t n)
+{
+ u8 *d = (u8 *) dst, *s = (u8 *) src;
+
+ if (n == 0)
+ return dst;
+
+ if (COMPILE_TIME_CONST (n))
+ {
+ if (n)
+ clib_memcpy_x86_64_const (d, s, n);
+ return dst;
+ }
+
+ if (n <= 32)
+ {
+#if defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
+ u32 mask = pow2_mask (n);
+ u8x32_mask_store (u8x32_mask_load_zero (s, mask), d, mask);
+#else
+ if (PREDICT_TRUE (n >= 16))
+ {
+ clib_memcpy16 (d, s);
+ clib_memcpy16 (d + n - 16, s + n - 16);
+ }
+ else if (PREDICT_TRUE (n >= 8))
+ {
+ clib_memcpy8 (d, s);
+ clib_memcpy8 (d + n - 8, s + n - 8);
+ }
+ else if (PREDICT_TRUE (n >= 4))
+ {
+ clib_memcpy4 (d, s);
+ clib_memcpy4 (d + n - 4, s + n - 4);
+ }
+ else if (PREDICT_TRUE (n > 1))
+ {
+ clib_memcpy2 (d, s);
+ clib_memcpy2 (d + n - 2, s + n - 2);
+ }
+ else
+ clib_memcpy1 (d, s);
+#endif
+ }
+#ifdef CLIB_HAVE_VEC512
+ else
+ {
+ u8x64 v0, v1, v2, v3;
+ u64 final_off, nr, off = 64;
+
+ if (n <= 64)
+ {
+ n -= 32;
+ u8x32_store_unaligned (u8x32_load_unaligned (s), d);
+ u8x32_store_unaligned (u8x32_load_unaligned (s + n), d + n);
+ return dst;
+ }
+
+ u8x64_store_unaligned (u8x64_load_unaligned (s), d);
+
+ if (n <= 128)
+ goto done2;
+
+ if (n <= 192)
+ goto one;
+
+ if (n <= 512 + 64)
+ {
+ nr = round_pow2 (n - 128, 64);
+ goto last;
+ }
+
+ off -= ((u64) d) & 0x3f;
+ nr = round_pow2 (n - off - 64, 64);
+ final_off = (nr & ~(u64) 0x1ff) + off;
+
+ more:
+ v0 = u8x64_load_unaligned (s + off + 0x000);
+ v1 = u8x64_load_unaligned (s + off + 0x040);
+ v2 = u8x64_load_unaligned (s + off + 0x080);
+ v3 = u8x64_load_unaligned (s + off + 0x0c0);
+ u8x64_store_unaligned (v0, d + off + 0x000);
+ u8x64_store_unaligned (v1, d + off + 0x040);
+ u8x64_store_unaligned (v2, d + off + 0x080);
+ u8x64_store_unaligned (v3, d + off + 0x0c0);
+ v0 = u8x64_load_unaligned (s + off + 0x100);
+ v1 = u8x64_load_unaligned (s + off + 0x140);
+ v2 = u8x64_load_unaligned (s + off + 0x180);
+ v3 = u8x64_load_unaligned (s + off + 0x1c0);
+ u8x64_store_unaligned (v0, d + off + 0x100);
+ u8x64_store_unaligned (v1, d + off + 0x140);
+ u8x64_store_unaligned (v2, d + off + 0x180);
+ u8x64_store_unaligned (v3, d + off + 0x1c0);
+ off += 512;
+ if (off != final_off)
+ goto more;
+
+ if ((nr & 0x1ff) == 0)
+ goto done2;
+
+ last:
+ if (PREDICT_TRUE (nr & 256))
+ {
+ v0 = u8x64_load_unaligned (s + off + 0x000);
+ v1 = u8x64_load_unaligned (s + off + 0x040);
+ v2 = u8x64_load_unaligned (s + off + 0x080);
+ v3 = u8x64_load_unaligned (s + off + 0x0c0);
+ u8x64_store_unaligned (v0, d + off + 0x000);
+ u8x64_store_unaligned (v1, d + off + 0x040);
+ u8x64_store_unaligned (v2, d + off + 0x080);
+ u8x64_store_unaligned (v3, d + off + 0x0c0);
+ off += 256;
+ }
+ if (PREDICT_TRUE (nr & 128))
+ {
+ v0 = u8x64_load_unaligned (s + off + 0x000);
+ v1 = u8x64_load_unaligned (s + off + 0x040);
+ u8x64_store_unaligned (v0, d + off + 0x000);
+ u8x64_store_unaligned (v1, d + off + 0x040);
+ off += 128;
+ }
+ if (PREDICT_TRUE (nr & 64))
+ {
+ one:
+ u8x64_store_unaligned (u8x64_load_unaligned (s + off), d + off);
+ }
+ done2:
+ u8x64_store_unaligned (u8x64_load_unaligned (s + n - 64), d + n - 64);
+ }
+ return dst;
+#elif defined(CLIB_HAVE_VEC256)
+ else
+ {
+ u8x32 v0, v1, v2, v3;
+ u64 final_off, nr, off = 32;
+
+ u8x32_store_unaligned (u8x32_load_unaligned (s), d);
+
+ if (n <= 64)
+ goto done2;
+
+ if (n <= 96)
+ goto one;
+
+ if (n <= 256 + 32)
+ {
+ nr = round_pow2 (n - 64, 32);
+ goto last;
+ }
+
+ off -= ((u64) d) & 0x1f;
+ nr = round_pow2 (n - off - 32, 32);
+ final_off = (nr & ~(u64) 0xff) + off;
+
+ more:
+ v0 = u8x32_load_unaligned (s + off + 0x00);
+ v1 = u8x32_load_unaligned (s + off + 0x20);
+ v2 = u8x32_load_unaligned (s + off + 0x40);
+ v3 = u8x32_load_unaligned (s + off + 0x60);
+ u8x32_store_unaligned (v0, d + off + 0x00);
+ u8x32_store_unaligned (v1, d + off + 0x20);
+ u8x32_store_unaligned (v2, d + off + 0x40);
+ u8x32_store_unaligned (v3, d + off + 0x60);
+ v0 = u8x32_load_unaligned (s + off + 0x80);
+ v1 = u8x32_load_unaligned (s + off + 0xa0);
+ v2 = u8x32_load_unaligned (s + off + 0xc0);
+ v3 = u8x32_load_unaligned (s + off + 0xe0);
+ u8x32_store_unaligned (v0, d + off + 0x80);
+ u8x32_store_unaligned (v1, d + off + 0xa0);
+ u8x32_store_unaligned (v2, d + off + 0xc0);
+ u8x32_store_unaligned (v3, d + off + 0xe0);
+ off += 256;
+ if (off != final_off)
+ goto more;
+
+ if ((nr & 0xff) == 0)
+ goto done2;
+
+ last:
+ if (PREDICT_TRUE (nr & 128))
+ {
+ v0 = u8x32_load_unaligned (s + off + 0x00);
+ v1 = u8x32_load_unaligned (s + off + 0x20);
+ v2 = u8x32_load_unaligned (s + off + 0x40);
+ v3 = u8x32_load_unaligned (s + off + 0x60);
+ u8x32_store_unaligned (v0, d + off + 0x00);
+ u8x32_store_unaligned (v1, d + off + 0x20);
+ u8x32_store_unaligned (v2, d + off + 0x40);
+ u8x32_store_unaligned (v3, d + off + 0x60);
+ off += 128;
+ }
+ if (PREDICT_TRUE (nr & 64))
+ {
+ v0 = u8x32_load_unaligned (s + off + 0x00);
+ v1 = u8x32_load_unaligned (s + off + 0x20);
+ u8x32_store_unaligned (v0, d + off + 0x00);
+ u8x32_store_unaligned (v1, d + off + 0x20);
+ off += 64;
+ }
+ if (PREDICT_TRUE (nr & 32))
+ {
+ one:
+ u8x32_store_unaligned (u8x32_load_unaligned (s + off), d + off);
+ }
+ done2:
+ u8x32_store_unaligned (u8x32_load_unaligned (s + n - 32), d + n - 32);
+ }
+ return dst;
+#elif defined(CLIB_HAVE_VEC128)
+ else
+ {
+ u8x16 v0, v1, v2, v3;
+ u64 final_off, nr, off = 32;
+
+ if (0 && n > 389)
+ {
+ __builtin_memcpy (d, s, n);
+ return dst;
+ }
+
+ u8x16_store_unaligned (u8x16_load_unaligned (s), d);
+ u8x16_store_unaligned (u8x16_load_unaligned (s + 16), d + 16);
+
+ if (n <= 48)
+ goto done2;
+
+ if (n <= 64)
+ goto one;
+
+ if (n <= 256 + 32)
+ {
+ nr = round_pow2 (n - 48, 16);
+ goto last;
+ }
+
+ off -= ((u64) d) & 0x0f;
+ nr = round_pow2 (n - off - 16, 16);
+ final_off = (nr & ~(u64) 0xff) + off;
+
+ more:
+ v0 = u8x16_load_unaligned (s + off + 0x00);
+ v1 = u8x16_load_unaligned (s + off + 0x10);
+ v2 = u8x16_load_unaligned (s + off + 0x20);
+ v3 = u8x16_load_unaligned (s + off + 0x30);
+ u8x16_store_unaligned (v0, d + off + 0x00);
+ u8x16_store_unaligned (v1, d + off + 0x10);
+ u8x16_store_unaligned (v2, d + off + 0x20);
+ u8x16_store_unaligned (v3, d + off + 0x30);
+ v0 = u8x16_load_unaligned (s + off + 0x40);
+ v1 = u8x16_load_unaligned (s + off + 0x50);
+ v2 = u8x16_load_unaligned (s + off + 0x60);
+ v3 = u8x16_load_unaligned (s + off + 0x70);
+ u8x16_store_unaligned (v0, d + off + 0x40);
+ u8x16_store_unaligned (v1, d + off + 0x50);
+ u8x16_store_unaligned (v2, d + off + 0x60);
+ u8x16_store_unaligned (v3, d + off + 0x70);
+ v0 = u8x16_load_unaligned (s + off + 0x80);
+ v1 = u8x16_load_unaligned (s + off + 0x90);
+ v2 = u8x16_load_unaligned (s + off + 0xa0);
+ v3 = u8x16_load_unaligned (s + off + 0xb0);
+ u8x16_store_unaligned (v0, d + off + 0x80);
+ u8x16_store_unaligned (v1, d + off + 0x90);
+ u8x16_store_unaligned (v2, d + off + 0xa0);
+ u8x16_store_unaligned (v3, d + off + 0xb0);
+ v0 = u8x16_load_unaligned (s + off + 0xc0);
+ v1 = u8x16_load_unaligned (s + off + 0xd0);
+ v2 = u8x16_load_unaligned (s + off + 0xe0);
+ v3 = u8x16_load_unaligned (s + off + 0xf0);
+ u8x16_store_unaligned (v0, d + off + 0xc0);
+ u8x16_store_unaligned (v1, d + off + 0xd0);
+ u8x16_store_unaligned (v2, d + off + 0xe0);
+ u8x16_store_unaligned (v3, d + off + 0xf0);
+ off += 256;
+ if (off != final_off)
+ goto more;
+
+ if ((nr & 0xff) == 0)
+ goto done2;
+
+ last:
+ if (PREDICT_TRUE (nr & 128))
+ {
+ v0 = u8x16_load_unaligned (s + off + 0x00);
+ v1 = u8x16_load_unaligned (s + off + 0x10);
+ v2 = u8x16_load_unaligned (s + off + 0x20);
+ v3 = u8x16_load_unaligned (s + off + 0x30);
+ u8x16_store_unaligned (v0, d + off + 0x00);
+ u8x16_store_unaligned (v1, d + off + 0x10);
+ u8x16_store_unaligned (v2, d + off + 0x20);
+ u8x16_store_unaligned (v3, d + off + 0x30);
+ v0 = u8x16_load_unaligned (s + off + 0x40);
+ v1 = u8x16_load_unaligned (s + off + 0x50);
+ v2 = u8x16_load_unaligned (s + off + 0x60);
+ v3 = u8x16_load_unaligned (s + off + 0x70);
+ u8x16_store_unaligned (v0, d + off + 0x40);
+ u8x16_store_unaligned (v1, d + off + 0x50);
+ u8x16_store_unaligned (v2, d + off + 0x60);
+ u8x16_store_unaligned (v3, d + off + 0x70);
+ off += 128;
+ }
+ if (PREDICT_TRUE (nr & 64))
+ {
+ v0 = u8x16_load_unaligned (s + off + 0x00);
+ v1 = u8x16_load_unaligned (s + off + 0x10);
+ v2 = u8x16_load_unaligned (s + off + 0x20);
+ v3 = u8x16_load_unaligned (s + off + 0x30);
+ u8x16_store_unaligned (v0, d + off + 0x00);
+ u8x16_store_unaligned (v1, d + off + 0x10);
+ u8x16_store_unaligned (v2, d + off + 0x20);
+ u8x16_store_unaligned (v3, d + off + 0x30);
+ off += 64;
+ }
+ if (PREDICT_TRUE (nr & 32))
+ {
+ v0 = u8x16_load_unaligned (s + off + 0x00);
+ v1 = u8x16_load_unaligned (s + off + 0x10);
+ u8x16_store_unaligned (v0, d + off + 0x00);
+ u8x16_store_unaligned (v1, d + off + 0x10);
+ off += 32;
+ }
+ if (PREDICT_TRUE (nr & 16))
+ {
+ one:
+ u8x16_store_unaligned (u8x16_load_unaligned (s + off), d + off);
+ }
+ done2:
+ u8x16_store_unaligned (u8x16_load_unaligned (s + n - 16), d + n - 16);
+ }
+ return dst;
+#else
+#error "SSE/AVX2/AVX512 must be enabled"
+#endif
+
+ return dst;
+}
+
+/* clang-format off */
+WARN_ON (stringop-overflow)
+/* clang-format on */
+
+#endif
+#endif
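
The new clib_memcpy_x86_64 has two regimes: when n is a compile-time constant, COMPILE_TIME_CONST (n) is true and clib_memcpy_x86_64_const expands to a fixed sequence of 1/2/4/8/16/32/64-byte moves; otherwise the runtime path copies a small head, a wide unrolled middle aligned on the destination, and an overlapping tail store. Below is a hedged usage sketch (not part of the patch), assuming a vppinfra build environment; pkt_t and the helper names are illustrative only. It goes through clib_memcpy_fast, which the string.h hunk further down routes to this function on x86_64.

#include <vppinfra/string.h>

typedef struct { u8 mac[6]; u8 data[2048]; } pkt_t;  /* illustrative only */

static_always_inline void
copy_header (pkt_t *dst, const pkt_t *src)
{
  /* n == 6 is a compile-time constant, so the constant path emits
   * clib_memcpy4 followed by clib_memcpy2 (case 6 above). */
  clib_memcpy_fast (dst->mac, src->mac, 6);
}

static_always_inline void
copy_payload (pkt_t *dst, const pkt_t *src, u32 len)
{
  /* len is only known at run time, so the vector-width loop with the
   * overlapping head/tail stores is used instead. */
  clib_memcpy_fast (dst->data, src->data, len);
}
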
diff --git a/src/vppinfra/string.h b/src/vppinfra/string.h
index 7f9211b1bd2..758a541814d 100644
--- a/src/vppinfra/string.h
+++ b/src/vppinfra/string.h
@@ -47,6 +47,7 @@
#include <vppinfra/clib.h> /* for CLIB_LINUX_KERNEL */
#include <vppinfra/vector.h>
#include <vppinfra/error_bootstrap.h>
+#include <vppinfra/memcpy_x86_64.h>
#ifdef CLIB_LINUX_KERNEL
#include <linux/string.h>
@@ -67,26 +68,6 @@
/* Exchanges source and destination. */
void clib_memswap (void *_a, void *_b, uword bytes);
-/*
- * the vector unit memcpy variants confuse coverity
- * so don't let it anywhere near them.
- */
-#ifndef __COVERITY__
-#if __AVX512BITALG__
-#include <vppinfra/memcpy_avx512.h>
-#define clib_memcpy_fast_arch(a, b, c) clib_memcpy_fast_avx512 (a, b, c)
-#elif __AVX2__
-#include <vppinfra/memcpy_avx2.h>
-#define clib_memcpy_fast_arch(a, b, c) clib_memcpy_fast_avx2 (a, b, c)
-#elif __SSSE3__
-#include <vppinfra/memcpy_sse3.h>
-#define clib_memcpy_fast_arch(a, b, c) clib_memcpy_fast_sse3 (a, b, c)
-#endif /* __AVX512BITALG__ */
-#endif /* __COVERITY__ */
-
-#ifndef clib_memcpy_fast_arch
-#define clib_memcpy_fast_arch(a, b, c) memcpy (a, b, c)
-#endif /* clib_memcpy_fast_arch */
static_always_inline void *
clib_memcpy_fast (void *restrict dst, const void *restrict src, size_t n)
@@ -94,11 +75,16 @@ clib_memcpy_fast (void *restrict dst, const void *restrict src, size_t n)
ASSERT (dst && src &&
"memcpy(src, dst, n) with src == NULL or dst == NULL is undefined "
"behaviour");
- return clib_memcpy_fast_arch (dst, src, n);
+#if defined(__COVERITY__)
+ return memcpy (dst, src, n);
+#elif defined(__x86_64__)
+ clib_memcpy_x86_64 (dst, src, n);
+ return dst;
+#else
+ return memcpy (dst, src, n);
+#endif
}
-#undef clib_memcpy_fast_arch
-
#include <vppinfra/memcpy.h>
/* c-11 string manipulation variants */
diff --git a/src/vppinfra/types.h b/src/vppinfra/types.h
index 598061bb3e8..34b8b078fbf 100644
--- a/src/vppinfra/types.h
+++ b/src/vppinfra/types.h
@@ -165,12 +165,12 @@ typedef f64 fword;
__attribute__ ((aligned (align), packed)); \
} *) (addr))->_data)
-typedef u16 u16u __attribute__ ((aligned (1)));
-typedef u32 u32u __attribute__ ((aligned (1)));
-typedef u64 u64u __attribute__ ((aligned (1)));
-typedef i16 i16u __attribute__ ((aligned (1)));
-typedef i32 i32u __attribute__ ((aligned (1)));
-typedef i64 i64u __attribute__ ((aligned (1)));
+typedef u16 u16u __attribute__ ((aligned (1), __may_alias__));
+typedef u32 u32u __attribute__ ((aligned (1), __may_alias__));
+typedef u64 u64u __attribute__ ((aligned (1), __may_alias__));
+typedef i16 i16u __attribute__ ((aligned (1), __may_alias__));
+typedef i32 i32u __attribute__ ((aligned (1), __may_alias__));
+typedef i64 i64u __attribute__ ((aligned (1), __may_alias__));
#endif /* included_clib_types_h */
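
The __may_alias__ additions here (and on the vector types in the vector.h hunk below) matter because clib_memcpy_x86_64 dereferences arbitrary byte buffers through u16u/u32u/u64u and u8x16u/u8x32u/u8x64u pointers; without the attribute such accesses violate C strict-aliasing rules and the optimizer is free to reorder or drop them. A minimal sketch of the difference, assuming GCC/Clang; w32_strict, w32_alias and the load_* functions are illustrative names, not part of the patch.

typedef unsigned int w32_strict __attribute__ ((aligned (1)));
typedef unsigned int w32_alias  __attribute__ ((aligned (1), __may_alias__));

unsigned int
load_bad (const unsigned char *p)
{
  /* undefined behaviour under -fstrict-aliasing: an unsigned char object
   * is read through an incompatible unsigned int lvalue */
  return *(const w32_strict *) p;
}

unsigned int
load_ok (const unsigned char *p)
{
  /* may_alias tells the compiler this lvalue may alias any object, so the
   * access is well-defined, much like memcpy into a temporary */
  return *(const w32_alias *) p;
}
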
diff --git a/src/vppinfra/vector.h b/src/vppinfra/vector.h
index 6a6635b4c93..88cf288cb26 100644
--- a/src/vppinfra/vector.h
+++ b/src/vppinfra/vector.h
@@ -65,8 +65,9 @@
#define CLIB_HAVE_VEC512
#endif
-#define _vector_size(n) __attribute__ ((vector_size (n)))
-#define _vector_size_unaligned(n) __attribute__ ((vector_size (n), __aligned__ (1)))
+#define _vector_size(n) __attribute__ ((vector_size (n), __may_alias__))
+#define _vector_size_unaligned(n) \
+ __attribute__ ((vector_size (n), __aligned__ (1), __may_alias__))
#define foreach_vec64i _(i,8,8) _(i,16,4) _(i,32,2)
#define foreach_vec64u _(u,8,8) _(u,16,4) _(u,32,2)
diff --git a/src/vppinfra/vector/test/memcpy_x86_64.c b/src/vppinfra/vector/test/memcpy_x86_64.c
new file mode 100644
index 00000000000..78aab18d2e3
--- /dev/null
+++ b/src/vppinfra/vector/test/memcpy_x86_64.c
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2021 Cisco Systems, Inc.
+ */
+
+#ifdef __x86_64__
+
+#include <vppinfra/format.h>
+#include <vppinfra/vector/test/test.h>
+#include <vppinfra/vector/mask_compare.h>
+
+__clib_test_fn void
+wrapper (u8 *dst, u8 *src, uword n)
+{
+ clib_memcpy_x86_64 (dst, src, n);
+}
+
+/* clang-format off */
+#define foreach_const_n \
+ _(1) _(2) _(3) _(4) _(5) _(6) _(7) _(8) _(9) _(10) _(11) _(12) _(13) _(14) \
+ _(15) _(16) _(17) _(18) _(19) _(20) _(21) _(22) _(23) _(24) _(25) _(26) \
+ _(27) _(28) _(29) _(30) _(31) _(32) _(33) _(34) _(35) _(36) _(37) _(38) \
+ _(39) _(40) _(41) _(42) _(43) _(44) _(45) _(46) _(47) _(48) _(49) _(50) \
+ _(51) _(52) _(53) _(54) _(55) _(56) _(57) _(58) _(59) _(60) _(61) _(62) \
+ _(63) _(64) _(65) _(66) _(67) _(68) _(69) _(70) _(71) _(72) _(73) _(74) \
+ _(75) _(76) _(77) _(78) _(79) _(80) _(81) _(82) _(83) _(84) _(85) _(86) \
+ _(87) _(88) _(89) _(90) _(91) _(92) _(93) _(94) _(95) _(96) _(97) _(98) \
+ _(99) _(100) _(101) _(102) _(103) _(104) _(105) _(106) _(107) _(108) \
+ _(109) _(110) _(111) _(112) _(113) _(114) _(115) _(116) _(117) _(118) \
+ _(119) _(120) _(121) _(122) _(123) _(124) _(125) _(126) _(127) _(128) \
+ _(129) _(130) _(131) _(132) _(133) _(134) _(135) _(136) _(137) _(138) \
+ _(139) _(140) _(141) _(142) _(143) _(144) _(145) _(146) _(147) _(148) \
+ _(149) _(150) _(151) _(152) _(153) _(154) _(155) _(156) _(157) _(158) \
+ _(159) _(160) _(161) _(162) _(163) _(164) _(165) _(166) _(167) _(168) \
+ _(169) _(170) _(171) _(172) _(173) _(174) _(175) _(176) _(177) _(178) \
+ _(179) _(180) _(181) _(182) _(183) _(184) _(185) _(186) _(187) _(188) \
+ _(189) _(190) _(191) _(192) _(193) _(194) _(195) _(196) _(197) _(198) \
+ _(199) _(200) _(201) _(202) _(203) _(204) _(205) _(206) _(207) _(208) \
+ _(209) _(210) _(211) _(212) _(213) _(214) _(215) _(216) _(217) _(218) \
+ _(219) _(220) _(221) _(222) _(223) _(224) _(225) _(226) _(227) _(228) \
+ _(229) _(230) _(231) _(232) _(233) _(234) _(235) _(236) _(237) _(238) \
+ _(239) _(240) _(241) _(242) _(243) _(244) _(245) _(246) _(247) _(248) \
+ _(249) _(250) _(251) _(252) _(253) _(254) _(255)
+/* clang-format on */
+
+#define _(n) \
+ static __clib_noinline void wrapper##n (u8 *dst, u8 *src) \
+ { \
+ clib_memcpy_x86_64 (dst, src, n); \
+ }
+
+foreach_const_n;
+#undef _
+
+typedef void (const_fp_t) (u8 *dst, u8 *src);
+typedef struct
+{
+ u16 len;
+ const_fp_t *fp;
+} counst_test_t;
+
+static counst_test_t const_tests[] = {
+#define _(n) { .fp = wrapper##n, .len = n },
+ foreach_const_n
+#undef _
+};
+
+#define MAX_LEN 1024
+
+static clib_error_t *
+validate_one (clib_error_t *err, u8 *d, u8 *s, u16 n, u8 off, int is_const)
+{
+ for (int i = 0; i < n; i++)
+ if (d[i] != s[i])
+ return clib_error_return (err,
+ "memcpy error at position %d "
+ "(n = %u, off = %u, expected 0x%02x "
+ "found 0x%02x%s)",
+ i, n, off, s[i], d[i],
+ is_const ? ", const" : "");
+ for (int i = -64; i < 0; i++)
+ if (d[i] != 0xfe)
+ return clib_error_return (err,
+ "buffer underrun at position %d "
+ "(n = %u, off = %u, expected 0xfe "
+ "found 0x%02x%s)",
+ i, n, off, d[i], is_const ? ", const" : "");
+ for (int i = n; i < n + 64; i++)
+ if (d[i] != 0xfe)
+ return clib_error_return (err,
+ "buffer overrun at position %d "
+ "(n = %u, off = %u, expected 0xfe "
+ "found 0x%02x%s)",
+ i, n, off, d[i], is_const ? ", const" : "");
+ return err;
+}
+
+static clib_error_t *
+test_clib_memcpy_x86_64 (clib_error_t *err)
+{
+ u8 src[MAX_LEN + 192];
+ u8 dst[MAX_LEN + 128];
+
+ for (int i = 0; i < ARRAY_LEN (src); i++)
+ src[i] = i & 0x7f;
+
+ for (int j = 0; j < ARRAY_LEN (const_tests); j++)
+ {
+ u8 *d = dst + 64;
+ u8 *s = src + 64;
+ u16 n = const_tests[j].len;
+
+ for (int i = 0; i < 128 + n; i++)
+ dst[i] = 0xfe;
+ const_tests[j].fp (d, s);
+ if ((err = validate_one (err, d, s, n, 0, /* is_const */ 1)))
+ return err;
+ }
+
+ for (u16 n = 1; n <= MAX_LEN; n++)
+ {
+ for (int off = 0; off < 64; off += 7)
+ {
+ u8 *d = dst + 64 + off;
+ u8 *s = src + 64;
+
+ for (int i = 0; i < 128 + n + off; i++)
+ dst[i] = 0xfe;
+
+ wrapper (d, s, n);
+
+ if ((err = validate_one (err, d, s, n, off, /* is_const */ 0)))
+ return err;
+ }
+ }
+ return err;
+}
+
+REGISTER_TEST (clib_memcpy_x86_64) = {
+ .name = "clib_memcpy_x86_64",
+ .fn = test_clib_memcpy_x86_64,
+};
+#endif