From 04f3db3847d242857b9d9d858bcdca538a1be7d7 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Fri, 10 Nov 2017 21:55:45 +0100 Subject: dpdk: introduce AVX512 variants of node functions Change-Id: If581feca0d51d0420c971801aecdf9250c671b36 Signed-off-by: Damjan Marion --- src/configure.ac | 36 ++++++++++++++++++++++ src/plugins/Makefile.am | 1 + src/plugins/dpdk.am | 32 ++++++++++++++++++++ src/plugins/dpdk/buffer.c | 64 +++++++++++++++++++++++++++++++-------- src/plugins/dpdk/device/device.c | 30 +++++++++++++++---- src/plugins/dpdk/device/node.c | 64 ++++++++++++++------------------------- src/vlib/buffer.h | 24 +++++++++------ src/vppinfra/clib.h | 5 ++++ src/vppinfra/cpu.h | 7 +++++ src/vppinfra/string.h | 65 ++++++++++++++++++++++++++++++++++++++++ 10 files changed, 259 insertions(+), 69 deletions(-) diff --git a/src/configure.ac b/src/configure.ac index ee4985b5471..b5546a7c2e8 100644 --- a/src/configure.ac +++ b/src/configure.ac @@ -119,6 +119,22 @@ AC_DEFUN([DETECT_DPDK_IS_1702_OR_1705], AM_CONDITIONAL(DPDK_IS_1702_OR_1705, test "$dpdk_is_1702_or_1705" = "yes") ]) +# Check if compiler supports specific flag +AC_DEFUN([CC_CHECK_FLAG], +[ + AC_MSG_CHECKING([if $CC supports $1]) + AC_LANG_PUSH([C]) + ac_saved_cflags="$CFLAGS" + CFLAGS="-Werror $1" + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([])], + [cc_flag_check=yes], + [cc_flag_check=no] +) + AC_MSG_RESULT([$cc_flag_check]) + CFLAGS="$ac_saved_cflags" + AC_LANG_POP([C]) +]) + ############################################################################### # configure arguments ############################################################################### @@ -164,6 +180,20 @@ AC_ARG_WITH(pre-data, # Target CPU flags ############################################################################### +# Check if compiler supports march=core-avx2 +CC_CHECK_FLAG("-march=core-avx2") +AS_IF([test "$cc_flag_check" = yes], + [march_core_avx2=yes], + [march_core_avx2=no]) +AM_CONDITIONAL([CC_SUPPORTS_AVX2], [test 
"$march_core_avx2" = "yes"]) + +# Check if compiler supports march=skylake-avx512 +CC_CHECK_FLAG("-march=skylake-avx512") +AS_IF([test "$cc_flag_check" = yes], + [march_skylake_avx512=yes], + [march_skylake_avx512=no]) +AM_CONDITIONAL([CC_SUPPORTS_AVX512], [test "$march_skylake_avx512" = "yes"]) + AS_CASE([$build_cpu], [x86_64], [CPU_FLAGS="-march=corei7 -mtune=corei7-avx"], [aarch64], [CPU_FLAGS="-march=armv8-a+crc"], @@ -171,6 +201,12 @@ AS_CASE([$build_cpu], ) AC_SUBST([CPU_FLAGS]) +AC_SUBST([CPU_AVX2_FLAGS],"-march=core-avx2 -mtune=core-avx2") +AC_SUBST([CPU_AVX512_FLAGS],"-march=skylake-avx512 -mtune=skylake-avx512") + +AM_CONDITIONAL([CPU_X86_64], [test "$build_cpu" = "x86_64"]) +AM_CONDITIONAL([CPU_AARCH64], [test "$build_cpu" = "aarch64"]) + ############################################################################### # Substitutions and defines ############################################################################### diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am index 97f2f88f821..286fd1f9885 100644 --- a/src/plugins/Makefile.am +++ b/src/plugins/Makefile.am @@ -23,6 +23,7 @@ BUILT_SOURCES = vppplugins_LTLIBRARIES = vppapitestplugins_LTLIBRARIES = noinst_HEADERS = +noinst_LTLIBRARIES = nobase_apiinclude_HEADERS = nobase_include_HEADERS = diff --git a/src/plugins/dpdk.am b/src/plugins/dpdk.am index 905ba2016a3..99abded7cd2 100644 --- a/src/plugins/dpdk.am +++ b/src/plugins/dpdk.am @@ -36,6 +36,7 @@ dpdk_plugin_la_LDFLAGS += -Wl,-lnuma endif dpdk_plugin_la_LDFLAGS += -Wl,-lm,-ldl +dpdk_plugin_la_LIBADD = dpdk_plugin_la_SOURCES = \ dpdk/main.c \ @@ -59,6 +60,37 @@ dpdk_plugin_la_SOURCES = \ API_FILES += dpdk/api/dpdk.api +if CPU_X86_64 +dpdk_multiversioning_files = \ + dpdk/buffer.c \ + dpdk/device/node.c \ + dpdk/device/device.c + +if CC_SUPPORTS_AVX2 +############################################################### +# AVX2 +############################################################### +libdpdk_plugin_avx2_la_SOURCES = 
$(dpdk_multiversioning_files) +libdpdk_plugin_avx2_la_CFLAGS = \ + $(AM_CFLAGS) @CPU_AVX2_FLAGS@ \ + -DCLIB_MULTIARCH_VARIANT=avx2 +noinst_LTLIBRARIES += libdpdk_plugin_avx2.la +dpdk_plugin_la_LIBADD += libdpdk_plugin_avx2.la +endif + +if CC_SUPPORTS_AVX512 +############################################################### +# AVX512 +############################################################### +libdpdk_plugin_avx512_la_SOURCES = $(dpdk_multiversioning_files) +libdpdk_plugin_avx512_la_CFLAGS = \ + $(AM_CFLAGS) @CPU_AVX512_FLAGS@ \ + -DCLIB_MULTIARCH_VARIANT=avx512 +noinst_LTLIBRARIES += libdpdk_plugin_avx512.la +dpdk_plugin_la_LIBADD += libdpdk_plugin_avx512.la +endif +endif + nobase_include_HEADERS += \ dpdk/device/dpdk.h \ dpdk/api/dpdk_all_api_h.h \ diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c index 555b1109c19..80c6442f463 100644 --- a/src/plugins/dpdk/buffer.c +++ b/src/plugins/dpdk/buffer.c @@ -124,6 +124,7 @@ next: } } +#ifndef CLIB_MULTIARCH_VARIANT static void del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) { @@ -176,6 +177,7 @@ dpdk_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) pool_put (bm->buffer_free_list_pool, f); } } +#endif /* Make sure free list has at least given number of free buffers. */ static uword @@ -253,10 +255,7 @@ fill_free_list (vlib_main_t * vm, fl->buffers[f++] = bi2; fl->buffers[f++] = bi3; - clib_memcpy (b0, &bt, sizeof (vlib_buffer_t)); - clib_memcpy (b1, &bt, sizeof (vlib_buffer_t)); - clib_memcpy (b2, &bt, sizeof (vlib_buffer_t)); - clib_memcpy (b3, &bt, sizeof (vlib_buffer_t)); + clib_memcpy64_x4 (b0, b1, b2, b3, &bt); if (fl->buffer_init_function) { @@ -317,7 +316,8 @@ alloc_from_free_list (vlib_main_t * vm, Returns number actually allocated which will be either zero or number requested. 
*/ u32 -dpdk_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +CLIB_MULTIARCH_FN (dpdk_buffer_alloc) (vlib_main_t * vm, u32 * buffers, + u32 n_buffers) { vlib_buffer_main_t *bm = vm->buffer_main; @@ -330,9 +330,10 @@ dpdk_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers) u32 -dpdk_buffer_alloc_from_free_list (vlib_main_t * vm, - u32 * buffers, - u32 n_buffers, u32 free_list_index) +CLIB_MULTIARCH_FN (dpdk_buffer_alloc_from_free_list) (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, + u32 free_list_index) { vlib_buffer_main_t *bm = vm->buffer_main; vlib_buffer_free_list_t *f; @@ -455,20 +456,23 @@ vlib_buffer_free_inline (vlib_main_t * vm, } } -static void -dpdk_buffer_free (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +void +CLIB_MULTIARCH_FN (dpdk_buffer_free) (vlib_main_t * vm, u32 * buffers, + u32 n_buffers) { vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ 1); } -static void -dpdk_buffer_free_no_next (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +void +CLIB_MULTIARCH_FN (dpdk_buffer_free_no_next) (vlib_main_t * vm, u32 * buffers, + u32 n_buffers) { vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ 0); } +#ifndef CLIB_MULTIARCH_VARIANT static void dpdk_packet_template_init (vlib_main_t * vm, void *vt, @@ -682,6 +686,42 @@ VLIB_BUFFER_REGISTER_CALLBACKS (dpdk, static) = { }; /* *INDENT-ON* */ +#if __x86_64__ +vlib_buffer_alloc_cb_t __clib_weak dpdk_buffer_alloc_avx512; +vlib_buffer_alloc_cb_t __clib_weak dpdk_buffer_alloc_avx2; +vlib_buffer_alloc_from_free_list_cb_t __clib_weak + dpdk_buffer_alloc_from_free_list_avx512; +vlib_buffer_alloc_from_free_list_cb_t __clib_weak + dpdk_buffer_alloc_from_free_list_avx2; +vlib_buffer_free_cb_t __clib_weak dpdk_buffer_free_avx512; +vlib_buffer_free_cb_t __clib_weak dpdk_buffer_free_avx2; +vlib_buffer_free_no_next_cb_t __clib_weak dpdk_buffer_free_no_next_avx512; +vlib_buffer_free_no_next_cb_t __clib_weak 
dpdk_buffer_free_no_next_avx2; + +static void __clib_constructor +dpdk_buffer_multiarch_select (void) +{ + vlib_buffer_callbacks_t *cb = &__dpdk_buffer_callbacks; + if (dpdk_buffer_alloc_avx512 && clib_cpu_supports_avx512f ()) + { + cb->vlib_buffer_alloc_cb = dpdk_buffer_alloc_avx512; + cb->vlib_buffer_alloc_from_free_list_cb = + dpdk_buffer_alloc_from_free_list_avx512; + cb->vlib_buffer_free_cb = dpdk_buffer_free_avx512; + cb->vlib_buffer_free_no_next_cb = dpdk_buffer_free_no_next_avx512; + } + else if (dpdk_buffer_alloc_avx2 && clib_cpu_supports_avx2 ()) + { + cb->vlib_buffer_alloc_cb = dpdk_buffer_alloc_avx2; + cb->vlib_buffer_alloc_from_free_list_cb = + dpdk_buffer_alloc_from_free_list_avx2; + cb->vlib_buffer_free_cb = dpdk_buffer_free_avx2; + cb->vlib_buffer_free_no_next_cb = dpdk_buffer_free_no_next_avx2; + } +} +#endif +#endif + /** @endcond */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index aa134327373..987596ead9b 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -38,6 +38,7 @@ typedef enum DPDK_TX_FUNC_N_ERROR, } dpdk_tx_func_error_t; +#ifndef CLIB_MULTIARCH_VARIANT static char *dpdk_tx_func_error_strings[] = { #define _(n,s) s, foreach_dpdk_tx_func_error @@ -65,8 +66,9 @@ dpdk_set_mac_address (vnet_hw_interface_t * hi, char *address) return NULL; } } +#endif -struct rte_mbuf * +static struct rte_mbuf * dpdk_replicate_packet_mb (vlib_buffer_t * b) { dpdk_main_t *dm = &dpdk_main; @@ -368,9 +370,10 @@ dpdk_buffer_tx_offload (dpdk_device_t * xd, vlib_buffer_t * b, * rte_mbuf pointers. It then passes this vector to tx_burst_vector_internal * which calls the dpdk tx_burst function. 
*/ -static uword -dpdk_interface_tx (vlib_main_t * vm, - vlib_node_runtime_t * node, vlib_frame_t * f) +uword +CLIB_MULTIARCH_FN (dpdk_interface_tx) (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * f) { dpdk_main_t *dm = &dpdk_main; vnet_interface_output_runtime_t *rd = (void *) node->runtime_data; @@ -632,6 +635,7 @@ dpdk_interface_tx (vlib_main_t * vm, return tx_pkts; } +#ifndef CLIB_MULTIARCH_VARIANT static void dpdk_clear_hw_interface_counters (u32 instance) { @@ -789,12 +793,25 @@ VNET_DEVICE_CLASS (dpdk_device_class) = { .rx_redirect_to_node = dpdk_set_interface_next_node, .mac_addr_change_function = dpdk_set_mac_address, }; - -VLIB_DEVICE_TX_FUNCTION_MULTIARCH (dpdk_device_class, dpdk_interface_tx) /* *INDENT-ON* */ +#if __x86_64__ +vlib_node_function_t __clib_weak dpdk_interface_tx_avx512; +vlib_node_function_t __clib_weak dpdk_interface_tx_avx2; +static void __clib_constructor +dpdk_interface_tx_multiarch_select (void) +{ + if (dpdk_interface_tx_avx512 && clib_cpu_supports_avx512f ()) + dpdk_device_class.tx_function = dpdk_interface_tx_avx512; + else if (dpdk_interface_tx_avx2 && clib_cpu_supports_avx2 ()) + dpdk_device_class.tx_function = dpdk_interface_tx_avx2; +} +#endif +#endif + #define UP_DOWN_FLAG_EVENT 1 +#ifndef CLIB_MULTIARCH_VARIANT uword admin_up_down_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) @@ -846,6 +863,7 @@ VLIB_REGISTER_NODE (admin_up_down_process_node,static) = { .process_log2_n_stack_bytes = 17, // 256KB }; /* *INDENT-ON* */ +#endif /* * fd.io coding-style-patch-verification: ON diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index 82978216817..1240b8d01a7 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -28,11 +28,13 @@ #include +#ifndef CLIB_MULTIARCH_VARIANT static char *dpdk_error_strings[] = { #define _(n,s) s, foreach_dpdk_error #undef _ }; +#endif always_inline int vlib_buffer_is_ip4 (vlib_buffer_t * b) @@ -259,43 +261,6 
@@ dpdk_prefetch_ethertype (struct rte_mbuf *mb) CLIB_CACHE_LINE_BYTES, LOAD); } - -/* - This function should fill 1st cacheline of vlib_buffer_t metadata with data - from buffer template. Instead of filling field by field, we construct - template and then use 128/256 bit vector instruction to copy data. - This code first loads whole cacheline into 4 128-bit registers (xmm) - or two 256 bit registers (ymm) and then stores data into all 4 buffers - efectively saving on register load operations. -*/ - -static_always_inline void -dpdk_buffer_init_from_template (void *d0, void *d1, void *d2, void *d3, - void *s) -{ -#if defined(CLIB_HAVE_VEC128) - int i; - for (i = 0; i < 2; i++) - { - *(u8x32 *) (((u8 *) d0) + i * 32) = - *(u8x32 *) (((u8 *) d1) + i * 32) = - *(u8x32 *) (((u8 *) d2) + i * 32) = - *(u8x32 *) (((u8 *) d3) + i * 32) = *(u8x32 *) (((u8 *) s) + i * 32); - } -#elif defined(CLIB_HAVE_VEC64) - int i; - for (i = 0; i < 4; i++) - { - *(u8x16 *) (((u8 *) d0) + i * 16) = - *(u8x16 *) (((u8 *) d1) + i * 16) = - *(u8x16 *) (((u8 *) d2) + i * 16) = - *(u8x16 *) (((u8 *) d3) + i * 16) = *(u8x16 *) (((u8 *) s) + i * 16); - } -#else -#error "Either CLIB_HAVE_VEC128 or CLIB_HAVE_VEC64 has to be defined" -#endif -} - /* * This function is used when there are no worker threads. * The main thread performs IO and forwards the packets. 
@@ -401,7 +366,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, b2 = vlib_buffer_from_rte_mbuf (mb2); b3 = vlib_buffer_from_rte_mbuf (mb3); - dpdk_buffer_init_from_template (b0, b1, b2, b3, bt); + clib_memcpy64_x4 (b0, b1, b2, b3, bt); dpdk_prefetch_buffer (xd->rx_vectors[queue_id][mb_index + 9]); dpdk_prefetch_ethertype (xd->rx_vectors[queue_id][mb_index + 5]); @@ -647,8 +612,9 @@ poll_rate_limit (dpdk_main_t * dm) xd->per_interface_next_index */ -static uword -dpdk_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) +uword +CLIB_MULTIARCH_FN (dpdk_input) (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * f) { dpdk_main_t *dm = &dpdk_main; dpdk_device_t *xd; @@ -678,6 +644,7 @@ dpdk_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) return n_rx_packets; } +#ifndef CLIB_MULTIARCH_VARIANT /* *INDENT-OFF* */ VLIB_REGISTER_NODE (dpdk_input_node) = { .function = dpdk_input, @@ -694,10 +661,23 @@ VLIB_REGISTER_NODE (dpdk_input_node) = { .n_errors = DPDK_N_ERROR, .error_strings = dpdk_error_strings, }; - -VLIB_NODE_FUNCTION_MULTIARCH (dpdk_input_node, dpdk_input); /* *INDENT-ON* */ +vlib_node_function_t __clib_weak dpdk_input_avx512; +vlib_node_function_t __clib_weak dpdk_input_avx2; + +#if __x86_64__ +static void __clib_constructor +dpdk_input_multiarch_select (void) +{ + if (dpdk_input_avx512 && clib_cpu_supports_avx512f ()) + dpdk_input_node.function = dpdk_input_avx512; + else if (dpdk_input_avx2 && clib_cpu_supports_avx2 ()) + dpdk_input_node.function = dpdk_input_avx2; +} +#endif +#endif + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 6170323c815..21bed48ac14 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -380,17 +380,23 @@ typedef struct vlib_buffer_free_list_t uword buffer_init_function_opaque; } __attribute__ ((aligned (16))) vlib_buffer_free_list_t; +typedef u32 (vlib_buffer_alloc_cb_t) (struct vlib_main_t * vm, u32 * buffers, 
+ u32 n_buffers); +typedef u32 (vlib_buffer_alloc_from_free_list_cb_t) (struct vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, + u32 free_list_index); +typedef void (vlib_buffer_free_cb_t) (struct vlib_main_t * vm, u32 * buffers, + u32 n_buffers); +typedef void (vlib_buffer_free_no_next_cb_t) (struct vlib_main_t * vm, + u32 * buffers, u32 n_buffers); + typedef struct { - u32 (*vlib_buffer_alloc_cb) (struct vlib_main_t * vm, u32 * buffers, - u32 n_buffers); - u32 (*vlib_buffer_alloc_from_free_list_cb) (struct vlib_main_t * vm, - u32 * buffers, u32 n_buffers, - u32 free_list_index); - void (*vlib_buffer_free_cb) (struct vlib_main_t * vm, u32 * buffers, - u32 n_buffers); - void (*vlib_buffer_free_no_next_cb) (struct vlib_main_t * vm, u32 * buffers, - u32 n_buffers); + vlib_buffer_alloc_cb_t *vlib_buffer_alloc_cb; + vlib_buffer_alloc_from_free_list_cb_t *vlib_buffer_alloc_from_free_list_cb; + vlib_buffer_free_cb_t *vlib_buffer_free_cb; + vlib_buffer_free_no_next_cb_t *vlib_buffer_free_no_next_cb; void (*vlib_packet_template_init_cb) (struct vlib_main_t * vm, void *t, void *packet_data, uword n_packet_data_bytes, diff --git a/src/vppinfra/clib.h b/src/vppinfra/clib.h index fbb2a21c6b9..33db3b203f9 100644 --- a/src/vppinfra/clib.h +++ b/src/vppinfra/clib.h @@ -78,6 +78,11 @@ #define CLIB_PACKED(x) x __attribute__ ((packed)) #define CLIB_UNUSED(x) x __attribute__ ((unused)) +#define __clib_unused __attribute__ ((unused)) +#define __clib_weak __attribute__ ((weak)) +#define __clib_packed __attribute__ ((packed)) +#define __clib_constructor __attribute__ ((constructor)) + #define never_inline __attribute__ ((__noinline__)) #if CLIB_DEBUG > 0 diff --git a/src/vppinfra/cpu.h b/src/vppinfra/cpu.h index 9c149f3fa2a..75b01e606f3 100644 --- a/src/vppinfra/cpu.h +++ b/src/vppinfra/cpu.h @@ -51,6 +51,13 @@ return & fn; \ } +#ifdef CLIB_MULTIARCH_VARIANT +#define __CLIB_MULTIARCH_FN(a,b) a##_##b +#define _CLIB_MULTIARCH_FN(a,b) __CLIB_MULTIARCH_FN(a,b) +#define 
CLIB_MULTIARCH_FN(fn) _CLIB_MULTIARCH_FN(fn,CLIB_MULTIARCH_VARIANT) +#else +#define CLIB_MULTIARCH_FN(fn) fn +#endif #define foreach_x86_64_flags \ _ (sse3, 1, ecx, 0) \ diff --git a/src/vppinfra/string.h b/src/vppinfra/string.h index 69a99a3f0ce..914f6a7bbc4 100644 --- a/src/vppinfra/string.h +++ b/src/vppinfra/string.h @@ -72,6 +72,71 @@ void clib_memswap (void *_a, void *_b, uword bytes); #define clib_memcpy(a,b,c) memcpy(a,b,c) #endif +/* + * Copy 64 bytes of data to 4 destinations + * this function is typically used in quad-loop case when whole cacheline + * needs to be copied to 4 different places. First it reads whole cacheline + * to 1/2/4 SIMD registers and then it writes data to 4 destinations. + */ + +static_always_inline void +clib_memcpy64_x4 (void *d0, void *d1, void *d2, void *d3, void *s) +{ +#if defined (CLIB_HAVE_VEC512) + u8x64 __attribute__ ((aligned (1))) r0 = *(((u8x64 *) s) + 0); + + *(((u8x64 *) d0) + 0) = r0; + *(((u8x64 *) d1) + 0) = r0; + *(((u8x64 *) d2) + 0) = r0; + *(((u8x64 *) d3) + 0) = r0; +#elif defined (CLIB_HAVE_VEC256) + u8x32 __attribute__ ((aligned (1))) r0 = *(((u8x32 *) s) + 0); + u8x32 __attribute__ ((aligned (1))) r1 = *(((u8x32 *) s) + 1); + + *(((u8x32 *) d0) + 0) = r0; + *(((u8x32 *) d0) + 1) = r1; + + *(((u8x32 *) d1) + 0) = r0; + *(((u8x32 *) d1) + 1) = r1; + + *(((u8x32 *) d2) + 0) = r0; + *(((u8x32 *) d2) + 1) = r1; + + *(((u8x32 *) d3) + 0) = r0; + *(((u8x32 *) d3) + 1) = r1; +#elif defined (CLIB_HAVE_VEC128) + u8x16 __attribute__ ((aligned (1))) r0 = *(((u8x16 *) s) + 0); + u8x16 __attribute__ ((aligned (1))) r1 = *(((u8x16 *) s) + 1); + u8x16 __attribute__ ((aligned (1))) r2 = *(((u8x16 *) s) + 2); + u8x16 __attribute__ ((aligned (1))) r3 = *(((u8x16 *) s) + 3); + + *(((u8x16 *) d0) + 0) = r0; + *(((u8x16 *) d0) + 1) = r1; + *(((u8x16 *) d0) + 2) = r2; + *(((u8x16 *) d0) + 3) = r3; + + *(((u8x16 *) d1) + 0) = r0; + *(((u8x16 *) d1) + 1) = r1; + *(((u8x16 *) d1) + 2) = r2; + *(((u8x16 *) d1) + 3) = r3; + + 
*(((u8x16 *) d2) + 0) = r0; + *(((u8x16 *) d2) + 1) = r1; + *(((u8x16 *) d2) + 2) = r2; + *(((u8x16 *) d2) + 3) = r3; + + *(((u8x16 *) d3) + 0) = r0; + *(((u8x16 *) d3) + 1) = r1; + *(((u8x16 *) d3) + 2) = r2; + *(((u8x16 *) d3) + 3) = r3; +#else + clib_memcpy (d0, s, 64); + clib_memcpy (d1, s, 64); + clib_memcpy (d2, s, 64); + clib_memcpy (d3, s, 64); +#endif +} + #endif /* included_clib_string_h */ /* -- cgit 1.2.3-korg