author    Damjan Marion <damarion@cisco.com>      2019-01-18 19:56:09 +0100
committer Florin Coras <florin.coras@gmail.com>   2019-01-18 22:00:27 +0000
commit    e58041f242bf4bd120ecc9619b88348d80b94c17 (patch)
tree      f163418c56ef3e76074685900b00b044cc1b49bd
parent    865872ebdb2bbaf3f157e2a9889405b84114d2eb (diff)
deprecate clib_memcpy64_x4
Storing the buffer template in a local variable seems to be the better option.

Change-Id: I1a2fdd68cb956f99a5b36d2cd810fc623e089bcf
Signed-off-by: Damjan Marion <damarion@cisco.com>
-rw-r--r--  src/plugins/avf/input.c         |  24
-rw-r--r--  src/plugins/dpdk/buffer.c       |  17
-rw-r--r--  src/plugins/dpdk/device/node.c  |  31
-rw-r--r--  src/plugins/memif/memif.c       |   5
-rw-r--r--  src/plugins/memif/node.c        |  27
-rw-r--r--  src/vlib/buffer.h               |  10
-rw-r--r--  src/vlib/buffer_funcs.h         |  13
-rw-r--r--  src/vppinfra/string.h           |  68
8 files changed, 71 insertions(+), 124 deletions(-)
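
Before this change, the RX quad-loops filled buffer metadata by copying a full
64-byte cacheline from the per-thread template into four buffers at once with
clib_memcpy64_x4(). This patch copies the template into a stack-local
vlib_buffer_t once per burst and applies it per buffer with the new
vlib_buffer_copy_template() helper, which copies only the metadata up to the
template_end struct mark. A minimal sketch of the resulting pattern follows;
my_per_thread_data_t and my_process_rx_burst are illustrative names standing
in for the driver-specific structures and functions changed below, not part
of this patch.

#include <vlib/vlib.h>

/* Hypothetical per-thread data; stands in for the driver-specific
   structures (avf_per_thread_data_t, dpdk_per_thread_data_t, ...). */
typedef struct
{
  vlib_buffer_t buffer_template;
  u32 buffers[256];
} my_per_thread_data_t;

static void
my_process_rx_burst (vlib_main_t * vm, my_per_thread_data_t * ptd, u32 n_left)
{
  vlib_buffer_t bt;
  u32 *bi = ptd->buffers;

  /* copy template into local variable once per burst - saves a per-packet
     load of the per-thread structure */
  vlib_buffer_copy_template (&bt, &ptd->buffer_template);

  while (n_left >= 4)
    {
      vlib_buffer_t *b0 = vlib_get_buffer (vm, bi[0]);
      vlib_buffer_t *b1 = vlib_get_buffer (vm, bi[1]);
      vlib_buffer_t *b2 = vlib_get_buffer (vm, bi[2]);
      vlib_buffer_t *b3 = vlib_get_buffer (vm, bi[3]);

      /* previously a single clib_memcpy64_x4 (b0, b1, b2, b3, &bt) */
      vlib_buffer_copy_template (b0, &bt);
      vlib_buffer_copy_template (b1, &bt);
      vlib_buffer_copy_template (b2, &bt);
      vlib_buffer_copy_template (b3, &bt);

      bi += 4;
      n_left -= 4;
    }

  while (n_left)
    {
      vlib_buffer_t *b0 = vlib_get_buffer (vm, bi[0]);
      vlib_buffer_copy_template (b0, &bt);
      bi += 1;
      n_left -= 1;
    }
}
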
diff --git a/src/plugins/avf/input.c b/src/plugins/avf/input.c
index 8072e94346b..b784bf731c1 100644
--- a/src/plugins/avf/input.c
+++ b/src/plugins/avf/input.c
@@ -146,7 +146,7 @@ avf_rx_attach_tail (vlib_main_t * vm, vlib_buffer_t * bt, vlib_buffer_t * b,
b->next_buffer = t->buffers[i];
b->flags |= VLIB_BUFFER_NEXT_PRESENT;
b = vlib_get_buffer (vm, b->next_buffer);
- clib_memcpy_fast (b, bt, sizeof (vlib_buffer_t));
+ vlib_buffer_copy_template (b, bt);
tlnifb += b->current_length = qw1 >> AVF_RXD_LEN_SHIFT;
i++;
}
@@ -161,12 +161,15 @@ avf_process_rx_burst (vlib_main_t * vm, vlib_node_runtime_t * node,
avf_per_thread_data_t * ptd, u32 n_left,
int maybe_multiseg)
{
- vlib_buffer_t *bt = &ptd->buffer_template;
+ vlib_buffer_t bt;
vlib_buffer_t **b = ptd->bufs;
u64 *qw1 = ptd->qw1s;
avf_rx_tail_t *tail = ptd->tails;
uword n_rx_bytes = 0;
+ /* copy template into local variable - will save per packet load */
+ vlib_buffer_copy_template (&bt, &ptd->buffer_template);
+
while (n_left >= 4)
{
if (n_left >= 12)
@@ -177,7 +180,10 @@ avf_process_rx_burst (vlib_main_t * vm, vlib_node_runtime_t * node,
vlib_prefetch_buffer_header (b[11], LOAD);
}
- clib_memcpy64_x4 (b[0], b[1], b[2], b[3], bt);
+ vlib_buffer_copy_template (b[0], &bt);
+ vlib_buffer_copy_template (b[1], &bt);
+ vlib_buffer_copy_template (b[2], &bt);
+ vlib_buffer_copy_template (b[3], &bt);
n_rx_bytes += b[0]->current_length = qw1[0] >> AVF_RXD_LEN_SHIFT;
n_rx_bytes += b[1]->current_length = qw1[1] >> AVF_RXD_LEN_SHIFT;
@@ -186,10 +192,10 @@ avf_process_rx_burst (vlib_main_t * vm, vlib_node_runtime_t * node,
if (maybe_multiseg)
{
- n_rx_bytes += avf_rx_attach_tail (vm, bt, b[0], qw1[0], tail + 0);
- n_rx_bytes += avf_rx_attach_tail (vm, bt, b[1], qw1[1], tail + 1);
- n_rx_bytes += avf_rx_attach_tail (vm, bt, b[2], qw1[2], tail + 2);
- n_rx_bytes += avf_rx_attach_tail (vm, bt, b[3], qw1[3], tail + 3);
+ n_rx_bytes += avf_rx_attach_tail (vm, &bt, b[0], qw1[0], tail + 0);
+ n_rx_bytes += avf_rx_attach_tail (vm, &bt, b[1], qw1[1], tail + 1);
+ n_rx_bytes += avf_rx_attach_tail (vm, &bt, b[2], qw1[2], tail + 2);
+ n_rx_bytes += avf_rx_attach_tail (vm, &bt, b[3], qw1[3], tail + 3);
}
VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]);
@@ -205,12 +211,12 @@ avf_process_rx_burst (vlib_main_t * vm, vlib_node_runtime_t * node,
}
while (n_left)
{
- clib_memcpy_fast (b[0], bt, sizeof (vlib_buffer_t));
+ vlib_buffer_copy_template (b[0], &bt);
n_rx_bytes += b[0]->current_length = qw1[0] >> AVF_RXD_LEN_SHIFT;
if (maybe_multiseg)
- n_rx_bytes += avf_rx_attach_tail (vm, bt, b[0], qw1[0], tail + 0);
+ n_rx_bytes += avf_rx_attach_tail (vm, &bt, b[0], qw1[0], tail + 0);
VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]);
diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c
index ee63f76b0d4..7093b01162c 100644
--- a/src/plugins/dpdk/buffer.c
+++ b/src/plugins/dpdk/buffer.c
@@ -241,14 +241,15 @@ CLIB_MULTIARCH_FN (dpdk_buffer_fill_free_list) (vlib_main_t * vm,
no_prefetch:
vlib_get_buffer_indices_with_offset (vm, (void **) mb, bi, 8,
sizeof (struct rte_mbuf));
- clib_memcpy64_x4 (vlib_buffer_from_rte_mbuf (mb[0]),
- vlib_buffer_from_rte_mbuf (mb[1]),
- vlib_buffer_from_rte_mbuf (mb[2]),
- vlib_buffer_from_rte_mbuf (mb[3]), &bt);
- clib_memcpy64_x4 (vlib_buffer_from_rte_mbuf (mb[4]),
- vlib_buffer_from_rte_mbuf (mb[5]),
- vlib_buffer_from_rte_mbuf (mb[6]),
- vlib_buffer_from_rte_mbuf (mb[7]), &bt);
+
+ vlib_buffer_copy_template (vlib_buffer_from_rte_mbuf (mb[0]), &bt);
+ vlib_buffer_copy_template (vlib_buffer_from_rte_mbuf (mb[1]), &bt);
+ vlib_buffer_copy_template (vlib_buffer_from_rte_mbuf (mb[2]), &bt);
+ vlib_buffer_copy_template (vlib_buffer_from_rte_mbuf (mb[3]), &bt);
+ vlib_buffer_copy_template (vlib_buffer_from_rte_mbuf (mb[4]), &bt);
+ vlib_buffer_copy_template (vlib_buffer_from_rte_mbuf (mb[5]), &bt);
+ vlib_buffer_copy_template (vlib_buffer_from_rte_mbuf (mb[6]), &bt);
+ vlib_buffer_copy_template (vlib_buffer_from_rte_mbuf (mb[7]), &bt);
n_left -= 8;
mb += 8;
diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c
index 194c359dbac..250ded5048c 100644
--- a/src/plugins/dpdk/device/node.c
+++ b/src/plugins/dpdk/device/node.c
@@ -40,7 +40,7 @@ STATIC_ASSERT ((PKT_RX_IP_CKSUM_BAD | PKT_RX_FDIR) <
static_always_inline uword
dpdk_process_subseq_segs (vlib_main_t * vm, vlib_buffer_t * b,
- struct rte_mbuf *mb, vlib_buffer_free_list_t * fl)
+ struct rte_mbuf *mb, vlib_buffer_t * bt)
{
u8 nb_seg = 1;
struct rte_mbuf *mb_seg = 0;
@@ -59,10 +59,7 @@ dpdk_process_subseq_segs (vlib_main_t * vm, vlib_buffer_t * b,
ASSERT (mb_seg != 0);
b_seg = vlib_buffer_from_rte_mbuf (mb_seg);
- vlib_buffer_init_for_free_list (b_seg, fl);
-
- ASSERT ((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
- ASSERT (b_seg->current_data == 0);
+ vlib_buffer_copy_template (b_seg, bt);
/*
* The driver (e.g. virtio) may not put the packet data at the start
@@ -167,17 +164,16 @@ dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd,
{
u32 n_left = n_rx_packets;
vlib_buffer_t *b[4];
- vlib_buffer_free_list_t *fl;
struct rte_mbuf **mb = ptd->mbufs;
uword n_bytes = 0;
u8 *flags, or_flags = 0;
-
- if (maybe_multiseg)
- fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+ vlib_buffer_t bt;
mb = ptd->mbufs;
flags = ptd->flags;
+ /* copy template into local variable - will save per packet load */
+ vlib_buffer_copy_template (&bt, &ptd->buffer_template);
while (n_left >= 8)
{
dpdk_prefetch_buffer_x4 (mb + 4);
@@ -187,7 +183,10 @@ dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd,
b[2] = vlib_buffer_from_rte_mbuf (mb[2]);
b[3] = vlib_buffer_from_rte_mbuf (mb[3]);
- clib_memcpy64_x4 (b[0], b[1], b[2], b[3], &ptd->buffer_template);
+ vlib_buffer_copy_template (b[0], &bt);
+ vlib_buffer_copy_template (b[1], &bt);
+ vlib_buffer_copy_template (b[2], &bt);
+ vlib_buffer_copy_template (b[3], &bt);
dpdk_prefetch_mbuf_x4 (mb + 4);
@@ -208,10 +207,10 @@ dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd,
if (maybe_multiseg)
{
- n_bytes += dpdk_process_subseq_segs (vm, b[0], mb[0], fl);
- n_bytes += dpdk_process_subseq_segs (vm, b[1], mb[1], fl);
- n_bytes += dpdk_process_subseq_segs (vm, b[2], mb[2], fl);
- n_bytes += dpdk_process_subseq_segs (vm, b[3], mb[3], fl);
+ n_bytes += dpdk_process_subseq_segs (vm, b[0], mb[0], &bt);
+ n_bytes += dpdk_process_subseq_segs (vm, b[1], mb[1], &bt);
+ n_bytes += dpdk_process_subseq_segs (vm, b[2], mb[2], &bt);
+ n_bytes += dpdk_process_subseq_segs (vm, b[3], mb[3], &bt);
}
VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]);
@@ -227,7 +226,7 @@ dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd,
while (n_left)
{
b[0] = vlib_buffer_from_rte_mbuf (mb[0]);
- clib_memcpy_fast (b[0], &ptd->buffer_template, 64);
+ vlib_buffer_copy_template (b[0], &bt);
or_flags |= dpdk_ol_flags_extract (mb, flags, 1);
flags += 1;
@@ -235,7 +234,7 @@ dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd,
n_bytes += b[0]->current_length = mb[0]->data_len;
if (maybe_multiseg)
- n_bytes += dpdk_process_subseq_segs (vm, b[0], mb[0], fl);
+ n_bytes += dpdk_process_subseq_segs (vm, b[0], mb[0], &bt);
VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]);
/* next */
diff --git a/src/plugins/memif/memif.c b/src/plugins/memif/memif.c
index f976f16dec8..3171ba22f7d 100644
--- a/src/plugins/memif/memif.c
+++ b/src/plugins/memif/memif.c
@@ -848,19 +848,16 @@ memif_create_if (vlib_main_t * vm, memif_create_if_args_t * args)
if (mm->per_thread_data == 0)
{
int i;
- vlib_buffer_free_list_t *fl;
vec_validate_aligned (mm->per_thread_data, tm->n_vlib_mains - 1,
CLIB_CACHE_LINE_BYTES);
- fl =
- vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
for (i = 0; i < tm->n_vlib_mains; i++)
{
memif_per_thread_data_t *ptd =
vec_elt_at_index (mm->per_thread_data, i);
vlib_buffer_t *bt = &ptd->buffer_template;
- vlib_buffer_init_for_free_list (bt, fl);
+ clib_memset (bt, 0, sizeof (vlib_buffer_t));
bt->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
bt->total_length_not_including_first_buffer = 0;
vnet_buffer (bt)->sw_if_index[VLIB_TX] = (u32) ~ 0;
diff --git a/src/plugins/memif/node.c b/src/plugins/memif/node.c
index 3cb79541c17..490c60356db 100644
--- a/src/plugins/memif/node.c
+++ b/src/plugins/memif/node.c
@@ -180,7 +180,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
memif_main_t *mm = &memif_main;
memif_ring_t *ring;
memif_queue_t *mq;
- u16 buffer_size = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES;
+ u16 buffer_size = VLIB_BUFFER_DATA_SIZE;
uword n_trace = vlib_get_trace_count (vm, node);
u16 nexts[MEMIF_RX_VECTOR_SZ], *next = nexts;
u32 _to_next_bufs[MEMIF_RX_VECTOR_SZ], *to_next_bufs = _to_next_bufs, *bi;
@@ -190,7 +190,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
u32 thread_index = vm->thread_index;
memif_per_thread_data_t *ptd = vec_elt_at_index (mm->per_thread_data,
thread_index);
- vlib_buffer_t *bt = &ptd->buffer_template;
+ vlib_buffer_t bt;
u16 cur_slot, last_slot, ring_size, n_slots, mask;
i16 start_offset;
u16 n_buffers = 0, n_alloc;
@@ -338,10 +338,11 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
}
/* prepare buffer template and next indices */
- vnet_buffer (bt)->sw_if_index[VLIB_RX] = mif->sw_if_index;
- vnet_buffer (bt)->feature_arc_index = 0;
- bt->current_data = start_offset;
- bt->current_config_index = 0;
+ vnet_buffer (&ptd->buffer_template)->sw_if_index[VLIB_RX] =
+ mif->sw_if_index;
+ vnet_buffer (&ptd->buffer_template)->feature_arc_index = 0;
+ ptd->buffer_template.current_data = start_offset;
+ ptd->buffer_template.current_config_index = 0;
if (mode == MEMIF_INTERFACE_MODE_ETHERNET)
{
@@ -350,7 +351,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
next_index = mif->per_interface_next_index;
else
vnet_feature_start_device_input_x1 (mif->sw_if_index, &next_index,
- bt);
+ &ptd->buffer_template);
vlib_get_new_next_frame (vm, node, next_index, to_next_bufs,
n_left_to_next);
@@ -374,6 +375,9 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
po = ptd->packet_ops;
bi = to_next_bufs;
+ /* copy template into local variable - will save per packet load */
+ vlib_buffer_copy_template (&bt, &ptd->buffer_template);
+
while (n_from >= 8)
{
b0 = vlib_get_buffer (vm, po[4].first_buffer_vec_index);
@@ -402,7 +406,10 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
b2 = vlib_get_buffer (vm, bi[2]);
b3 = vlib_get_buffer (vm, bi[3]);
- clib_memcpy64_x4 (b0, b1, b2, b3, bt);
+ vlib_buffer_copy_template (b0, &bt);
+ vlib_buffer_copy_template (b1, &bt);
+ vlib_buffer_copy_template (b2, &bt);
+ vlib_buffer_copy_template (b3, &bt);
b0->current_length = po[0].packet_len;
n_rx_bytes += b0->current_length;
@@ -439,7 +446,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
fbvi[0] = po[0].first_buffer_vec_index;
bi[0] = ptd->buffers[fbvi[0]];
b0 = vlib_get_buffer (vm, bi[0]);
- clib_memcpy_fast (b0, bt, 64);
+ vlib_buffer_copy_template (b0, &bt);
b0->current_length = po->packet_len;
n_rx_bytes += b0->current_length;
@@ -559,7 +566,7 @@ memif_device_input_zc_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
/* asume that somebody will want to add ethernet header on the packet
so start with IP header at offset 14 */
start_offset = (mode == MEMIF_INTERFACE_MODE_IP) ? 14 : 0;
- buffer_length = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES - start_offset;
+ buffer_length = VLIB_BUFFER_DATA_SIZE - start_offset;
cur_slot = mq->last_tail;
last_slot = ring->tail;
diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h
index 2c8d5a046a7..68d6c5eb32b 100644
--- a/src/vlib/buffer.h
+++ b/src/vlib/buffer.h
@@ -104,7 +104,6 @@ enum
typedef struct
{
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
- STRUCT_MARK (template_start);
/* Offset within data[] that we are currently processing.
If negative current header points into predata area. */
i16 current_data; /**< signed offset in data[], pre_data[]
@@ -131,8 +130,6 @@ typedef struct
Only valid if VLIB_BUFFER_NEXT_PRESENT flag is set.
*/
- STRUCT_MARK (template_end);
-
u32 current_config_index; /**< Used by feature subgraph arcs to
visit enabled feature nodes
*/
@@ -146,6 +143,13 @@ typedef struct
u32 opaque[10]; /**< Opaque data used by sub-graphs for their own purposes.
See .../vnet/vnet/buffer.h
*/
+
+ STRUCT_MARK (template_end); /**< part of buffer metadata which is
+ initialized on alloc ends here. It may be
+ different than cacheline on systems with
+ buffer cacheline size */
+
+ /***** end of first cache line */
CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
u32 trace_index; /**< Specifies index into trace buffer
diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h
index b561a91c394..8fbb58d68b3 100644
--- a/src/vlib/buffer_funcs.h
+++ b/src/vlib/buffer_funcs.h
@@ -64,6 +64,12 @@ vlib_get_buffer (vlib_main_t * vm, u32 buffer_index)
return uword_to_pointer (bm->buffer_mem_start + offset, void *);
}
+static_always_inline void
+vlib_buffer_copy_template (vlib_buffer_t * b, vlib_buffer_t * bt)
+{
+ clib_memcpy_fast (b, bt, STRUCT_OFFSET_OF (vlib_buffer_t, template_end));
+}
+
/** \brief Translate array of buffer indices into buffer pointers with offset
@param vm - (vlib_main_t *) vlib main data structure pointer
@@ -1011,12 +1017,7 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * dst,
CLIB_CACHE_LINE_BYTES * 2);
/* Make sure buffer template is sane. */
- ASSERT (fl->index == vlib_buffer_get_free_list_index (src));
-
- clib_memcpy_fast (STRUCT_MARK_PTR (dst, template_start),
- STRUCT_MARK_PTR (src, template_start),
- STRUCT_OFFSET_OF (vlib_buffer_t, template_end) -
- STRUCT_OFFSET_OF (vlib_buffer_t, template_start));
+ vlib_buffer_copy_template (dst, src);
/* Not in the first 16 octets. */
dst->n_add_refs = src->n_add_refs;
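
vlib_buffer_copy_template() bounds its copy with the STRUCT_MARK /
STRUCT_OFFSET_OF helpers from vppinfra, using the template_end marker that
this patch moves to the end of the opaque[] area. A simplified illustration
of the mechanism; the my_* macro and struct names are stand-ins, not the
actual vppinfra definitions.

#include <stddef.h>
#include <string.h>

/* stand-ins for the vppinfra STRUCT_MARK / STRUCT_OFFSET_OF helpers:
   a zero-length member (GCC extension) records a byte offset in the struct */
#define MY_STRUCT_MARK(mark)       unsigned char mark[0]
#define MY_STRUCT_OFFSET_OF(t, f)  offsetof (t, f)

typedef struct
{
  int current_data;			/* initialized on alloc ... */
  int flags;
  int opaque[10];
  MY_STRUCT_MARK (template_end);	/* ... template copy stops here */
  int trace_index;			/* not part of the template */
} my_buffer_t;

static void
my_copy_template (my_buffer_t * b, my_buffer_t * bt)
{
  /* copy only the metadata that the template initializes */
  memcpy (b, bt, MY_STRUCT_OFFSET_OF (my_buffer_t, template_end));
}
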
diff --git a/src/vppinfra/string.h b/src/vppinfra/string.h
index 42f7890f3d0..d9cd8fe1af9 100644
--- a/src/vppinfra/string.h
+++ b/src/vppinfra/string.h
@@ -213,74 +213,6 @@ memset_s_inline (void *s, rsize_t smax, int c, rsize_t n)
*/
#define clib_memset(s,c,n) memset_s_inline(s,n,c,n)
-/*
- * Copy 64 bytes of data to 4 destinations
- * this function is typically used in quad-loop case when whole cacheline
- * needs to be copied to 4 different places. First it reads whole cacheline
- * to 1/2/4 SIMD registers and then it writes data to 4 destinations.
- */
-
-static_always_inline void
-clib_memcpy64_x4 (void *d0, void *d1, void *d2, void *d3, void *s)
-{
-#if defined (__AVX512F__)
- __m512i r0 = _mm512_loadu_si512 (s);
-
- _mm512_storeu_si512 (d0, r0);
- _mm512_storeu_si512 (d1, r0);
- _mm512_storeu_si512 (d2, r0);
- _mm512_storeu_si512 (d3, r0);
-
-#elif defined (__AVX2__)
- __m256i r0 = _mm256_loadu_si256 ((__m256i *) (s + 0 * 32));
- __m256i r1 = _mm256_loadu_si256 ((__m256i *) (s + 1 * 32));
-
- _mm256_storeu_si256 ((__m256i *) (d0 + 0 * 32), r0);
- _mm256_storeu_si256 ((__m256i *) (d0 + 1 * 32), r1);
-
- _mm256_storeu_si256 ((__m256i *) (d1 + 0 * 32), r0);
- _mm256_storeu_si256 ((__m256i *) (d1 + 1 * 32), r1);
-
- _mm256_storeu_si256 ((__m256i *) (d2 + 0 * 32), r0);
- _mm256_storeu_si256 ((__m256i *) (d2 + 1 * 32), r1);
-
- _mm256_storeu_si256 ((__m256i *) (d3 + 0 * 32), r0);
- _mm256_storeu_si256 ((__m256i *) (d3 + 1 * 32), r1);
-
-#elif defined (__SSSE3__)
- __m128i r0 = _mm_loadu_si128 ((__m128i *) (s + 0 * 16));
- __m128i r1 = _mm_loadu_si128 ((__m128i *) (s + 1 * 16));
- __m128i r2 = _mm_loadu_si128 ((__m128i *) (s + 2 * 16));
- __m128i r3 = _mm_loadu_si128 ((__m128i *) (s + 3 * 16));
-
- _mm_storeu_si128 ((__m128i *) (d0 + 0 * 16), r0);
- _mm_storeu_si128 ((__m128i *) (d0 + 1 * 16), r1);
- _mm_storeu_si128 ((__m128i *) (d0 + 2 * 16), r2);
- _mm_storeu_si128 ((__m128i *) (d0 + 3 * 16), r3);
-
- _mm_storeu_si128 ((__m128i *) (d1 + 0 * 16), r0);
- _mm_storeu_si128 ((__m128i *) (d1 + 1 * 16), r1);
- _mm_storeu_si128 ((__m128i *) (d1 + 2 * 16), r2);
- _mm_storeu_si128 ((__m128i *) (d1 + 3 * 16), r3);
-
- _mm_storeu_si128 ((__m128i *) (d2 + 0 * 16), r0);
- _mm_storeu_si128 ((__m128i *) (d2 + 1 * 16), r1);
- _mm_storeu_si128 ((__m128i *) (d2 + 2 * 16), r2);
- _mm_storeu_si128 ((__m128i *) (d2 + 3 * 16), r3);
-
- _mm_storeu_si128 ((__m128i *) (d3 + 0 * 16), r0);
- _mm_storeu_si128 ((__m128i *) (d3 + 1 * 16), r1);
- _mm_storeu_si128 ((__m128i *) (d3 + 2 * 16), r2);
- _mm_storeu_si128 ((__m128i *) (d3 + 3 * 16), r3);
-
-#else
- clib_memcpy_fast (d0, s, 64);
- clib_memcpy_fast (d1, s, 64);
- clib_memcpy_fast (d2, s, 64);
- clib_memcpy_fast (d3, s, 64);
-#endif
-}
-
static_always_inline void
clib_memset_u64 (void *p, u64 val, uword count)
{