From e58041f242bf4bd120ecc9619b88348d80b94c17 Mon Sep 17 00:00:00 2001
From: Damjan Marion
Date: Fri, 18 Jan 2019 19:56:09 +0100
Subject: deprecate clib_memcpy64_x4

Storing the buffer template in a local variable seems to be the better
option.

Change-Id: I1a2fdd68cb956f99a5b36d2cd810fc623e089bcf
Signed-off-by: Damjan Marion
---
 src/plugins/avf/input.c        | 24 +++++++++------
 src/plugins/dpdk/buffer.c      | 17 ++++++-----
 src/plugins/dpdk/device/node.c | 31 ++++++++++---------
 src/plugins/memif/memif.c      |  5 +---
 src/plugins/memif/node.c       | 27 ++++++++++-------
 src/vlib/buffer.h              | 10 +++++--
 src/vlib/buffer_funcs.h        | 13 ++++----
 src/vppinfra/string.h          | 68 ------------------------------------------
 8 files changed, 71 insertions(+), 124 deletions(-)

diff --git a/src/plugins/avf/input.c b/src/plugins/avf/input.c
index 8072e94346b..b784bf731c1 100644
--- a/src/plugins/avf/input.c
+++ b/src/plugins/avf/input.c
@@ -146,7 +146,7 @@ avf_rx_attach_tail (vlib_main_t * vm, vlib_buffer_t * bt, vlib_buffer_t * b,
       b->next_buffer = t->buffers[i];
       b->flags |= VLIB_BUFFER_NEXT_PRESENT;
       b = vlib_get_buffer (vm, b->next_buffer);
-      clib_memcpy_fast (b, bt, sizeof (vlib_buffer_t));
+      vlib_buffer_copy_template (b, bt);
       tlnifb += b->current_length = qw1 >> AVF_RXD_LEN_SHIFT;
       i++;
     }
@@ -161,12 +161,15 @@ avf_process_rx_burst (vlib_main_t * vm, vlib_node_runtime_t * node,
 		      avf_per_thread_data_t * ptd, u32 n_left,
 		      int maybe_multiseg)
 {
-  vlib_buffer_t *bt = &ptd->buffer_template;
+  vlib_buffer_t bt;
   vlib_buffer_t **b = ptd->bufs;
   u64 *qw1 = ptd->qw1s;
   avf_rx_tail_t *tail = ptd->tails;
   uword n_rx_bytes = 0;
 
+  /* copy template into local variable - will save per packet load */
+  vlib_buffer_copy_template (&bt, &ptd->buffer_template);
+
   while (n_left >= 4)
     {
       if (n_left >= 12)
@@ -177,7 +180,10 @@ avf_process_rx_burst (vlib_main_t * vm, vlib_node_runtime_t * node,
 	  vlib_prefetch_buffer_header (b[11], LOAD);
 	}
 
-      clib_memcpy64_x4 (b[0], b[1], b[2], b[3], bt);
+      vlib_buffer_copy_template (b[0], &bt);
+      vlib_buffer_copy_template (b[1], &bt);
+      vlib_buffer_copy_template (b[2], &bt);
+      vlib_buffer_copy_template (b[3], &bt);
 
       n_rx_bytes += b[0]->current_length = qw1[0] >> AVF_RXD_LEN_SHIFT;
       n_rx_bytes += b[1]->current_length = qw1[1] >> AVF_RXD_LEN_SHIFT;
@@ -186,10 +192,10 @@ avf_process_rx_burst (vlib_main_t * vm, vlib_node_runtime_t * node,
 
       if (maybe_multiseg)
 	{
-	  n_rx_bytes += avf_rx_attach_tail (vm, bt, b[0], qw1[0], tail + 0);
-	  n_rx_bytes += avf_rx_attach_tail (vm, bt, b[1], qw1[1], tail + 1);
-	  n_rx_bytes += avf_rx_attach_tail (vm, bt, b[2], qw1[2], tail + 2);
-	  n_rx_bytes += avf_rx_attach_tail (vm, bt, b[3], qw1[3], tail + 3);
+	  n_rx_bytes += avf_rx_attach_tail (vm, &bt, b[0], qw1[0], tail + 0);
+	  n_rx_bytes += avf_rx_attach_tail (vm, &bt, b[1], qw1[1], tail + 1);
+	  n_rx_bytes += avf_rx_attach_tail (vm, &bt, b[2], qw1[2], tail + 2);
+	  n_rx_bytes += avf_rx_attach_tail (vm, &bt, b[3], qw1[3], tail + 3);
 	}
 
       VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]);
@@ -205,12 +211,12 @@ avf_process_rx_burst (vlib_main_t * vm, vlib_node_runtime_t * node,
     }
   while (n_left)
     {
-      clib_memcpy_fast (b[0], bt, sizeof (vlib_buffer_t));
+      vlib_buffer_copy_template (b[0], &bt);
 
       n_rx_bytes += b[0]->current_length = qw1[0] >> AVF_RXD_LEN_SHIFT;
 
       if (maybe_multiseg)
-	n_rx_bytes += avf_rx_attach_tail (vm, bt, b[0], qw1[0], tail + 0);
+	n_rx_bytes += avf_rx_attach_tail (vm, &bt, b[0], qw1[0], tail + 0);
 
       VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]);
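The avf hunks above show the pattern the whole patch converges on: instead of keeping a pointer into the per-thread data and initializing four buffers at once with clib_memcpy64_x4, the template is copied once per burst into a stack-local vlib_buffer_t and then applied to each packet with vlib_buffer_copy_template(). A minimal sketch of that shape, assuming the avf plugin's avf_per_thread_data_t with its buffer_template and bufs fields; the function name is illustrative and the per-packet length, multiseg and trace handling is elided:

    static_always_inline void
    rx_apply_template_sketch (avf_per_thread_data_t * ptd, u32 n_left)
    {
      vlib_buffer_t bt;
      vlib_buffer_t **b = ptd->bufs;

      /* one load of the shared per-thread template per burst ... */
      vlib_buffer_copy_template (&bt, &ptd->buffer_template);

      while (n_left >= 4)
        {
          /* ... and a store of the local copy per packet */
          vlib_buffer_copy_template (b[0], &bt);
          vlib_buffer_copy_template (b[1], &bt);
          vlib_buffer_copy_template (b[2], &bt);
          vlib_buffer_copy_template (b[3], &bt);
          b += 4;
          n_left -= 4;
        }
      while (n_left)
        {
          vlib_buffer_copy_template (b[0], &bt);
          b += 1;
          n_left -= 1;
        }
    }

The local copy keeps the template hot for the duration of the burst, which is what the "will save per packet load" comment in the hunks refers to: the per-packet work becomes a bounded copy of the local template rather than a load from the per-thread structure.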
diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c
index ee63f76b0d4..7093b01162c 100644
--- a/src/plugins/dpdk/buffer.c
+++ b/src/plugins/dpdk/buffer.c
@@ -241,14 +241,15 @@ CLIB_MULTIARCH_FN (dpdk_buffer_fill_free_list) (vlib_main_t * vm,
 	no_prefetch:
 	  vlib_get_buffer_indices_with_offset (vm, (void **) mb, bi, 8,
 					       sizeof (struct rte_mbuf));
-	  clib_memcpy64_x4 (vlib_buffer_from_rte_mbuf (mb[0]),
-			    vlib_buffer_from_rte_mbuf (mb[1]),
-			    vlib_buffer_from_rte_mbuf (mb[2]),
-			    vlib_buffer_from_rte_mbuf (mb[3]), &bt);
-	  clib_memcpy64_x4 (vlib_buffer_from_rte_mbuf (mb[4]),
-			    vlib_buffer_from_rte_mbuf (mb[5]),
-			    vlib_buffer_from_rte_mbuf (mb[6]),
-			    vlib_buffer_from_rte_mbuf (mb[7]), &bt);
+
+	  vlib_buffer_copy_template (vlib_buffer_from_rte_mbuf (mb[0]), &bt);
+	  vlib_buffer_copy_template (vlib_buffer_from_rte_mbuf (mb[1]), &bt);
+	  vlib_buffer_copy_template (vlib_buffer_from_rte_mbuf (mb[2]), &bt);
+	  vlib_buffer_copy_template (vlib_buffer_from_rte_mbuf (mb[3]), &bt);
+	  vlib_buffer_copy_template (vlib_buffer_from_rte_mbuf (mb[4]), &bt);
+	  vlib_buffer_copy_template (vlib_buffer_from_rte_mbuf (mb[5]), &bt);
+	  vlib_buffer_copy_template (vlib_buffer_from_rte_mbuf (mb[6]), &bt);
+	  vlib_buffer_copy_template (vlib_buffer_from_rte_mbuf (mb[7]), &bt);
 
 	  n_left -= 8;
 	  mb += 8;
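In the dpdk fill-free-list path above, the template is applied to the vlib_buffer_t metadata that sits behind each struct rte_mbuf. The real vlib_buffer_from_rte_mbuf macro lives in the dpdk plugin headers; the sketch below is only an illustration of the assumed layout (vlib_buffer_t metadata placed immediately after the rte_mbuf header in the same buffer), not the plugin's actual definition:

    /* illustrative sketch of the layout assumption, not the plugin macro */
    static_always_inline vlib_buffer_t *
    buffer_from_mbuf_sketch (struct rte_mbuf * mb)
    {
      /* translation is pure pointer arithmetic: vlib metadata follows
         the rte_mbuf header inside the same buffer */
      return (vlib_buffer_t *) (mb + 1);
    }

Because the translation is just an offset, replacing the two x4 copy helpers with eight individual vlib_buffer_copy_template() calls adds no per-buffer indirection.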
diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c
index 194c359dbac..250ded5048c 100644
--- a/src/plugins/dpdk/device/node.c
+++ b/src/plugins/dpdk/device/node.c
@@ -40,7 +40,7 @@ STATIC_ASSERT ((PKT_RX_IP_CKSUM_BAD | PKT_RX_FDIR) <
 
 static_always_inline uword
 dpdk_process_subseq_segs (vlib_main_t * vm, vlib_buffer_t * b,
-			  struct rte_mbuf *mb, vlib_buffer_free_list_t * fl)
+			  struct rte_mbuf *mb, vlib_buffer_t * bt)
 {
   u8 nb_seg = 1;
   struct rte_mbuf *mb_seg = 0;
@@ -59,10 +59,7 @@ dpdk_process_subseq_segs (vlib_main_t * vm, vlib_buffer_t * b,
       ASSERT (mb_seg != 0);
 
       b_seg = vlib_buffer_from_rte_mbuf (mb_seg);
-      vlib_buffer_init_for_free_list (b_seg, fl);
-
-      ASSERT ((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
-      ASSERT (b_seg->current_data == 0);
+      vlib_buffer_copy_template (b_seg, bt);
 
       /*
       * The driver (e.g. virtio) may not put the packet data at the start
@@ -167,17 +164,16 @@ dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd,
 {
   u32 n_left = n_rx_packets;
   vlib_buffer_t *b[4];
-  vlib_buffer_free_list_t *fl;
   struct rte_mbuf **mb = ptd->mbufs;
   uword n_bytes = 0;
   u8 *flags, or_flags = 0;
-
-  if (maybe_multiseg)
-    fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+  vlib_buffer_t bt;
 
   mb = ptd->mbufs;
   flags = ptd->flags;
+  /* copy template into local variable - will save per packet load */
+  vlib_buffer_copy_template (&bt, &ptd->buffer_template);
 
   while (n_left >= 8)
     {
       dpdk_prefetch_buffer_x4 (mb + 4);
@@ -187,7 +183,10 @@ dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd,
       b[2] = vlib_buffer_from_rte_mbuf (mb[2]);
       b[3] = vlib_buffer_from_rte_mbuf (mb[3]);
 
-      clib_memcpy64_x4 (b[0], b[1], b[2], b[3], &ptd->buffer_template);
+      vlib_buffer_copy_template (b[0], &bt);
+      vlib_buffer_copy_template (b[1], &bt);
+      vlib_buffer_copy_template (b[2], &bt);
+      vlib_buffer_copy_template (b[3], &bt);
 
       dpdk_prefetch_mbuf_x4 (mb + 4);
 
@@ -208,10 +207,10 @@ dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd,
 
       if (maybe_multiseg)
 	{
-	  n_bytes += dpdk_process_subseq_segs (vm, b[0], mb[0], fl);
-	  n_bytes += dpdk_process_subseq_segs (vm, b[1], mb[1], fl);
-	  n_bytes += dpdk_process_subseq_segs (vm, b[2], mb[2], fl);
-	  n_bytes += dpdk_process_subseq_segs (vm, b[3], mb[3], fl);
+	  n_bytes += dpdk_process_subseq_segs (vm, b[0], mb[0], &bt);
+	  n_bytes += dpdk_process_subseq_segs (vm, b[1], mb[1], &bt);
+	  n_bytes += dpdk_process_subseq_segs (vm, b[2], mb[2], &bt);
+	  n_bytes += dpdk_process_subseq_segs (vm, b[3], mb[3], &bt);
 	}
 
       VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]);
@@ -227,7 +226,7 @@ dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd,
   while (n_left)
     {
       b[0] = vlib_buffer_from_rte_mbuf (mb[0]);
-      clib_memcpy_fast (b[0], &ptd->buffer_template, 64);
+      vlib_buffer_copy_template (b[0], &bt);
 
       or_flags |= dpdk_ol_flags_extract (mb, flags, 1);
       flags += 1;
@@ -235,7 +234,7 @@ dpdk_process_rx_burst (vlib_main_t * vm, dpdk_per_thread_data_t * ptd,
       n_bytes += b[0]->current_length = mb[0]->data_len;
 
       if (maybe_multiseg)
-	n_bytes += dpdk_process_subseq_segs (vm, b[0], mb[0], fl);
+	n_bytes += dpdk_process_subseq_segs (vm, b[0], mb[0], &bt);
 
       VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]);
 
       /* next */
diff --git a/src/plugins/memif/memif.c b/src/plugins/memif/memif.c
index f976f16dec8..3171ba22f7d 100644
--- a/src/plugins/memif/memif.c
+++ b/src/plugins/memif/memif.c
@@ -848,19 +848,16 @@ memif_create_if (vlib_main_t * vm, memif_create_if_args_t * args)
   if (mm->per_thread_data == 0)
     {
       int i;
-      vlib_buffer_free_list_t *fl;
 
       vec_validate_aligned (mm->per_thread_data, tm->n_vlib_mains - 1,
			     CLIB_CACHE_LINE_BYTES);
 
-      fl =
-	vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
       for (i = 0; i < tm->n_vlib_mains; i++)
	{
	  memif_per_thread_data_t *ptd =
	    vec_elt_at_index (mm->per_thread_data, i);
	  vlib_buffer_t *bt = &ptd->buffer_template;
-	  vlib_buffer_init_for_free_list (bt, fl);
+	  clib_memset (bt, 0, sizeof (vlib_buffer_t));
	  bt->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
	  bt->total_length_not_including_first_buffer = 0;
	  vnet_buffer (bt)->sw_if_index[VLIB_TX] = (u32) ~ 0;
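With the free list gone from the memif create path, the per-thread buffer template is now built by hand. Pulled out of the loop above, the setup amounts to zeroing the metadata and filling in the few fields every received packet should start from; fields not shown simply stay zero:

    vlib_buffer_t *bt = &ptd->buffer_template;

    clib_memset (bt, 0, sizeof (vlib_buffer_t));
    bt->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
    bt->total_length_not_including_first_buffer = 0;
    vnet_buffer (bt)->sw_if_index[VLIB_TX] = (u32) ~ 0;

The RX-side fields (sw_if_index[VLIB_RX], current_data, current_config_index) are filled in later, per frame, in memif's input node just before the template is copied into the local variable, as the next file's hunks show.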
diff --git a/src/plugins/memif/node.c b/src/plugins/memif/node.c
index 3cb79541c17..490c60356db 100644
--- a/src/plugins/memif/node.c
+++ b/src/plugins/memif/node.c
@@ -180,7 +180,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
   memif_main_t *mm = &memif_main;
   memif_ring_t *ring;
   memif_queue_t *mq;
-  u16 buffer_size = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES;
+  u16 buffer_size = VLIB_BUFFER_DATA_SIZE;
   uword n_trace = vlib_get_trace_count (vm, node);
   u16 nexts[MEMIF_RX_VECTOR_SZ], *next = nexts;
   u32 _to_next_bufs[MEMIF_RX_VECTOR_SZ], *to_next_bufs = _to_next_bufs, *bi;
@@ -190,7 +190,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
   u32 thread_index = vm->thread_index;
   memif_per_thread_data_t *ptd = vec_elt_at_index (mm->per_thread_data,
						    thread_index);
-  vlib_buffer_t *bt = &ptd->buffer_template;
+  vlib_buffer_t bt;
   u16 cur_slot, last_slot, ring_size, n_slots, mask;
   i16 start_offset;
   u16 n_buffers = 0, n_alloc;
@@ -338,10 +338,11 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
     }
 
   /* prepare buffer template and next indices */
-  vnet_buffer (bt)->sw_if_index[VLIB_RX] = mif->sw_if_index;
-  vnet_buffer (bt)->feature_arc_index = 0;
-  bt->current_data = start_offset;
-  bt->current_config_index = 0;
+  vnet_buffer (&ptd->buffer_template)->sw_if_index[VLIB_RX] =
+    mif->sw_if_index;
+  vnet_buffer (&ptd->buffer_template)->feature_arc_index = 0;
+  ptd->buffer_template.current_data = start_offset;
+  ptd->buffer_template.current_config_index = 0;
 
   if (mode == MEMIF_INTERFACE_MODE_ETHERNET)
     {
@@ -350,7 +351,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
	next_index = mif->per_interface_next_index;
       else
	vnet_feature_start_device_input_x1 (mif->sw_if_index, &next_index,
-					    bt);
+					    &ptd->buffer_template);
 
       vlib_get_new_next_frame (vm, node, next_index, to_next_bufs,
			       n_left_to_next);
@@ -374,6 +375,9 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
   po = ptd->packet_ops;
   bi = to_next_bufs;
 
+  /* copy template into local variable - will save per packet load */
+  vlib_buffer_copy_template (&bt, &ptd->buffer_template);
+
   while (n_from >= 8)
     {
       b0 = vlib_get_buffer (vm, po[4].first_buffer_vec_index);
@@ -402,7 +406,10 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
       b2 = vlib_get_buffer (vm, bi[2]);
       b3 = vlib_get_buffer (vm, bi[3]);
 
-      clib_memcpy64_x4 (b0, b1, b2, b3, bt);
+      vlib_buffer_copy_template (b0, &bt);
+      vlib_buffer_copy_template (b1, &bt);
+      vlib_buffer_copy_template (b2, &bt);
+      vlib_buffer_copy_template (b3, &bt);
 
       b0->current_length = po[0].packet_len;
       n_rx_bytes += b0->current_length;
@@ -439,7 +446,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
       fbvi[0] = po[0].first_buffer_vec_index;
       bi[0] = ptd->buffers[fbvi[0]];
       b0 = vlib_get_buffer (vm, bi[0]);
-      clib_memcpy_fast (b0, bt, 64);
+      vlib_buffer_copy_template (b0, &bt);
 
       b0->current_length = po->packet_len;
       n_rx_bytes += b0->current_length;
@@ -559,7 +566,7 @@ memif_device_input_zc_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
   /* asume that somebody will want to add ethernet header on the packet
      so start with IP header at offset 14 */
   start_offset = (mode == MEMIF_INTERFACE_MODE_IP) ? 14 : 0;
-  buffer_length = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES - start_offset;
+  buffer_length = VLIB_BUFFER_DATA_SIZE - start_offset;
 
   cur_slot = mq->last_tail;
   last_slot = ring->tail;
diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h
index 2c8d5a046a7..68d6c5eb32b 100644
--- a/src/vlib/buffer.h
+++ b/src/vlib/buffer.h
@@ -104,7 +104,6 @@ enum
 typedef struct
 {
   CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
-  STRUCT_MARK (template_start);
   /* Offset within data[] that we are currently processing.
      If negative current header points into predata area.  */
   i16 current_data;  /**< signed offset in data[], pre_data[]
@@ -131,8 +130,6 @@ typedef struct
			Only valid if VLIB_BUFFER_NEXT_PRESENT flag is set.
		     */
 
-  STRUCT_MARK (template_end);
-
   u32 current_config_index; /**< Used by feature subgraph arcs to
				visit enabled feature nodes
			     */
@@ -146,6 +143,13 @@ typedef struct
   u32 opaque[10]; /**< Opaque data used by sub-graphs for their own purposes.
		    See .../vnet/vnet/buffer.h
		  */
+
+  STRUCT_MARK (template_end); /**< part of buffer metadata which is
+				   initialized on alloc ends here. It may be
+				   different than cacheline on systems with
+				   buffer cacheline size */
+
+  /***** end of first cache line */
   CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
 
   u32 trace_index; /**< Specifies index into trace buffer
diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h
index b561a91c394..8fbb58d68b3 100644
--- a/src/vlib/buffer_funcs.h
+++ b/src/vlib/buffer_funcs.h
@@ -64,6 +64,12 @@ vlib_get_buffer (vlib_main_t * vm, u32 buffer_index)
   return uword_to_pointer (bm->buffer_mem_start + offset, void *);
 }
 
+static_always_inline void
+vlib_buffer_copy_template (vlib_buffer_t * b, vlib_buffer_t * bt)
+{
+  clib_memcpy_fast (b, bt, STRUCT_OFFSET_OF (vlib_buffer_t, template_end));
+}
+
 /** \brief Translate array of buffer indices into buffer pointers with offset
 
     @param vm - (vlib_main_t *) vlib main data structure pointer
@@ -1011,12 +1017,7 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * dst,
		    CLIB_CACHE_LINE_BYTES * 2);
 
   /* Make sure buffer template is sane. */
-  ASSERT (fl->index == vlib_buffer_get_free_list_index (src));
-
-  clib_memcpy_fast (STRUCT_MARK_PTR (dst, template_start),
-		    STRUCT_MARK_PTR (src, template_start),
-		    STRUCT_OFFSET_OF (vlib_buffer_t, template_end) -
-		    STRUCT_OFFSET_OF (vlib_buffer_t, template_start));
+  vlib_buffer_copy_template (dst, src);
 
   /* Not in the first 16 octets. */
   dst->n_add_refs = src->n_add_refs;
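vlib_buffer_copy_template() copies only the portion of vlib_buffer_t up to the template_end marker, i.e. the metadata a buffer is expected to have initialized on alloc; trace_index and the rest of the second cache line are untouched. A simplified illustration of the marker idea, using a hypothetical struct and plain offsetof() rather than vppinfra's STRUCT_MARK / STRUCT_OFFSET_OF, which serve the equivalent purpose for vlib_buffer_t:

    #include <stddef.h>
    #include <string.h>

    typedef struct
    {
      int current_data;
      int flags;
      unsigned char opaque[40];
      unsigned char template_end[0]; /* zero-size marker, GNU C extension */
      int trace_index;               /* never touched by the template copy */
    } demo_buffer_t;

    static inline void
    demo_copy_template (demo_buffer_t * b, demo_buffer_t * bt)
    {
      /* copy everything up to, but not including, the marker */
      memcpy (b, bt, offsetof (demo_buffer_t, template_end));
    }

Moving template_end below opaque[] in the buffer.h hunk widens the metadata covered by a single template copy, so per-buffer initialization can be one bounded copy with a compile-time size.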
diff --git a/src/vppinfra/string.h b/src/vppinfra/string.h
index 42f7890f3d0..d9cd8fe1af9 100644
--- a/src/vppinfra/string.h
+++ b/src/vppinfra/string.h
@@ -213,74 +213,6 @@ memset_s_inline (void *s, rsize_t smax, int c, rsize_t n)
  */
 #define clib_memset(s,c,n) memset_s_inline(s,n,c,n)
 
-/*
- * Copy 64 bytes of data to 4 destinations
- * this function is typically used in quad-loop case when whole cacheline
- * needs to be copied to 4 different places. First it reads whole cacheline
- * to 1/2/4 SIMD registers and then it writes data to 4 destinations.
- */
-
-static_always_inline void
-clib_memcpy64_x4 (void *d0, void *d1, void *d2, void *d3, void *s)
-{
-#if defined (__AVX512F__)
-  __m512i r0 = _mm512_loadu_si512 (s);
-
-  _mm512_storeu_si512 (d0, r0);
-  _mm512_storeu_si512 (d1, r0);
-  _mm512_storeu_si512 (d2, r0);
-  _mm512_storeu_si512 (d3, r0);
-
-#elif defined (__AVX2__)
-  __m256i r0 = _mm256_loadu_si256 ((__m256i *) (s + 0 * 32));
-  __m256i r1 = _mm256_loadu_si256 ((__m256i *) (s + 1 * 32));
-
-  _mm256_storeu_si256 ((__m256i *) (d0 + 0 * 32), r0);
-  _mm256_storeu_si256 ((__m256i *) (d0 + 1 * 32), r1);
-
-  _mm256_storeu_si256 ((__m256i *) (d1 + 0 * 32), r0);
-  _mm256_storeu_si256 ((__m256i *) (d1 + 1 * 32), r1);
-
-  _mm256_storeu_si256 ((__m256i *) (d2 + 0 * 32), r0);
-  _mm256_storeu_si256 ((__m256i *) (d2 + 1 * 32), r1);
-
-  _mm256_storeu_si256 ((__m256i *) (d3 + 0 * 32), r0);
-  _mm256_storeu_si256 ((__m256i *) (d3 + 1 * 32), r1);
-
-#elif defined (__SSSE3__)
-  __m128i r0 = _mm_loadu_si128 ((__m128i *) (s + 0 * 16));
-  __m128i r1 = _mm_loadu_si128 ((__m128i *) (s + 1 * 16));
-  __m128i r2 = _mm_loadu_si128 ((__m128i *) (s + 2 * 16));
-  __m128i r3 = _mm_loadu_si128 ((__m128i *) (s + 3 * 16));
-
-  _mm_storeu_si128 ((__m128i *) (d0 + 0 * 16), r0);
-  _mm_storeu_si128 ((__m128i *) (d0 + 1 * 16), r1);
-  _mm_storeu_si128 ((__m128i *) (d0 + 2 * 16), r2);
-  _mm_storeu_si128 ((__m128i *) (d0 + 3 * 16), r3);
-
-  _mm_storeu_si128 ((__m128i *) (d1 + 0 * 16), r0);
-  _mm_storeu_si128 ((__m128i *) (d1 + 1 * 16), r1);
-  _mm_storeu_si128 ((__m128i *) (d1 + 2 * 16), r2);
-  _mm_storeu_si128 ((__m128i *) (d1 + 3 * 16), r3);
-
-  _mm_storeu_si128 ((__m128i *) (d2 + 0 * 16), r0);
-  _mm_storeu_si128 ((__m128i *) (d2 + 1 * 16), r1);
-  _mm_storeu_si128 ((__m128i *) (d2 + 2 * 16), r2);
-  _mm_storeu_si128 ((__m128i *) (d2 + 3 * 16), r3);
-
-  _mm_storeu_si128 ((__m128i *) (d3 + 0 * 16), r0);
-  _mm_storeu_si128 ((__m128i *) (d3 + 1 * 16), r1);
-  _mm_storeu_si128 ((__m128i *) (d3 + 2 * 16), r2);
-  _mm_storeu_si128 ((__m128i *) (d3 + 3 * 16), r3);
-
-#else
-  clib_memcpy_fast (d0, s, 64);
-  clib_memcpy_fast (d1, s, 64);
-  clib_memcpy_fast (d2, s, 64);
-  clib_memcpy_fast (d3, s, 64);
-#endif
-}
-
 static_always_inline void
 clib_memset_u64 (void *p, u64 val, uword count)
 {