From afe56de947822bb981bd30242f4e3c2c469f9ecc Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 17 May 2018 12:44:00 +0200 Subject: Add buffer pointer-to-index and index-to-pointer array functions Change-Id: Ib3fcc3ceb7f315389bcdecbb7d9632540a5dd6ba Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/device.c | 50 ++------------ src/plugins/dpdk/device/node.c | 63 +---------------- src/vlib/buffer_funcs.h | 143 +++++++++++++++++++++++++++++++++++++++ src/vppinfra/vector_avx2.h | 22 ++++++ 4 files changed, 172 insertions(+), 106 deletions(-) (limited to 'src') diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index cdd9785a3f3..0ac798fa704 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -345,12 +345,6 @@ CLIB_MULTIARCH_FN (dpdk_interface_tx) (vlib_main_t * vm, thread_index); struct rte_mbuf **mb; vlib_buffer_t *b[4]; -#ifdef CLIB_HAVE_VEC256 - u64x4 off4 = u64x4_splat (buffer_main.buffer_mem_start - - sizeof (struct rte_mbuf)); - u32x8 permute_mask = { 0, 4, 1, 5, 2, 6, 3, 7 }; - u32x8 zero = { 0 }; -#endif from = vlib_frame_vector_args (f); @@ -373,46 +367,10 @@ CLIB_MULTIARCH_FN (dpdk_interface_tx) (vlib_main_t * vm, } /* calculate rte_mbuf pointers out of buffer indices */ - from = vlib_frame_vector_args (f); - n_left = n_packets; - mb = ptd->mbufs; - while (n_left >= 8) - { -#ifdef CLIB_HAVE_VEC256 - u32x8 bi0, bi1; - u64x4 mb0, mb1; - /* load 4 bufer indices into lower part of 256-bit register */ - bi0 = u32x8_insert_lo (zero, u32x4_load_unaligned (from)); - bi1 = u32x8_insert_lo (zero, u32x4_load_unaligned (from + 4)); - /* permute 256-bit register so each buffer index is in own u64 */ - mb0 = (u64x4) u32x8_permute (bi0, permute_mask); - mb1 = (u64x4) u32x8_permute (bi1, permute_mask); - /* shift and add to get rte_mbuf pointer */ - mb0 <<= CLIB_LOG2_CACHE_LINE_BYTES; - mb1 <<= CLIB_LOG2_CACHE_LINE_BYTES; - u64x4_store_unaligned (mb0 + off4, mb); - u64x4_store_unaligned (mb1 + off4, mb + 4); -#else - mb[0] = rte_mbuf_from_vlib_buffer (vlib_get_buffer (vm, from[0])); - mb[1] = rte_mbuf_from_vlib_buffer (vlib_get_buffer (vm, from[1])); - mb[2] = rte_mbuf_from_vlib_buffer (vlib_get_buffer (vm, from[2])); - mb[3] = rte_mbuf_from_vlib_buffer (vlib_get_buffer (vm, from[3])); - mb[4] = rte_mbuf_from_vlib_buffer (vlib_get_buffer (vm, from[4])); - mb[5] = rte_mbuf_from_vlib_buffer (vlib_get_buffer (vm, from[5])); - mb[6] = rte_mbuf_from_vlib_buffer (vlib_get_buffer (vm, from[6])); - mb[7] = rte_mbuf_from_vlib_buffer (vlib_get_buffer (vm, from[7])); -#endif - from += 8; - mb += 8; - n_left -= 8; - } - while (n_left) - { - mb[0] = rte_mbuf_from_vlib_buffer (vlib_get_buffer (vm, from[0])); - from++; - mb++; - n_left--; - } + vlib_get_buffers_with_offset (vm, vlib_frame_vector_args (f), + (void **) ptd->mbufs, n_packets, + -(i32) sizeof (struct rte_mbuf)); + from = vlib_frame_vector_args (f); n_left = n_packets; mb = ptd->mbufs; diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index 7ba4dad8ae9..3311ac4830d 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -213,65 +213,6 @@ poll_rate_limit (dpdk_main_t * dm) xd->per_interface_next_index */ -static_always_inline void -dpdk_mbufs_to_buffer_indices (vlib_main_t * vm, struct rte_mbuf **mb, - u32 * bi, uword n_left) -{ -#ifdef CLIB_HAVE_VEC256 - u32x8 mask = { 0, 2, 4, 6, 1, 3, 5, 7 }; - u64x4 off4 = u64x4_splat (buffer_main.buffer_mem_start - - sizeof (struct rte_mbuf)); -#endif - - while (n_left >= 8) - { -#ifdef CLIB_HAVE_VEC256 - /* load 4 pointers into 256-bit register */ - u64x4 v0 = u64x4_load_unaligned (mb); - u64x4 v1 = u64x4_load_unaligned (mb + 4); - u32x8 v2, v3; - - /* calculate 4 buffer indices in parallel - vlib_buffer_t is straight after rte_mbuf so advance all 4 - pointers for size of rte_mbuf */ - v0 -= off4; - v1 -= off4; - - v0 >>= CLIB_LOG2_CACHE_LINE_BYTES; - v1 >>= CLIB_LOG2_CACHE_LINE_BYTES; - - /* permute 256-bit register so lower u32s of each buffer index are - * placed into lower 128-bits */ - v2 = u32x8_permute ((u32x8) v0, mask); - v3 = u32x8_permute ((u32x8) v1, mask); - - /* extract lower 128-bits and save them to the array of buffer indices */ - u32x4_store_unaligned (u32x8_extract_lo (v2), bi); - u32x4_store_unaligned (u32x8_extract_lo (v3), bi + 4); -#else - /* equivalent non-nector implementation */ - bi[0] = vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[0])); - bi[1] = vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[1])); - bi[2] = vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[2])); - bi[3] = vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[3])); - bi[4] = vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[4])); - bi[5] = vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[5])); - bi[6] = vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[6])); - bi[7] = vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[7])); -#endif - bi += 8; - mb += 8; - n_left -= 8; - } - while (n_left) - { - bi[0] = vlib_get_buffer_index (vm, vlib_buffer_from_rte_mbuf (mb[0])); - bi += 1; - mb += 1; - n_left -= 1; - } -} - static_always_inline u8 dpdk_ol_flags_extract (struct rte_mbuf **mb, u8 * flags, int count) { @@ -604,7 +545,9 @@ dpdk_device_input (vlib_main_t * vm, dpdk_main_t * dm, dpdk_device_t * xd, } /* enqueue buffers to the next node */ - dpdk_mbufs_to_buffer_indices (vm, ptd->mbufs, ptd->buffers, n_rx_packets); + vlib_get_buffer_indices_with_offset (vm, (void **) ptd->mbufs, ptd->buffers, + n_rx_packets, + sizeof (struct rte_mbuf)); n_left = n_rx_packets; next = ptd->next; buffers = ptd->buffers; diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index ba87d9566c7..6072b2ea44b 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -63,6 +63,73 @@ vlib_get_buffer (vlib_main_t * vm, u32 buffer_index) return uword_to_pointer (bm->buffer_mem_start + offset, void *); } +/** \brief Translate array of buffer indices into buffer pointers with offset + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param bi - (u32 *) array of buffer indices + @param b - (void **) array to store buffer pointers + @param count - (uword) number of elements + @param offset - (i32) offset applied to each pointer +*/ +static_always_inline void +vlib_get_buffers_with_offset (vlib_main_t * vm, u32 * bi, void **b, int count, + i32 offset) +{ +#ifdef CLIB_HAVE_VEC256 + u64x4 off = u64x4_splat (buffer_main.buffer_mem_start + offset); + /* if count is not const, compiler will not unroll while loop + se we maintain two-in-parallel variant */ + while (count >= 8) + { + u64x4 b0 = u32x4_extend_to_u64x4 (u32x4_load_unaligned (bi)); + u64x4 b1 = u32x4_extend_to_u64x4 (u32x4_load_unaligned (bi + 4)); + /* shift and add to get vlib_buffer_t pointer */ + u64x4_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b); + u64x4_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 4); + b += 8; + bi += 8; + count -= 8; + } +#endif + while (count >= 4) + { +#ifdef CLIB_HAVE_VEC256 + u64x4 b0 = u32x4_extend_to_u64x4 (u32x4_load_unaligned (bi)); + /* shift and add to get vlib_buffer_t pointer */ + u64x4_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b); +#else + b[0] = ((u8 *) vlib_get_buffer (vm, bi[0])) + offset; + b[1] = ((u8 *) vlib_get_buffer (vm, bi[1])) + offset; + b[2] = ((u8 *) vlib_get_buffer (vm, bi[2])) + offset; + b[3] = ((u8 *) vlib_get_buffer (vm, bi[3])) + offset; +#endif + b += 4; + bi += 4; + count -= 4; + } + while (count) + { + b[0] = ((u8 *) vlib_get_buffer (vm, bi[0])) + offset; + b += 1; + bi += 1; + count -= 1; + } +} + +/** \brief Translate array of buffer indices into buffer pointers + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param bi - (u32 *) array of buffer indices + @param b - (vlib_buffer_t **) array to store buffer pointers + @param count - (uword) number of elements +*/ + +static_always_inline void +vlib_get_buffers (vlib_main_t * vm, u32 * bi, vlib_buffer_t ** b, int count) +{ + vlib_get_buffers_with_offset (vm, bi, (void **) b, count, 0); +} + /** \brief Translate buffer pointer into buffer index @param vm - (vlib_main_t *) vlib main data structure pointer @@ -81,6 +148,82 @@ vlib_get_buffer_index (vlib_main_t * vm, void *p) return offset >> CLIB_LOG2_CACHE_LINE_BYTES; } +/** \brief Translate array of buffer pointers into buffer indices with offset + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param b - (void **) array of buffer pointers + @param bi - (u32 *) array to store buffer indices + @param count - (uword) number of elements + @param offset - (i32) offset applied to each pointer +*/ +static_always_inline void +vlib_get_buffer_indices_with_offset (vlib_main_t * vm, void **b, u32 * bi, + uword count, i32 offset) +{ +#ifdef CLIB_HAVE_VEC256 + u32x8 mask = { 0, 2, 4, 6, 1, 3, 5, 7 }; + u64x4 off4 = u64x4_splat (buffer_main.buffer_mem_start - offset); + + while (count >= 8) + { + /* load 4 pointers into 256-bit register */ + u64x4 v0 = u64x4_load_unaligned (b); + u64x4 v1 = u64x4_load_unaligned (b + 4); + u32x8 v2, v3; + + v0 -= off4; + v1 -= off4; + + v0 >>= CLIB_LOG2_CACHE_LINE_BYTES; + v1 >>= CLIB_LOG2_CACHE_LINE_BYTES; + + /* permute 256-bit register so lower u32s of each buffer index are + * placed into lower 128-bits */ + v2 = u32x8_permute ((u32x8) v0, mask); + v3 = u32x8_permute ((u32x8) v1, mask); + + /* extract lower 128-bits and save them to the array of buffer indices */ + u32x4_store_unaligned (u32x8_extract_lo (v2), bi); + u32x4_store_unaligned (u32x8_extract_lo (v3), bi + 4); + bi += 8; + b += 8; + count -= 8; + } +#endif + while (count >= 4) + { + /* equivalent non-nector implementation */ + bi[0] = vlib_get_buffer_index (vm, ((u8 *) b[0]) + offset); + bi[1] = vlib_get_buffer_index (vm, ((u8 *) b[1]) + offset); + bi[2] = vlib_get_buffer_index (vm, ((u8 *) b[2]) + offset); + bi[3] = vlib_get_buffer_index (vm, ((u8 *) b[3]) + offset); + bi += 4; + b += 4; + count -= 4; + } + while (count) + { + bi[0] = vlib_get_buffer_index (vm, b[0] + offset); + bi += 1; + b += 1; + count -= 1; + } +} + +/** \brief Translate array of buffer pointers into buffer indices + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param b - (vlib_buffer_t **) array of buffer pointers + @param bi - (u32 *) array to store buffer indices + @param count - (uword) number of elements +*/ +static_always_inline void +vlib_get_buffer_indices (vlib_main_t * vm, vlib_buffer_t ** b, u32 * bi, + uword count) +{ + vlib_get_buffer_indices_with_offset (vm, (void **) b, bi, count, 0); +} + /** \brief Get next buffer in buffer linklist, or zero for end of list. @param vm - (vlib_main_t *) vlib main data structure pointer diff --git a/src/vppinfra/vector_avx2.h b/src/vppinfra/vector_avx2.h index 9c1ce4700c5..f651392559b 100644 --- a/src/vppinfra/vector_avx2.h +++ b/src/vppinfra/vector_avx2.h @@ -81,6 +81,28 @@ u32x8_insert_hi (u32x8 v1, u32x4 v2) return (u32x8) _mm256_inserti128_si256 ((__m256i) v1, (__m128i) v2, 1); } +/* _extend_to_ */ +/* *INDENT-OFF* */ +#define _(f,t,i) \ +static_always_inline t \ +f##_extend_to_##t (f x) \ +{ return (t) _mm256_cvt##i ((__m128i) x); } + +_(u16x8, u32x8, epu16_epi32) +_(u16x8, u64x4, epu16_epi64) +_(u32x4, u64x4, epu32_epi64) +_(u8x16, u16x16, epu8_epi64) +_(u8x16, u32x8, epu8_epi32) +_(u8x16, u64x4, epu8_epi64) +_(i16x8, i32x8, epi16_epi32) +_(i16x8, i64x4, epi16_epi64) +_(i32x4, i64x4, epi32_epi64) +_(i8x16, i16x16, epi8_epi64) +_(i8x16, i32x8, epi8_epi32) +_(i8x16, i64x4, epi8_epi64) +#undef _ +/* *INDENT-ON* */ + #endif /* included_vector_avx2_h */ /* -- cgit 1.2.3-korg