diff options
Diffstat (limited to 'src/vlib/buffer_funcs.h')
-rw-r--r-- | src/vlib/buffer_funcs.h | 247 |
1 file changed, 141 insertions(+), 106 deletions(-)
diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 77964fde821..010289ce2be 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -42,6 +42,7 @@ #include <vppinfra/hash.h> #include <vppinfra/fifo.h> +#include <vppinfra/vector/index_to_ptr.h> #include <vlib/buffer.h> #include <vlib/physmem_funcs.h> #include <vlib/main.h> @@ -55,24 +56,38 @@ typedef void (vlib_buffer_enqueue_to_next_fn_t) (vlib_main_t *vm, vlib_node_runtime_t *node, u32 *buffers, u16 *nexts, uword count); +typedef void (vlib_buffer_enqueue_to_next_with_aux_fn_t) ( + vlib_main_t *vm, vlib_node_runtime_t *node, u32 *buffers, u32 *aux_data, + u16 *nexts, uword count); typedef void (vlib_buffer_enqueue_to_single_next_fn_t) ( vlib_main_t *vm, vlib_node_runtime_t *node, u32 *ers, u16 next_index, u32 count); +typedef void (vlib_buffer_enqueue_to_single_next_with_aux_fn_t) ( + vlib_main_t *vm, vlib_node_runtime_t *node, u32 *ers, u32 *aux_data, + u16 next_index, u32 count); + typedef u32 (vlib_buffer_enqueue_to_thread_fn_t) ( vlib_main_t *vm, vlib_node_runtime_t *node, u32 frame_queue_index, u32 *buffer_indices, u16 *thread_indices, u32 n_packets, int drop_on_congestion); -typedef u32 (vlib_frame_queue_dequeue_fn_t) (vlib_main_t *vm, - vlib_frame_queue_main_t *fqm); +typedef u32 (vlib_buffer_enqueue_to_thread_with_aux_fn_t) ( + vlib_main_t *vm, vlib_node_runtime_t *node, u32 frame_queue_index, + u32 *buffer_indices, u32 *aux, u16 *thread_indices, u32 n_packets, + int drop_on_congestion); typedef struct { vlib_buffer_enqueue_to_next_fn_t *buffer_enqueue_to_next_fn; + vlib_buffer_enqueue_to_next_with_aux_fn_t + *buffer_enqueue_to_next_with_aux_fn; vlib_buffer_enqueue_to_single_next_fn_t *buffer_enqueue_to_single_next_fn; + vlib_buffer_enqueue_to_single_next_with_aux_fn_t + *buffer_enqueue_to_single_next_with_aux_fn; vlib_buffer_enqueue_to_thread_fn_t *buffer_enqueue_to_thread_fn; - vlib_frame_queue_dequeue_fn_t *frame_queue_dequeue_fn; + 
vlib_buffer_enqueue_to_thread_with_aux_fn_t + *buffer_enqueue_to_thread_with_aux_fn; } vlib_buffer_func_main_t; extern vlib_buffer_func_main_t vlib_buffer_func_main; @@ -166,7 +181,6 @@ vlib_buffer_copy_indices_to_ring (u32 * ring, u32 * src, u32 start, } } -STATIC_ASSERT_OFFSET_OF (vlib_buffer_t, template_end, 64); static_always_inline void vlib_buffer_copy_template (vlib_buffer_t * b, vlib_buffer_t * bt) { @@ -201,102 +215,38 @@ vlib_buffer_pool_get_default_for_numa (vlib_main_t * vm, u32 numa_node) @param offset - (i32) offset applied to each pointer */ static_always_inline void -vlib_get_buffers_with_offset (vlib_main_t * vm, u32 * bi, void **b, int count, +vlib_get_buffers_with_offset (vlib_main_t *vm, u32 *bi, void **b, u32 count, i32 offset) { uword buffer_mem_start = vm->buffer_main->buffer_mem_start; -#ifdef CLIB_HAVE_VEC512 - u64x8 of8 = u64x8_splat (buffer_mem_start + offset); - u64x4 off = u64x8_extract_lo (of8); - /* if count is not const, compiler will not unroll while loop - se we maintain two-in-parallel variant */ - while (count >= 32) - { - u64x8 b0 = u64x8_from_u32x8 (u32x8_load_unaligned (bi)); - u64x8 b1 = u64x8_from_u32x8 (u32x8_load_unaligned (bi + 8)); - u64x8 b2 = u64x8_from_u32x8 (u32x8_load_unaligned (bi + 16)); - u64x8 b3 = u64x8_from_u32x8 (u32x8_load_unaligned (bi + 24)); - /* shift and add to get vlib_buffer_t pointer */ - u64x8_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b); - u64x8_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b + 8); - u64x8_store_unaligned ((b2 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b + 16); - u64x8_store_unaligned ((b3 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b + 24); - b += 32; - bi += 32; - count -= 32; - } - while (count >= 8) - { - u64x8 b0 = u64x8_from_u32x8 (u32x8_load_unaligned (bi)); - /* shift and add to get vlib_buffer_t pointer */ - u64x8_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + of8, b); - b += 8; - bi += 8; - count -= 8; - } -#elif defined CLIB_HAVE_VEC256 - u64x4 
off = u64x4_splat (buffer_mem_start + offset); - /* if count is not const, compiler will not unroll while loop - se we maintain two-in-parallel variant */ - while (count >= 32) - { - u64x4 b0 = u64x4_from_u32x4 (u32x4_load_unaligned (bi)); - u64x4 b1 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 4)); - u64x4 b2 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 8)); - u64x4 b3 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 12)); - u64x4 b4 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 16)); - u64x4 b5 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 20)); - u64x4 b6 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 24)); - u64x4 b7 = u64x4_from_u32x4 (u32x4_load_unaligned (bi + 28)); - /* shift and add to get vlib_buffer_t pointer */ - u64x4_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b); - u64x4_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 4); - u64x4_store_unaligned ((b2 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 8); - u64x4_store_unaligned ((b3 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 12); - u64x4_store_unaligned ((b4 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 16); - u64x4_store_unaligned ((b5 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 20); - u64x4_store_unaligned ((b6 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 24); - u64x4_store_unaligned ((b7 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 28); - b += 32; - bi += 32; - count -= 32; - } -#endif - while (count >= 4) - { -#ifdef CLIB_HAVE_VEC256 - u64x4 b0 = u64x4_from_u32x4 (u32x4_load_unaligned (bi)); - /* shift and add to get vlib_buffer_t pointer */ - u64x4_store_unaligned ((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b); -#elif defined (CLIB_HAVE_VEC128) - u64x2 off = u64x2_splat (buffer_mem_start + offset); - u32x4 bi4 = u32x4_load_unaligned (bi); - u64x2 b0 = u64x2_from_u32x4 ((u32x4) bi4); -#if defined (__aarch64__) - u64x2 b1 = u64x2_from_u32x4_high ((u32x4) bi4); -#else - bi4 = u32x4_shuffle (bi4, 2, 3, 0, 1); - u64x2 b1 = u64x2_from_u32x4 ((u32x4) bi4); -#endif - u64x2_store_unaligned 
((b0 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b); - u64x2_store_unaligned ((b1 << CLIB_LOG2_CACHE_LINE_BYTES) + off, b + 2); -#else - b[0] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[0], offset); - b[1] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[1], offset); - b[2] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[2], offset); - b[3] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[3], offset); -#endif - b += 4; - bi += 4; - count -= 4; - } - while (count) + void *base = (void *) (buffer_mem_start + offset); + int objsize = __builtin_object_size (b, 0); + const int sh = CLIB_LOG2_CACHE_LINE_BYTES; + + if (COMPILE_TIME_CONST (count) == 0 && objsize >= 64 * sizeof (b[0]) && + (objsize & ((8 * sizeof (b[0])) - 1)) == 0) { - b[0] = vlib_buffer_ptr_from_index (buffer_mem_start, bi[0], offset); - b += 1; - bi += 1; - count -= 1; + u32 n = round_pow2 (count, 8); + ASSERT (objsize >= count); + CLIB_ASSUME (objsize >= count); + while (n >= 64) + { + clib_index_to_ptr_u32 (bi, base, sh, b, 64); + b += 64; + bi += 64; + n -= 64; + } + + while (n) + { + clib_index_to_ptr_u32 (bi, base, sh, b, 8); + b += 8; + bi += 8; + n -= 8; + } } + else + clib_index_to_ptr_u32 (bi, base, sh, b, count); } /** \brief Translate array of buffer indices into buffer pointers @@ -308,7 +258,7 @@ vlib_get_buffers_with_offset (vlib_main_t * vm, u32 * bi, void **b, int count, */ static_always_inline void -vlib_get_buffers (vlib_main_t * vm, u32 * bi, vlib_buffer_t ** b, int count) +vlib_get_buffers (vlib_main_t *vm, u32 *bi, vlib_buffer_t **b, u32 count) { vlib_get_buffers_with_offset (vm, bi, (void **) b, count, 0); } @@ -803,6 +753,23 @@ vlib_buffer_pool_put (vlib_main_t * vm, u8 buffer_pool_index, clib_spinlock_unlock (&bp->lock); } +/** \brief return unused buffers back to pool + This function can be used to return buffers back to pool without going + through vlib_buffer_free. Buffer metadata must not be modified in any + way before buffers are returned. 
+ + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffers - (u32 * ) buffer index array + @param n_buffers - (u32) number of buffers to free + @param buffer_pool_index - (u8) buffer pool index +*/ +always_inline void +vlib_buffer_unalloc_to_pool (vlib_main_t *vm, u32 *buffers, u32 n_buffers, + u8 buffer_pool_index) +{ + vlib_buffer_pool_put (vm, buffer_pool_index, buffers, n_buffers); +} + static_always_inline void vlib_buffer_free_inline (vlib_main_t * vm, u32 * buffers, u32 n_buffers, int maybe_next) @@ -810,8 +777,8 @@ vlib_buffer_free_inline (vlib_main_t * vm, u32 * buffers, u32 n_buffers, const int queue_size = 128; vlib_buffer_pool_t *bp = 0; u8 buffer_pool_index = ~0; - u32 n_queue = 0, queue[queue_size + 4]; - vlib_buffer_t bt = { }; + u32 n_queue = 0, queue[queue_size + 8]; + vlib_buffer_template_t bt = {}; #if defined(CLIB_HAVE_VEC128) vlib_buffer_t bpi_mask = {.buffer_pool_index = ~0 }; vlib_buffer_t bpi_vec = {}; @@ -827,7 +794,7 @@ vlib_buffer_free_inline (vlib_main_t * vm, u32 * buffers, u32 n_buffers, vlib_buffer_t *b = vlib_get_buffer (vm, buffers[0]); buffer_pool_index = b->buffer_pool_index; bp = vlib_get_buffer_pool (vm, buffer_pool_index); - vlib_buffer_copy_template (&bt, &bp->buffer_template); + bt = bp->buffer_template; #if defined(CLIB_HAVE_VEC128) bpi_vec.buffer_pool_index = buffer_pool_index; #endif @@ -837,9 +804,16 @@ vlib_buffer_free_inline (vlib_main_t * vm, u32 * buffers, u32 n_buffers, vlib_buffer_t *b[8]; u32 bi, sum = 0, flags, next; +#if defined(CLIB_HAVE_VEC512) + if (n_buffers < 8) +#else if (n_buffers < 4) +#endif goto one_by_one; +#if defined(CLIB_HAVE_VEC512) + vlib_get_buffers (vm, buffers, b, 8); +#else vlib_get_buffers (vm, buffers, b, 4); if (n_buffers >= 12) @@ -850,8 +824,33 @@ vlib_buffer_free_inline (vlib_main_t * vm, u32 * buffers, u32 n_buffers, vlib_prefetch_buffer_header (b[6], LOAD); vlib_prefetch_buffer_header (b[7], LOAD); } +#endif -#if defined(CLIB_HAVE_VEC128) +#if 
defined(CLIB_HAVE_VEC512) + u8x16 p0, p1, p2, p3, p4, p5, p6, p7, r; + p0 = u8x16_load_unaligned (b[0]); + p1 = u8x16_load_unaligned (b[1]); + p2 = u8x16_load_unaligned (b[2]); + p3 = u8x16_load_unaligned (b[3]); + p4 = u8x16_load_unaligned (b[4]); + p5 = u8x16_load_unaligned (b[5]); + p6 = u8x16_load_unaligned (b[6]); + p7 = u8x16_load_unaligned (b[7]); + + r = p0 ^ bpi_vec.as_u8x16[0]; + r |= p1 ^ bpi_vec.as_u8x16[0]; + r |= p2 ^ bpi_vec.as_u8x16[0]; + r |= p3 ^ bpi_vec.as_u8x16[0]; + r |= p4 ^ bpi_vec.as_u8x16[0]; + r |= p5 ^ bpi_vec.as_u8x16[0]; + r |= p6 ^ bpi_vec.as_u8x16[0]; + r |= p7 ^ bpi_vec.as_u8x16[0]; + r &= bpi_mask.as_u8x16[0]; + r |= + (p0 | p1 | p2 | p3 | p4 | p5 | p6 | p7) & flags_refs_mask.as_u8x16[0]; + + sum = !u8x16_is_all_zero (r); +#elif defined(CLIB_HAVE_VEC128) u8x16 p0, p1, p2, p3, r; p0 = u8x16_load_unaligned (b[0]); p1 = u8x16_load_unaligned (b[1]); @@ -885,11 +884,41 @@ vlib_buffer_free_inline (vlib_main_t * vm, u32 * buffers, u32 n_buffers, if (sum) goto one_by_one; +#if defined(CLIB_HAVE_VEC512) + vlib_buffer_copy_indices (queue + n_queue, buffers, 8); + b[0]->template = bt; + b[1]->template = bt; + b[2]->template = bt; + b[3]->template = bt; + b[4]->template = bt; + b[5]->template = bt; + b[6]->template = bt; + b[7]->template = bt; + n_queue += 8; + + vlib_buffer_validate (vm, b[0]); + vlib_buffer_validate (vm, b[1]); + vlib_buffer_validate (vm, b[2]); + vlib_buffer_validate (vm, b[3]); + vlib_buffer_validate (vm, b[4]); + vlib_buffer_validate (vm, b[5]); + vlib_buffer_validate (vm, b[6]); + vlib_buffer_validate (vm, b[7]); + + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[1]); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[2]); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[3]); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[4]); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[5]); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[6]); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[7]); +#else vlib_buffer_copy_indices (queue + n_queue, buffers, 4); 
- vlib_buffer_copy_template (b[0], &bt); - vlib_buffer_copy_template (b[1], &bt); - vlib_buffer_copy_template (b[2], &bt); - vlib_buffer_copy_template (b[3], &bt); + b[0]->template = bt; + b[1]->template = bt; + b[2]->template = bt; + b[3]->template = bt; n_queue += 4; vlib_buffer_validate (vm, b[0]); @@ -901,14 +930,20 @@ vlib_buffer_free_inline (vlib_main_t * vm, u32 * buffers, u32 n_buffers, VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[1]); VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[2]); VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[3]); +#endif if (n_queue >= queue_size) { vlib_buffer_pool_put (vm, buffer_pool_index, queue, n_queue); n_queue = 0; } +#if defined(CLIB_HAVE_VEC512) + buffers += 8; + n_buffers -= 8; +#else buffers += 4; n_buffers -= 4; +#endif continue; one_by_one: @@ -933,7 +968,7 @@ vlib_buffer_free_inline (vlib_main_t * vm, u32 * buffers, u32 n_buffers, bpi_vec.buffer_pool_index = buffer_pool_index; #endif bp = vlib_get_buffer_pool (vm, buffer_pool_index); - vlib_buffer_copy_template (&bt, &bp->buffer_template); + bt = bp->buffer_template; } vlib_buffer_validate (vm, b[0]); @@ -942,7 +977,7 @@ vlib_buffer_free_inline (vlib_main_t * vm, u32 * buffers, u32 n_buffers, if (clib_atomic_sub_fetch (&b[0]->ref_count, 1) == 0) { - vlib_buffer_copy_template (b[0], &bt); + b[0]->template = bt; queue[n_queue++] = bi; } |