aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDamjan Marion <damarion@cisco.com>2018-06-02 20:42:07 +0200
committerDave Barach <openvpp@barachs.net>2018-06-03 12:01:12 +0000
commit8855386411af888e47c60645daa1fe6081fa56e1 (patch)
treed8100b16a87f6618df8c3d4c172c8e029f9f8ce6
parente8d7ff5f876c3950caf1bacf603d79b9ada8aef2 (diff)
dpdk: improve buffer alloc performance
This is a ~50% improvement in buffer alloc performance. For a 256 buffer allocation, it was ~10 clocks/buffer; now it is < 5 clocks. Change-Id: I97590e240a79a42bcab5eb26587fc2d11e6eb163 Signed-off-by: Damjan Marion <damarion@cisco.com>
-rw-r--r--src/plugins/dpdk/buffer.c126
-rw-r--r--src/vlib/main.h2
-rw-r--r--src/vlib/threads.c1
3 files changed, 56 insertions, 73 deletions
diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c
index 3b3aaf2379f..78d5becad78 100644
--- a/src/plugins/dpdk/buffer.c
+++ b/src/plugins/dpdk/buffer.c
@@ -88,6 +88,7 @@ STATIC_ASSERT (VLIB_BUFFER_PRE_DATA_SIZE == RTE_PKTMBUF_HEADROOM,
typedef struct
{
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+ struct rte_mbuf **mbuf_alloc_list;
struct rte_mbuf ***mbuf_pending_free_list;
/* cached last pool */
@@ -199,106 +200,91 @@ CLIB_MULTIARCH_FN (dpdk_buffer_fill_free_list) (vlib_main_t * vm,
uword min_free_buffers)
{
dpdk_main_t *dm = &dpdk_main;
- vlib_buffer_t *b0, *b1, *b2, *b3;
- int n, i;
- u32 bi0, bi1, bi2, bi3;
+ dpdk_buffer_main_t *dbm = &dpdk_buffer_main;
+ struct rte_mbuf **mb;
+ uword n_left, first;
+ word n_alloc;
unsigned socket_id = rte_socket_id ();
+ u32 thread_index = vlib_get_thread_index ();
+ dpdk_buffer_per_thread_data *d = vec_elt_at_index (dbm->ptd, thread_index);
struct rte_mempool *rmp = dm->pktmbuf_pools[socket_id];
dpdk_mempool_private_t *privp = rte_mempool_get_priv (rmp);
- struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
vlib_buffer_t bt;
+ u32 *bi;
/* Too early? */
if (PREDICT_FALSE (rmp == 0))
return 0;
/* Already have enough free buffers on free list? */
- n = min_free_buffers - vec_len (fl->buffers);
- if (n <= 0)
+ n_alloc = min_free_buffers - vec_len (fl->buffers);
+ if (n_alloc <= 0)
return min_free_buffers;
/* Always allocate round number of buffers. */
- n = round_pow2 (n, CLIB_CACHE_LINE_BYTES / sizeof (u32));
+ n_alloc = round_pow2 (n_alloc, CLIB_CACHE_LINE_BYTES / sizeof (u32));
/* Always allocate new buffers in reasonably large sized chunks. */
- n = clib_max (n, fl->min_n_buffers_each_alloc);
+ n_alloc = clib_max (n_alloc, fl->min_n_buffers_each_alloc);
- vec_validate_aligned (vm->mbuf_alloc_list, n - 1, CLIB_CACHE_LINE_BYTES);
+ vec_validate_aligned (d->mbuf_alloc_list, n_alloc - 1,
+ CLIB_CACHE_LINE_BYTES);
- if (rte_mempool_get_bulk (rmp, vm->mbuf_alloc_list, n) < 0)
+ if (rte_mempool_get_bulk (rmp, (void *) d->mbuf_alloc_list, n_alloc) < 0)
return 0;
memset (&bt, 0, sizeof (vlib_buffer_t));
vlib_buffer_init_for_free_list (&bt, fl);
bt.buffer_pool_index = privp->buffer_pool_index;
- _vec_len (vm->mbuf_alloc_list) = n;
-
- i = 0;
- int f = vec_len (fl->buffers);
- vec_resize_aligned (fl->buffers, n, CLIB_CACHE_LINE_BYTES);
-
- while (i < (n - 7))
- {
- vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf
- (vm->mbuf_alloc_list[i + 4]), STORE);
- vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf
- (vm->mbuf_alloc_list[i + 5]), STORE);
- vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf
- (vm->mbuf_alloc_list[i + 6]), STORE);
- vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf
- (vm->mbuf_alloc_list[i + 7]), STORE);
-
- mb0 = vm->mbuf_alloc_list[i];
- mb1 = vm->mbuf_alloc_list[i + 1];
- mb2 = vm->mbuf_alloc_list[i + 2];
- mb3 = vm->mbuf_alloc_list[i + 3];
-
- b0 = vlib_buffer_from_rte_mbuf (mb0);
- b1 = vlib_buffer_from_rte_mbuf (mb1);
- b2 = vlib_buffer_from_rte_mbuf (mb2);
- b3 = vlib_buffer_from_rte_mbuf (mb3);
-
- bi0 = vlib_get_buffer_index (vm, b0);
- bi1 = vlib_get_buffer_index (vm, b1);
- bi2 = vlib_get_buffer_index (vm, b2);
- bi3 = vlib_get_buffer_index (vm, b3);
-
- fl->buffers[f++] = bi0;
- fl->buffers[f++] = bi1;
- fl->buffers[f++] = bi2;
- fl->buffers[f++] = bi3;
-
- clib_memcpy64_x4 (b0, b1, b2, b3, &bt);
-
- if (fl->buffer_init_function)
- {
- fl->buffer_init_function (vm, fl, &bi0, 1);
- fl->buffer_init_function (vm, fl, &bi1, 1);
- fl->buffer_init_function (vm, fl, &bi2, 1);
- fl->buffer_init_function (vm, fl, &bi3, 1);
- }
- i += 4;
- }
+ _vec_len (d->mbuf_alloc_list) = n_alloc;
- while (i < n)
- {
- mb0 = vm->mbuf_alloc_list[i];
+ first = vec_len (fl->buffers);
+ vec_resize_aligned (fl->buffers, n_alloc, CLIB_CACHE_LINE_BYTES);
- b0 = vlib_buffer_from_rte_mbuf (mb0);
- bi0 = vlib_get_buffer_index (vm, b0);
+ n_left = n_alloc;
+ mb = d->mbuf_alloc_list;
+ bi = fl->buffers + first;
- fl->buffers[f++] = bi0;
- clib_memcpy (b0, &bt, sizeof (vlib_buffer_t));
+ ASSERT (n_left % 8 == 0);
- if (fl->buffer_init_function)
- fl->buffer_init_function (vm, fl, &bi0, 1);
- i++;
+ while (n_left >= 8)
+ {
+ if (PREDICT_FALSE (n_left < 24))
+ goto no_prefetch;
+
+ vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[16]), STORE);
+ vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[17]), STORE);
+ vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[18]), STORE);
+ vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[19]), STORE);
+ vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[20]), STORE);
+ vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[21]), STORE);
+ vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[22]), STORE);
+ vlib_prefetch_buffer_header (vlib_buffer_from_rte_mbuf (mb[23]), STORE);
+
+ no_prefetch:
+ vlib_get_buffer_indices_with_offset (vm, (void **) mb, bi, 8,
+ sizeof (struct rte_mbuf));
+ clib_memcpy64_x4 (vlib_buffer_from_rte_mbuf (mb[0]),
+ vlib_buffer_from_rte_mbuf (mb[1]),
+ vlib_buffer_from_rte_mbuf (mb[2]),
+ vlib_buffer_from_rte_mbuf (mb[3]), &bt);
+ clib_memcpy64_x4 (vlib_buffer_from_rte_mbuf (mb[4]),
+ vlib_buffer_from_rte_mbuf (mb[5]),
+ vlib_buffer_from_rte_mbuf (mb[6]),
+ vlib_buffer_from_rte_mbuf (mb[7]), &bt);
+
+ n_left -= 8;
+ mb += 8;
+ bi += 8;
}
- fl->n_alloc += n;
+ if (fl->buffer_init_function)
+ fl->buffer_init_function (vm, fl, fl->buffers + first, n_alloc);
+
+ fl->n_alloc += n_alloc;
- return n;
+ return n_alloc;
}
static_always_inline void
diff --git a/src/vlib/main.h b/src/vlib/main.h
index 57b1efb7513..16e4120067d 100644
--- a/src/vlib/main.h
+++ b/src/vlib/main.h
@@ -175,8 +175,6 @@ typedef struct vlib_main_t
/* to compare with node runtime */
u32 thread_index;
- void **mbuf_alloc_list;
-
/* List of init functions to call, setup by constructors */
_vlib_init_function_list_elt_t *init_function_registrations;
_vlib_init_function_list_elt_t *worker_init_function_registrations;
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index edf5a0e0711..bbe94c7f272 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -827,7 +827,6 @@ start_workers (vlib_main_t * vm)
vm_clone->thread_index = worker_thread_index;
vm_clone->heap_base = w->thread_mheap;
- vm_clone->mbuf_alloc_list = 0;
vm_clone->init_functions_called =
hash_create (0, /* value bytes */ 0);
vm_clone->pending_rpc_requests = 0;