author    Marvin Liu <yong.liu@intel.com>    2023-03-15 01:01:38 +0800
committer Damjan Marion <dmarion@0xa5.net>   2023-04-25 15:18:27 +0000
commit    cada0c5075ebf4c59db3192f190b35bf588fac34 (patch)
tree      f5852b11087ab0c2f2aa13a6e36ca96d6b568787 /src/plugins/memif/device.c
parent    efad24a84d35458e2c672b94027e54923a42fd25 (diff)
memif: support dma option
Introduce an async model into memif by utilizing the new DMA API. The
original TX process is broken down into a submission stage and a
completion stage. As multiple submissions may be in flight simultaneously,
per-thread data is no longer safe; it is replaced by data carried in each
DMA data structure. Since the slave side already supports zero-copy mode,
the DMA option is only added on the master side.

Type: feature

Signed-off-by: Marvin Liu <yong.liu@intel.com>
Change-Id: I084f253866f5127cdc73b9a08c8ce73b091488f3
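The change follows the usual vlib DMA batch pattern. A minimal sketch
distilled from the diff below (dst, src, len and queue_index are
illustrative placeholders; the dma_info bookkeeping, retries and the CPU
fallback are omitted):

    /* submission stage, in the TX node */
    vlib_dma_batch_t *b = vlib_dma_batch_new (vm, mif->dma_tx_config);
    vlib_dma_batch_add (vm, b, dst, src, len); /* one call per copy op */
    vlib_dma_batch_set_cookie (vm, b, (mif_id << 16) | queue_index);
    vlib_dma_batch_submit (vm, b);

    /* completion callback: free the sent buffers, then publish the tail */
    __atomic_store_n (&mq->ring->tail, dma_info->dma_tail, __ATOMIC_RELEASE);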
Diffstat (limited to 'src/plugins/memif/device.c')
-rw-r--r--  src/plugins/memif/device.c | 281
1 file changed, 278 insertions(+), 3 deletions(-)
diff --git a/src/plugins/memif/device.c b/src/plugins/memif/device.c
index f049a7be38e..ff6068f8243 100644
--- a/src/plugins/memif/device.c
+++ b/src/plugins/memif/device.c
@@ -369,6 +369,270 @@ no_free_slots:
return n_left;
}
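+/*
+ * Completion callback for TX DMA batches. Frees the vlib buffers recorded
+ * at submission time, publishes the tail saved for this batch to the
+ * slave, and retires the oldest dma_info slot.
+ */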
+CLIB_MARCH_FN (memif_tx_dma_completion_cb, void, vlib_main_t *vm,
+ vlib_dma_batch_t *b)
+{
+ memif_main_t *mm = &memif_main;
+ memif_if_t *mif = vec_elt_at_index (mm->interfaces, b->cookie >> 16);
+ memif_queue_t *mq = vec_elt_at_index (mif->tx_queues, b->cookie & 0xffff);
+ memif_dma_info_t *dma_info = mq->dma_info + mq->dma_info_head;
+ memif_per_thread_data_t *ptd = &dma_info->data;
+
+ vlib_buffer_free (vm, ptd->buffers, vec_len (ptd->buffers));
+
+ dma_info->finished = 1;
+ vec_reset_length (ptd->buffers);
+ vec_reset_length (ptd->copy_ops);
+
+ __atomic_store_n (&mq->ring->tail, dma_info->dma_tail, __ATOMIC_RELEASE);
+
+ mq->dma_info_head++;
+ if (mq->dma_info_head == mq->dma_info_size)
+ mq->dma_info_head = 0;
+ mq->dma_info_full = 0;
+}
+
+#ifndef CLIB_MARCH_VARIANT
+void
+memif_tx_dma_completion_cb (vlib_main_t *vm, vlib_dma_batch_t *b)
+{
+ return CLIB_MARCH_FN_SELECT (memif_tx_dma_completion_cb) (vm, b);
+}
+#endif
+
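+/*
+ * TX with DMA offload (master side only). Stage 1 fills ring descriptors
+ * and collects copy operations; stage 2 enqueues them on a vlib DMA batch,
+ * or performs them with the CPU when the dma_info ring is full. The ring
+ * tail is only published once the DMA engine reports completion.
+ */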
+static_always_inline uword
+memif_interface_tx_dma_inline (vlib_main_t *vm, vlib_node_runtime_t *node,
+ u32 *buffers, memif_if_t *mif,
+ memif_ring_type_t type, memif_queue_t *mq,
+ u32 n_left)
+{
+ memif_ring_t *ring;
+ u32 n_copy_op;
+ u16 ring_size, mask, slot, free_slots;
+ int n_retries = 5, fallback = 0;
+ vlib_buffer_t *b0, *b1, *b2, *b3;
+ memif_copy_op_t *co;
+ memif_region_index_t last_region = ~0;
+ void *last_region_shm = 0;
+ u16 head, tail;
+ memif_dma_info_t *dma_info;
+ memif_per_thread_data_t *ptd;
+ memif_main_t *mm = &memif_main;
+ u16 mif_id = mif - mm->interfaces;
+
+ ring = mq->ring;
+ ring_size = 1 << mq->log2_ring_size;
+ mask = ring_size - 1;
+
+ dma_info = mq->dma_info + mq->dma_info_tail;
+ ptd = &dma_info->data;
+
+ /* do software fallback if dma info ring is full */
+ u16 dma_mask = mq->dma_info_size - 1;
+ if ((((mq->dma_info_tail + 1) & dma_mask) == mq->dma_info_head) ||
+ ((mq->dma_info_head == dma_mask) && (mq->dma_info_tail == 0)))
+ {
+ if (!mq->dma_info_full)
+ mq->dma_info_full = 1;
+ else
+ fallback = 1;
+ }
+
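+  /* allocate a DMA batch unless we are falling back to CPU copies */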
+ vlib_dma_batch_t *b = NULL;
+ if (PREDICT_TRUE (!fallback))
+ b = vlib_dma_batch_new (vm, mif->dma_tx_config);
+  if (PREDICT_FALSE (!fallback && !b))
+    return n_left;
+
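+  /* fill stage: map packets onto ring descriptors and record copy ops;
+   * retried a few times while the peer advances the ring head */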
+retry:
+
+ slot = tail = mq->dma_tail;
+ head = __atomic_load_n (&ring->head, __ATOMIC_ACQUIRE);
+ mq->last_tail += tail - mq->last_tail;
+ free_slots = head - mq->dma_tail;
+
+ while (n_left && free_slots)
+ {
+ memif_desc_t *d0;
+ void *mb0;
+ i32 src_off;
+ u32 bi0, dst_off, src_left, dst_left, bytes_to_copy;
+ u32 saved_ptd_copy_ops_len = _vec_len (ptd->copy_ops);
+ u32 saved_ptd_buffers_len = _vec_len (ptd->buffers);
+ u16 saved_slot = slot;
+
+ clib_prefetch_load (&ring->desc[(slot + 8) & mask]);
+
+ d0 = &ring->desc[slot & mask];
+ if (PREDICT_FALSE (last_region != d0->region))
+ {
+ last_region_shm = mif->regions[d0->region].shm;
+ last_region = d0->region;
+ }
+ mb0 = last_region_shm + d0->offset;
+
+ dst_off = 0;
+
+ /* slave is the producer, so it should be able to reset buffer length */
+ dst_left = d0->length;
+
+ if (PREDICT_TRUE (n_left >= 4))
+ vlib_prefetch_buffer_header (vlib_get_buffer (vm, buffers[3]), LOAD);
+ bi0 = buffers[0];
+
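+      /* follow the vlib buffer chain; one packet may span several ring
+       * descriptors linked with MEMIF_DESC_FLAG_NEXT */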
+ next_in_chain:
+
+ b0 = vlib_get_buffer (vm, bi0);
+ src_off = b0->current_data;
+ src_left = b0->current_length;
+
+ while (src_left)
+ {
+ if (PREDICT_FALSE (dst_left == 0))
+ {
+ if (free_slots)
+		{
+		  slot++;
+		  free_slots--;
+ d0->length = dst_off;
+ d0->flags = MEMIF_DESC_FLAG_NEXT;
+ d0 = &ring->desc[slot & mask];
+ dst_off = 0;
+ dst_left = (type == MEMIF_RING_S2M) ? mif->run.buffer_size :
+ d0->length;
+
+ if (PREDICT_FALSE (last_region != d0->region))
+ {
+ last_region_shm = mif->regions[d0->region].shm;
+ last_region = d0->region;
+ }
+ mb0 = last_region_shm + d0->offset;
+ }
+ else
+ {
+ /* we need to rollback vectors before bailing out */
+ vec_set_len (ptd->buffers, saved_ptd_buffers_len);
+ vec_set_len (ptd->copy_ops, saved_ptd_copy_ops_len);
+ vlib_error_count (vm, node->node_index,
+ MEMIF_TX_ERROR_ROLLBACK, 1);
+ slot = saved_slot;
+ goto no_free_slots;
+ }
+ }
+ bytes_to_copy = clib_min (src_left, dst_left);
+ memif_add_copy_op (ptd, mb0 + dst_off, bytes_to_copy, src_off,
+ vec_len (ptd->buffers));
+ src_off += bytes_to_copy;
+ dst_off += bytes_to_copy;
+ src_left -= bytes_to_copy;
+ dst_left -= bytes_to_copy;
+ }
+
+ if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_NEXT_PRESENT))
+ {
+ bi0 = b0->next_buffer;
+ goto next_in_chain;
+ }
+
+ vec_add1_aligned (ptd->buffers, buffers[0], CLIB_CACHE_LINE_BYTES);
+ d0->length = dst_off;
+ d0->flags = 0;
+
+ free_slots -= 1;
+ slot += 1;
+
+ buffers++;
+ n_left--;
+ }
+no_free_slots:
+
+  /* copy stage: drain the collected copy ops, either onto the DMA batch
+   * or with CPU memcpy in fallback mode */
+ n_copy_op = vec_len (ptd->copy_ops);
+ co = ptd->copy_ops;
+ while (n_copy_op >= 8)
+ {
+ clib_prefetch_load (co[4].data);
+ clib_prefetch_load (co[5].data);
+ clib_prefetch_load (co[6].data);
+ clib_prefetch_load (co[7].data);
+
+ b0 = vlib_get_buffer (vm, ptd->buffers[co[0].buffer_vec_index]);
+ b1 = vlib_get_buffer (vm, ptd->buffers[co[1].buffer_vec_index]);
+ b2 = vlib_get_buffer (vm, ptd->buffers[co[2].buffer_vec_index]);
+ b3 = vlib_get_buffer (vm, ptd->buffers[co[3].buffer_vec_index]);
+
+ if (PREDICT_TRUE (!fallback))
+ {
+ vlib_dma_batch_add (vm, b, co[0].data,
+ b0->data + co[0].buffer_offset, co[0].data_len);
+ vlib_dma_batch_add (vm, b, co[1].data,
+ b1->data + co[1].buffer_offset, co[1].data_len);
+ vlib_dma_batch_add (vm, b, co[2].data,
+ b2->data + co[2].buffer_offset, co[2].data_len);
+ vlib_dma_batch_add (vm, b, co[3].data,
+ b3->data + co[3].buffer_offset, co[3].data_len);
+ }
+ else
+ {
+ clib_memcpy_fast (co[0].data, b0->data + co[0].buffer_offset,
+ co[0].data_len);
+ clib_memcpy_fast (co[1].data, b1->data + co[1].buffer_offset,
+ co[1].data_len);
+ clib_memcpy_fast (co[2].data, b2->data + co[2].buffer_offset,
+ co[2].data_len);
+ clib_memcpy_fast (co[3].data, b3->data + co[3].buffer_offset,
+ co[3].data_len);
+ }
+
+ co += 4;
+ n_copy_op -= 4;
+ }
+ while (n_copy_op)
+ {
+ b0 = vlib_get_buffer (vm, ptd->buffers[co[0].buffer_vec_index]);
+ if (PREDICT_TRUE (!fallback))
+ vlib_dma_batch_add (vm, b, co[0].data, b0->data + co[0].buffer_offset,
+ co[0].data_len);
+ else
+ clib_memcpy_fast (co[0].data, b0->data + co[0].buffer_offset,
+ co[0].data_len);
+ co += 1;
+ n_copy_op -= 1;
+ }
+
+ /* save dma info before retry */
+ dma_info->dma_tail = slot;
+ mq->dma_tail = slot;
+ vec_reset_length (ptd->copy_ops);
+
+ if (n_left && n_retries--)
+ goto retry;
+
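+  /* submission stage: tag the batch with a cookie identifying the
+   * (interface, queue) pair and hand it to the DMA engine; the ring tail
+   * is published later from memif_tx_dma_completion_cb */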
+ if (PREDICT_TRUE (!fallback))
+ {
+ vlib_dma_batch_set_cookie (vm, b,
+ (mif_id << 16) | (mq - mif->tx_queues));
+ vlib_dma_batch_submit (vm, b);
+ dma_info->finished = 0;
+
+ if (b->n_enq)
+ {
+ mq->dma_info_tail++;
+ if (mq->dma_info_tail == mq->dma_info_size)
+ mq->dma_info_tail = 0;
+ }
+ }
+ else if (fallback && dma_info->finished)
+ {
+ /* if dma has been completed, update ring immediately */
+ vlib_buffer_free (vm, ptd->buffers, vec_len (ptd->buffers));
+ vec_reset_length (ptd->buffers);
+ __atomic_store_n (&mq->ring->tail, slot, __ATOMIC_RELEASE);
+ }
+
+ return n_left;
+}
+
VNET_DEVICE_CLASS_TX_FN (memif_device_class) (vlib_main_t * vm,
vlib_node_runtime_t * node,
vlib_frame_t * frame)
@@ -399,8 +663,14 @@ VNET_DEVICE_CLASS_TX_FN (memif_device_class) (vlib_main_t * vm,
n_left = memif_interface_tx_inline (vm, node, from, mif, MEMIF_RING_S2M,
mq, ptd, n_left);
else
- n_left = memif_interface_tx_inline (vm, node, from, mif, MEMIF_RING_M2S,
- mq, ptd, n_left);
+ {
+ if ((mif->flags & MEMIF_IF_FLAG_USE_DMA) && (mif->dma_tx_config >= 0))
+ n_left = memif_interface_tx_dma_inline (vm, node, from, mif,
+ MEMIF_RING_M2S, mq, n_left);
+ else
+ n_left = memif_interface_tx_inline (vm, node, from, mif,
+ MEMIF_RING_M2S, mq, ptd, n_left);
+ }
if (tf->shared_queue)
clib_spinlock_unlock (&mq->lockp);
@@ -416,7 +686,12 @@ VNET_DEVICE_CLASS_TX_FN (memif_device_class) (vlib_main_t * vm,
mq->int_count++;
}
- if ((mif->flags & MEMIF_IF_FLAG_ZERO_COPY) == 0)
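+  /* in DMA mode the completion callback frees transmitted buffers, so
+   * only the unsent remainder is freed here */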
+ if ((mif->flags & MEMIF_IF_FLAG_USE_DMA) && (mif->dma_tx_config >= 0))
+ {
+ if (n_left)
+ vlib_buffer_free (vm, from + frame->n_vectors - n_left, n_left);
+ }
+ else if ((mif->flags & MEMIF_IF_FLAG_ZERO_COPY) == 0)
vlib_buffer_free (vm, from, frame->n_vectors);
else if (n_left)
vlib_buffer_free (vm, from + frame->n_vectors - n_left, n_left);