diff options
author | Lijian.Zhang <Lijian.Zhang@arm.com> | 2019-07-11 16:44:22 +0800 |
---|---|---|
committer | Damjan Marion <dmarion@me.com> | 2019-09-11 19:20:27 +0000 |
commit | fe2523d1a42c66ee3ddd594fad1cf5ac91c66c54 (patch) | |
tree | 82998c6aa17601640f0258513ef1efb698c0721f /src | |
parent | 8a1dea4ce6fd0684aef6d0b0843a90658775129d (diff) |
dpdk: apply dual loop unrolling in DPDK TX
Too many prefetches within an unrolled loop cause a bottleneck and
performance degradation on some CPUs which have fewer cache line fill
buffers, e.g., Arm Cortex-A72.
Apply dual loop unrolling and tune the prefetches manually to remove the
hot spot on the prefetch instructions and improve throughput.
This brings about a 1% throughput improvement and saves 8% of the clocks
spent in the target node on Cortex-A72.
Type: feature
Change-Id: If3a64a04a77e90cd0240bc4d1186dbb09dac7df0
Signed-off-by: Lijian Zhang <Lijian.Zhang@arm.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/plugins/dpdk/device/device.c | 57 |
1 files changed, 57 insertions, 0 deletions
diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index 806749ccb5d..e9c1a557a75 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -289,6 +289,7 @@ VNET_DEVICE_CLASS_TX_FN (dpdk_device_class) (vlib_main_t * vm, n_left = n_packets; mb = ptd->mbufs; +#if (CLIB_N_PREFETCHES >= 8) while (n_left >= 8) { u32 or_flags; @@ -353,6 +354,62 @@ VNET_DEVICE_CLASS_TX_FN (dpdk_device_class) (vlib_main_t * vm, mb += 4; n_left -= 4; } +#elif (CLIB_N_PREFETCHES >= 4) + while (n_left >= 4) + { + vlib_buffer_t *b2, *b3; + u32 or_flags; + + CLIB_PREFETCH (mb[2], CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (mb[3], CLIB_CACHE_LINE_BYTES, STORE); + b2 = vlib_buffer_from_rte_mbuf (mb[2]); + CLIB_PREFETCH (b2, CLIB_CACHE_LINE_BYTES, LOAD); + b3 = vlib_buffer_from_rte_mbuf (mb[3]); + CLIB_PREFETCH (b3, CLIB_CACHE_LINE_BYTES, LOAD); + + b[0] = vlib_buffer_from_rte_mbuf (mb[0]); + b[1] = vlib_buffer_from_rte_mbuf (mb[1]); + + or_flags = b[0]->flags | b[1]->flags; + all_or_flags |= or_flags; + + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[1]); + + if (or_flags & VLIB_BUFFER_NEXT_PRESENT) + { + dpdk_validate_rte_mbuf (vm, b[0], 1); + dpdk_validate_rte_mbuf (vm, b[1], 1); + } + else + { + dpdk_validate_rte_mbuf (vm, b[0], 0); + dpdk_validate_rte_mbuf (vm, b[1], 0); + } + + if (PREDICT_FALSE ((xd->flags & DPDK_DEVICE_FLAG_TX_OFFLOAD) && + (or_flags & + (VNET_BUFFER_F_OFFLOAD_TCP_CKSUM + | VNET_BUFFER_F_OFFLOAD_IP_CKSUM + | VNET_BUFFER_F_OFFLOAD_UDP_CKSUM)))) + { + dpdk_buffer_tx_offload (xd, b[0], mb[0]); + dpdk_buffer_tx_offload (xd, b[1], mb[1]); + } + + if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE)) + { + if (b[0]->flags & VLIB_BUFFER_IS_TRACED) + dpdk_tx_trace_buffer (dm, node, xd, queue_id, b[0]); + if (b[1]->flags & VLIB_BUFFER_IS_TRACED) + dpdk_tx_trace_buffer (dm, node, xd, queue_id, b[1]); + } + + mb += 2; + n_left -= 2; + } +#endif + while (n_left > 0) { b[0] = 
vlib_buffer_from_rte_mbuf (mb[0]); |