From fe2523d1a42c66ee3ddd594fad1cf5ac91c66c54 Mon Sep 17 00:00:00 2001 From: "Lijian.Zhang" Date: Thu, 11 Jul 2019 16:44:22 +0800 Subject: dpdk: apply dual loop unrolling in DPDK TX Too many prefetches within loop unrollings induce bottleneck and performance degradation on some CPUs which have less cache line fill buffers, e.g, Arm Cortex-A72. Apply dual loop unrolling and tune prefetches manually to remove hot-spot with prefetch instructions, to get throughput improvement. It brings about 1% throughput improvement and saves 8% clocks with the target node on Cortex-A72. Type: feature Change-Id: If3a64a04a77e90cd0240bc4d1186dbb09dac7df0 Signed-off-by: Lijian Zhang --- src/plugins/dpdk/device/device.c | 57 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index 806749ccb5d..e9c1a557a75 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -289,6 +289,7 @@ VNET_DEVICE_CLASS_TX_FN (dpdk_device_class) (vlib_main_t * vm, n_left = n_packets; mb = ptd->mbufs; +#if (CLIB_N_PREFETCHES >= 8) while (n_left >= 8) { u32 or_flags; @@ -353,6 +354,62 @@ VNET_DEVICE_CLASS_TX_FN (dpdk_device_class) (vlib_main_t * vm, mb += 4; n_left -= 4; } +#elif (CLIB_N_PREFETCHES >= 4) + while (n_left >= 4) + { + vlib_buffer_t *b2, *b3; + u32 or_flags; + + CLIB_PREFETCH (mb[2], CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (mb[3], CLIB_CACHE_LINE_BYTES, STORE); + b2 = vlib_buffer_from_rte_mbuf (mb[2]); + CLIB_PREFETCH (b2, CLIB_CACHE_LINE_BYTES, LOAD); + b3 = vlib_buffer_from_rte_mbuf (mb[3]); + CLIB_PREFETCH (b3, CLIB_CACHE_LINE_BYTES, LOAD); + + b[0] = vlib_buffer_from_rte_mbuf (mb[0]); + b[1] = vlib_buffer_from_rte_mbuf (mb[1]); + + or_flags = b[0]->flags | b[1]->flags; + all_or_flags |= or_flags; + + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[0]); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b[1]); + + if (or_flags & VLIB_BUFFER_NEXT_PRESENT) + { + dpdk_validate_rte_mbuf (vm, b[0], 1); + dpdk_validate_rte_mbuf (vm, b[1], 1); + } + else + { + dpdk_validate_rte_mbuf (vm, b[0], 0); + dpdk_validate_rte_mbuf (vm, b[1], 0); + } + + if (PREDICT_FALSE ((xd->flags & DPDK_DEVICE_FLAG_TX_OFFLOAD) && + (or_flags & + (VNET_BUFFER_F_OFFLOAD_TCP_CKSUM + | VNET_BUFFER_F_OFFLOAD_IP_CKSUM + | VNET_BUFFER_F_OFFLOAD_UDP_CKSUM)))) + { + dpdk_buffer_tx_offload (xd, b[0], mb[0]); + dpdk_buffer_tx_offload (xd, b[1], mb[1]); + } + + if (PREDICT_FALSE (node->flags & VLIB_NODE_FLAG_TRACE)) + { + if (b[0]->flags & VLIB_BUFFER_IS_TRACED) + dpdk_tx_trace_buffer (dm, node, xd, queue_id, b[0]); + if (b[1]->flags & VLIB_BUFFER_IS_TRACED) + dpdk_tx_trace_buffer (dm, node, xd, queue_id, b[1]); + } + + mb += 2; + n_left -= 2; + } +#endif + while (n_left > 0) { b[0] = vlib_buffer_from_rte_mbuf (mb[0]); -- cgit 1.2.3-korg