From 86b1871ba212064ceb985be4a6b655ebfe2e32f9 Mon Sep 17 00:00:00 2001 From: "Lijian.Zhang" Date: Mon, 8 Jul 2019 10:33:34 +0800 Subject: ip: apply dual loop unrolling in ip4_input Too many prefetches within unrolled loops induce a bottleneck and performance degradation on some CPUs which have fewer cache line fill buffers, e.g., Arm Cortex-A72. Apply dual loop unrolling and tune prefetches manually to resolve the hot-spot on prefetch instructions. It saves about 11.5% of cycles in the ip4_input node on Cortex-A72 CPUs. Type: feature Change-Id: I1ac9eb21061a804af2a414b420217fbcda3689c9 Signed-off-by: Lijian Zhang --- src/vnet/ip/ip4_input.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'src/vnet/ip') diff --git a/src/vnet/ip/ip4_input.c b/src/vnet/ip/ip4_input.c index 91766b4a6c1..94c4aac4271 100644 --- a/src/vnet/ip/ip4_input.c +++ b/src/vnet/ip/ip4_input.c @@ -151,6 +151,7 @@ ip4_input_inline (vlib_main_t * vm, vlib_get_buffers (vm, from, bufs, n_left_from); b = bufs; next = nexts; +#if (CLIB_N_PREFETCHES >= 8) while (n_left_from >= 4) { u32 x = 0; @@ -233,6 +234,73 @@ ip4_input_inline (vlib_main_t * vm, next += 4; n_left_from -= 4; } +#elif (CLIB_N_PREFETCHES >= 4) + while (n_left_from >= 2) + { + u32 x = 0; + u32 next0, next1; + + /* Prefetch next iteration. 
*/ + if (n_left_from >= 6) + { + vlib_prefetch_buffer_header (b[4], LOAD); + vlib_prefetch_buffer_header (b[5], LOAD); + + vlib_prefetch_buffer_data (b[2], LOAD); + vlib_prefetch_buffer_data (b[3], LOAD); + } + + vnet_buffer (b[0])->ip.adj_index[VLIB_RX] = ~0; + vnet_buffer (b[1])->ip.adj_index[VLIB_RX] = ~0; + + sw_if_index[0] = vnet_buffer (b[0])->sw_if_index[VLIB_RX]; + sw_if_index[1] = vnet_buffer (b[1])->sw_if_index[VLIB_RX]; + + x |= sw_if_index[0] ^ last_sw_if_index; + x |= sw_if_index[1] ^ last_sw_if_index; + + if (PREDICT_TRUE (x == 0)) + { + /* we deal with 2 more packets sharing the same sw_if_index + with the previous one, so we can optimize */ + cnt += 2; + if (arc_enabled) + { + next0 = ip4_input_set_next (sw_if_index[0], b[0], 1); + next1 = ip4_input_set_next (sw_if_index[1], b[1], 1); + } + else + { + next0 = ip4_input_set_next (sw_if_index[0], b[0], 0); + next1 = ip4_input_set_next (sw_if_index[1], b[1], 0); + } + } + else + { + ip4_input_check_sw_if_index (vm, cm, sw_if_index[0], + &last_sw_if_index, &cnt, &arc_enabled); + ip4_input_check_sw_if_index (vm, cm, sw_if_index[1], + &last_sw_if_index, &cnt, &arc_enabled); + + next0 = ip4_input_set_next (sw_if_index[0], b[0], 1); + next1 = ip4_input_set_next (sw_if_index[1], b[1], 1); + } + + ip[0] = vlib_buffer_get_current (b[0]); + ip[1] = vlib_buffer_get_current (b[1]); + + ip4_input_check_x2 (vm, error_node, b[0], b[1], ip[0], ip[1], + &next0, &next1, verify_checksum); + next[0] = (u16) next0; + next[1] = (u16) next1; + + /* next */ + b += 2; + next += 2; + n_left_from -= 2; + } +#endif + while (n_left_from) { u32 next0; -- cgit 1.2.3-korg