author     Lijian.Zhang <Lijian.Zhang@arm.com>    2019-07-08 10:33:34 +0800
committer  Damjan Marion <dmarion@me.com>         2019-09-11 19:20:27 +0000
commit     86b1871ba212064ceb985be4a6b655ebfe2e32f9 (patch)
tree       71d0e9bb6e98a76f79628fdd72f91312b470e30d
parent     840f64b4b2d6063adebb8c7b31c9357aaaf8dd5e (diff)
ip: apply dual loop unrolling in ip4_input
Too many prefetches within the unrolled loop create a bottleneck and
degrade performance on CPUs with fewer cache line fill buffers, e.g.
Arm Cortex-A72.
Apply dual loop unrolling and tune the prefetches manually to resolve
the hot spot caused by prefetch instructions.
This saves about 11.5% of cycles in the ip4_input node on Cortex-A72
CPUs.
Type: feature
Change-Id: I1ac9eb21061a804af2a414b420217fbcda3689c9
Signed-off-by: Lijian Zhang <Lijian.Zhang@arm.com>
-rw-r--r--  src/vnet/ip/ip4_input.c  68
1 file changed, 68 insertions(+), 0 deletions(-)
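
For readers outside VPP, the dual-unroll-with-prefetch pattern that the commit
message describes can be sketched in plain C as below. This is a hypothetical
stand-alone illustration, not the project's code: pkt_buffer_t,
process_packet () and the prefetch distances are made-up stand-ins, and
__builtin_prefetch is the ordinary GCC/Clang builtin rather than VPP's
vlib_prefetch_* macros.

#include <stdint.h>

typedef struct
{
  uint32_t sw_if_index;   /* receive interface index, as in the real node */
  uint8_t data[128];      /* packet bytes that per-packet work will touch */
} pkt_buffer_t;

/* Hypothetical per-packet work; stands in for the checks in ip4_input. */
static inline uint16_t
process_packet (pkt_buffer_t * b)
{
  return (uint16_t) (b->data[0] & 1);
}

void
process_burst (pkt_buffer_t ** bufs, uint16_t * nexts, int n_left)
{
  pkt_buffer_t **b = bufs;
  uint16_t *next = nexts;

  /* Dual loop: two packets per iteration, with a small fixed number of
     prefetches aimed at upcoming iterations, so the count of outstanding
     prefetches stays within the core's line fill buffers. */
  while (n_left >= 2)
    {
      if (n_left >= 6)
        {
          /* buffer metadata for the iteration after next ... */
          __builtin_prefetch (b[4], 0 /* read */, 3);
          __builtin_prefetch (b[5], 0, 3);
          /* ... and packet data for the next iteration */
          __builtin_prefetch (b[2]->data, 0, 3);
          __builtin_prefetch (b[3]->data, 0, 3);
        }

      next[0] = process_packet (b[0]);
      next[1] = process_packet (b[1]);

      b += 2;
      next += 2;
      n_left -= 2;
    }

  /* Scalar tail for an odd packet count. */
  while (n_left > 0)
    {
      next[0] = process_packet (b[0]);
      b += 1;
      next += 1;
      n_left -= 1;
    }
}

The point of this structure is that each iteration issues a bounded amount of
prefetching (four cache lines here) instead of scaling the prefetch count with
a wider unroll.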
diff --git a/src/vnet/ip/ip4_input.c b/src/vnet/ip/ip4_input.c
index 91766b4a6c1..94c4aac4271 100644
--- a/src/vnet/ip/ip4_input.c
+++ b/src/vnet/ip/ip4_input.c
@@ -151,6 +151,7 @@ ip4_input_inline (vlib_main_t * vm,
   vlib_get_buffers (vm, from, bufs, n_left_from);
   b = bufs;
   next = nexts;
+#if (CLIB_N_PREFETCHES >= 8)
   while (n_left_from >= 4)
     {
       u32 x = 0;
@@ -233,6 +234,73 @@ ip4_input_inline (vlib_main_t * vm,
       next += 4;
       n_left_from -= 4;
     }
+#elif (CLIB_N_PREFETCHES >= 4)
+  while (n_left_from >= 2)
+    {
+      u32 x = 0;
+      u32 next0, next1;
+
+      /* Prefetch next iteration. */
+      if (n_left_from >= 6)
+        {
+          vlib_prefetch_buffer_header (b[4], LOAD);
+          vlib_prefetch_buffer_header (b[5], LOAD);
+
+          vlib_prefetch_buffer_data (b[2], LOAD);
+          vlib_prefetch_buffer_data (b[3], LOAD);
+        }
+
+      vnet_buffer (b[0])->ip.adj_index[VLIB_RX] = ~0;
+      vnet_buffer (b[1])->ip.adj_index[VLIB_RX] = ~0;
+
+      sw_if_index[0] = vnet_buffer (b[0])->sw_if_index[VLIB_RX];
+      sw_if_index[1] = vnet_buffer (b[1])->sw_if_index[VLIB_RX];
+
+      x |= sw_if_index[0] ^ last_sw_if_index;
+      x |= sw_if_index[1] ^ last_sw_if_index;
+
+      if (PREDICT_TRUE (x == 0))
+        {
+          /* we deal with 2 more packets sharing the same sw_if_index
+             with the previous one, so we can optimize */
+          cnt += 2;
+          if (arc_enabled)
+            {
+              next0 = ip4_input_set_next (sw_if_index[0], b[0], 1);
+              next1 = ip4_input_set_next (sw_if_index[1], b[1], 1);
+            }
+          else
+            {
+              next0 = ip4_input_set_next (sw_if_index[0], b[0], 0);
+              next1 = ip4_input_set_next (sw_if_index[1], b[1], 0);
+            }
+        }
+      else
+        {
+          ip4_input_check_sw_if_index (vm, cm, sw_if_index[0],
+                                       &last_sw_if_index, &cnt, &arc_enabled);
+          ip4_input_check_sw_if_index (vm, cm, sw_if_index[1],
+                                       &last_sw_if_index, &cnt, &arc_enabled);
+
+          next0 = ip4_input_set_next (sw_if_index[0], b[0], 1);
+          next1 = ip4_input_set_next (sw_if_index[1], b[1], 1);
+        }
+
+      ip[0] = vlib_buffer_get_current (b[0]);
+      ip[1] = vlib_buffer_get_current (b[1]);
+
+      ip4_input_check_x2 (vm, error_node, b[0], b[1], ip[0], ip[1],
+                          &next0, &next1, verify_checksum);
+      next[0] = (u16) next0;
+      next[1] = (u16) next1;
+
+      /* next */
+      b += 2;
+      next += 2;
+      n_left_from -= 2;
+    }
+#endif
+
   while (n_left_from)
     {
       u32 next0;
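
Design note on the patch: the new #elif (CLIB_N_PREFETCHES >= 4) branch issues
at most four prefetches per 2-packet iteration, two buffer headers two
iterations ahead (b[4], b[5]) and two packet-data lines one iteration ahead
(b[2], b[3]), while the original quad loop is kept only when
CLIB_N_PREFETCHES >= 8. That gating appears to be how the patch keeps the
number of in-flight prefetches within the smaller line-fill-buffer budget of
cores such as the Cortex-A72; the exact buffer counts per core are not stated
in the commit and would need to be checked against the hardware documentation.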