author     Lijian.Zhang <Lijian.Zhang@arm.com>        2019-07-08 10:33:34 +0800
committer  Damjan Marion <dmarion@me.com>             2019-09-11 19:20:27 +0000
commit     86b1871ba212064ceb985be4a6b655ebfe2e32f9 (patch)
tree       71d0e9bb6e98a76f79628fdd72f91312b470e30d /src
parent     840f64b4b2d6063adebb8c7b31c9357aaaf8dd5e (diff)
ip: apply dual loop unrolling in ip4_input
Too many prefetches within the unrolled loop create a bottleneck and degrade performance on CPUs with fewer cache line fill buffers, e.g. Arm Cortex-A72. Apply dual (2-packet) loop unrolling and tune the prefetches manually to remove the prefetch-induced hot spot. This saves about 11.5% of cycles in the ip4_input node on Cortex-A72 CPUs.

Type: feature
Change-Id: I1ac9eb21061a804af2a414b420217fbcda3689c9
Signed-off-by: Lijian Zhang <Lijian.Zhang@arm.com>
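For illustration, a minimal standalone sketch of the pattern this patch applies: a dual-unrolled loop (2 packets per iteration) that prefetches buffer headers two iterations ahead and packet data one iteration ahead, followed by a scalar tail loop. All names here (buf_t, process_one, process_all) are hypothetical placeholders, and __builtin_prefetch stands in for VPP's vlib_prefetch_buffer_header / vlib_prefetch_buffer_data macros; this is not the patched VPP code itself, which appears in the diff below.

/* Hypothetical sketch only -- not VPP code.  Dual loop unrolling with
 * manually tuned prefetches: headers 2 iterations ahead, data 1 ahead. */
#include <stddef.h>

typedef struct { int hdr; char data[64]; } buf_t;  /* placeholder buffer */

static void
process_one (buf_t *b)
{
  b->hdr += 1;                      /* stand-in for per-packet work */
}

void
process_all (buf_t **b, size_t n)
{
  while (n >= 2)                    /* 2 packets per iteration */
    {
      if (n >= 6)                   /* enough packets left to prefetch ahead */
        {
          /* buffer headers for the iteration after next */
          __builtin_prefetch (b[4], 0 /* read */, 3 /* keep in cache */);
          __builtin_prefetch (b[5], 0, 3);
          /* packet data for the next iteration */
          __builtin_prefetch (b[2]->data, 0, 3);
          __builtin_prefetch (b[3]->data, 0, 3);
        }
      process_one (b[0]);
      process_one (b[1]);
      b += 2;
      n -= 2;
    }
  while (n)                         /* scalar tail */
    {
      process_one (b[0]);
      b += 1;
      n -= 1;
    }
}

Capping the unroll at 2 packets per iteration keeps the number of outstanding prefetch streams small, which matters on cores with fewer line fill buffers; the actual patch selects between the 4x and 2x paths at compile time via CLIB_N_PREFETCHES, as shown in the diff below.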
Diffstat (limited to 'src')
-rw-r--r--  src/vnet/ip/ip4_input.c  68
1 file changed, 68 insertions(+), 0 deletions(-)
diff --git a/src/vnet/ip/ip4_input.c b/src/vnet/ip/ip4_input.c
index 91766b4a6c1..94c4aac4271 100644
--- a/src/vnet/ip/ip4_input.c
+++ b/src/vnet/ip/ip4_input.c
@@ -151,6 +151,7 @@ ip4_input_inline (vlib_main_t * vm,
vlib_get_buffers (vm, from, bufs, n_left_from);
b = bufs;
next = nexts;
+#if (CLIB_N_PREFETCHES >= 8)
while (n_left_from >= 4)
{
u32 x = 0;
@@ -233,6 +234,73 @@ ip4_input_inline (vlib_main_t * vm,
next += 4;
n_left_from -= 4;
}
+#elif (CLIB_N_PREFETCHES >= 4)
+ while (n_left_from >= 2)
+ {
+ u32 x = 0;
+ u32 next0, next1;
+
+ /* Prefetch next iteration. */
+ if (n_left_from >= 6)
+ {
+ vlib_prefetch_buffer_header (b[4], LOAD);
+ vlib_prefetch_buffer_header (b[5], LOAD);
+
+ vlib_prefetch_buffer_data (b[2], LOAD);
+ vlib_prefetch_buffer_data (b[3], LOAD);
+ }
+
+ vnet_buffer (b[0])->ip.adj_index[VLIB_RX] = ~0;
+ vnet_buffer (b[1])->ip.adj_index[VLIB_RX] = ~0;
+
+ sw_if_index[0] = vnet_buffer (b[0])->sw_if_index[VLIB_RX];
+ sw_if_index[1] = vnet_buffer (b[1])->sw_if_index[VLIB_RX];
+
+ x |= sw_if_index[0] ^ last_sw_if_index;
+ x |= sw_if_index[1] ^ last_sw_if_index;
+
+ if (PREDICT_TRUE (x == 0))
+ {
+ /* we deal with 2 more packets sharing the same sw_if_index
+ with the previous one, so we can optimize */
+ cnt += 2;
+ if (arc_enabled)
+ {
+ next0 = ip4_input_set_next (sw_if_index[0], b[0], 1);
+ next1 = ip4_input_set_next (sw_if_index[1], b[1], 1);
+ }
+ else
+ {
+ next0 = ip4_input_set_next (sw_if_index[0], b[0], 0);
+ next1 = ip4_input_set_next (sw_if_index[1], b[1], 0);
+ }
+ }
+ else
+ {
+ ip4_input_check_sw_if_index (vm, cm, sw_if_index[0],
+ &last_sw_if_index, &cnt, &arc_enabled);
+ ip4_input_check_sw_if_index (vm, cm, sw_if_index[1],
+ &last_sw_if_index, &cnt, &arc_enabled);
+
+ next0 = ip4_input_set_next (sw_if_index[0], b[0], 1);
+ next1 = ip4_input_set_next (sw_if_index[1], b[1], 1);
+ }
+
+ ip[0] = vlib_buffer_get_current (b[0]);
+ ip[1] = vlib_buffer_get_current (b[1]);
+
+ ip4_input_check_x2 (vm, error_node, b[0], b[1], ip[0], ip[1],
+ &next0, &next1, verify_checksum);
+ next[0] = (u16) next0;
+ next[1] = (u16) next1;
+
+ /* next */
+ b += 2;
+ next += 2;
+ n_left_from -= 2;
+ }
+#endif
+
while (n_left_from)
{
u32 next0;