| author | Lijian.Zhang <Lijian.Zhang@arm.com> | 2019-07-09 17:54:32 +0800 |
|---|---|---|
| committer | Damjan Marion <dmarion@me.com> | 2019-09-11 19:20:27 +0000 |
| commit | 840f64b4b2d6063adebb8c7b31c9357aaaf8dd5e (patch) | |
| tree | 34af2303319245845dd701953d51681850d9d84b | |
| parent | fe2523d1a42c66ee3ddd594fad1cf5ac91c66c54 (diff) | |
ip: apply dual loop unrolling in ip4_rewrite
Too many prefetches within the unrolled loops create a bottleneck and
degrade performance on CPUs with fewer cache-line fill buffers, e.g.,
Arm Cortex-A72.

Apply dual loop unrolling and tune the prefetches by hand to remove the
hot spot created by the prefetch instructions and improve throughput.
This brings roughly a 7% throughput improvement and saves 28% of clock
cycles in the ip4_rewrite node on Cortex-A72 CPUs (see the illustrative
sketch below).
Type: feature
Change-Id: I0d35ef19faccbd7a5a4647f50bc369bfcb01a20d
Signed-off-by: Lijian Zhang <Lijian.Zhang@arm.com>
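
For readers outside the VPP tree, here is a minimal, self-contained C
sketch of the pattern the commit message describes: a dual-unrolled loop
that issues only a couple of prefetches per iteration, plus a scalar
tail. `packet_t`, `process_one`, and the prefetch distance of two
packets are illustrative assumptions, not VPP code.

```c
/* Hypothetical stand-ins for vlib_buffer_t and the per-packet rewrite
 * work; they are not part of VPP. */
typedef struct { unsigned char data[128]; } packet_t;

static void
process_one (packet_t *p)
{
  p->data[0] ^= 1;		/* placeholder for the real header rewrite */
}

static void
process_dual (packet_t **b, int n_left)
{
  /* Dual unrolling: two packets per iteration, and only two prefetches
   * issued per iteration, aimed one pair ahead, so a CPU with few
   * cache-line fill buffers (e.g. Cortex-A72) is not oversubscribed. */
  while (n_left >= 4)
    {
      __builtin_prefetch (b[2], 0, 3);	/* read, high temporal locality */
      __builtin_prefetch (b[3], 0, 3);

      process_one (b[0]);
      process_one (b[1]);

      b += 2;
      n_left -= 2;
    }

  /* Scalar tail for the remaining packets. */
  while (n_left > 0)
    {
      process_one (b[0]);
      b += 1;
      n_left -= 1;
    }
}
```

The design point: each outstanding prefetch occupies a cache-line fill
buffer until the line arrives, so capping the number issued per
iteration keeps the fill buffers free for the demand loads that follow.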
-rw-r--r-- | src/vnet/ip/ip4_forward.c | 131
1 file changed, 131 insertions(+), 0 deletions(-)
diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c
index ecb35955798..3562f501bf7 100644
--- a/src/vnet/ip/ip4_forward.c
+++ b/src/vnet/ip/ip4_forward.c
@@ -2421,6 +2421,7 @@ ip4_rewrite_inline_with_gso (vlib_main_t * vm,
   vlib_get_buffers (vm, from, bufs, n_left_from);
   clib_memset_u16 (nexts, IP4_REWRITE_NEXT_DROP, n_left_from);
 
+#if (CLIB_N_PREFETCHES >= 8)
   if (n_left_from >= 6)
     {
       int i;
@@ -2601,6 +2602,136 @@ ip4_rewrite_inline_with_gso (vlib_main_t * vm,
       b += 2;
       n_left_from -= 2;
     }
+#elif (CLIB_N_PREFETCHES >= 4)
+  next = nexts;
+  b = bufs;
+  while (n_left_from >= 1)
+    {
+      ip_adjacency_t *adj0;
+      ip4_header_t *ip0;
+      u32 rw_len0, error0, adj_index0;
+      u32 tx_sw_if_index0;
+      u8 *p;
+
+      /* Prefetch next iteration */
+      if (PREDICT_TRUE (n_left_from >= 4))
+        {
+          ip_adjacency_t *adj2;
+          u32 adj_index2;
+
+          vlib_prefetch_buffer_header (b[3], LOAD);
+          vlib_prefetch_buffer_data (b[2], LOAD);
+
+          /* Prefetch adj->rewrite_header */
+          adj_index2 = vnet_buffer (b[2])->ip.adj_index[VLIB_TX];
+          adj2 = adj_get (adj_index2);
+          p = (u8 *) adj2;
+          CLIB_PREFETCH (p + CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES,
+                         LOAD);
+        }
+
+      adj_index0 = vnet_buffer (b[0])->ip.adj_index[VLIB_TX];
+
+      /*
+       * Prefetch the per-adjacency counters
+       */
+      if (do_counters)
+        {
+          vlib_prefetch_combined_counter (&adjacency_counters,
+                                          thread_index, adj_index0);
+        }
+
+      ip0 = vlib_buffer_get_current (b[0]);
+
+      error0 = IP4_ERROR_NONE;
+
+      ip4_ttl_and_checksum_check (b[0], ip0, next + 0, &error0);
+
+      /* Rewrite packet header and updates lengths. */
+      adj0 = adj_get (adj_index0);
+
+      /* Rewrite header was prefetched. */
+      rw_len0 = adj0[0].rewrite_header.data_bytes;
+      vnet_buffer (b[0])->ip.save_rewrite_length = rw_len0;
+
+      /* Check MTU of outgoing interface. */
+      u16 ip0_len = clib_net_to_host_u16 (ip0->length);
+
+      if (do_gso && (b[0]->flags & VNET_BUFFER_F_GSO))
+        ip0_len = gso_mtu_sz (b[0]);
+
+      ip4_mtu_check (b[0], ip0_len,
+                     adj0[0].rewrite_header.max_l3_packet_bytes,
+                     ip0->flags_and_fragment_offset &
+                     clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT),
+                     next + 0, &error0);
+
+      if (is_mcast)
+        {
+          error0 = ((adj0[0].rewrite_header.sw_if_index ==
+                     vnet_buffer (b[0])->sw_if_index[VLIB_RX]) ?
+                    IP4_ERROR_SAME_INTERFACE : error0);
+        }
+
+      /* Don't adjust the buffer for ttl issue; icmp-error node wants
+       * to see the IP header */
+      if (PREDICT_TRUE (error0 == IP4_ERROR_NONE))
+        {
+          u32 next_index = adj0[0].rewrite_header.next_index;
+          vlib_buffer_advance (b[0], -(word) rw_len0);
+          tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index;
+          vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0;
+
+          if (PREDICT_FALSE
+              (adj0[0].rewrite_header.flags & VNET_REWRITE_HAS_FEATURES))
+            vnet_feature_arc_start (lm->output_feature_arc_index,
+                                    tx_sw_if_index0, &next_index, b[0]);
+          next[0] = next_index;
+        }
+      else
+        {
+          b[0]->error = error_node->errors[error0];
+        }
+      if (is_midchain)
+        {
+          calc_checksums (vm, b[0]);
+        }
+      /* Guess we are only writing on simple Ethernet header. */
+      vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t));
+
+      /*
+       * Bump the per-adjacency counters
+       */
+      if (do_counters)
+        {
+          vlib_increment_combined_counter
+            (&adjacency_counters,
+             thread_index, adj_index0, 1,
+             vlib_buffer_length_in_chain (vm, b[0]) + rw_len0);
+        }
+
+      if (is_midchain)
+        {
+          if (adj0->sub_type.midchain.fixup_func)
+            adj0->sub_type.midchain.fixup_func
+              (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data);
+        }
+
+      if (is_mcast)
+        {
+          /*
+           * copy bytes from the IP address into the MAC rewrite
+           */
+          vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK,
+                                      adj0->rewrite_header.dst_mcast_offset,
+                                      &ip0->dst_address.as_u32, (u8 *) ip0);
+        }
+
+      next += 1;
+      b += 1;
+      n_left_from -= 1;
+    }
+#endif
 
   while (n_left_from > 0)
     {
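
The diff gates the new path on CLIB_N_PREFETCHES, so each target
compiles only the loop shape its memory system can sustain. A hedged
sketch of that compile-time dispatch, using a hypothetical
N_FILL_BUFFERS macro and trivial stand-in types rather than VPP's real
definitions:

```c
typedef struct { unsigned char data[128]; } obj_t;

static void
work (obj_t *o)
{
  o->data[0]++;			/* placeholder for the rewrite body */
}

/* Hypothetical stand-in for CLIB_N_PREFETCHES: a per-target estimate of
 * how many cache-line fills the CPU can keep in flight. */
#ifndef N_FILL_BUFFERS
#define N_FILL_BUFFERS 4
#endif

static void
process (obj_t **b, int n_left)
{
#if (N_FILL_BUFFERS >= 8)
  /* Many fill buffers: unroll wider and keep more prefetches in flight. */
  while (n_left >= 8)
    {
      __builtin_prefetch (b[4]);
      __builtin_prefetch (b[5]);
      __builtin_prefetch (b[6]);
      __builtin_prefetch (b[7]);
      work (b[0]);
      work (b[1]);
      work (b[2]);
      work (b[3]);
      b += 4;
      n_left -= 4;
    }
#elif (N_FILL_BUFFERS >= 4)
  /* Few fill buffers (e.g. Cortex-A72): one object per iteration and
   * only two prefetches, one of them into the second cache line of an
   * indirectly reached structure, mirroring the adj->rewrite_header
   * prefetch in the diff above. */
  while (n_left >= 4)
    {
      __builtin_prefetch (b[3]);
      __builtin_prefetch ((unsigned char *) b[2] + 64 /* next line */);
      work (b[0]);
      b += 1;
      n_left -= 1;
    }
#endif

  /* Common scalar tail. */
  while (n_left > 0)
    {
      work (b[0]);
      b += 1;
      n_left -= 1;
    }
}
```

Because the selection happens in the preprocessor, the conservative
path costs nothing on machines that take the wide branch, and vice
versa; only the tuning constant has to be set correctly per target.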