From 1105600416e0560cb05120a22e0a2e7359a13665 Mon Sep 17 00:00:00 2001
From: Damjan Marion
Date: Thu, 10 May 2018 13:40:44 +0200
Subject: vppinfra: use count_trailing_zeros in sparse_vec_index

It is much cheaper to use ctzll than to do shift, subtract and mask in
the likely case when we are looking for the first set bit in the uword.

Change-Id: I31954081571978878c7098bafad0c85a91755fa2
Signed-off-by: Damjan Marion
---
 src/plugins/dpdk/device/cli.c        |  6 ++--
 src/plugins/dpdk/hqos/hqos.c         | 10 +++---
 src/svm/svm.c                        |  2 +-
 src/vnet/devices/virtio/vhost-user.c | 12 +++----
 src/vnet/l2/feat_bitmap.h            |  2 +-
 src/vppinfra/bitmap.h                |  2 +-
 src/vppinfra/clib.h                  | 63 ++++--------------------------------
 src/vppinfra/sparse_vec.h            | 33 +++++++++++++------
 8 files changed, 46 insertions(+), 84 deletions(-)

diff --git a/src/plugins/dpdk/device/cli.c b/src/plugins/dpdk/device/cli.c
index 2eaf17e55a5..08bf0c94b53 100644
--- a/src/plugins/dpdk/device/cli.c
+++ b/src/plugins/dpdk/device/cli.c
@@ -1323,20 +1323,20 @@ set_dpdk_if_hqos_pktfield (vlib_main_t * vm, unformat_input_t * input,
           xd->hqos_wt[worker_thread_first + i].hqos_field0_slabpos = offset;
           xd->hqos_wt[worker_thread_first + i].hqos_field0_slabmask = mask;
           xd->hqos_wt[worker_thread_first + i].hqos_field0_slabshr =
-            __builtin_ctzll (mask);
+            count_trailing_zeros (mask);
           break;
         case 1:
           xd->hqos_wt[worker_thread_first + i].hqos_field1_slabpos = offset;
           xd->hqos_wt[worker_thread_first + i].hqos_field1_slabmask = mask;
           xd->hqos_wt[worker_thread_first + i].hqos_field1_slabshr =
-            __builtin_ctzll (mask);
+            count_trailing_zeros (mask);
           break;
         case 2:
         default:
           xd->hqos_wt[worker_thread_first + i].hqos_field2_slabpos = offset;
           xd->hqos_wt[worker_thread_first + i].hqos_field2_slabmask = mask;
           xd->hqos_wt[worker_thread_first + i].hqos_field2_slabshr =
-            __builtin_ctzll (mask);
+            count_trailing_zeros (mask);
         }
 
 done:
diff --git a/src/plugins/dpdk/hqos/hqos.c b/src/plugins/dpdk/hqos/hqos.c
index c9b85652863..123ecbab0d3 100644
--- a/src/plugins/dpdk/hqos/hqos.c
+++ b/src/plugins/dpdk/hqos/hqos.c
@@ -168,8 +168,8 @@ int
 dpdk_hqos_validate_mask (u64 mask, u32 n)
 {
   int count = __builtin_popcountll (mask);
-  int pos_lead = sizeof (u64) * 8 - __builtin_clzll (mask);
-  int pos_trail = __builtin_ctzll (mask);
+  int pos_lead = sizeof (u64) * 8 - count_leading_zeros (mask);
+  int pos_trail = count_trailing_zeros (mask);
   int count_expected = __builtin_popcount (n - 1);
 
   /* Handle the exceptions */
@@ -363,15 +363,15 @@ dpdk_port_setup_hqos (dpdk_device_t * xd, dpdk_device_config_hqos_t * hqos)
       xd->hqos_wt[tid].hqos_field0_slabpos = hqos->pktfield0_slabpos;
       xd->hqos_wt[tid].hqos_field0_slabmask = hqos->pktfield0_slabmask;
       xd->hqos_wt[tid].hqos_field0_slabshr =
-        __builtin_ctzll (hqos->pktfield0_slabmask);
+        count_trailing_zeros (hqos->pktfield0_slabmask);
       xd->hqos_wt[tid].hqos_field1_slabpos = hqos->pktfield1_slabpos;
       xd->hqos_wt[tid].hqos_field1_slabmask = hqos->pktfield1_slabmask;
       xd->hqos_wt[tid].hqos_field1_slabshr =
-        __builtin_ctzll (hqos->pktfield1_slabmask);
+        count_trailing_zeros (hqos->pktfield1_slabmask);
       xd->hqos_wt[tid].hqos_field2_slabpos = hqos->pktfield2_slabpos;
       xd->hqos_wt[tid].hqos_field2_slabmask = hqos->pktfield2_slabmask;
       xd->hqos_wt[tid].hqos_field2_slabshr =
-        __builtin_ctzll (hqos->pktfield2_slabmask);
+        count_trailing_zeros (hqos->pktfield2_slabmask);
       memcpy (xd->hqos_wt[tid].hqos_tc_table, hqos->tc_table,
               sizeof (hqos->tc_table));
     }
diff --git a/src/svm/svm.c b/src/svm/svm.c
index 16a58fa126a..e3a734cbc84 100644
--- a/src/svm/svm.c
+++ b/src/svm/svm.c
@@ -83,7 +83,7 @@ svm_get_global_region_base_va ()
   unformat_free (&input);
   close (fd);
 
-  count_leading_zeros (bits, end);
+  bits = count_leading_zeros (end);
   bits = 64 - bits;
   if (bits >= 36 && bits <= 48)
     return ((1ul << bits) / 4) - (2 * SVM_GLOBAL_REGION_SIZE);
diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c
index c1fca441fa5..7e10a60f5fb 100644
--- a/src/vnet/devices/virtio/vhost-user.c
+++ b/src/vnet/devices/virtio/vhost-user.c
@@ -248,8 +248,8 @@ map_guest_mem (vhost_user_intf_t * vui, uword addr, u32 * hint)
   r = _mm_blend_epi16 (r, _mm_and_si128 (rl, rh), 0x88);
   r = _mm_shuffle_epi8 (r, _mm_set_epi64x (0, 0x0e060c040a020800));
-  i = __builtin_ctzll (_mm_movemask_epi8 (r) |
-                       (1 << VHOST_MEMORY_MAX_NREGIONS));
+  i = count_trailing_zeros (_mm_movemask_epi8 (r) |
+                            (1 << VHOST_MEMORY_MAX_NREGIONS));
 
   if (i < vui->nregions)
     {
@@ -275,7 +275,7 @@ map_guest_mem (vhost_user_intf_t * vui, uword addr, u32 * hint)
 
   if (u32)
     {
-      i = __builtin_ctzll (u32);
+      i = count_trailing_zeros (u32);
       goto vhost_map_guest_mem_done;
     }
 
@@ -290,7 +290,7 @@ map_guest_mem (vhost_user_intf_t * vui, uword addr, u32 * hint)
 
   if (u32)
     {
-      i = __builtin_ctzll (u32);
+      i = count_trailing_zeros (u32);
       goto vhost_map_guest_mem_done;
     }
 
@@ -305,7 +305,7 @@ map_guest_mem (vhost_user_intf_t * vui, uword addr, u32 * hint)
 
   if (u32)
     {
-      i = __builtin_ctzll (u32);
+      i = count_trailing_zeros (u32);
       goto vhost_map_guest_mem_done;
     }
 
@@ -318,7 +318,7 @@ map_guest_mem (vhost_user_intf_t * vui, uword addr, u32 * hint)
   u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 0) & 0x1) << 6);
   u32 |= ((vgetq_lane_u8 (vreinterpretq_u8_u64 (r), 8) & 0x1) << 7);
 
-  i = __builtin_ctzll (u32 | (1 << VHOST_MEMORY_MAX_NREGIONS));
+  i = count_trailing_zeros (u32 | (1 << VHOST_MEMORY_MAX_NREGIONS));
 
 vhost_map_guest_mem_done:
   if (i < vui->nregions)
diff --git a/src/vnet/l2/feat_bitmap.h b/src/vnet/l2/feat_bitmap.h
index 5940ff7ea9b..a1b295ac4ac 100644
--- a/src/vnet/l2/feat_bitmap.h
+++ b/src/vnet/l2/feat_bitmap.h
@@ -80,7 +80,7 @@ feat_bitmap_get_next_node_index (u32 * next_nodes, u32 bitmap)
 {
   u32 first_bit;
 
-  count_leading_zeros (first_bit, bitmap);
+  first_bit = count_leading_zeros (bitmap);
   first_bit = uword_bits - 1 - first_bit;
   return next_nodes[first_bit];
 }
diff --git a/src/vppinfra/bitmap.h b/src/vppinfra/bitmap.h
index 9e1ae493285..dbf9eeb2232 100644
--- a/src/vppinfra/bitmap.h
+++ b/src/vppinfra/bitmap.h
@@ -409,7 +409,7 @@ clib_bitmap_last_set (uword * ai)
       if (x != 0)
         {
           uword first_bit;
-          count_leading_zeros (first_bit, x);
+          first_bit = count_leading_zeros (x);
           return (i) * BITS (ai[0]) - first_bit - 1;
         }
     }
diff --git a/src/vppinfra/clib.h b/src/vppinfra/clib.h
index 0d059a0778b..42748b0a34c 100644
--- a/src/vppinfra/clib.h
+++ b/src/vppinfra/clib.h
@@ -125,71 +125,20 @@ decl
 
 /* Use __builtin_clz if available. */
-#ifdef __GNUC__
-#include <features.h>
-#if __GNUC_PREREQ(3, 4)
 #if uword_bits == 64
-#define count_leading_zeros(count,x) count = __builtin_clzll (x)
-#define count_trailing_zeros(count,x) count = __builtin_ctzll (x)
+#define count_leading_zeros(x) __builtin_clzll (x)
+#define count_trailing_zeros(x) __builtin_ctzll (x)
 #else
-#define count_leading_zeros(count,x) count = __builtin_clzl (x)
-#define count_trailing_zeros(count,x) count = __builtin_ctzl (x)
+#define count_leading_zeros(x) __builtin_clzl (x)
+#define count_trailing_zeros(x) __builtin_ctzl (x)
 #endif
-#endif
-#endif
-
-#ifndef count_leading_zeros
-
-/* Misc. integer arithmetic functions.  */
-#if defined (i386)
-#define count_leading_zeros(count, x)           \
-  do {                                          \
-    word _clz;                                  \
-    __asm__ ("bsrl %1,%0"                       \
-             : "=r" (_clz) : "rm" ((word) (x)));\
-    (count) = _clz ^ 31;                        \
-  } while (0)
-
-#define count_trailing_zeros(count, x)                  \
-  __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((word)(x)))
-#endif /* i386 */
-
-#if defined (__alpha__) && defined (HAVE_CIX)
-#define count_leading_zeros(count, x)           \
-  __asm__ ("ctlz %1,%0"                         \
-           : "=r" ((word) (count))              \
-           : "r" ((word) (x)))
-#define count_trailing_zeros(count, x)          \
-  __asm__ ("cttz %1,%0"                         \
-           : "=r" ((word) (count))              \
-           : "r" ((word) (x)))
-#endif /* alpha && HAVE_CIX */
-
-#if __mips >= 4
-
-/* Select between 32/64 opcodes. */
-#if uword_bits == 32
-#define count_leading_zeros(_count, _x)         \
-  __asm__ ("clz %[count],%[x]"                  \
-           : [count] "=r" ((word) (_count))     \
-           : [x] "r" ((word) (_x)))
-#else
-#define count_leading_zeros(_count, _x)         \
-  __asm__ ("dclz %[count],%[x]"                 \
-           : [count] "=r" ((word) (_count))     \
-           : [x] "r" ((word) (_x)))
-#endif
-
-#endif /* __mips >= 4 */
-
-#endif /* count_leading_zeros */
 
 #if defined (count_leading_zeros)
 always_inline uword
 min_log2 (uword x)
 {
   uword n;
-  count_leading_zeros (n, x);
+  n = count_leading_zeros (x);
   return BITS (uword) - n - 1;
 }
 #else
@@ -305,7 +254,7 @@ log2_first_set (uword x)
 {
   uword result;
 #ifdef count_trailing_zeros
-  count_trailing_zeros (result, x);
+  result = count_trailing_zeros (x);
 #else
   result = min_log2 (first_set (x));
 #endif
diff --git a/src/vppinfra/sparse_vec.h b/src/vppinfra/sparse_vec.h
index 0da154d8b68..cfa5778e290 100644
--- a/src/vppinfra/sparse_vec.h
+++ b/src/vppinfra/sparse_vec.h
@@ -108,15 +108,20 @@ sparse_vec_index_internal (void *v,
   h = sparse_vec_header (v);
   i = sparse_index / BITS (h->is_member_bitmap[0]);
-  b = (uword) 1 << (uword) (sparse_index % BITS (h->is_member_bitmap[0]));
+  b = sparse_index % BITS (h->is_member_bitmap[0]);
 
   ASSERT (i < vec_len (h->is_member_bitmap));
   ASSERT (i < vec_len (h->member_counts));
 
   w = h->is_member_bitmap[i];
-  d = h->member_counts[i] + count_set_bits (w & (b - 1));
-  is_member = (w & b) != 0;
+  if (PREDICT_TRUE (maybe_range == 0 && insert == 0 &&
+                    count_trailing_zeros (w) == b))
+    return h->member_counts[i] + 1;
+
+  d = h->member_counts[i] + count_set_bits (w & ((1ULL << b) - 1));
+  is_member = (w & (1ULL << b)) != 0;
+
   if (maybe_range)
     {
       u8 r = h->range_flags[d];
@@ -134,7 +139,7 @@ sparse_vec_index_internal (void *v,
   if (!is_member)
     {
       uword j;
-      w |= b;
+      w |= 1ULL << b;
       h->is_member_bitmap[i] = w;
       for (j = i + 1; j < vec_len (h->member_counts); j++)
         h->member_counts[j] += 1;
@@ -170,8 +175,8 @@ sparse_vec_index2 (void *v,
   i0 = si0 / BITS (h->is_member_bitmap[0]);
   i1 = si1 / BITS (h->is_member_bitmap[0]);
-  b0 = (uword) 1 << (uword) (si0 % BITS (h->is_member_bitmap[0]));
-  b1 = (uword) 1 << (uword) (si1 % BITS (h->is_member_bitmap[0]));
+  b0 = si0 % BITS (h->is_member_bitmap[0]);
+  b1 = si1 % BITS (h->is_member_bitmap[0]);
 
   ASSERT (i0 < vec_len (h->is_member_bitmap));
   ASSERT (i1 < vec_len (h->is_member_bitmap));
@@ -182,8 +187,16 @@ sparse_vec_index2 (void *v,
   w0 = h->is_member_bitmap[i0];
   w1 = h->is_member_bitmap[i1];
-  v0 = w0 & (b0 - 1);
-  v1 = w1 & (b1 - 1);
+  if (PREDICT_TRUE ((count_trailing_zeros (w0) == b0) +
+                    (count_trailing_zeros (w1) == b1) == 2))
+    {
+      *i0_return = h->member_counts[i0] + 1;
+      *i1_return = h->member_counts[i1] + 1;
+      return;
+    }
+
+  v0 = w0 & ((1ULL << b0) - 1);
+  v1 = w1 & ((1ULL << b1) - 1);
 
   /* Speculate that masks will have zero or one bits set. */
   d0 = h->member_counts[i0] + (v0 != 0);
   d1 = h->member_counts[i1] + (v1 != 0);
@@ -196,8 +209,8 @@ sparse_vec_index2 (void *v,
       d1 += count_set_bits (v1) - (v1 != 0);
     }
 
-  is_member0 = (w0 & b0) != 0;
-  is_member1 = (w1 & b1) != 0;
+  is_member0 = (w0 & (1ULL << b0)) != 0;
+  is_member1 = (w1 & (1ULL << b1)) != 0;
 
   d0 = is_member0 ? d0 : 0;
   d1 = is_member1 ? d1 : 0;
-- 
cgit 1.2.3-korg
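
Editor's note: the patch ends above. The standalone sketch below is not part of the
commit; it is a minimal illustration, under stated assumptions, of the two ideas the
change relies on: count_leading_zeros()/count_trailing_zeros() now behave as ordinary
expressions that return a value (64-bit case shown), and a single count_trailing_zeros
call is enough to decide whether a given bit position is the lowest set bit of a bitmap
word, which is the common case the sparse_vec fast path speculates on. The helper
set_bits_below() and the sample values are hypothetical and not part of the VPP API.

/* Illustrative only: mirrors the expression-style wrappers added to
 * src/vppinfra/clib.h (64-bit case), not the actual VPP headers. */
#include <stdio.h>
#include <assert.h>

typedef unsigned long long u64;

#define count_leading_zeros(x) __builtin_clzll (x)
#define count_trailing_zeros(x) __builtin_ctzll (x)

/* How many set bits precede bit 'b' in 'w'?  The old code always built
 * the mask (1 << b) - 1 and counted the bits under it; when 'b' happens
 * to be the lowest set bit (the likely case for sparse_vec lookups),
 * one ctzll answers the question directly. */
static int
set_bits_below (u64 w, unsigned b)
{
  if (count_trailing_zeros (w) == b)    /* likely: b is the lowest set bit */
    return 0;
  return __builtin_popcountll (w & ((1ULL << b) - 1));  /* slow path */
}

int
main (void)
{
  u64 w = 0x90;                 /* bits 4 and 7 set */

  assert (count_trailing_zeros (w) == 4);          /* lowest set bit */
  assert (64 - count_leading_zeros (w) - 1 == 7);  /* highest set bit */
  assert (set_bits_below (w, 4) == 0);             /* fast path */
  assert (set_bits_below (w, 7) == 1);             /* bit 4 precedes bit 7 */

  printf ("lowest set bit of 0x%llx is %d\n", w, count_trailing_zeros (w));
  return 0;
}

With GCC or Clang at -O2 the fast path compiles down to a trailing-zero-count
instruction plus a compare, which is essentially the saving over the
shift/subtract/mask-and-popcount sequence that the commit message describes.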