/* SPDX-License-Identifier: Apache-2.0
 * Copyright(c) 2021 Cisco Systems, Inc.
 */

#ifndef included_vector_index_to_ptr_h
#define included_vector_index_to_ptr_h
#include <vppinfra/clib.h>

#ifdef CLIB_HAVE_VEC128
/* Convert 4 u32 indices starting at offset i into 4 pointers: each half of
   the index vector is widened to u64x2, shifted and offset by the base. */
static_always_inline void
clib_index_to_ptr_u32x4 (u32 *indices, void **ptrs, i32 i, u64x2 ov, u8 shift)
{
  u32x4 iv4 = u32x4_load_unaligned (indices + i);
  u64x2 pv2;
  pv2 = u64x2_from_u32x4 (iv4);
  u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i);
#ifdef __aarch64__
  pv2 = u64x2_from_u32x4_high (iv4);
#else
  pv2 = u64x2_from_u32x4 ((u32x4) u8x16_word_shift_right (iv4, 8));
#endif
  u64x2_store_unaligned ((pv2 << shift) + ov, ptrs + i + 2);
}
#endif

/** \brief Convert array of indices to pointers with base and shift

    Each pointer is computed as base + ((u64) index << shift).
    A usage sketch follows the function definition below.

    @param indices source array of u32 indices
    @param base base pointer
    @param shift number of bits to shift
    @param ptrs destination array of pointers
    @param n_elts number of elements in the source array
*/

static_always_inline void
clib_index_to_ptr_u32 (u32 *indices, void *base, u8 shift, void **ptrs,
                       u32 n_elts)
{
#if defined CLIB_HAVE_VEC512
  if (n_elts >= 8)
    {
      u64x8 off = u64x8_splat ((u64) base);
      u64x8 b0, b1, b2, b3, b4, b5, b6, b7;

      while (n_elts >= 64)
        {
          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
          b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
          b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
          b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
          b4 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 32));
          b5 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 40));
          b6 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 48));
          b7 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 56));
          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
          u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
          u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
          u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
          u64x8_store_unaligned ((b4 << shift) + off, ptrs + 32);
          u64x8_store_unaligned ((b5 << shift) + off, ptrs + 40);
          u64x8_store_unaligned ((b6 << shift) + off, ptrs + 48);
          u64x8_store_unaligned ((b7 << shift) + off, ptrs + 56);
          ptrs += 64;
          indices += 64;
          n_elts -= 64;
        }

      if (n_elts == 0)
        return;

      if (n_elts >= 32)
        {
          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
          b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
          b2 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 16));
          b3 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 24));
          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
          u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
          u64x8_store_unaligned ((b2 << shift) + off, ptrs + 16);
          u64x8_store_unaligned ((b3 << shift) + off, ptrs + 24);
          ptrs += 32;
          indices += 32;
          n_elts -= 32;
        }
      if (n_elts >= 16)
        {
          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
          b1 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + 8));
          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
          u64x8_store_unaligned ((b1 << shift) + off, ptrs + 8);
          ptrs += 16;
          indices += 16;
          n_elts -= 16;
        }
      if (n_elts >= 8)
        {
          b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices));
          u64x8_store_unaligned ((b0 << shift) + off, ptrs);
          ptrs += 8;
          indices += 8;
          n_elts -= 8;
        }

      if (n_elts == 0)
        return;

      /* overlapping load/store covers the remaining (< 8) elements */
      b0 = u64x8_from_u32x8 (u32x8_load_unaligned (indices + n_elts - 8));
      u64x8_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 8);
    }
  else
    {
      u32 mask = pow2_mask (n_elts);
      u64x8 r = u64x8_from_u32x8 (u32x8_mask_load_zero (indices, mask));
      u64x8_mask_store ((r << shift) + u64x8_splat ((u64) base), ptrs, mask);
      return;
    }
#elif defined CLIB_HAVE_VEC256
  if (n_elts >= 4)
    {
      u64x4 off = u64x4_splat ((u64) base);
      u64x4 b0, b1, b2, b3, b4, b5, b6, b7;

      while (n_elts >= 32)
        {
          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
          b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
          b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
          b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
          b4 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 16));
          b5 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 20));
          b6 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 24));
          b7 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 28));
          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
          u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
          u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
          u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
          u64x4_store_unaligned ((b4 << shift) + off, ptrs + 16);
          u64x4_store_unaligned ((b5 << shift) + off, ptrs + 20);
          u64x4_store_unaligned ((b6 << shift) + off, ptrs + 24);
          u64x4_store_unaligned ((b7 << shift) + off, ptrs + 28);
          ptrs += 32;
          indices += 32;
          n_elts -= 32;
        }

      if (n_elts == 0)
        return;

      if (n_elts >= 16)
        {
          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
          b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
          b2 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 8));
          b3 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 12));
          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
          u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
          u64x4_store_unaligned ((b2 << shift) + off, ptrs + 8);
          u64x4_store_unaligned ((b3 << shift) + off, ptrs + 12);
          ptrs += 16;
          indices += 16;
          n_elts -= 16;
        }
      if (n_elts >= 8)
        {
          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
          b1 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + 4));
          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
          u64x4_store_unaligned ((b1 << shift) + off, ptrs + 4);
          ptrs += 8;
          indices += 8;
          n_elts -= 8;
        }
      if (n_elts > 4)
        {
          b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices));
          u64x4_store_unaligned ((b0 << shift) + off, ptrs);
          ptrs += 4;
          indices += 4;
          n_elts -= 4;
        }

      /* overlapping load/store covers the remaining (<= 4) elements */
      b0 = u64x4_from_u32x4 (u32x4_load_unaligned (indices + n_elts - 4));
      u64x4_store_unaligned ((b0 << shift) + off, ptrs + n_elts - 4);
      return;
    }
#ifdef CLIB_HAVE_VEC256_MASK_LOAD_STORE
  else
    {
      u32 mask = pow2_mask (n_elts);
      u64x4 r = u64x4_from_u32x4 (u32x4_mask_load_zero (indices, mask));
      u64x4_mask_store ((r << shift) + u64x4_splat ((u64) base), ptrs, mask);
      return;
    }
#endif
#elif defined(CLIB_HAVE_VEC128)
  if (n_elts >= 4)
    {
      u64x2 ov = u64x2_splat ((u64) base);
      u32 *i = (u32 *) indices;
      void **p = (void **) ptrs;
      u32 n = n_elts;

      while (n >= 32)
        {
          clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 16, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 20, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 24, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 28, ov, shift);
          indices += 32;
          ptrs += 32;
          n -= 32;
        }

      if (n == 0)
        return;

      if (n >= 16)
        {
          clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 8, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 12, ov, shift);
          indices += 16;
          ptrs += 16;
          n -= 16;
        }

      if (n >= 8)
        {
          clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);
          clib_index_to_ptr_u32x4 (indices, ptrs, 4, ov, shift);
          indices += 8;
          ptrs += 8;
          n -= 8;
        }

      if (n > 4)
        clib_index_to_ptr_u32x4 (indices, ptrs, 0, ov, shift);

      /* overlapping conversion of the last 4 elements, addressed from the
         original (unadvanced) index and pointer arrays */
      clib_index_to_ptr_u32x4 (i, p, n_elts - 4, ov, shift);
      return;
    }
#endif
  while (n_elts)
    {
      ptrs[0] = base + ((u64) indices[0] << shift);
      ptrs += 1;
      indices += 1;
      n_elts -= 1;
    }
}
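
/* Usage sketch, not part of the original header: the element size (64 bytes)
   and the function name below are illustrative assumptions. Converting a
   batch of u32 indices into element pointers is a single call; with
   contiguous 64-byte elements starting at `elts`, shift = 6 yields
   elts + (index << 6) for each index. */

static_always_inline void
example_indices_to_elt_ptrs (u32 *indices, void *elts, void **ptrs,
                             u32 n_elts)
{
  /* hypothetical layout: elements are 64 bytes apart, so shift by 6 */
  clib_index_to_ptr_u32 (indices, elts, 6, ptrs, n_elts);
}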

#endif