/* SPDX-License-Identifier: Apache-2.0
 * Copyright(c) 2021 Cisco Systems, Inc.
 */

#ifndef included_vector_compress_h
#define included_vector_compress_h
#include <vppinfra/clib.h>
#include <vppinfra/memcpy.h>

/* Compress one block of 64 u64 elements.
 *
 * For every bit set in mask, the source element at that bit index is
 * appended to dst. Returns the position in dst just past the last element
 * written.
 *
 * The vector paths pass whole source chunks by value irrespective of the
 * mask, so the full 64-element block is read from src.
 */
static_always_inline u64 *
clib_compress_u64_x64 (u64 *dst, u64 *src, u64 mask)
{
#if defined(CLIB_HAVE_VEC512_COMPRESS)
  /* 8 chunks of 8 lanes, each one consuming the low 8 bits of mask */
  u64x8u *chunk = (u64x8u *) src;
  for (int c = 0; c < 8; c++, mask >>= 8)
    {
      u8 lanes = (u8) mask;
      u64x8_compress_store (chunk[c], mask, dst);
      dst += _popcnt32 (lanes);
    }
#elif defined(CLIB_HAVE_VEC256_COMPRESS)
  /* 16 chunks of 4 lanes, each one consuming the low 4 bits of mask */
  u64x4u *chunk = (u64x4u *) src;
  for (int c = 0; c < 16; c++, mask >>= 4)
    {
      u8 lanes = ((u8) mask) & 0x0f;
      u64x4_compress_store (chunk[c], mask, dst);
      dst += _popcnt32 (lanes);
    }
#else
  /* scalar fallback: copy one element per set bit */
  u32 bit;
  foreach_set_bit_index (bit, mask)
    {
      *dst = src[bit];
      dst++;
    }
#endif
  return dst;
}

/* Variant of the 64-element u64 block compress used for a partial block.
 *
 * Same result as the unmasked helper, but the vector paths fetch each
 * source chunk through a masked load (presumably so that lanes not selected
 * by mask are not read from memory -- confirm against the
 * *_mask_load_zero helpers). Returns the position in dst just past the
 * last element written.
 */
static_always_inline u64 *
clib_compress_u64_x64_masked (u64 *dst, u64 *src, u64 mask)
{
#if defined(CLIB_HAVE_VEC512_COMPRESS) &&                                     \
  defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
  /* 8 chunks of 8 lanes, each one consuming the low 8 bits of mask */
  u64x8u *chunk = (u64x8u *) src;
  for (int c = 0; c < 8; c++, mask >>= 8)
    {
      u8 lanes = (u8) mask;
      u64x8u v = u64x8_mask_load_zero (chunk + c, mask);
      u64x8_compress_store (v, mask, dst);
      dst += _popcnt32 (lanes);
    }
#elif defined(CLIB_HAVE_VEC256_COMPRESS) &&                                   \
  defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
  /* 16 chunks of 4 lanes, each one consuming the low 4 bits of mask */
  u64x4u *chunk = (u64x4u *) src;
  for (int c = 0; c < 16; c++, mask >>= 4)
    {
      u8 lanes = ((u8) mask) & 0x0f;
      u64x4u v = u64x4_mask_load_zero (chunk + c, mask);
      u64x4_compress_store (v, mask, dst);
      dst += _popcnt32 (lanes);
    }
#else
  /* scalar fallback: only elements selected by mask are read */
  u32 bit;
  foreach_set_bit_index (bit, mask)
    {
      *dst = src[bit];
      dst++;
    }
#endif
  return dst;
}

/** \brief Compress array of 64-bit elements into destination array based on
 * mask

    @param dst destination array of u64 elements
    @param src source array of u64 elements
    @param mask array of u64 values representing compress mask
    @param n_elts number of elements in the source array
    @return number of elements stored in destination array
*/

static_always_inline u32
clib_compress_u64 (u64 *dst, u64 *src, u64 *mask, u32 n_elts)
{
  u64 *out = dst;

  /* full blocks: one mask word covers 64 source elements */
  for (; n_elts >= 64; n_elts -= 64, src += 64, mask++)
    {
      if (mask[0] != ~0ULL)
	{
	  out = clib_compress_u64_x64 (out, src, mask[0]);
	  continue;
	}

      /* every element selected, so a plain copy is enough */
      clib_memcpy_fast (out, src, 64 * sizeof (u64));
      out += 64;
    }

  if (PREDICT_TRUE (n_elts == 0))
    return out - dst;

  /* partial last block: limit the mask word with pow2_mask (n_elts) */
  u64 tail_mask = mask[0] & pow2_mask (n_elts);
  out = clib_compress_u64_x64_masked (out, src, tail_mask);
  return out - dst;
}

/* Compress one block of 64 u32 elements.
 *
 * For every bit set in mask, the source element at that bit index is
 * appended to dst. Returns the position in dst just past the last element
 * written.
 *
 * The vector paths pass whole source chunks by value irrespective of the
 * mask, so the full 64-element block is read from src.
 */
static_always_inline u32 *
clib_compress_u32_x64 (u32 *dst, u32 *src, u64 mask)
{
#if defined(CLIB_HAVE_VEC512_COMPRESS)
  /* 4 chunks of 16 lanes, each one consuming the low 16 bits of mask */
  u32x16u *chunk = (u32x16u *) src;
  for (int c = 0; c < 4; c++, mask >>= 16)
    {
      u16 lanes = (u16) mask;
      u32x16_compress_store (chunk[c], mask, dst);
      dst += _popcnt32 (lanes);
    }
#elif defined(CLIB_HAVE_VEC256_COMPRESS)
  /* 8 chunks of 8 lanes, each one consuming the low 8 bits of mask */
  u32x8u *chunk = (u32x8u *) src;
  for (int c = 0; c < 8; c++, mask >>= 8)
    {
      u8 lanes = (u8) mask;
      u32x8_compress_store (chunk[c], mask, dst);
      dst += _popcnt32 (lanes);
    }
#else
  /* scalar fallback: copy one element per set bit */
  u32 bit;
  foreach_set_bit_index (bit, mask)
    {
      *dst = src[bit];
      dst++;
    }
#endif
  return dst;
}

/* Variant of the 64-element u32 block compress used for a partial block.
 *
 * Same result as the unmasked helper, but the vector paths fetch each
 * source chunk through a masked load (presumably so that lanes not selected
 * by mask are not read from memory -- confirm against the
 * *_mask_load_zero helpers). Returns the position in dst just past the
 * last element written.
 */
static_always_inline u32 *
clib_compress_u32_x64_masked (u32 *dst, u32 *src, u64 mask)
{
#if defined(CLIB_HAVE_VEC512_COMPRESS) &&                                     \
  defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
  /* 4 chunks of 16 lanes, each one consuming the low 16 bits of mask */
  u32x16u *chunk = (u32x16u *) src;
  for (int c = 0; c < 4; c++, mask >>= 16)
    {
      u16 lanes = (u16) mask;
      u32x16u v = u32x16_mask_load_zero (chunk + c, mask);
      u32x16_compress_store (v, mask, dst);
      dst += _popcnt32 (lanes);
    }
#elif defined(CLIB_HAVE_VEC256_COMPRESS) &&                                   \
  defined(CLIB_HAVE_VEC256_MASK_LOAD_STORE)
  /* 8 chunks of 8 lanes, each one consuming the low 8 bits of mask */
  u32x8u *chunk = (u32x8u *) src;
  for (int c = 0; c < 8; c++, mask >>= 8)
    {
      u8 lanes = (u8) mask;
      u32x8u v = u32x8_mask_load_zero (chunk + c, mask);
      u32x8_compress_store (v, mask, dst);
      dst += _popcnt32 (lanes);
    }
#else
  /* scalar fallback: only elements selected by mask are read */
  u32 bit;
  foreach_set_bit_index (bit, mask)
    {
      *dst = src[bit];
      dst++;
    }
#endif
  return dst;
}

/** \brief Compress array of 32-bit elements into destination array based on
 * mask

    @param dst destination array of u32 elements
    @param src source array of u32 elements
    @param mask array of u64 values representing compress mask
    @param n_elts number of elements in the source array
    @return number of elements stored in destination array
*/

static_always_inline u32
clib_compress_u32 (u32 *dst, u32 *src, u64 *mask, u32 n_elts)
{
  u32 *out = dst;

  /* full blocks: one mask word covers 64 source elements */
  for (; n_elts >= 64; n_elts -= 64, src += 64, mask++)
    {
      if (mask[0] != ~0ULL)
	{
	  out = clib_compress_u32_x64 (out, src, mask[0]);
	  continue;
	}

      /* every element selected, so a plain copy is enough */
      clib_memcpy_u32 (out, src, 64);
      out += 64;
    }

  if (PREDICT_TRUE (n_elts == 0))
    return out - dst;

  /* partial last block: limit the mask word with pow2_mask (n_elts) */
  u64 tail_mask = mask[0] & pow2_mask (n_elts);
  out = clib_compress_u32_x64_masked (out, src, tail_mask);
  return out - dst;
}

/* Compress one block of 64 u16 elements.
 *
 * For every bit set in mask, the source element at that bit index is
 * appended to dst. Returns the position in dst just past the last element
 * written.
 *
 * The vector path passes whole source chunks by value irrespective of the
 * mask, so the full 64-element block is read from src.
 */
static_always_inline u16 *
clib_compress_u16_x64 (u16 *dst, u16 *src, u64 mask)
{
#if defined(CLIB_HAVE_VEC512_COMPRESS_U8_U16)
  /* 2 chunks of 32 lanes, each one consuming the low 32 bits of mask */
  u16x32u *chunk = (u16x32u *) src;
  for (int c = 0; c < 2; c++, mask >>= 32)
    {
      u32 lanes = (u32) mask;
      u16x32_compress_store (chunk[c], mask, dst);
      dst += _popcnt32 (lanes);
    }
#else
  /* scalar fallback: copy one element per set bit */
  u32 bit;
  foreach_set_bit_index (bit, mask)
    {
      *dst = src[bit];
      dst++;
    }
#endif
  return dst;
}

/* Variant of the 64-element u16 block compress used for a partial block.
 *
 * Same result as the unmasked helper, but the vector path fetches each
 * source chunk through a masked load (presumably so that lanes not selected
 * by mask are not read from memory -- confirm against the
 * *_mask_load_zero helpers). Returns the position in dst just past the
 * last element written.
 */
static_always_inline u16 *
clib_compress_u16_x64_masked (u16 *dst, u16 *src, u64 mask)
{
#if defined(CLIB_HAVE_VEC512_COMPRESS_U8_U16) &&                              \
  defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
  /* 2 chunks of 32 lanes, each one consuming the low 32 bits of mask */
  u16x32u *chunk = (u16x32u *) src;
  for (int c = 0; c < 2; c++, mask >>= 32)
    {
      u32 lanes = (u32) mask;
      u16x32u v = u16x32_mask_load_zero (chunk + c, mask);
      u16x32_compress_store (v, mask, dst);
      dst += _popcnt32 (lanes);
    }
#else
  /* scalar fallback: only elements selected by mask are read */
  u32 bit;
  foreach_set_bit_index (bit, mask)
    {
      *dst = src[bit];
      dst++;
    }
#endif
  return dst;
}

/** \brief Compress array of 16-bit elements into destination array based on
 * mask

    @param dst destination array of u16 elements
    @param src source array of u16 elements
    @param mask array of u64 values representing compress mask
    @param n_elts number of elements in the source array
    @return number of elements stored in destination array
*/

static_always_inline u32
clib_compress_u16 (u16 *dst, u16 *src, u64 *mask, u32 n_elts)
{
  u16 *out = dst;

  /* full blocks: one mask word covers 64 source elements */
  for (; n_elts >= 64; n_elts -= 64, src += 64, mask++)
    {
      if (mask[0] != ~0ULL)
	{
	  out = clib_compress_u16_x64 (out, src, mask[0]);
	  continue;
	}

      /* every element selected, so a plain copy is enough */
      clib_memcpy_fast (out, src, 64 * sizeof (u16));
      out += 64;
    }

  if (PREDICT_TRUE (n_elts == 0))
    return out - dst;

  /* partial last block: limit the mask word with pow2_mask (n_elts) */
  u64 tail_mask = mask[0] & pow2_mask (n_elts);
  out = clib_compress_u16_x64_masked (out, src, tail_mask);
  return out - dst;
}

/* Compress one block of 64 u8 elements.
 *
 * For every bit set in mask, the source element at that bit index is
 * appended to dst. Returns the position in dst just past the last element
 * written.
 *
 * The vector path passes the whole 64-byte source block by value
 * irrespective of the mask, so the full block is read from src.
 */
static_always_inline u8 *
clib_compress_u8_x64 (u8 *dst, u8 *src, u64 mask)
{
#if defined(CLIB_HAVE_VEC512_COMPRESS_U8_U16)
  /* the whole block fits one 64-lane vector, so no chunk loop is needed */
  u8x64u *block = (u8x64u *) src;
  u32 n_selected = _popcnt64 (mask);
  u8x64_compress_store (*block, mask, dst);
  dst += n_selected;
#else
  /* scalar fallback: copy one element per set bit */
  u32 bit;
  foreach_set_bit_index (bit, mask)
    {
      *dst = src[bit];
      dst++;
    }
#endif
  return dst;
}

/* Variant of the 64-element u8 block compress used for a partial block.
 *
 * Same result as the unmasked helper, but the vector path fetches the
 * source block through a masked load (presumably so that lanes not selected
 * by mask are not read from memory -- confirm against the
 * *_mask_load_zero helpers). Returns the position in dst just past the
 * last element written.
 */
static_always_inline u8 *
clib_compress_u8_x64_masked (u8 *dst, u8 *src, u64 mask)
{
#if defined(CLIB_HAVE_VEC512_COMPRESS_U8_U16) &&                              \
  defined(CLIB_HAVE_VEC512_MASK_LOAD_STORE)
  /* the whole block fits one 64-lane vector, so no chunk loop is needed */
  u8x64u *block = (u8x64u *) src;
  u32 n_selected = _popcnt64 (mask);
  u8x64u v = u8x64_mask_load_zero (block, mask);
  u8x64_compress_store (v, mask, dst);
  dst += n_selected;
#else
  /* scalar fallback: only elements selected by mask are read */
  u32 bit;
  foreach_set_bit_index (bit, mask)
    {
      *dst = src[bit];
      dst++;
    }
#endif
  return dst;
}

/** \brief Compress array of 8-bit elements into destination array based on
 * mask

    @param dst destination array of u8 elements
    @param src source array of u8 elements
    @param mask array of u64 values representing compress mask
    @param n_elts number of elements in the source array
    @return number of elements stored in destination array
*/

static_always_inline u32
clib_compress_u8 (u8 *dst, u8 *src, u64 *mask, u32 n_elts)
{
  u8 *out = dst;

  /* full blocks: one mask word covers 64 source elements */
  for (; n_elts >= 64; n_elts -= 64, src += 64, mask++)
    {
      if (mask[0] != ~0ULL)
	{
	  out = clib_compress_u8_x64 (out, src, mask[0]);
	  continue;
	}

      /* every element selected, so a plain copy is enough */
      clib_memcpy_fast (out, src, 64);
      out += 64;
    }

  if (PREDICT_TRUE (n_elts == 0))
    return out - dst;

  /* partial last block: limit the mask word with pow2_mask (n_elts) */
  u64 tail_mask = mask[0] & pow2_mask (n_elts);
  out = clib_compress_u8_x64_masked (out, src, tail_mask);
  return out - dst;
}

#endif