/* SPDX-License-Identifier: Apache-2.0
 * Copyright(c) 2021 Cisco Systems, Inc.
 */

#include <vppinfra/format.h>
#include <vppinfra/vector/test/test.h>
#include <vppinfra/error.h>

test_main_t test_main;

int
test_march_supported (clib_march_variant_type_t type)
{
#define _(s, n)                                                               \
  if (CLIB_MARCH_VARIANT_TYPE_##s == type)                                    \
    return clib_cpu_march_priority_##s ();
  foreach_march_variant
#undef _
    return 0;
}

clib_error_t *
test_funct (test_main_t *tm)
{
  for (int i = 0; i < CLIB_MARCH_TYPE_N_VARIANTS; i++)
    {
      test_registration_t *r = tm->registrations[i];

      if (r == 0 || test_march_supported (i) < 0)
	continue;

      fformat (stdout, "\nMultiarch Variant: %U\n", format_march_variant, i);
      fformat (stdout,
	       "-------------------------------------------------------\n");
      while (r)
	{
	  clib_error_t *err;
	  if (tm->filter && strstr (r->name, (char *) tm->filter) == 0)
	    goto next;
	  err = (r->fn) (0);
	  fformat (stdout, "%-50s %s\n", r->name, err ? "FAIL" : "PASS");
	  if (err)
	    {
	      clib_error_report (err);
	      fformat (stdout, "\n");
	    }
	next:
	  r = r->next;
	}
    }

  fformat (stdout, "\n");
  return 0;
}

#define TEST_PERF_MAX_EVENTS 7
typedef struct
{
  char *name;
  char *desc;
  u64 config[TEST_PERF_MAX_EVENTS];
  u32 type;
  u8 n_events;
  format_function_t *format_fn;
} test_perf_event_bundle_t;

static u8 *
format_test_perf_bundle_default (u8 *s, va_list *args)
{
  test_main_t *tm = &test_main;
  test_perf_event_bundle_t __clib_unused *b =
    va_arg (*args, test_perf_event_bundle_t *);
  test_perf_t *tp = va_arg (*args, test_perf_t *);
  u64 *data = va_arg (*args, u64 *);

  if (tm->ref_clock > 0)
    {
      if (data)
	s = format (s, "%8.1f", tm->ref_clock * data[0] / data[1] / 1e9);
      else
	s = format (s, "%8s", "Freq");
    }

  if (data)
    s = format (s, "%5.2f", (f64) data[2] / data[0]);
  else
    s = format (s, "%5s", "IPC");

  if (data)
    s = format (s, "%8.2f", (f64) data[0] / tp->n_ops);
  else
    s = format (s, "%8s", "Clks/Op");

  if (data)
    s = format (s, "%8.2f", (f64) data[2] / tp->n_ops);
  else
    s = format (s, "%8s", "Inst/Op");

  if (data)
    s = format (s, "%9.2f", (f64) data[3] / tp->n_ops);
  else
    s = format (s, "%9s", "Brnch/Op");

  if (data)
    s = format (s, "%10.2f", (f64) data[4] / tp->n_ops);
  else
    s = format (s, "%10s", "BrMiss/Op");
  return s;
}

static u8 *
format_test_perf_bundle_core_power (u8 *s, va_list *args)
{
  test_perf_event_bundle_t __clib_unused *b =
    va_arg (*args, test_perf_event_bundle_t *);
  test_perf_t __clib_unused *tp = va_arg (*args, test_perf_t *);
  u64 *data = va_arg (*args, u64 *);

  if (data)
    s = format (s, "%7.1f %%", (f64) 100 * data[1] / data[0]);
  else
    s = format (s, "%9s", "Level 0");

  if (data)
    s = format (s, "%8.1f %%", (f64) 100 * data[2] / data[0]);
  else
    s = format (s, "%9s", "Level 1");

  if (data)
    s = format (s, "%7.1f %%", (f64) 100 * data[3] / data[0]);
  else
    s = format (s, "%9s", "Level 2");

  return s;
}

test_perf_event_bundle_t perf_bundles[] = {
  {
    .name = "default",
    .desc = "IPC, Clocks/Operatiom, Instr/Operation, Branch Total & Miss",
    .type = PERF_TYPE_HARDWARE,
    .config[0] = PERF_COUNT_HW_CPU_CYCLES,
    .config[1] = PERF_COUNT_HW_REF_CPU_CYCLES,
    .config[2] = PERF_COUNT_HW_INSTRUCTIONS,
    .config[3] = PERF_COUNT_HW_BRANCH_INSTRUCTIONS,
    .config[4] = PERF_COUNT_HW_BRANCH_MISSES,
    .n_events = 5,
    .format_fn = format_test_perf_bundle_default,
  }
#ifdef __x86_64__
#define PERF_INTEL_CODE(event, umask) ((event) | (umask) << 8)
  ,
  {
    .name = "core-power",
    .desc =
      "Core cycles where the core was running under specific turbo schedule.",
    .type = PERF_TYPE_RAW,
    .config[0] = PERF_INTEL_CODE (0x3c, 0x00),
    .config[1] = PERF_INTEL_CODE (0x28, 0x07),
    .config[2] = PERF_INTEL_CODE (0x28, 0x18),
    .config[3] = PERF_INTEL_CODE (0x28, 0x20),
    .config[4] = PERF_INTEL_CODE (0x28, 0x40),
    .n_events = 5,
    .format_fn = format_test_perf_bundle_core_power,
  }
#endif
};

#ifdef __linux__
clib_error_t *
test_perf (test_main_t *tm)
{
  clib_error_t *err = 0;
  test_perf_event_bundle_t *b = 0;
  int group_fd = -1, fds[TEST_PERF_MAX_EVENTS];
  u64 count[TEST_PERF_MAX_EVENTS + 3] = {};
  struct perf_event_attr pe = {
    .size = sizeof (struct perf_event_attr),
    .disabled = 1,
    .exclude_kernel = 1,
    .exclude_hv = 1,
    .pinned = 1,
    .exclusive = 1,
    .read_format = (PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED |
		    PERF_FORMAT_TOTAL_TIME_RUNNING),
  };

  for (int i = 0; i < TEST_PERF_MAX_EVENTS; i++)
    fds[i] = -1;

  tm->ref_clock = os_cpu_clock_frequency ();

  if (tm->bundle)
    {
      for (int i = 0; i < ARRAY_LEN (perf_bundles); i++)
	if (strncmp ((char *) tm->bundle, perf_bundles[i].name,
		     vec_len (tm->bundle)) == 0)
	  {
	    b = perf_bundles + i;
	    break;
	  }
      if (b == 0)
	return clib_error_return (0, "Unknown bundle '%s'", tm->bundle);
    }
  else
    b = perf_bundles;

  for (int i = 0; i < b->n_events; i++)
    {
      pe.config = b->config[i];
      pe.type = b->type;
      int fd = syscall (__NR_perf_event_open, &pe, /* pid */ 0, /* cpu */ -1,
			/* group_fd */ group_fd, /* flags */ 0);
      if (fd < 0)
	{
	  err = clib_error_return_unix (0, "perf_event_open");
	  goto done;
	}

      if (group_fd == -1)
	{
	  group_fd = fd;
	  pe.pinned = 0;
	  pe.exclusive = 0;
	}
      fds[i] = fd;
    }
  fformat (stdout, "Warming up...\n");
  for (u64 i = 0; i < (u64) tm->ref_clock; i++)
    asm inline("" : : "r"(i * i) : "memory");

  for (int i = 0; i < CLIB_MARCH_TYPE_N_VARIANTS; i++)
    {
      test_registration_t *r = tm->registrations[i];

      if (r == 0 || test_march_supported (i) < 0)
	continue;

      fformat (stdout, "\nMultiarch Variant: %U\n", format_march_variant, i);
      fformat (stdout,
	       "-------------------------------------------------------\n");
      while (r)
	{
	  if (r->perf_tests)
	    {
	      test_perf_t *pt = r->perf_tests;
	      if (tm->filter && strstr (r->name, (char *) tm->filter) == 0)
		goto next;
	      fformat (stdout, "%-22s%-12s%U\n", r->name, "OpType",
		       b->format_fn, b, pt, 0UL);
	      do
		{
		  u32 read_size = (b->n_events + 3) * sizeof (u64);
		  for (int i = 0; i < tm->repeat; i++)
		    {
		      test_perf_event_reset (group_fd);
		      pt->fn (group_fd, pt);
		      if ((read (group_fd, &count, read_size) != read_size))
			{
			  err = clib_error_return_unix (0, "read");
			  goto done;
			}
		      if (count[1] != count[2])
			clib_warning (
			  "perf counters were not running all the time."
#ifdef __x86_64__
			  "\nConsider turning NMI watchdog off ('sysctl -w "
			  "kernel.nmi_watchdog=0')."
#endif
			);
		      fformat (stdout, "  %-20s%-12s%U\n", pt->name,
			       pt->op_name ? pt->op_name : "", b->format_fn, b,
			       pt, count + 3);
		    }
		}
	      while ((++pt)->fn);
	    }
	next:
	  r = r->next;
	}
    }

done:
  for (int i = 0; i < TEST_PERF_MAX_EVENTS; i++)
    if (fds[i] != -1)
      close (fds[i]);
  return err;
}
#endif

int
main (int argc, char *argv[])
{
  test_main_t *tm = &test_main;
  unformat_input_t _i = {}, *i = &_i;
  clib_mem_init (0, 64ULL << 20);
  clib_error_t *err;
  int perf = 0;

  /* defaults */
  tm->repeat = 3;

  unformat_init_command_line (i, argv);

  while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (i, "perf"))
	perf = 1;
      else if (unformat (i, "filter %s", &tm->filter))
	;
      else if (unformat (i, "bundle %s", &tm->bundle))
	;
      else if (unformat (i, "repeat %d", &tm->repeat))
	;
      else
	{
	  clib_warning ("unknown input '%U'", format_unformat_error, i);
	  exit (1);
	}
    }

  if (perf)
    err = test_perf (tm);
  else
    err = test_funct (tm);

  if (err)
    {
      clib_error_report (err);
      fformat (stderr, "\n");
      return 1;
    }
  return 0;
}

void *
test_mem_alloc (uword size)
{
  void *rv;
  size = round_pow2 (size, CLIB_CACHE_LINE_BYTES);
  rv = clib_mem_alloc_aligned (size, CLIB_CACHE_LINE_BYTES);
  clib_memset_u8 (rv, 0, size);
  return rv;
}

void *
test_mem_alloc_and_fill_inc_u8 (uword size, u8 start, u8 mask)
{
  u8 *rv;
  mask = mask ? mask : 0xff;
  size = round_pow2 (size, CLIB_CACHE_LINE_BYTES);
  rv = clib_mem_alloc_aligned (size, CLIB_CACHE_LINE_BYTES);
  for (uword i = 0; i < size; i++)
    rv[i] = ((u8) i + start) & mask;
  return rv;
}

void *
test_mem_alloc_and_splat (uword elt_size, uword n_elts, void *elt)
{
  u8 *rv, *e;
  uword data_size = elt_size * n_elts;
  uword alloc_size = round_pow2 (data_size, CLIB_CACHE_LINE_BYTES);
  e = rv = clib_mem_alloc_aligned (alloc_size, CLIB_CACHE_LINE_BYTES);
  while (e - rv < data_size)
    {
      clib_memcpy_fast (e, elt, elt_size);
      e += elt_size;
    }

  if (data_size < alloc_size)
    clib_memset_u8 (e, 0, alloc_size - data_size);
  return rv;
}

void
test_mem_free (void *p)
{
  clib_mem_free (p);
}