/*
  Copyright (c) 2010 Cisco and/or its affiliates.

  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

#include <vppinfra/error.h>
#include <vppinfra/unix.h>
#include <vppinfra/elog.h>
#include <vppinfra/format.h>
#include <vppinfra/os.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>

typedef enum
{
  RUNNING = 0,
  WAKEUP,
} sched_event_type_t;

typedef struct
{
  u32 cpu;
  u8 *task;
  u32 pid;
  f64 timestamp;
  sched_event_type_t type;
} sched_event_t;

void
kelog_init (elog_main_t * em, char *kernel_tracer, u32 n_events)
{
  int enable_fd, current_tracer_fd, data_fd;
  int len;
  struct timespec ts, ts2;
  char *trace_enable = "/debug/tracing/tracing_enabled";
  char *current_tracer = "/debug/tracing/current_tracer";
  char *trace_data = "/debug/tracing/trace";
  f64 realtime, monotonic;
  f64 freq, secs_per_clock;

  ASSERT (kernel_tracer);

  /*$$$$ fixme */
  n_events = 1 << 18;

  /* init first so we won't hurt ourselves if we bail */
  elog_init (em, n_events);

  enable_fd = open (trace_enable, O_RDWR);
  if (enable_fd < 0)
    {
      clib_warning ("Couldn't open %s", trace_enable);
      return;
    }
  /* disable kernel tracing */
  if (write (enable_fd, "0\n", 2) != 2)
    {
      clib_unix_warning ("disable tracing");
      close (enable_fd);
      return;
    }

  /*
   * open + clear the data buffer.
   * see .../linux/kernel/trace/trace.c:tracing_open()
   */
  data_fd = open (trace_data, O_RDWR | O_TRUNC);
  if (data_fd < 0)
    {
      clib_warning ("Couldn't open+clear %s", trace_data);
      return;
    }
  close (data_fd);

  /* configure tracing */
  current_tracer_fd = open (current_tracer, O_RDWR);

  if (current_tracer_fd < 0)
    {
      clib_warning ("Couldn't open %s", current_tracer);
      close (enable_fd);
      return;
    }

  len = strlen (kernel_tracer);

  if (write (current_tracer_fd, kernel_tracer, len) != len)
    {
      clib_unix_warning ("configure trace");
      close (current_tracer_fd);
      close (enable_fd);
      return;
    }

  close (current_tracer_fd);

  /*
   * The kernel event log uses CLOCK_MONOTONIC timestamps,
   * not CLOCK_REALTIME timestamps. These differ by a constant
   * but the constant is not available in user mode.
   * This estimate will be off by one syscall round-trip.
   */
  clib_time_init (&em->cpu_timer);
  em->init_time.cpu = em->cpu_timer.init_cpu_time;
  syscall (SYS_clock_gettime, CLOCK_MONOTONIC, &ts);

  /* enable kernel tracing */
  if (write (enable_fd, "1\n", 2) != 2)
    {
      clib_unix_warning ("enable tracing");
      close (enable_fd);
      return;
    }

  close (enable_fd);
}


u8 *
format_sched_event (u8 * s, va_list * va)
{
  sched_event_t *e = va_arg (*va, sched_event_t *);

  s = format (s, "cpu %d task %10s type %s timestamp %12.6f\n",
	      e->cpu, e->task, e->type ? "WAKEUP " : "RUNNING", e->timestamp);

  return s;
}

sched_event_t *
parse_sched_switch_trace (u8 * tdata, u32 * index)
{
  u8 *cp = tdata + *index;
  u8 *limit = tdata + vec_len (tdata);
  int colons;
  static sched_event_t event;
  sched_event_t *e = &event;
  static u8 *task_name;
  u32 secs, usecs;
  int i;

again:
  /* eat leading w/s */
  while (cp < limit && (*cp == ' ' && *cp == '\t'))
    cp++;
  if (cp == limit)
    return 0;

  /* header line */
  if (*cp == '#')
    {
      while (cp < limit && (*cp != '\n'))
	cp++;
      if (*cp == '\n')
	{
	  cp++;
	  goto again;
	}
      clib_warning ("bugger 0");
      return 0;
    }

  while (cp < limit && *cp != ']')
    cp++;

  if (*cp == 0)
    return 0;

  if (*cp != ']')
    {
      clib_warning ("bugger 0.1");
      return 0;
    }

  cp++;
  while (cp < limit && (*cp == ' ' && *cp == '\t'))
    cp++;
  if (cp == limit)
    {
      clib_warning ("bugger 0.2");
      return 0;
    }

  secs = atoi (cp);

  while (cp < limit && (*cp != '.'))
    cp++;

  if (cp == limit)
    {
      clib_warning ("bugger 0.3");
      return 0;
    }

  cp++;

  usecs = atoi (cp);

  e->timestamp = ((f64) secs) + ((f64) usecs) * 1e-6;

  /* eat up to third colon */
  for (i = 0; i < 3; i++)
    {
      while (cp < limit && *cp != ':')
	cp++;
      cp++;
    }
  --cp;
  if (*cp != ':')
    {
      clib_warning ("bugger 1");
      return 0;
    }
  /* aim at '>' (switch-to) / '+' (wakeup) */
  cp += 5;
  if (cp >= limit)
    {
      clib_warning ("bugger 2");
      return 0;
    }
  if (*cp == '>')
    e->type = RUNNING;
  else if (*cp == '+')
    e->type = WAKEUP;
  else
    {
      clib_warning ("bugger 3");
      return 0;
    }

  cp += 3;
  if (cp >= limit)
    {
      clib_warning ("bugger 4");
      return 0;
    }

  e->cpu = atoi (cp);
  cp += 4;

  if (cp >= limit)
    {
      clib_warning ("bugger 4");
      return 0;
    }
  while (cp < limit && (*cp == ' ' || *cp == '\t'))
    cp++;

  e->pid = atoi (cp);

  for (i = 0; i < 2; i++)
    {
      while (cp < limit && *cp != ':')
	cp++;
      cp++;
    }
  --cp;
  if (*cp != ':')
    {
      clib_warning ("bugger 5");
      return 0;
    }

  cp += 3;
  if (cp >= limit)
    {
      clib_warning ("bugger 6");
      return 0;
    }
  while (cp < limit && (*cp != ' ' && *cp != '\n'))
    {
      vec_add1 (task_name, *cp);
      cp++;
    }
  vec_add1 (task_name, 0);
  /* _vec_len() = 0 in caller */
  e->task = task_name;

  if (cp < limit)
    cp++;

  *index = cp - tdata;
  return e;
}

static u32
elog_id_for_pid (elog_main_t * em, u8 * name, u32 pid)
{
  uword *p, r;
  mhash_t *h = &em->string_table_hash;

  if (!em->string_table_hash.hash)
    mhash_init (h, sizeof (uword), sizeof (pid));

  p = mhash_get (h, &pid);
  if (p)
    return p[0];
  r = elog_string (em, "%s(%d)", name, pid);
  mhash_set (h, &pid, r, /* old_value */ 0);
  return r;
}

void
kelog_collect_sched_switch_trace (elog_main_t * em)
{
  int enable_fd, data_fd;
  char *trace_enable = "/debug/tracing/tracing_enabled";
  char *trace_data = "/debug/tracing/trace";
  u8 *data = 0;
  u8 *dp;
  int bytes, total_bytes;
  u32 pos;
  sched_event_t *evt;
  u64 nsec_to_add;
  u32 index;
  f64 clocks_per_sec;

  enable_fd = open (trace_enable, O_RDWR);
  if (enable_fd < 0)
    {
      clib_warning ("Couldn't open %s", trace_enable);
      return;
    }
  /* disable kernel tracing */
  if (write (enable_fd, "0\n", 2) != 2)
    {
      clib_unix_warning ("disable tracing");
      close (enable_fd);
      return;
    }
  close (enable_fd);

  /* Read the trace data */
  data_fd = open (trace_data, O_RDWR);
  if (data_fd < 0)
    {
      clib_warning ("Couldn't open %s", trace_data);
      return;
    }

  /*
   * Extract trace into a vector. Note that seq_printf() [kernel]
   * is not guaranteed to produce 4096 bytes at a time.
   */
  vec_validate (data, 4095);
  total_bytes = 0;
  pos = 0;
  while (1)
    {
      bytes = read (data_fd, data + pos, 4096);
      if (bytes <= 0)
	break;

      total_bytes += bytes;
      _vec_len (data) = total_bytes;

      pos = vec_len (data);
      vec_validate (data, vec_len (data) + 4095);
    }
  vec_add1 (data, 0);

  /* Synthesize events */
  em->is_enabled = 1;

  index = 0;
  while ((evt = parse_sched_switch_trace (data, &index)))
    {
      u64 fake_cpu_clock;

      fake_cpu_clock = evt->timestamp * em->cpu_timer.clocks_per_second;
      {
	ELOG_TYPE_DECLARE (e) =
	{
	  .format = "%d: %s %s",.format_args = "i4T4t4",.n_enum_strings =
	    2,.enum_strings =
	  {
	  "running", "wakeup",}
	,};
	struct
	{
	  u32 cpu, string_table_offset, which;
	} *ed;

	ed = elog_event_data_not_inline (em, &__ELOG_TYPE_VAR (e),
					 &em->default_track, fake_cpu_clock);
	ed->cpu = evt->cpu;
	ed->string_table_offset = elog_id_for_pid (em, evt->task, evt->pid);
	ed->which = evt->type;
      }
      _vec_len (evt->task) = 0;
    }
  em->is_enabled = 0;
}

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */