/*
 *------------------------------------------------------------------
 * Copyright (c) 2018 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */

#include <poll.h>
#include <vlib/vlib.h>
#include <vlib/unix/unix.h>
#include <vlib/pci/pci.h>
#include <vnet/ethernet/ethernet.h>
#include <vnet/devices/devices.h>
#include "af_xdp.h"

#define foreach_af_xdp_input_error \
  _(POLL_REQUIRED, "poll required") \
  _(POLL_FAILURES, "poll failures")

typedef enum
{
#define _(f,s) AF_XDP_INPUT_ERROR_##f,
  foreach_af_xdp_input_error
#undef _
    AF_XDP_INPUT_N_ERROR,
} af_xdp_input_error_t;

static __clib_unused char *af_xdp_input_error_strings[] = {
#define _(n,s) s,
  foreach_af_xdp_input_error
#undef _
};

static_always_inline void
af_xdp_device_input_trace (vlib_main_t * vm, vlib_node_runtime_t * node,
			   u32 n_left, const u32 * bi, u32 next_index,
			   u32 hw_if_index)
{
  u32 n_trace = vlib_get_trace_count (vm, node);

  if (PREDICT_TRUE (0 == n_trace))
    return;

  while (n_trace && n_left)
    {
      vlib_buffer_t *b = vlib_get_buffer (vm, bi[0]);
      if (PREDICT_TRUE
	  (vlib_trace_buffer (vm, node, next_index, b, /* follow_chain */ 0)))
	{
	  af_xdp_input_trace_t *tr =
	    vlib_add_trace (vm, node, b, sizeof (*tr));
	  tr->next_index = next_index;
	  tr->hw_if_index = hw_if_index;
	  n_trace--;
	}
      n_left--;
      bi++;
    }

  vlib_set_trace_count (vm, node, n_trace);
}

static_always_inline void
af_xdp_device_input_refill_db (vlib_main_t * vm,
			       const vlib_node_runtime_t * node,
			       af_xdp_device_t * ad, af_xdp_rxq_t * rxq,
			       const u32 n_alloc)
{
  int ret;

  xsk_ring_prod__submit (&rxq->fq, n_alloc);

  if (!xsk_ring_prod__needs_wakeup (&rxq->fq))
    return;

  vlib_error_count (vm, node->node_index, AF_XDP_INPUT_ERROR_POLL_REQUIRED,
		    1);

  struct pollfd fd = {.fd = rxq->xsk_fd,.events = POLLIN };
  ret = poll (&fd, 1, 0);
  if (PREDICT_TRUE (ret >= 0))
    return;

  /* something bad is happening */
  vlib_error_count (vm, node->node_index, AF_XDP_INPUT_ERROR_POLL_FAILURES,
		    1);
  af_xdp_device_error (ad, "poll() failed");
}

static_always_inline void
af_xdp_device_input_refill (vlib_main_t * vm,
			    const vlib_node_runtime_t * node,
			    af_xdp_device_t * ad, af_xdp_rxq_t * rxq,
			    const int copy)
{
  __u64 *fill;
  const u32 size = rxq->fq.size;
  const u32 mask = size - 1;
  u32 bis[VLIB_FRAME_SIZE], *bi = bis;
  u32 n_alloc, n, n_wrap;
  u32 idx = 0;

  ASSERT (mask == rxq->fq.mask);

  /* do not enqueue more packet than ring space */
  n_alloc = xsk_prod_nb_free (&rxq->fq, 16);
  /* do not bother to allocate if too small */
  if (n_alloc < 16)
    return;

  n_alloc = clib_min (n_alloc, ARRAY_LEN (bis));
  n_alloc = vlib_buffer_alloc_from_pool (vm, bis, n_alloc, ad->pool);
  n = xsk_ring_prod__reserve (&rxq->fq, n_alloc, &idx);
  ASSERT (n == n_alloc);

  fill = xsk_ring_prod__fill_addr (&rxq->fq, idx);
  n = clib_min (n_alloc, size - (idx & mask));
  n_wrap = n_alloc - n;

  /*
   * Note about headroom: for some reasons, there seem to be a discrepency
   * between 0-copy and copy mode:
   *   - 0-copy: XDP_PACKET_HEADROOM will be added to the user headroom
   *   - copy: nothing is added to the user headroom
   * We privileged 0-copy and set headroom to 0. As XDP_PACKET_HEADROOM ==
   * sizeof(vlib_buffer_t), data will correctly point to vlib_buffer_t->data.
   * In copy mode, we have to add sizeof(vlib_buffer_t) to desc offset during
   * refill.
   */
  STATIC_ASSERT (sizeof (vlib_buffer_t) == XDP_PACKET_HEADROOM, "wrong size");
#define bi2addr(bi) \
  (((bi) << CLIB_LOG2_CACHE_LINE_BYTES) + (copy ? sizeof(vlib_buffer_t) : 0))

wrap_around:

  while (n >= 8)
    {
#ifdef CLIB_HAVE_VEC256
      u64x4 b0 = u64x4_from_u32x4 (*(u32x4u *) (bi + 0));
      u64x4 b1 = u64x4_from_u32x4 (*(u32x4u *) (bi + 4));
      *(u64x4u *) (fill + 0) = bi2addr (b0);
      *(u64x4u *) (fill + 4) = bi2addr (b1);
#else
      fill[0] = bi2addr (bi[0]);
      fill[1] = bi2addr (bi[1]);
      fill[2] = bi2addr (bi[2]);
      fill[3] = bi2addr (bi[3]);
      fill[4] = bi2addr (bi[4]);
      fill[5] = bi2addr (bi[5]);
      fill[6] = bi2addr (bi[6]);
      fill[7] = bi2addr (bi[7]);
#endif
      fill += 8;
      bi += 8;
      n -= 8;
    }

  while (n >= 1)
    {
      fill[0] = bi2addr (bi[0]);
      fill += 1;
      bi += 1;
      n -= 1;
    }

  if (n_wrap)
    {
      fill = xsk_ring_prod__fill_addr (&rxq->fq, 0);
      n = n_wrap;
      n_wrap = 0;
      goto wrap_around;
    }

  af_xdp_device_input_refill_db (vm, node, ad, rxq, n_alloc);
}

static_always_inline void
af_xdp_device_input_ethernet (vlib_main_t * vm, vlib_node_runtime_t * node,
			      const u32 next_index, const u32 sw_if_index,
			      const u32 hw_if_index)
{
  vlib_next_frame_t *nf;
  vlib_frame_t *f;
  ethernet_input_frame_t *ef;

  if (PREDICT_FALSE (VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT != next_index))
    return;

  nf =
    vlib_node_runtime_get_next_frame (vm, node,
				      VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT);
  f = vlib_get_frame (vm, nf->frame);
  f->flags = ETH_INPUT_FRAME_F_SINGLE_SW_IF_IDX;

  ef = vlib_frame_scalar_args (f);
  ef->sw_if_index = sw_if_index;
  ef->hw_if_index = hw_if_index;
}

static_always_inline u32
af_xdp_device_input_bufs (vlib_main_t * vm, const af_xdp_device_t * ad,
			  af_xdp_rxq_t * rxq, u32 * bis, const u32 n_rx,
			  vlib_buffer_t * bt, u32 idx, const int copy)
{
  vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs;
  u16 lens[VLIB_FRAME_SIZE], *len = lens;
  const u32 mask = rxq->rx.mask;
  u32 n = n_rx, *bi = bis, bytes = 0;

#define addr2bi(addr) \
  (((addr) - (copy ? sizeof(vlib_buffer_t) : 0)) >> CLIB_LOG2_CACHE_LINE_BYTES)

  while (n >= 1)
    {
      const struct xdp_desc *desc = xsk_ring_cons__rx_desc (&rxq->rx, idx);
      bi[0] = addr2bi (xsk_umem__extract_addr (desc->addr));
      ASSERT (vlib_buffer_is_known (vm, bi[0]) ==
	      VLIB_BUFFER_KNOWN_ALLOCATED);
      len[0] = desc->len;
      idx = (idx + 1) & mask;
      bi += 1;
      len += 1;
      n -= 1;
    }

  vlib_get_buffers (vm, bis, bufs, n_rx);

  n = n_rx;
  len = lens;

  while (n >= 8)
    {
      vlib_prefetch_buffer_header (b[4], LOAD);
      vlib_buffer_copy_template (b[0], bt);
      bytes += b[0]->current_length = len[0];

      vlib_prefetch_buffer_header (b[5], LOAD);
      vlib_buffer_copy_template (b[1], bt);
      bytes += b[1]->current_length = len[1];

      vlib_prefetch_buffer_header (b[6], LOAD);
      vlib_buffer_copy_template (b[2], bt);
      bytes += b[2]->current_length = len[2];

      vlib_prefetch_buffer_header (b[7], LOAD);
      vlib_buffer_copy_template (b[3], bt);
      bytes += b[3]->current_length = len[3];

      b += 4;
      len += 4;
      n -= 4;
    }

  while (n >= 1)
    {
      vlib_buffer_copy_template (b[0], bt);
      bytes += b[0]->current_length = len[0];
      b += 1;
      len += 1;
      n -= 1;
    }

  xsk_ring_cons__release (&rxq->rx, n_rx);
  return bytes;
}

static_always_inline uword
af_xdp_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
			    vlib_frame_t * frame, af_xdp_device_t * ad,
			    u16 qid, const int copy)
{
  vnet_main_t *vnm = vnet_get_main ();
  af_xdp_rxq_t *rxq = vec_elt_at_index (ad->rxqs, qid);
  vlib_buffer_t bt;
  u32 next_index, *to_next, n_left_to_next;
  u32 n_rx_packets, n_rx_bytes;
  u32 idx;

  n_rx_packets = xsk_ring_cons__peek (&rxq->rx, VLIB_FRAME_SIZE, &idx);

  if (PREDICT_FALSE (0 == n_rx_packets))
    goto refill;

  vlib_buffer_copy_template (&bt, ad->buffer_template);
  next_index = ad->per_interface_next_index;
  if (PREDICT_FALSE (vnet_device_input_have_features (ad->sw_if_index)))
    vnet_feature_start_device_input_x1 (ad->sw_if_index, &next_index, &bt);

  vlib_get_new_next_frame (vm, node, next_index, to_next, n_left_to_next);

  n_rx_bytes =
    af_xdp_device_input_bufs (vm, ad, rxq, to_next, n_rx_packets, &bt, idx,
			      copy);
  af_xdp_device_input_ethernet (vm, node, next_index, ad->sw_if_index,
				ad->hw_if_index);

  vlib_put_next_frame (vm, node, next_index, n_left_to_next - n_rx_packets);

  af_xdp_device_input_trace (vm, node, n_rx_packets, to_next, next_index,
			     ad->hw_if_index);

  vlib_increment_combined_counter
    (vnm->interface_main.combined_sw_if_counters +
     VNET_INTERFACE_COUNTER_RX, vm->thread_index,
     ad->hw_if_index, n_rx_packets, n_rx_bytes);

refill:
  af_xdp_device_input_refill (vm, node, ad, rxq, copy);

  return n_rx_packets;
}

VLIB_NODE_FN (af_xdp_input_node) (vlib_main_t * vm,
				  vlib_node_runtime_t * node,
				  vlib_frame_t * frame)
{
  u32 n_rx = 0;
  af_xdp_main_t *am = &af_xdp_main;
  vnet_device_input_runtime_t *rt = (void *) node->runtime_data;
  vnet_device_and_queue_t *dq;

  foreach_device_and_queue (dq, rt->devices_and_queues)
  {
    af_xdp_device_t *ad;
    ad = vec_elt_at_index (am->devices, dq->dev_instance);
    if ((ad->flags & AF_XDP_DEVICE_F_ADMIN_UP) == 0)
      continue;
    if (PREDICT_TRUE (ad->flags & AF_XDP_DEVICE_F_ZEROCOPY))
      n_rx += af_xdp_device_input_inline (vm, node, frame, ad, dq->queue_id,
					  /* copy */ 0);
    else
      n_rx += af_xdp_device_input_inline (vm, node, frame, ad, dq->queue_id,
					  /* copy */ 1);
  }
  return n_rx;
}

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (af_xdp_input_node) = {
  .name = "af_xdp-input",
  .sibling_of = "device-input",
  .format_trace = format_af_xdp_input_trace,
  .type = VLIB_NODE_TYPE_INPUT,
  .state = VLIB_NODE_STATE_DISABLED,
  .n_errors = AF_XDP_INPUT_N_ERROR,
  .error_strings = af_xdp_input_error_strings,
  .flags = VLIB_NODE_FLAG_TRACE_SUPPORTED,
};
/* *INDENT-ON* */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */