summaryrefslogtreecommitdiffstats
path: root/src/vlib/buffer.h
AgeCommit message (Expand)AuthorFilesLines
2018-09-03Deprecate old buffer replication schemeDamjan Marion1-12/+2
2018-04-27A bit of buffer metadata reshuffling to accommodate flow_idDamjan Marion1-12/+5
2018-04-09plugins: unload plugin if early init failsDamjan Marion1-0/+4
2018-03-14vlib: internal buffer manager reworkDamjan Marion1-30/+27
2018-03-05vlib: vfio code reworkDamjan Marion1-0/+1
2018-02-07Refactor vlib_buffer flagsDamjan Marion1-23/+44
2017-11-27vlib: make vlib_buffer_alloc inline functionDamjan Marion1-8/+4
2017-11-13dpdk: introduce AVX512 variants of node functionsDamjan Marion1-9/+15
2017-10-25vlib: add support for multiple buffer poolsDamjan Marion1-4/+14
2017-10-19VPP-1024: rewrite buffer trajectory tracerDave Barach1-2/+7
2017-09-07vlib physmem reworkDamjan Marion1-1/+2
2017-08-30Thread safe internal buffer manager, take twoDamjan Marion1-0/+6
2017-07-27Thread safe internal buffer managerDamjan Marion1-0/+2
2017-07-17Fix unlinking of /dev/shm files.Dave Wallace1-9/+9
2017-07-14Introduce l{2,3,4}_hdr_offset fields in the buffer metadataDamjan Marion1-18/+22
2017-07-10vlib: store buffer memory information in the buffer_mainDamjan Marion1-6/+26
2017-05-03Fix vnet unit testsFilip Tehlar1-0/+12
2017-03-01VPP-598: tcp stack initial commitDave Barach1-0/+68
2017-03-01Fix buffer template copyDave Barach1-0/+2
2017-02-28vlib: add buffer cloning supportDamjan Marion1-1/+3
2017-02-27vlib: add VLIB_BUFFER_EXT_HDR_VALID flagDamjan Marion1-0/+2
2017-02-06vlib: remove algned/unaligned buffers schemeDamjan Marion1-11/+2
2017-01-14vlib: add buffer and thread callbacksDamjan Marion1-16/+29
2016-12-28Reorganize source tree to use single autotools instanceDamjan Marion1-0/+417
464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511
/*
 *------------------------------------------------------------------
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <net/if.h>
#include <linux/if_tun.h>
#include <sys/ioctl.h>
#include <sys/eventfd.h>

#include <vlib/vlib.h>
#include <vlib/unix/unix.h>
#include <vnet/ethernet/ethernet.h>
#include <vnet/devices/devices.h>
#include <vnet/feature/feature.h>
#include <vnet/gso/gro_func.h>
#include <vnet/ip/ip4_packet.h>
#include <vnet/ip/ip6_packet.h>
#include <vnet/udp/udp_packet.h>
#include <vnet/devices/virtio/virtio.h>


#define foreach_virtio_input_error \
  _(BUFFER_ALLOC, "buffer alloc error") \
  _(UNKNOWN, "unknown")

typedef enum
{
#define _(f,s) VIRTIO_INPUT_ERROR_##f,
  foreach_virtio_input_error
#undef _
    VIRTIO_INPUT_N_ERROR,
} virtio_input_error_t;

static char *virtio_input_error_strings[] = {
#define _(n,s) s,
  foreach_virtio_input_error
#undef _
};

typedef struct
{
  u32 next_index;
  u32 hw_if_index;
  u16 ring;
  u16 len;
  virtio_net_hdr_v1_t hdr;
} virtio_input_trace_t;

static u8 *
format_virtio_input_trace (u8 * s, va_list * args)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  virtio_input_trace_t *t = va_arg (*args, virtio_input_trace_t *);
  u32 indent = format_get_indent (s);

  s = format (s, "virtio: hw_if_index %d next-index %d vring %u len %u",
	      t->hw_if_index, t->next_index, t->ring, t->len);
  s = format (s, "\n%Uhdr: flags 0x%02x gso_type 0x%02x hdr_len %u "
	      "gso_size %u csum_start %u csum_offset %u num_buffers %u",
	      format_white_space, indent + 2,
	      t->hdr.flags, t->hdr.gso_type, t->hdr.hdr_len, t->hdr.gso_size,
	      t->hdr.csum_start, t->hdr.csum_offset, t->hdr.num_buffers);
  return s;
}

static_always_inline void
virtio_refill_vring (vlib_main_t * vm, virtio_if_t * vif,
		     virtio_if_type_t type, virtio_vring_t * vring,
		     const int hdr_sz, u32 node_index)
{
  u16 used, next, avail, n_slots, n_refill;
  u16 sz = vring->size;
  u16 mask = sz - 1;

more:
  used = vring->desc_in_use;

  if (sz - used < sz / 8)
    return;

  /* deliver free buffers in chunks of 64 */
  n_refill = clib_min (sz - used, 64);

  next = vring->desc_next;
  avail = vring->avail->idx;
  n_slots =
    vlib_buffer_alloc_to_ring_from_pool (vm, vring->buffers, next,
					 vring->size, n_refill,
					 vring->buffer_pool_index);

  if (PREDICT_FALSE (n_slots != n_refill))
    {
      vlib_error_count (vm, node_index,
			VIRTIO_INPUT_ERROR_BUFFER_ALLOC, n_refill - n_slots);
      if (n_slots == 0)
	return;
    }

  while (n_slots)
    {
      vring_desc_t *d = &vring->desc[next];;
      vlib_buffer_t *b = vlib_get_buffer (vm, vring->buffers[next]);
      /*
       * current_data may not be initialized with 0 and may contain
       * previous offset. Here we want to make sure, it should be 0
       * initialized.
       */
      b->current_data = -hdr_sz;
      memset (vlib_buffer_get_current (b), 0, hdr_sz);
      d->addr =
	((type == VIRTIO_IF_TYPE_PCI) ? vlib_buffer_get_current_pa (vm,
								    b) :
	 pointer_to_uword (vlib_buffer_get_current (b)));
      d->len = vlib_buffer_get_default_data_size (vm) + hdr_sz;
      d->flags = VRING_DESC_F_WRITE;
      vring->avail->ring[avail & mask] = next;
      avail++;
      next = (next + 1) & mask;
      n_slots--;
      used++;
    }
  CLIB_MEMORY_STORE_BARRIER ();
  vring->avail->idx = avail;
  vring->desc_next = next;
  vring->desc_in_use = used;

  if ((vring->used->flags & VRING_USED_F_NO_NOTIFY) == 0)
    {
      virtio_kick (vm, vring, vif);
    }
  goto more;
}

static_always_inline void
virtio_needs_csum (vlib_buffer_t * b0, virtio_net_hdr_v1_t * hdr,
		   u8 * l4_proto, u8 * l4_hdr_sz, virtio_if_type_t type)
{
  if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
    {
      u16 ethertype = 0, l2hdr_sz = 0;

      if (type == VIRTIO_IF_TYPE_TUN)
	{
	  switch (b0->data[0] & 0xf0)
	    {
	    case 0x40:
	      ethertype = ETHERNET_TYPE_IP4;
	      break;
	    case 0x60:
	      ethertype = ETHERNET_TYPE_IP6;
	      break;
	    }
	}
      else
	{
	  ethernet_header_t *eh =
	    (ethernet_header_t *) vlib_buffer_get_current (b0);
	  ethertype = clib_net_to_host_u16 (eh->type);
	  l2hdr_sz = sizeof (ethernet_header_t);

	  if (ethernet_frame_is_tagged (ethertype))
	    {
	      ethernet_vlan_header_t *vlan =
		(ethernet_vlan_header_t *) (eh + 1);

	      ethertype = clib_net_to_host_u16 (vlan->type);
	      l2hdr_sz += sizeof (*vlan);
	      if (ethertype == ETHERNET_TYPE_VLAN)
		{
		  vlan++;
		  ethertype = clib_net_to_host_u16 (vlan->type);
		  l2hdr_sz += sizeof (*vlan);
		}
	    }
	}

      vnet_buffer (b0)->l2_hdr_offset = 0;
      vnet_buffer (b0)->l3_hdr_offset = l2hdr_sz;

      if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP4))
	{
	  ip4_header_t *ip4 =
	    (ip4_header_t *) (vlib_buffer_get_current (b0) + l2hdr_sz);
	  vnet_buffer (b0)->l4_hdr_offset = l2hdr_sz + ip4_header_bytes (ip4);
	  *l4_proto = ip4->protocol;
	  b0->flags |=
	    (VNET_BUFFER_F_IS_IP4 | VNET_BUFFER_F_OFFLOAD_IP_CKSUM);
	  b0->flags |=
	    (VNET_BUFFER_F_L2_HDR_OFFSET_VALID
	     | VNET_BUFFER_F_L3_HDR_OFFSET_VALID |
	     VNET_BUFFER_F_L4_HDR_OFFSET_VALID);
	}
      else if (PREDICT_TRUE (ethertype == ETHERNET_TYPE_IP6))
	{
	  ip6_header_t *ip6 =
	    (ip6_header_t *) (vlib_buffer_get_current (b0) + l2hdr_sz);
	  vnet_buffer (b0)->l4_hdr_offset = l2hdr_sz + sizeof (ip6_header_t);
	  /* FIXME IPv6 EH traversal */
	  *l4_proto = ip6->protocol;
	  b0->flags |= (VNET_BUFFER_F_IS_IP6 |
			VNET_BUFFER_F_L2_HDR_OFFSET_VALID
			| VNET_BUFFER_F_L3_HDR_OFFSET_VALID |
			VNET_BUFFER_F_L4_HDR_OFFSET_VALID);
	}
      if (*l4_proto == IP_PROTOCOL_TCP)
	{
	  b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM;
	  tcp_header_t *tcp = (tcp_header_t *) (vlib_buffer_get_current (b0) +
						vnet_buffer
						(b0)->l4_hdr_offset);
	  *l4_hdr_sz = tcp_header_bytes (tcp);
	}
      else if (*l4_proto == IP_PROTOCOL_UDP)
	{
	  b0->flags |= VNET_BUFFER_F_OFFLOAD_UDP_CKSUM;
	  udp_header_t *udp = (udp_header_t *) (vlib_buffer_get_current (b0) +
						vnet_buffer
						(b0)->l4_hdr_offset);
	  *l4_hdr_sz = sizeof (*udp);
	}
    }
}

static_always_inline void
fill_gso_buffer_flags (vlib_buffer_t * b0, virtio_net_hdr_v1_t * hdr,
		       u8 l4_proto, u8 l4_hdr_sz)
{
  if (hdr->gso_type == VIRTIO_NET_HDR_GSO_TCPV4)
    {
      ASSERT (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM);
      vnet_buffer2 (b0)->gso_size = hdr->gso_size;
      vnet_buffer2 (b0)->gso_l4_hdr_sz = l4_hdr_sz;
      b0->flags |= VNET_BUFFER_F_GSO | VNET_BUFFER_F_IS_IP4;
    }
  if (hdr->gso_type == VIRTIO_NET_HDR_GSO_TCPV6)
    {
      ASSERT (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM);
      vnet_buffer2 (b0)->gso_size = hdr->gso_size;
      vnet_buffer2 (b0)->gso_l4_hdr_sz = l4_hdr_sz;
      b0->flags |= VNET_BUFFER_F_GSO | VNET_BUFFER_F_IS_IP6;
    }
}

static_always_inline uword
virtio_device_input_gso_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
				vlib_frame_t * frame, virtio_if_t * vif,
				u16 qid, virtio_if_type_t type,
				int gso_enabled, int checksum_offload_enabled)
{
  vnet_main_t *vnm = vnet_get_main ();
  u32 thread_index = vm->thread_index;
  uword n_trace = vlib_get_trace_count (vm, node);
  virtio_vring_t *vring = vec_elt_at_index (vif->rxq_vrings, qid);
  u16 txq_id = thread_index % vif->num_txqs;
  virtio_vring_t *txq_vring = vec_elt_at_index (vif->txq_vrings, txq_id);
  u32 next_index = VNET_DEVICE_INPUT_NEXT_ETHERNET_INPUT;
  const int hdr_sz = vif->virtio_net_hdr_sz;
  u32 *to_next = 0;
  u32 n_rx_packets = 0;
  u32 n_rx_bytes = 0;
  u16 mask = vring->size - 1;
  u16 last = vring->last_used_idx;
  u16 n_left = vring->used->idx - last;

  if (clib_spinlock_trylock_if_init (&txq_vring->lockp))
    {
      if (vif->packet_coalesce)
	vnet_gro_flow_table_schedule_node_on_dispatcher (vm,
							 txq_vring->flow_table);
      else if (vif->packet_buffering)
	virtio_vring_buffering_schedule_node_on_dispatcher (vm,
							    txq_vring->buffering);
      clib_spinlock_unlock_if_init (&txq_vring->lockp);
    }

  if ((vring->used->flags & VRING_USED_F_NO_NOTIFY) == 0 &&
      vring->last_kick_avail_idx != vring->avail->idx)
    virtio_kick (vm, vring, vif);

  if (n_left == 0)
    goto refill;

  if (type == VIRTIO_IF_TYPE_TUN)
    next_index = VNET_DEVICE_INPUT_NEXT_IP4_INPUT;

  while (n_left)
    {
      u32 n_left_to_next;
      u32 next0 = next_index;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      while (n_left && n_left_to_next)
	{
	  u8 l4_proto = 0, l4_hdr_sz = 0;
	  u16 num_buffers = 1;
	  vring_used_elem_t *e = &vring->used->ring[last & mask];
	  virtio_net_hdr_v1_t *hdr;
	  u16 slot = e->id;
	  u16 len = e->len - hdr_sz;
	  u32 bi0 = vring->buffers[slot];
	  vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
	  hdr = vlib_buffer_get_current (b0);
	  if (hdr_sz == sizeof (virtio_net_hdr_v1_t))
	    num_buffers = hdr->num_buffers;

	  b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
	  b0->current_data = 0;
	  b0->current_length = len;

	  if (checksum_offload_enabled)
	    virtio_needs_csum (b0, hdr, &l4_proto, &l4_hdr_sz, type);

	  if (gso_enabled)
	    fill_gso_buffer_flags (b0, hdr, l4_proto, l4_hdr_sz);

	  vnet_buffer (b0)->sw_if_index[VLIB_RX] = vif->sw_if_index;
	  vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;

	  /* if multisegment packet */
	  if (PREDICT_FALSE (num_buffers > 1))
	    {
	      vlib_buffer_t *pb, *cb;
	      pb = b0;
	      b0->total_length_not_including_first_buffer = 0;
	      while (num_buffers > 1)
		{
		  last++;
		  e = &vring->used->ring[last & mask];
		  u32 cbi = vring->buffers[e->id];
		  cb = vlib_get_buffer (vm, cbi);

		  /* current buffer */
		  cb->current_length = e->len;

		  /* previous buffer */
		  pb->next_buffer = cbi;
		  pb->flags |= VLIB_BUFFER_NEXT_PRESENT;

		  /* first buffer */
		  b0->total_length_not_including_first_buffer += e->len;

		  pb = cb;
		  vring->desc_in_use--;
		  num_buffers--;
		  n_left--;
		}
	      len += b0->total_length_not_including_first_buffer;
	    }

	  if (type == VIRTIO_IF_TYPE_TUN)
	    {
	      switch (b0->data[0] & 0xf0)
		{
		case 0x40:
		  next0 = VNET_DEVICE_INPUT_NEXT_IP4_INPUT;
		  break;
		case 0x60:
		  next0 = VNET_DEVICE_INPUT_NEXT_IP6_INPUT;
		  break;
		default:
		  next0 = VNET_DEVICE_INPUT_NEXT_DROP;
		  break;
		}
	    }

	  if (PREDICT_FALSE (vif->per_interface_next_index != ~0))
	    next0 = vif->per_interface_next_index;

	  if (type != VIRTIO_IF_TYPE_TUN)
	    {
	      /* only for l2, redirect if feature path enabled */
	      vnet_feature_start_device_input_x1 (vif->sw_if_index, &next0,
						  b0);
	    }

	  /* trace */
	  VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0);

	  if (PREDICT_FALSE (n_trace > 0))
	    {
	      virtio_input_trace_t *tr;
	      vlib_trace_buffer (vm, node, next0, b0,
				 /* follow_chain */ 1);
	      vlib_set_trace_count (vm, node, --n_trace);
	      tr = vlib_add_trace (vm, node, b0, sizeof (*tr));
	      tr->next_index = next0;
	      tr->hw_if_index = vif->hw_if_index;
	      tr->len = len;
	      clib_memcpy_fast (&tr->hdr, hdr, hdr_sz);
	    }

	  /* enqueue buffer */
	  to_next[0] = bi0;
	  vring->desc_in_use--;
	  to_next += 1;
	  n_left_to_next--;
	  n_left--;
	  last++;

	  /* enqueue */
	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
					   n_left_to_next, bi0, next0);

	  /* next packet */
	  n_rx_packets++;
	  n_rx_bytes += len;
	}
      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }
  vring->last_used_idx = last;

  vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters
				   + VNET_INTERFACE_COUNTER_RX, thread_index,
				   vif->sw_if_index, n_rx_packets,
				   n_rx_bytes);

refill:
  virtio_refill_vring (vm, vif, type, vring, hdr_sz, node->node_index);

  return n_rx_packets;
}

static_always_inline uword
virtio_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
			    vlib_frame_t * frame, virtio_if_t * vif, u16 qid,
			    virtio_if_type_t type)
{

  if (vif->gso_enabled)
    return virtio_device_input_gso_inline (vm, node, frame, vif,
					   qid, type, 1, 1);
  else if (vif->csum_offload_enabled)
    return virtio_device_input_gso_inline (vm, node, frame, vif,
					   qid, type, 0, 1);
  else
    return virtio_device_input_gso_inline (vm, node, frame, vif,
					   qid, type, 0, 0);
  return 0;
}

VLIB_NODE_FN (virtio_input_node) (vlib_main_t * vm,
				  vlib_node_runtime_t * node,
				  vlib_frame_t * frame)
{
  u32 n_rx = 0;
  virtio_main_t *nm = &virtio_main;
  vnet_device_input_runtime_t *rt = (void *) node->runtime_data;
  vnet_device_and_queue_t *dq;

  foreach_device_and_queue (dq, rt->devices_and_queues)
  {
    virtio_if_t *vif;
    vif = vec_elt_at_index (nm->interfaces, dq->dev_instance);
    if (vif->flags & VIRTIO_IF_FLAG_ADMIN_UP)
      {
	if (vif->type == VIRTIO_IF_TYPE_TAP)
	  n_rx += virtio_device_input_inline (vm, node, frame, vif,
					      dq->queue_id,
					      VIRTIO_IF_TYPE_TAP);
	else if (vif->type == VIRTIO_IF_TYPE_PCI)
	  n_rx += virtio_device_input_inline (vm, node, frame, vif,
					      dq->queue_id,
					      VIRTIO_IF_TYPE_PCI);
	else if (vif->type == VIRTIO_IF_TYPE_TUN)
	  n_rx += virtio_device_input_inline (vm, node, frame, vif,
					      dq->queue_id,
					      VIRTIO_IF_TYPE_TUN);
      }
  }

  return n_rx;
}

/* *INDENT-OFF* */
VLIB_REGISTER_NODE (virtio_input_node) = {
  .name = "virtio-input",
  .sibling_of = "device-input",
  .format_trace = format_virtio_input_trace,
  .flags = VLIB_NODE_FLAG_TRACE_SUPPORTED,
  .type = VLIB_NODE_TYPE_INPUT,
  .state = VLIB_NODE_STATE_INTERRUPT,
  .n_errors = VIRTIO_INPUT_N_ERROR,
  .error_strings = virtio_input_error_strings,
};
/* *INDENT-ON* */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */