summaryrefslogtreecommitdiffstats
AgeCommit message (Expand)AuthorFilesLines
2020-04-29ip: use thread local vm instead of thread main for vlib_time_now callsTom Seidenberg2-2/+2
2020-04-29misc: switch to clang-9Damjan Marion4-5/+7
2020-04-29hsa: cleanup ifdefs in sock testsFlorin Coras4-532/+168
2020-04-29devices: vhost: simplify string copies for GCC-10Benoît Ganne1-10/+5
2020-04-29tap: fix missing "num-rx-queues" from cli helpMohsin Kazmi1-6/+6
2020-04-29vcl: add separate fcntl64 ldp handlerFlorin Coras1-5/+12
2020-04-29interface: fix interface_types.api enumsPaul Vinciguerra1-9/+8
2020-04-29flow: explicit rss function enum castFlorin Coras1-1/+1
2020-04-28vlib: add ASSERT to vlib_time_now(...)Dave Barach3-2/+10
2020-04-28svm: fix fifo alignemnt in batch preallocFlorin Coras1-3/+9
2020-04-28vlib: use flexible array in vlib_buffer for GCC-10Benoît Ganne1-1/+1
2020-04-28gomemif: introduce gomemifJakub Grajciar16-0/+2751
2020-04-28vppinfra: type prove vec_new and vec_resizeAndreas Schultz1-6/+10
2020-04-28flow: add RSS supportChenmin Sun6-34/+190
2020-04-28tap: use one tap fd per rx queueAloys Augustin3-35/+64
2020-04-28ipsec: fix buffer allocFilip Tehlar1-1/+7
2020-04-28nat: remove unused codeKlement Sekera1-9/+0
2020-04-28vppinfra: improve test coverageDave Barach6-2/+128
2020-04-28docs: add missing spelling dictionaryPaul Vinciguerra1-0/+774
2020-04-28tests: fix update_path_flags for multicast in vpp_ip_routePaul Vinciguerra1-2/+2
2020-04-28tests: implement ipaddress convenience methodsPaul Vinciguerra8-94/+82
2020-04-28lisp: API cleanupJakub Grajciar17-1471/+1006
2020-04-28vlib: startup multi-arch variant configurationRay Kinsella9-1/+376
2020-04-28stats: add apis to delete simple/combined countersOle Troan4-0/+75
2020-04-28tests: move defaults from defaultmapping to .api filesPaul Vinciguerra12-50/+35
2020-04-28tcp: remove sack reneging verbose loggingFlorin Coras1-2/+0
2020-04-28svm: null instead of panic if fifo hdr alloc failsFlorin Coras1-1/+4
2020-04-28tls: fix wrong usage of session close function issueSimon Zhang1-1/+1
2020-04-27virtio: support virtio 1.1 packed ring in vhostSteven Luong11-163/+1625
2020-04-27l2: merge two clib_memcpy_fast into oneZhiyong Yang1-11/+16
2020-04-27vlib: pcap rx/tx/dispatch trace testDave Barach1-0/+89
2020-04-27vlib: deprecate i2c and cjDave Barach11-15/+0
2020-04-27ioam: do not reuse existing vnet symbolBenoît Ganne1-2/+2
2020-04-27vppinfra: selectively disable false-positive GCC-10 warningsBenoît Ganne3-0/+24
2020-04-27ip: reassembly: fix one possible use-after-freeGao Feng2-10/+11
2020-04-27vppinfra: selectively disable false-positive GCC-10 warningsBenoît Ganne1-0/+9
2020-04-27build: add vppinfra/warnings.h to exported headers listBenoît Ganne1-0/+1
2020-04-27rdma: tx: interleave prefetchesBenoît Ganne1-24/+13
2020-04-27devices: allow link state down with netlinkMatthew Smith1-1/+1
2020-04-27nat: improve perf - long read after short writeKlement Sekera7-363/+424
2020-04-26dhcp: fix dhcp proxy behavior for qinq and dot1q subinterfacesStanislav Zaikin2-14/+57
2020-04-25tls: improve cli state reportingFlorin Coras1-3/+12
2020-04-25session vcl: propagate transport cleanup notificationsFlorin Coras5-6/+26
2020-04-25vcl: generate select events on read/write errorsFlorin Coras2-8/+9
2020-04-24vppinfra: finish deprecating qsort.cDave Barach2-3/+7
2020-04-24acl: ACL creation CLI parsing fixNeale Ranns1-2/+4
2020-04-24nat: ignore user hash in ED NATKlement Sekera8-362/+273
2020-04-24nat: make usage of vnet_buffer2 transparentKlement Sekera6-21/+19
2020-04-24tests: test vnet_calc_checksums_inline(...)Dave Barach1-0/+74
2020-04-24nat: ED: reduce number of hash tables usedKlement Sekera5-212/+210
906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020
/*
 *------------------------------------------------------------------
 * Copyright (c) 2017 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <net/if.h>
#include <linux/if_tun.h>
#include <sys/ioctl.h>
#include <linux/virtio_net.h>
#include <linux/vhost.h>
#include <sys/eventfd.h>
#include <net/if_arp.h>
#include <sched.h>
#include <limits.h>

#include <linux/netlink.h>
#include <linux/rtnetlink.h>

#include <vlib/vlib.h>
#include <vlib/physmem.h>
#include <vlib/unix/unix.h>
#include <vnet/ethernet/ethernet.h>
#include <vnet/ip/ip4_packet.h>
#include <vnet/ip/ip6_packet.h>
#include <vnet/devices/netlink.h>
#include <vnet/devices/virtio/virtio.h>
#include <vnet/devices/tap/tap.h>

tap_main_t tap_main;

#define tap_log_err(dev, f, ...)                        \
  vlib_log (VLIB_LOG_LEVEL_ERR, tap_main.log_default, "tap%u: " f, dev->dev_instance, ## __VA_ARGS__)
#define tap_log_dbg(dev, f, ...)                        \
  vlib_log (VLIB_LOG_LEVEL_DEBUG, tap_main.log_default, "tap%u: " f, dev->dev_instance, ## __VA_ARGS__)

#define _IOCTL(fd,a,...) \
  if (ioctl (fd, a, __VA_ARGS__) < 0) \
    { \
      err = clib_error_return_unix (0, "ioctl(" #a ")"); \
      tap_log_err (vif, "%U", format_clib_error, err); \
      goto error; \
    }

  /* *INDENT-OFF* */
VNET_HW_INTERFACE_CLASS (tun_device_hw_interface_class, static) =
{
  .name = "tun-device",
  .flags = VNET_HW_INTERFACE_CLASS_FLAG_P2P,
};
  /* *INDENT-ON* */

static u32
virtio_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi,
			u32 flags)
{
  /* nothing for now */
  //TODO On MTU change call vnet_netlink_set_if_mtu
  return 0;
}

static int
open_netns_fd (char *netns)
{
  u8 *s = 0;
  int fd;

  if (strncmp (netns, "pid:", 4) == 0)
    s = format (0, "/proc/%u/ns/net%c", atoi (netns + 4), 0);
  else if (netns[0] == '/')
    s = format (0, "%s%c", netns, 0);
  else
    s = format (0, "/var/run/netns/%s%c", netns, 0);

  fd = open ((char *) s, O_RDONLY);
  vec_free (s);
  return fd;
}

#define TAP_MAX_INSTANCE 1024

static void
tap_free (vlib_main_t * vm, virtio_if_t * vif)
{
  virtio_main_t *mm = &virtio_main;
  tap_main_t *tm = &tap_main;
  clib_error_t *err = 0;
  int i;

  /* *INDENT-OFF* */
  vec_foreach_index (i, vif->vhost_fds) if (vif->vhost_fds[i] != -1)
    close (vif->vhost_fds[i]);
  vec_foreach_index (i, vif->rxq_vrings)
    virtio_vring_free_rx (vm, vif, RX_QUEUE (i));
  vec_foreach_index (i, vif->txq_vrings)
    virtio_vring_free_tx (vm, vif, TX_QUEUE (i));
  /* *INDENT-ON* */

  _IOCTL (vif->tap_fds[0], TUNSETPERSIST, (void *) (uintptr_t) 0);
  tap_log_dbg (vif, "TUNSETPERSIST: unset");
error:
  vec_foreach_index (i, vif->tap_fds) close (vif->tap_fds[i]);

  vec_free (vif->vhost_fds);
  vec_free (vif->rxq_vrings);
  vec_free (vif->txq_vrings);
  vec_free (vif->host_if_name);
  vec_free (vif->net_ns);
  vec_free (vif->host_bridge);
  clib_error_free (vif->error);

  tm->tap_ids = clib_bitmap_set (tm->tap_ids, vif->id, 0);
  clib_memset (vif, 0, sizeof (*vif));
  pool_put (mm->interfaces, vif);
}

void
tap_create_if (vlib_main_t * vm, tap_create_if_args_t * args)
{
  vlib_thread_main_t *thm = vlib_get_thread_main ();
  vlib_physmem_main_t *vpm = &vm->physmem_main;
  vnet_main_t *vnm = vnet_get_main ();
  virtio_main_t *vim = &virtio_main;
  tap_main_t *tm = &tap_main;
  vnet_sw_interface_t *sw;
  vnet_hw_interface_t *hw;
  int i, num_vhost_queues;
  int old_netns_fd = -1;
  struct ifreq ifr = {.ifr_flags = IFF_NO_PI | IFF_VNET_HDR };
  struct ifreq get_ifr = {.ifr_flags = 0 };
  size_t hdrsz;
  struct vhost_memory *vhost_mem = 0;
  virtio_if_t *vif = 0;
  clib_error_t *err = 0;
  unsigned int tap_features;
  int tfd = -1, qfd = -1, vfd = -1, nfd = -1;
  char *host_if_name = 0;
  unsigned int offload = 0;
  int sndbuf = 0;

  if (args->id != ~0)
    {
      if (clib_bitmap_get (tm->tap_ids, args->id))
	{
	  args->rv = VNET_API_ERROR_INVALID_INTERFACE;
	  args->error = clib_error_return (0, "interface already exists");
	  return;
	}
    }
  else
    {
      args->id = clib_bitmap_first_clear (tm->tap_ids);
    }

  if (args->id > TAP_MAX_INSTANCE)
    {
      args->rv = VNET_API_ERROR_UNSPECIFIED;
      args->error = clib_error_return (0, "cannot find free interface id");
      return;
    }

  pool_get_zero (vim->interfaces, vif);

  if (args->tap_flags & TAP_FLAG_TUN)
    {
      vif->type = VIRTIO_IF_TYPE_TUN;
      ifr.ifr_flags |= IFF_TUN;

      /*
       * From kernel 4.20, xdp support has been added in tun_sendmsg.
       * If sndbuf == INT_MAX, vhost batches the packet and processes
       * them using xdp data path for tun driver. It assumes packets
       * are ethernet frames (It needs to be fixed).
       * To avoid xdp data path in tun driver, sndbuf value should
       * be < INT_MAX.
       */
      sndbuf = INT_MAX - 1;
    }
  else
    {
      vif->type = VIRTIO_IF_TYPE_TAP;
      ifr.ifr_flags |= IFF_TAP;
      sndbuf = INT_MAX;
    }

  vif->dev_instance = vif - vim->interfaces;
  vif->id = args->id;
  vif->num_txqs = thm->n_vlib_mains;
  vif->num_rxqs = clib_max (args->num_rx_queues, 1);

  if (args->tap_flags & TAP_FLAG_ATTACH)
    {
      if (args->host_if_name != NULL)
	{
	  host_if_name = (char *) args->host_if_name;
	  clib_memcpy (ifr.ifr_name, host_if_name,
		       clib_min (IFNAMSIZ, strlen (host_if_name)));
	}
      else
	{
	  args->rv = VNET_API_ERROR_NO_MATCHING_INTERFACE;
	  err = clib_error_return (0, "host_if_name is not provided");
	  goto error;
	}
      if (args->host_namespace)
	{
	  old_netns_fd = open ("/proc/self/ns/net", O_RDONLY);
	  if ((nfd = open_netns_fd ((char *) args->host_namespace)) == -1)
	    {
	      args->rv = VNET_API_ERROR_SYSCALL_ERROR_2;
	      args->error = clib_error_return_unix (0, "open_netns_fd '%s'",
						    args->host_namespace);
	      goto error;
	    }
	  if (setns (nfd, CLONE_NEWNET) == -1)
	    {
	      args->rv = VNET_API_ERROR_SYSCALL_ERROR_3;
	      args->error = clib_error_return_unix (0, "setns '%s'",
						    args->host_namespace);
	      goto error;
	    }
	}
    }

  if ((tfd = open ("/dev/net/tun", O_RDWR | O_NONBLOCK)) < 0)
    {
      args->rv = VNET_API_ERROR_SYSCALL_ERROR_2;
      args->error = clib_error_return_unix (0, "open '/dev/net/tun'");
      goto error;
    }
  vec_add1 (vif->tap_fds, tfd);
  tap_log_dbg (vif, "open tap fd %d", tfd);

  _IOCTL (tfd, TUNGETFEATURES, &tap_features);
  tap_log_dbg (vif, "TUNGETFEATURES: features 0x%lx", tap_features);
  if ((tap_features & IFF_VNET_HDR) == 0)
    {
      args->rv = VNET_API_ERROR_SYSCALL_ERROR_2;
      args->error = clib_error_return (0, "vhost-net backend not available");
      goto error;
    }

  if ((tap_features & IFF_MULTI_QUEUE) == 0)
    {
      if (vif->num_rxqs > 1)
	{
	  args->rv = VNET_API_ERROR_SYSCALL_ERROR_2;
	  args->error = clib_error_return (0, "multiqueue not supported");
	  goto error;
	}
      vif->num_rxqs = vif->num_txqs = 1;
    }
  else
    ifr.ifr_flags |= IFF_MULTI_QUEUE;

  hdrsz = sizeof (struct virtio_net_hdr_v1);
  if (args->tap_flags & TAP_FLAG_GSO)
    {
      offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6;
      vif->gso_enabled = 1;
    }
  else if (args->tap_flags & TAP_FLAG_CSUM_OFFLOAD)
    {
      offload = TUN_F_CSUM;
      vif->csum_offload_enabled = 1;
    }

  _IOCTL (tfd, TUNSETIFF, (void *) &ifr);
  tap_log_dbg (vif, "TUNSETIFF fd %d name %s flags 0x%x", tfd,
	       ifr.ifr_ifrn.ifrn_name, ifr.ifr_flags);

  vif->ifindex = if_nametoindex (ifr.ifr_ifrn.ifrn_name);
  tap_log_dbg (vif, "ifindex %d", vif->ifindex);

  if (!args->host_if_name)
    host_if_name = ifr.ifr_ifrn.ifrn_name;
  else
    host_if_name = (char *) args->host_if_name;

  /*
   * unset the persistence when attaching to existing
   * interface
   */
  if (args->tap_flags & TAP_FLAG_ATTACH)
    {
      _IOCTL (tfd, TUNSETPERSIST, (void *) (uintptr_t) 0);
      tap_log_dbg (vif, "TUNSETPERSIST: unset");
    }

  /* set the persistence */
  if (args->tap_flags & TAP_FLAG_PERSIST)
    {
      _IOCTL (tfd, TUNSETPERSIST, (void *) (uintptr_t) 1);
      tap_log_dbg (vif, "TUNSETPERSIST: set");

      /* verify persistence is set, read the flags */
      _IOCTL (tfd, TUNGETIFF, (void *) &get_ifr);
      tap_log_dbg (vif, "TUNGETIFF: flags 0x%lx", get_ifr.ifr_flags);
      if ((get_ifr.ifr_flags & IFF_PERSIST) == 0)
	{
	  args->rv = VNET_API_ERROR_SYSCALL_ERROR_2;
	  args->error = clib_error_return (0, "persistence not supported");
	  goto error;
	}
    }

  /* create additional queues on the linux side.
   * we create as many linux queue pairs as we have rx queues
   */
  for (i = 1; i < vif->num_rxqs; i++)
    {
      if ((qfd = open ("/dev/net/tun", O_RDWR | O_NONBLOCK)) < 0)
	{
	  args->rv = VNET_API_ERROR_SYSCALL_ERROR_2;
	  args->error = clib_error_return_unix (0, "open '/dev/net/tun'");
	  goto error;
	}
      _IOCTL (qfd, TUNSETIFF, (void *) &ifr);
      tap_log_dbg (vif, "TUNSETIFF fd %d name %s flags 0x%x", qfd,
		   ifr.ifr_ifrn.ifrn_name, ifr.ifr_flags);
      vec_add1 (vif->tap_fds, qfd);
    }

  for (i = 0; i < vif->num_rxqs; i++)
    {
      tap_log_dbg (vif, "TUNSETVNETHDRSZ: fd %d vnet_hdr_sz %u",
		   vif->tap_fds[i], hdrsz);
      _IOCTL (vif->tap_fds[i], TUNSETVNETHDRSZ, &hdrsz);

      tap_log_dbg (vif, "TUNSETSNDBUF: fd %d sndbuf %d", vif->tap_fds[i],
		   sndbuf);
      _IOCTL (vif->tap_fds[i], TUNSETSNDBUF, &sndbuf);

      tap_log_dbg (vif, "TUNSETOFFLOAD: fd %d offload 0x%lx", vif->tap_fds[i],
		   offload);
      _IOCTL (vif->tap_fds[i], TUNSETOFFLOAD, offload);

      if (fcntl (vif->tap_fds[i], F_SETFL, O_NONBLOCK) < 0)
	{
	  err = clib_error_return_unix (0, "fcntl(tfd, F_SETFL, O_NONBLOCK)");
	  tap_log_err (vif, "set nonblocking: %U", format_clib_error, err);
	  goto error;
	}
    }

  /* open as many vhost-net fds as required and set ownership */
  num_vhost_queues = clib_max (vif->num_rxqs, vif->num_txqs);
  for (i = 0; i < num_vhost_queues; i++)
    {
      if ((vfd = open ("/dev/vhost-net", O_RDWR | O_NONBLOCK)) < 0)
	{
	  args->rv = VNET_API_ERROR_SYSCALL_ERROR_1;
	  args->error = clib_error_return_unix (0, "open '/dev/vhost-net'");
	  goto error;
	}
      vec_add1 (vif->vhost_fds, vfd);
      virtio_log_debug (vif, "open vhost-net fd %d qpair %u", vfd, i);
      _IOCTL (vfd, VHOST_SET_OWNER, 0);
      virtio_log_debug (vif, "VHOST_SET_OWNER: fd %u", vfd);
    }

  _IOCTL (vif->vhost_fds[0], VHOST_GET_FEATURES, &vif->remote_features);
  virtio_log_debug (vif, "VHOST_GET_FEATURES: features 0x%lx",
		    vif->remote_features);

  if ((vif->remote_features & VIRTIO_FEATURE (VIRTIO_NET_F_MRG_RXBUF)) == 0)
    {
      args->rv = VNET_API_ERROR_UNSUPPORTED;
      args->error = clib_error_return (0, "vhost-net backend doesn't support "
				       "VIRTIO_NET_F_MRG_RXBUF feature");
      goto error;
    }

  if ((vif->remote_features & VIRTIO_FEATURE (VIRTIO_RING_F_INDIRECT_DESC)) ==
      0)
    {
      args->rv = VNET_API_ERROR_UNSUPPORTED;
      args->error = clib_error_return (0, "vhost-net backend doesn't support "
				       "VIRTIO_RING_F_INDIRECT_DESC feature");
      goto error;
    }

  if ((vif->remote_features & VIRTIO_FEATURE (VIRTIO_F_VERSION_1)) == 0)
    {
      args->rv = VNET_API_ERROR_UNSUPPORTED;
      args->error = clib_error_return (0, "vhost-net backend doesn't support "
				       "VIRTIO_F_VERSION_1 features");
      goto error;
    }

  vif->features |= VIRTIO_FEATURE (VIRTIO_NET_F_MRG_RXBUF);
  vif->features |= VIRTIO_FEATURE (VIRTIO_F_VERSION_1);
  vif->features |= VIRTIO_FEATURE (VIRTIO_RING_F_INDIRECT_DESC);

  virtio_set_net_hdr_size (vif);

  if (!(args->tap_flags & TAP_FLAG_ATTACH))
    {
      /* if namespace is specified, all further netlink messages should be executed
         after we change our net namespace */
      if (args->host_namespace)
	{
	  old_netns_fd = open ("/proc/self/ns/net", O_RDONLY);
	  if ((nfd = open_netns_fd ((char *) args->host_namespace)) == -1)
	    {
	      args->rv = VNET_API_ERROR_SYSCALL_ERROR_2;
	      args->error = clib_error_return_unix (0, "open_netns_fd '%s'",
						    args->host_namespace);
	      goto error;
	    }
	  args->error = vnet_netlink_set_link_netns (vif->ifindex, nfd,
						     host_if_name);
	  if (args->error)
	    {
	      args->rv = VNET_API_ERROR_NETLINK_ERROR;
	      goto error;
	    }
	  if (setns (nfd, CLONE_NEWNET) == -1)
	    {
	      args->rv = VNET_API_ERROR_SYSCALL_ERROR_3;
	      args->error = clib_error_return_unix (0, "setns '%s'",
						    args->host_namespace);
	      goto error;
	    }
	  if ((vif->ifindex = if_nametoindex (host_if_name)) == 0)
	    {
	      args->rv = VNET_API_ERROR_SYSCALL_ERROR_3;
	      args->error = clib_error_return_unix (0, "if_nametoindex '%s'",
						    host_if_name);
	      goto error;
	    }
	}
      else if (host_if_name)
	{
	  args->error =
	    vnet_netlink_set_link_name (vif->ifindex, host_if_name);
	  if (args->error)
	    {
	      args->rv = VNET_API_ERROR_NETLINK_ERROR;
	      goto error;
	    }
	}
    }

  if (vif->type == VIRTIO_IF_TYPE_TAP)
    {
      if (ethernet_mac_address_is_zero (args->host_mac_addr.bytes))
	ethernet_mac_address_generate (args->host_mac_addr.bytes);
      args->error = vnet_netlink_set_link_addr (vif->ifindex,
						args->host_mac_addr.bytes);
      if (args->error)
	{
	  args->rv = VNET_API_ERROR_NETLINK_ERROR;
	  goto error;
	}

      if (args->host_bridge)
	{
	  args->error = vnet_netlink_set_link_master (vif->ifindex,
						      (char *)
						      args->host_bridge);
	  if (args->error)
	    {
	      args->rv = VNET_API_ERROR_NETLINK_ERROR;
	      goto error;
	    }
	}
    }

  if (args->host_ip4_prefix_len)
    {
      args->error = vnet_netlink_add_ip4_addr (vif->ifindex,
					       &args->host_ip4_addr,
					       args->host_ip4_prefix_len);
      if (args->error)
	{
	  args->rv = VNET_API_ERROR_NETLINK_ERROR;
	  goto error;
	}
    }

  if (args->host_ip6_prefix_len)
    {
      args->error = vnet_netlink_add_ip6_addr (vif->ifindex,
					       &args->host_ip6_addr,
					       args->host_ip6_prefix_len);
      if (args->error)
	{
	  args->rv = VNET_API_ERROR_NETLINK_ERROR;
	  goto error;
	}
    }

  args->error = vnet_netlink_set_link_state (vif->ifindex, 1 /* UP */ );
  if (args->error)
    {
      args->rv = VNET_API_ERROR_NETLINK_ERROR;
      goto error;
    }

  if (args->host_ip4_gw_set)
    {
      args->error = vnet_netlink_add_ip4_route (0, 0, &args->host_ip4_gw);
      if (args->error)
	{
	  args->rv = VNET_API_ERROR_NETLINK_ERROR;
	  goto error;
	}
    }

  if (args->host_ip6_gw_set)
    {
      args->error = vnet_netlink_add_ip6_route (0, 0, &args->host_ip6_gw);
      if (args->error)
	{
	  args->rv = VNET_API_ERROR_NETLINK_ERROR;
	  goto error;
	}
    }

  if (args->host_mtu_set)
    {
      args->error =
	vnet_netlink_set_link_mtu (vif->ifindex, args->host_mtu_size);
      if (args->error)
	{
	  args->rv = VNET_API_ERROR_NETLINK_ERROR;
	  goto error;
	}
    }
  else if (tm->host_mtu_size != 0)
    {
      args->error =
	vnet_netlink_set_link_mtu (vif->ifindex, tm->host_mtu_size);
      if (args->error)
	{
	  args->rv = VNET_API_ERROR_NETLINK_ERROR;
	  goto error;
	}
      args->host_mtu_set = 1;
      args->host_mtu_size = tm->host_mtu_size;
    }

  /* switch back to old net namespace */
  if (args->host_namespace)
    {
      if (setns (old_netns_fd, CLONE_NEWNET) == -1)
	{
	  args->rv = VNET_API_ERROR_SYSCALL_ERROR_2;
	  args->error = clib_error_return_unix (0, "setns '%s'",
						args->host_namespace);
	  goto error;
	}
    }

  for (i = 0; i < num_vhost_queues; i++)
    {
      if (i < vif->num_rxqs && (args->error =
				virtio_vring_init (vm, vif, RX_QUEUE (i),
						   args->rx_ring_sz)))
	{
	  args->rv = VNET_API_ERROR_INIT_FAILED;
	  goto error;
	}

      if (i < vif->num_txqs && (args->error =
				virtio_vring_init (vm, vif, TX_QUEUE (i),
						   args->tx_ring_sz)))
	{
	  args->rv = VNET_API_ERROR_INIT_FAILED;
	  goto error;
	}
    }

  /* setup features and memtable */
  i = sizeof (struct vhost_memory) + sizeof (struct vhost_memory_region);
  vhost_mem = clib_mem_alloc (i);
  clib_memset (vhost_mem, 0, i);
  vhost_mem->nregions = 1;
  vhost_mem->regions[0].memory_size = vpm->max_size;
  vhost_mem->regions[0].guest_phys_addr = vpm->base_addr;
  vhost_mem->regions[0].userspace_addr =
    vhost_mem->regions[0].guest_phys_addr;

  for (i = 0; i < vhost_mem->nregions; i++)
    virtio_log_debug (vif, "memtable region %u memory_size 0x%lx "
		      "guest_phys_addr 0x%lx userspace_addr 0x%lx", i,
		      vhost_mem->regions[0].memory_size,
		      vhost_mem->regions[0].guest_phys_addr,
		      vhost_mem->regions[0].userspace_addr);


  for (i = 0; i < num_vhost_queues; i++)
    {
      int fd = vif->vhost_fds[i];
      _IOCTL (fd, VHOST_SET_FEATURES, &vif->features);
      virtio_log_debug (vif, "VHOST_SET_FEATURES: fd %u features 0x%lx",
			fd, vif->features);
      _IOCTL (fd, VHOST_SET_MEM_TABLE, vhost_mem);
      virtio_log_debug (vif, "VHOST_SET_MEM_TABLE: fd %u", fd);
    }

  /* finish initializing queue pair */
  for (i = 0; i < num_vhost_queues * 2; i++)
    {
      struct vhost_vring_addr addr = { 0 };
      struct vhost_vring_state state = { 0 };
      struct vhost_vring_file file = { 0 };
      virtio_vring_t *vring;
      u16 qp = i >> 1;
      int fd = vif->vhost_fds[qp];

      if (i & 1)
	{
	  if (qp >= vif->num_txqs)
	    continue;
	  vring = vec_elt_at_index (vif->txq_vrings, qp);
	}
      else
	{
	  if (qp >= vif->num_rxqs)
	    continue;
	  vring = vec_elt_at_index (vif->rxq_vrings, qp);
	}

      addr.index = state.index = file.index = vring->queue_id & 1;
      state.num = vring->size;
      virtio_log_debug (vif, "VHOST_SET_VRING_NUM fd %d index %u num %u", fd,
			state.index, state.num);
      _IOCTL (fd, VHOST_SET_VRING_NUM, &state);

      addr.flags = 0;
      addr.desc_user_addr = pointer_to_uword (vring->desc);
      addr.avail_user_addr = pointer_to_uword (vring->avail);
      addr.used_user_addr = pointer_to_uword (vring->used);

      virtio_log_debug (vif, "VHOST_SET_VRING_ADDR fd %d index %u flags 0x%x "
			"desc_user_addr 0x%lx avail_user_addr 0x%lx "
			"used_user_addr 0x%lx", fd, addr.index,
			addr.flags, addr.desc_user_addr, addr.avail_user_addr,
			addr.used_user_addr);
      _IOCTL (fd, VHOST_SET_VRING_ADDR, &addr);

      file.fd = vring->call_fd;
      virtio_log_debug (vif, "VHOST_SET_VRING_CALL fd %d index %u call_fd %d",
			fd, file.index, file.fd);
      _IOCTL (fd, VHOST_SET_VRING_CALL, &file);

      file.fd = vring->kick_fd;
      virtio_log_debug (vif, "VHOST_SET_VRING_KICK fd %d index %u kick_fd %d",
			fd, file.index, file.fd);
      _IOCTL (fd, VHOST_SET_VRING_KICK, &file);

      file.fd = vif->tap_fds[qp % vif->num_rxqs];
      virtio_log_debug (vif, "VHOST_NET_SET_BACKEND fd %d index %u tap_fd %d",
			fd, file.index, file.fd);
      _IOCTL (fd, VHOST_NET_SET_BACKEND, &file);
    }

  if (vif->type == VIRTIO_IF_TYPE_TAP)
    {
      if (!args->mac_addr_set)
	ethernet_mac_address_generate (args->mac_addr.bytes);

      clib_memcpy (vif->mac_addr, args->mac_addr.bytes, 6);
      vif->host_bridge = format (0, "%s%c", args->host_bridge, 0);
    }
  vif->host_if_name = format (0, "%s%c", host_if_name, 0);
  vif->net_ns = format (0, "%s%c", args->host_namespace, 0);
  vif->host_mtu_size = args->host_mtu_size;
  vif->tap_flags = args->tap_flags;
  clib_memcpy (vif->host_mac_addr, args->host_mac_addr.bytes, 6);
  vif->host_ip4_prefix_len = args->host_ip4_prefix_len;
  vif->host_ip6_prefix_len = args->host_ip6_prefix_len;
  if (args->host_ip4_prefix_len)
    clib_memcpy (&vif->host_ip4_addr, &args->host_ip4_addr, 4);
  if (args->host_ip6_prefix_len)
    clib_memcpy (&vif->host_ip6_addr, &args->host_ip6_addr, 16);

  if (vif->type != VIRTIO_IF_TYPE_TUN)
    {
      args->error =
	ethernet_register_interface (vnm, virtio_device_class.index,
				     vif->dev_instance, vif->mac_addr,
				     &vif->hw_if_index,
				     virtio_eth_flag_change);
      if (args->error)
	{
	  args->rv = VNET_API_ERROR_INVALID_REGISTRATION;
	  goto error;
	}

    }
  else
    {
      vif->hw_if_index = vnet_register_interface
	(vnm, virtio_device_class.index,
	 vif->dev_instance /* device instance */ ,
	 tun_device_hw_interface_class.index, vif->dev_instance);

    }
  tm->tap_ids = clib_bitmap_set (tm->tap_ids, vif->id, 1);
  sw = vnet_get_hw_sw_interface (vnm, vif->hw_if_index);
  vif->sw_if_index = sw->sw_if_index;
  args->sw_if_index = vif->sw_if_index;
  args->rv = 0;
  hw = vnet_get_hw_interface (vnm, vif->hw_if_index);
  hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE;
  if (args->tap_flags & TAP_FLAG_GSO)
    {
      hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO |
	VNET_HW_INTERFACE_FLAG_SUPPORTS_TX_L4_CKSUM_OFFLOAD;
    }
  else if (args->tap_flags & TAP_FLAG_CSUM_OFFLOAD)
    {
      hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_TX_L4_CKSUM_OFFLOAD;
    }
  vnet_hw_interface_set_input_node (vnm, vif->hw_if_index,
				    virtio_input_node.index);

  for (i = 0; i < vif->num_rxqs; i++)
    {
      vnet_hw_interface_assign_rx_thread (vnm, vif->hw_if_index, i, ~0);
      vnet_hw_interface_set_rx_mode (vnm, vif->hw_if_index, i,
				     VNET_HW_INTERFACE_RX_MODE_DEFAULT);
      virtio_vring_set_numa_node (vm, vif, RX_QUEUE (i));
    }

  vif->per_interface_next_index = ~0;
  vif->flags |= VIRTIO_IF_FLAG_ADMIN_UP;
  vnet_hw_interface_set_flags (vnm, vif->hw_if_index,
			       VNET_HW_INTERFACE_FLAG_LINK_UP);
  vif->cxq_vring = NULL;

  goto done;

error:
  if (err)
    {
      ASSERT (args->error == 0);
      args->error = err;
      args->rv = VNET_API_ERROR_SYSCALL_ERROR_3;
    }

  tap_log_err (vif, "%U", format_clib_error, args->error);
  tap_free (vm, vif);
done:
  if (vhost_mem)
    clib_mem_free (vhost_mem);
  if (old_netns_fd != -1)
    close (old_netns_fd);
  if (nfd != -1)
    close (nfd);
}

int
tap_delete_if (vlib_main_t * vm, u32 sw_if_index)
{
  vnet_main_t *vnm = vnet_get_main ();
  virtio_main_t *mm = &virtio_main;
  int i;
  virtio_if_t *vif;
  vnet_hw_interface_t *hw;

  hw = vnet_get_sup_hw_interface_api_visible_or_null (vnm, sw_if_index);
  if (hw == NULL || virtio_device_class.index != hw->dev_class_index)
    return VNET_API_ERROR_INVALID_SW_IF_INDEX;

  vif = pool_elt_at_index (mm->interfaces, hw->dev_instance);

  if ((vif->type != VIRTIO_IF_TYPE_TAP) && (vif->type != VIRTIO_IF_TYPE_TUN))
    return VNET_API_ERROR_INVALID_INTERFACE;

  /* bring down the interface */
  vnet_hw_interface_set_flags (vnm, vif->hw_if_index, 0);
  vnet_sw_interface_set_flags (vnm, vif->sw_if_index, 0);
  for (i = 0; i < vif->num_rxqs; i++)
    vnet_hw_interface_unassign_rx_thread (vnm, vif->hw_if_index, i);

  if (vif->type == VIRTIO_IF_TYPE_TAP)
    ethernet_delete_interface (vnm, vif->hw_if_index);
  else				/* VIRTIO_IF_TYPE_TUN */
    vnet_delete_hw_interface (vnm, vif->hw_if_index);
  vif->hw_if_index = ~0;

  tap_free (vm, vif);

  return 0;
}

int
tap_csum_offload_enable_disable (vlib_main_t * vm, u32 sw_if_index,
				 int enable_disable)
{
  vnet_main_t *vnm = vnet_get_main ();
  virtio_main_t *mm = &virtio_main;
  virtio_if_t *vif;
  vnet_hw_interface_t *hw;
  clib_error_t *err = 0;
  int i = 0;

  hw = vnet_get_sup_hw_interface_api_visible_or_null (vnm, sw_if_index);

  if (hw == NULL || virtio_device_class.index != hw->dev_class_index)
    return VNET_API_ERROR_INVALID_SW_IF_INDEX;

  vif = pool_elt_at_index (mm->interfaces, hw->dev_instance);

  const unsigned int csum_offload_on = TUN_F_CSUM;
  const unsigned int csum_offload_off = 0;
  unsigned int offload = enable_disable ? csum_offload_on : csum_offload_off;
  vec_foreach_index (i, vif->tap_fds)
    _IOCTL (vif->tap_fds[i], TUNSETOFFLOAD, offload);
  vif->gso_enabled = 0;
  vif->csum_offload_enabled = enable_disable ? 1 : 0;

  if ((hw->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO) != 0)
    {
      hw->flags &= ~VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO;
    }

  if (enable_disable)
    {
      if ((hw->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_TX_L4_CKSUM_OFFLOAD) ==
	  0)
	{
	  hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_TX_L4_CKSUM_OFFLOAD;
	}
    }
  else
    {
      if ((hw->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_TX_L4_CKSUM_OFFLOAD) !=
	  0)
	{
	  hw->flags &= ~VNET_HW_INTERFACE_FLAG_SUPPORTS_TX_L4_CKSUM_OFFLOAD;
	}
    }

error:
  if (err)
    {
      clib_warning ("Error %s checksum offload on sw_if_index %d",
		    enable_disable ? "enabling" : "disabling", sw_if_index);
      return VNET_API_ERROR_SYSCALL_ERROR_3;
    }
  return 0;
}

int
tap_gso_enable_disable (vlib_main_t * vm, u32 sw_if_index, int enable_disable)
{
  vnet_main_t *vnm = vnet_get_main ();
  virtio_main_t *mm = &virtio_main;
  virtio_if_t *vif;
  vnet_hw_interface_t *hw;
  clib_error_t *err = 0;
  int i = 0;

  hw = vnet_get_sup_hw_interface_api_visible_or_null (vnm, sw_if_index);

  if (hw == NULL || virtio_device_class.index != hw->dev_class_index)
    return VNET_API_ERROR_INVALID_SW_IF_INDEX;

  vif = pool_elt_at_index (mm->interfaces, hw->dev_instance);

  const unsigned int gso_on = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6;
  const unsigned int gso_off = 0;
  unsigned int offload = enable_disable ? gso_on : gso_off;
  vec_foreach_index (i, vif->tap_fds)
    _IOCTL (vif->tap_fds[i], TUNSETOFFLOAD, offload);
  vif->gso_enabled = enable_disable ? 1 : 0;
  vif->csum_offload_enabled = 0;
  if (enable_disable)
    {
      if ((hw->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO) == 0)
	{
	  hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO |
	    VNET_HW_INTERFACE_FLAG_SUPPORTS_TX_L4_CKSUM_OFFLOAD;
	}
    }
  else
    {
      if ((hw->flags & VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO) != 0)
	{
	  hw->flags &= ~(VNET_HW_INTERFACE_FLAG_SUPPORTS_GSO |
			 VNET_HW_INTERFACE_FLAG_SUPPORTS_TX_L4_CKSUM_OFFLOAD);
	}
    }

error:
  if (err)
    {
      clib_warning ("Error %s gso on sw_if_index %d",
		    enable_disable ? "enabling" : "disabling", sw_if_index);
      return VNET_API_ERROR_SYSCALL_ERROR_3;
    }
  return 0;
}

int
tap_dump_ifs (tap_interface_details_t ** out_tapids)
{
  vnet_main_t *vnm = vnet_get_main ();
  virtio_main_t *mm = &virtio_main;
  virtio_if_t *vif;
  virtio_vring_t *vring;
  vnet_hw_interface_t *hi;
  tap_interface_details_t *r_tapids = NULL;
  tap_interface_details_t *tapid = NULL;

  /* *INDENT-OFF* */
  pool_foreach (vif, mm->interfaces,
    if ((vif->type != VIRTIO_IF_TYPE_TAP)
      && (vif->type != VIRTIO_IF_TYPE_TUN))
      continue;
    vec_add2(r_tapids, tapid, 1);
    clib_memset (tapid, 0, sizeof (*tapid));
    tapid->id = vif->id;
    tapid->sw_if_index = vif->sw_if_index;
    hi = vnet_get_hw_interface (vnm, vif->hw_if_index);
    clib_memcpy(tapid->dev_name, hi->name,
                MIN (ARRAY_LEN (tapid->dev_name) - 1,
                     strlen ((const char *) hi->name)));
    vring = vec_elt_at_index (vif->rxq_vrings, RX_QUEUE_ACCESS(0));
    tapid->rx_ring_sz = vring->size;
    vring = vec_elt_at_index (vif->txq_vrings, TX_QUEUE_ACCESS(0));
    tapid->tx_ring_sz = vring->size;
    tapid->tap_flags = vif->tap_flags;
    clib_memcpy(&tapid->host_mac_addr, vif->host_mac_addr, 6);
    if (vif->host_if_name)
      {
        clib_memcpy(tapid->host_if_name, vif->host_if_name,
                    MIN (ARRAY_LEN (tapid->host_if_name) - 1,
                    strlen ((const char *) vif->host_if_name)));
      }
    if (vif->net_ns)
      {
        clib_memcpy(tapid->host_namespace, vif->net_ns,
                    MIN (ARRAY_LEN (tapid->host_namespace) - 1,
                    strlen ((const char *) vif->net_ns)));
      }
    if (vif->host_bridge)
      {
        clib_memcpy(tapid->host_bridge, vif->host_bridge,
                    MIN (ARRAY_LEN (tapid->host_bridge) - 1,
                    strlen ((const char *) vif->host_bridge)));
      }
    if (vif->host_ip4_prefix_len)
      clib_memcpy(tapid->host_ip4_addr.as_u8, &vif->host_ip4_addr, 4);
    tapid->host_ip4_prefix_len = vif->host_ip4_prefix_len;
    if (vif->host_ip6_prefix_len)
      clib_memcpy(tapid->host_ip6_addr.as_u8, &vif->host_ip6_addr, 16);
    tapid->host_ip6_prefix_len = vif->host_ip6_prefix_len;
    tapid->host_mtu_size = vif->host_mtu_size;
  );
  /* *INDENT-ON* */

  *out_tapids = r_tapids;

  return 0;
}

static clib_error_t *
tap_mtu_config (vlib_main_t * vm, unformat_input_t * input)
{
  tap_main_t *tm = &tap_main;

  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (input, "host-mtu %d", &tm->host_mtu_size))
	;
      else
	return clib_error_return (0, "unknown input `%U'",
				  format_unformat_error, input);
    }

  return 0;
}

/* tap { host-mtu <size> } configuration. */
VLIB_CONFIG_FUNCTION (tap_mtu_config, "tap");

static clib_error_t *
tap_init (vlib_main_t * vm)
{
  tap_main_t *tm = &tap_main;
  clib_error_t *error = 0;

  tm->log_default = vlib_log_register_class ("tap", 0);
  vlib_log_debug (tm->log_default, "initialized");

  tm->host_mtu_size = 0;

  return error;
}

VLIB_INIT_FUNCTION (tap_init);

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */