summaryrefslogtreecommitdiffstats
path: root/src/vlibmemory
AgeCommit message (Expand)AuthorFilesLines
2018-06-14Use __attribute__((weak)) references where necessaryDave Barach1-5/+1
2018-06-08export counters in a memfd segmentDave Barach2-6/+16
2018-06-05VPP API: Memory traceOle Troan2-16/+28
2018-05-10Remove the historical memfd api segment bootstrapDave Barach2-88/+28
2018-04-18typo fix: UNKOWN -> UNKNOWNAndrey "Zed" Zaikin1-1/+1
2018-03-06API: Add service definitions for events and singleton messages (second attempt)Marek Gradzki1-0/+10
2018-03-05Revert "API: Add service definitions for events and singleton messages."Ole Trøan1-10/+0
2018-03-05API: Add service definitions for events and singleton messages.Ole Troan1-0/+10
2018-02-02vlmemory/svm: fix client detach from svm regionFlorin Coras4-8/+23
2018-01-25session: add support for memfd segmentsFlorin Coras11-86/+219
2018-01-23VPPAPIGEN: vppapigen replacement in Python PLY.Ole Troan1-1/+1
2018-01-22svm: queue sub: Add conditional timed waitMohsin Kazmi2-3/+3
2018-01-15svm: refactor memfd and remove ssvm_ethFlorin Coras5-12/+15
2018-01-11api: fix handlers that explicitly depend on svm queueFlorin Coras3-0/+16
2018-01-11api: remove transport specific code from handlersFlorin Coras1-4/+4
2018-01-10svm: calc base address on AArch64 based on autodetected VA space sizeDamjan Marion1-1/+1
2018-01-09api: refactor vlibmemoryFlorin Coras17-3718/+3400
2018-01-05sock api: add first msg id retrieval functionFlorin Coras2-17/+39
2018-01-05sock api: add infra for bootstrapping shm clientsFlorin Coras6-151/+514
2017-12-15VPP-1102: fix dangling references in RPC handlingDave Barach2-27/+40
2017-12-01session: allocate cb messages from client ringFlorin Coras1-0/+2
2017-10-31Fix typo (double unlock)Dave Barach1-2/+1
2017-10-25VCL-LDPRELOAD: statically link VPP objects into libvcl_ldpreload.soDave Wallace1-6/+0
2017-10-18CSIT-844: fix binary api rx pthread heap push/popDave Barach2-6/+27
2017-10-10API versioning: Fix coverity errors from strncpy()Ole Troan1-1/+1
2017-10-09vppapigen: support per-file (major,minor,patch) version stampsDave Barach2-1/+60
2017-10-06Coverity fixes for API socketChris Luke1-3/+3
2017-10-05Clean up "show api ring" debug CLIDave Barach1-3/+11
2017-10-03api: fix internal client registrationsFlorin Coras1-0/+2
2017-10-03Repair vlib API socket serverDave Barach8-271/+1824
2017-09-26Add thread-safe event signaller, use RPC where requiredDave Barach1-1/+12
2017-09-25Fix sending GARP/NA on Bonded Interface Active/Backup Link Up/DownJohn Lo1-3/+27
2017-09-25Add binary API documentationDave Barach1-1/+3
2017-09-19Add new C APIKlement Sekera2-5/+71
2017-09-13API message table inspection utilitiesDave Barach1-60/+433
2017-09-11Recombine diags and minimum barrier open time changes (VPP-968)Colin Tregenza Dancer1-0/+5
2017-08-23Fix vl_map_shmem() root_path dangling reference.Dave Wallace1-5/+8
2017-08-18API: More gracefully fail when opening shared memory segment fails.Ole Troan1-2/+6
2017-08-14jvpp: make shm_prefix configurable (VPP-591)Jan Srnicek1-3/+16
2017-08-10TCP proxy prototypeDave Barach2-0/+39
2017-07-01Refactor API message handling codeKlement Sekera2-106/+139
2017-06-01Improve fifo allocator performanceDave Barach2-4/+44
2017-05-10completelly deprecate os_get_cpu_number, replace new occurencesDamjan Marion1-1/+1
2017-05-09Fix remaining 32-bit compile issuesDamjan Marion2-2/+3
2017-05-03A sprinkling of const in vlibmemory/api.h and friendsNeale Ranns4-21/+23
2017-04-25"autoreply" flag: autogenerate standard xxx_reply_t messagesDave Barach2-10/+5
2017-04-11move binary-api client-only routines to memory_client.cDave Barach2-250/+241
2017-03-14Clean up dead API client reaper callack schemeDave Barach1-7/+15
2017-03-09vlib_mains == 0 special cases be goneDave Barach1-0/+471
2017-03-07Python API: Synchronous mode.Ole Troan1-2/+15
ef='#n948'>948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056
/*
 *------------------------------------------------------------------
 * Copyright (c) 2017 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *------------------------------------------------------------------
 */


#define _GNU_SOURCE
#include <stdint.h>
#include <net/if.h>
#include <sys/types.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <sys/eventfd.h>
#include <inttypes.h>
#include <limits.h>

#include <vlib/vlib.h>
#include <vlib/unix/unix.h>
#include <vppinfra/linux/syscall.h>
#include <vnet/plugin/plugin.h>
#include <vnet/ethernet/ethernet.h>
#include <vpp/app/version.h>
#include <memif/memif.h>
#include <memif/private.h>

/* Plugin-wide state; the functions in this file reach it via &memif_main. */
memif_main_t memif_main;

/*
 * Flag-change callback passed to ethernet_register_interface() when an
 * ethernet-mode memif interface is created.  Flag changes are not acted
 * upon; the callback always reports 0.
 */
static u32
memif_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags)
{
  u32 result = 0;		/* flag handling is not implemented */

  return result;
}

/*
 * Release the interrupt descriptor of a queue.
 *
 * When the descriptor was registered (int_clib_file_index != ~0) the
 * registration is removed through memif_file_del_by_index(); otherwise a
 * still-open raw descriptor is closed directly.  In both cases the queue
 * ends up with int_fd == -1.
 */
static void
memif_queue_intfd_close (memif_queue_t * mq)
{
  if (mq->int_clib_file_index == ~0)
    {
      /* never registered: only a raw fd may need closing */
      if (mq->int_fd > -1)
	{
	  close (mq->int_fd);
	  mq->int_fd = -1;
	}
      return;
    }

  /* presumably this also releases the descriptor -- confirm in callee */
  memif_file_del_by_index (mq->int_clib_file_index);
  mq->int_clib_file_index = ~0;
  mq->int_fd = -1;
}

/*
 * Tear down the connection state of a memif interface.
 *
 * mif  interface to disconnect; a NULL pointer is ignored.
 * err  optional reason.  When given, a copy of err->what is stored in
 *      mif->local_disc_string and, if the control socket is still
 *      connected, a disconnect message is sent to the peer.  The caller
 *      keeps ownership of err.
 *
 * The CONNECTED/CONNECTING flags and the hardware interface flags are
 * cleared, the control socket is closed, rx queues are unassigned from
 * their threads, queue interrupt descriptors are closed, non-external
 * regions are unmapped and the per-connection vectors are freed.
 */
void
memif_disconnect (memif_if_t * mif, clib_error_t * err)
{
  memif_main_t *mm = &memif_main;
  vnet_main_t *vnm = vnet_get_main ();
  memif_region_t *mr;
  memif_queue_t *mq;
  int i;

  if (mif == 0)
    return;

  vlib_log_debug (mm->log_class, "disconnect %u (%v)", mif->dev_instance,
		  err ? err->what : 0);

  if (err)
    {
      clib_error_t *e = 0;
      u8 *reason = vec_dup (err->what);

      /* do not leak a reason string left over from an earlier disconnect */
      vec_free (mif->local_disc_string);
      mif->local_disc_string = reason;

      if (mif->sock && clib_socket_is_connected (mif->sock))
	e = memif_msg_send_disconnect (mif, err);
      clib_error_free (e);
    }

  /* set interface down */
  mif->flags &= ~(MEMIF_IF_FLAG_CONNECTED | MEMIF_IF_FLAG_CONNECTING);
  if (mif->hw_if_index != ~0)
    vnet_hw_interface_set_flags (vnm, mif->hw_if_index, 0);

  /* close connection socket */
  if (mif->sock && mif->sock->fd)
    {
      memif_socket_file_t *msf = vec_elt_at_index (mm->socket_files,
						   mif->socket_file_index);
      hash_unset (msf->dev_instance_by_fd, mif->sock->fd);
      memif_socket_close (&mif->sock);
    }
  else if (mif->sock)
    {
      /* named differently from the parameter to avoid shadowing it */
      clib_error_t *close_err = clib_socket_close (mif->sock);
      if (close_err)
	{
	  vlib_log_err (mm->log_class, "%U", format_clib_error, close_err);
	  clib_error_free (close_err);
	}
      clib_mem_free (mif->sock);
      /* the socket is gone; a stale pointer would be freed again later */
      mif->sock = 0;
    }

  /* *INDENT-OFF* */
  vec_foreach_index (i, mif->rx_queues)
    {
      mq = vec_elt_at_index (mif->rx_queues, i);
      if (mq->ring)
	{
	  int rv;
	  rv = vnet_hw_interface_unassign_rx_thread (vnm, mif->hw_if_index, i);
	  if (rv)
	    vlib_log_warn (mm->log_class,
			   "Unable to unassign interface %d, queue %d: rc=%d",
			   mif->hw_if_index, i, rv);
	  mq->ring = 0;
	}
    }

  /* free tx and rx queues */
  vec_foreach (mq, mif->rx_queues)
    memif_queue_intfd_close (mq);
  vec_free (mif->rx_queues);

  vec_foreach (mq, mif->tx_queues)
    memif_queue_intfd_close (mq);
  vec_free (mif->tx_queues);

  /* free memory regions */
  vec_foreach (mr, mif->regions)
    {
      int rv;
      if (mr->is_external)
	continue;
      /* only unmap regions that were actually mapped */
      if (mr->shm && mr->shm != MAP_FAILED
	  && (rv = munmap (mr->shm, mr->region_size)))
	clib_warning ("munmap failed, rv = %d", rv);
      if (mr->fd > -1)
	close (mr->fd);
    }
  /* *INDENT-ON* */
  vec_free (mif->regions);
  vec_free (mif->remote_name);
  vec_free (mif->remote_if_name);
  clib_fifo_free (mif->msg_queue);
}

/*
 * Read callback for an rx queue interrupt descriptor.
 *
 * uf->private_data carries the interface index in the bits above 16 and
 * the queue id in the low 16 bits (see how memif_connect() builds it).
 * After draining the descriptor the queue is flagged as having a pending
 * interrupt and its interrupt counter is bumped.  Always returns 0; a
 * failed read is only logged.
 */
static clib_error_t *
memif_int_fd_read_ready (clib_file_t * uf)
{
  memif_main_t *mm = &memif_main;
  vnet_main_t *vnm = vnet_get_main ();
  u16 qid = uf->private_data & 0xFFFF;
  memif_if_t *mif = vec_elt_at_index (mm->interfaces, uf->private_data >> 16);
  memif_queue_t *mq = vec_elt_at_index (mif->rx_queues, qid);
  u64 counter;

  if (read (uf->file_descriptor, &counter, sizeof (counter)) < 0)
    {
      vlib_log_debug (mm->log_class, "Failed to read form socket");
      return 0;
    }

  vnet_device_input_set_interrupt_pending (vnm, mif->hw_if_index, qid);
  mq->int_count++;

  return 0;
}


/*
 * Finish connecting a memif interface once regions and queues are known.
 *
 * Maps every region that is not mapped yet, resolves each queue's ring
 * pointer from its region/offset, validates the ring cookie, registers rx
 * interrupt descriptors, assigns rx queues to threads and finally marks the
 * interface connected and link-up.
 *
 * Returns 0 on success, or an error (already logged) which the caller owns.
 */
clib_error_t *
memif_connect (memif_if_t * mif)
{
  memif_main_t *mm = &memif_main;
  vnet_main_t *vnm = vnet_get_main ();
  clib_file_t template = { 0 };
  memif_region_t *mr;
  int i;
  clib_error_t *err = NULL;

  vlib_log_debug (mm->log_class, "connect %u", mif->dev_instance);

  vec_free (mif->local_disc_string);
  vec_free (mif->remote_disc_string);

  /* *INDENT-OFF* */
  vec_foreach (mr, mif->regions)
    {
      void *shm;

      if (mr->shm)
	continue;

      if (mr->fd < 0)
	{
	  err = clib_error_return (0, "no memory region fd");
	  goto error;
	}

      /* map into a temporary so that MAP_FAILED is never stored in
         mr->shm, where it would later look like a valid mapping */
      shm = mmap (NULL, mr->region_size, PROT_READ | PROT_WRITE,
		  MAP_SHARED, mr->fd, 0);
      if (shm == MAP_FAILED)
	{
	  err = clib_error_return_unix (0, "mmap");
	  goto error;
	}
      mr->shm = shm;
    }
  /* *INDENT-ON* */

  template.read_function = memif_int_fd_read_ready;

  /* *INDENT-OFF* */
  vec_foreach_index (i, mif->tx_queues)
    {
      memif_queue_t *mq = vec_elt_at_index (mif->tx_queues, i);

      mq->ring = mif->regions[mq->region].shm + mq->offset;
      if (mq->ring->cookie != MEMIF_COOKIE)
	{
	  err = clib_error_return (0, "wrong cookie on tx ring %u", i);
	  goto error;
	}
    }

  vec_foreach_index (i, mif->rx_queues)
    {
      memif_queue_t *mq = vec_elt_at_index (mif->rx_queues, i);
      int rv;

      mq->ring = mif->regions[mq->region].shm + mq->offset;
      if (mq->ring->cookie != MEMIF_COOKIE)
	{
	  err = clib_error_return (0, "wrong cookie on rx ring %u", i);
	  goto error;
	}

      if (mq->int_fd > -1)
	{
	  /* private_data layout: interface index above bit 16, queue id
	     in the low 16 bits; decoded in memif_int_fd_read_ready() */
	  template.file_descriptor = mq->int_fd;
	  template.private_data = (mif->dev_instance << 16) | (i & 0xFFFF);
	  template.description = format (0, "%U rx %u int",
					 format_memif_device_name,
					 mif->dev_instance, i);
	  memif_file_add (&mq->int_clib_file_index, &template);
	}
      vnet_hw_interface_assign_rx_thread (vnm, mif->hw_if_index, i, ~0);
      rv = vnet_hw_interface_set_rx_mode (vnm, mif->hw_if_index, i,
					  VNET_HW_INTERFACE_RX_MODE_DEFAULT);
      if (rv)
	clib_warning
	  ("Warning: unable to set rx mode for interface %d queue %d: "
	   "rc=%d", mif->hw_if_index, i, rv);
      else
	{
	  vnet_hw_interface_rx_mode rxmode;
	  vnet_hw_interface_get_rx_mode (vnm, mif->hw_if_index, i, &rxmode);

	  /* polling queues mask interrupts on the ring; other modes get an
	     initial pending interrupt */
	  if (rxmode == VNET_HW_INTERFACE_RX_MODE_POLLING)
	    mq->ring->flags |= MEMIF_RING_FLAG_MASK_INT;
	  else
	    vnet_device_input_set_interrupt_pending (vnm, mif->hw_if_index, i);
	}
    }
  /* *INDENT-ON* */

  mif->flags &= ~MEMIF_IF_FLAG_CONNECTING;
  mif->flags |= MEMIF_IF_FLAG_CONNECTED;

  vnet_hw_interface_set_flags (vnm, mif->hw_if_index,
			       VNET_HW_INTERFACE_FLAG_LINK_UP);
  return 0;

error:
  vlib_log_err (mm->log_class, "%U", format_clib_error, err);
  return err;
}

/*
 * Locate a ring inside region 0.
 *
 * Rings are laid out back to back from the start of the region: ring
 * number (ring_num + type * num_s2m_rings), each occupying one
 * memif_ring_t header plus 2^log2_ring_size descriptors.
 *
 * Returns NULL when the interface has no regions.
 */
static_always_inline memif_ring_t *
memif_get_ring (memif_if_t * mif, memif_ring_type_t type, u16 ring_num)
{
  void *base;
  int bytes_per_ring;

  if (vec_len (mif->regions) == 0)
    return NULL;

  base = mif->regions[0].shm;
  bytes_per_ring =
    sizeof (memif_ring_t) +
    sizeof (memif_desc_t) * (1 << mif->run.log2_ring_size);

  return (memif_ring_t *) (base +
			   (ring_num +
			    type * mif->run.num_s2m_rings) * bytes_per_ring);
}

/*
 * Allocate region 0 and set up all rings and queues of an interface.
 *
 * Region 0 holds every ring (S2M rings first, then M2S rings) and, unless
 * the interface is in zero-copy mode, the packet buffers behind them.  In
 * zero-copy mode the buffer pools' physmem regions are appended as
 * external regions instead.  One tx queue is created per S2M ring and one
 * rx queue per M2S ring, each with its own eventfd.
 *
 * Returns 0 on success, or an error (already logged) which the caller owns.
 * On failure, partially initialised regions and queues are left on the
 * interface.
 */
clib_error_t *
memif_init_regions_and_queues (memif_if_t * mif)
{
  vlib_main_t *vm = vlib_get_main ();
  memif_main_t *mm = &memif_main;
  memif_ring_t *ring = NULL;
  int i, j;
  u64 buffer_offset;
  u32 n_slots = 1 << mif->run.log2_ring_size;
  memif_region_t *r;
  clib_mem_vm_alloc_t alloc = { 0 };
  clib_error_t *err;

  ASSERT (vec_len (mif->regions) == 0);
  vec_add2_aligned (mif->regions, r, 1, CLIB_CACHE_LINE_BYTES);
  /* mark the descriptor invalid until the allocation succeeds, so a failed
     allocation never leaves a value that cleanup code would close */
  r->fd = -1;

  buffer_offset = (mif->run.num_s2m_rings + mif->run.num_m2s_rings) *
    (sizeof (memif_ring_t) + sizeof (memif_desc_t) * n_slots);

  r->region_size = buffer_offset;

  /* computed in 64 bits: buffer_size * slots * rings can exceed an int */
  if ((mif->flags & MEMIF_IF_FLAG_ZERO_COPY) == 0)
    r->region_size += (u64) mif->run.buffer_size * n_slots *
      (mif->run.num_s2m_rings + mif->run.num_m2s_rings);

  alloc.name = "memif region";
  alloc.size = r->region_size;
  alloc.flags = CLIB_MEM_VM_F_SHARED;

  err = clib_mem_vm_ext_alloc (&alloc);
  if (err)
    goto error;

  r->fd = alloc.fd;
  r->shm = alloc.addr;

  if (mif->flags & MEMIF_IF_FLAG_ZERO_COPY)
    {
      vlib_buffer_pool_t *bp;
      /* *INDENT-OFF* */
      vec_foreach (bp, buffer_main.buffer_pools)
	{
	  vlib_physmem_region_t *pr;
	  pr = vlib_physmem_get_region (vm, bp->physmem_region);
	  vec_add2_aligned (mif->regions, r, 1, CLIB_CACHE_LINE_BYTES);
	  r->fd = pr->fd;
	  r->region_size = pr->size;
	  r->shm = pr->mem;
	  r->is_external = 1;
	}
      /* *INDENT-ON* */
    }

  for (i = 0; i < mif->run.num_s2m_rings; i++)
    {
      ring = memif_get_ring (mif, MEMIF_RING_S2M, i);
      ring->head = ring->tail = 0;
      ring->cookie = MEMIF_COOKIE;

      if (mif->flags & MEMIF_IF_FLAG_ZERO_COPY)
	continue;

      for (j = 0; j < n_slots; j++)
	{
	  /* u32: rings * slots does not fit in 16 bits for large setups */
	  u32 slot = i * n_slots + j;
	  ring->desc[j].region = 0;
	  ring->desc[j].offset =
	    buffer_offset + (u32) (slot * mif->run.buffer_size);
	  ring->desc[j].length = mif->run.buffer_size;
	}
    }
  for (i = 0; i < mif->run.num_m2s_rings; i++)
    {
      ring = memif_get_ring (mif, MEMIF_RING_M2S, i);
      ring->head = ring->tail = 0;
      ring->cookie = MEMIF_COOKIE;

      if (mif->flags & MEMIF_IF_FLAG_ZERO_COPY)
	continue;

      for (j = 0; j < n_slots; j++)
	{
	  /* M2S buffers follow the buffers of all S2M rings */
	  u32 slot = (i + mif->run.num_s2m_rings) * n_slots + j;
	  ring->desc[j].region = 0;
	  ring->desc[j].offset =
	    buffer_offset + (u32) (slot * mif->run.buffer_size);
	  ring->desc[j].length = mif->run.buffer_size;
	}
    }

  ASSERT (mif->tx_queues == 0);
  vec_validate_aligned (mif->tx_queues, mif->run.num_s2m_rings - 1,
			CLIB_CACHE_LINE_BYTES);

  /* *INDENT-OFF* */
  /* Give every queue "no descriptor" defaults first.  If eventfd() fails
     part-way, memif_queue_intfd_close() must not see index 0 / fd 0 on the
     queues that were never set up. */
  vec_foreach_index (i, mif->tx_queues)
    {
      memif_queue_t *mq = vec_elt_at_index (mif->tx_queues, i);
      mq->int_fd = -1;
      mq->int_clib_file_index = ~0;
    }

  vec_foreach_index (i, mif->tx_queues)
    {
      memif_queue_t *mq = vec_elt_at_index (mif->tx_queues, i);
      if ((mq->int_fd = eventfd (0, EFD_NONBLOCK)) < 0)
	{
	  err = clib_error_return_unix (0, "eventfd[tx queue %u]", i);
	  goto error;
	}
      mq->ring = memif_get_ring (mif, MEMIF_RING_S2M, i);
      /* NOTE(review): rings above are sized from run.log2_ring_size while
         the queue records cfg.log2_ring_size -- confirm they always match */
      mq->log2_ring_size = mif->cfg.log2_ring_size;
      mq->region = 0;
      mq->offset = (void *) mq->ring - (void *) mif->regions[mq->region].shm;
      mq->last_head = 0;
      mq->type = MEMIF_RING_S2M;
      if (mif->flags & MEMIF_IF_FLAG_ZERO_COPY)
	vec_validate_aligned (mq->buffers, 1 << mq->log2_ring_size,
			      CLIB_CACHE_LINE_BYTES);
    }
  /* *INDENT-ON* */

  ASSERT (mif->rx_queues == 0);
  vec_validate_aligned (mif->rx_queues, mif->run.num_m2s_rings - 1,
			CLIB_CACHE_LINE_BYTES);

  /* *INDENT-OFF* */
  /* same "no descriptor" defaults as for the tx queues */
  vec_foreach_index (i, mif->rx_queues)
    {
      memif_queue_t *mq = vec_elt_at_index (mif->rx_queues, i);
      mq->int_fd = -1;
      mq->int_clib_file_index = ~0;
    }

  vec_foreach_index (i, mif->rx_queues)
    {
      memif_queue_t *mq = vec_elt_at_index (mif->rx_queues, i);
      if ((mq->int_fd = eventfd (0, EFD_NONBLOCK)) < 0)
	{
	  err = clib_error_return_unix (0, "eventfd[rx queue %u]", i);
	  goto error;
	}
      mq->ring = memif_get_ring (mif, MEMIF_RING_M2S, i);
      mq->log2_ring_size = mif->cfg.log2_ring_size;
      mq->region = 0;
      mq->offset = (void *) mq->ring - (void *) mif->regions[mq->region].shm;
      mq->last_head = 0;
      mq->type = MEMIF_RING_M2S;
      if (mif->flags & MEMIF_IF_FLAG_ZERO_COPY)
	vec_validate_aligned (mq->buffers, 1 << mq->log2_ring_size,
			      CLIB_CACHE_LINE_BYTES);
    }
  /* *INDENT-ON* */

  return 0;

error:
  vlib_log_err (mm->log_class, "%U", format_clib_error, err);
  return err;
}

/*
 * Process node body that (re)connects slave interfaces.
 *
 * The loop sleeps until a START event arrives, then wakes on events or
 * roughly every 3 seconds (minus the duration of the previous pass) until
 * a STOP event is received.  On each pass every interface that is admin-up,
 * is a slave and is neither connecting nor connected gets a new client
 * socket; on success the socket is registered with the file poller and
 * handed over to the interface.  Connection failures are dropped silently
 * and retried on the next pass.
 */
static uword
memif_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f)
{
  memif_main_t *mm = &memif_main;
  memif_if_t *mif;
  clib_socket_t *sock;
  uword *event_data = 0, event_type;
  u8 enabled = 0;
  f64 start_time, last_run_duration = 0, now;
  clib_error_t *err;

  /* scratch socket; replaced by a fresh one whenever an interface takes it */
  sock = clib_mem_alloc (sizeof (clib_socket_t));
  memset (sock, 0, sizeof (clib_socket_t));

  while (1)
    {
      if (enabled)
	vlib_process_wait_for_event_or_clock (vm, (f64) 3 -
					      last_run_duration);
      else
	vlib_process_wait_for_event (vm);

      event_type = vlib_process_get_events (vm, &event_data);
      vec_reset_length (event_data);

      switch (event_type)
	{
	case ~0:
	  break;
	case MEMIF_PROCESS_EVENT_START:
	  enabled = 1;
	  break;
	case MEMIF_PROCESS_EVENT_STOP:
	  enabled = 0;
	  continue;
	default:
	  ASSERT (0);
	}

      last_run_duration = start_time = vlib_time_now (vm);
      /* *INDENT-OFF* */
      pool_foreach (mif, mm->interfaces,
        ({
	  memif_socket_file_t * msf = vec_elt_at_index (mm->socket_files, mif->socket_file_index);
	  /* Allow no more than 10us without a pause */
	  now = vlib_time_now (vm);
	  if (now > start_time + 10e-6)
	    {
	      vlib_process_suspend (vm, 100e-6);	/* suspend for 100 us */
	      start_time = vlib_time_now (vm);
	    }

	  if ((mif->flags & MEMIF_IF_FLAG_ADMIN_UP) == 0)
	    continue;

	  if (mif->flags & MEMIF_IF_FLAG_CONNECTING)
	    continue;

	  if (mif->flags & MEMIF_IF_FLAG_CONNECTED)
	    continue;

	  if (mif->flags & MEMIF_IF_FLAG_IS_SLAVE)
	    {
              memset (sock, 0, sizeof(clib_socket_t));
	      sock->config = (char *) msf->filename;
              sock->flags = CLIB_SOCKET_F_IS_CLIENT| CLIB_SOCKET_F_SEQPACKET;

              if ((err = clib_socket_init (sock)))
		{
		  /* best effort: retried on the next pass */
	          clib_error_free (err);
		}
	      else
	        {
		  clib_file_t t = { 0 };

		  t.read_function = memif_slave_conn_fd_read_ready;
		  t.write_function = memif_slave_conn_fd_write_ready;
		  t.error_function = memif_slave_conn_fd_error;
		  t.file_descriptor = sock->fd;
		  t.private_data = mif->dev_instance;
		  /* the description must be filled in before the template
		     is registered, as at the other memif_file_add() call
		     sites; set afterwards it is never attached and leaks */
	          t.description = format (0, "%U ctl",
					  format_memif_device_name,
					  mif->dev_instance);
		  memif_file_add (&sock->private_data, &t);
		  hash_set (msf->dev_instance_by_fd, sock->fd, mif->dev_instance);

		  mif->flags |= MEMIF_IF_FLAG_CONNECTING;
		  /* the interface now owns this socket; get a new scratch one
		     (it is zeroed before its next use above) */
		  mif->sock = sock;
                  sock = clib_mem_alloc (sizeof(clib_socket_t));
	        }
	    }
        }));
      /* *INDENT-ON* */
      last_run_duration = vlib_time_now (vm) - last_run_duration;
    }
  return 0;
}

/* Registers memif_process() as a process-type node named "memif-process". */
/* *INDENT-OFF* */
VLIB_REGISTER_NODE (memif_process_node,static) = {
  .function = memif_process,
  .type = VLIB_NODE_TYPE_PROCESS,
  .name = "memif-process",
};
/* *INDENT-ON* */

/*
 * Bind a socket id to a socket filename.
 *
 * sock_id          id to register.
 * socket_filename  NUL-terminated filename vector.  Ownership is taken by
 *                  this function: it is either stored in the new entry or
 *                  freed when no entry is created.
 *
 * Returns 0 when the entry was created or an identical mapping already
 * exists, VNET_API_ERROR_ENTRY_ALREADY_EXISTS when the id is already bound
 * to a different filename.
 */
static int
memif_add_socket_file (u32 sock_id, u8 * socket_filename)
{
  memif_main_t *mm = &memif_main;
  uword *p;
  memif_socket_file_t *msf;

  p = hash_get (mm->socket_file_index_by_sock_id, sock_id);
  if (p)
    {
      int rv;

      msf = pool_elt_at_index (mm->socket_files, *p);
      if (strcmp ((char *) msf->filename, (char *) socket_filename) == 0)
	rv = 0;			/* Silently accept identical "add". */
      else
	/* But don't allow a direct add of a different filename. */
	rv = VNET_API_ERROR_ENTRY_ALREADY_EXISTS;

      /* nothing stored: release the caller-provided copy instead of
         leaking it */
      vec_free (socket_filename);
      return rv;
    }

  pool_get (mm->socket_files, msf);
  memset (msf, 0, sizeof (memif_socket_file_t));

  msf->filename = socket_filename;
  msf->socket_id = sock_id;

  hash_set (mm->socket_file_index_by_sock_id, sock_id,
	    msf - mm->socket_files);

  return 0;
}

/*
 * Remove the socket-id to filename binding for sock_id.
 *
 * Returns VNET_API_ERROR_INVALID_ARGUMENT for an unknown id,
 * VNET_API_ERROR_UNEXPECTED_INTF_STATE while the entry's ref_cnt is still
 * positive, and 0 once the entry has been released.
 */
static int
memif_delete_socket_file (u32 sock_id)
{
  memif_main_t *mm = &memif_main;
  memif_socket_file_t *entry;
  uword *slot = hash_get (mm->socket_file_index_by_sock_id, sock_id);

  /* unknown ids cannot be deleted */
  if (slot == 0)
    return VNET_API_ERROR_INVALID_ARGUMENT;

  entry = pool_elt_at_index (mm->socket_files, slot[0]);

  /* refuse while the entry is still referenced */
  if (entry->ref_cnt > 0)
    return VNET_API_ERROR_UNEXPECTED_INTF_STATE;

  vec_free (entry->filename);
  pool_put (mm->socket_files, entry);
  hash_unset (mm->socket_file_index_by_sock_id, sock_id);

  return 0;
}

/*
 * Add or delete a socket-id to socket-filename mapping.
 *
 * is_add         non-zero to add, zero to delete.
 * sock_id        id to operate on; ~0 is never valid and id 0 may be added
 *                but not deleted.
 * sock_filename  filename for an add.  A relative name is placed under the
 *                runtime directory (missing directories are created); an
 *                absolute name requires its directory to exist already.
 *                The argument itself is not kept: a private copy is built
 *                and handed to memif_add_socket_file().
 *
 * Returns 0 on success or a VNET_API_ERROR_* code.
 */
int
memif_socket_filename_add_del (u8 is_add, u32 sock_id, u8 * sock_filename)
{
  struct stat file_stat;
  char *dir = 0, *tmp;
  u32 idx = 0;

  /* allow adding socket id 0 */
  if ((sock_id == 0 && is_add == 0) || sock_id == ~0)
    {
      return VNET_API_ERROR_INVALID_ARGUMENT;
    }

  if (is_add == 0)
    {
      return memif_delete_socket_file (sock_id);
    }

  if (sock_filename == 0 || sock_filename[0] == 0)
    {
      return VNET_API_ERROR_INVALID_ARGUMENT;
    }

  if (sock_filename[0] != '/')
    {
      clib_error_t *error;

      /* copy runtime dir path */
      vec_add (dir, vlib_unix_get_runtime_dir (),
	       strlen (vlib_unix_get_runtime_dir ()));
      vec_add1 (dir, '/');

      /* if sock_filename contains dirs, add them to path */
      tmp = strrchr ((char *) sock_filename, '/');
      if (tmp)
	{
	  idx = tmp - (char *) sock_filename;
	  vec_add (dir, sock_filename, idx);
	}

      vec_add1 (dir, '\0');
      /* create socket dir */
      error = vlib_unix_recursive_mkdir (dir);
      if (error)
	{
	  clib_error_free (error);
	  /* the directory string is not needed any more */
	  vec_free (dir);
	  return VNET_API_ERROR_SYSCALL_ERROR_1;
	}

      sock_filename = format (0, "%s/%s%c", vlib_unix_get_runtime_dir (),
			      sock_filename, 0);
    }
  else
    {
      sock_filename = vec_dup (sock_filename);

      /* check if directory exists */
      tmp = strrchr ((char *) sock_filename, '/');
      if (tmp)
	{
	  idx = tmp - (char *) sock_filename;
	  vec_add (dir, sock_filename, idx);
	  vec_add1 (dir, '\0');
	}

      /* idx == 0 means the file lives directly in "/": nothing to check */
      if (((stat (dir, &file_stat) == -1) || (!S_ISDIR (file_stat.st_mode)))
	  && (idx != 0))
	{
	  vec_free (dir);
	  /* drop the private copy made above; it was never registered */
	  vec_free (sock_filename);
	  return VNET_API_ERROR_INVALID_ARGUMENT;
	}
    }
  vec_free (dir);

  return memif_add_socket_file (sock_id, sock_filename);
}

/*
 * Delete a memif interface and release everything attached to it.
 *
 * The interface is brought down and disconnected (reason "interface
 * deleted"), its hardware interface is removed, it is dropped from the
 * socket file's id lookup table and the socket file's reference count is
 * decremented.  When the count reaches zero the socket file's listener
 * state and lookup tables are torn down.  Finally the interface slot is
 * zeroed and returned to the pool; when no interfaces remain, the
 * memif-process node is sent a STOP event.
 *
 * Always returns 0.
 */
int
memif_delete_if (vlib_main_t * vm, memif_if_t * mif)
{
  vnet_main_t *vnm = vnet_get_main ();
  memif_main_t *mm = &memif_main;
  memif_socket_file_t *msf =
    vec_elt_at_index (mm->socket_files, mif->socket_file_index);
  clib_error_t *err;

  mif->flags |= MEMIF_IF_FLAG_DELETING;
  vec_free (mif->local_disc_string);
  vec_free (mif->remote_disc_string);

  /* bring down the interface; the indices are still ~0 when this is called
     from memif_create_if()'s error path before registration succeeded */
  if (mif->hw_if_index != ~0)
    vnet_hw_interface_set_flags (vnm, mif->hw_if_index, 0);
  if (mif->sw_if_index != ~0)
    vnet_sw_interface_set_flags (vnm, mif->sw_if_index, 0);

  err = clib_error_return (0, "interface deleted");
  memif_disconnect (mif, err);
  clib_error_free (err);

  if (mif->hw_if_index != ~0)
    {
      /* remove the interface */
      if (mif->mode == MEMIF_INTERFACE_MODE_IP)
	vnet_delete_hw_interface (vnm, mif->hw_if_index);
      else
	ethernet_delete_interface (vnm, mif->hw_if_index);
      mif->hw_if_index = ~0;
    }

  /* free interface data structures */
  clib_spinlock_free (&mif->lockp);
  mhash_unset (&msf->dev_instance_by_id, &mif->id, 0);

  /* remove socket file */
  /* NOTE(review): memif_create_if()'s error path gets here before it has
     incremented ref_cnt, so this decrement can take the count below zero --
     needs a coordinated fix with memif_create_if / memif_socket_close */
  if (--(msf->ref_cnt) == 0)
    {
      if (msf->is_listener)
	{
	  int i;
	  /* *INDENT-OFF* */
	  vec_foreach_index (i, msf->pending_clients)
	    memif_socket_close (msf->pending_clients + i);
	  /* *INDENT-ON* */
	  memif_socket_close (&msf->sock);
	  vec_free (msf->pending_clients);
	}
      mhash_free (&msf->dev_instance_by_id);
      hash_free (msf->dev_instance_by_fd);
      if (msf->sock)
	{
	  err = clib_socket_close (msf->sock);
	  if (err)
	    {
	      vlib_log_err (mm->log_class, "%U", format_clib_error, err);
	      clib_error_free (err);
	    }
	  clib_mem_free (msf->sock);
	  /* memif_create_if() expects msf->sock == 0 for a fresh listener */
	  msf->sock = 0;
	}
    }

  /* memif_disconnect() stored a new reason string above, and the secret was
     duplicated at creation; release both before the slot is wiped */
  vec_free (mif->local_disc_string);
  vec_free (mif->secret);

  memset (mif, 0, sizeof (*mif));
  pool_put (mm->interfaces, mif);

  if (pool_elts (mm->interfaces) == 0)
    vlib_process_signal_event (vm, memif_process_node.index,
			       MEMIF_PROCESS_EVENT_STOP, 0);

  return 0;
}

/*
 * Hardware interface class named "memif-ip", flagged point-to-point.
 * memif_create_if() passes its index to vnet_register_interface() for
 * interfaces created in IP mode.
 */
/* *INDENT-OFF* */
VNET_HW_INTERFACE_CLASS (memif_ip_hw_if_class, static) =
{
  .name = "memif-ip",
  .flags = VNET_HW_INTERFACE_CLASS_FLAG_P2P,
};
/* *INDENT-ON* */

/*
 * Create a memif interface.
 *
 * vm    vlib main of the calling thread.
 * args  creation parameters (socket id, interface id, role, mode, ring and
 *       buffer sizes, queue counts, optional secret and hardware address).
 *       On success args->sw_if_index is set to the new interface; when no
 *       hardware address was supplied for ethernet mode, args->hw_addr is
 *       filled with a generated one.
 *
 * The socket id must have been registered beforehand.  A socket file can be
 * used by masters or by slaves but not both, and interface ids must be
 * unique per socket file.  The first master on a socket file starts the
 * listener; the first interface overall starts the memif-process node.
 *
 * Returns 0 on success or a VNET_API_ERROR_* code.
 */
int
memif_create_if (vlib_main_t * vm, memif_create_if_args_t * args)
{
  memif_main_t *mm = &memif_main;
  vlib_thread_main_t *tm = vlib_get_thread_main ();
  vnet_main_t *vnm = vnet_get_main ();
  memif_if_t *mif = 0;
  vnet_sw_interface_t *sw;
  clib_error_t *error = 0;
  int ret = 0;
  uword *p;
  vnet_hw_interface_t *hw;
  memif_socket_file_t *msf = 0;
  int rv = 0;

  p = hash_get (mm->socket_file_index_by_sock_id, args->socket_id);
  if (p == 0)
    {
      rv = VNET_API_ERROR_INVALID_ARGUMENT;
      goto done;
    }

  msf = vec_elt_at_index (mm->socket_files, p[0]);

  /* existing socket file can be either master or slave but cannot be both */
  if (msf->ref_cnt > 0)
    {
      if ((!msf->is_listener != !args->is_master))
	{
	  rv = VNET_API_ERROR_SUBIF_ALREADY_EXISTS;
	  goto done;
	}

      p = mhash_get (&msf->dev_instance_by_id, &args->id);
      if (p)
	{
	  rv = VNET_API_ERROR_SUBIF_ALREADY_EXISTS;
	  goto done;
	}
    }

  /* Create new socket file */
  if (msf->ref_cnt == 0)
    {
      struct stat file_stat;

      /* If we are creating a listener, make sure the file doesn't exist;
       * if it exists and is an old socket file, delete it */
      if (args->is_master && (stat ((char *) msf->filename, &file_stat) == 0))
	{
	  if (S_ISSOCK (file_stat.st_mode))
	    {
	      unlink ((char *) msf->filename);
	    }
	  else
	    {
	      error = clib_error_return (0, "File exists for %s",
					 msf->filename);
	      rv = VNET_API_ERROR_VALUE_EXIST;
	      goto done;
	    }
	}

      mhash_init (&msf->dev_instance_by_id, sizeof (uword),
		  sizeof (memif_interface_id_t));
      msf->dev_instance_by_fd = hash_create (0, sizeof (uword));
      msf->is_listener = (args->is_master != 0);

      vlib_log_debug (mm->log_class, "initializing socket file %s",
		      msf->filename);
    }

  /* one-time setup of the per-thread buffer template and scratch vectors */
  if (mm->per_thread_data == 0)
    {
      int i;
      vlib_buffer_free_list_t *fl;

      vec_validate_aligned (mm->per_thread_data, tm->n_vlib_mains - 1,
			    CLIB_CACHE_LINE_BYTES);

      fl =
	vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
      for (i = 0; i < tm->n_vlib_mains; i++)
	{
	  memif_per_thread_data_t *ptd =
	    vec_elt_at_index (mm->per_thread_data, i);
	  vlib_buffer_t *bt = &ptd->buffer_template;
	  vlib_buffer_init_for_free_list (bt, fl);
	  bt->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID;
	  bt->total_length_not_including_first_buffer = 0;
	  vnet_buffer (bt)->sw_if_index[VLIB_TX] = (u32) ~ 0;

	  /* initially prealloc copy_ops so we can use
	     _vec_len instead of vec_elen */
	  vec_validate_aligned (ptd->copy_ops, 0, CLIB_CACHE_LINE_BYTES);
	  vec_reset_length (ptd->copy_ops);
	  vec_validate_aligned (ptd->buffers, 0, CLIB_CACHE_LINE_BYTES);
	  vec_reset_length (ptd->buffers);
	}
    }

  pool_get (mm->interfaces, mif);
  memset (mif, 0, sizeof (*mif));
  mif->dev_instance = mif - mm->interfaces;
  mif->socket_file_index = msf - mm->socket_files;
  mif->id = args->id;
  mif->sw_if_index = mif->hw_if_index = mif->per_interface_next_index = ~0;
  mif->mode = args->mode;
  if (args->secret)
    mif->secret = vec_dup (args->secret);

  if (tm->n_vlib_mains > 1)
    clib_spinlock_init (&mif->lockp);

  if (mif->mode == MEMIF_INTERFACE_MODE_ETHERNET)
    {

      if (!args->hw_addr_set)
	{
	  /* generate an address 02:fe:xx:xx:xx:xx from a time-seeded
	     random value */
	  f64 now = vlib_time_now (vm);
	  u32 rnd;
	  rnd = (u32) (now * 1e6);
	  rnd = random_u32 (&rnd);

	  memcpy (args->hw_addr + 2, &rnd, sizeof (rnd));
	  args->hw_addr[0] = 2;
	  args->hw_addr[1] = 0xfe;
	}
      error = ethernet_register_interface (vnm, memif_device_class.index,
					   mif->dev_instance, args->hw_addr,
					   &mif->hw_if_index,
					   memif_eth_flag_change);
    }
  else if (mif->mode == MEMIF_INTERFACE_MODE_IP)
    {
      mif->hw_if_index =
	vnet_register_interface (vnm, memif_device_class.index,
				 mif->dev_instance,
				 memif_ip_hw_if_class.index,
				 mif->dev_instance);
    }
  else
    error = clib_error_return (0, "unsupported interface mode");

  if (error)
    {
      ret = VNET_API_ERROR_SYSCALL_ERROR_2;
      goto error;
    }

  sw = vnet_get_hw_sw_interface (vnm, mif->hw_if_index);
  mif->sw_if_index = sw->sw_if_index;

  mif->cfg.log2_ring_size = args->log2_ring_size;
  mif->cfg.buffer_size = args->buffer_size;
  mif->cfg.num_s2m_rings =
    args->is_master ? args->rx_queues : args->tx_queues;
  mif->cfg.num_m2s_rings =
    args->is_master ? args->tx_queues : args->rx_queues;

  args->sw_if_index = mif->sw_if_index;

  /* If this is new one, start listening */
  if (msf->is_listener && msf->ref_cnt == 0)
    {
      struct stat file_stat;
      clib_socket_t *s = clib_mem_alloc (sizeof (clib_socket_t));

      ASSERT (msf->sock == 0);
      msf->sock = s;

      memset (s, 0, sizeof (clib_socket_t));
      s->config = (char *) msf->filename;
      s->flags = CLIB_SOCKET_F_IS_SERVER |
	CLIB_SOCKET_F_ALLOW_GROUP_WRITE |
	CLIB_SOCKET_F_SEQPACKET | CLIB_SOCKET_F_PASSCRED;

      if ((error = clib_socket_init (s)))
	{
	  ret = VNET_API_ERROR_SYSCALL_ERROR_4;
	  goto error;
	}

      if (stat ((char *) msf->filename, &file_stat) == -1)
	{
	  ret = VNET_API_ERROR_SYSCALL_ERROR_8;
	  goto error;
	}

      clib_file_t template = { 0 };
      template.read_function = memif_conn_fd_accept_ready;
      template.file_descriptor = msf->sock->fd;
      template.private_data = mif->socket_file_index;
      template.description = format (0, "memif listener %s", msf->filename);
      memif_file_add (&msf->sock->private_data, &template);
    }

  msf->ref_cnt++;

  if (args->is_master == 0)
    {
      mif->flags |= MEMIF_IF_FLAG_IS_SLAVE;
      if (args->is_zero_copy)
	mif->flags |= MEMIF_IF_FLAG_ZERO_COPY;
    }

  hw = vnet_get_hw_interface (vnm, mif->hw_if_index);
  hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE;
  vnet_hw_interface_set_input_node (vnm, mif->hw_if_index,
				    memif_input_node.index);

  mhash_set (&msf->dev_instance_by_id, &mif->id, mif->dev_instance, 0);

  if (pool_elts (mm->interfaces) == 1)
    {
      vlib_process_signal_event (vm, memif_process_node.index,
				 MEMIF_PROCESS_EVENT_START, 0);
    }
  goto done;

error:
  /* memif_delete_if() brings the interface down and removes the hardware
     interface itself when hw_if_index is valid, so the index is handed over
     untouched instead of being deleted and invalidated here first.
     NOTE(review): this path runs before msf->ref_cnt++ while
     memif_delete_if() decrements the count -- confirm/fix the resulting
     imbalance together with memif_delete_if. */
  memif_delete_if (vm, mif);
  if (error)
    {
      vlib_log_err (mm->log_class, "%U", format_clib_error, error);
      clib_error_free (error);
    }
  return ret;

done:
  /* early exits may carry an error object (e.g. "File exists"): report and
     release it rather than leaking it */
  if (error)
    {
      vlib_log_err (mm->log_class, "%U", format_clib_error, error);
      clib_error_free (error);
    }
  return rv;
}

/*
 * Plugin init function: resets the global memif state, registers the
 * "memif_plugin" log class, hooks up the binary API and seeds the socket
 * filename table.  Always returns 0.
 */
static clib_error_t *
memif_init (vlib_main_t * vm)
{
  memif_main_t *mm = &memif_main;

  /* start from a clean slate */
  memset (mm, 0, sizeof (*mm));

  mm->log_class = vlib_log_register_class ("memif_plugin", 0);
  vlib_log_debug (mm->log_class, "initialized");

  /* initialize binary API */
  memif_plugin_api_hookup (vm);

  /*
   * Seed the socket filename pool with the mapping of socket-id 0 to
   * MEMIF_DEFAULT_SOCKET_FILENAME in the default run-time directory.
   * Id 0 can be added but never deleted, which makes this mapping
   * permanent.
   */
  memif_socket_filename_add_del (1 /* is_add */ , 0 /* sock_id */ ,
				 (u8 *) MEMIF_DEFAULT_SOCKET_FILENAME);

  return 0;
}

VLIB_INIT_FUNCTION (memif_init);

/*
 * Plugin registration record: build version plus a human-readable
 * description.
 * NOTE(review): "experimetal" in the description looks like a typo for
 * "experimental"; it is a runtime string, so it is left unchanged here.
 */
/* *INDENT-OFF* */
VLIB_PLUGIN_REGISTER () = {
    .version = VPP_BUILD_VER,
    .description = "Packet Memory Interface (experimetal)",
};
/* *INDENT-ON* */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */