diff options
author | Ed Warnicke <eaw@cisco.com> | 2015-12-08 15:45:58 -0700 |
---|---|---|
committer | Ed Warnicke <eaw@cisco.com> | 2015-12-08 15:47:27 -0700 |
commit | cb9cadad578297ffd78fa8a33670bdf1ab669e7e (patch) | |
tree | 6ac2be912482cc7849a26f0ab845561c3d7f4e26 /vlib/vlib | |
parent | fb0815d4ae4bb0fe27bd9313f34b45c8593b907e (diff) |
Initial commit of vpp code. (tag: v1.0.0)
Change-Id: Ib246f1fbfce93274020ee93ce461e3d8bd8b9f17
Signed-off-by: Ed Warnicke <eaw@cisco.com>
Diffstat (limited to 'vlib/vlib')
54 files changed, 22906 insertions, 0 deletions
diff --git a/vlib/vlib/buffer.c b/vlib/vlib/buffer.c new file mode 100644 index 00000000000..4463f7fdb4f --- /dev/null +++ b/vlib/vlib/buffer.c @@ -0,0 +1,1435 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * buffer.c: allocate/free network buffers. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vlib/vlib.h> + +uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, vlib_buffer_t * b_first) +{ + vlib_buffer_t * b = b_first; + uword l_first = b_first->current_length; + uword l = 0; + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + { + b = vlib_get_buffer (vm, b->next_buffer); + l += b->current_length; + } + b_first->total_length_not_including_first_buffer = l; + b_first->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + return l + l_first; +} + +u8 * format_vlib_buffer (u8 * s, va_list * args) +{ + vlib_buffer_t * b = va_arg (*args, vlib_buffer_t *); + + s = format (s, "current data %d, length %d, free-list %d", + b->current_data, b->current_length, + b->free_list_index); + + if (b->flags & VLIB_BUFFER_IS_TRACED) + s = format (s, ", trace 0x%x", b->trace_index); + + if (b->flags & VLIB_BUFFER_NEXT_PRESENT) + s = format (s, ", next-buffer 0x%x", b->next_buffer); + + return s; +} + +u8 * format_vlib_buffer_and_data (u8 * s, va_list * args) +{ + vlib_buffer_t * b = va_arg (*args, vlib_buffer_t *); + + s = format (s, "%U, %U", + format_vlib_buffer, b, + format_hex_bytes, vlib_buffer_get_current (b), 64); + + return s; +} + +static u8 * format_vlib_buffer_known_state (u8 * s, va_list * args) +{ + vlib_buffer_known_state_t state = va_arg (*args, vlib_buffer_known_state_t); + char * t; + + switch (state) + { + case VLIB_BUFFER_UNKNOWN: + t = "unknown"; + break; + + case VLIB_BUFFER_KNOWN_ALLOCATED: + t = "known-allocated"; + break; + + case VLIB_BUFFER_KNOWN_FREE: + t = "known-free"; + break; + + default: + t = "invalid"; + break; + } + + return format (s, "%s", t); +} + +u8 * format_vlib_buffer_contents (u8 * s, va_list * va) +{ + vlib_main_t * vm = va_arg (*va, vlib_main_t *); + vlib_buffer_t * b = va_arg (*va, vlib_buffer_t *); + + while (1) + { + vec_add (s, vlib_buffer_get_current (b), + b->current_length); + if (! 
(b->flags & VLIB_BUFFER_NEXT_PRESENT)) + break; + b = vlib_get_buffer (vm, b->next_buffer); + } + + return s; +} + +static u8 * +vlib_validate_buffer_helper (vlib_main_t * vm, + u32 bi, + uword follow_buffer_next, + uword ** unique_hash) + +{ + vlib_buffer_t * b = vlib_get_buffer (vm, bi); + vlib_buffer_main_t * bm = vm->buffer_main; + vlib_buffer_free_list_t * fl; + + if (pool_is_free_index (bm->buffer_free_list_pool, + b->free_list_index)) + return format (0, "unknown free list 0x%x", b->free_list_index); + + fl = pool_elt_at_index (bm->buffer_free_list_pool, + b->free_list_index); + + if ((signed) b->current_data < (signed) - VLIB_BUFFER_PRE_DATA_SIZE) + return format (0, "current data %d before pre-data", b->current_data); +#if DPDK == 0 + if (b->current_data + b->current_length > fl->n_data_bytes) + return format (0, "%d-%d beyond end of buffer %d", + b->current_data, b->current_length, + fl->n_data_bytes); +#endif + + if (follow_buffer_next + && (b->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + vlib_buffer_known_state_t k; + u8 * msg, * result; + + k = vlib_buffer_is_known (vm, b->next_buffer); + if (k != VLIB_BUFFER_KNOWN_ALLOCATED) + return format (0, "next 0x%x: %U", + b->next_buffer, + format_vlib_buffer_known_state, k); + + if (unique_hash) + { + if (hash_get (*unique_hash, b->next_buffer)) + return format (0, "duplicate buffer 0x%x", b->next_buffer); + + hash_set1 (*unique_hash, b->next_buffer); + } + + msg = vlib_validate_buffer (vm, b->next_buffer, follow_buffer_next); + if (msg) + { + result = format (0, "next 0x%x: %v", b->next_buffer, msg); + vec_free (msg); + return result; + } + } + + return 0; +} + +u8 * +vlib_validate_buffer (vlib_main_t * vm, u32 bi, uword follow_buffer_next) +{ return vlib_validate_buffer_helper (vm, bi, follow_buffer_next, /* unique_hash */ 0); } + +u8 * +vlib_validate_buffers (vlib_main_t * vm, + u32 * buffers, + uword next_buffer_stride, + uword n_buffers, + vlib_buffer_known_state_t known_state, + uword follow_buffer_next) +{ 
+ uword i, * hash; + u32 bi, * b = buffers; + vlib_buffer_known_state_t k; + u8 * msg = 0, * result = 0; + + hash = hash_create (0, 0); + for (i = 0; i < n_buffers; i++) + { + bi = b[0]; + b += next_buffer_stride; + + /* Buffer is not unique. */ + if (hash_get (hash, bi)) + { + msg = format (0, "not unique"); + goto done; + } + + k = vlib_buffer_is_known (vm, bi); + if (k != known_state) + { + msg = format (0, "is %U; expected %U", + format_vlib_buffer_known_state, k, + format_vlib_buffer_known_state, known_state); + goto done; + } + + msg = vlib_validate_buffer_helper (vm, bi, follow_buffer_next, &hash); + if (msg) + goto done; + + hash_set1 (hash, bi); + } + + done: + if (msg) + { + result = format (0, "0x%x: %v", bi, msg); + vec_free (msg); + } + hash_free (hash); + return result; +} + +vlib_main_t **vlib_mains; + +/* When dubugging validate that given buffers are either known allocated + or known free. */ +static void +vlib_buffer_validate_alloc_free (vlib_main_t * vm, + u32 * buffers, + uword n_buffers, + vlib_buffer_known_state_t expected_state) +{ + u32 * b; + uword i, bi, is_free; + + if (CLIB_DEBUG == 0) + return; + + ASSERT(os_get_cpu_number() == 0); + + /* smp disaster check */ + if (vlib_mains) + ASSERT(vm == vlib_mains[0]); + + is_free = expected_state == VLIB_BUFFER_KNOWN_ALLOCATED; + b = buffers; + for (i = 0; i < n_buffers; i++) + { + vlib_buffer_known_state_t known; + + bi = b[0]; + b += 1; + known = vlib_buffer_is_known (vm, bi); + if (known != expected_state) + { + ASSERT (0); + vlib_panic_with_msg + (vm, "%s %U buffer 0x%x", + is_free ? "freeing" : "allocating", + format_vlib_buffer_known_state, known, + bi); + } + + vlib_buffer_set_known_state + (vm, bi, + is_free ? VLIB_BUFFER_KNOWN_FREE : VLIB_BUFFER_KNOWN_ALLOCATED); + } +} + +/* Aligned copy routine. */ +void +vlib_aligned_memcpy (void * _dst, void * _src, int n_bytes) +{ + vlib_copy_unit_t * dst = _dst; + vlib_copy_unit_t * src = _src; + + /* Arguments must be naturally aligned. 
*/ + ASSERT (pointer_to_uword (dst) % sizeof (dst[0]) == 0); + ASSERT (pointer_to_uword (src) % sizeof (src[0]) == 0); + ASSERT (n_bytes % sizeof (dst[0]) == 0); + + if (4 * sizeof (dst[0]) == CLIB_CACHE_LINE_BYTES) + { + CLIB_PREFETCH (dst + 0, 4 * sizeof (dst[0]), WRITE); + CLIB_PREFETCH (src + 0, 4 * sizeof (src[0]), READ); + + while (n_bytes >= 4 * sizeof (dst[0])) + { + dst += 4; + src += 4; + n_bytes -= 4 * sizeof (dst[0]); + CLIB_PREFETCH (dst, 4 * sizeof (dst[0]), WRITE); + CLIB_PREFETCH (src, 4 * sizeof (src[0]), READ); + dst[-4] = src[-4]; + dst[-3] = src[-3]; + dst[-2] = src[-2]; + dst[-1] = src[-1]; + } + } + else if (8 * sizeof (dst[0]) == CLIB_CACHE_LINE_BYTES) + { + CLIB_PREFETCH (dst + 0, 8 * sizeof (dst[0]), WRITE); + CLIB_PREFETCH (src + 0, 8 * sizeof (src[0]), READ); + + while (n_bytes >= 8 * sizeof (dst[0])) + { + dst += 8; + src += 8; + n_bytes -= 8 * sizeof (dst[0]); + CLIB_PREFETCH (dst, 8 * sizeof (dst[0]), WRITE); + CLIB_PREFETCH (src, 8 * sizeof (src[0]), READ); + dst[-8] = src[-8]; + dst[-7] = src[-7]; + dst[-6] = src[-6]; + dst[-5] = src[-5]; + dst[-4] = src[-4]; + dst[-3] = src[-3]; + dst[-2] = src[-2]; + dst[-1] = src[-1]; + } + } + else + /* Cache line size unknown: fall back to slow version. */; + + while (n_bytes > 0) + { + *dst++ = *src++; + n_bytes -= 1 * sizeof (dst[0]); + } +} + +#define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32)) + +/* Make sure we have at least given number of unaligned buffers. */ +static void +fill_unaligned (vlib_main_t * vm, + vlib_buffer_free_list_t * free_list, + uword n_unaligned_buffers) +{ + word la = vec_len (free_list->aligned_buffers); + word lu = vec_len (free_list->unaligned_buffers); + + /* Aligned come in aligned copy-sized chunks. */ + ASSERT (la % BUFFERS_PER_COPY == 0); + + ASSERT (la >= n_unaligned_buffers); + + while (lu < n_unaligned_buffers) + { + /* Copy 4 buffers from end of aligned vector to unaligned vector. 
*/ + vec_add (free_list->unaligned_buffers, + free_list->aligned_buffers + la - BUFFERS_PER_COPY, + BUFFERS_PER_COPY); + la -= BUFFERS_PER_COPY; + lu += BUFFERS_PER_COPY; + } + _vec_len (free_list->aligned_buffers) = la; +} + +/* After free aligned buffers may not contain even sized chunks. */ +static void +trim_aligned (vlib_buffer_free_list_t * f) +{ + uword l, n_trim; + + /* Add unaligned to aligned before trim. */ + l = vec_len (f->unaligned_buffers); + if (l > 0) + { + vec_add_aligned (f->aligned_buffers, f->unaligned_buffers, l, + /* align */ sizeof (vlib_copy_unit_t)); + + _vec_len (f->unaligned_buffers) = 0; + } + + /* Remove unaligned buffers from end of aligned vector and save for next trim. */ + l = vec_len (f->aligned_buffers); + n_trim = l % BUFFERS_PER_COPY; + if (n_trim) + { + /* Trim aligned -> unaligned. */ + vec_add (f->unaligned_buffers, f->aligned_buffers + l - n_trim, n_trim); + + /* Remove from aligned. */ + _vec_len (f->aligned_buffers) = l - n_trim; + } +} + +static void +merge_free_lists (vlib_buffer_free_list_t * dst, + vlib_buffer_free_list_t * src) +{ + uword l; + u32 * d; + + trim_aligned (src); + trim_aligned (dst); + + l = vec_len (src->aligned_buffers); + if (l > 0) + { + vec_add2_aligned (dst->aligned_buffers, d, l, + /* align */ sizeof (vlib_copy_unit_t)); + vlib_aligned_memcpy (d, src->aligned_buffers, l * sizeof (d[0])); + vec_free (src->aligned_buffers); + } + + l = vec_len (src->unaligned_buffers); + if (l > 0) + { + vec_add (dst->unaligned_buffers, src->unaligned_buffers, l); + vec_free (src->unaligned_buffers); + } +} + +always_inline u32 +vlib_buffer_get_free_list_with_size (vlib_main_t * vm, u32 size) +{ + vlib_buffer_main_t * bm = vm->buffer_main; + + size = vlib_buffer_round_size (size); + uword * p = hash_get (bm->free_list_by_size, size); + return p ? p[0] : ~0; +} + +/* Add buffer free list. 
*/ +static u32 +vlib_buffer_create_free_list_helper (vlib_main_t * vm, + u32 n_data_bytes, + u32 is_public, + u32 is_default, + u8 * name) +{ + vlib_buffer_main_t * bm = vm->buffer_main; + vlib_buffer_free_list_t * f; + + if (! is_default && pool_elts (bm->buffer_free_list_pool) == 0) + { + u32 default_free_free_list_index; + + default_free_free_list_index = + vlib_buffer_create_free_list_helper (vm, + /* default buffer size */ VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, + /* is_public */ 1, + /* is_default */ 1, + (u8 *) "default"); + ASSERT (default_free_free_list_index == VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + + if (n_data_bytes == VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES && is_public) + return default_free_free_list_index; + } + + pool_get_aligned (bm->buffer_free_list_pool, f, CLIB_CACHE_LINE_BYTES); + + memset (f, 0, sizeof (f[0])); + f->index = f - bm->buffer_free_list_pool; + f->n_data_bytes = vlib_buffer_round_size (n_data_bytes); + f->min_n_buffers_each_physmem_alloc = 256; + f->name = clib_mem_is_heap_object (name) ? name : format (0, "%s", name); + + /* Setup free buffer template. */ + f->buffer_init_template.free_list_index = f->index; + + if (is_public) + { + uword * p = hash_get (bm->free_list_by_size, f->n_data_bytes); + if (! p) + hash_set (bm->free_list_by_size, f->n_data_bytes, f->index); + } + + return f->index; +} + +u32 vlib_buffer_create_free_list (vlib_main_t * vm, u32 n_data_bytes, + char * fmt, ...) +{ + va_list va; + u8 * name; + + va_start (va, fmt); + name = va_format (0, fmt, &va); + va_end (va); + + return vlib_buffer_create_free_list_helper (vm, n_data_bytes, + /* is_public */ 0, + /* is_default */ 0, + name); +} + +u32 vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes, + char * fmt, ...) 
+{ + u32 i = vlib_buffer_get_free_list_with_size (vm, n_data_bytes); + + if (i == ~0) + { + va_list va; + u8 * name; + + va_start (va, fmt); + name = va_format (0, fmt, &va); + va_end (va); + + i = vlib_buffer_create_free_list_helper (vm, n_data_bytes, + /* is_public */ 1, + /* is_default */ 0, + name); + } + + return i; +} + +static void +del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) +{ + u32 i; + + for (i = 0; i < vec_len (f->buffer_memory_allocated); i++) + vm->os_physmem_free (f->buffer_memory_allocated[i]); + vec_free (f->name); + vec_free (f->buffer_memory_allocated); + vec_free (f->unaligned_buffers); + vec_free (f->aligned_buffers); +} + +/* Add buffer free list. */ +void vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) +{ + vlib_buffer_main_t * bm = vm->buffer_main; + vlib_buffer_free_list_t * f; + u32 merge_index; + + f = vlib_buffer_get_free_list (vm, free_list_index); + + ASSERT (vec_len (f->unaligned_buffers) + vec_len (f->aligned_buffers) == f->n_alloc); + merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); + if (merge_index != ~0 && merge_index != free_list_index) + { + merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool, + merge_index), f); + } + + del_free_list (vm, f); + + /* Poison it. */ + memset (f, 0xab, sizeof (f[0])); + + pool_put (bm->buffer_free_list_pool, f); +} + +/* Make sure free list has at least given number of free buffers. */ +static uword +fill_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * fl, + uword min_free_buffers) +{ + vlib_buffer_t * buffers, * b; + int n, n_bytes, i; + u32 * bi; + u32 n_remaining, n_alloc, n_this_chunk; + + trim_aligned (fl); + + /* Already have enough free buffers on free list? */ + n = min_free_buffers - vec_len (fl->aligned_buffers); + if (n <= 0) + return min_free_buffers; + + /* Always allocate round number of buffers. 
*/ + n = round_pow2 (n, BUFFERS_PER_COPY); + + /* Always allocate new buffers in reasonably large sized chunks. */ + n = clib_max (n, fl->min_n_buffers_each_physmem_alloc); + + n_remaining = n; + n_alloc = 0; + while (n_remaining > 0) + { + n_this_chunk = clib_min (n_remaining, 16); + + n_bytes = n_this_chunk * (sizeof (b[0]) + fl->n_data_bytes); + + /* drb: removed power-of-2 ASSERT */ + buffers = vm->os_physmem_alloc_aligned (&vm->physmem_main, + n_bytes, sizeof (vlib_buffer_t)); + if (! buffers) + return n_alloc; + + /* Record chunk as being allocated so we can free it later. */ + vec_add1 (fl->buffer_memory_allocated, buffers); + + fl->n_alloc += n_this_chunk; + n_alloc += n_this_chunk; + n_remaining -= n_this_chunk; + + b = buffers; + vec_add2_aligned (fl->aligned_buffers, bi, n_this_chunk, + sizeof (vlib_copy_unit_t)); + for (i = 0; i < n_this_chunk; i++) + { + bi[i] = vlib_get_buffer_index (vm, b); + + if (CLIB_DEBUG > 0) + vlib_buffer_set_known_state (vm, bi[i], VLIB_BUFFER_KNOWN_FREE); + b = vlib_buffer_next_contiguous (b, fl->n_data_bytes); + } + + memset (buffers, 0, n_bytes); + + /* Initialize all new buffers. 
*/ + b = buffers; + for (i = 0; i < n_this_chunk; i++) + { + vlib_buffer_init_for_free_list (b, fl); + b = vlib_buffer_next_contiguous (b, fl->n_data_bytes); + } + + if (fl->buffer_init_function) + fl->buffer_init_function (vm, fl, bi, n_this_chunk); + } + return n_alloc; +} + +always_inline uword +copy_alignment (u32 * x) +{ return (pointer_to_uword (x) / sizeof (x[0])) % BUFFERS_PER_COPY; } + +static u32 +alloc_from_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * free_list, + u32 * alloc_buffers, + u32 n_alloc_buffers) +{ + u32 * dst, * u_src; + uword u_len, n_left; + uword n_unaligned_start, n_unaligned_end, n_filled; + + ASSERT(os_get_cpu_number() == 0); + + n_left = n_alloc_buffers; + dst = alloc_buffers; + n_unaligned_start = ((BUFFERS_PER_COPY - copy_alignment (dst)) + & (BUFFERS_PER_COPY - 1)); + + n_filled = fill_free_list (vm, free_list, n_alloc_buffers); + if (n_filled == 0) + return 0; + + n_left = n_filled < n_left ? n_filled : n_left; + n_alloc_buffers = n_left; + + if (n_unaligned_start >= n_left) + { + n_unaligned_start = n_left; + n_unaligned_end = 0; + } + else + n_unaligned_end = copy_alignment (dst + n_alloc_buffers); + + fill_unaligned (vm, free_list, n_unaligned_start + n_unaligned_end); + + u_len = vec_len (free_list->unaligned_buffers); + u_src = free_list->unaligned_buffers + u_len - 1; + + if (n_unaligned_start) + { + uword n_copy = n_unaligned_start; + if (n_copy > n_left) + n_copy = n_left; + n_left -= n_copy; + + while (n_copy > 0) + { + *dst++ = *u_src--; + n_copy--; + u_len--; + } + + /* Now dst should be aligned. */ + if (n_left > 0) + ASSERT (pointer_to_uword (dst) % sizeof (vlib_copy_unit_t) == 0); + } + + /* Aligned copy. */ + { + vlib_copy_unit_t * d, * s; + uword n_copy; + + if (vec_len(free_list->aligned_buffers) < ((n_left/BUFFERS_PER_COPY)*BUFFERS_PER_COPY)) + abort(); + + n_copy = n_left / BUFFERS_PER_COPY; + n_left = n_left % BUFFERS_PER_COPY; + + /* Remove buffers from aligned free list. 
*/ + _vec_len (free_list->aligned_buffers) -= n_copy * BUFFERS_PER_COPY; + + s = (vlib_copy_unit_t *) vec_end (free_list->aligned_buffers); + d = (vlib_copy_unit_t *) dst; + + /* Fast path loop. */ + while (n_copy >= 4) + { + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + n_copy -= 4; + s += 4; + d += 4; + } + + while (n_copy >= 1) + { + d[0] = s[0]; + n_copy -= 1; + s += 1; + d += 1; + } + + dst = (void *) d; + } + + /* Unaligned copy. */ + ASSERT (n_unaligned_end == n_left); + while (n_left > 0) + { + *dst++ = *u_src--; + n_left--; + u_len--; + } + + if (! free_list->unaligned_buffers) + ASSERT (u_len == 0); + else + _vec_len (free_list->unaligned_buffers) = u_len; + + /* Verify that buffers are known free. */ + vlib_buffer_validate_alloc_free (vm, alloc_buffers, + n_alloc_buffers, + VLIB_BUFFER_KNOWN_FREE); + + return n_alloc_buffers; +} + +/* Allocate a given number of buffers into given array. + Returns number actually allocated which will be either zero or + number requested. 
*/ +u32 vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_main_t * bm = vm->buffer_main; + ASSERT(os_get_cpu_number() == 0); + + return alloc_from_free_list + (vm, + pool_elt_at_index (bm->buffer_free_list_pool, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX), + buffers, n_buffers); +} + +u32 vlib_buffer_alloc_from_free_list (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, + u32 free_list_index) +{ + vlib_buffer_main_t * bm = vm->buffer_main; + vlib_buffer_free_list_t * f; + f = pool_elt_at_index (bm->buffer_free_list_pool, free_list_index); + return alloc_from_free_list (vm, f, buffers, n_buffers); +} + +always_inline void +add_buffer_to_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * f, + u32 buffer_index, u8 do_init) +{ + vlib_buffer_t * b; + b = vlib_get_buffer (vm, buffer_index); + if (PREDICT_TRUE(do_init)) + vlib_buffer_init_for_free_list (b, f); + vec_add1_aligned (f->aligned_buffers, buffer_index, sizeof (vlib_copy_unit_t)); +} + +always_inline vlib_buffer_free_list_t * +buffer_get_free_list (vlib_main_t * vm, vlib_buffer_t * b, u32 * index) +{ + vlib_buffer_main_t * bm = vm->buffer_main; + u32 i; + + *index = i = b->free_list_index; + return pool_elt_at_index (bm->buffer_free_list_pool, i); +} + +void *vlib_set_buffer_free_callback (vlib_main_t *vm, void *fp) +{ + vlib_buffer_main_t * bm = vm->buffer_main; + void * rv = bm->buffer_free_callback; + + bm->buffer_free_callback = fp; + return rv; +} + +void vnet_buffer_free_dpdk_mb (vlib_buffer_t * b) __attribute__ ((weak)); +void vnet_buffer_free_dpdk_mb (vlib_buffer_t * b) { } + +static_always_inline void +vlib_buffer_free_inline (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, + u32 follow_buffer_next) +{ + vlib_buffer_main_t * bm = vm->buffer_main; + vlib_buffer_free_list_t * fl; + static u32 * next_to_free[2]; /* smp bad */ + u32 i_next_to_free, * b, * n, * f, fi; + uword n_left; + int i; + static vlib_buffer_free_list_t ** announce_list; + 
vlib_buffer_free_list_t * fl0 = 0, * fl1 = 0; + u32 bi0=(u32)~0, bi1=(u32)~0, fi0, fi1 = (u32)~0; + u8 free0, free1=0, free_next0, free_next1; + u32 (*cb)(vlib_main_t * vm, u32 * buffers, u32 n_buffers, + u32 follow_buffer_next); + + ASSERT(os_get_cpu_number() == 0); + + cb = bm->buffer_free_callback; + + if (PREDICT_FALSE (cb != 0)) + n_buffers = (*cb)(vm, buffers, n_buffers, follow_buffer_next); + + if (! n_buffers) + return; + + /* Use first buffer to get default free list. */ + { + u32 bi0 = buffers[0]; + vlib_buffer_t * b0; + + b0 = vlib_get_buffer (vm, bi0); + fl = buffer_get_free_list (vm, b0, &fi); + if (fl->buffers_added_to_freelist_function) + vec_add1 (announce_list, fl); + } + + vec_validate (next_to_free[0], n_buffers - 1); + vec_validate (next_to_free[1], n_buffers - 1); + + i_next_to_free = 0; + n_left = n_buffers; + b = buffers; + + again: + /* Verify that buffers are known allocated. */ + vlib_buffer_validate_alloc_free (vm, b, + n_left, + VLIB_BUFFER_KNOWN_ALLOCATED); + + vec_add2_aligned (fl->aligned_buffers, f, n_left, + /* align */ sizeof (vlib_copy_unit_t)); + + n = next_to_free[i_next_to_free]; + while (n_left >= 4) + { + vlib_buffer_t * b0, * b1, * binit0, * binit1, dummy_buffers[2]; + + bi0 = b[0]; + bi1 = b[1]; + + f[0] = bi0; + f[1] = bi1; + f += 2; + b += 2; + n_left -= 2; + + /* Prefetch buffers for next iteration. */ + vlib_prefetch_buffer_with_index (vm, b[0], WRITE); + vlib_prefetch_buffer_with_index (vm, b[1], WRITE); + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + free0 = b0->clone_count == 0; + free1 = b1->clone_count == 0; + + /* Must be before init which will over-write buffer flags. 
*/ + if (follow_buffer_next) + { + n[0] = b0->next_buffer; + free_next0 = free0 && (b0->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; + n += free_next0; + + n[0] = b1->next_buffer; + free_next1 = free1 && (b1->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; + n += free_next1; + } + else + free_next0 = free_next1 = 0; + + /* Must be before init which will over-write buffer free list. */ + fi0 = b0->free_list_index; + fi1 = b1->free_list_index; + + if (PREDICT_FALSE (fi0 != fi || fi1 != fi)) + goto slow_path_x2; + + binit0 = free0 ? b0 : &dummy_buffers[0]; + binit1 = free1 ? b1 : &dummy_buffers[1]; + + vlib_buffer_init_two_for_free_list (binit0, binit1, fl); + continue; + + slow_path_x2: + /* Backup speculation. */ + f -= 2; + n -= free_next0 + free_next1; + + _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + + fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); + fl1 = pool_elt_at_index (bm->buffer_free_list_pool, fi1); + + add_buffer_to_free_list (vm, fl0, bi0, free0); + if (PREDICT_FALSE(fl0->buffers_added_to_freelist_function != 0)) + { + int i; + for (i = 0; i < vec_len (announce_list); i++) + if (fl0 == announce_list[i]) + goto no_fl0; + vec_add1(announce_list, fl0); + } + no_fl0: + if (PREDICT_FALSE(fl1->buffers_added_to_freelist_function != 0)) + { + int i; + for (i = 0; i < vec_len (announce_list); i++) + if (fl1 == announce_list[i]) + goto no_fl1; + vec_add1(announce_list, fl1); + } + + no_fl1: + add_buffer_to_free_list (vm, fl1, bi1, free1); + + /* Possibly change current free list. 
*/ + if (fi0 != fi && fi1 != fi) + { + fi = fi1; + fl = pool_elt_at_index (bm->buffer_free_list_pool, fi); + } + + vec_add2_aligned (fl->aligned_buffers, f, n_left, + /* align */ sizeof (vlib_copy_unit_t)); + } + + while (n_left >= 1) + { + vlib_buffer_t * b0, * binit0, dummy_buffers[1]; + + bi0 = b[0]; + f[0] = bi0; + f += 1; + b += 1; + n_left -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + free0 = b0->clone_count == 0; + + /* Must be before init which will over-write buffer flags. */ + if (follow_buffer_next) + { + n[0] = b0->next_buffer; + free_next0 = free0 && (b0->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; + n += free_next0; + } + else + free_next0 = 0; + + /* Must be before init which will over-write buffer free list. */ + fi0 = b0->free_list_index; + + if (PREDICT_FALSE (fi0 != fi)) + goto slow_path_x1; + + binit0 = free0 ? b0 : &dummy_buffers[0]; + + vlib_buffer_init_for_free_list (binit0, fl); + continue; + + slow_path_x1: + /* Backup speculation. */ + f -= 1; + n -= free_next0; + + _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + + fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); + + add_buffer_to_free_list (vm, fl0, bi0, free0); + if (PREDICT_FALSE(fl0->buffers_added_to_freelist_function != 0)) + { + int i; + for (i = 0; i < vec_len (announce_list); i++) + if (fl0 == announce_list[i]) + goto no_fl00; + vec_add1(announce_list, fl0); + } + + no_fl00: + fi = fi0; + fl = pool_elt_at_index (bm->buffer_free_list_pool, fi); + + vec_add2_aligned (fl->aligned_buffers, f, n_left, + /* align */ sizeof (vlib_copy_unit_t)); + } + + if (follow_buffer_next && ((n_left = n - next_to_free[i_next_to_free]) > 0)) + { + b = next_to_free[i_next_to_free]; + i_next_to_free ^= 1; + goto again; + } + + _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + + if (vec_len(announce_list)) + { + vlib_buffer_free_list_t * fl; + for (i = 0; i < vec_len (announce_list); i++) + { + fl = announce_list[i]; + fl->buffers_added_to_freelist_function (vm, fl); + } + 
_vec_len(announce_list) = 0; + } +} + +void vlib_buffer_free (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers) +{ + vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ 1); +} + +void vlib_buffer_free_no_next (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers) +{ + vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ 0); +} + +/* Copy template packet data into buffers as they are allocated. */ +static void +vlib_packet_template_buffer_init (vlib_main_t * vm, + vlib_buffer_free_list_t * fl, + u32 * buffers, + u32 n_buffers) +{ + vlib_packet_template_t * t = uword_to_pointer (fl->buffer_init_function_opaque, + vlib_packet_template_t *); + uword i; + + for (i = 0; i < n_buffers; i++) + { + vlib_buffer_t * b = vlib_get_buffer (vm, buffers[i]); + ASSERT (b->current_length == vec_len (t->packet_data)); + memcpy (vlib_buffer_get_current (b), t->packet_data, b->current_length); + } +} + +void vlib_packet_template_init (vlib_main_t * vm, + vlib_packet_template_t * t, + void * packet_data, + uword n_packet_data_bytes, + uword min_n_buffers_each_physmem_alloc, + char * fmt, + ...) 
+{ + vlib_buffer_free_list_t * fl; + va_list va; + u8 * name; + + va_start (va, fmt); + name = va_format (0, fmt, &va); + va_end (va); + + memset (t, 0, sizeof (t[0])); + + vec_add (t->packet_data, packet_data, n_packet_data_bytes); + t->min_n_buffers_each_physmem_alloc = min_n_buffers_each_physmem_alloc; + + t->free_list_index = vlib_buffer_create_free_list_helper + (vm, n_packet_data_bytes, + /* is_public */ 1, + /* is_default */ 0, + name); + + ASSERT (t->free_list_index != 0); + fl = vlib_buffer_get_free_list (vm, t->free_list_index); + fl->min_n_buffers_each_physmem_alloc = t->min_n_buffers_each_physmem_alloc; + + fl->buffer_init_function = vlib_packet_template_buffer_init; + fl->buffer_init_function_opaque = pointer_to_uword (t); + + fl->buffer_init_template.current_data = 0; + fl->buffer_init_template.current_length = n_packet_data_bytes; + fl->buffer_init_template.flags = 0; +} + +void * +vlib_packet_template_get_packet (vlib_main_t * vm, + vlib_packet_template_t * t, + u32 * bi_result) +{ + u32 bi; + vlib_buffer_t * b; + + if (vlib_buffer_alloc (vm, &bi, 1) != 1) + return 0; + + *bi_result = bi; + + b = vlib_get_buffer (vm, bi); + memcpy (vlib_buffer_get_current (b), + t->packet_data, vec_len(t->packet_data)); + b->current_length = vec_len(t->packet_data); + + return b->data; +} + +void vlib_packet_template_get_packet_helper (vlib_main_t * vm, vlib_packet_template_t * t) +{ + word n = t->min_n_buffers_each_physmem_alloc; + word l = vec_len (t->packet_data); + word n_alloc; + + ASSERT (l > 0); + ASSERT (vec_len (t->free_buffers) == 0); + + vec_validate (t->free_buffers, n - 1); + n_alloc = vlib_buffer_alloc_from_free_list (vm, t->free_buffers, + n, t->free_list_index); + _vec_len (t->free_buffers) = n_alloc; +} + +/* Append given data to end of buffer, possibly allocating new buffers. 
*/ +u32 vlib_buffer_add_data (vlib_main_t * vm, + u32 free_list_index, + u32 buffer_index, + void * data, u32 n_data_bytes) +{ + u32 n_buffer_bytes, n_left, n_left_this_buffer, bi; + vlib_buffer_t * b; + void * d; + + bi = buffer_index; + if (bi == 0 + && 1 != vlib_buffer_alloc_from_free_list (vm, &bi, 1, free_list_index)) + goto out_of_buffers; + + d = data; + n_left = n_data_bytes; + n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, free_list_index); + + b = vlib_get_buffer (vm, bi); + b->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID; + + /* Get to the end of the chain before we try to append data...*/ + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + b = vlib_get_buffer (vm, b->next_buffer); + + while (1) + { + u32 n; + + ASSERT (n_buffer_bytes >= b->current_length); + n_left_this_buffer = n_buffer_bytes - (b->current_data + b->current_length); + n = clib_min (n_left_this_buffer, n_left); + memcpy (vlib_buffer_get_current (b) + b->current_length, d, n); + b->current_length += n; + n_left -= n; + if (n_left == 0) + break; + + d += n; + if (1 != vlib_buffer_alloc_from_free_list (vm, &b->next_buffer, 1, free_list_index)) + goto out_of_buffers; + + b->flags |= VLIB_BUFFER_NEXT_PRESENT; + + b = vlib_get_buffer (vm, b->next_buffer); + } + + return bi; + + out_of_buffers: + clib_error ("out of buffers"); + return bi; +} + +static void vlib_serialize_tx (serialize_main_header_t * m, serialize_stream_t * s) +{ + vlib_main_t * vm; + vlib_serialize_buffer_main_t * sm; + uword n, n_bytes_to_write; + vlib_buffer_t * last; + + n_bytes_to_write = s->current_buffer_index; + sm = uword_to_pointer (s->data_function_opaque, vlib_serialize_buffer_main_t *); + vm = sm->vlib_main; + + ASSERT (sm->tx.max_n_data_bytes_per_chain > 0); + if (serialize_stream_is_end_of_stream (s) + || sm->tx.n_total_data_bytes + n_bytes_to_write > sm->tx.max_n_data_bytes_per_chain) + { + vlib_process_t * p = vlib_get_current_process (vm); + + last = vlib_get_buffer (vm, sm->last_buffer); + 
last->current_length = n_bytes_to_write; + + vlib_set_next_frame_buffer (vm, &p->node_runtime, sm->tx.next_index, sm->first_buffer); + + sm->first_buffer = sm->last_buffer = ~0; + sm->tx.n_total_data_bytes = 0; + } + + else if (n_bytes_to_write == 0 && s->n_buffer_bytes == 0) + { + ASSERT (sm->first_buffer == ~0); + ASSERT (sm->last_buffer == ~0); + n = vlib_buffer_alloc_from_free_list (vm, &sm->first_buffer, 1, sm->tx.free_list_index); + if (n != 1) + serialize_error (m, clib_error_create ("vlib_buffer_alloc_from_free_list fails")); + sm->last_buffer = sm->first_buffer; + s->n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, sm->tx.free_list_index); + } + + if (n_bytes_to_write > 0) + { + vlib_buffer_t * prev = vlib_get_buffer (vm, sm->last_buffer); + n = vlib_buffer_alloc_from_free_list (vm, &sm->last_buffer, 1, sm->tx.free_list_index); + if (n != 1) + serialize_error (m, clib_error_create ("vlib_buffer_alloc_from_free_list fails")); + sm->tx.n_total_data_bytes += n_bytes_to_write; + prev->current_length = n_bytes_to_write; + prev->next_buffer = sm->last_buffer; + prev->flags |= VLIB_BUFFER_NEXT_PRESENT; + } + + if (sm->last_buffer != ~0) + { + last = vlib_get_buffer (vm, sm->last_buffer); + s->buffer = vlib_buffer_get_current (last); + s->current_buffer_index = 0; + ASSERT (last->current_data == s->current_buffer_index); + } +} + +static void vlib_serialize_rx (serialize_main_header_t * m, serialize_stream_t * s) +{ + vlib_main_t * vm; + vlib_serialize_buffer_main_t * sm; + vlib_buffer_t * last; + + sm = uword_to_pointer (s->data_function_opaque, vlib_serialize_buffer_main_t *); + vm = sm->vlib_main; + + if (serialize_stream_is_end_of_stream (s)) + return; + + if (sm->last_buffer != ~0) + { + last = vlib_get_buffer (vm, sm->last_buffer); + + if (last->flags & VLIB_BUFFER_NEXT_PRESENT) + sm->last_buffer = last->next_buffer; + else + { + vlib_buffer_free (vm, &sm->first_buffer, /* count */ 1); + sm->first_buffer = sm->last_buffer = ~0; + } + } + + if 
(sm->last_buffer == ~0) + { + while (clib_fifo_elts (sm->rx.buffer_fifo) == 0) + { + sm->rx.ready_one_time_event = vlib_process_create_one_time_event (vm, vlib_current_process (vm), ~0); + vlib_process_wait_for_one_time_event (vm, /* no event data */ 0, sm->rx.ready_one_time_event); + } + + clib_fifo_sub1 (sm->rx.buffer_fifo, sm->first_buffer); + sm->last_buffer = sm->first_buffer; + } + + ASSERT (sm->last_buffer != ~0); + + last = vlib_get_buffer (vm, sm->last_buffer); + s->current_buffer_index = 0; + s->buffer = vlib_buffer_get_current (last); + s->n_buffer_bytes = last->current_length; +} + +static void +serialize_open_vlib_helper (serialize_main_t * m, + vlib_main_t * vm, + vlib_serialize_buffer_main_t * sm, + uword is_read) +{ + /* Initialize serialize main but save overflow buffer for re-use between calls. */ + { + u8 * save = m->stream.overflow_buffer; + memset (m, 0, sizeof (m[0])); + m->stream.overflow_buffer = save; + if (save) + _vec_len (save) = 0; + } + + sm->first_buffer = sm->last_buffer = ~0; + if (is_read) + clib_fifo_reset (sm->rx.buffer_fifo); + else + sm->tx.n_total_data_bytes = 0; + sm->vlib_main = vm; + m->header.data_function = is_read ? 
vlib_serialize_rx : vlib_serialize_tx; + m->stream.data_function_opaque = pointer_to_uword (sm); +} + +void serialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, vlib_serialize_buffer_main_t * sm) +{ serialize_open_vlib_helper (m, vm, sm, /* is_read */ 0); } + +void unserialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, vlib_serialize_buffer_main_t * sm) +{ serialize_open_vlib_helper (m, vm, sm, /* is_read */ 1); } + +u32 serialize_close_vlib_buffer (serialize_main_t * m) +{ + vlib_serialize_buffer_main_t * sm + = uword_to_pointer (m->stream.data_function_opaque, vlib_serialize_buffer_main_t *); + vlib_buffer_t * last; + serialize_stream_t * s = &m->stream; + + last = vlib_get_buffer (sm->vlib_main, sm->last_buffer); + last->current_length = s->current_buffer_index; + + if (vec_len (s->overflow_buffer) > 0) + { + sm->last_buffer + = vlib_buffer_add_data (sm->vlib_main, sm->tx.free_list_index, + sm->last_buffer == ~0 ? 0 : sm->last_buffer, + s->overflow_buffer, + vec_len (s->overflow_buffer)); + _vec_len (s->overflow_buffer) = 0; + } + + return sm->first_buffer; +} + +void unserialize_close_vlib_buffer (serialize_main_t * m) +{ + vlib_serialize_buffer_main_t * sm + = uword_to_pointer (m->stream.data_function_opaque, vlib_serialize_buffer_main_t *); + if (sm->first_buffer != ~0) + vlib_buffer_free_one (sm->vlib_main, sm->first_buffer); + clib_fifo_reset (sm->rx.buffer_fifo); + if (m->stream.overflow_buffer) + _vec_len (m->stream.overflow_buffer) = 0; +} + +static u8 * format_vlib_buffer_free_list (u8 * s, va_list * va) +{ + vlib_buffer_free_list_t * f = va_arg (*va, vlib_buffer_free_list_t *); + uword bytes_alloc, bytes_free, n_free, size; + + if (! 
f) + return format (s, "%=30s%=12s%=12s%=12s%=12s%=12s%=12s", + "Name", "Index", "Size", "Alloc", "Free", "#Alloc", "#Free"); + + size = sizeof (vlib_buffer_t) + f->n_data_bytes; + n_free = vec_len (f->aligned_buffers) + vec_len (f->unaligned_buffers); + bytes_alloc = size * f->n_alloc; + bytes_free = size * n_free; + + s = format (s, "%30s%12d%12d%=12U%=12U%=12d%=12d", + f->name, f->index, f->n_data_bytes, + format_memory_size, bytes_alloc, + format_memory_size, bytes_free, + f->n_alloc, n_free); + + return s; +} + +static clib_error_t * +show_buffers (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_buffer_main_t * bm = vm->buffer_main; + vlib_buffer_free_list_t * f; + + vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, 0); + pool_foreach (f, bm->buffer_free_list_pool, ({ + vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, f); + })); + + return 0; +} + +VLIB_CLI_COMMAND (show_buffers_command, static) = { + .path = "show buffers", + .short_help = "Show packet buffer allocation", + .function = show_buffers, +}; + diff --git a/vlib/vlib/buffer.h b/vlib/vlib/buffer.h new file mode 100644 index 00000000000..6322481b696 --- /dev/null +++ b/vlib/vlib/buffer.h @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * buffer.h: VLIB buffers + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_buffer_h +#define included_vlib_buffer_h + +#include <vppinfra/types.h> +#include <vppinfra/cache.h> +#include <vppinfra/serialize.h> +#include <vppinfra/vector.h> +#include <vlib/error.h> /* for vlib_error_t */ +#include <vlib/config.h> /* for __PRE_DATA_SIZE */ + +#ifdef CLIB_HAVE_VEC128 +typedef u8x16 vlib_copy_unit_t; +#else +typedef uword vlib_copy_unit_t; +#endif + +/** \file + vlib buffer structure definition and a few select + access methods. This structure and the buffer allocation + mechanism should perhaps live in vnet, but it would take a lot + of typing to make it so. +*/ + +/* VLIB buffer representation. */ +typedef struct { + /* Offset within data[] that we are currently processing. + If negative current header points into predata area. 
*/ + i16 current_data; /**< signed offset in data[], pre_data[] + that we are currently processing. + If negative current header points into predata area. + */ + u16 current_length; /**< Nbytes between current data and + the end of this buffer. + */ + u32 flags; /**< buffer flags: + <br> VLIB_BUFFER_IS_TRACED: trace this buffer. + <br> VLIB_BUFFER_NEXT_PRESENT: this is a multi-chunk buffer. + <br> VLIB_BUFFER_TOTAL_LENGTH_VALID: as it says + <br> VLIB_BUFFER_REPL_FAIL: packet replication failure + <br> VLIB_BUFFER_FLAG_USER(n): user-defined bit N + */ +#define VLIB_BUFFER_IS_TRACED (1 << 0) +#define VLIB_BUFFER_LOG2_NEXT_PRESENT (1) +#define VLIB_BUFFER_NEXT_PRESENT (1 << VLIB_BUFFER_LOG2_NEXT_PRESENT) +#define VLIB_BUFFER_IS_RECYCLED (1 << 2) +#define VLIB_BUFFER_TOTAL_LENGTH_VALID (1 << 3) +#define VLIB_BUFFER_HGSHM_USER_INDEX_VALID (1 << 4) +#define VLIB_BUFFER_REPL_FAIL (1 << 5) + + /* User defined buffer flags. */ +#define LOG2_VLIB_BUFFER_FLAG_USER(n) (32 - (n)) +#define VLIB_BUFFER_FLAG_USER(n) (1 << LOG2_VLIB_BUFFER_FLAG_USER(n)) + + u32 free_list_index; /**< Buffer free list that this buffer was + allocated from and will be freed to. + */ + + u32 total_length_not_including_first_buffer; + /**< Only valid for first buffer in chain. Current length plus + total length given here give total number of bytes in buffer chain. + */ + + + u32 next_buffer; /**< Next buffer for this linked-list of buffers. + Only valid if VLIB_BUFFER_NEXT_PRESENT flag is set. + */ + + u32 trace_index; /**< Specifies index into trace buffer + if VLIB_PACKET_IS_TRACED flag is set. + */ + + + u32 clone_count; /**< Specifies whether this buffer should be + reinitialized when freed. It will be reinitialized + if the value is 0. This field can be used + as a counter or for other state during packet + replication. The buffer free function does not + modify this value. + */ + + vlib_error_t error; /**< Error code for buffers to be enqueued + to error handler. 
+ */ + + u32 opaque[8]; /**< Opaque data used by sub-graphs for their own purposes. + See .../vnet/vnet/buffer.h + */ + /***** end of first cache line */ + + u32 opaque2[16]; /**< More opaque data, in its own cache line */ + + /***** end of second cache line */ + u8 pre_data [__PRE_DATA_SIZE]; /**< Space for inserting data + before buffer start. + Packet rewrite string will be + rewritten backwards and may extend + back before buffer->data[0]. + Must come directly before packet data. + */ + +#define VLIB_BUFFER_PRE_DATA_SIZE (ARRAY_LEN (((vlib_buffer_t *)0)->pre_data)) + u8 data[0]; /**< Packet data. Hardware DMA here */ +} vlib_buffer_t; /* Must be a multiple of 64B. */ + +/** \brief Prefetch buffer metadata. + The first 64 bytes of buffer contains most header information + + @param b - (vlib_buffer_t *) pointer to the buffer + @param type - LOAD, STORE. In most cases, STORE is the right answer +*/ + +#define vlib_prefetch_buffer_header(b,type) CLIB_PREFETCH (b, 64, type) + +always_inline vlib_buffer_t * +vlib_buffer_next_contiguous (vlib_buffer_t * b, u32 buffer_bytes) +{ return (void *) (b + 1) + buffer_bytes; } + +always_inline void +vlib_buffer_struct_is_sane (vlib_buffer_t * b) +{ + ASSERT (sizeof (b[0]) % 64 == 0); + + /* Rewrite data must be before and contiguous with packet data. */ + ASSERT (b->pre_data + VLIB_BUFFER_PRE_DATA_SIZE == b->data); +} + +/** \brief Get pointer to current data to process + + @param b - (vlib_buffer_t *) pointer to the buffer + @return - (void *) (b->data + b->current_data) +*/ + +always_inline void * +vlib_buffer_get_current (vlib_buffer_t * b) +{ + /* Check bounds. */ + ASSERT ((signed) b->current_data >= (signed) -VLIB_BUFFER_PRE_DATA_SIZE); + return b->data + b->current_data; +} + +/** \brief Advance current data pointer by the supplied (signed!) 
amount + + @param b - (vlib_buffer_t *) pointer to the buffer + @param l - (word) signed increment +*/ +always_inline void +vlib_buffer_advance (vlib_buffer_t * b, word l) +{ + ASSERT (b->current_length >= l); + b->current_data += l; + b->current_length -= l; +} + +/** \brief Reset current header & length to state they were in when + packet was received. + + @param b - (vlib_buffer_t *) pointer to the buffer +*/ + +always_inline void +vlib_buffer_reset (vlib_buffer_t * b) +{ + b->current_length += clib_max (b->current_data, 0); + b->current_data = 0; +} + +/** \brief Get pointer to buffer's opaque data array + + @param b - (vlib_buffer_t *) pointer to the buffer + @return - (void *) b->opaque +*/ +always_inline void * +vlib_get_buffer_opaque (vlib_buffer_t * b) +{ return (void *) b->opaque; } + +/** \brief Get pointer to buffer's opaque2 data array + + @param b - (vlib_buffer_t *) pointer to the buffer + @return - (void *) b->opaque2 +*/ +always_inline void * +vlib_get_buffer_opaque2 (vlib_buffer_t * b) +{ return (void *) b->opaque2; } + +/* Forward declaration. */ +struct vlib_main_t; + +typedef struct vlib_buffer_free_list_t { + /* Template buffer used to initialize first 16 bytes of buffers + allocated on this free list. */ + vlib_buffer_t buffer_init_template; + + /* Our index into vlib_main_t's buffer_free_list_pool. */ + u32 index; + + /* Number of data bytes for buffers in this free list. */ + u32 n_data_bytes; + + /* Number of buffers to allocate when we need to allocate new buffers + from physmem heap. */ + u32 min_n_buffers_each_physmem_alloc; + + /* Total number of buffers allocated from this free list. */ + u32 n_alloc; + + /* Vector of free buffers. Each element is a byte offset into I/O heap. + Aligned vectors always has naturally aligned vlib_copy_unit_t sized chunks + of buffer indices. Unaligned vector has any left over. This is meant to + speed up copy routines. 
*/ + u32 * aligned_buffers, * unaligned_buffers; + + /* Memory chunks allocated for this free list + recorded here so they can be freed when free list + is deleted. */ + void ** buffer_memory_allocated; + + /* Free list name. */ + u8 * name; + + /* Callback functions to initialize newly allocated buffers. + If null buffers are zeroed. */ + void (* buffer_init_function) (struct vlib_main_t * vm, + struct vlib_buffer_free_list_t * fl, + u32 * buffers, u32 n_buffers); + + /* Callback function to announce that buffers have been + added to the freelist */ + void (* buffers_added_to_freelist_function) + (struct vlib_main_t * vm, + struct vlib_buffer_free_list_t * fl); + + uword buffer_init_function_opaque; +} __attribute__ ((aligned (16))) vlib_buffer_free_list_t; + +typedef struct { + /* Buffer free callback, for subversive activities */ + u32 (*buffer_free_callback) (struct vlib_main_t *vm, + u32 * buffers, + u32 n_buffers, + u32 follow_buffer_next); + /* Pool of buffer free lists. + Multiple free lists exist for packet generator which uses + separate free lists for each packet stream --- so as to avoid + initializing static data for each packet generated. */ + vlib_buffer_free_list_t * buffer_free_list_pool; +#define VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX (0) + +#if DPDK == 1 +/* must be same as dpdk buffer size */ +#define VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES (2048) +#else +#define VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES (512) +#endif + + /* Hash table mapping buffer size (rounded to next unit of + sizeof (vlib_buffer_t)) to free list index. */ + uword * free_list_by_size; + + /* Hash table mapping buffer index into number + 0 => allocated but free, 1 => allocated and not-free. + If buffer index is not in hash table then this buffer + has never been allocated. 
*/ + uword * buffer_known_hash; + + /* List of free-lists needing Blue Light Special announcements */ + vlib_buffer_free_list_t **announce_list; + + /* Vector of rte_mempools per socket */ +#if DPDK == 1 + struct rte_mempool ** pktmbuf_pools; +#endif +} vlib_buffer_main_t; + +typedef struct { + struct vlib_main_t * vlib_main; + + u32 first_buffer, last_buffer; + + union { + struct { + /* Total accumulated bytes in chain starting with first_buffer. */ + u32 n_total_data_bytes; + + /* Max number of bytes to accumulate in chain starting with first_buffer. + As this limit is reached buffers are enqueued to next node. */ + u32 max_n_data_bytes_per_chain; + + /* Next node to enqueue buffers to relative to current process node. */ + u32 next_index; + + /* Free list to use to allocate new buffers. */ + u32 free_list_index; + } tx; + + struct { + /* CLIB fifo of buffer indices waiting to be unserialized. */ + u32 * buffer_fifo; + + /* Event type used to signal that RX buffers have been added to fifo. 
*/ + uword ready_one_time_event; + } rx; + }; +} vlib_serialize_buffer_main_t; + +void serialize_open_vlib_buffer (serialize_main_t * m, struct vlib_main_t * vm, vlib_serialize_buffer_main_t * sm); +void unserialize_open_vlib_buffer (serialize_main_t * m, struct vlib_main_t * vm, vlib_serialize_buffer_main_t * sm); + +u32 serialize_close_vlib_buffer (serialize_main_t * m); +void unserialize_close_vlib_buffer (serialize_main_t * m); +void *vlib_set_buffer_free_callback (struct vlib_main_t *vm, void *fp); + +always_inline u32 +serialize_vlib_buffer_n_bytes (serialize_main_t * m) +{ + serialize_stream_t * s = &m->stream; + vlib_serialize_buffer_main_t * sm + = uword_to_pointer (m->stream.data_function_opaque, vlib_serialize_buffer_main_t *); + return sm->tx.n_total_data_bytes + s->current_buffer_index + vec_len (s->overflow_buffer); +} + +/* + */ + +/** \brief Compile time buffer trajectory tracing option + Turn this on if you run into "bad monkey" contexts, + and you want to know exactly which nodes they've visited... + See vlib/main.c... +*/ +#define VLIB_BUFFER_TRACE_TRAJECTORY 0 + +#if VLIB_BUFFER_TRACE_TRAJECTORY > 0 +#define VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b) (b)->pre_data[0]=0 +#else +#define VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b) +#endif /* VLIB_BUFFER_TRACE_TRAJECTORY */ + +#endif /* included_vlib_buffer_h */ diff --git a/vlib/vlib/buffer_funcs.h b/vlib/vlib/buffer_funcs.h new file mode 100644 index 00000000000..452cdcb26a7 --- /dev/null +++ b/vlib/vlib/buffer_funcs.h @@ -0,0 +1,602 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * buffer_funcs.h: VLIB buffer related functions/inlines + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_buffer_funcs_h +#define included_vlib_buffer_funcs_h + +#include <vppinfra/hash.h> + +/** \file + vlib buffer access methods. 
+*/ + + +/** \brief Translate buffer index into buffer pointer + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffer_index - (u32) buffer index + @return - (vlib_buffer_t *) buffer pointer +*/ +always_inline vlib_buffer_t * +vlib_get_buffer (vlib_main_t * vm, u32 buffer_index) +{ + return vlib_physmem_at_offset (&vm->physmem_main, ((uword)buffer_index) + << CLIB_LOG2_CACHE_LINE_BYTES); +} + +/** \brief Translate buffer pointer into buffer index + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param b - (void *) buffer pointer + @return - (u32) buffer index +*/ +always_inline u32 +vlib_get_buffer_index (vlib_main_t * vm, void * p) +{ + uword offset = vlib_physmem_offset_of (&vm->physmem_main, p); + ASSERT((offset % (1<<CLIB_LOG2_CACHE_LINE_BYTES)) == 0); + return offset >> CLIB_LOG2_CACHE_LINE_BYTES; +} + +/** \brief Get next buffer in buffer linklist, or zero for end of list. + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param b - (void *) buffer pointer + @return - (vlib_buffer_t *) next buffer, or NULL +*/ +always_inline vlib_buffer_t * +vlib_get_next_buffer (vlib_main_t * vm, vlib_buffer_t * b) +{ + return (b->flags & VLIB_BUFFER_NEXT_PRESENT + ? 
vlib_get_buffer (vm, b->next_buffer) + : 0); +} + +uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, vlib_buffer_t * b_first); + +/** \brief Get length in bytes of the buffer chain + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param b - (void *) buffer pointer + @return - (uword) length of buffer chain +*/ +always_inline uword +vlib_buffer_length_in_chain (vlib_main_t * vm, vlib_buffer_t * b) +{ + uword l = b->current_length + b->total_length_not_including_first_buffer; + if (PREDICT_FALSE ((b->flags & (VLIB_BUFFER_NEXT_PRESENT + | VLIB_BUFFER_TOTAL_LENGTH_VALID)) + == VLIB_BUFFER_NEXT_PRESENT)) + return vlib_buffer_length_in_chain_slow_path (vm, b); + return l; +} + +/** \brief Get length in bytes of the buffer index buffer chain + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param bi - (u32) buffer index + @return - (uword) length of buffer chain +*/ +always_inline uword +vlib_buffer_index_length_in_chain (vlib_main_t * vm, u32 bi) +{ + vlib_buffer_t * b = vlib_get_buffer (vm, bi); + return vlib_buffer_length_in_chain (vm, b); +} + +/** \brief Copy buffer contents to memory + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param bi - (u32) buffer index + @param contents - (u8 *) memory, <strong>must be large enough</strong> + @return - (uword) length of buffer chain +*/ +always_inline uword +vlib_buffer_contents (vlib_main_t * vm, u32 buffer_index, u8 * contents) +{ + uword content_len = 0; + uword l; + vlib_buffer_t * b; + + while (1) + { + b = vlib_get_buffer (vm, buffer_index); + l = b->current_length; + memcpy (contents + content_len, b->data + b->current_data, l); + content_len += l; + if (! (b->flags & VLIB_BUFFER_NEXT_PRESENT)) + break; + buffer_index = b->next_buffer; + } + + return content_len; +} + +/* Return physical address of buffer->data start. 
*/ +always_inline u64 +vlib_get_buffer_data_physical_address (vlib_main_t * vm, u32 buffer_index) +{ + return vlib_physmem_offset_to_physical (&vm->physmem_main, + (((uword)buffer_index) << + CLIB_LOG2_CACHE_LINE_BYTES) + + STRUCT_OFFSET_OF (vlib_buffer_t, data)); +} + +/** \brief Prefetch buffer metadata by buffer index + The first 64 bytes of buffer contains most header information + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param bi - (u32) buffer index + @param type - LOAD, STORE. In most cases, STORE is the right answer +*/ +/* Prefetch buffer header given index. */ +#define vlib_prefetch_buffer_with_index(vm,bi,type) \ + do { \ + vlib_buffer_t * _b = vlib_get_buffer (vm, bi); \ + vlib_prefetch_buffer_header (_b, type); \ + } while (0) + +#if 0 +/* Iterate over known allocated vlib bufs. You probably do not want + * to do this! + @param vm the vlib_main_t + @param bi found allocated buffer index + @param body operation to perform on buffer index + function executes body for each allocated buffer index + */ +#define vlib_buffer_foreach_allocated(vm,bi,body) \ +do { \ + vlib_main_t * _vmain = (vm); \ + vlib_buffer_main_t * _bmain = &_vmain->buffer_main; \ + hash_pair_t * _vbpair; \ + hash_foreach_pair(_vbpair, _bmain->buffer_known_hash, ({ \ + if (VLIB_BUFFER_KNOWN_ALLOCATED == _vbpair->value[0]) { \ + (bi) = _vbpair->key; \ + body; \ + } \ + })); \ +} while (0) +#endif + +#if DPDK == 0 + +typedef enum { + /* Index is unknown. */ + VLIB_BUFFER_UNKNOWN, + + /* Index is known and free/allocated. */ + VLIB_BUFFER_KNOWN_FREE, + VLIB_BUFFER_KNOWN_ALLOCATED, +} vlib_buffer_known_state_t; + +always_inline vlib_buffer_known_state_t +vlib_buffer_is_known (vlib_main_t * vm, u32 buffer_index) +{ + vlib_buffer_main_t * bm = vm->buffer_main; + ASSERT(os_get_cpu_number() == 0); + + uword * p = hash_get (bm->buffer_known_hash, buffer_index); + return p ? 
p[0] : VLIB_BUFFER_UNKNOWN; +} + +always_inline void +vlib_buffer_set_known_state (vlib_main_t * vm, + u32 buffer_index, + vlib_buffer_known_state_t state) +{ + vlib_buffer_main_t * bm = vm->buffer_main; + ASSERT(os_get_cpu_number() == 0); + hash_set (bm->buffer_known_hash, buffer_index, state); +} + +/* Validates sanity of a single buffer. + Returns format'ed vector with error message if any. */ +u8 * vlib_validate_buffer (vlib_main_t * vm, u32 buffer_index, uword follow_chain); + +/* Validate an array of buffers. As above. */ +u8 * vlib_validate_buffers (vlib_main_t * vm, + u32 * buffers, + uword next_buffer_stride, + uword n_buffers, + vlib_buffer_known_state_t known_state, + uword follow_chain); + +#endif /* DPDK == 0 */ + +clib_error_t * +vlib_buffer_pool_create(vlib_main_t * vm, unsigned num_mbufs, + unsigned mbuf_size, unsigned socket_id); + +/** \brief Allocate buffers into supplied array + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffers - (u32 * ) buffer index array + @param n_buffers - (u32) number of buffers requested + @return - (u32) number of buffers actually allocated, may be + less than the number requested or zero +*/ +u32 vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers); + +always_inline u32 +vlib_buffer_round_size (u32 size) +{ return round_pow2 (size, sizeof (vlib_buffer_t)); } + +/** \brief Allocate buffers from specific freelist into supplied array + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffers - (u32 * ) buffer index array + @param n_buffers - (u32) number of buffers requested + @return - (u32) number of buffers actually allocated, may be + less than the number requested or zero +*/ +u32 vlib_buffer_alloc_from_free_list (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, + u32 free_list_index); + +/** \brief Free buffers + Frees the entire buffer chain for each buffer + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffers - (u32 * 
) buffer index array + @param n_buffers - (u32) number of buffers to free + +*/ +void vlib_buffer_free (vlib_main_t * vm, + /* pointer to first buffer */ + u32 * buffers, + /* number of buffers to free */ + u32 n_buffers); + +/** \brief Free buffers, does not free the buffer chain for each buffer + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffers - (u32 * ) buffer index array + @param n_buffers - (u32) number of buffers to free + +*/ +void vlib_buffer_free_no_next (vlib_main_t * vm, + /* pointer to first buffer */ + u32 * buffers, + /* number of buffers to free */ + u32 n_buffers); + +/** \brief Free one buffer + Shorthand to free a single buffer chain. + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffer_index - (u32) buffer index to free +*/ +always_inline void +vlib_buffer_free_one (vlib_main_t * vm, u32 buffer_index) +{ + vlib_buffer_free (vm, &buffer_index, /* n_buffers */ 1); +} + +/* Add/delete buffer free lists. */ +u32 vlib_buffer_create_free_list (vlib_main_t * vm, u32 n_data_bytes, char * fmt, ...); +void vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index); + +/* Find already existing public free list with given size or create one. */ +u32 vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes, char * fmt, ...); + +always_inline vlib_buffer_free_list_t * +vlib_buffer_get_free_list (vlib_main_t * vm, u32 free_list_index) +{ + vlib_buffer_main_t * bm = vm->buffer_main; + vlib_buffer_free_list_t * f; + + f = pool_elt_at_index (bm->buffer_free_list_pool, free_list_index); + + /* Sanity: indices must match. 
*/ + ASSERT (f->index == free_list_index); + + return f; +} + +always_inline u32 +vlib_buffer_free_list_buffer_size (vlib_main_t * vm, u32 free_list_index) +{ + vlib_buffer_free_list_t * f = vlib_buffer_get_free_list (vm, free_list_index); + return f->n_data_bytes; +} + +void +vlib_aligned_memcpy (void * _dst, void * _src, int n_bytes); + +/* Reasonably fast buffer copy routine. */ +always_inline void +vlib_copy_buffers (u32 * dst, u32 * src, u32 n) +{ + while (n >= 4) + { + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + dst += 4; + src += 4; + n -= 4; + } + while (n > 0) + { + dst[0] = src[0]; + dst += 1; + src += 1; + n -= 1; + } +} + +always_inline void * +vlib_physmem_alloc_aligned (vlib_main_t * vm, clib_error_t ** error, + uword n_bytes, uword alignment) +{ + void * r = vm->os_physmem_alloc_aligned (&vm->physmem_main, n_bytes, alignment); + if (! r) + *error = clib_error_return (0, "failed to allocate %wd bytes of I/O memory", n_bytes); + else + *error = 0; + return r; +} + +/* By default allocate I/O memory with cache line alignment. */ +always_inline void * +vlib_physmem_alloc (vlib_main_t * vm, clib_error_t ** error, uword n_bytes) +{ return vlib_physmem_alloc_aligned (vm, error, n_bytes, CLIB_CACHE_LINE_BYTES); } + +always_inline void +vlib_physmem_free (vlib_main_t * vm, void * mem) +{ return vm->os_physmem_free (mem); } + +always_inline u64 +vlib_physmem_virtual_to_physical (vlib_main_t * vm, void * mem) +{ + vlib_physmem_main_t * pm = &vm->physmem_main; + uword o = pointer_to_uword (mem) - pm->virtual.start; + return vlib_physmem_offset_to_physical (pm, o); +} + +/* Append given data to end of buffer, possibly allocating new buffers. */ +u32 vlib_buffer_add_data (vlib_main_t * vm, + u32 free_list_index, + u32 buffer_index, + void * data, u32 n_data_bytes); + +format_function_t format_vlib_buffer, format_vlib_buffer_and_data, format_vlib_buffer_contents; + +typedef struct { + /* Vector of packet data. 
*/ + u8 * packet_data; + +#if DPDK == 0 + /* Number of buffers to allocate in each call to physmem + allocator. */ + u32 min_n_buffers_each_physmem_alloc; + + /* Buffer free list for this template. */ + u32 free_list_index; + + u32 * free_buffers; +#endif +} vlib_packet_template_t; + +void vlib_packet_template_get_packet_helper (vlib_main_t * vm, + vlib_packet_template_t * t); + +void vlib_packet_template_init (vlib_main_t * vm, + vlib_packet_template_t * t, + void * packet_data, + uword n_packet_data_bytes, + uword min_n_buffers_each_physmem_alloc, + char * fmt, ...); + +void * +vlib_packet_template_get_packet (vlib_main_t * vm, + vlib_packet_template_t * t, + u32 * bi_result); + +always_inline void +vlib_packet_template_free (vlib_main_t * vm, vlib_packet_template_t * t) +{ + vec_free (t->packet_data); +} + +always_inline u32 +unserialize_vlib_buffer_n_bytes (serialize_main_t * m) +{ + serialize_stream_t * s = &m->stream; + vlib_serialize_buffer_main_t * sm + = uword_to_pointer (m->stream.data_function_opaque, vlib_serialize_buffer_main_t *); + vlib_main_t * vm = sm->vlib_main; + u32 n, * f; + + n = s->n_buffer_bytes - s->current_buffer_index; + if (sm->last_buffer != ~0) + { + vlib_buffer_t * b = vlib_get_buffer (vm, sm->last_buffer); + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + { + b = vlib_get_buffer (vm, b->next_buffer); + n += b->current_length; + } + } + + clib_fifo_foreach (f, sm->rx.buffer_fifo, ({ + n += vlib_buffer_index_length_in_chain (vm, f[0]); + })); + + return n; +} + +typedef union { + vlib_buffer_t b; + vlib_copy_unit_t i[sizeof (vlib_buffer_t) / sizeof (vlib_copy_unit_t)]; +} vlib_buffer_union_t; + +/* Set a buffer quickly into "uninitialized" state. We want this to + be extremely cheap and arrange for all fields that need to be + initialized to be in the first 128 bits of the buffer. 
*/ +always_inline void +vlib_buffer_init_for_free_list (vlib_buffer_t * _dst, + vlib_buffer_free_list_t * fl) +{ + vlib_buffer_union_t * dst = (vlib_buffer_union_t *) _dst; + vlib_buffer_union_t * src = (vlib_buffer_union_t *) &fl->buffer_init_template; + + /* Make sure buffer template is sane. */ + ASSERT (fl->index == fl->buffer_init_template.free_list_index); + + /* Copy template from src->current_data thru src->free_list_index */ + dst->i[0] = src->i[0]; + if (1 * sizeof (dst->i[0]) < 16) + dst->i[1] = src->i[1]; + if (2 * sizeof (dst->i[0]) < 16) + dst->i[2] = src->i[2]; + + /* Make sure it really worked. */ +#define _(f) ASSERT (dst->b.f == src->b.f) + _ (current_data); + _ (current_length); + _ (flags); + _ (free_list_index); +#undef _ + ASSERT (dst->b.total_length_not_including_first_buffer == 0); +} + +always_inline void +vlib_buffer_init_two_for_free_list (vlib_buffer_t * _dst0, + vlib_buffer_t * _dst1, + vlib_buffer_free_list_t * fl) +{ + vlib_buffer_union_t * dst0 = (vlib_buffer_union_t *) _dst0; + vlib_buffer_union_t * dst1 = (vlib_buffer_union_t *) _dst1; + vlib_buffer_union_t * src = (vlib_buffer_union_t *) &fl->buffer_init_template; + + /* Make sure buffer template is sane. */ + ASSERT (fl->index == fl->buffer_init_template.free_list_index); + + /* Copy template from src->current_data thru src->free_list_index */ + dst0->i[0] = dst1->i[0] = src->i[0]; + if (1 * sizeof (dst0->i[0]) < 16) + dst0->i[1] = dst1->i[1] = src->i[1]; + if (2 * sizeof (dst0->i[0]) < 16) + dst0->i[2] = dst1->i[2] = src->i[2]; + + /* Make sure it really worked. 
*/ +#define _(f) ASSERT (dst0->b.f == src->b.f && dst1->b.f == src->b.f) + _ (current_data); + _ (current_length); + _ (flags); + _ (free_list_index); +#undef _ + ASSERT (dst0->b.total_length_not_including_first_buffer == 0); + ASSERT (dst1->b.total_length_not_including_first_buffer == 0); +} + +#if CLIB_DEBUG > 0 +u32 * vlib_buffer_state_validation_lock; +uword * vlib_buffer_state_validation_hash; +void * vlib_buffer_state_heap; +#endif + +static inline void +vlib_validate_buffer_in_use (vlib_buffer_t * b, u32 expected) +{ +#if CLIB_DEBUG > 0 + uword * p; + void * oldheap; + + oldheap = clib_mem_set_heap (vlib_buffer_state_heap); + + while (__sync_lock_test_and_set (vlib_buffer_state_validation_lock, 1)) + ; + + p = hash_get (vlib_buffer_state_validation_hash, b); + + /* If we don't know about b, declare it to be in the expected state */ + if (!p) + { + hash_set (vlib_buffer_state_validation_hash, b, expected); + goto out; + } + + if (p[0] != expected) + { + void cj_stop(void); + u32 bi; + vlib_main_t * vm = &vlib_global_main; + + cj_stop(); + + bi = vlib_get_buffer_index (vm, b); + + clib_mem_set_heap (oldheap); + clib_warning ("%.6f buffer %llx (%d): %s, not %s", + vlib_time_now(vm), bi, + p[0] ? "busy" : "free", + expected ? 
"busy" : "free"); + os_panic(); + } + out: + CLIB_MEMORY_BARRIER(); + *vlib_buffer_state_validation_lock = 0; + clib_mem_set_heap (oldheap); +#endif +} + +static inline void +vlib_validate_buffer_set_in_use (vlib_buffer_t * b, u32 expected) +{ +#if CLIB_DEBUG > 0 + void * oldheap; + + oldheap = clib_mem_set_heap (vlib_buffer_state_heap); + + while (__sync_lock_test_and_set (vlib_buffer_state_validation_lock, 1)) + ; + + hash_set (vlib_buffer_state_validation_hash, b, expected); + + CLIB_MEMORY_BARRIER(); + *vlib_buffer_state_validation_lock = 0; + clib_mem_set_heap (oldheap); +#endif +} + +#endif /* included_vlib_buffer_funcs_h */ diff --git a/vlib/vlib/buffer_node.h b/vlib/vlib/buffer_node.h new file mode 100644 index 00000000000..0fa5c8093ca --- /dev/null +++ b/vlib/vlib/buffer_node.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * buffer_node.h: VLIB buffer handling node helper macros/inlines + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef included_vlib_buffer_node_h +#define included_vlib_buffer_node_h + +#define vlib_validate_buffer_enqueue_x2(vm,node,next_index,to_next,n_left_to_next,bi0,bi1,next0,next1) \ +do { \ + int enqueue_code = (next0 != next_index) + 2*(next1 != next_index); \ + \ + if (PREDICT_FALSE (enqueue_code != 0)) \ + { \ + switch (enqueue_code) \ + { \ + case 1: \ + /* A B A */ \ + to_next[-2] = bi1; \ + to_next -= 1; \ + n_left_to_next += 1; \ + vlib_set_next_frame_buffer (vm, node, next0, bi0); \ + break; \ + \ + case 2: \ + /* A A B */ \ + to_next -= 1; \ + n_left_to_next += 1; \ + vlib_set_next_frame_buffer (vm, node, next1, bi1); \ + break; \ + \ + case 3: \ + /* A B B or A B C */ \ + to_next -= 2; \ + n_left_to_next += 2; \ + vlib_set_next_frame_buffer (vm, node, next0, bi0); \ + vlib_set_next_frame_buffer (vm, node, next1, bi1); \ + if (next0 == next1) \ + { \ + vlib_put_next_frame (vm, node, next_index, \ + n_left_to_next); \ + next_index = next1; \ + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); \ + } \ + } \ + } \ +} while (0) + +#define vlib_validate_buffer_enqueue_x1(vm,node,next_index,to_next,n_left_to_next,bi0,next0) \ +do { \ + if (PREDICT_FALSE (next0 != next_index)) \ + { \ + vlib_put_next_frame (vm, node, next_index, n_left_to_next + 1); \ + next_index = next0; \ + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); \ + \ + to_next[0] = bi0; \ + to_next += 1; \ + n_left_to_next -= 1; \ + } \ +} while (0) + +always_inline uword +generic_buffer_node_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + uword sizeof_trace, + void * opaque1, + uword opaque2, + void (* two_buffers) (vlib_main_t * vm, + void * opaque1, + uword opaque2, + vlib_buffer_t * b0, vlib_buffer_t * b1, + u32 * next0, u32 * next1), + void (* one_buffer) (vlib_main_t * vm, + void * opaque1, + uword opaque2, + vlib_buffer_t * b0, + u32 * next0)) +{ + u32 n_left_from, * from, * to_next; + u32 next_index; + + from = 
vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + if (node->flags & VLIB_NODE_FLAG_TRACE) + vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors, + /* stride */ 1, sizeof_trace); + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + vlib_buffer_t * p0, * p1; + u32 pi0, next0; + u32 pi1, next1; + + /* Prefetch next iteration. */ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, 64, LOAD); + CLIB_PREFETCH (p3->data, 64, LOAD); + } + + pi0 = to_next[0] = from[0]; + pi1 = to_next[1] = from[1]; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + p0 = vlib_get_buffer (vm, pi0); + p1 = vlib_get_buffer (vm, pi1); + + two_buffers (vm, opaque1, opaque2, p0, p1, &next0, &next1); + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + pi0, pi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * p0; + u32 pi0, next0; + + pi0 = from[0]; + to_next[0] = pi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, pi0); + + one_buffer (vm, opaque1, opaque2, p0, &next0); + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + pi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +#endif /* included_vlib_buffer_node_h */ diff --git a/vlib/vlib/cli.c b/vlib/vlib/cli.c new file mode 100644 index 00000000000..e5163f260e1 --- /dev/null +++ b/vlib/vlib/cli.c @@ -0,0 +1,1015 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * cli.c: command line interface + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> + +/* Root of all show commands. */ +VLIB_CLI_COMMAND (vlib_cli_show_command, static) = { + .path = "show", + .short_help = "Show commands", +}; + +/* Root of all clear commands. 
*/ +VLIB_CLI_COMMAND (vlib_cli_clear_command, static) = { + .path = "clear", + .short_help = "Clear commands", +}; + +/* Root of all set commands. */ +VLIB_CLI_COMMAND (vlib_cli_set_command, static) = { + .path = "set", + .short_help = "Set commands", +}; + +/* Root of all test commands. */ +VLIB_CLI_COMMAND (vlib_cli_test_command, static) = { + .path = "test", + .short_help = "Test commands", +}; + +/* Returns bitmap of commands which match key. */ +static uword * +vlib_cli_sub_command_match (vlib_cli_command_t * c, unformat_input_t * input) +{ + int i, n; + uword * match = 0; + vlib_cli_parse_position_t * p; + + unformat_skip_white_space (input); + + for (i = 0; ; i++) + { + uword k; + + k = unformat_get_input (input); + switch (k) + { + case 'a' ... 'z': + case 'A' ... 'Z': + case '0' ... '9': + case '-': case '_': + break; + + case ' ': case '\t': case '\r': case '\n': + case UNFORMAT_END_OF_INPUT: + /* White space or end of input removes any non-white + matches that were before possible. */ + if (i < vec_len (c->sub_command_positions) + && clib_bitmap_count_set_bits (match) > 1) + { + p = vec_elt_at_index (c->sub_command_positions, i); + for (n = 0; n < vec_len (p->bitmaps); n++) + match = clib_bitmap_andnot (match, p->bitmaps[n]); + } + goto done; + + default: + unformat_put_input (input); + goto done; + } + + if (i >= vec_len (c->sub_command_positions)) + { + no_match: + clib_bitmap_free (match); + return 0; + } + + p = vec_elt_at_index (c->sub_command_positions, i); + if (vec_len (p->bitmaps) == 0) + goto no_match; + + n = k - p->min_char; + if (n < 0 || n >= vec_len (p->bitmaps)) + goto no_match; + + if (i == 0) + match = clib_bitmap_dup (p->bitmaps[n]); + else + match = clib_bitmap_and (match, p->bitmaps[n]); + + if (clib_bitmap_is_zero (match)) + goto no_match; + } + + done: + return match; +} + +/* Looks for string based sub-input formatted { SUB-INPUT }. 
*/ +static uword unformat_vlib_cli_sub_input (unformat_input_t * i, va_list * args) +{ + unformat_input_t * sub_input = va_arg (*args, unformat_input_t *); + u8 * s; + uword c; + + while (1) + { + c = unformat_get_input (i); + switch (c) + { + case ' ': case '\t': + case '\n': case '\r': + case '\f': + break; + + case '{': + default: + /* Put back paren. */ + if (c != UNFORMAT_END_OF_INPUT) + unformat_put_input (i); + + if (c == '{' && unformat (i, "%v", &s)) + { + unformat_init_vector (sub_input, s); + return 1; + } + return 0; + } + } + return 0; +} + +static vlib_cli_command_t * +get_sub_command (vlib_cli_main_t * cm, vlib_cli_command_t * parent, u32 si) +{ + vlib_cli_sub_command_t * s = vec_elt_at_index (parent->sub_commands, si); + return vec_elt_at_index (cm->commands, s->index); +} + +static uword unformat_vlib_cli_sub_command (unformat_input_t * i, va_list * args) +{ + vlib_main_t * vm = va_arg (*args, vlib_main_t *); + vlib_cli_command_t * c = va_arg (*args, vlib_cli_command_t *); + vlib_cli_command_t ** result = va_arg (*args, vlib_cli_command_t **); + vlib_cli_main_t * cm = &vm->cli_main; + uword * match_bitmap, is_unique, index; + + { + vlib_cli_sub_rule_t * sr; + vlib_cli_parse_rule_t * r; + vec_foreach (sr, c->sub_rules) + { + void ** d; + r = vec_elt_at_index (cm->parse_rules, sr->rule_index); + vec_add2 (cm->parse_rule_data, d, 1); + vec_reset_length (d[0]); + if (r->data_size) + d[0] = _vec_resize (d[0], + /* length increment */ 1, + r->data_size, + /* header_bytes */ 0, + /* data align */ sizeof (uword)); + if (unformat_user (i, r->unformat_function, vm, d[0])) + { + *result = vec_elt_at_index (cm->commands, sr->command_index); + return 1; + } + } + } + + match_bitmap = vlib_cli_sub_command_match (c, i); + is_unique = clib_bitmap_count_set_bits (match_bitmap) == 1; + index = ~0; + if (is_unique) + { + index = clib_bitmap_first_set (match_bitmap); + *result = get_sub_command (cm, c, index); + } + clib_bitmap_free (match_bitmap); + + return 
is_unique; +} + +static u8 * format_vlib_cli_command_help (u8 * s, va_list * args) +{ + vlib_cli_command_t * c = va_arg (*args, vlib_cli_command_t *); + int is_long = va_arg (*args, int); + if (is_long && c->long_help) + s = format (s, "%s", c->long_help); + else if (c->short_help) + s = format (s, "%s", c->short_help); + else + s = format (s, "%v commands", c->path); + return s; +} + +static u8 * format_vlib_cli_parse_rule_name (u8 * s, va_list * args) +{ + vlib_cli_parse_rule_t * r = va_arg (*args, vlib_cli_parse_rule_t *); + return format (s, "<%U>", format_c_identifier, r->name); +} + +static u8 * format_vlib_cli_path (u8 * s, va_list * args) +{ + u8 * path = va_arg (*args, u8 *); + int i, in_rule; + in_rule = 0; + for (i = 0; i < vec_len (path); i++) + { + switch (path[i]) + { + case '%': + in_rule = 1; + vec_add1 (s, '<'); /* start of <RULE> */ + break; + + case '_': + /* _ -> space in rules. */ + vec_add1 (s, in_rule ? ' ' : '_'); + break; + + case ' ': + if (in_rule) + { + vec_add1 (s, '>'); /* end of <RULE> */ + in_rule = 0; + } + vec_add1 (s, ' '); + break; + + default: + vec_add1 (s, path[i]); + break; + } + } + + if (in_rule) + vec_add1 (s, '>'); /* terminate <RULE> */ + + return s; +} + +static vlib_cli_command_t * +all_subs (vlib_cli_main_t * cm, + vlib_cli_command_t * subs, + u32 command_index) +{ + vlib_cli_command_t * c = vec_elt_at_index (cm->commands, command_index); + vlib_cli_sub_command_t * sc; + vlib_cli_sub_rule_t * sr; + + if (c->function) + vec_add1 (subs, c[0]); + + vec_foreach (sr, c->sub_rules) + subs = all_subs (cm, subs, sr->command_index); + vec_foreach (sc, c->sub_commands) + subs = all_subs (cm, subs, sc->index); + + return subs; +} + +static clib_error_t * +vlib_cli_dispatch_sub_commands (vlib_main_t * vm, + vlib_cli_main_t * cm, + unformat_input_t * input, + uword parent_command_index) +{ + vlib_cli_command_t * parent, * c; + clib_error_t * error = 0; + unformat_input_t sub_input; + u8 * string; + uword is_main_dispatch = cm == 
&vm->cli_main;
+
+  parent = vec_elt_at_index (cm->commands, parent_command_index);
+  if (is_main_dispatch && unformat (input, "help"))
+    {
+      uword help_at_end_of_line, i;
+
+      help_at_end_of_line = unformat_check_input (input) == UNFORMAT_END_OF_INPUT;
+      while (1)
+	{
+	  c = parent;
+	  if (unformat_user (input, unformat_vlib_cli_sub_command, vm, c, &parent))
+	    ;
+
+	  /* FIX: was `! unformat_check_input (input) == UNFORMAT_END_OF_INPUT`.
+	     `!` binds tighter than `==`, so that compared 0/1 against
+	     UNFORMAT_END_OF_INPUT and was effectively always false; trailing
+	     garbage after "help ..." was silently swallowed instead of being
+	     reported via `goto unknown`. */
+	  else if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+	    goto unknown;
+
+	  else
+	    break;
+	}
+
+      /* help SUB-COMMAND => long format help.
+         "help" at end of line: show all commands. */
+      if (! help_at_end_of_line)
+	vlib_cli_output (vm, "%U", format_vlib_cli_command_help, c, /* is_long */ 1);
+
+      else if (vec_len (c->sub_commands) + vec_len (c->sub_rules) == 0)
+	vlib_cli_output (vm, "%v: no sub-commands", c->path);
+
+      else
+	{
+	  vlib_cli_sub_command_t * sc;
+	  vlib_cli_sub_rule_t * sr, * subs;
+
+	  subs = vec_dup (c->sub_rules);
+
+	  /* Add in rules if any. */
+	  vec_foreach (sc, c->sub_commands)
+	    {
+	      vec_add2 (subs, sr, 1);
+	      sr->name = sc->name;
+	      sr->command_index = sc->index;
+	      sr->rule_index = ~0;
+	    }
+
+	  vec_sort (subs, c1, c2, vec_cmp (c1->name, c2->name));
+
+	  for (i = 0; i < vec_len (subs); i++)
+	    {
+	      vlib_cli_command_t * d;
+	      vlib_cli_parse_rule_t * r;
+
+	      d = vec_elt_at_index (cm->commands, subs[i].command_index);
+	      r = subs[i].rule_index != ~0 ?
vec_elt_at_index (cm->parse_rules, subs[i].rule_index) : 0; + + if (r) + vlib_cli_output + (vm, " %-30U %U", + format_vlib_cli_parse_rule_name, r, + format_vlib_cli_command_help, d, /* is_long */ 0); + else + vlib_cli_output + (vm, " %-30v %U", + subs[i].name, + format_vlib_cli_command_help, d, /* is_long */ 0); + } + + vec_free (subs); + } + } + + else if (is_main_dispatch && (unformat (input, "choices") || unformat (input, "?"))) + { + vlib_cli_command_t * sub, * subs; + + subs = all_subs (cm, 0, parent_command_index); + vec_sort (subs, c1, c2, vec_cmp (c1->path, c2->path)); + vec_foreach (sub, subs) + vlib_cli_output (vm, " %-40U %U", + format_vlib_cli_path, sub->path, + format_vlib_cli_command_help, sub, /* is_long */ 0); + vec_free (subs); + } + + else if (unformat (input, "comment %v", &string)) + { + vec_free (string); + } + + else if (unformat (input, "uncomment %U", + unformat_vlib_cli_sub_input, &sub_input)) + { + error = vlib_cli_dispatch_sub_commands (vm, cm, &sub_input, parent_command_index); + unformat_free (&sub_input); + } + + else if (unformat_user (input, unformat_vlib_cli_sub_command, vm, parent, &c)) + { + unformat_input_t * si; + uword has_sub_commands = vec_len (c->sub_commands) + vec_len (c->sub_rules) > 0; + + si = input; + if (unformat_user (input, unformat_vlib_cli_sub_input, &sub_input)) + si = &sub_input; + + if (has_sub_commands) + error = vlib_cli_dispatch_sub_commands (vm, cm, si, c - cm->commands); + + if (has_sub_commands && ! error) + /* Found valid sub-command. */; + + else if (c->function) + { + clib_error_t * c_error; + + /* Skip white space for benefit of called function. 
*/ + unformat_skip_white_space (si); + + if (unformat (si, "?")) + { + vlib_cli_output (vm, " %-40U %U", + format_vlib_cli_path, c->path, + format_vlib_cli_command_help, c, /* is_long */ 0); + } + else + { + if (!c->is_mp_safe) + vlib_worker_thread_barrier_sync(vm); + + c_error = c->function (vm, si, c); + + if (!c->is_mp_safe) + vlib_worker_thread_barrier_release(vm); + + if (c_error) + { + error = clib_error_return (0, "%v: %v", c->path, c_error->what); + clib_error_free (c_error); + /* Free sub input. */ + if (si != input) + unformat_free (si); + + return error; + } + } + + /* Free any previous error. */ + clib_error_free (error); + } + + else if (! error) + error = clib_error_return (0, "%v: no sub-commands", c->path); + + /* Free sub input. */ + if (si != input) + unformat_free (si); + } + + else + goto unknown; + + return error; + + unknown: + if (parent->path) + return clib_error_return (0, "%v: unknown input `%U'", parent->path, format_unformat_error, input); + else + return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); +} + + +void vlib_unix_error_report (vlib_main_t *, clib_error_t *) + __attribute__ ((weak)); + +void vlib_unix_error_report (vlib_main_t * vm, clib_error_t * error) { } + +/* Process CLI input. */ +void vlib_cli_input (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_output_function_t * function, + uword function_arg) +{ + vlib_cli_main_t * cm = &vm->cli_main; + clib_error_t * error; + vlib_cli_output_function_t * save_function; + uword save_function_arg; + + save_function = cm->output_function; + save_function_arg = cm->output_function_arg; + + cm->output_function = function; + cm->output_function_arg = function_arg; + + do { + vec_reset_length (cm->parse_rule_data); + error = vlib_cli_dispatch_sub_commands (vm, &vm->cli_main, input, /* parent */ 0); + } while (! error && ! 
unformat (input, "%U", unformat_eof)); + + if (error) + { + vlib_cli_output (vm, "%v", error->what); + vlib_unix_error_report (vm, error); + clib_error_free (error); + } + + cm->output_function = save_function; + cm->output_function_arg = save_function_arg; +} + +/* Output to current CLI connection. */ +void vlib_cli_output (vlib_main_t * vm, char * fmt, ...) +{ + vlib_cli_main_t * cm = &vm->cli_main; + va_list va; + u8 * s; + + va_start (va, fmt); + s = va_format (0, fmt, &va); + va_end (va); + + /* Terminate with \n if not present. */ + if (vec_len (s) > 0 && s[vec_len (s)-1] != '\n') + vec_add1 (s, '\n'); + + if (! cm->output_function) + fformat (stdout, "%v", s); + else + cm->output_function (cm->output_function_arg, s, vec_len (s)); + + vec_free (s); +} + +static clib_error_t * +show_memory_usage (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + int verbose = 0; + clib_error_t * error; + u32 index = 0; + + while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose")) + verbose = 1; + else { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + return error; + } + } + + foreach_vlib_main ( + ({ + vlib_cli_output (vm, "Thread %d %v\n", index, vlib_worker_threads[index].name); + vlib_cli_output (vm, "%U\n", format_mheap, clib_per_cpu_mheaps[index], verbose); + index++; + })); + return 0; +} + +VLIB_CLI_COMMAND (show_memory_usage_command, static) = { + .path = "show memory", + .short_help = "Show current memory usage", + .function = show_memory_usage, +}; + +static clib_error_t * +enable_disable_memory_trace (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + clib_error_t * error = 0; + int enable; + + if (! 
unformat_user (input, unformat_vlib_enable_disable, &enable)) + { + error = clib_error_return (0, "expecting enable/on or disable/off"); + goto done; + } + + clib_mem_trace (enable); + + done: + return error; +} + +VLIB_CLI_COMMAND (enable_disable_memory_trace_command, static) = { + .path = "memory-trace", + .short_help = "Enable/disable memory allocation trace", + .function = enable_disable_memory_trace, +}; + + +static clib_error_t * +test_heap_validate (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + clib_error_t * error = 0; + void * heap; + mheap_t *mheap; + + if (unformat(input, "on")) { + foreach_vlib_main({ + heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + mheap = mheap_header(heap); + mheap->flags |= MHEAP_FLAG_VALIDATE; + // Turn off small object cache because it delays detection of errors + mheap->flags &= ~MHEAP_FLAG_SMALL_OBJECT_CACHE; + }); + + } else if (unformat(input, "off")) { + foreach_vlib_main({ + heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + mheap = mheap_header(heap); + mheap->flags &= ~MHEAP_FLAG_VALIDATE; + mheap->flags |= MHEAP_FLAG_SMALL_OBJECT_CACHE; + }); + + } else if (unformat(input, "now")) { + foreach_vlib_main({ + heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + mheap = mheap_header(heap); + mheap_validate(heap); + }); + vlib_cli_output(vm, "heap validation complete"); + + } else { + return clib_error_return(0, "unknown input `%U'", + format_unformat_error, input); + } + + return error; +} + +VLIB_CLI_COMMAND (cmd_test_heap_validate,static) = { + .path = "test heap-validate", + .short_help = "<on/off/now> validate heap on future allocs/frees or right now", + .function = test_heap_validate, +}; + + +static uword vlib_cli_normalize_path (char * input, char ** result) +{ + char * i = input; + char * s = 0; + uword l = 0; + uword index_of_last_space = ~0; + + while (*i != 0) + { + u8 c = *i++; + /* Multiple white space -> single space. 
*/ + switch (c) + { + case ' ': + case '\t': + case '\n': + case '\r': + if (l > 0 && s[l-1] != ' ') + { + vec_add1 (s, ' '); + l++; + } + break; + + default: + if (l > 0 && s[l-1] == ' ') + index_of_last_space = vec_len (s); + vec_add1 (s, c); + l++; + break; + } + } + + /* Remove any extra space at end. */ + if (l > 0 && s[l-1] == ' ') + _vec_len (s) -= 1; + + *result = s; + return index_of_last_space; +} + +always_inline uword +parent_path_len (char * path) +{ + word i; + for (i = vec_len (path) - 1; i >= 0; i--) + { + if (path[i] == ' ') + return i; + } + return ~0; +} + +static void add_sub_command (vlib_cli_main_t * cm, + uword parent_index, + uword child_index) +{ + vlib_cli_command_t * p, * c; + vlib_cli_sub_command_t * sub_c; + u8 * sub_name; + word i, l; + + p = vec_elt_at_index (cm->commands, parent_index); + c = vec_elt_at_index (cm->commands, child_index); + + l = parent_path_len (c->path); + if (l == ~0) + sub_name = vec_dup ((u8 *) c->path); + else + { + ASSERT (l + 1 < vec_len (c->path)); + sub_name = 0; + vec_add (sub_name, c->path + l + 1, vec_len (c->path) - (l + 1)); + } + + if (sub_name[0] == '%') + { + uword * q; + vlib_cli_sub_rule_t * sr; + + /* Remove %. */ + vec_delete (sub_name, 1, 0); + + if (! p->sub_rule_index_by_name) + p->sub_rule_index_by_name + = hash_create_vec (/* initial length */ 32, + sizeof (sub_name[0]), + sizeof (uword)); + q = hash_get_mem (p->sub_rule_index_by_name, sub_name); + if (q) + { + sr = vec_elt_at_index (p->sub_rules, q[0]); + ASSERT (sr->command_index == child_index); + return; + } + + q = hash_get_mem (cm->parse_rule_index_by_name, sub_name); + if (! q) + clib_error ("reference to unknown rule `%%%v' in path `%v'", + sub_name, c->path); + + hash_set_mem (p->sub_rule_index_by_name, sub_name, vec_len (p->sub_rules)); + vec_add2 (p->sub_rules, sr, 1); + sr->name = sub_name; + sr->rule_index = q[0]; + sr->command_index = child_index; + return; + } + + if (! 
p->sub_command_index_by_name) + p->sub_command_index_by_name + = hash_create_vec (/* initial length */ 32, + sizeof (c->path[0]), + sizeof (uword)); + + /* Check if sub-command has already been created. */ + if (hash_get_mem (p->sub_command_index_by_name, sub_name)) + { + vec_free (sub_name); + return; + } + + vec_add2 (p->sub_commands, sub_c, 1); + sub_c->index = child_index; + sub_c->name = sub_name; + hash_set_mem (p->sub_command_index_by_name, sub_c->name, sub_c - p->sub_commands); + + vec_validate (p->sub_command_positions, vec_len (sub_c->name) - 1); + for (i = 0; i < vec_len (sub_c->name); i++) + { + int n; + vlib_cli_parse_position_t * pos; + + pos = vec_elt_at_index (p->sub_command_positions, i); + + if (! pos->bitmaps) + pos->min_char = sub_c->name[i]; + + n = sub_c->name[i] - pos->min_char; + if (n < 0) + { + pos->min_char = sub_c->name[i]; + vec_insert (pos->bitmaps, -n, 0); + n = 0; + } + + vec_validate (pos->bitmaps, n); + pos->bitmaps[n] = clib_bitmap_ori (pos->bitmaps[n], sub_c - p->sub_commands); + } +} + +static void +vlib_cli_make_parent (vlib_cli_main_t * cm, uword ci) +{ + uword p_len, pi, * p; + char * p_path; + vlib_cli_command_t * c, * parent; + + /* Root command (index 0) should have already been added. */ + ASSERT (vec_len (cm->commands) > 0); + + c = vec_elt_at_index (cm->commands, ci); + p_len = parent_path_len (c->path); + + /* No space? Parent is root command. */ + if (p_len == ~0) + { + add_sub_command (cm, 0, ci); + return; + } + + p_path = 0; + vec_add (p_path, c->path, p_len); + + p = hash_get_mem (cm->command_index_by_path, p_path); + + /* Parent exists? */ + if (! p) + { + /* Parent does not exist; create it. */ + vec_add2 (cm->commands, parent, 1); + parent->path = p_path; + hash_set_mem (cm->command_index_by_path, parent->path, parent - cm->commands); + pi = parent - cm->commands; + } + else + { + pi = p[0]; + vec_free (p_path); + } + + add_sub_command (cm, pi, ci); + + /* Create parent's parent. */ + if (! 
p) + vlib_cli_make_parent (cm, pi); +} + +always_inline uword +vlib_cli_command_is_empty (vlib_cli_command_t * c) +{ + return (c->long_help == 0 + && c->short_help == 0 + && c->function == 0); +} + +clib_error_t * vlib_cli_register (vlib_main_t * vm, vlib_cli_command_t * c) +{ + vlib_cli_main_t * cm = &vm->cli_main; + clib_error_t * error = 0; + uword ci, * p; + char * normalized_path; + + if ((error = vlib_call_init_function (vm, vlib_cli_init))) + return error; + + (void) vlib_cli_normalize_path (c->path, &normalized_path); + + if (! cm->command_index_by_path) + cm->command_index_by_path = hash_create_vec (/* initial length */ 32, + sizeof (c->path[0]), + sizeof (uword)); + + /* See if command already exists with given path. */ + p = hash_get_mem (cm->command_index_by_path, normalized_path); + if (p) + { + vlib_cli_command_t * d; + + ci = p[0]; + d = vec_elt_at_index (cm->commands, ci); + + /* If existing command was created via vlib_cli_make_parent + replaced it with callers data. */ + if (vlib_cli_command_is_empty (d)) + { + vlib_cli_command_t save = d[0]; + + ASSERT (! vlib_cli_command_is_empty (c)); + + /* Copy callers fields. */ + d[0] = c[0]; + + /* Save internal fields. */ + d->path = save.path; + d->sub_commands = save.sub_commands; + d->sub_command_index_by_name = save.sub_command_index_by_name; + d->sub_command_positions = save.sub_command_positions; + d->sub_rules = save.sub_rules; + } + else + error = clib_error_return (0, "duplicate command name with path %v", normalized_path); + + vec_free (normalized_path); + if (error) + return error; + } + else + { + /* Command does not exist: create it. */ + + /* Add root command (index 0). */ + if (vec_len (cm->commands) == 0) + { + /* Create command with index 0; path is empty string. 
*/ + vec_resize (cm->commands, 1); + } + + ci = vec_len (cm->commands); + hash_set_mem (cm->command_index_by_path, normalized_path, ci); + vec_add1 (cm->commands, c[0]); + + c = vec_elt_at_index (cm->commands, ci); + c->path = normalized_path; + + /* Don't inherit from registration. */ + c->sub_commands = 0; + c->sub_command_index_by_name = 0; + c->sub_command_positions = 0; + } + + vlib_cli_make_parent (cm, ci); + return 0; +} + +clib_error_t * +vlib_cli_register_parse_rule (vlib_main_t * vm, vlib_cli_parse_rule_t * r_reg) +{ + vlib_cli_main_t * cm = &vm->cli_main; + vlib_cli_parse_rule_t * r; + clib_error_t * error = 0; + u8 * r_name; + uword * p; + + if (! cm->parse_rule_index_by_name) + cm->parse_rule_index_by_name = hash_create_vec (/* initial length */ 32, + sizeof (r->name[0]), + sizeof (uword)); + + /* Make vector copy of name. */ + r_name = format (0, "%s", r_reg->name); + + if ((p = hash_get_mem (cm->parse_rule_index_by_name, r_name))) + { + vec_free (r_name); + return clib_error_return (0, "duplicate parse rule name `%s'", r_reg->name); + } + + vec_add2 (cm->parse_rules, r, 1); + r[0] = r_reg[0]; + r->name = (char *) r_name; + hash_set_mem (cm->parse_rule_index_by_name, r->name, r - cm->parse_rules); + + return error; +} + +#if 0 +/* $$$ turn back on again someday, maybe */ +static clib_error_t * +vlib_cli_register_parse_rules (vlib_main_t * vm, + vlib_cli_parse_rule_t * lo, + vlib_cli_parse_rule_t * hi) + + __attribute__((unused)) +{ + clib_error_t * error = 0; + vlib_cli_parse_rule_t * r; + + for (r = lo; r < hi; r = clib_elf_section_data_next (r, 0)) + { + if (! 
r->name || strlen (r->name) == 0) + { + error = clib_error_return (0, "parse rule with no name"); + goto done; + } + + error = vlib_cli_register_parse_rule (vm, r); + if (error) + goto done; + } + + done: + return error; +} +#endif + +static clib_error_t * vlib_cli_init (vlib_main_t * vm) +{ + vlib_cli_main_t * cm = &vm->cli_main; + clib_error_t * error = 0; + vlib_cli_command_t * cmd; + + cmd = cm->cli_command_registrations; + + while (cmd) + { + error = vlib_cli_register (vm, cmd); + if (error) + return error; + cmd = cmd->next_cli_command; + } + return error; +} + +VLIB_INIT_FUNCTION (vlib_cli_init); diff --git a/vlib/vlib/cli.h b/vlib/vlib/cli.h new file mode 100644 index 00000000000..8c802475176 --- /dev/null +++ b/vlib/vlib/cli.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * cli.h: command line interface + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_cli_h +#define included_vlib_cli_h + +#include <vppinfra/format.h> + +struct vlib_cli_command_t; + +typedef struct { + u32 min_char; + + /* Indexed by name[position] - min_char. */ + uword ** bitmaps; +} vlib_cli_parse_position_t; + +typedef struct { + u8 * name; + + u32 index; +} vlib_cli_sub_command_t; + +typedef struct { + u8 * name; + + u32 rule_index; + + u32 command_index; +} vlib_cli_sub_rule_t; + +typedef struct { + char * name; + char * short_help; + char * long_help; + + /* Number of bytes in parsed data. Zero for vector. */ + uword data_size; + + unformat_function_t * unformat_function; + + /* Opaque for unformat function. */ + uword unformat_function_arg[2]; +} vlib_cli_parse_rule_t; + +/* CLI command callback function. 
*/ +typedef clib_error_t * (vlib_cli_command_function_t) + (struct vlib_main_t * vm, + unformat_input_t * input, + struct vlib_cli_command_t * cmd); + +typedef struct vlib_cli_command_t { + /* Command path (e.g. "show something"). + Spaces delimit elements of path. */ + char * path; + + /* Short/long help strings. */ + char * short_help; + char * long_help; + + /* Callback function. */ + vlib_cli_command_function_t * function; + + /* Opaque. */ + uword function_arg; + + /* Known MP-safe? */ + uword is_mp_safe; + + /* Sub commands for this command. */ + vlib_cli_sub_command_t * sub_commands; + + /* Hash table mapping name (e.g. last path element) to sub command index. */ + uword * sub_command_index_by_name; + + /* bitmap[p][c][i] says whether sub-command i has character + c in position p. */ + vlib_cli_parse_position_t * sub_command_positions; + + /* Hash table mapping name (e.g. last path element) to sub rule index. */ + uword * sub_rule_index_by_name; + + /* Vector of possible parse rules for this path. */ + vlib_cli_sub_rule_t * sub_rules; + + /* List of CLI commands, built by constructors */ + struct vlib_cli_command_t * next_cli_command; + +} vlib_cli_command_t; + +typedef void (vlib_cli_output_function_t) (uword arg, + u8 * buffer, + uword buffer_bytes); +typedef struct { + /* Current output function. */ + vlib_cli_output_function_t * output_function; + + /* Opaque data for output function. */ + uword output_function_arg; + + /* Vector of all known commands. */ + vlib_cli_command_t * commands; + + /* Hash table mapping normalized path to index into all_commands. */ + uword * command_index_by_path; + + /* Vector of all known parse rules. */ + vlib_cli_parse_rule_t * parse_rules; + + /* Hash table mapping parse rule name to index into parse_rule vector. */ + uword * parse_rule_index_by_name; + + /* Data parsed for rules. 
*/ + void ** parse_rule_data; + + /* registration list added by constructors */ + vlib_cli_command_t *cli_command_registrations; +} vlib_cli_main_t; + +#define VLIB_CLI_COMMAND(x,...) \ + __VA_ARGS__ vlib_cli_command_t x; \ +static void __vlib_cli_command_registration_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_cli_command_registration_##x (void) \ +{ \ + vlib_main_t * vm = vlib_get_main(); \ + vlib_cli_main_t *cm = &vm->cli_main; \ + x.next_cli_command = cm->cli_command_registrations; \ + cm->cli_command_registrations = &x; \ +} \ +__VA_ARGS__ vlib_cli_command_t x + + +#define VLIB_CLI_PARSE_RULE(x) \ + vlib_cli_parse_rule_t x + +/* Output to current CLI connection. */ +void vlib_cli_output (struct vlib_main_t * vm, char * fmt, ...); + +/* Process CLI input. */ +void vlib_cli_input (struct vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_output_function_t * function, + uword function_arg); + +clib_error_t * vlib_cli_register (struct vlib_main_t * vm, + vlib_cli_command_t * c); +clib_error_t * vlib_cli_register_parse_rule (struct vlib_main_t * vm, + vlib_cli_parse_rule_t * c); + +#endif /* included_vlib_cli_h */ diff --git a/vlib/vlib/cli_funcs.h b/vlib/vlib/cli_funcs.h new file mode 100644 index 00000000000..a43ed20a2c2 --- /dev/null +++ b/vlib/vlib/cli_funcs.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * cli_funcs.h: VLIB CLI related functions/inlines + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_cli_funcs_h +#define included_vlib_cli_funcs_h + +always_inline void * +vlib_cli_get_parse_rule_result (vlib_main_t * vm, uword index) +{ + vlib_cli_main_t * cm = &vm->cli_main; + return vec_elt (cm->parse_rule_data, index); +} + +#endif /* included_vlib_cli_funcs_h */ diff --git a/vlib/vlib/counter.c b/vlib/vlib/counter.c new file mode 100644 index 00000000000..1b94884e319 --- /dev/null +++ b/vlib/vlib/counter.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * counter.c: simple and packet/byte counters + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
 */

#include <vlib/vlib.h>

/* Fold every per-CPU 16-bit mini counter into the shared 64-bit maxi
   counter, then snapshot the folded totals into value_at_last_clear so
   that subsequent reads (which subtract value_at_last_clear) report
   zero until new traffic arrives. */
void vlib_clear_simple_counters (vlib_simple_counter_main_t * cm)
{
  uword i, j;
  u16 * my_minis;

  /* Drain each CPU's mini counters into the shared maxi vector. */
  for (i = 0; i < VLIB_COUNTER_MAX_CPUS; i++)
    {
      my_minis = cm->minis[i];

      for (j = 0; j < vec_len (my_minis); j++)
	{
	  cm->maxi[j] += my_minis[j];
	  my_minis[j] = 0;
	}
    }

  /* Record the folded totals as the new "zero" baseline. */
  j = vec_len (cm->maxi);
  if (j > 0)
    vec_validate (cm->value_at_last_clear, j - 1);
  for (i = 0; i < j; i++)
    cm->value_at_last_clear[i] = cm->maxi[i];
}

/* Combined (packets + bytes) analogue of vlib_clear_simple_counters:
   fold every CPU's mini counters into maxi, then snapshot maxi into
   value_at_last_clear. */
void vlib_clear_combined_counters (vlib_combined_counter_main_t * cm)
{
  uword i, j;
  vlib_mini_counter_t * my_minis;

  for (i = 0; i < VLIB_COUNTER_MAX_CPUS; i++)
    {
      my_minis = cm->minis[i];

      for (j = 0; j < vec_len (my_minis); j++)
	{
	  cm->maxi[j].packets += my_minis[j].packets;
	  cm->maxi[j].bytes += my_minis[j].bytes;
	  my_minis[j].packets = 0;
	  my_minis[j].bytes = 0;
	}
    }

  j = vec_len (cm->maxi);
  if (j > 0)
    vec_validate (cm->value_at_last_clear, j - 1);

  for (i = 0; i < j; i++)
    {
      vlib_counter_t * c = vec_elt_at_index (cm->value_at_last_clear, i);

      c[0] = cm->maxi[i];
    }
}

/* Serialization hooks referenced from counter.h; not yet implemented. */
void serialize_vlib_simple_counter_main (serialize_main_t * m, va_list * va)
{
  clib_warning ("unimplemented");
}

void unserialize_vlib_simple_counter_main (serialize_main_t * m, va_list * va)
{
  clib_warning ("unimplemented");
}

void serialize_vlib_combined_counter_main (serialize_main_t * m, va_list * va)
{
  clib_warning ("unimplemented");
}

void unserialize_vlib_combined_counter_main (serialize_main_t * m, va_list * va)
{
  clib_warning ("unimplemented");
}
diff --git a/vlib/vlib/counter.h b/vlib/vlib/counter.h
new file mode 100644
index 00000000000..804757173bb
--- /dev/null
+++ b/vlib/vlib/counter.h
@@ -0,0 +1,336 @@
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * counter.h: simple and packet/byte counters + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_counter_h +#define included_vlib_counter_h + +/* + * Annoyingly enough, counters are created long before + * the CPU configuration is available, so we have to + * preallocate the mini-counter per-cpu vectors + */ +#define VLIB_COUNTER_MAX_CPUS 32 + +typedef struct { + /* Compact counters that (rarely) can overflow. 
 */
  u16 ** minis;

  /* Counters to hold overflow. */
  u64 * maxi;

  /* Counter values as of last clear. */
  u64 * value_at_last_clear;

  /* Values as of last serialize. */
  u64 * value_at_last_serialize;

  /* Last counter index serialized incrementally. */
  u32 last_incremental_serialize_index;

  /* Counter name. */
  char * name;
} vlib_simple_counter_main_t;

/* Bump counter `index' by `increment' on CPU `cpu_index'.
   The per-CPU mini counter is only 16 bits wide: the 32-bit sum is
   stored into it and, if the store truncated (mini[0] != new), the
   full sum is folded into the shared 64-bit maxi counter with an
   atomic add and the mini counter is reset. */
always_inline void
vlib_increment_simple_counter (vlib_simple_counter_main_t * cm,
			       u32 cpu_index,
			       u32 index,
			       u32 increment)
{
  u16 * my_minis;
  u16 * mini;
  u32 old, new;

  my_minis = cm->minis[cpu_index];
  mini = vec_elt_at_index (my_minis, index);
  old = mini[0];
  new = old + increment;
  mini[0] = new;

  /* Truncation on the 16-bit store signals overflow: fold into maxi. */
  if (PREDICT_FALSE (mini[0] != new))
    {
      __sync_fetch_and_add (&cm->maxi[index], new);
      my_minis[index] = 0;
    }
}

/* Total value of counter `index': sum of every CPU's mini counter
   plus the overflow (maxi) counter, minus the value recorded at the
   last clear.
   NOTE(review): reads other CPUs' mini counters without any
   synchronization, so the result is presumably only approximate while
   counters are being incremented — confirm callers only use this for
   stats display. */
always_inline u64
vlib_get_simple_counter (vlib_simple_counter_main_t * cm, u32 index)
{
  u16 *my_minis, *mini;
  u64 v;
  int i;

  ASSERT (index < vec_len (cm->maxi));

  v = 0;

  for (i = 0; i < VLIB_COUNTER_MAX_CPUS; i++)
    {
      my_minis = cm->minis[i];
      mini = vec_elt_at_index (my_minis, index);
      v += mini[0];
    }

  v += cm->maxi[index];

  if (index < vec_len (cm->value_at_last_clear))
    {
      ASSERT (v >= cm->value_at_last_clear[index]);
      v -= cm->value_at_last_clear[index];
    }

  return v;
}

/* Reset counter `index' to zero on every CPU, including the overflow
   counter and the last-clear baseline. */
always_inline void
vlib_zero_simple_counter (vlib_simple_counter_main_t * cm, u32 index)
{
  u16 * my_minis;
  int i;

  ASSERT (index < vec_len (cm->maxi));

  for (i = 0; i < VLIB_COUNTER_MAX_CPUS; i++)
    {
      my_minis = cm->minis[i];
      my_minis[index] = 0;
    }

  cm->maxi[index] = 0;

  if (index < vec_len (cm->value_at_last_clear))
    cm->value_at_last_clear[index] = 0;
}

/* Combined counters hold both packets and byte differences. */
/* Maxi-packet/byte counter.
*/ +typedef struct { + u64 packets, bytes; +} vlib_counter_t; + +always_inline void +vlib_counter_add (vlib_counter_t * a, vlib_counter_t * b) +{ + a->packets += b->packets; + a->bytes += b->bytes; +} + +always_inline void +vlib_counter_sub (vlib_counter_t * a, vlib_counter_t * b) +{ + ASSERT (a->packets >= b->packets); + ASSERT (a->bytes >= b->bytes); + a->packets -= b->packets; + a->bytes -= b->bytes; +} + +always_inline void +vlib_counter_zero (vlib_counter_t * a) +{ a->packets = a->bytes = 0; } + +/* Micro-counter: 16 bits of packets and 16 bits of byte difference. */ +typedef struct { + /* Packet count. */ + u16 packets; + + /* The average packet size hack doesn't work in a multi-core config */ + i16 bytes; +} vlib_mini_counter_t; + +typedef struct { + /* Compact counters that (rarely) can overflow. */ + vlib_mini_counter_t ** minis; + + /* Counters to hold overflow. */ + vlib_counter_t * maxi; + + /* Debug counters for testing. */ + vlib_counter_t * debug; + + /* Counter values as of last clear. */ + vlib_counter_t * value_at_last_clear; + + /* Counter values as of last serialize. */ + vlib_counter_t * value_at_last_serialize; + + /* Last counter index serialized incrementally. */ + u32 last_incremental_serialize_index; + + /* Average packet sizes used in mini-counter byte differences. */ + u32 ave_packet_size; + + /* Current summed packets and bytes for average computation. */ + u32 ave_packets, ave_bytes; + + /* Counter name. 
*/ + char * name; + +} vlib_combined_counter_main_t; + +void vlib_clear_simple_counters (vlib_simple_counter_main_t * cm); +void vlib_clear_combined_counters (vlib_combined_counter_main_t * cm); + +always_inline void +vlib_increment_combined_counter (vlib_combined_counter_main_t * cm, + u32 cpu_index, + u32 index, + u32 packet_increment, + u32 byte_increment) +{ + vlib_mini_counter_t * my_minis, * mini; + u32 old_packets, new_packets; + i32 old_bytes, new_bytes; + + /* Use this CPU's mini counter array */ + my_minis = cm->minis[cpu_index]; + + mini = vec_elt_at_index (my_minis, index); + old_packets = mini->packets; + old_bytes = mini->bytes; + + new_packets = old_packets + packet_increment; + new_bytes = old_bytes + byte_increment; + + mini->packets = new_packets; + mini->bytes = new_bytes; + + /* Bytes always overflow before packets.. */ + if (PREDICT_FALSE (mini->bytes != new_bytes)) + { + vlib_counter_t * maxi = vec_elt_at_index (cm->maxi, index); + + __sync_fetch_and_add (&maxi->packets, new_packets); + __sync_fetch_and_add (&maxi->bytes, new_bytes); + + mini->packets = 0; + mini->bytes = 0; + } +} + +/* This is never done in the speed path */ +static inline void +vlib_get_combined_counter (vlib_combined_counter_main_t * cm, + u32 index, + vlib_counter_t * result) +{ + vlib_mini_counter_t * my_minis, * mini; + vlib_counter_t * maxi; + int i; + + result->packets = 0; + result->bytes = 0; + + for (i = 0; i < VLIB_COUNTER_MAX_CPUS; i++) + { + my_minis = cm->minis[i]; + + mini = vec_elt_at_index (my_minis, index); + result->packets += mini->packets; + result->bytes += mini->bytes; + } + + maxi = vec_elt_at_index (cm->maxi, index); + result->packets += maxi->packets; + result->bytes += maxi->bytes; + + if (index < vec_len (cm->value_at_last_clear)) + vlib_counter_sub (result, &cm->value_at_last_clear[index]); +} + +always_inline void +vlib_zero_combined_counter (vlib_combined_counter_main_t * cm, + u32 index) +{ + vlib_mini_counter_t * mini, * my_minis; + int i; + 
+ for (i = 0; i < VLIB_COUNTER_MAX_CPUS; i++) + { + my_minis = cm->minis[i]; + + mini = vec_elt_at_index (my_minis, index); + mini->packets = 0; + mini->bytes = 0; + } + + vlib_counter_zero (&cm->maxi[index]); + if (index < vec_len (cm->value_at_last_clear)) + vlib_counter_zero (&cm->value_at_last_clear[index]); +} + +/* Initialize/allocate given counter index. + Works for both simple and combined counters. */ +#define vlib_validate_counter_DEPRECATED(cm,index) \ + do { \ + int i; \ + \ + vec_validate ((cm)->minis, VLIB_COUNTER_MAX_CPUS-1); \ + for (i = 0; i < VLIB_COUNTER_MAX_CPUS; i++) \ + vec_validate ((cm)->minis[i], (index)); \ + vec_validate ((cm)->maxi, (index)); \ + } while (0) + +static inline void +vlib_validate_simple_counter (vlib_simple_counter_main_t *cm, u32 index) +{ + int i; + vec_validate (cm->minis, VLIB_COUNTER_MAX_CPUS-1); + for (i = 0; i < VLIB_COUNTER_MAX_CPUS; i++) + vec_validate_aligned (cm->minis[i], index, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (cm->maxi, index, CLIB_CACHE_LINE_BYTES); +} + +static inline void +vlib_validate_combined_counter (vlib_combined_counter_main_t *cm, u32 index) +{ + int i; + vec_validate (cm->minis, VLIB_COUNTER_MAX_CPUS-1); + for (i = 0; i < VLIB_COUNTER_MAX_CPUS; i++) + vec_validate_aligned (cm->minis[i], index, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (cm->maxi, index, CLIB_CACHE_LINE_BYTES); +} + +/* Number of simple/combined counters allocated. */ +#define vlib_counter_len(cm) vec_len((cm)->maxi) + +serialize_function_t serialize_vlib_simple_counter_main, unserialize_vlib_simple_counter_main; +serialize_function_t serialize_vlib_combined_counter_main, unserialize_vlib_combined_counter_main; + +#endif /* included_vlib_counter_h */ diff --git a/vlib/vlib/defs.h b/vlib/vlib/defs.h new file mode 100644 index 00000000000..ff9046861f3 --- /dev/null +++ b/vlib/vlib/defs.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * defs.h: VLIB generic C definitions + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_defs_h +#define included_vlib_defs_h + +/* Receive or transmit. */ +typedef enum { + VLIB_RX, + VLIB_TX, + VLIB_N_RX_TX = 2, /* Used to size arrays. 
*/ +} vlib_rx_or_tx_t; + +#define vlib_foreach_rx_tx(v) for (v = 0; v < VLIB_N_RX_TX; v++) + +/* Read/write. */ +typedef enum { + VLIB_READ, + VLIB_WRITE, +} vlib_read_or_write_t; + +/* Up/down. */ +typedef enum { + VLIB_DOWN = 0, + VLIB_UP = 1, +} vlib_up_or_down_t; + +/* Enable/disable. */ +typedef enum { + VLIB_DISABLE = 0, + VLIB_ENABLE = 1, +} vlib_enable_or_disable_t; + +#endif /* included_vlib_defs_h */ diff --git a/vlib/vlib/dpdk_buffer.c b/vlib/vlib/dpdk_buffer.c new file mode 100644 index 00000000000..dbbd5806fd2 --- /dev/null +++ b/vlib/vlib/dpdk_buffer.c @@ -0,0 +1,1206 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * buffer.c: allocate/free network buffers. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <rte_config.h> + +#include <rte_common.h> +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_memcpy.h> +#include <rte_memzone.h> +#include <rte_tailq.h> +#include <rte_eal.h> +#include <rte_per_lcore.h> +#include <rte_launch.h> +#include <rte_atomic.h> +#include <rte_cycles.h> +#include <rte_prefetch.h> +#include <rte_lcore.h> +#include <rte_per_lcore.h> +#include <rte_branch_prediction.h> +#include <rte_interrupts.h> +#include <rte_pci.h> +#include <rte_random.h> +#include <rte_debug.h> +#include <rte_ether.h> +#include <rte_ethdev.h> +#include <rte_ring.h> +#include <rte_mempool.h> +#include <rte_mbuf.h> + +#include <vlib/vlib.h> + +phys_addr_t __attribute__ ((weak)) rte_mem_virt2phy(); +int __attribute__ ((weak)) rte_eal_has_hugepages(); +unsigned __attribute__ ((weak)) rte_socket_id(); +struct rte_mempool * __attribute__ ((weak)) rte_mempool_create(); +void __attribute__ ((weak)) rte_pktmbuf_init(); +void __attribute__ ((weak)) rte_pktmbuf_pool_init(); + +uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, vlib_buffer_t * b_first) +{ + vlib_buffer_t * b = b_first; + uword l_first = b_first->current_length; + uword l = 0; + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + { + b = vlib_get_buffer (vm, b->next_buffer); + l += b->current_length; + } + b_first->total_length_not_including_first_buffer = l; + b_first->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + return l + l_first; +} + +u8 * format_vlib_buffer (u8 * s, va_list * args) +{ + 
 vlib_buffer_t * b = va_arg (*args, vlib_buffer_t *);
  uword indent = format_get_indent (s);

  /* First buffer of the chain: basic header fields. */
  s = format (s, "current data %d, length %d, free-list %d",
	      b->current_data, b->current_length,
	      b->free_list_index);

  if (b->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID)
    s = format (s, ", totlen-nifb %d",
		b->total_length_not_including_first_buffer);

  if (b->flags & VLIB_BUFFER_IS_TRACED)
    s = format (s, ", trace 0x%x", b->trace_index);

  /* Walk the buffer chain, one output line per continuation segment. */
  while (b->flags & VLIB_BUFFER_NEXT_PRESENT)
    {
      vlib_main_t * vm = vlib_get_main();
      u32 next_buffer = b->next_buffer;
      b = vlib_get_buffer(vm, next_buffer);

      s = format (s, "\n%Unext-buffer 0x%x, segment length %d",
		  format_white_space, indent, next_buffer, b->current_length);
    }


  return s;
}

/* Format function: buffer header (via format_vlib_buffer) followed by
   the first 64 bytes of its current data as hex. */
u8 * format_vlib_buffer_and_data (u8 * s, va_list * args)
{
  vlib_buffer_t * b = va_arg (*args, vlib_buffer_t *);

  s = format (s, "%U, %U",
	      format_vlib_buffer, b,
	      format_hex_bytes, vlib_buffer_get_current (b), 64);

  return s;
}

/* Format function: concatenate the payload of every buffer in the
   chain into the output vector. */
u8 * format_vlib_buffer_contents (u8 * s, va_list * va)
{
  vlib_main_t * vm = va_arg (*va, vlib_main_t *);
  vlib_buffer_t * b = va_arg (*va, vlib_buffer_t *);

  while (1)
    {
      vec_add (s, vlib_buffer_get_current (b),
	       b->current_length);
      if (! (b->flags & VLIB_BUFFER_NEXT_PRESENT))
	break;
      b = vlib_get_buffer (vm, b->next_buffer);
    }

  return s;
}

/* Per-thread vlib_main_t pointers; definition lives here. */
vlib_main_t **vlib_mains;

/* Aligned copy routine. */
void
vlib_aligned_memcpy (void * _dst, void * _src, int n_bytes)
{
  vlib_copy_unit_t * dst = _dst;
  vlib_copy_unit_t * src = _src;

  /* Arguments must be naturally aligned.
*/ + ASSERT (pointer_to_uword (dst) % sizeof (dst[0]) == 0); + ASSERT (pointer_to_uword (src) % sizeof (src[0]) == 0); + ASSERT (n_bytes % sizeof (dst[0]) == 0); + + if (4 * sizeof (dst[0]) == CLIB_CACHE_LINE_BYTES) + { + CLIB_PREFETCH (dst + 0, 4 * sizeof (dst[0]), WRITE); + CLIB_PREFETCH (src + 0, 4 * sizeof (src[0]), READ); + + while (n_bytes >= 4 * sizeof (dst[0])) + { + dst += 4; + src += 4; + n_bytes -= 4 * sizeof (dst[0]); + CLIB_PREFETCH (dst, 4 * sizeof (dst[0]), WRITE); + CLIB_PREFETCH (src, 4 * sizeof (src[0]), READ); + dst[-4] = src[-4]; + dst[-3] = src[-3]; + dst[-2] = src[-2]; + dst[-1] = src[-1]; + } + } + else if (8 * sizeof (dst[0]) == CLIB_CACHE_LINE_BYTES) + { + CLIB_PREFETCH (dst + 0, 8 * sizeof (dst[0]), WRITE); + CLIB_PREFETCH (src + 0, 8 * sizeof (src[0]), READ); + + while (n_bytes >= 8 * sizeof (dst[0])) + { + dst += 8; + src += 8; + n_bytes -= 8 * sizeof (dst[0]); + CLIB_PREFETCH (dst, 8 * sizeof (dst[0]), WRITE); + CLIB_PREFETCH (src, 8 * sizeof (src[0]), READ); + dst[-8] = src[-8]; + dst[-7] = src[-7]; + dst[-6] = src[-6]; + dst[-5] = src[-5]; + dst[-4] = src[-4]; + dst[-3] = src[-3]; + dst[-2] = src[-2]; + dst[-1] = src[-1]; + } + } + else + /* Cache line size unknown: fall back to slow version. */; + + while (n_bytes > 0) + { + *dst++ = *src++; + n_bytes -= 1 * sizeof (dst[0]); + } +} + +#define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32)) + +/* Make sure we have at least given number of unaligned buffers. */ +static void +fill_unaligned (vlib_main_t * vm, + vlib_buffer_free_list_t * free_list, + uword n_unaligned_buffers) +{ + word la = vec_len (free_list->aligned_buffers); + word lu = vec_len (free_list->unaligned_buffers); + + /* Aligned come in aligned copy-sized chunks. */ + ASSERT (la % BUFFERS_PER_COPY == 0); + + ASSERT (la >= n_unaligned_buffers); + + while (lu < n_unaligned_buffers) + { + /* Copy 4 buffers from end of aligned vector to unaligned vector. 
 */
      vec_add (free_list->unaligned_buffers,
	       free_list->aligned_buffers + la - BUFFERS_PER_COPY,
	       BUFFERS_PER_COPY);
      la -= BUFFERS_PER_COPY;
      lu += BUFFERS_PER_COPY;
    }
  _vec_len (free_list->aligned_buffers) = la;
}

/* After free aligned buffers may not contain even sized chunks. */
/* Restore the invariant that aligned_buffers holds a whole number of
   BUFFERS_PER_COPY-sized chunks: first fold unaligned_buffers back
   into aligned_buffers, then move the trailing remainder
   (len % BUFFERS_PER_COPY) back out to unaligned_buffers. */
static void
trim_aligned (vlib_buffer_free_list_t * f)
{
  uword l, n_trim;

  /* Add unaligned to aligned before trim. */
  l = vec_len (f->unaligned_buffers);
  if (l > 0)
    {
      vec_add_aligned (f->aligned_buffers, f->unaligned_buffers, l,
		       /* align */ sizeof (vlib_copy_unit_t));

      _vec_len (f->unaligned_buffers) = 0;
    }

  /* Remove unaligned buffers from end of aligned vector and save for next trim. */
  l = vec_len (f->aligned_buffers);
  n_trim = l % BUFFERS_PER_COPY;
  if (n_trim)
    {
      /* Trim aligned -> unaligned. */
      vec_add (f->unaligned_buffers, f->aligned_buffers + l - n_trim, n_trim);

      /* Remove from aligned. */
      _vec_len (f->aligned_buffers) = l - n_trim;
    }
}

/* Move every buffer index from src's aligned and unaligned vectors
   onto dst, then free src's vectors.  Both lists are trimmed first so
   the aligned portion stays in BUFFERS_PER_COPY-sized chunks and the
   bulk copy can use vlib_aligned_memcpy. */
static void
merge_free_lists (vlib_buffer_free_list_t * dst,
		  vlib_buffer_free_list_t * src)
{
  uword l;
  u32 * d;

  trim_aligned (src);
  trim_aligned (dst);

  l = vec_len (src->aligned_buffers);
  if (l > 0)
    {
      vec_add2_aligned (dst->aligned_buffers, d, l,
			/* align */ sizeof (vlib_copy_unit_t));
      vlib_aligned_memcpy (d, src->aligned_buffers, l * sizeof (d[0]));
      vec_free (src->aligned_buffers);
    }

  l = vec_len (src->unaligned_buffers);
  if (l > 0)
    {
      vec_add (dst->unaligned_buffers, src->unaligned_buffers, l);
      vec_free (src->unaligned_buffers);
    }
}

/* Look up the public free list whose rounded buffer size equals
   `size'; returns its index, or ~0 if no such list exists. */
always_inline u32
vlib_buffer_get_free_list_with_size (vlib_main_t * vm, u32 size)
{
  vlib_buffer_main_t * bm = vm->buffer_main;

  size = vlib_buffer_round_size (size);
  uword * p = hash_get (bm->free_list_by_size, size);
  return p ? p[0] : ~0;
}

/* Add buffer free list.
 */
/* Create a buffer free list of (rounded) data size n_data_bytes and return
   its index. If the pool is empty and this is not the default-list creation
   itself, the default list is created first so it always occupies
   VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX. A public request matching the default
   size is satisfied by the default list. */
static u32
vlib_buffer_create_free_list_helper (vlib_main_t * vm,
                                     u32 n_data_bytes,
                                     u32 is_public,
                                     u32 is_default,
                                     u8 * name)
{
  vlib_buffer_main_t * bm = vm->buffer_main;
  vlib_buffer_free_list_t * f;

  if (! is_default && pool_elts (bm->buffer_free_list_pool) == 0)
    {
      u32 default_free_free_list_index;

      default_free_free_list_index =
        vlib_buffer_create_free_list_helper (vm,
                                             /* default buffer size */ VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES,
                                             /* is_public */ 1,
                                             /* is_default */ 1,
                                             (u8 *) "default");
      ASSERT (default_free_free_list_index == VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);

      if (n_data_bytes == VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES && is_public)
        return default_free_free_list_index;
    }

  pool_get_aligned (bm->buffer_free_list_pool, f, CLIB_CACHE_LINE_BYTES);

  memset (f, 0, sizeof (f[0]));
  f->index = f - bm->buffer_free_list_pool;
  f->n_data_bytes = vlib_buffer_round_size (n_data_bytes);
  f->min_n_buffers_each_physmem_alloc = 16;
  /* Take ownership of heap-allocated names; copy static strings. */
  f->name = clib_mem_is_heap_object (name) ? name : format (0, "%s", name);

  /* Setup free buffer template. */
  f->buffer_init_template.free_list_index = f->index;

  if (is_public)
    {
      /* First public list of a given size becomes the canonical one. */
      uword * p = hash_get (bm->free_list_by_size, f->n_data_bytes);
      if (! p)
        hash_set (bm->free_list_by_size, f->n_data_bytes, f->index);
    }

  return f->index;
}

/* Create a private (non-shared) free list named by the format string. */
u32 vlib_buffer_create_free_list (vlib_main_t * vm, u32 n_data_bytes,
                                  char * fmt, ...)
{
  va_list va;
  u8 * name;

  va_start (va, fmt);
  name = va_format (0, fmt, &va);
  va_end (va);

  return vlib_buffer_create_free_list_helper (vm, n_data_bytes,
                                              /* is_public */ 0,
                                              /* is_default */ 0,
                                              name);
}

/* Return the public free list for n_data_bytes, creating it on first use. */
u32 vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes,
                                         char * fmt, ...)
{
  u32 i = vlib_buffer_get_free_list_with_size (vm, n_data_bytes);

  if (i == ~0)
    {
      va_list va;
      u8 * name;

      va_start (va, fmt);
      name = va_format (0, fmt, &va);
      va_end (va);

      i = vlib_buffer_create_free_list_helper (vm, n_data_bytes,
                                               /* is_public */ 1,
                                               /* is_default */ 0,
                                               name);
    }

  return i;
}

/* Release every buffer on the free list back to its DPDK mempool and free
   the list's own vectors. The rte_mbuf header sits immediately before the
   vlib_buffer_t, hence the ((struct rte_mbuf *)b)-1 recovery. */
static void
del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f)
{
  u32 i;
  struct rte_mbuf *mb;
  vlib_buffer_t *b;

  for (i = 0; i < vec_len (f->unaligned_buffers); i++) {
    b = vlib_get_buffer (vm, f->unaligned_buffers[i]);
    mb = ((struct rte_mbuf *)b)-1;
    rte_pktmbuf_free (mb);
  }
  for (i = 0; i < vec_len (f->aligned_buffers); i++) {
    b = vlib_get_buffer (vm, f->aligned_buffers[i]);
    mb = ((struct rte_mbuf *)b)-1;
    rte_pktmbuf_free (mb);
  }
  vec_free (f->name);
  vec_free (f->unaligned_buffers);
  vec_free (f->aligned_buffers);
}

/* Delete a buffer free list. If another public list of the same size
   exists, hand our buffers to it first; otherwise they are returned to
   the mempool by del_free_list. */
void vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index)
{
  vlib_buffer_main_t * bm = vm->buffer_main;
  vlib_buffer_free_list_t * f;
  u32 merge_index;

  f = vlib_buffer_get_free_list (vm, free_list_index);

  merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes);
  if (merge_index != ~0 && merge_index != free_list_index)
    {
      merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool,
                                           merge_index), f);
    }

  del_free_list (vm, f);

  /* Poison it. */
  memset (f, 0xab, sizeof (f[0]));

  pool_put (bm->buffer_free_list_pool, f);
}

/* Make sure free list has at least given number of free buffers. */
static uword
fill_free_list (vlib_main_t * vm,
                vlib_buffer_free_list_t * fl,
                uword min_free_buffers)
{
  vlib_buffer_t * b;
  int n, i;
  u32 bi;
  u32 n_remaining, n_alloc;
  unsigned socket_id = rte_socket_id ? rte_socket_id() : 0;
  struct rte_mempool *rmp = vm->buffer_main->pktmbuf_pools[socket_id];
  struct rte_mbuf *mb;

  /* Too early?
*/ + if (PREDICT_FALSE(rmp == 0)) + return 0; + + trim_aligned (fl); + + /* Already have enough free buffers on free list? */ + n = min_free_buffers - vec_len (fl->aligned_buffers); + if (n <= 0) + return min_free_buffers; + + /* Always allocate round number of buffers. */ + n = round_pow2 (n, BUFFERS_PER_COPY); + + /* Always allocate new buffers in reasonably large sized chunks. */ + n = clib_max (n, fl->min_n_buffers_each_physmem_alloc); + + vec_validate (vm->mbuf_alloc_list, n-1); + + if (rte_mempool_get_bulk (rmp, vm->mbuf_alloc_list, n) < 0) + return 0; + + _vec_len (vm->mbuf_alloc_list) = n; + + for (i = 0; i < n; i++) + { + mb = vm->mbuf_alloc_list[i]; + + ASSERT(rte_mbuf_refcnt_read(mb) == 0); + rte_mbuf_refcnt_set(mb, 1); + mb->next = NULL; + mb->data_off = RTE_PKTMBUF_HEADROOM; + mb->nb_segs = 1; + + b = (vlib_buffer_t *)(mb+1); + bi = vlib_get_buffer_index (vm, b); + + vec_add1_aligned (fl->aligned_buffers, bi, sizeof (vlib_copy_unit_t)); + n_alloc++; + n_remaining--; + + vlib_buffer_init_for_free_list (b, fl); + + if (fl->buffer_init_function) + fl->buffer_init_function (vm, fl, &bi, 1); + } + + fl->n_alloc += n; + + return n; +} + +always_inline uword +copy_alignment (u32 * x) +{ return (pointer_to_uword (x) / sizeof (x[0])) % BUFFERS_PER_COPY; } + +static u32 +alloc_from_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * free_list, + u32 * alloc_buffers, + u32 n_alloc_buffers) +{ + u32 * dst, * u_src; + uword u_len, n_left; + uword n_unaligned_start, n_unaligned_end, n_filled; + + n_left = n_alloc_buffers; + dst = alloc_buffers; + n_unaligned_start = ((BUFFERS_PER_COPY - copy_alignment (dst)) + & (BUFFERS_PER_COPY - 1)); + + n_filled = fill_free_list (vm, free_list, n_alloc_buffers); + if (n_filled == 0) + return 0; + + n_left = n_filled < n_left ? 
n_filled : n_left; + n_alloc_buffers = n_left; + + if (n_unaligned_start >= n_left) + { + n_unaligned_start = n_left; + n_unaligned_end = 0; + } + else + n_unaligned_end = copy_alignment (dst + n_alloc_buffers); + + fill_unaligned (vm, free_list, n_unaligned_start + n_unaligned_end); + + u_len = vec_len (free_list->unaligned_buffers); + u_src = free_list->unaligned_buffers + u_len - 1; + + if (n_unaligned_start) + { + uword n_copy = n_unaligned_start; + if (n_copy > n_left) + n_copy = n_left; + n_left -= n_copy; + + while (n_copy > 0) + { + *dst++ = *u_src--; + n_copy--; + u_len--; + } + + /* Now dst should be aligned. */ + if (n_left > 0) + ASSERT (pointer_to_uword (dst) % sizeof (vlib_copy_unit_t) == 0); + } + + /* Aligned copy. */ + { + vlib_copy_unit_t * d, * s; + uword n_copy; + + if (vec_len(free_list->aligned_buffers) < ((n_left/BUFFERS_PER_COPY)*BUFFERS_PER_COPY)) + abort(); + + n_copy = n_left / BUFFERS_PER_COPY; + n_left = n_left % BUFFERS_PER_COPY; + + /* Remove buffers from aligned free list. */ + _vec_len (free_list->aligned_buffers) -= n_copy * BUFFERS_PER_COPY; + + s = (vlib_copy_unit_t *) vec_end (free_list->aligned_buffers); + d = (vlib_copy_unit_t *) dst; + + /* Fast path loop. */ + while (n_copy >= 4) + { + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + n_copy -= 4; + s += 4; + d += 4; + } + + while (n_copy >= 1) + { + d[0] = s[0]; + n_copy -= 1; + s += 1; + d += 1; + } + + dst = (void *) d; + } + + /* Unaligned copy. */ + ASSERT (n_unaligned_end == n_left); + while (n_left > 0) + { + *dst++ = *u_src--; + n_left--; + u_len--; + } + + if (! free_list->unaligned_buffers) + ASSERT (u_len == 0); + else + _vec_len (free_list->unaligned_buffers) = u_len; + + return n_alloc_buffers; +} + +/* Allocate a given number of buffers into given array. + Returns number actually allocated which will be either zero or + number requested. 
 */
/* Allocate n_buffers indices from the default free list into buffers. */
u32 vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers)
{
  vlib_buffer_main_t * bm = vm->buffer_main;

  return alloc_from_free_list
    (vm,
     pool_elt_at_index (bm->buffer_free_list_pool,
                        VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX),
     buffers, n_buffers);
}

/* Allocate n_buffers indices from a specific free list into buffers. */
u32 vlib_buffer_alloc_from_free_list (vlib_main_t * vm,
                                      u32 * buffers,
                                      u32 n_buffers,
                                      u32 free_list_index)
{
  vlib_buffer_main_t * bm = vm->buffer_main;
  vlib_buffer_free_list_t * f;
  f = pool_elt_at_index (bm->buffer_free_list_pool, free_list_index);
  return alloc_from_free_list (vm, f, buffers, n_buffers);
}

/* Push one buffer index onto free list f, optionally re-initializing the
   buffer from the list's template first. */
always_inline void
add_buffer_to_free_list (vlib_main_t * vm,
                         vlib_buffer_free_list_t * f,
                         u32 buffer_index, u8 do_init)
{
  vlib_buffer_t * b;
  b = vlib_get_buffer (vm, buffer_index);
  if (PREDICT_TRUE(do_init))
    vlib_buffer_init_for_free_list (b, f);
  vec_add1_aligned (f->aligned_buffers, buffer_index, sizeof (vlib_copy_unit_t));
}

/* Resolve the free list a buffer belongs to; also returns its index via
   *index. */
always_inline vlib_buffer_free_list_t *
buffer_get_free_list (vlib_main_t * vm, vlib_buffer_t * b, u32 * index)
{
  vlib_buffer_main_t * bm = vm->buffer_main;
  u32 i;

  *index = i = b->free_list_index;
  return pool_elt_at_index (bm->buffer_free_list_pool, i);
}

/* Install a buffer-free intercept callback; returns the previous one so
   callers can chain or restore it. */
void *vlib_set_buffer_free_callback (vlib_main_t *vm, void *fp)
{
  vlib_buffer_main_t * bm = vm->buffer_main;
  void * rv = bm->buffer_free_callback;

  bm->buffer_free_callback = fp;
  return rv;
}

/* Free n_buffers buffer indices. If an intercept callback is installed it
   may filter/shorten the list first. Buffers whose free list has a
   buffers_added_to_freelist_function (multicast recycle) go back onto that
   free list and the list is announced once at the end; all others are
   returned straight to the DPDK mempool.
   NOTE(review): follow_buffer_next is only forwarded to the callback here;
   this DPDK path relies on rte_pktmbuf_free to walk chained segments —
   confirm against the non-DPDK variant. */
static_always_inline void
vlib_buffer_free_inline (vlib_main_t * vm,
                         u32 * buffers,
                         u32 n_buffers,
                         u32 follow_buffer_next)
{
  vlib_buffer_main_t * bm = vm->buffer_main;
  vlib_buffer_free_list_t * fl;
  u32 fi;
  int i;
  u32 (*cb)(vlib_main_t * vm, u32 * buffers, u32 n_buffers,
            u32 follow_buffer_next);

  cb = bm->buffer_free_callback;

  if (PREDICT_FALSE (cb != 0))
    n_buffers = (*cb)(vm, buffers, n_buffers, follow_buffer_next);

  if (! n_buffers)
    return;

  for (i = 0; i < n_buffers; i++)
    {
      vlib_buffer_t * b;
      struct rte_mbuf * mb;

      b = vlib_get_buffer (vm, buffers[i]);

      fl = buffer_get_free_list (vm, b, &fi);

      /* The only current use of this callback: multicast recycle */
      if (PREDICT_FALSE (fl->buffers_added_to_freelist_function != 0))
        {
          int j;

          /* Only re-init buffers that are not clones. */
          add_buffer_to_free_list (vm, fl, buffers[i], b->clone_count == 0);

          /* Announce each free list at most once per call. */
          for (j = 0; j < vec_len (bm->announce_list); j++)
            {
              if (fl == bm->announce_list[j])
                goto already_announced;
            }
          vec_add1 (bm->announce_list, fl);
        already_announced:
          ;
        }
      else
        {
          /* rte_mbuf header precedes the vlib_buffer_t. */
          mb = ((struct rte_mbuf *)b)-1;
          rte_pktmbuf_free (mb);
        }
    }
  if (vec_len(bm->announce_list))
    {
      vlib_buffer_free_list_t * fl;
      for (i = 0; i < vec_len (bm->announce_list); i++)
        {
          fl = bm->announce_list[i];
          fl->buffers_added_to_freelist_function (vm, fl);
        }
      _vec_len(bm->announce_list) = 0;
    }
}

/* Free buffers, following next-buffer chains. */
void vlib_buffer_free (vlib_main_t * vm,
                       u32 * buffers,
                       u32 n_buffers)
{
  vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ 1);
}

/* Free buffers without following next-buffer chains. */
void vlib_buffer_free_no_next (vlib_main_t * vm,
                               u32 * buffers,
                               u32 n_buffers)
{
  vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ 0);
}

/* Copy template packet data into buffers as they are allocated.
*/ +__attribute__((unused)) static void +vlib_packet_template_buffer_init (vlib_main_t * vm, + vlib_buffer_free_list_t * fl, + u32 * buffers, + u32 n_buffers) +{ + vlib_packet_template_t * t = uword_to_pointer (fl->buffer_init_function_opaque, + vlib_packet_template_t *); + uword i; + + for (i = 0; i < n_buffers; i++) + { + vlib_buffer_t * b = vlib_get_buffer (vm, buffers[i]); + ASSERT (b->current_length == vec_len (t->packet_data)); + memcpy (vlib_buffer_get_current (b), t->packet_data, b->current_length); + } +} + +void vlib_packet_template_init (vlib_main_t * vm, + vlib_packet_template_t * t, + void * packet_data, + uword n_packet_data_bytes, + uword min_n_buffers_each_physmem_alloc, + char * fmt, + ...) +{ + va_list va; + __attribute__((unused)) u8 * name; + + va_start (va, fmt); + name = va_format (0, fmt, &va); + va_end (va); + + vlib_worker_thread_barrier_sync(vm); + memset (t, 0, sizeof (t[0])); + + vec_add (t->packet_data, packet_data, n_packet_data_bytes); + + vlib_worker_thread_barrier_release(vm); +} + +void * +vlib_packet_template_get_packet (vlib_main_t * vm, + vlib_packet_template_t * t, + u32 * bi_result) +{ + u32 bi; + vlib_buffer_t * b; + + if (vlib_buffer_alloc (vm, &bi, 1) != 1) + return 0; + + *bi_result = bi; + + b = vlib_get_buffer (vm, bi); + memcpy (vlib_buffer_get_current (b), + t->packet_data, vec_len(t->packet_data)); + b->current_length = vec_len(t->packet_data); + + /* Fix up mbuf header length fields */ + struct rte_mbuf * mb; + mb = ((struct rte_mbuf *)b) - 1; + mb->data_len = b->current_length; + mb->pkt_len = b->current_length; + + return b->data; +} + +/* Append given data to end of buffer, possibly allocating new buffers. 
 */
/* Append n_data_bytes of data to the buffer chain headed by buffer_index,
   allocating additional buffers from free_list_index as needed. Passing
   buffer_index == 0 means "start a new chain" (0 doubles as the no-buffer
   sentinel here). Returns the head buffer index; calls clib_error on
   allocation failure. */
u32 vlib_buffer_add_data (vlib_main_t * vm,
                          u32 free_list_index,
                          u32 buffer_index,
                          void * data, u32 n_data_bytes)
{
  u32 n_buffer_bytes, n_left, n_left_this_buffer, bi;
  vlib_buffer_t * b;
  void * d;

  bi = buffer_index;
  if (bi == 0
      && 1 != vlib_buffer_alloc_from_free_list (vm, &bi, 1, free_list_index))
    goto out_of_buffers;

  d = data;
  n_left = n_data_bytes;
  n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, free_list_index);

  b = vlib_get_buffer (vm, bi);
  /* Total length will change; invalidate any cached value. */
  b->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID;

  /* Get to the end of the chain before we try to append data...*/
  while (b->flags & VLIB_BUFFER_NEXT_PRESENT)
    b = vlib_get_buffer (vm, b->next_buffer);

  while (1)
    {
      u32 n;

      ASSERT (n_buffer_bytes >= b->current_length);
      /* Space left = buffer capacity minus data offset and bytes already
         present. */
      n_left_this_buffer = n_buffer_bytes - (b->current_data + b->current_length);
      n = clib_min (n_left_this_buffer, n_left);
      memcpy (vlib_buffer_get_current (b) + b->current_length, d, n);
      b->current_length += n;
      n_left -= n;
      if (n_left == 0)
        break;

      d += n;
      if (1 != vlib_buffer_alloc_from_free_list (vm, &b->next_buffer, 1, free_list_index))
        goto out_of_buffers;

      b->flags |= VLIB_BUFFER_NEXT_PRESENT;

      b = vlib_get_buffer (vm, b->next_buffer);
    }

  return bi;

 out_of_buffers:
  clib_error ("out of buffers");
  return bi;
}

/* Create (or reuse) the per-NUMA-socket DPDK pktmbuf pool and fold its
   virtual address range into the physmem range used for buffer indexing.
   Falls back to another socket's pool (with a warning) if creation fails. */
clib_error_t *
vlib_buffer_pool_create(vlib_main_t * vm, unsigned num_mbufs,
                        unsigned mbuf_size, unsigned socket_id)
{
  vlib_buffer_main_t * bm = vm->buffer_main;
  vlib_physmem_main_t * vpm = &vm->physmem_main;
  struct rte_mempool * rmp;
  uword new_start, new_size;
  int i;

  /* Weak-symbol check: DPDK not linked in at all. */
  if (!rte_mempool_create)
    return clib_error_return (0, "not linked with DPDK");

  vec_validate_aligned(bm->pktmbuf_pools, socket_id, CLIB_CACHE_LINE_BYTES);

  /* pool already exists, nothing to do */
  if (bm->pktmbuf_pools[socket_id])
    return 0;

  u8 * pool_name = format(0, "mbuf_pool_socket%u%c",socket_id, 0);
  /* 512 = per-lcore cache size. */
  rmp = rte_mempool_create((char *) pool_name,
                           num_mbufs, mbuf_size, 512,
                           sizeof(struct rte_pktmbuf_pool_private),
                           rte_pktmbuf_pool_init, NULL,
                           rte_pktmbuf_init, NULL,
                           socket_id, 0);
  vec_free(pool_name);

  if (rmp)
    {
      new_start = pointer_to_uword(rmp);
      new_size = rmp->elt_va_end - new_start;

      if (vpm->virtual.size > 0)
        {
          ASSERT(new_start != vpm->virtual.start);
          /* Grow the tracked virtual range to cover the new pool. */
          if (new_start < vpm->virtual.start)
            {
              new_size = vpm->virtual.size + vpm->virtual.start - new_start;
            }
          else
            {
              new_size += new_start - vpm->virtual.start;
              new_start = vpm->virtual.start;
            }

          /* check if fits into buffer index range */
          if (new_size > ( (uword) 1 << (32 + CLIB_LOG2_CACHE_LINE_BYTES)))
            rmp = 0;
        }
    }

  if (rmp)
    {
      bm->pktmbuf_pools[socket_id] = rmp;
      vpm->virtual.start = new_start;
      vpm->virtual.size = new_size;
      vpm->virtual.end = new_start + new_size;
      return 0;
    }

  /* no usable pool for this socket, try to use pool from another one */
  for (i = 0; i < vec_len(bm->pktmbuf_pools); i++)
    {
      if(bm->pktmbuf_pools[i])
        {
          clib_warning("WARNING: Failed to allocate mempool for CPU socket %u. 
"
                       "Threads running on socket %u will use socket %u mempool.",
                       socket_id, socket_id, i);
          bm->pktmbuf_pools[socket_id] = bm->pktmbuf_pools[i];
          return 0;
        }
    }

  return clib_error_return (0, "failed to allocate mempool on socket %u",
                            socket_id);
}


/* Serialize-stream data callback (tx direction): flush the bytes written
   into the current buffer, chain on a fresh buffer for further writes, and
   ship the completed chain to the configured next node when the stream ends
   or the chain reaches its size limit. */
static void vlib_serialize_tx (serialize_main_header_t * m, serialize_stream_t * s)
{
  vlib_main_t * vm;
  vlib_serialize_buffer_main_t * sm;
  uword n, n_bytes_to_write;
  vlib_buffer_t * last;

  n_bytes_to_write = s->current_buffer_index;
  sm = uword_to_pointer (s->data_function_opaque, vlib_serialize_buffer_main_t *);
  vm = sm->vlib_main;

  ASSERT (sm->tx.max_n_data_bytes_per_chain > 0);
  if (serialize_stream_is_end_of_stream (s)
      || sm->tx.n_total_data_bytes + n_bytes_to_write > sm->tx.max_n_data_bytes_per_chain)
    {
      /* Chain complete: finalize the last buffer and enqueue the chain. */
      vlib_process_t * p = vlib_get_current_process (vm);

      last = vlib_get_buffer (vm, sm->last_buffer);
      last->current_length = n_bytes_to_write;

      vlib_set_next_frame_buffer (vm, &p->node_runtime, sm->tx.next_index, sm->first_buffer);

      sm->first_buffer = sm->last_buffer = ~0;
      sm->tx.n_total_data_bytes = 0;
    }

  else if (n_bytes_to_write == 0 && s->n_buffer_bytes == 0)
    {
      /* First call on a fresh stream: allocate the head buffer. */
      ASSERT (sm->first_buffer == ~0);
      ASSERT (sm->last_buffer == ~0);
      n = vlib_buffer_alloc_from_free_list (vm, &sm->first_buffer, 1, sm->tx.free_list_index);
      if (n != 1)
        serialize_error (m, clib_error_create ("vlib_buffer_alloc_from_free_list fails"));
      sm->last_buffer = sm->first_buffer;
      s->n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, sm->tx.free_list_index);
    }

  if (n_bytes_to_write > 0)
    {
      /* Current buffer is full: close it and chain a new one. */
      vlib_buffer_t * prev = vlib_get_buffer (vm, sm->last_buffer);
      n = vlib_buffer_alloc_from_free_list (vm, &sm->last_buffer, 1, sm->tx.free_list_index);
      if (n != 1)
        serialize_error (m, clib_error_create ("vlib_buffer_alloc_from_free_list fails"));
      sm->tx.n_total_data_bytes += n_bytes_to_write;
      prev->current_length = n_bytes_to_write;
      prev->next_buffer = sm->last_buffer;
      prev->flags |= VLIB_BUFFER_NEXT_PRESENT;
    }

  if (sm->last_buffer != ~0)
    {
      /* Point the stream at the (new) last buffer's data area. */
      last = vlib_get_buffer (vm, sm->last_buffer);
      s->buffer = vlib_buffer_get_current (last);
      s->current_buffer_index = 0;
      ASSERT (last->current_data == s->current_buffer_index);
    }
}

/* Serialize-stream data callback (rx direction): advance to the next
   buffer of the current chain, freeing a finished chain; when no chain is
   in progress, block the process until one arrives on the rx fifo. */
static void vlib_serialize_rx (serialize_main_header_t * m, serialize_stream_t * s)
{
  vlib_main_t * vm;
  vlib_serialize_buffer_main_t * sm;
  vlib_buffer_t * last;

  sm = uword_to_pointer (s->data_function_opaque, vlib_serialize_buffer_main_t *);
  vm = sm->vlib_main;

  if (serialize_stream_is_end_of_stream (s))
    return;

  if (sm->last_buffer != ~0)
    {
      last = vlib_get_buffer (vm, sm->last_buffer);

      if (last->flags & VLIB_BUFFER_NEXT_PRESENT)
        sm->last_buffer = last->next_buffer;
      else
        {
          /* End of chain: free it and look for the next one. */
          vlib_buffer_free (vm, &sm->first_buffer, /* count */ 1);
          sm->first_buffer = sm->last_buffer = ~0;
        }
    }

  if (sm->last_buffer == ~0)
    {
      /* Suspend this process until a buffer chain is queued. */
      while (clib_fifo_elts (sm->rx.buffer_fifo) == 0)
        {
          sm->rx.ready_one_time_event = vlib_process_create_one_time_event (vm, vlib_current_process (vm), ~0);
          vlib_process_wait_for_one_time_event (vm, /* no event data */ 0, sm->rx.ready_one_time_event);
        }

      clib_fifo_sub1 (sm->rx.buffer_fifo, sm->first_buffer);
      sm->last_buffer = sm->first_buffer;
    }

  ASSERT (sm->last_buffer != ~0);

  last = vlib_get_buffer (vm, sm->last_buffer);
  s->current_buffer_index = 0;
  s->buffer = vlib_buffer_get_current (last);
  s->n_buffer_bytes = last->current_length;
}

/* Common setup for (un)serialize over vlib buffers. */
static void
serialize_open_vlib_helper (serialize_main_t * m,
                            vlib_main_t * vm,
                            vlib_serialize_buffer_main_t * sm,
                            uword is_read)
{
  /* Initialize serialize main but save overflow buffer for re-use between calls.
*/ + { + u8 * save = m->stream.overflow_buffer; + memset (m, 0, sizeof (m[0])); + m->stream.overflow_buffer = save; + if (save) + _vec_len (save) = 0; + } + + sm->first_buffer = sm->last_buffer = ~0; + if (is_read) + clib_fifo_reset (sm->rx.buffer_fifo); + else + sm->tx.n_total_data_bytes = 0; + sm->vlib_main = vm; + m->header.data_function = is_read ? vlib_serialize_rx : vlib_serialize_tx; + m->stream.data_function_opaque = pointer_to_uword (sm); +} + +void serialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, vlib_serialize_buffer_main_t * sm) +{ serialize_open_vlib_helper (m, vm, sm, /* is_read */ 0); } + +void unserialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, vlib_serialize_buffer_main_t * sm) +{ serialize_open_vlib_helper (m, vm, sm, /* is_read */ 1); } + +u32 serialize_close_vlib_buffer (serialize_main_t * m) +{ + vlib_serialize_buffer_main_t * sm + = uword_to_pointer (m->stream.data_function_opaque, vlib_serialize_buffer_main_t *); + vlib_buffer_t * last; + serialize_stream_t * s = &m->stream; + + last = vlib_get_buffer (sm->vlib_main, sm->last_buffer); + last->current_length = s->current_buffer_index; + + if (vec_len (s->overflow_buffer) > 0) + { + sm->last_buffer + = vlib_buffer_add_data (sm->vlib_main, sm->tx.free_list_index, + sm->last_buffer == ~0 ? 
0 : sm->last_buffer, + s->overflow_buffer, + vec_len (s->overflow_buffer)); + _vec_len (s->overflow_buffer) = 0; + } + + return sm->first_buffer; +} + +void unserialize_close_vlib_buffer (serialize_main_t * m) +{ + vlib_serialize_buffer_main_t * sm + = uword_to_pointer (m->stream.data_function_opaque, vlib_serialize_buffer_main_t *); + if (sm->first_buffer != ~0) + vlib_buffer_free_one (sm->vlib_main, sm->first_buffer); + clib_fifo_reset (sm->rx.buffer_fifo); + if (m->stream.overflow_buffer) + _vec_len (m->stream.overflow_buffer) = 0; +} + +static u8 * format_vlib_buffer_free_list (u8 * s, va_list * va) +{ + vlib_buffer_free_list_t * f = va_arg (*va, vlib_buffer_free_list_t *); + u32 threadnum= va_arg (*va, u32); + uword bytes_alloc, bytes_free, n_free, size; + + if (! f) + return format (s, "%=7s%=30s%=12s%=12s%=12s%=12s%=12s%=12s", + "Thread", "Name", "Index", "Size", "Alloc", "Free", "#Alloc", "#Free"); + + size = sizeof (vlib_buffer_t) + f->n_data_bytes; + n_free = vec_len (f->aligned_buffers) + vec_len (f->unaligned_buffers); + bytes_alloc = size * f->n_alloc; + bytes_free = size * n_free; + + s = format (s, "%7d%30s%12d%12d%=12U%=12U%=12d%=12d", + threadnum, + f->name, f->index, f->n_data_bytes, + format_memory_size, bytes_alloc, + format_memory_size, bytes_free, + f->n_alloc, n_free); + + return s; +} + +static clib_error_t * +show_buffers (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_buffer_main_t * bm; + vlib_buffer_free_list_t * f; + vlib_main_t *curr_vm; + u32 vm_index = 0; + + vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, 0, 0); + + do { + curr_vm = vec_len(vlib_mains) ? 
vlib_mains[vm_index] : vm; + bm = curr_vm->buffer_main; + + pool_foreach (f, bm->buffer_free_list_pool, ({ + vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, f, vm_index); + })); + + vm_index++; + } while (vm_index < vec_len(vlib_mains)); + + return 0; +} + +VLIB_CLI_COMMAND (show_buffers_command, static) = { + .path = "show buffers", + .short_help = "Show packet buffer allocation", + .function = show_buffers, +}; + +#if CLIB_DEBUG > 0 + +u32 * vlib_buffer_state_validation_lock; +uword * vlib_buffer_state_validation_hash; +void * vlib_buffer_state_heap; + +static clib_error_t * +buffer_state_validation_init (vlib_main_t * vm) +{ + void * oldheap; + + vlib_buffer_state_heap = mheap_alloc (0, 10<<20); + + oldheap = clib_mem_set_heap (vlib_buffer_state_heap); + + vlib_buffer_state_validation_hash = hash_create (0, sizeof(uword)); + vec_validate_aligned (vlib_buffer_state_validation_lock, 0, + CLIB_CACHE_LINE_BYTES); + clib_mem_set_heap (oldheap); + return 0; +} + +VLIB_INIT_FUNCTION (buffer_state_validation_init); +#endif diff --git a/vlib/vlib/error.c b/vlib/vlib/error.c new file mode 100644 index 00000000000..59b89cefc3a --- /dev/null +++ b/vlib/vlib/error.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * error.c: VLIB error handler + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vlib/vlib.h> +#include <vppinfra/heap.h> + +uword +vlib_error_drop_buffers (vlib_main_t * vm, + vlib_node_runtime_t * node, + u32 * buffers, + u32 next_buffer_stride, + u32 n_buffers, + u32 next_index, + u32 drop_error_node, + u32 drop_error_code) +{ + u32 n_left_this_frame, n_buffers_left, * args, n_args_left; + vlib_error_t drop_error; + + drop_error = vlib_error_set (drop_error_node, drop_error_code); + + n_buffers_left = n_buffers; + while (n_buffers_left > 0) + { + vlib_get_next_frame (vm, node, next_index, args, n_args_left); + + n_left_this_frame = clib_min (n_buffers_left, n_args_left); + n_buffers_left -= n_left_this_frame; + n_args_left -= n_left_this_frame; + + while (n_left_this_frame >= 4) + { + u32 bi0, bi1, bi2, bi3; + vlib_buffer_t * b0, * b1, * b2, * b3; + + args[0] = bi0 = buffers[0]; + args[1] = bi1 = buffers[1]; + args[2] = bi2 = buffers[2]; + args[3] = bi3 = buffers[3]; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + b2 = vlib_get_buffer (vm, bi2); + b3 = vlib_get_buffer (vm, bi3); + + b0->error = drop_error; + b1->error = drop_error; + b2->error = drop_error; + b3->error = drop_error; + + buffers += 4; + args += 4; + n_left_this_frame -= 4; + } + + while (n_left_this_frame >= 1) + { + u32 bi0; + vlib_buffer_t * b0; + + args[0] = bi0 = buffers[0]; + + b0 = vlib_get_buffer (vm, bi0); + b0->error = drop_error; + + buffers += 1; + args += 1; + n_left_this_frame -= 1; + } + + vlib_put_next_frame (vm, node, next_index, n_args_left); + } + + return n_buffers; +} + +/* Convenience node to drop a vector of buffers with a "misc error". */ +static uword +misc_drop_buffers (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return vlib_error_drop_buffers (vm, node, + vlib_frame_args (frame), + /* buffer stride */ 1, + frame->n_vectors, + /* next */ 0, + node->node_index, + /* error */ 0); +} + +static char * misc_drop_buffers_error_strings[] = { + [0] = "misc. 
errors",
};

VLIB_REGISTER_NODE (misc_drop_buffers_node,static) = {
  .function = misc_drop_buffers,
  .name = "misc-drop-buffers",
  .vector_size = sizeof (u32),
  .n_errors = 1,
  .n_next_nodes = 1,
  .next_nodes = {
    "error-drop",
  },
  .error_strings = misc_drop_buffers_error_strings,
};

/* Reserves given number of error codes for given node.
   Copies the node's error strings into the shared error-strings heap and
   sizes the counter and elog-event-type vectors to match; safe to call
   again for a node (re-registration frees the old heap slot and preserves
   counter values cleared-state semantics below). */
void vlib_register_errors (vlib_main_t * vm,
                           u32 node_index,
                           u32 n_errors,
                           char * error_strings[])
{
  vlib_error_main_t * em = &vm->error_main;
  vlib_node_t * n = vlib_get_node (vm, node_index);
  uword l;

  /* Free up any previous error strings. */
  if (n->n_errors > 0)
    heap_dealloc (em->error_strings_heap, n->error_heap_handle);

  n->n_errors = n_errors;
  n->error_strings = error_strings;

  if (n_errors == 0)
    return;

  n->error_heap_index =
    heap_alloc (em->error_strings_heap, n_errors,
                n->error_heap_handle);

  l = vec_len (em->error_strings_heap);

  memcpy (vec_elt_at_index (em->error_strings_heap, n->error_heap_index),
          error_strings,
          n_errors * sizeof (error_strings[0]));

  /* Allocate a counter/elog type for each error. */
  vec_validate (em->counters, l - 1);
  vec_validate (vm->error_elog_event_types, l - 1);

  /* Zero counters for re-registrations of errors. */
  if (n->error_heap_index + n_errors <= vec_len (em->counters_last_clear))
    /* Re-registration: make the visible count (counters - last_clear)
       start from zero by copying the clear baseline. */
    memcpy (em->counters + n->error_heap_index,
            em->counters_last_clear + n->error_heap_index,
            n_errors * sizeof (em->counters[0]));
  else
    memset (em->counters + n->error_heap_index,
            0,
            n_errors * sizeof (em->counters[0]));

  {
    /* Build an elog event type "<node> <error>: %d" per error code. */
    elog_event_type_t t;
    uword i;

    memset (&t, 0, sizeof (t));
    for (i = 0; i < n_errors; i++)
      {
        t.format = (char *) format (0, "%v %s: %%d",
                                    n->name,
                                    error_strings[i]);
        vm->error_elog_event_types[n->error_heap_index + i] = t;
      }
  }
}

/* CLI: print non-zero error counters (relative to the last clear) for
   every node. */
static clib_error_t *
show_errors (vlib_main_t * vm,
             unformat_input_t * input,
             vlib_cli_command_t * cmd)
{
  vlib_error_main_t * em = &vm->error_main;
  vlib_node_t * n;
  u32 code, i, ni;
  u64 c;

  vlib_cli_output (vm, "%=16s%=40s%=20s", "Count", "Node", "Reason");

  for (ni = 0; ni < vec_len (vm->node_main.nodes); ni++)
    {
      n = vlib_get_node (vm, ni);
      for (code = 0; code < n->n_errors; code++)
        {
          i = n->error_heap_index + code;
          c = em->counters[i];
          if (i < vec_len (em->counters_last_clear))
            c -= em->counters_last_clear[i];

          if (c == 0)
            continue;

          vlib_cli_output (vm, "%16Ld%=40v%s", c, n->name, em->error_strings_heap[i]);
        }
    }

  return 0;
}

VLIB_CLI_COMMAND (cli_show_errors, static) = {
  .path = "show errors",
  .short_help = "Show error counts",
  .function = show_errors,
};

VLIB_CLI_COMMAND (cli_show_node_counters, static) = {
  .path = "show node counters",
  .short_help = "Show node counters",
  .function = show_errors,
};

/* CLI: snapshot current counters as the new "zero" baseline (counters are
   never reset in place; show subtracts this snapshot). */
static clib_error_t *
clear_error_counters (vlib_main_t * vm,
                      unformat_input_t * input,
                      vlib_cli_command_t * cmd)
{
  vlib_error_main_t * em = &vm->error_main;
  u32 i;

  vec_validate (em->counters_last_clear, vec_len (em->counters) - 1);
  for (i = 0; i < vec_len (em->counters); i++)
    em->counters_last_clear[i] = em->counters[i];
  return 0;
}

VLIB_CLI_COMMAND (cli_clear_error_counters, static) = {
  .path = "clear 
errors",
  .short_help = "Clear error counters",
  .function = clear_error_counters,
};

VLIB_CLI_COMMAND (cli_clear_node_counters, static) = {
  .path = "clear node counters",
  .short_help = "Clear node counters",
  .function = clear_error_counters,
};
diff --git a/vlib/vlib/error.h b/vlib/vlib/error.h
new file mode 100644
index 00000000000..4bf0b926718
--- /dev/null
+++ b/vlib/vlib/error.h
@@ -0,0 +1,89 @@
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * error.h: drop/punt error packets
 *
 * Copyright (c) 2008 Eliot Dresselhaus
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#ifndef included_vlib_error_h
#define included_vlib_error_h

/* Combined 20 bit node index & 12 bit error code as a 32 bit number
   (node in bits 31..12, code in bits 11..0 — see the shifts/masks and
   ASSERTs below). */
typedef u32 vlib_error_t;

always_inline u32
vlib_error_get_node (vlib_error_t e)
{ return e >> 12; }

always_inline u32
vlib_error_get_code (vlib_error_t e)
{ return e & 0xfff; }

always_inline vlib_error_t
vlib_error_set (u32 node_index, u32 code)
{
  ASSERT (node_index < (1 << 20));
  ASSERT (code < (1 << 12));
  return (node_index << 12) | code;
}

/* Set the code field of an error whose code is currently zero. */
always_inline vlib_error_t
vlib_error_set_code (vlib_error_t e, u32 code)
{
  ASSERT (vlib_error_get_code (e) == 0);
  ASSERT (code < (1 << 12));
  e |= code;
  return e;
}

typedef struct {
  /* Error counters. */
  u64 * counters;

  /* Counter values as of last counter clear. */
  u64 * counters_last_clear;

  /* Error name strings in heap.  Heap index
     indexes counter vector. */
  char ** error_strings_heap;
} vlib_error_main_t;

/* Per node error registration. */
void vlib_register_errors (struct vlib_main_t * vm,
                           u32 node_index,
                           u32 n_errors,
                           char * error_strings[]);

#endif /* included_vlib_error_h */
diff --git a/vlib/vlib/error_funcs.h b/vlib/vlib/error_funcs.h
new file mode 100644
index 00000000000..acdd5d2d898
--- /dev/null
+++ b/vlib/vlib/error_funcs.h
@@ -0,0 +1,80 @@
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * error_funcs.h: VLIB error handling + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef included_vlib_error_funcs_h +#define included_vlib_error_funcs_h + +#include <vlib/node_funcs.h> + +always_inline void +vlib_error_elog_count (vlib_main_t * vm, uword counter, uword increment) +{ + elog_main_t * em = &vm->elog_main; + if (VLIB_ELOG_MAIN_LOOP > 0 && increment > 0) + elog (em, vec_elt_at_index (vm->error_elog_event_types, counter), increment); +} + +always_inline void +vlib_error_count (vlib_main_t * vm, uword node_index, + uword counter, uword increment) +{ + vlib_node_t * n = vlib_get_node (vm, node_index); + vlib_error_main_t * em = &vm->error_main; + + ASSERT (counter < n->n_errors); + counter += n->error_heap_index; + + ASSERT (counter < vec_len (em->counters)); + em->counters[counter] += increment; + + vlib_error_elog_count (vm, counter, increment); +} + +/* Drop all buffers in frame with given error code. */ +uword +vlib_error_drop_buffers (vlib_main_t * vm, + vlib_node_runtime_t * node, + u32 * buffers, + u32 next_buffer_stride, + u32 n_buffers, + u32 error_next_index, + u32 error_node, + u32 error_code); + +#endif /* included_vlib_error_funcs_h */ diff --git a/vlib/vlib/format.c b/vlib/vlib/format.c new file mode 100644 index 00000000000..3c77d8dbd18 --- /dev/null +++ b/vlib/vlib/format.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * format.c: generic network formatting/unformating + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> + +u8 * format_vlib_rx_tx (u8 * s, va_list * args) +{ + vlib_rx_or_tx_t r = va_arg (*args, vlib_rx_or_tx_t); + char * t; + + switch (r) + { + case VLIB_RX: t = "rx"; break; + case VLIB_TX: t = "tx"; break; + default: t = "INVALID"; break; + } + + vec_add (s, t, strlen (t)); + return s; +} + +u8 * format_vlib_read_write (u8 * s, va_list * args) +{ + vlib_rx_or_tx_t r = va_arg (*args, vlib_rx_or_tx_t); + char * t; + + switch (r) + { + case VLIB_READ: t = "read"; break; + case VLIB_WRITE: t = "write"; break; + default: t = "INVALID"; break; + } + + vec_add (s, t, strlen (t)); + return s; +} + +/* Formats buffer data as printable ascii or as hex. 
*/ +u8 * format_vlib_buffer_data (u8 * s, va_list * args) +{ + u8 * data = va_arg (*args, u8 *); + u32 n_data_bytes = va_arg (*args, u32); + u32 i, is_printable; + + is_printable = 1; + for (i = 0; i < n_data_bytes && is_printable; i++) + { + u8 c = data[i]; + if (c < 0x20) + is_printable = 0; + else if (c >= 0x7f) + is_printable = 0; + } + + if (is_printable) + vec_add (s, data, n_data_bytes); + else + s = format (s, "%U", format_hex_bytes, data, n_data_bytes); + + return s; +} + +/* Enable/on => 1; disable/off => 0. */ +uword unformat_vlib_enable_disable (unformat_input_t * input, va_list * args) +{ + int * result = va_arg (*args, int *); + int enable; + + if (unformat (input, "enable") || unformat (input, "on")) + enable = 1; + else if (unformat (input, "disable") || unformat (input, "off")) + enable = 0; + else + return 0; + + *result = enable; + return 1; +} + +/* rx/tx => VLIB_RX/VLIB_TX. */ +uword unformat_vlib_rx_tx (unformat_input_t * input, va_list * args) +{ + int * result = va_arg (*args, int *); + if (unformat (input, "rx")) + *result = VLIB_RX; + else if (unformat (input, "tx")) + *result = VLIB_TX; + else + return 0; + return 1; +} + +/* Parse an int either %d or 0x%x. */ +uword unformat_vlib_number (unformat_input_t * input, va_list * args) +{ + int * result = va_arg (*args, int *); + + return (unformat (input, "0x%x", result) + || unformat (input, "%d", result)); +} + +/* Parse a-zA-Z0-9_ token and hash to value. */ +uword unformat_vlib_number_by_name (unformat_input_t * input, va_list * args) +{ + uword * hash = va_arg (*args, uword *); + int * result = va_arg (*args, int *); + uword * p; + u8 * token; + int i; + + if (! unformat_user (input, unformat_token, "a-zA-Z0-9_", &token)) + return 0; + + /* Null terminate. */ + if (vec_len (token) > 0 && + token[vec_len (token) - 1] != 0) + vec_add1 (token, 0); + + /* Check for exact match. */ + p = hash_get_mem (hash, token); + if (p) + goto done; + + /* Convert to upper case & try match. 
*/ + for (i = 0; i < vec_len (token); i++) + if (token[i] >= 'a' && token[i] <= 'z') + token[i] = 'A' + token[i] - 'a'; + p = hash_get_mem (hash, token); + + done: + vec_free (token); + if (p) + *result = p[0]; + return p != 0; +} diff --git a/vlib/vlib/format_funcs.h b/vlib/vlib/format_funcs.h new file mode 100644 index 00000000000..02d8a555d78 --- /dev/null +++ b/vlib/vlib/format_funcs.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * format_funcs.h: VLIB formatting/unformating + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_format_h +#define included_vlib_format_h + +/* Format vlib_rx_or_tx_t/vlib_read_or_write_t enum as string. */ +u8 * format_vlib_rx_tx (u8 * s, va_list * args); +u8 * format_vlib_read_write (u8 * s, va_list * args); + +/* Formats buffer data as printable ascii or as hex. */ +u8 * format_vlib_buffer_data (u8 * s, va_list * args); + +/* Enable/on => 1; disable/off => 0. */ +uword unformat_vlib_enable_disable (unformat_input_t * input, va_list * args); + +/* rx/tx => VLIB_RX/VLIB_TX. */ +uword unformat_vlib_rx_tx (unformat_input_t * input, va_list * args); + +/* Parse a-zA-Z0-9_ token and hash to value. */ +uword unformat_vlib_number_by_name (unformat_input_t * input, va_list * args); + +/* Parse an int either %d or 0x%x. */ +uword unformat_vlib_number (unformat_input_t * input, va_list * args); + +/* Flag to format_vlib_*_header functions to tell them not to recurse + into the next layer's header. For example, tells format_vlib_ethernet_header + not to format ip header. */ +#define FORMAT_VLIB_HEADER_NO_RECURSION (~0) + +#endif /* included_vlib_format_h */ diff --git a/vlib/vlib/global_funcs.h b/vlib/vlib/global_funcs.h new file mode 100644 index 00000000000..406ce7d71b6 --- /dev/null +++ b/vlib/vlib/global_funcs.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * global_funcs.h: global data structure access functions + */ + +#ifndef included_vlib_global_funcs_h_ +#define included_vlib_global_funcs_h_ + +always_inline vlib_main_t * +vlib_get_main (void) +{ + vlib_main_t * vm; + vm = vlib_mains ? vlib_mains[os_get_cpu_number()] : &vlib_global_main; + ASSERT(vm); + return vm; +} + +always_inline vlib_thread_main_t * +vlib_get_thread_main() +{ + return &vlib_thread_main; +} + +#endif /* included_vlib_global_funcs_h_ */ diff --git a/vlib/vlib/init.c b/vlib/vlib/init.c new file mode 100644 index 00000000000..3991c800147 --- /dev/null +++ b/vlib/vlib/init.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * init.c: mechanism for functions to be called at init/exit. 
+ * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> + +clib_error_t * +vlib_call_init_exit_functions (vlib_main_t * vm, + _vlib_init_function_list_elt_t *head, + int call_once) +{ + clib_error_t * error = 0; + _vlib_init_function_list_elt_t * i; + + i = head; + while (i) + { + if (call_once && !hash_get (vm->init_functions_called, i->f)) + { + if (call_once) + hash_set1 (vm->init_functions_called, i->f); + error = i->f (vm); + if (error) + return error; + } + i = i->next_init_function; + } + return error; +} + +clib_error_t * vlib_call_all_init_functions (vlib_main_t * vm) +{ + /* Call dummy functions to make sure purely static modules are + linked in. 
*/ +#define _(f) vlib_##f##_reference (); + foreach_vlib_module_reference; +#undef _ + + return vlib_call_init_exit_functions + (vm, vm->init_function_registrations, 1 /* call_once */); +} + +clib_error_t * vlib_call_all_main_loop_enter_functions (vlib_main_t * vm) +{ + return vlib_call_init_exit_functions + (vm, vm->main_loop_enter_function_registrations, 1 /* call_once */); +} + +clib_error_t * vlib_call_all_main_loop_exit_functions (vlib_main_t * vm) +{ + return vlib_call_init_exit_functions + (vm, vm->main_loop_exit_function_registrations, 1 /* call_once */); +} + +clib_error_t * vlib_call_all_config_functions (vlib_main_t * vm, + unformat_input_t * input, + int is_early) +{ + clib_error_t * error = 0; + vlib_config_function_runtime_t * c, ** all; + uword * hash = 0, * p; + uword i; + + hash = hash_create_string (0, sizeof (uword)); + all = 0; + + c = vm->config_function_registrations; + + while (c) + { + hash_set_mem (hash, c->name, vec_len (all)); + vec_add1 (all, c); + unformat_init (&c->input, 0, 0); + c = c->next_registration; + } + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + u8 * s, * v; + + if (! unformat (input, "%s %v", &s, &v) + || ! (p = hash_get_mem (hash, s))) + { + error = clib_error_create ("unknown input `%s %v'", s, v); + goto done; + } + + c = all[p[0]]; + if (vec_len (c->input.buffer) > 0) + vec_add1 (c->input.buffer, ' '); + vec_add (c->input.buffer, v, vec_len (v)); + vec_free (v); + vec_free (s); + } + + for (i = 0; i < vec_len (all); i++) + { + c = all[i]; + + /* Is this an early config? Are we doing early configs? */ + if (is_early ^ c->is_early) + continue; + + /* Already called? 
*/ + if (hash_get (vm->init_functions_called, c->function)) + continue; + hash_set1 (vm->init_functions_called, c->function); + + error = c->function (vm, &c->input); + if (error) + goto done; + } + + done: + for (i = 0; i < vec_len (all); i++) + { + c = all[i]; + unformat_free (&c->input); + } + vec_free (all); + hash_free (hash); + return error; +} diff --git a/vlib/vlib/init.h b/vlib/vlib/init.h new file mode 100644 index 00000000000..9d940d0745f --- /dev/null +++ b/vlib/vlib/init.h @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * init.h: mechanism for functions to be called at init/exit. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_init_h +#define included_vlib_init_h + +#include <vppinfra/error.h> +#include <vppinfra/format.h> +#include <vppinfra/hash.h> + +/* Init/exit functions: called at start/end of main routine. Init + functions are typically used to register and setup packet + processing nodes. */ + +typedef clib_error_t * (vlib_init_function_t) (struct vlib_main_t * vm); + +typedef struct _vlib_init_function_list_elt { + struct _vlib_init_function_list_elt * next_init_function; + vlib_init_function_t * f; +} _vlib_init_function_list_elt_t; + +/* Configuration functions: called with configuration input just before + main polling loop starts. */ +typedef clib_error_t * (vlib_config_function_t) (struct vlib_main_t * vm, + unformat_input_t * input); + +typedef struct vlib_config_function_runtime_t { + /* Function to call. Set to null once function has already been called. */ + vlib_config_function_t * function; + + /* Input for function. */ + unformat_input_t input; + + /* next config function registration */ + struct vlib_config_function_runtime_t * next_registration; + + /* To be invoked as soon as the clib heap is available */ + u8 is_early; + + /* Name used to distinguish input on command line. 
*/ + char name[32]; +} vlib_config_function_runtime_t; + +#define _VLIB_INIT_FUNCTION_SYMBOL(x, type) \ + _vlib_##type##_function_##x + +#define VLIB_INIT_FUNCTION_SYMBOL(x) \ + _VLIB_INIT_FUNCTION_SYMBOL(x, init) +#define VLIB_MAIN_LOOP_ENTER_FUNCTION_SYMBOL(x) \ + _VLIB_INIT_FUNCTION_SYMBOL(x, main_loop_enter) +#define VLIB_MAIN_LOOP_EXIT_FUNCTION_SYMBOL(x) \ + _VLIB_INIT_FUNCTION_SYMBOL(x, main_loop_exit) +#define VLIB_CONFIG_FUNCTION_SYMBOL(x) \ + _VLIB_INIT_FUNCTION_SYMBOL(x, config) + +/* Declaration is global (e.g. not static) so that init functions can + be called from other modules to resolve init function depend. */ + +#define VLIB_DECLARE_INIT_FUNCTION(x, tag) \ +vlib_init_function_t * _VLIB_INIT_FUNCTION_SYMBOL (x, tag) = x; \ +static void __vlib_add_##tag##_function_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_add_##tag##_function_##x (void) \ +{ \ + vlib_main_t * vm = vlib_get_main(); \ + static _vlib_init_function_list_elt_t _vlib_init_function; \ + _vlib_init_function.next_init_function \ + = vm->tag##_function_registrations; \ + vm->tag##_function_registrations = &_vlib_init_function; \ + _vlib_init_function.f = &x; \ +} + +#define VLIB_INIT_FUNCTION(x) VLIB_DECLARE_INIT_FUNCTION(x,init) + +#define VLIB_MAIN_LOOP_ENTER_FUNCTION(x) \ + VLIB_DECLARE_INIT_FUNCTION(x,main_loop_enter) +#define VLIB_MAIN_LOOP_EXIT_FUNCTION(x) \ +VLIB_DECLARE_INIT_FUNCTION(x,main_loop_exit) + +#define VLIB_CONFIG_FUNCTION(x,n,...) 
\ + __VA_ARGS__ vlib_config_function_runtime_t \ + VLIB_CONFIG_FUNCTION_SYMBOL(x); \ +static void __vlib_add_config_function_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_add_config_function_##x (void) \ +{ \ + vlib_main_t * vm = vlib_get_main(); \ + VLIB_CONFIG_FUNCTION_SYMBOL(x).next_registration \ + = vm->config_function_registrations; \ + vm->config_function_registrations \ + = &VLIB_CONFIG_FUNCTION_SYMBOL(x); \ +} \ + vlib_config_function_runtime_t \ + VLIB_CONFIG_FUNCTION_SYMBOL (x) \ + = { \ + .name = n, \ + .function = x, \ + .is_early = 0, \ + } + +#define VLIB_EARLY_CONFIG_FUNCTION(x,n,...) \ + __VA_ARGS__ vlib_config_function_runtime_t \ + VLIB_CONFIG_FUNCTION_SYMBOL(x); \ +static void __vlib_add_config_function_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_add_config_function_##x (void) \ +{ \ + vlib_main_t * vm = vlib_get_main(); \ + VLIB_CONFIG_FUNCTION_SYMBOL(x).next_registration \ + = vm->config_function_registrations; \ + vm->config_function_registrations \ + = &VLIB_CONFIG_FUNCTION_SYMBOL(x); \ +} \ + vlib_config_function_runtime_t \ + VLIB_CONFIG_FUNCTION_SYMBOL (x) \ + = { \ + .name = n, \ + .function = x, \ + .is_early = 1, \ + } + +/* Call given init function: used for init function dependencies. */ +#define vlib_call_init_function(vm, x) \ + ({ \ + extern vlib_init_function_t * VLIB_INIT_FUNCTION_SYMBOL (x); \ + vlib_init_function_t * _f = VLIB_INIT_FUNCTION_SYMBOL (x); \ + clib_error_t * _error = 0; \ + if (! hash_get (vm->init_functions_called, _f)) \ + { \ + hash_set1 (vm->init_functions_called, _f); \ + _error = _f (vm); \ + } \ + _error; \ + }) + +#define vlib_call_post_graph_init_function(vm, x) \ + ({ \ + extern vlib_init_function_t * VLIB_POST_GRAPH_INIT_FUNCTION_SYMBOL (x); \ + vlib_init_function_t * _f = VLIB_POST_GRAPH_INIT_FUNCTION_SYMBOL (x); \ + clib_error_t * _error = 0; \ + if (! 
hash_get (vm->init_functions_called, _f)) \ + { \ + hash_set1 (vm->init_functions_called, _f); \ + _error = _f (vm); \ + } \ + _error; \ + }) + +#define vlib_call_config_function(vm, x) \ + ({ \ + vlib_config_function_runtime_t * _r; \ + clib_error_t * _error = 0; \ + extern vlib_config_function_runtime_t \ + VLIB_CONFIG_FUNCTION_SYMBOL (x); \ + \ + _r = &VLIB_CONFIG_FUNCTION_SYMBOL (x); \ + if (! hash_get (vm->init_functions_called, _r->function)) \ + { \ + hash_set1 (vm->init_functions_called, _r->function); \ + _error = _r->function (vm, &_r->input); \ + } \ + _error; \ + }) + +/* External functions. */ +clib_error_t * vlib_call_all_init_functions (struct vlib_main_t * vm); +clib_error_t * vlib_call_all_config_functions (struct vlib_main_t * vm, + unformat_input_t * input, + int is_early); +clib_error_t * vlib_call_all_main_loop_enter_functions (struct vlib_main_t * vm); +clib_error_t * vlib_call_all_main_loop_exit_functions (struct vlib_main_t * vm); +clib_error_t * +vlib_call_init_exit_functions (struct vlib_main_t * vm, + _vlib_init_function_list_elt_t *head, + int call_once); + +#define foreach_vlib_module_reference \ + _ (node_cli) \ + _ (trace_cli) + +/* Dummy function to get node_cli.c linked in. */ +#define _(x) void vlib_##x##_reference (void); +foreach_vlib_module_reference +#undef _ + +#endif /* included_vlib_init_h */ diff --git a/vlib/vlib/lex.c b/vlib/vlib/lex.c new file mode 100644 index 00000000000..de650900c11 --- /dev/null +++ b/vlib/vlib/lex.c @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vlib/vlib.h> +#include <vlib/lex.h> + +vlib_lex_main_t vlib_lex_main; + +#define LEX_DEBUG 0 + +u8 * format_vlib_lex_token (u8 * s, va_list * args) +{ + vlib_lex_main_t *lm = va_arg (*args, vlib_lex_main_t *); + vlib_lex_token_t *t = va_arg (*args, vlib_lex_token_t *); + + if (t->token == VLIB_LEX_word) + s = format (s, "%s", t->value.as_pointer); + else + s = format (s, "%s", lm->lex_token_names[t->token]); + return s; +} + +void vlib_lex_get_token (vlib_lex_main_t * lm, vlib_lex_token_t * rv) +{ + u8 c; + vlib_lex_table_t *t; + vlib_lex_table_entry_t *e; + uword tv; + + if (PREDICT_FALSE (lm->pushback_sp >= 0)) + { + rv[0] = lm->pushback_vector [lm->pushback_sp--]; + return; + } + + rv->value.as_uword = ~0; + + while (1) + { + if (PREDICT_FALSE(lm->current_index >= vec_len (lm->input_vector))) + { + rv->token = VLIB_LEX_eof; + return; + } + + t = vec_elt_at_index (lm->lex_tables, lm->current_table_index); + c = (lm->input_vector [lm->current_index++]) & 0x7f; + e = &t->entries [c]; + lm->current_table_index = e->next_table_index; + + switch (e->action) + { + case VLIB_LEX_IGNORE: + continue; + + case VLIB_LEX_START_NUMBER: + lm->current_token_value = 0; + /* fallthru */ + + case VLIB_LEX_ADD_TO_NUMBER: + lm->current_number_base = e->token; + lm->current_token_value *= lm->current_number_base; + tv = c - '0'; + if (tv >= lm->current_number_base) + { + tv = 10 + c - 'A'; + if (tv >= lm->current_number_base) + tv = 10 + c - 'a'; + } + lm->current_token_value += tv; + continue; + + case VLIB_LEX_ADD_TO_TOKEN: + 
vec_add1(lm->token_buffer, c); + continue; + + case VLIB_LEX_KEYWORD_CHECK: { + uword * p; + + vec_add1 (lm->token_buffer, 0); + + /* It's either a keyword or just a word. */ + p = hash_get_mem (lm->lex_keywords, lm->token_buffer); + if (p) + { + rv->token = p[0]; + if (LEX_DEBUG > 0) + clib_warning ("keyword '%s' token %s", + lm->token_buffer, + lm->lex_token_names[rv->token]); + } + else + { + /* it's a WORD */ + rv->token = VLIB_LEX_word; + rv->value.as_pointer = vec_dup (lm->token_buffer); + if (LEX_DEBUG > 0) + clib_warning ("%s, value '%s'", + lm->lex_token_names[VLIB_LEX_word], + rv->value.as_pointer); + } + _vec_len (lm->token_buffer) = 0; + + /* Rescan the character which terminated the keyword/word. */ + lm->current_index--; + return; + } + + case VLIB_LEX_RETURN_AND_RESCAN: + ASSERT(lm->current_index); + lm->current_index--; + /* note flow-through */ + + case VLIB_LEX_RETURN: + rv->token = e->token; + rv->value.as_uword = lm->current_token_value; + lm->current_token_value = ~0; + if (LEX_DEBUG > 0) + { + clib_warning ("table %s char '%c'(0x%02x) next table %s return %s", + t->name, c, c, lm->lex_tables[e->next_table_index].name, + lm->lex_token_names[e->token]); + if (rv->token == VLIB_LEX_number) + clib_warning (" numeric value 0x%x (%d)", rv->value, + rv->value); + } + return; + } + } +} + +u16 vlib_lex_add_token (vlib_lex_main_t *lm, char *token_name) +{ + uword *p; + u16 rv; + + p = hash_get_mem (lm->lex_tokens_by_name, token_name); + + if (p) + return p[0]; + + rv = vec_len (lm->lex_token_names); + hash_set_mem (lm->lex_tokens_by_name, token_name, rv); + vec_add1 (lm->lex_token_names, token_name); + + return rv; +} + +static u16 add_keyword (vlib_lex_main_t *lm, char *keyword, char *token_name) +{ + uword *p; + u16 token; + + p = hash_get_mem (lm->lex_keywords, keyword); + + ASSERT (p == 0); + + token = vlib_lex_add_token (lm, token_name); + + hash_set_mem (lm->lex_keywords, keyword, token); + return token; +} + +u16 vlib_lex_find_or_add_keyword 
(vlib_lex_main_t *lm, char *keyword, char *token_name) +{ + uword * p = hash_get_mem (lm->lex_keywords, keyword); + return p ? p[0] : add_keyword (lm, keyword, token_name); +} + +void vlib_lex_set_action_range (u32 table_index, u8 lo, u8 hi, u16 action, + u16 token, u32 next_table_index) +{ + int i; + vlib_lex_main_t *lm = &vlib_lex_main; + vlib_lex_table_t *t = pool_elt_at_index (lm->lex_tables, table_index); + + for (i = lo; i <= hi; i++) + { + ASSERT (i < ARRAY_LEN (t->entries)); + t->entries[i].action = action; + t->entries[i].token = token; + t->entries[i].next_table_index = next_table_index; + } +} + +u16 vlib_lex_add_table (char *name) +{ + vlib_lex_main_t *lm = &vlib_lex_main; + vlib_lex_table_t *t; + uword *p; + + p = hash_get_mem (lm->lex_tables_by_name, name); + + ASSERT(p == 0); + + pool_get_aligned (lm->lex_tables, t, CLIB_CACHE_LINE_BYTES); + + t->name = name; + + hash_set_mem (lm->lex_tables_by_name, name, t - lm->lex_tables); + + vlib_lex_set_action_range (t - lm->lex_tables, 1, 0x7F, VLIB_LEX_IGNORE, ~0, + t - lm->lex_tables); + + vlib_lex_set_action_range (t - lm->lex_tables, 0, 0, VLIB_LEX_RETURN, VLIB_LEX_eof, + t - lm->lex_tables); + + return t - lm->lex_tables; +} + +void vlib_lex_reset (vlib_lex_main_t *lm, u8 *input_vector) +{ + if (lm->pushback_vector) + _vec_len (lm->pushback_vector) = 0; + lm->pushback_sp = -1; + + lm->input_vector = input_vector; + lm->current_index = 0; +} + +static clib_error_t * lex_onetime_init (vlib_main_t * vm) +{ + vlib_lex_main_t *lm = &vlib_lex_main; + + lm->lex_tables_by_name = hash_create_string (0, sizeof (uword)); + lm->lex_tokens_by_name = hash_create_string (0, sizeof (uword)); + lm->lex_keywords = hash_create_string (0, sizeof (uword)); + lm->pushback_sp = -1; + +#define _(f) { u16 tmp = vlib_lex_add_token (lm, #f); ASSERT (tmp == VLIB_LEX_##f); } + foreach_vlib_lex_global_token; +#undef _ + + vec_validate (lm->token_buffer, 127); + _vec_len (lm->token_buffer) = 0; + + return 0; +} + +VLIB_INIT_FUNCTION 
(lex_onetime_init);
diff --git a/vlib/vlib/lex.h b/vlib/vlib/lex.h
new file mode 100644
index 00000000000..d5ea509915c
--- /dev/null
+++ b/vlib/vlib/lex.h
@@ -0,0 +1,130 @@
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef included_vlib_lex_h
#define included_vlib_lex_h

#include <vppinfra/hash.h>
#include <vppinfra/bitmap.h>
#include <vppinfra/error.h>
#include <vppinfra/pool.h>

/* Token kinds shared by every lexer built on this module. */
#define foreach_vlib_lex_global_token		\
  _ (invalid)					\
  _ (eof)					\
  _ (word)					\
  _ (number)					\
  _ (lt)					\
  _ (gt)					\
  _ (dot)					\
  _ (slash)					\
  _ (qmark)					\
  _ (equals)					\
  _ (plus)					\
  _ (minus)					\
  _ (star)					\
  _ (lpar)					\
  _ (rpar)

typedef enum {
#define _(f) VLIB_LEX_##f,
  foreach_vlib_lex_global_token
#undef _
} vlib_lex_global_token_t;

/* Per-character action selected by the current state table. */
typedef enum {
  VLIB_LEX_IGNORE,
  VLIB_LEX_ADD_TO_TOKEN,
  VLIB_LEX_RETURN,
  VLIB_LEX_RETURN_AND_RESCAN,
  VLIB_LEX_KEYWORD_CHECK,
  VLIB_LEX_START_NUMBER,
  VLIB_LEX_ADD_TO_NUMBER,
} vlib_lex_action_t;

/* One table slot: what to do for a given input character. */
typedef struct {
  u16 action;            /* vlib_lex_action_t */
  u16 next_table_index;  /* state table to switch to */
  u16 token;             /* token to emit, for RETURN-style actions */
} vlib_lex_table_entry_t;

typedef struct {
  char *name;
  /* One entry per 7-bit ASCII input character. */
  vlib_lex_table_entry_t entries [128];
} vlib_lex_table_t;

typedef struct {
  u32 token;

  union {
    uword as_uword;
    void * as_pointer;   /* for VLIB_LEX_word: heap vec, see cleanup below */
    char * as_string;
  } value;
} vlib_lex_token_t;

typedef struct {
  /* Pool of state tables; index doubles as table id. */
  vlib_lex_table_t * lex_tables;
  uword * lex_tables_by_name;

  /* Vector of token strings. */
  char ** lex_token_names;

  /* Hash mapping c string name to token index. */
  uword * lex_tokens_by_name;

  /* Hash mapping c string keyword name to token index. */
  uword * lex_keywords;

  /* Stack of pushed-back (un-got) tokens. */
  vlib_lex_token_t * pushback_vector;

  /* Top-of-stack index for pushback_vector; -1 when empty. */
  i32 pushback_sp;

  u32 current_table_index;

  uword current_token_value;

  uword current_number_base;

  /* Input string we are lex-ing. */
  u8 *input_vector;

  /* Current index into input vector. */
  u32 current_index;

  /* Re-used vector for forming token strings and hashing them. */
  u8 * token_buffer;
} vlib_lex_main_t;

/* NOTE(review): tentative definition in a header — every including TU
   defines vlib_lex_main and correctness relies on common-symbol merging
   (-fcommon). Consider `extern` here plus one definition in lex.c. */
vlib_lex_main_t vlib_lex_main;

/* Release a WORD token's heap-allocated string payload; no-op for
   every other token kind. */
always_inline void
vlib_lex_cleanup_token (vlib_lex_token_t * t)
{
  if (t->token == VLIB_LEX_word)
    {
      u8 * tv = t->value.as_pointer;
      vec_free (tv);
    }
}

u16 vlib_lex_add_table (char *name);
void vlib_lex_get_token (vlib_lex_main_t *lm, vlib_lex_token_t * result);
u16 vlib_lex_add_token (vlib_lex_main_t *lm, char *token_name);
void vlib_lex_set_action_range (u32 table_index, u8 lo, u8 hi, u16 action,
				u16 token, u32 next_table_index);
void vlib_lex_reset (vlib_lex_main_t *lm, u8 *input_vector);
format_function_t format_vlib_lex_token;

#endif /* included_vlib_lex_h */
diff --git a/vlib/vlib/main.c b/vlib/vlib/main.c
new file mode 100644
index 00000000000..64bd3c02b60
--- /dev/null
+++ b/vlib/vlib/main.c
@@ -0,0 +1,1559 @@
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ +/* + * main.c: main vector processing loop + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <math.h> +#include <vppinfra/format.h> +#include <vlib/vlib.h> +#include <vlib/threads.h> + +#include <vlib/unix/cj.h> + +CJ_GLOBAL_LOG_PROTOTYPE; + + +//#define VLIB_ELOG_MAIN_LOOP 1 + +/* Actually allocate a few extra slots of vector data to support + speculative vector enqueues which overflow vector data in next frame. */ +#define VLIB_FRAME_SIZE_ALLOC (VLIB_FRAME_SIZE + 4) + +always_inline u32 +vlib_frame_bytes (u32 n_scalar_bytes, u32 n_vector_bytes) +{ + u32 n_bytes; + + /* Make room for vlib_frame_t plus scalar arguments. */ + n_bytes = vlib_frame_vector_byte_offset (n_scalar_bytes); + + /* Make room for vector arguments. + Allocate a few extra slots of vector data to support + speculative vector enqueues which overflow vector data in next frame. 
*/ +#define VLIB_FRAME_SIZE_EXTRA 4 + n_bytes += (VLIB_FRAME_SIZE + VLIB_FRAME_SIZE_EXTRA) * n_vector_bytes; + + /* Magic number is first 32bit number after vector data. + Used to make sure that vector data is never overrun. */ +#define VLIB_FRAME_MAGIC (0xabadc0ed) + n_bytes += sizeof (u32); + + /* Pad to cache line. */ + n_bytes = round_pow2 (n_bytes, CLIB_CACHE_LINE_BYTES); + + return n_bytes; +} + +always_inline u32 * +vlib_frame_find_magic (vlib_frame_t * f, vlib_node_t * node) +{ + void * p = f; + + p += vlib_frame_vector_byte_offset (node->scalar_size); + + p += (VLIB_FRAME_SIZE + VLIB_FRAME_SIZE_EXTRA) * node->vector_size; + + return p; +} + +static vlib_frame_size_t * +get_frame_size_info (vlib_node_main_t * nm, + u32 n_scalar_bytes, u32 n_vector_bytes) +{ + uword key = (n_scalar_bytes << 16) | n_vector_bytes; + uword * p, i; + + p = hash_get (nm->frame_size_hash, key); + if (p) + i = p[0]; + else + { + i = vec_len (nm->frame_sizes); + vec_validate (nm->frame_sizes, i); + hash_set (nm->frame_size_hash, key, i); + } + + return vec_elt_at_index (nm->frame_sizes, i); +} + +static u32 +vlib_frame_alloc_to_node (vlib_main_t * vm, u32 to_node_index, u32 frame_flags) +{ + vlib_node_main_t * nm = &vm->node_main; + vlib_frame_size_t * fs; + vlib_node_t * to_node; + vlib_frame_t * f; + u32 fi, l, n, scalar_size, vector_size; + + to_node = vlib_get_node (vm, to_node_index); + + scalar_size = to_node->scalar_size; + vector_size = to_node->vector_size; + + fs = get_frame_size_info (nm, scalar_size, vector_size); + n = vlib_frame_bytes (scalar_size, vector_size); + if ((l = vec_len (fs->free_frame_indices)) > 0) + { + /* Allocate from end of free list. */ + fi = fs->free_frame_indices[l - 1]; + f = vlib_get_frame_no_check (vm, fi); + _vec_len (fs->free_frame_indices) = l - 1; + } + else + { + f = clib_mem_alloc_aligned_no_fail (n, CLIB_CACHE_LINE_BYTES); + f->cpu_index = vm->cpu_index; + fi = vlib_frame_index_no_check (vm, f); + } + + /* Poison frame when debugging. 
*/ + if (CLIB_DEBUG > 0) + { + u32 save_cpu_index = f->cpu_index; + + memset (f, 0xfe, n); + + f->cpu_index = save_cpu_index; + } + + /* Insert magic number. */ + { + u32 * magic; + + magic = vlib_frame_find_magic (f, to_node); + *magic = VLIB_FRAME_MAGIC; + } + + f->flags = VLIB_FRAME_IS_ALLOCATED | frame_flags; + f->n_vectors = 0; + f->scalar_size = scalar_size; + f->vector_size = vector_size; + + fs->n_alloc_frames += 1; + + return fi; +} + +/* Allocate a frame for from FROM_NODE to TO_NODE via TO_NEXT_INDEX. + Returns frame index. */ +static u32 +vlib_frame_alloc (vlib_main_t * vm, vlib_node_runtime_t * from_node_runtime, u32 to_next_index) +{ + vlib_node_t * from_node; + + from_node = vlib_get_node (vm, from_node_runtime->node_index); + ASSERT (to_next_index < vec_len (from_node->next_nodes)); + + return vlib_frame_alloc_to_node (vm, + from_node->next_nodes[to_next_index], + /* frame_flags */ 0); +} + +vlib_frame_t * +vlib_get_frame_to_node (vlib_main_t * vm, u32 to_node_index) +{ + u32 fi = vlib_frame_alloc_to_node (vm, to_node_index, + /* frame_flags */ VLIB_FRAME_FREE_AFTER_DISPATCH); + return vlib_get_frame (vm, fi); +} + +void vlib_put_frame_to_node (vlib_main_t * vm, u32 to_node_index, vlib_frame_t * f) +{ + vlib_pending_frame_t * p; + vlib_node_t * to_node; + + if (f->n_vectors == 0) + return; + + to_node = vlib_get_node (vm, to_node_index); + + vec_add2 (vm->node_main.pending_frames, p, 1); + + f->flags |= VLIB_FRAME_PENDING; + p->frame_index = vlib_frame_index (vm, f); + p->node_runtime_index = to_node->runtime_index; + p->next_frame_index = VLIB_PENDING_FRAME_NO_NEXT_FRAME; +} + +/* Free given frame. 
 */
/* Return F to its size-class free list (frames are never handed back
   to the heap; they are recycled by vlib_frame_alloc_to_node). */
void
vlib_frame_free (vlib_main_t * vm,
		 vlib_node_runtime_t * r,
		 vlib_frame_t * f)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_node_t * node;
  vlib_frame_size_t * fs;
  u32 frame_index;

  ASSERT (f->flags & VLIB_FRAME_IS_ALLOCATED);

  node = vlib_get_node (vm, r->node_index);
  fs = get_frame_size_info (nm, node->scalar_size, node->vector_size);

  frame_index = vlib_frame_index (vm, f);

  ASSERT (f->flags & VLIB_FRAME_IS_ALLOCATED);

  /* No next frames may point to freed frame. */
  if (CLIB_DEBUG > 0)
    {
      vlib_next_frame_t * nf;
      vec_foreach (nf, vm->node_main.next_frames)
	ASSERT (nf->frame_index != frame_index);
    }

  f->flags &= ~VLIB_FRAME_IS_ALLOCATED;

  vec_add1 (fs->free_frame_indices, frame_index);
  ASSERT (fs->n_alloc_frames > 0);
  fs->n_alloc_frames -= 1;
}

/* CLI: per-size-class frame allocation statistics
   ("show vlib frame-allocation"). */
static clib_error_t *
show_frame_stats (vlib_main_t * vm,
		  unformat_input_t * input,
		  vlib_cli_command_t * cmd)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_frame_size_t * fs;

  vlib_cli_output (vm, "%=6s%=12s%=12s", "Size", "# Alloc", "# Free");
  vec_foreach (fs, nm->frame_sizes)
    {
      u32 n_alloc = fs->n_alloc_frames;
      u32 n_free = vec_len (fs->free_frame_indices);

      /* "Size" column is actually the size-class index. */
      if (n_alloc + n_free > 0)
	vlib_cli_output (vm, "%=6d%=12d%=12d",
			 fs - nm->frame_sizes, n_alloc, n_free);
    }

  return 0;
}

VLIB_CLI_COMMAND (show_frame_stats_cli, static) = {
  .path = "show vlib frame-allocation",
  .short_help = "Show node dispatch frame statistics",
  .function = show_frame_stats,
};

/* Change ownership of enqueue rights to given next node. */
static void
vlib_next_frame_change_ownership (vlib_main_t * vm,
				  vlib_node_runtime_t * node_runtime,
				  u32 next_index)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_next_frame_t * next_frame;
  vlib_node_t * node, * next_node;

  node = vec_elt (nm->nodes, node_runtime->node_index);

  /* Only internal & input nodes are allowed to call other nodes. */
  ASSERT (node->type == VLIB_NODE_TYPE_INTERNAL
	  || node->type == VLIB_NODE_TYPE_INPUT
	  || node->type == VLIB_NODE_TYPE_PROCESS);

  ASSERT (vec_len (node->next_nodes) == node_runtime->n_next_nodes);

  next_frame = vlib_node_runtime_get_next_frame (vm, node_runtime, next_index);
  next_node = vec_elt (nm->nodes, node->next_nodes[next_index]);

  if (next_node->owner_node_index != VLIB_INVALID_NODE_INDEX)
    {
      /* Get frame from previous owner. */
      vlib_next_frame_t * owner_next_frame;
      vlib_next_frame_t tmp;

      owner_next_frame =
	vlib_node_get_next_frame (vm,
				  next_node->owner_node_index,
				  next_node->owner_next_index);

      /* Swap target next frame with owner's. */
      tmp = owner_next_frame[0];
      owner_next_frame[0] = next_frame[0];
      next_frame[0] = tmp;

      /*
       * If next_frame is already pending, we have to track down
       * all pending frames and fix their next_frame_index fields.
       */
      if (next_frame->flags & VLIB_FRAME_PENDING)
	{
	  vlib_pending_frame_t * p;
	  if (next_frame->frame_index != ~0)
	    {
	      vec_foreach (p, nm->pending_frames)
		{
		  if (p->frame_index == next_frame->frame_index)
		    {
		      p->next_frame_index =
			next_frame - vm->node_main.next_frames;
		    }
		}
	    }
	}
    }
  else
    {
      /* No previous owner. Take ownership. */
      next_frame->flags |= VLIB_FRAME_OWNER;
    }

  /* Record new owner. */
  next_node->owner_node_index = node->index;
  next_node->owner_next_index = next_index;

  /* Now we should be owner. */
  ASSERT (next_frame->flags & VLIB_FRAME_OWNER);
}

/* Make sure that magic number is still there.
   Otherwise, it is likely that caller has overrun frame arguments.
*/ +always_inline void +validate_frame_magic (vlib_main_t * vm, + vlib_frame_t * f, + vlib_node_t * n, + uword next_index) +{ + vlib_node_t * next_node = vlib_get_node (vm, n->next_nodes[next_index]); + u32 * magic = vlib_frame_find_magic (f, next_node); + ASSERT (VLIB_FRAME_MAGIC == magic[0]); +} + +vlib_frame_t * +vlib_get_next_frame_internal (vlib_main_t * vm, + vlib_node_runtime_t * node, + u32 next_index, + u32 allocate_new_next_frame) +{ + vlib_frame_t * f; + vlib_next_frame_t * nf; + u32 n_used; + + nf = vlib_node_runtime_get_next_frame (vm, node, next_index); + + /* Make sure this next frame owns right to enqueue to destination frame. */ + if (PREDICT_FALSE (! (nf->flags & VLIB_FRAME_OWNER))) + vlib_next_frame_change_ownership (vm, node, next_index); + + /* ??? Don't need valid flag: can use frame_index == ~0 */ + if (PREDICT_FALSE (! (nf->flags & VLIB_FRAME_IS_ALLOCATED))) + { + nf->frame_index = vlib_frame_alloc (vm, node, next_index); + nf->flags |= VLIB_FRAME_IS_ALLOCATED; + } + + f = vlib_get_frame (vm, nf->frame_index); + + /* Has frame been removed from pending vector (e.g. finished dispatching)? + If so we can reuse frame. */ + if ((nf->flags & VLIB_FRAME_PENDING) && ! (f->flags & VLIB_FRAME_PENDING)) + { + nf->flags &= ~VLIB_FRAME_PENDING; + f->n_vectors = 0; + } + + /* Allocate new frame if current one is already full. */ + n_used = f->n_vectors; + if (n_used >= VLIB_FRAME_SIZE || (allocate_new_next_frame && n_used > 0)) + { + /* Old frame may need to be freed after dispatch, since we'll have + two redundant frames from node -> next node. */ + if (! (nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH)) + { + vlib_frame_t * f_old = vlib_get_frame (vm, nf->frame_index); + f_old->flags |= VLIB_FRAME_FREE_AFTER_DISPATCH; + } + + /* Allocate new frame to replace full one. */ + nf->frame_index = vlib_frame_alloc (vm, node, next_index); + f = vlib_get_frame (vm, nf->frame_index); + n_used = f->n_vectors; + } + + /* Should have free vectors in frame now. 
*/ + ASSERT (n_used < VLIB_FRAME_SIZE); + + if (CLIB_DEBUG > 0) + { + validate_frame_magic (vm, f, + vlib_get_node (vm, node->node_index), + next_index); + } + + return f; +} + +static void +vlib_put_next_frame_validate (vlib_main_t * vm, + vlib_node_runtime_t * rt, + u32 next_index, + u32 n_vectors_left) +{ + vlib_node_main_t * nm = &vm->node_main; + vlib_next_frame_t * nf; + vlib_frame_t * f; + vlib_node_runtime_t * next_rt; + vlib_node_t * next_node; + u32 n_before, n_after; + + nf = vlib_node_runtime_get_next_frame (vm, rt, next_index); + f = vlib_get_frame (vm, nf->frame_index); + + ASSERT (n_vectors_left <= VLIB_FRAME_SIZE); + n_after = VLIB_FRAME_SIZE - n_vectors_left; + n_before = f->n_vectors; + + ASSERT (n_after >= n_before); + + next_rt = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL], + nf->node_runtime_index); + next_node = vlib_get_node (vm, next_rt->node_index); + if (n_after > 0 && next_node->validate_frame) + { + u8 * msg = next_node->validate_frame (vm, rt, f); + if (msg) + { + clib_warning ("%v", msg); + ASSERT (0); + } + vec_free (msg); + } +} + +void +vlib_put_next_frame (vlib_main_t * vm, + vlib_node_runtime_t * r, + u32 next_index, + u32 n_vectors_left) +{ + vlib_node_main_t * nm = &vm->node_main; + vlib_next_frame_t * nf; + vlib_frame_t * f; + u32 n_vectors_in_frame; + + if (DPDK == 0 && CLIB_DEBUG > 0) + vlib_put_next_frame_validate (vm, r, next_index, n_vectors_left); + + nf = vlib_node_runtime_get_next_frame (vm, r, next_index); + f = vlib_get_frame (vm, nf->frame_index); + + /* Make sure that magic number is still there. Otherwise, caller + has overrun frame meta data. */ + if (CLIB_DEBUG > 0) + { + vlib_node_t * node = vlib_get_node (vm, r->node_index); + validate_frame_magic (vm, f, node, next_index); + } + + /* Convert # of vectors left -> number of vectors there. 
*/ + ASSERT (n_vectors_left <= VLIB_FRAME_SIZE); + n_vectors_in_frame = VLIB_FRAME_SIZE - n_vectors_left; + + f->n_vectors = n_vectors_in_frame; + + /* If vectors were added to frame, add to pending vector. */ + if (PREDICT_TRUE (n_vectors_in_frame > 0)) + { + vlib_pending_frame_t * p; + u32 v0, v1; + + r->cached_next_index = next_index; + + if (!(f->flags & VLIB_FRAME_PENDING)) + { + __attribute__((unused)) vlib_node_t *node; + vlib_node_t *next_node; + vlib_node_runtime_t *next_runtime; + + node = vlib_get_node (vm, r->node_index); + next_node = vlib_get_next_node (vm, r->node_index, next_index); + next_runtime = vlib_node_get_runtime (vm, next_node->index); + + vec_add2 (nm->pending_frames, p, 1); + + p->frame_index = nf->frame_index; + p->node_runtime_index = nf->node_runtime_index; + p->next_frame_index = nf - nm->next_frames; + nf->flags |= VLIB_FRAME_PENDING; + f->flags |= VLIB_FRAME_PENDING; + + /* + * If we're going to dispatch this frame on another thread, + * force allocation of a new frame. Otherwise, we create + * a dangling frame reference. Each thread has its own copy of + * the next_frames vector. + */ + if (0 && r->cpu_index != next_runtime->cpu_index) + { + nf->frame_index = ~0; + nf->flags &= ~(VLIB_FRAME_PENDING | VLIB_FRAME_IS_ALLOCATED); + } + } + + /* Copy trace flag from next_frame and from runtime. */ + nf->flags |= (nf->flags & VLIB_NODE_FLAG_TRACE) | (r->flags & VLIB_NODE_FLAG_TRACE); + + v0 = nf->vectors_since_last_overflow; + v1 = v0 + n_vectors_in_frame; + nf->vectors_since_last_overflow = v1; + if (PREDICT_FALSE (v1 < v0)) + { + vlib_node_t * node = vlib_get_node (vm, r->node_index); + vec_elt (node->n_vectors_by_next_node, next_index) += v0; + } + } +} + +/* Sync up runtime (32 bit counters) and main node stats (64 bit counters). 
*/ +never_inline void +vlib_node_runtime_sync_stats (vlib_main_t * vm, + vlib_node_runtime_t * r, + uword n_calls, + uword n_vectors, + uword n_clocks) +{ + vlib_node_t * n = vlib_get_node (vm, r->node_index); + + n->stats_total.calls += n_calls + r->calls_since_last_overflow; + n->stats_total.vectors += n_vectors + r->vectors_since_last_overflow; + n->stats_total.clocks += n_clocks + r->clocks_since_last_overflow; + n->stats_total.max_clock = r->max_clock; + n->stats_total.max_clock_n = r->max_clock_n; + + r->calls_since_last_overflow = 0; + r->vectors_since_last_overflow = 0; + r->clocks_since_last_overflow = 0; +} + +always_inline void +vlib_process_sync_stats (vlib_main_t * vm, + vlib_process_t * p, + uword n_calls, + uword n_vectors, + uword n_clocks) +{ + vlib_node_runtime_t * rt = &p->node_runtime; + vlib_node_t * n = vlib_get_node (vm, rt->node_index); + vlib_node_runtime_sync_stats (vm, rt, n_calls, n_vectors, n_clocks); + n->stats_total.suspends += p->n_suspends; + p->n_suspends = 0; +} + +void vlib_node_sync_stats (vlib_main_t * vm, vlib_node_t * n) +{ + vlib_node_runtime_t * rt; + + if (n->type == VLIB_NODE_TYPE_PROCESS) + { + /* Nothing to do for PROCESS nodes except in main thread */ + if (vm != &vlib_global_main) return; + + vlib_process_t * p = vlib_get_process_from_node (vm, n); + n->stats_total.suspends += p->n_suspends; + p->n_suspends = 0; + rt = &p->node_runtime; + } + else + rt = vec_elt_at_index (vm->node_main.nodes_by_type[n->type], n->runtime_index); + + vlib_node_runtime_sync_stats (vm, rt, 0, 0, 0); + + /* Sync up runtime next frame vector counters with main node structure. 
*/ + { + vlib_next_frame_t * nf; + uword i; + for (i = 0; i < rt->n_next_nodes; i++) + { + nf = vlib_node_runtime_get_next_frame (vm, rt, i); + vec_elt (n->n_vectors_by_next_node, i) += nf->vectors_since_last_overflow; + nf->vectors_since_last_overflow = 0; + } + } +} + +always_inline u32 +vlib_node_runtime_update_stats (vlib_main_t * vm, + vlib_node_runtime_t * node, + uword n_calls, + uword n_vectors, + uword n_clocks) +{ + u32 ca0, ca1, v0, v1, cl0, cl1, r; + + cl0 = cl1 = node->clocks_since_last_overflow; + ca0 = ca1 = node->calls_since_last_overflow; + v0 = v1 = node->vectors_since_last_overflow; + + ca1 = ca0 + n_calls; + v1 = v0 + n_vectors; + cl1 = cl0 + n_clocks; + + node->calls_since_last_overflow = ca1; + node->clocks_since_last_overflow = cl1; + node->vectors_since_last_overflow = v1; + node->max_clock_n = node->max_clock > n_clocks ? + node->max_clock_n : n_vectors; + node->max_clock = node->max_clock > n_clocks ? + node->max_clock : n_clocks; + + r = vlib_node_runtime_update_main_loop_vector_stats (vm, node, n_vectors); + + if (PREDICT_FALSE (ca1 < ca0 || v1 < v0 || cl1 < cl0)) + { + node->calls_since_last_overflow = ca0; + node->clocks_since_last_overflow = cl0; + node->vectors_since_last_overflow = v0; + vlib_node_runtime_sync_stats (vm, node, n_calls, n_vectors, n_clocks); + } + + return r; +} + +always_inline void +vlib_process_update_stats (vlib_main_t * vm, + vlib_process_t * p, + uword n_calls, + uword n_vectors, + uword n_clocks) +{ + vlib_node_runtime_update_stats (vm, &p->node_runtime, + n_calls, n_vectors, n_clocks); +} + +static clib_error_t * +vlib_cli_elog_clear (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + elog_reset_buffer (&vm->elog_main); + return 0; +} + +VLIB_CLI_COMMAND (elog_clear_cli, static) = { + .path = "clear event-logger", + .short_help = "Clear current event log", + .function = vlib_cli_elog_clear, +}; + +#ifdef CLIB_UNIX +static clib_error_t * +elog_save_buffer (vlib_main_t * vm, + 
unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + elog_main_t * em = &vm->elog_main; + char * file, * chroot_file; + clib_error_t * error = 0; + + if (! unformat (input, "%s", &file)) + { + vlib_cli_output (vm, "expected file name, got `%U'", + format_unformat_error, input); + return 0; + } + + /* It's fairly hard to get "../oopsie" through unformat; just in case */ + if (strstr(file, "..") || index(file, '/')) + { + vlib_cli_output (vm, "illegal characters in filename '%s'", file); + return 0; + } + + chroot_file = (char *) format (0, "/tmp/%s%c", file, 0); + + vec_free(file); + + vlib_cli_output (vm, "Saving %wd of %wd events to %s", + elog_n_events_in_buffer (em), + elog_buffer_capacity (em), + chroot_file); + + vlib_worker_thread_barrier_sync (vm); + error = elog_write_file (em, chroot_file); + vlib_worker_thread_barrier_release(vm); + vec_free (chroot_file); + return error; +} + +VLIB_CLI_COMMAND (elog_save_cli, static) = { + .path = "save event-logger", + .short_help = "save event-logger <filename> (saves log in /tmp/<filename>)", + .function = elog_save_buffer, +}; + +#endif /* CLIB_UNIX */ + +static void elog_show_buffer_internal (vlib_main_t * vm, u32 n_events_to_show) +{ + elog_main_t * em = &vm->elog_main; + elog_event_t * e, * es; + f64 dt; + + /* Show events in VLIB time since log clock starts after VLIB clock. 
*/ + dt = (em->init_time.cpu - vm->clib_time.init_cpu_time) + * vm->clib_time.seconds_per_clock; + + es = elog_peek_events (em); + vlib_cli_output (vm, "%d events in buffer", vec_len (es)); + vec_foreach (e, es) + { + vlib_cli_output (vm, "%18.9f: %U", + e->time + dt, + format_elog_event, em, e); + n_events_to_show--; + if (n_events_to_show == 0) + break; + } + vec_free (es); + +} + +static clib_error_t * +elog_show_buffer (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u32 n_events_to_show; + clib_error_t * error = 0; + + n_events_to_show = 250; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%d", &n_events_to_show)) + ; + else if (unformat (input, "all")) + n_events_to_show = ~0; + else + return unformat_parse_error (input); + } + elog_show_buffer_internal (vm, n_events_to_show); + return error; +} + +VLIB_CLI_COMMAND (elog_show_cli, static) = { + .path = "show event-logger", + .short_help = "Show event logger info", + .function = elog_show_buffer, +}; + +void vlib_gdb_show_event_log (void) +{ + elog_show_buffer_internal (vlib_get_main(), (u32)~0); +} + +always_inline void +vlib_elog_main_loop_event (vlib_main_t * vm, + u32 node_index, + u64 time, + u32 n_vectors, + u32 is_return) +{ + elog_main_t * em = &vm->elog_main; + + if (VLIB_ELOG_MAIN_LOOP) + elog (em, + /* event type */ + vec_elt_at_index (is_return + ? 
vm->node_return_elog_event_types + : vm->node_call_elog_event_types, + node_index), + /* data to log */ n_vectors); +} + +void vlib_dump_context_trace (vlib_main_t *vm, u32 bi) +{ + vlib_node_main_t * vnm = &vm->node_main; + vlib_buffer_t * b; + u8 i, n; + + if (VLIB_BUFFER_TRACE_TRAJECTORY) + { + b = vlib_get_buffer (vm, bi); + n = b->pre_data[0]; + + fformat(stderr, "Context trace for bi %d b 0x%llx, visited %d\n", + bi, b, n); + + if (n == 0 || n > 20) + { + fformat(stderr, "n is unreasonable\n"); + return; + } + + + for (i = 0; i < n; i++) + { + u32 node_index; + + node_index = b->pre_data[i+1]; + + if (node_index > vec_len (vnm->nodes)) + { + fformat(stderr, "Skip bogus node index %d\n", node_index); + continue; + } + + fformat(stderr, "%v (%d)\n", vnm->nodes[node_index]->name, + node_index); + } + } + else + { + fformat(stderr, + "in vlib/buffers.h, #define VLIB_BUFFER_TRACE_TRAJECTORY 1\n"); + } +} + + +/* static_always_inline */ u64 +dispatch_node (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_node_type_t type, + vlib_node_state_t dispatch_state, + vlib_frame_t * frame, + u64 last_time_stamp) +{ + uword n, v; + u64 t; + vlib_node_main_t * nm = &vm->node_main; + vlib_next_frame_t * nf; + + if (CLIB_DEBUG > 0) + { + vlib_node_t * n = vlib_get_node (vm, node->node_index); + ASSERT (n->type == type); + } + + /* Only non-internal nodes may be disabled. */ + if (type != VLIB_NODE_TYPE_INTERNAL && node->state != dispatch_state) + { + ASSERT (type != VLIB_NODE_TYPE_INTERNAL); + return last_time_stamp; + } + + if ((type == VLIB_NODE_TYPE_PRE_INPUT || type == VLIB_NODE_TYPE_INPUT) + && dispatch_state != VLIB_NODE_STATE_INTERRUPT) + { + u32 c = node->input_main_loops_per_call; + /* Only call node when count reaches zero. */ + if (c) + { + node->input_main_loops_per_call = c - 1; + return last_time_stamp; + } + } + + /* Speculatively prefetch next frames. 
*/ + if (node->n_next_nodes > 0) + { + nf = vec_elt_at_index (nm->next_frames, node->next_frame_index); + CLIB_PREFETCH (nf, 4 * sizeof (nf[0]), WRITE); + } + + vm->cpu_time_last_node_dispatch = last_time_stamp; + + if (1 /* || vm->cpu_index == node->cpu_index */) + { + vlib_main_t *stat_vm; + + stat_vm = /* vlib_mains ? vlib_mains[0] : */ vm; + + vlib_elog_main_loop_event (vm, node->node_index, + last_time_stamp, + frame ? frame->n_vectors : 0, + /* is_after */ 0); + + /* + * Turn this on if you run into + * "bad monkey" contexts, and you want to know exactly + * which nodes they've visited... See ixge.c... + */ + if (VLIB_BUFFER_TRACE_TRAJECTORY && frame) + { + int i; + int log_index; + u32 * from; + from = vlib_frame_vector_args (frame); + for (i = 0; i < frame->n_vectors; i++) + { + vlib_buffer_t *b = vlib_get_buffer (vm, from[i]); + ASSERT (b->pre_data[0] < 32); + log_index = b->pre_data[0]++ + 1; + b->pre_data[log_index] = node->node_index; + } + n = node->function (vm, node, frame); + } + else + n = node->function (vm, node, frame); + + t = clib_cpu_time_now (); + + vlib_elog_main_loop_event (vm, node->node_index, t, n, /* is_after */ 1); + + vm->main_loop_vectors_processed += n; + vm->main_loop_nodes_processed += n > 0; + + v = vlib_node_runtime_update_stats (stat_vm, node, + /* n_calls */ 1, + /* n_vectors */ n, + /* n_clocks */ t - last_time_stamp); + + /* When in interrupt mode and vector rate crosses threshold switch to + polling mode. 
*/ + if ((DPDK == 0 && dispatch_state == VLIB_NODE_STATE_INTERRUPT) + || (DPDK == 0 && dispatch_state == VLIB_NODE_STATE_POLLING + && (node->flags + & VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE))) + { + ELOG_TYPE_DECLARE (e) = { + .function = (char *) __FUNCTION__, + .format = "%s vector length %d, switching to %s", + .format_args = "T4i4t4", + .n_enum_strings = 2, + .enum_strings = { + "interrupt", "polling", + }, + }; + struct { u32 node_name, vector_length, is_polling; } * ed; + + if (dispatch_state == VLIB_NODE_STATE_INTERRUPT + && v >= nm->polling_threshold_vector_length) + { + vlib_node_t * n = vlib_get_node (vm, node->node_index); + n->state = VLIB_NODE_STATE_POLLING; + node->state = VLIB_NODE_STATE_POLLING; + ASSERT (! (node->flags & VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE)); + node->flags &= ~VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE; + node->flags |= VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE; + nm->input_node_counts_by_state[VLIB_NODE_STATE_INTERRUPT] -= 1; + nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] += 1; + + ed = ELOG_DATA (&vm->elog_main, e); + ed->node_name = n->name_elog_string; + ed->vector_length = v; + ed->is_polling = 1; + } + else if (dispatch_state == VLIB_NODE_STATE_POLLING + && v <= nm->interrupt_threshold_vector_length) + { + vlib_node_t * n = vlib_get_node (vm, node->node_index); + if (node->flags & VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE) + { + /* Switch to interrupt mode after dispatch in polling one more time. + This allows driver to re-enable interrupts. 
*/ + n->state = VLIB_NODE_STATE_INTERRUPT; + node->state = VLIB_NODE_STATE_INTERRUPT; + node->flags &= ~VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE; + nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] -= 1; + nm->input_node_counts_by_state[VLIB_NODE_STATE_INTERRUPT] += 1; + + } + else + { + node->flags |= VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE; + ed = ELOG_DATA (&vm->elog_main, e); + ed->node_name = n->name_elog_string; + ed->vector_length = v; + ed->is_polling = 0; + } + } + } + } + + return t; +} + +/* static */ u64 +dispatch_pending_node (vlib_main_t * vm, + vlib_pending_frame_t * p, + u64 last_time_stamp) +{ + vlib_node_main_t * nm = &vm->node_main; + vlib_frame_t * f; + vlib_next_frame_t * nf, nf_dummy; + vlib_node_runtime_t * n; + u32 restore_frame_index; + + n = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL], + p->node_runtime_index); + + f = vlib_get_frame (vm, p->frame_index); + if (p->next_frame_index == VLIB_PENDING_FRAME_NO_NEXT_FRAME) + { + /* No next frame: so use dummy on stack. */ + nf = &nf_dummy; + nf->flags = f->flags & VLIB_NODE_FLAG_TRACE; + nf->frame_index = ~p->frame_index; + } + else + nf = vec_elt_at_index (nm->next_frames, p->next_frame_index); + + ASSERT (f->flags & VLIB_FRAME_IS_ALLOCATED); + + /* Force allocation of new frame while current frame is being + dispatched. */ + restore_frame_index = ~0; + if (nf->frame_index == p->frame_index) + { + nf->frame_index = ~0; + nf->flags &= ~VLIB_FRAME_IS_ALLOCATED; + if (! (n->flags & VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH)) + restore_frame_index = p->frame_index; + } + + /* Frame must be pending. */ + ASSERT (f->flags & VLIB_FRAME_PENDING); + ASSERT (f->n_vectors > 0); + + /* Copy trace flag from next frame to node. + Trace flag indicates that at least one vector in the dispatched + frame is traced. */ + n->flags &= ~VLIB_NODE_FLAG_TRACE; + n->flags |= (nf->flags & VLIB_FRAME_TRACE) ? 
VLIB_NODE_FLAG_TRACE : 0; + nf->flags &= ~VLIB_FRAME_TRACE; + + last_time_stamp = dispatch_node (vm, n, + VLIB_NODE_TYPE_INTERNAL, + VLIB_NODE_STATE_POLLING, + f, last_time_stamp); + + f->flags &= ~VLIB_FRAME_PENDING; + + /* Frame is ready to be used again, so restore it. */ + if (restore_frame_index != ~0) + { + /* p->next_frame_index can change during node dispatch if node + function decides to change graph hook up. */ + nf = vec_elt_at_index (nm->next_frames, p->next_frame_index); + nf->frame_index = restore_frame_index; + nf->flags |= VLIB_FRAME_IS_ALLOCATED; + } + + if (f->flags & VLIB_FRAME_FREE_AFTER_DISPATCH) + { + ASSERT (! (n->flags & VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH)); + vlib_frame_free (vm, n, f); + } + + return last_time_stamp; +} + +always_inline uword +vlib_process_stack_is_valid (vlib_process_t * p) +{ return p->stack[0] == VLIB_PROCESS_STACK_MAGIC; } + +typedef struct { + vlib_main_t * vm; + vlib_process_t * process; + vlib_frame_t * frame; +} vlib_process_bootstrap_args_t; + +/* Called in process stack. */ +static uword vlib_process_bootstrap (uword _a) +{ + vlib_process_bootstrap_args_t * a; + vlib_main_t * vm; + vlib_node_runtime_t * node; + vlib_frame_t * f; + vlib_process_t * p; + uword n; + + a = uword_to_pointer (_a, vlib_process_bootstrap_args_t *); + + vm = a->vm; + p = a->process; + f = a->frame; + node = &p->node_runtime; + + n = node->function (vm, node, f); + + ASSERT (vlib_process_stack_is_valid (p)); + + clib_longjmp (&p->return_longjmp, n); + + return n; +} + +/* Called in main stack. 
 */
/* Run a process node from the top of its (fresh) stack.
   Returns either the node function's return value (delivered via
   longjmp out of vlib_process_bootstrap) or
   VLIB_PROCESS_RETURN_LONGJMP_SUSPEND if the process suspended. */
static_always_inline uword
vlib_process_startup (vlib_main_t * vm,
		      vlib_process_t * p,
		      vlib_frame_t * f)
{
  vlib_process_bootstrap_args_t a;
  uword r;

  a.vm = vm;
  a.process = p;
  a.frame = f;

  /* Set the landing point for the process's eventual return/suspend
     longjmp, then call the bootstrap trampoline on the process's own
     stack (stack grows down from stack + 2^log2_n_stack_bytes). */
  r = clib_setjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_RETURN);
  if (r == VLIB_PROCESS_RETURN_LONGJMP_RETURN)
    r = clib_calljmp (vlib_process_bootstrap, pointer_to_uword (&a),
		      (void *) p->stack + (1 << p->log2_n_stack_bytes));

  return r;
}

/* Resume a previously suspended process: clear the suspend-reason
   flags and longjmp back into the process's saved context.  Returns
   the value the process eventually delivers to return_longjmp
   (its final result, or SUSPEND again). */
static_always_inline uword
vlib_process_resume (vlib_process_t * p)
{
  uword r;
  p->flags &= ~(VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK
		| VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT
		| VLIB_PROCESS_RESUME_PENDING);
  r = clib_setjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_RETURN);
  if (r == VLIB_PROCESS_RETURN_LONGJMP_RETURN)
    clib_longjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_RESUME);
  return r;
}

/* Dispatch a process node for the first time (cold start).  If the
   process suspends, park its frame in nm->suspended_process_frames
   and, for clock waits, schedule resumption on the timing wheel.
   Returns the post-dispatch CPU time stamp. */
static u64
dispatch_process (vlib_main_t * vm,
		  vlib_process_t * p,
		  vlib_frame_t * f,
		  u64 last_time_stamp)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_node_runtime_t * node_runtime = &p->node_runtime;
  vlib_node_t * node = vlib_get_node (vm, node_runtime->node_index);
  u64 t;
  uword n_vectors, is_suspend;

  /* Skip disabled processes and processes already parked on a wait. */
  if (node->state != VLIB_NODE_STATE_POLLING
      || (p->flags & (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK
		      | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT)))
    return last_time_stamp;

  p->flags |= VLIB_PROCESS_IS_RUNNING;

  t = last_time_stamp;
  vlib_elog_main_loop_event (vm, node_runtime->node_index, t,
			     f ? f->n_vectors : 0, /* is_after */ 0);

  /* Save away current process for suspend. */
  nm->current_process_index = node->runtime_index;

  n_vectors = vlib_process_startup (vm, p, f);

  nm->current_process_index = ~0;

  ASSERT (n_vectors != VLIB_PROCESS_RETURN_LONGJMP_RETURN);
  is_suspend = n_vectors == VLIB_PROCESS_RETURN_LONGJMP_SUSPEND;
  if (is_suspend)
    {
      vlib_pending_frame_t * pf;

      /* Remember the in-flight frame so the resume path can hand it
	 back to the process. */
      n_vectors = 0;
      pool_get (nm->suspended_process_frames, pf);
      pf->node_runtime_index = node->runtime_index;
      pf->frame_index = f ? vlib_frame_index (vm, f) : ~0;
      pf->next_frame_index = ~0;

      p->n_suspends += 1;
      p->suspended_process_frame_index = pf - nm->suspended_process_frames;

      if (p->flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK)
	timing_wheel_insert (&nm->timing_wheel, p->resume_cpu_time,
			     vlib_timing_wheel_data_set_suspended_process (node->runtime_index));
    }
  else
    p->flags &= ~VLIB_PROCESS_IS_RUNNING;

  t = clib_cpu_time_now ();

  /* NOTE(review): is_suspend is passed in the n_vectors slot of the
     "after" event here, while dispatch_suspended_process passes
     ! is_suspend — one of the two looks inverted; confirm intended
     elog payload. */
  vlib_elog_main_loop_event (vm, node_runtime->node_index, t, is_suspend, /* is_after */ 1);

  vlib_process_update_stats (vm, p,
			     /* n_calls */ ! is_suspend,
			     /* n_vectors */ n_vectors,
			     /* n_clocks */ t - last_time_stamp);

  return t;
}

/* Public entry: kick off the process at process_index immediately
   (used e.g. when a process is created after the main loop started). */
void vlib_start_process (vlib_main_t * vm, uword process_index)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_process_t * p = vec_elt (nm->processes, process_index);
  dispatch_process (vm, p, /* frame */ 0, /* cpu_time_now */ 0);
}

/* Resume a process previously parked by dispatch_process.  If it
   suspends again on a clock wait, re-insert it into the timing wheel;
   if it finishes, release its suspended-frame bookkeeping.
   Returns the post-dispatch CPU time stamp. */
static u64
dispatch_suspended_process (vlib_main_t * vm,
			    uword process_index,
			    u64 last_time_stamp)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_node_runtime_t * node_runtime;
  vlib_node_t * node;
  vlib_frame_t * f;
  vlib_process_t * p;
  vlib_pending_frame_t * pf;
  u64 t, n_vectors, is_suspend;

  t = last_time_stamp;

  p = vec_elt (nm->processes, process_index);
  if (PREDICT_FALSE (! (p->flags & VLIB_PROCESS_IS_RUNNING)))
    return last_time_stamp;

  ASSERT (p->flags & (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK
		      | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT));

  pf = pool_elt_at_index (nm->suspended_process_frames, p->suspended_process_frame_index);

  node_runtime = &p->node_runtime;
  node = vlib_get_node (vm, node_runtime->node_index);
  f = pf->frame_index != ~0 ? vlib_get_frame (vm, pf->frame_index) : 0;

  vlib_elog_main_loop_event (vm, node_runtime->node_index, t, f ? f->n_vectors : 0, /* is_after */ 0);

  /* Save away current process for suspend. */
  nm->current_process_index = node->runtime_index;

  n_vectors = vlib_process_resume (p);
  /* NOTE(review): t is overwritten again below after the suspend
     bookkeeping; this first read appears redundant. */
  t = clib_cpu_time_now ();

  nm->current_process_index = ~0;

  is_suspend = n_vectors == VLIB_PROCESS_RETURN_LONGJMP_SUSPEND;
  if (is_suspend)
    {
      /* Suspend it again. */
      n_vectors = 0;
      p->n_suspends += 1;
      if (p->flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK)
	timing_wheel_insert (&nm->timing_wheel, p->resume_cpu_time,
			     vlib_timing_wheel_data_set_suspended_process (node->runtime_index));
    }
  else
    {
      /* Process ran to completion: release the parked frame record. */
      p->flags &= ~VLIB_PROCESS_IS_RUNNING;
      p->suspended_process_frame_index = ~0;
      pool_put (nm->suspended_process_frames, pf);
    }

  t = clib_cpu_time_now ();
  /* NOTE(review): ! is_suspend here vs is_suspend in dispatch_process
     — see note there; confirm which polarity the elog consumer expects. */
  vlib_elog_main_loop_event (vm, node_runtime->node_index, t, ! is_suspend, /* is_after */ 1);

  vlib_process_update_stats (vm, p,
			     /* n_calls */ ! is_suspend,
			     /* n_vectors */ n_vectors,
			     /* n_clocks */ t - last_time_stamp);

  return t;
}

/* The vlib scheduler.  Never returns (exit is via longjmp to
   vm->main_loop_exit).  Per iteration: pre-input nodes, input nodes
   (polling), pending control-plane queue signal, pending interrupts,
   timing-wheel expiry (timed events + suspended processes), then the
   pending-frame work list until drained. */
static void vlib_main_loop (vlib_main_t * vm)
{
  vlib_node_main_t * nm = &vm->node_main;
  uword i;
  u64 cpu_time_now;

  /* Initialize pending node vector. */
  vec_resize (nm->pending_frames, 32);
  _vec_len (nm->pending_frames) = 0;

  /* Mark time of main loop start. */
  cpu_time_now = vm->clib_time.last_cpu_time;
  vm->cpu_time_main_loop_start = cpu_time_now;

  /* Arrange for first level of timing wheel to cover times we care
     most about. */
  nm->timing_wheel.min_sched_time = 10e-6;
  nm->timing_wheel.max_sched_time = 10e-3;
  timing_wheel_init (&nm->timing_wheel,
		     cpu_time_now,
		     vm->clib_time.clocks_per_second);

  /* Pre-allocate expired nodes. */
  vec_alloc (nm->data_from_advancing_timing_wheel, 32);
  vec_alloc (nm->pending_interrupt_node_runtime_indices, 32);

  /* Default hysteresis thresholds for interrupt <-> polling mode
     switching of input nodes (vector lengths). */
  if (! nm->polling_threshold_vector_length)
    nm->polling_threshold_vector_length = 10;
  if (! nm->interrupt_threshold_vector_length)
    nm->interrupt_threshold_vector_length = 5;

  nm->current_process_index = ~0;

  /* Start all processes. */
  {
    /* NOTE(review): this inner i shadows the function-scope i. */
    uword i;
    for (i = 0; i < vec_len (nm->processes); i++)
      cpu_time_now = dispatch_process (vm, nm->processes[i], /* frame */ 0, cpu_time_now);
  }

  while (1)
    {
      vlib_node_runtime_t * n;

      /* Process pre-input nodes. */
      vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_PRE_INPUT])
	cpu_time_now = dispatch_node (vm, n,
				      VLIB_NODE_TYPE_PRE_INPUT,
				      VLIB_NODE_STATE_POLLING,
				      /* frame */ 0,
				      cpu_time_now);

      /* Next process input nodes. */
      vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_INPUT])
	cpu_time_now = dispatch_node (vm, n,
				      VLIB_NODE_TYPE_INPUT,
				      VLIB_NODE_STATE_POLLING,
				      /* frame */ 0,
				      cpu_time_now);

      /* Drain the control-plane API queue if its signal is pending. */
      if (PREDICT_FALSE(vm->queue_signal_pending))
	if (vm->queue_signal_callback)
	  vm->queue_signal_callback (vm);

      /* Next handle interrupts. */
      {
	uword l = _vec_len (nm->pending_interrupt_node_runtime_indices);
	uword i;
	if (l > 0)
	  {
	    /* Reset the vector first: dispatched nodes may post new
	       interrupts for the next iteration. */
	    _vec_len (nm->pending_interrupt_node_runtime_indices) = 0;
	    for (i = 0; i < l; i++)
	      {
		n = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT],
				      nm->pending_interrupt_node_runtime_indices[i]);
		cpu_time_now = dispatch_node (vm, n,
					      VLIB_NODE_TYPE_INPUT,
					      VLIB_NODE_STATE_INTERRUPT,
					      /* frame */ 0,
					      cpu_time_now);
	      }
	  }
      }

      /* Check if process nodes have expired from timing wheel. */
      nm->data_from_advancing_timing_wheel
	= timing_wheel_advance (&nm->timing_wheel, cpu_time_now,
				nm->data_from_advancing_timing_wheel,
				&nm->cpu_time_next_process_ready);

      ASSERT (nm->data_from_advancing_timing_wheel != 0);
      if (PREDICT_FALSE (_vec_len (nm->data_from_advancing_timing_wheel) > 0))
	{
	  uword i;

	  /* Label re-entered from below when internal-node dispatch
	     added more timing-wheel work. */
	processes_timing_wheel_data:
	  for (i = 0; i < _vec_len (nm->data_from_advancing_timing_wheel); i++)
	    {
	      u32 d = nm->data_from_advancing_timing_wheel[i];
	      u32 di = vlib_timing_wheel_data_get_index (d);

	      if (vlib_timing_wheel_data_is_timed_event (d))
		{
		  /* Expired timed event: deliver its payload to the
		     target process, then recycle the event record. */
		  vlib_signal_timed_event_data_t * te = pool_elt_at_index (nm->signal_timed_event_data_pool, di);
		  vlib_node_t * n = vlib_get_node (vm, te->process_node_index);
		  vlib_process_t * p = vec_elt (nm->processes, n->runtime_index);
		  void * data;
		  data = vlib_process_signal_event_helper (nm, n, p, te->event_type_index, te->n_data_elts, te->n_data_elt_bytes);
		  if (te->n_data_bytes < sizeof (te->inline_event_data))
		    memcpy (data, te->inline_event_data, te->n_data_bytes);
		  else
		    {
		      memcpy (data, te->event_data_as_vector, te->n_data_bytes);
		      vec_free (te->event_data_as_vector);
		    }
		  pool_put (nm->signal_timed_event_data_pool, te);
		}
	      else
		{
		  /* Expired clock wait: resume the suspended process. */
		  cpu_time_now = clib_cpu_time_now();
		  cpu_time_now = dispatch_suspended_process (vm, di, cpu_time_now);
		}
	    }

	  /* Reset vector. */
	  _vec_len (nm->data_from_advancing_timing_wheel) = 0;
	}

      /* Input nodes may have added work to the pending vector.
	 Process pending vector until there is nothing left.
	 All pending vectors will be processed from input -> output. */
      for (i = 0; i < _vec_len (nm->pending_frames); i++)
	cpu_time_now = dispatch_pending_node (vm, nm->pending_frames + i,
					      cpu_time_now);
      /* Reset pending vector for next iteration. */
      _vec_len (nm->pending_frames) = 0;

      /* Pending internal nodes may resume processes. */
      if (_vec_len (nm->data_from_advancing_timing_wheel) > 0)
	goto processes_timing_wheel_data;

      vlib_increment_main_loop_counter (vm);

      /* Record time stamp in case there are no enabled nodes and above
	 calls do not update time stamp. */
      cpu_time_now = clib_cpu_time_now ();
    }
}

vlib_main_t vlib_global_main;

/* Early "vlib" config section handler: "memory-trace" and
   "elog-events <n>" knobs.  Returns 0 on success or a parse error. */
static clib_error_t *
vlib_main_configure (vlib_main_t * vm, unformat_input_t * input)
{
  int turn_on_mem_trace = 0;

  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (input, "memory-trace"))
	turn_on_mem_trace = 1;

      else if (unformat (input, "elog-events %d",
			 &vm->elog_main.event_ring_size))
	;
      else
	return unformat_parse_error (input);
    }

  unformat_free (input);

  /* Enable memory trace as early as possible. */
  if (turn_on_mem_trace)
    clib_mem_trace (1);

  return 0;
}

VLIB_EARLY_CONFIG_FUNCTION (vlib_main_configure, "vlib");

/* Main function.
   One-time vlib bring-up: event log, threads, static node
   registration, init functions, node graph, default buffer free list,
   config functions — then enter the (non-returning) main loop.
   The main loop exits only via longjmp to vm->main_loop_exit.
   Always returns 0; errors are reported, not propagated. */
int vlib_main (vlib_main_t * vm, unformat_input_t * input)
{
  clib_error_t * error;

  clib_time_init (&vm->clib_time);

  /* Turn on event log. */
  if (! vm->elog_main.event_ring_size)
    vm->elog_main.event_ring_size = 128 << 10;
  elog_init (&vm->elog_main, vm->elog_main.event_ring_size);
  elog_enable_disable (&vm->elog_main, 1);

  /* Default name. */
  if (! vm->name)
    vm->name = "VLIB";

  vec_validate (vm->buffer_main, 0);

  if ((error = vlib_thread_init (vm)))
    {
      clib_error_report (error);
      goto done;
    }

  /* Register static nodes so that init functions may use them. */
  vlib_register_all_static_nodes (vm);

  /* Set seed for random number generator.
     Allow user to specify seed to make random sequence deterministic. */
  if (! unformat (input, "seed %wd", &vm->random_seed))
    vm->random_seed = clib_cpu_time_now ();
  clib_random_buffer_init (&vm->random_buffer, vm->random_seed);

  /* See unix/main.c; most likely already set up */
  if (vm->init_functions_called == 0)
    vm->init_functions_called = hash_create (0, /* value bytes */ 0);
  if ((error = vlib_call_all_init_functions (vm)))
    goto done;

  /* Initialize node graph. */
  if ((error = vlib_node_main_init (vm)))
    {
      /* Arrange for graph hook up error to not be fatal when debugging. */
      if (CLIB_DEBUG > 0)
	clib_error_report (error);
      else
	goto done;
    }

  /* Create default buffer free list. */
  vlib_buffer_get_or_create_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES,
				       "default");

  /* Arm the main-loop exit longjmp target.  A later longjmp lands
     back here with the exit code as the switch value. */
  switch (clib_setjmp (&vm->main_loop_exit, VLIB_MAIN_LOOP_EXIT_NONE))
    {
    case VLIB_MAIN_LOOP_EXIT_NONE:
      vm->main_loop_exit_set = 1;
      break;

    case VLIB_MAIN_LOOP_EXIT_CLI:
      goto done;

    default:
      error = vm->main_loop_error;
      goto done;
    }

  if ((error = vlib_call_all_config_functions (vm, input, 0 /* is_early */)))
    goto done;

  /* Call all main loop enter functions. */
  {
    clib_error_t * sub_error;
    sub_error = vlib_call_all_main_loop_enter_functions (vm);
    if (sub_error)
      clib_error_report (sub_error);
  }

  vlib_main_loop (vm);

 done:
  /* Call all exit functions. */
  {
    clib_error_t * sub_error;
    sub_error = vlib_call_all_main_loop_exit_functions (vm);
    if (sub_error)
      clib_error_report (sub_error);
  }

  if (error)
    clib_error_report (error);

  return 0;
}
diff --git a/vlib/vlib/main.h b/vlib/vlib/main.h
new file mode 100644
index 00000000000..5a8d745661b
--- /dev/null
+++ b/vlib/vlib/main.h
@@ -0,0 +1,315 @@
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * main.h: VLIB main data structure
 *
 * Copyright (c) 2008 Eliot Dresselhaus
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#ifndef included_vlib_main_h
#define included_vlib_main_h

#include <vppinfra/elog.h>
#include <vppinfra/format.h>
#include <vppinfra/longjmp.h>
#include <vppinfra/pool.h>
#include <vppinfra/random_buffer.h>
#include <vppinfra/time.h>

#include <pthread.h>


/* By default turn off node/error event logging.
   Override with -DVLIB_ELOG_MAIN_LOOP */
#ifndef VLIB_ELOG_MAIN_LOOP
#define VLIB_ELOG_MAIN_LOOP 0
#endif

/* Top-level per-instance vlib state: time base, main-loop counters,
   exit longjmp, buffer/physmem hooks, node graph, CLI, tracing,
   error and event-log state, and init/config registrations. */
typedef struct vlib_main_t {
  /* Instruction level timing state. */
  clib_time_t clib_time;

  /* Time stamp of last node dispatch. */
  u64 cpu_time_last_node_dispatch;

  /* Time stamp when main loop was entered (time 0). */
  u64 cpu_time_main_loop_start;

  /* Incremented once for each main loop. */
  u32 main_loop_count;

  /* Count of vectors processed this main loop. */
  u32 main_loop_vectors_processed;
  u32 main_loop_nodes_processed;

  /* Circular buffer of input node vector counts.
     Indexed by low bits of
     (main_loop_count >> VLIB_LOG2_INPUT_VECTORS_PER_MAIN_LOOP). */
  u32 vector_counts_per_main_loop[2];
  u32 node_counts_per_main_loop[2];

  /* Every so often we switch to the next counter. */
#define VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE 7

  /* Jump target to exit main loop with given code. */
  u32 main_loop_exit_set;
  clib_longjmp_t main_loop_exit;
#define VLIB_MAIN_LOOP_EXIT_NONE 0
#define VLIB_MAIN_LOOP_EXIT_PANIC 1
  /* Exit via CLI. */
#define VLIB_MAIN_LOOP_EXIT_CLI 2

  /* Error marker to use when exiting main loop. */
  clib_error_t * main_loop_error;

  /* Name for e.g. syslog. */
  char * name;

  /* Start and size of CLIB heap. */
  void * heap_base;
  uword heap_size;

  vlib_buffer_main_t * buffer_main;

  vlib_physmem_main_t physmem_main;

  /* Allocate/free buffer memory for DMA transfers, descriptor rings, etc.
     buffer memory is guaranteed to be cache-aligned. */
  void * (* os_physmem_alloc_aligned) (vlib_physmem_main_t * pm,
				       uword n_bytes,
				       uword alignment);
  void (* os_physmem_free) (void * x);

  /* Node graph main structure. */
  vlib_node_main_t node_main;

  /* Command line interface. */
  vlib_cli_main_t cli_main;

  /* Packet trace buffer. */
  vlib_trace_main_t trace_main;

  /* Error handling. */
  vlib_error_main_t error_main;

  /* Punt packets to underlying operating system for when fast switching
     code does not know what to do. */
  void (* os_punt_frame) (struct vlib_main_t * vm,
			  struct vlib_node_runtime_t * node,
			  vlib_frame_t * frame);

  /* Multicast distribution.  Set to zero for MC disabled. */
  mc_main_t * mc_main;

  /* Stream index to use for distribution when MC is enabled. */
  u32 mc_stream_index;

  vlib_one_time_waiting_process_t * procs_waiting_for_mc_stream_join;

  /* Event logger. */
  elog_main_t elog_main;

  /* Node call and return event types. */
  elog_event_type_t * node_call_elog_event_types;
  elog_event_type_t * node_return_elog_event_types;

  elog_event_type_t * error_elog_event_types;

  /* Seed for random number generator. */
  uword random_seed;

  /* Buffer of random data for various uses. */
  clib_random_buffer_t random_buffer;

  /* Hash table to record which init functions have been called. */
  uword * init_functions_called;

  /* to compare with node runtime */
  u32 cpu_index;

  void **mbuf_alloc_list;

  /* List of init functions to call, setup by constructors */
  _vlib_init_function_list_elt_t *init_function_registrations;
  _vlib_init_function_list_elt_t *main_loop_enter_function_registrations;
  _vlib_init_function_list_elt_t *main_loop_exit_function_registrations;
  _vlib_init_function_list_elt_t *api_init_function_registrations;
  vlib_config_function_runtime_t *config_function_registrations;
  mc_serialize_msg_t *mc_msg_registrations; /* mc_main is a pointer... */

  /* control-plane API queue signal pending */
  volatile u32 queue_signal_pending;
  void (*queue_signal_callback)(struct vlib_main_t *);
} vlib_main_t;

/* Global main structure.
   NOTE(review): defined (not extern-declared) in a header — relies on
   C tentative-definition/common-symbol linkage; consider extern here
   plus one definition in a .c file. */
vlib_main_t vlib_global_main;

/* Current wall time in seconds (f64) from the calibrated CPU clock. */
always_inline f64
vlib_time_now (vlib_main_t * vm)
{ return clib_time_now (&vm->clib_time); }

/* Convert a raw CPU tick count n to seconds on vm's time base. */
always_inline f64
vlib_time_now_ticks (vlib_main_t * vm, u64 n)
{ return clib_time_now_internal (&vm->clib_time, n); }

/* Busy wait for specified time. */
always_inline void
vlib_time_wait (vlib_main_t * vm, f64 wait)
{
  f64 t = vlib_time_now (vm);
  f64 limit = t + wait;
  while (t < limit)
    t = vlib_time_now (vm);
}

/* Time a piece of code. */
#define vlib_time_code(vm,body)			\
do {						\
    f64 _t[2];					\
    _t[0] = vlib_time_now (vm);			\
    do { body; } while (0);			\
    _t[1] = vlib_time_now (vm);			\
    clib_warning ("%.7e", _t[1] - _t[0]);	\
} while (0)

/* Repeatedly suspend the current process for suspend_time until test
   is true or timeout_time elapses; evaluates to the final test result
   (GCC statement expression). */
#define vlib_wait_with_timeout(vm,suspend_time,timeout_time,test)	\
({									\
    uword __vlib_wait_with_timeout = 0;					\
    f64 __vlib_wait_time = 0;						\
    while (! (__vlib_wait_with_timeout = (test))			\
	   && __vlib_wait_time < (timeout_time))			\
      {									\
	vlib_process_suspend (vm, suspend_time);			\
	__vlib_wait_time += suspend_time;				\
      }									\
    __vlib_wait_with_timeout;						\
})

/* Abort the main loop via longjmp, recording error for the handler. */
always_inline void
vlib_panic_with_error (vlib_main_t * vm, clib_error_t * error)
{
  vm->main_loop_error = error;
  clib_longjmp (&vm->main_loop_exit, VLIB_MAIN_LOOP_EXIT_PANIC);
}

#define vlib_panic_with_msg(vm,args...) \
  vlib_panic_with_error (vm, clib_error_return (0, args))

always_inline void
vlib_panic (vlib_main_t * vm)
{ vlib_panic_with_error (vm, 0); }

/* Index into the circular stats counters, delta slots from the
   current one (delta may be negative, e.g. -1 for the last full slot). */
always_inline u32
vlib_vector_input_stats_index (vlib_main_t * vm, word delta)
{
  u32 i;
  i = vm->main_loop_count >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE;
  ASSERT (is_pow2 (ARRAY_LEN (vm->vector_counts_per_main_loop)));
  return (i + delta) & (ARRAY_LEN (vm->vector_counts_per_main_loop) - 1);
}

/* Estimate input rate based on previous
   2^VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE
   samples. */
always_inline u32
vlib_last_vectors_per_main_loop (vlib_main_t * vm)
{
  u32 i = vlib_vector_input_stats_index (vm, -1);
  u32 n = vm->vector_counts_per_main_loop[i];
  return n >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE;
}

/* Total ave vector count per iteration of main loop. */
always_inline f64
vlib_last_vectors_per_main_loop_as_f64 (vlib_main_t * vm)
{
  u32 i = vlib_vector_input_stats_index (vm, -1);
  u32 v = vm->vector_counts_per_main_loop[i];
  return (f64) v / (f64) (1 << VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE);
}

/* Total ave vectors/node count per iteration of main loop. */
always_inline f64
vlib_last_vector_length_per_node (vlib_main_t * vm)
{
  u32 i = vlib_vector_input_stats_index (vm, -1);
  u32 v = vm->vector_counts_per_main_loop[i];
  u32 n = vm->node_counts_per_main_loop[i];
  return n == 0 ? 0 : (f64) v / (f64) n;
}

/* NOTE(review): non-static, non-extern global defined in a header —
   every includer gets a tentative definition (common symbol); should
   be extern here with one definition in a .c file. */
u32 wraps;

/* Roll per-loop vector/node counts into the circular stats slots;
   a "wrap" starts a fresh slot every
   2^VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE iterations. */
always_inline void
vlib_increment_main_loop_counter (vlib_main_t * vm)
{
  u32 i, c, n, v, is_wrap;

  c = vm->main_loop_count++;

  is_wrap = (c & pow2_mask (VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE)) == 0;

  if (is_wrap)
    wraps++;

  i = vlib_vector_input_stats_index (vm, /* delta */ is_wrap);

  v = is_wrap ? 0 : vm->vector_counts_per_main_loop[i];
  n = is_wrap ? 0 : vm->node_counts_per_main_loop[i];

  v += vm->main_loop_vectors_processed;
  n += vm->main_loop_nodes_processed;
  vm->main_loop_vectors_processed = 0;
  vm->main_loop_nodes_processed = 0;
  vm->vector_counts_per_main_loop[i] = v;
  vm->node_counts_per_main_loop[i] = n;
}

/* Register the callback the main loop invokes when
   queue_signal_pending is set (control-plane API queue drain). */
always_inline void vlib_set_queue_signal_callback
(vlib_main_t *vm, void (*fp)(vlib_main_t *))
{
  vm->queue_signal_callback = fp;
}

/* Main routine.
*/ +int vlib_main (vlib_main_t * vm, unformat_input_t * input); + +/* Thread stacks, for os_get_cpu_number */ +u8 **vlib_thread_stacks; + +/* Number of thread stacks that the application needs */ +u32 vlib_app_num_thread_stacks_needed (void) __attribute__ ((weak)); + +#endif /* included_vlib_main_h */ diff --git a/vlib/vlib/mc.c b/vlib/vlib/mc.c new file mode 100644 index 00000000000..460145ef0e6 --- /dev/null +++ b/vlib/vlib/mc.c @@ -0,0 +1,2354 @@ +/* + * mc.c: vlib reliable sequenced multicast distributed applications + * + * Copyright (c) 2010 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vlib/vlib.h> + +/* + * 1 to enable msg id training wheels, which are useful for tracking + * down catchup and/or partitioned network problems + */ +#define MSG_ID_DEBUG 0 + +static format_function_t format_mc_stream_state; + +static u32 elog_id_for_peer_id (mc_main_t * m, u64 peer_id) +{ + uword * p, r; + mhash_t * h = &m->elog_id_by_peer_id; + + if (! m->elog_id_by_peer_id.hash) + mhash_init (h, sizeof (uword), sizeof (mc_peer_id_t)); + + p = mhash_get (h, &peer_id); + if (p) + return p[0]; + r = elog_string (m->elog_main, "%U", + m->transport.format_peer_id, peer_id); + mhash_set (h, &peer_id, r, /* old_value */ 0); + return r; +} + +static u32 elog_id_for_msg_name (mc_main_t * m, char *msg_name) +{ + uword * p, r; + uword * h = m->elog_id_by_msg_name; + u8 *name_copy; + + if (! 
h) + h = m->elog_id_by_msg_name + = hash_create_string (0, sizeof (uword)); + + p = hash_get_mem (h, msg_name); + if (p) + return p[0]; + r = elog_string (m->elog_main, "%s", msg_name); + + name_copy = format (0, "%s%c", msg_name, 0); + + hash_set_mem (h, name_copy, r); + m->elog_id_by_msg_name = h; + + return r; +} + +static void elog_tx_msg (mc_main_t * m, u32 stream_id, u32 local_sequence, u32 retry_count) +{ + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "tx-msg: stream %d local seq %d attempt %d", + .format_args = "i4i4i4", + }; + struct { u32 stream_id, local_sequence, retry_count; } * ed; + ed = ELOG_DATA (m->elog_main, e); + ed->stream_id = stream_id; + ed->local_sequence = local_sequence; + ed->retry_count = retry_count; + } +} + +/* + * seq_cmp + * correctly compare two unsigned sequence numbers. + * This function works so long as x and y are within 2**(n-1) of each + * other, where n = bits(x, y). + * + * Magic decoder ring: + * seq_cmp == 0 => x and y are equal + * seq_cmp < 0 => x is "in the past" with respect to y + * seq_cmp > 0 => x is "in the future" with respect to y + */ +always_inline i32 mc_seq_cmp (u32 x, u32 y) +{ return (i32) x - (i32) y;} + +void * mc_get_vlib_buffer (vlib_main_t * vm, u32 n_bytes, u32 * bi_return) +{ + u32 n_alloc, bi; + vlib_buffer_t * b; + + n_alloc = vlib_buffer_alloc (vm, &bi, 1); + ASSERT (n_alloc == 1); + + b = vlib_get_buffer (vm, bi); + b->current_length = n_bytes; + *bi_return = bi; + return (void *) b->data; +} + +static void +delete_peer_with_index (mc_main_t * mcm, mc_stream_t * s, + uword index, + int notify_application) +{ + mc_stream_peer_t * p = pool_elt_at_index (s->peers, index); + ASSERT (p != 0); + if (s->config.peer_died && notify_application) + s->config.peer_died (mcm, s, p->id); + + s->all_peer_bitmap = clib_bitmap_andnoti (s->all_peer_bitmap, p - s->peers); + + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "delete peer %s from all_peer_bitmap", + 
.format_args = "T4", + }; + struct { u32 peer; } * ed = 0; + + ed = ELOG_DATA (mcm->elog_main, e); + ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64); + } + /* Do not delete the pool / hash table entries, or we lose sequence number state */ +} + +static mc_stream_peer_t * +get_or_create_peer_with_id (mc_main_t * mcm, + mc_stream_t * s, mc_peer_id_t id, + int * created) +{ + uword * q = mhash_get (&s->peer_index_by_id, &id); + mc_stream_peer_t * p; + + if (q) + { + p = pool_elt_at_index (s->peers, q[0]); + goto done; + } + + pool_get (s->peers, p); + memset (p, 0, sizeof (p[0])); + p->id = id; + p->last_sequence_received = ~0; + mhash_set (&s->peer_index_by_id, &id, p - s->peers, /* old_value */ 0); + if (created) + *created = 1; + + done: + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "get_or_create %s peer %s stream %d seq %d", + .format_args = "t4T4i4i4", + .n_enum_strings = 2, + .enum_strings = { "old", "new", }, + }; + struct { u32 is_new, peer, stream_index, rx_sequence; } * ed = 0; + + ed = ELOG_DATA (mcm->elog_main, e); + ed->is_new = q ? 
0 : 1; + ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64); + ed->stream_index = s->index; + ed->rx_sequence = p->last_sequence_received; + } + /* $$$$ Enable or reenable this peer */ + s->all_peer_bitmap = clib_bitmap_ori (s->all_peer_bitmap, p - s->peers); + return p; +} + +static void maybe_send_window_open_event (vlib_main_t * vm, mc_stream_t * stream) +{ + vlib_one_time_waiting_process_t * p; + + if (pool_elts (stream->retry_pool) >= stream->config.window_size) + return; + + vec_foreach (p, stream->procs_waiting_for_open_window) + vlib_signal_one_time_waiting_process (vm, p); + + if (stream->procs_waiting_for_open_window) + _vec_len (stream->procs_waiting_for_open_window) = 0; +} + +static void mc_retry_free (mc_main_t * mcm, mc_stream_t *s, mc_retry_t * r) +{ + mc_retry_t record, *retp; + + if (r->unacked_by_peer_bitmap) + _vec_len (r->unacked_by_peer_bitmap) = 0; + + if (clib_fifo_elts (s->retired_fifo) >= 2 * s->config.window_size) + { + clib_fifo_sub1 (s->retired_fifo, record); + vlib_buffer_free_one (mcm->vlib_main, record.buffer_index); + } + + clib_fifo_add2 (s->retired_fifo, retp); + + retp->buffer_index = r->buffer_index; + retp->local_sequence = r->local_sequence; + + r->buffer_index = ~0; /* poison buffer index in this retry */ +} + +static void mc_resend_retired (mc_main_t *mcm, mc_stream_t *s, u32 local_sequence) +{ + mc_retry_t *retry; + + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "resend-retired: search for local seq %d", + .format_args = "i4", + }; + struct { u32 local_sequence; } * ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->local_sequence = local_sequence; + } + + clib_fifo_foreach + (retry, s->retired_fifo, + ({ + if (retry->local_sequence == local_sequence) + { + elog_tx_msg (mcm, s->index, retry->local_sequence, -13); + + mcm->transport.tx_buffer + (mcm->transport.opaque, + MC_TRANSPORT_USER_REQUEST_TO_RELAY, + retry->buffer_index); + return; + } + })); + + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE 
(e) = { + .format = "resend-retired: FAILED search for local seq %d", + .format_args = "i4", + }; + struct { u32 local_sequence; } * ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->local_sequence = local_sequence; + } +} + +static uword * +delete_retry_fifo_elt (mc_main_t * mcm, + mc_stream_t * stream, + mc_retry_t * r, + uword * dead_peer_bitmap) +{ + mc_stream_peer_t * p; + + pool_foreach (p, stream->peers, ({ + uword pi = p - stream->peers; + uword is_alive = 0 == clib_bitmap_get (r->unacked_by_peer_bitmap, pi); + + if (! is_alive) + dead_peer_bitmap = clib_bitmap_ori (dead_peer_bitmap, pi); + + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "delete_retry_fifo_elt: peer %s is %s", + .format_args = "T4t4", + .n_enum_strings = 2, + .enum_strings = { "alive", "dead", }, + }; + struct { u32 peer, is_alive; } * ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64); + ed->is_alive = is_alive; + } + })); + + hash_unset (stream->retry_index_by_local_sequence, r->local_sequence); + mc_retry_free (mcm, stream, r); + + return dead_peer_bitmap; +} + +always_inline mc_retry_t * +prev_retry (mc_stream_t * s, mc_retry_t * r) +{ + return (r->prev_index != ~0 + ? pool_elt_at_index (s->retry_pool, r->prev_index) + : 0); +} + +always_inline mc_retry_t * +next_retry (mc_stream_t * s, mc_retry_t * r) +{ + return (r->next_index != ~0 + ? 
pool_elt_at_index (s->retry_pool, r->next_index) + : 0); +} + +always_inline void +remove_retry_from_pool (mc_stream_t * s, mc_retry_t * r) +{ + mc_retry_t * p = prev_retry (s, r); + mc_retry_t * n = next_retry (s, r); + + if (p) + p->next_index = r->next_index; + else + s->retry_head_index = r->next_index; + if (n) + n->prev_index = r->prev_index; + else + s->retry_tail_index = r->prev_index; + + pool_put_index (s->retry_pool, r - s->retry_pool); +} + +static void check_retry (mc_main_t * mcm, mc_stream_t * s) +{ + mc_retry_t * r; + vlib_main_t * vm = mcm->vlib_main; + f64 now = vlib_time_now(vm); + uword * dead_peer_bitmap = 0; + u32 ri, ri_next; + + for (ri = s->retry_head_index; ri != ~0; ri = ri_next) + { + r = pool_elt_at_index (s->retry_pool, ri); + ri_next = r->next_index; + + if (now < r->sent_at + s->config.retry_interval) + continue; + + r->n_retries += 1; + if (r->n_retries > s->config.retry_limit) + { + dead_peer_bitmap = + delete_retry_fifo_elt (mcm, s, r, dead_peer_bitmap); + remove_retry_from_pool (s, r); + } + else + { + if (MC_EVENT_LOGGING > 0) + { + mc_stream_peer_t * p; + ELOG_TYPE_DECLARE (t) = { + .format = "resend local seq %d attempt %d", + .format_args = "i4i4", + }; + + pool_foreach (p, s->peers, ({ + if (clib_bitmap_get (r->unacked_by_peer_bitmap, p - s->peers)) + { + ELOG_TYPE_DECLARE (ev) = { + .format = "resend: needed by peer %s local seq %d", + .format_args = "T4i4", + }; + struct { u32 peer, rx_sequence; } * ed; + ed = ELOG_DATA (mcm->elog_main, ev); + ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64); + ed->rx_sequence = r->local_sequence; + } + })); + + struct { u32 sequence; u32 trail; } * ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->sequence = r->local_sequence; + ed->trail = r->n_retries; + } + + r->sent_at = vlib_time_now (vm); + s->stats.n_retries += 1; + + elog_tx_msg (mcm, s->index, r->local_sequence, r->n_retries); + + mcm->transport.tx_buffer + (mcm->transport.opaque, + MC_TRANSPORT_USER_REQUEST_TO_RELAY, + 
r->buffer_index); + } + } + + maybe_send_window_open_event (mcm->vlib_main, s); + + /* Delete any dead peers we've found. */ + if (! clib_bitmap_is_zero (dead_peer_bitmap)) + { + uword i; + + clib_bitmap_foreach (i, dead_peer_bitmap, ({ + delete_peer_with_index (mcm, s, i, /* notify_application */ 1); + + /* Delete any references to just deleted peer in retry pool. */ + pool_foreach (r, s->retry_pool, ({ + r->unacked_by_peer_bitmap = + clib_bitmap_andnoti (r->unacked_by_peer_bitmap, i); + })); + })); + clib_bitmap_free (dead_peer_bitmap); + } +} + +always_inline mc_main_t * +mc_node_get_main (vlib_node_runtime_t * node) +{ + mc_main_t ** p = (void *) node->runtime_data; + return p[0]; +} + +static uword +mc_retry_process (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * f) +{ + mc_main_t * mcm = mc_node_get_main (node); + mc_stream_t * s; + + while (1) + { + vlib_process_suspend (vm, 1.0); + vec_foreach (s, mcm->stream_vector) + { + if (s->state != MC_STREAM_STATE_invalid) + check_retry (mcm, s); + } + } + return 0; /* not likely */ +} + +static void send_join_or_leave_request (mc_main_t * mcm, u32 stream_index, u32 is_join) +{ + vlib_main_t * vm = mcm->vlib_main; + mc_msg_join_or_leave_request_t * mp; + u32 bi; + + mp = mc_get_vlib_buffer (vm, sizeof (mp[0]), &bi); + memset(mp, 0, sizeof (*mp)); + mp->type = MC_MSG_TYPE_join_or_leave_request; + mp->peer_id = mcm->transport.our_ack_peer_id; + mp->stream_index = stream_index; + mp->is_join = is_join; + + mc_byte_swap_msg_join_or_leave_request (mp); + + /* + * These msgs are unnumbered, unordered so send on the from-relay + * channel. 
+ */ + mcm->transport.tx_buffer (mcm->transport.opaque, MC_TRANSPORT_JOIN, bi); +} + +static uword +mc_join_ager_process (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * f) +{ + mc_main_t * mcm = mc_node_get_main (node); + + while (1) + { + if (mcm->joins_in_progress) + { + mc_stream_t * s; + vlib_one_time_waiting_process_t * p; + f64 now = vlib_time_now (vm); + + vec_foreach (s, mcm->stream_vector) + { + if (s->state != MC_STREAM_STATE_join_in_progress) + continue; + + if (now > s->join_timeout) + { + s->state = MC_STREAM_STATE_ready; + + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "stream %d join timeout", + }; + ELOG (mcm->elog_main, e, s->index); + } + /* Make sure that this app instance exists as a stream peer, + or we may answer a catchup request with a NULL + all_peer_bitmap... */ + (void) get_or_create_peer_with_id + (mcm, s, mcm->transport.our_ack_peer_id, /* created */ 0); + + vec_foreach (p, s->procs_waiting_for_join_done) + vlib_signal_one_time_waiting_process (vm, p); + if (s->procs_waiting_for_join_done) + _vec_len (s->procs_waiting_for_join_done) = 0; + + mcm->joins_in_progress--; + ASSERT (mcm->joins_in_progress >= 0); + } + else + { + /* Resent join request which may have been lost. 
*/ + send_join_or_leave_request (mcm, s->index, + 1 /* is_join */); + + /* We're *not* alone, retry for as long as it takes */ + if (mcm->relay_state == MC_RELAY_STATE_SLAVE) + s->join_timeout = vlib_time_now (vm) + 2.0; + + + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "stream %d resend join request", + }; + ELOG (mcm->elog_main, e, s->index); + } + } + } + } + + vlib_process_suspend (vm, .5); + } + + return 0; /* not likely */ +} + +static void serialize_mc_register_stream_name (serialize_main_t * m, va_list * va) +{ + char * name = va_arg (*va, char *); + serialize_cstring (m, name); +} + +static void elog_stream_name (char * buf, int n_buf_bytes, char * v) +{ + memcpy (buf, v, clib_min (n_buf_bytes - 1, vec_len (v))); + buf[n_buf_bytes - 1] = 0; +} + +static void unserialize_mc_register_stream_name (serialize_main_t * m, va_list * va) +{ + mc_main_t * mcm = va_arg (*va, mc_main_t *); + char * name; + mc_stream_t * s; + uword * p; + + unserialize_cstring (m, &name); + + if ((p = hash_get_mem (mcm->stream_index_by_name, name))) + { + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "stream index %d already named %s", + .format_args = "i4s16", + }; + struct { u32 stream_index; char name[16]; } * ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->stream_index = p[0]; + elog_stream_name (ed->name, sizeof (ed->name), name); + } + + vec_free (name); + return; + } + + vec_add2 (mcm->stream_vector, s, 1); + mc_stream_init (s); + s->state = MC_STREAM_STATE_name_known; + s->index = s - mcm->stream_vector; + s->config.name = name; + + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "stream index %d named %s", + .format_args = "i4s16", + }; + struct { u32 stream_index; char name[16]; } * ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->stream_index = s->index; + elog_stream_name (ed->name, sizeof (ed->name), name); + } + + hash_set_mem (mcm->stream_index_by_name, name, s->index); + + p = hash_get 
(mcm->procs_waiting_for_stream_name_by_name, name); + if (p) + { + vlib_one_time_waiting_process_t * wp, ** w; + w = pool_elt_at_index (mcm->procs_waiting_for_stream_name_pool, p[0]); + vec_foreach (wp, w[0]) + vlib_signal_one_time_waiting_process (mcm->vlib_main, wp); + pool_put (mcm->procs_waiting_for_stream_name_pool, w); + hash_unset_mem (mcm->procs_waiting_for_stream_name_by_name, name); + } +} + +MC_SERIALIZE_MSG (mc_register_stream_name_msg, static) = { + .name = "mc_register_stream_name", + .serialize = serialize_mc_register_stream_name, + .unserialize = unserialize_mc_register_stream_name, +}; + +void +mc_rx_buffer_unserialize (mc_main_t * mcm, + mc_stream_t * stream, + mc_peer_id_t peer_id, + u32 buffer_index) +{ return mc_unserialize (mcm, stream, buffer_index); } + +static u8 * +mc_internal_catchup_snapshot (mc_main_t * mcm, + u8 * data_vector, + u32 last_global_sequence_processed) +{ + serialize_main_t m; + + /* Append serialized data to data vector. */ + serialize_open_vector (&m, data_vector); + m.stream.current_buffer_index = vec_len (data_vector); + + serialize (&m, serialize_mc_main, mcm); + return serialize_close_vector (&m); +} + +static void +mc_internal_catchup (mc_main_t * mcm, + u8 * data, + u32 n_data_bytes) +{ + serialize_main_t s; + + unserialize_open_data (&s, data, n_data_bytes); + + unserialize (&s, unserialize_mc_main, mcm); +} + +/* Overridden from the application layer, not actually used here */ +void mc_stream_join_process_hold (void) __attribute__ ((weak)); +void mc_stream_join_process_hold (void) { } + +static u32 +mc_stream_join_helper (mc_main_t * mcm, + mc_stream_config_t * config, + u32 is_internal) +{ + mc_stream_t * s; + vlib_main_t * vm = mcm->vlib_main; + + s = 0; + if (! is_internal) + { + uword * p; + + /* Already have a stream with given name? */ + if ((s = mc_stream_by_name (mcm, config->name))) + { + /* Already joined and ready? 
*/ + if (s->state == MC_STREAM_STATE_ready) + return s->index; + } + + /* First join MC internal stream. */ + if (! mcm->stream_vector + || (mcm->stream_vector[MC_STREAM_INDEX_INTERNAL].state + == MC_STREAM_STATE_invalid)) + { + static mc_stream_config_t c = { + .name = "mc-internal", + .rx_buffer = mc_rx_buffer_unserialize, + .catchup = mc_internal_catchup, + .catchup_snapshot = mc_internal_catchup_snapshot, + }; + + c.save_snapshot = config->save_snapshot; + + mc_stream_join_helper (mcm, &c, /* is_internal */ 1); + } + + /* If stream is still unknown register this name and wait for + sequenced message to name stream. This way all peers agree + on stream name to index mappings. */ + s = mc_stream_by_name (mcm, config->name); + if (! s) + { + vlib_one_time_waiting_process_t * wp, ** w; + u8 * name_copy = format (0, "%s", config->name); + + mc_serialize_stream (mcm, + MC_STREAM_INDEX_INTERNAL, + &mc_register_stream_name_msg, + config->name); + + /* Wait for this stream to be named. */ + p = hash_get_mem (mcm->procs_waiting_for_stream_name_by_name, name_copy); + if (p) + w = pool_elt_at_index (mcm->procs_waiting_for_stream_name_pool, p[0]); + else + { + pool_get (mcm->procs_waiting_for_stream_name_pool, w); + if (! mcm->procs_waiting_for_stream_name_by_name) + mcm->procs_waiting_for_stream_name_by_name + = hash_create_string (/* elts */ 0, /* value size */ sizeof (uword)); + hash_set_mem (mcm->procs_waiting_for_stream_name_by_name, + name_copy, + w - mcm->procs_waiting_for_stream_name_pool); + w[0] = 0; + } + + vec_add2 (w[0], wp, 1); + vlib_current_process_wait_for_one_time_event (vm, wp); + vec_free (name_copy); + } + + /* Name should be known now. */ + s = mc_stream_by_name (mcm, config->name); + ASSERT (s != 0); + ASSERT (s->state == MC_STREAM_STATE_name_known); + } + + if (! s) + { + vec_add2 (mcm->stream_vector, s, 1); + mc_stream_init (s); + s->index = s - mcm->stream_vector; + } + + { + /* Save name since we could have already used it as hash key. 
*/ + char * name_save = s->config.name; + + s->config = config[0]; + + if (name_save) + s->config.name = name_save; + } + + if (s->config.window_size == 0) + s->config.window_size = 8; + + if (s->config.retry_interval == 0.0) + s->config.retry_interval = 1.0; + + /* Sanity. */ + ASSERT (s->config.retry_interval < 30); + + if (s->config.retry_limit == 0) + s->config.retry_limit = 7; + + s->state = MC_STREAM_STATE_join_in_progress; + if (! s->peer_index_by_id.hash) + mhash_init (&s->peer_index_by_id, sizeof (uword), sizeof (mc_peer_id_t)); + + /* If we don't hear from someone in 5 seconds, we're alone */ + s->join_timeout = vlib_time_now (vm) + 5.0; + mcm->joins_in_progress++; + + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "stream index %d join request %s", + .format_args = "i4s16", + }; + struct { u32 stream_index; char name[16]; } * ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->stream_index = s->index; + elog_stream_name (ed->name, sizeof (ed->name), s->config.name); + } + + send_join_or_leave_request (mcm, s->index, 1 /* join */); + + vlib_current_process_wait_for_one_time_event_vector + (vm, &s->procs_waiting_for_join_done); + + if (MC_EVENT_LOGGING) + { + ELOG_TYPE (e, "join complete stream %d"); + ELOG (mcm->elog_main, e, s->index); + } + + return s->index; +} + +u32 mc_stream_join (mc_main_t * mcm, mc_stream_config_t * config) +{ return mc_stream_join_helper (mcm, config, /* is_internal */ 0); } + +void mc_stream_leave (mc_main_t * mcm, u32 stream_index) +{ + mc_stream_t * s = mc_stream_by_index (mcm, stream_index); + + if (! 
s) + return; + + if (MC_EVENT_LOGGING) + { + ELOG_TYPE_DECLARE (t) = { + .format = "leave-stream: %d", + .format_args = "i4", + }; + struct { u32 index; } * ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->index = stream_index; + } + + send_join_or_leave_request (mcm, stream_index, 0 /* is_join */); + mc_stream_free (s); + s->state = MC_STREAM_STATE_name_known; +} + +void mc_msg_join_or_leave_request_handler (mc_main_t * mcm, + mc_msg_join_or_leave_request_t * req, + u32 buffer_index) +{ + mc_stream_t * s; + mc_msg_join_reply_t * rep; + u32 bi; + + mc_byte_swap_msg_join_or_leave_request (req); + + s = mc_stream_by_index (mcm, req->stream_index); + if (! s || s->state != MC_STREAM_STATE_ready) + return; + + /* If the peer is joining, create it */ + if (req->is_join) + { + mc_stream_t * this_s; + + /* We're not in a position to catch up a peer until all + stream joins are complete. */ + if (0) + { + /* XXX This is hard to test so we've. */ + vec_foreach (this_s, mcm->stream_vector) + { + if (this_s->state != MC_STREAM_STATE_ready + && this_s->state != MC_STREAM_STATE_name_known) + return; + } + } + else + if (mcm->joins_in_progress > 0) + return; + + (void) get_or_create_peer_with_id (mcm, + s, + req->peer_id, + /* created */ 0); + + rep = mc_get_vlib_buffer (mcm->vlib_main, sizeof (rep[0]), &bi); + memset (rep, 0, sizeof (rep[0])); + rep->type = MC_MSG_TYPE_join_reply; + rep->stream_index = req->stream_index; + + mc_byte_swap_msg_join_reply (rep); + /* These two are already in network byte order... 
*/ + rep->peer_id = mcm->transport.our_ack_peer_id; + rep->catchup_peer_id = mcm->transport.our_catchup_peer_id; + + mcm->transport.tx_buffer (mcm->transport.opaque, MC_TRANSPORT_JOIN, bi); + } + else + { + if (s->config.peer_died) + s->config.peer_died (mcm, s, req->peer_id); + } +} + +void mc_msg_join_reply_handler (mc_main_t * mcm, + mc_msg_join_reply_t * mp, + u32 buffer_index) +{ + mc_stream_t * s; + + mc_byte_swap_msg_join_reply (mp); + + s = mc_stream_by_index (mcm, mp->stream_index); + + if (! s || s->state != MC_STREAM_STATE_join_in_progress) + return; + + /* Switch to catchup state; next join reply + for this stream will be ignored. */ + s->state = MC_STREAM_STATE_catchup; + + mcm->joins_in_progress--; + mcm->transport.catchup_request_fun (mcm->transport.opaque, + mp->stream_index, + mp->catchup_peer_id); +} + +void mc_wait_for_stream_ready (mc_main_t * m, char * stream_name) +{ + mc_stream_t * s; + + while (1) + { + s = mc_stream_by_name (m, stream_name); + if (s) + break; + vlib_process_suspend (m->vlib_main, .1); + } + + /* It's OK to send a message in catchup and ready states. */ + if (s->state == MC_STREAM_STATE_catchup + || s->state == MC_STREAM_STATE_ready) + return; + + /* Otherwise we are waiting for a join to finish. */ + vlib_current_process_wait_for_one_time_event_vector + (m->vlib_main, &s->procs_waiting_for_join_done); +} + +u32 mc_stream_send (mc_main_t * mcm, u32 stream_index, u32 buffer_index) +{ + mc_stream_t * s = mc_stream_by_index (mcm, stream_index); + vlib_main_t * vm = mcm->vlib_main; + mc_retry_t * r; + mc_msg_user_request_t * mp; + vlib_buffer_t * b = vlib_get_buffer (vm, buffer_index); + u32 ri; + + if (! 
s) + return 0; + + if (s->state != MC_STREAM_STATE_ready) + vlib_current_process_wait_for_one_time_event_vector + (vm, &s->procs_waiting_for_join_done); + + while (pool_elts (s->retry_pool) >= s->config.window_size) + { + vlib_current_process_wait_for_one_time_event_vector + (vm, &s->procs_waiting_for_open_window); + } + + pool_get (s->retry_pool, r); + ri = r - s->retry_pool; + + r->prev_index = s->retry_tail_index; + r->next_index = ~0; + s->retry_tail_index = ri; + + if (r->prev_index == ~0) + s->retry_head_index = ri; + else + { + mc_retry_t * p = pool_elt_at_index (s->retry_pool, r->prev_index); + p->next_index = ri; + } + + vlib_buffer_advance (b, -sizeof (mp[0])); + mp = vlib_buffer_get_current (b); + + mp->peer_id = mcm->transport.our_ack_peer_id; + /* mp->transport.global_sequence set by relay agent. */ + mp->global_sequence = 0xdeadbeef; + mp->stream_index = s->index; + mp->local_sequence = s->our_local_sequence++; + mp->n_data_bytes = vlib_buffer_index_length_in_chain (vm, buffer_index) - sizeof (mp[0]); + + r->buffer_index = buffer_index; + r->local_sequence = mp->local_sequence; + r->sent_at = vlib_time_now(vm); + r->n_retries = 0; + + /* Retry will be freed when all currently known peers have acked. 
*/ + vec_validate (r->unacked_by_peer_bitmap, vec_len (s->all_peer_bitmap) - 1); + vec_copy (r->unacked_by_peer_bitmap, s->all_peer_bitmap); + + hash_set (s->retry_index_by_local_sequence, r->local_sequence, r - s->retry_pool); + + elog_tx_msg (mcm, s->index, mp->local_sequence, r->n_retries); + + mc_byte_swap_msg_user_request (mp); + + mcm->transport.tx_buffer (mcm->transport.opaque, MC_TRANSPORT_USER_REQUEST_TO_RELAY, buffer_index); + + s->user_requests_sent++; + + /* return amount of window remaining */ + return s->config.window_size - pool_elts (s->retry_pool); +} + +void mc_msg_user_request_handler (mc_main_t * mcm, mc_msg_user_request_t * mp, u32 buffer_index) +{ + vlib_main_t * vm = mcm->vlib_main; + mc_stream_t * s; + mc_stream_peer_t * peer; + i32 seq_cmp_result; + static int once=0; + + mc_byte_swap_msg_user_request (mp); + + s = mc_stream_by_index (mcm, mp->stream_index); + + /* Not signed up for this stream? Turf-o-matic */ + if (! s || s->state != MC_STREAM_STATE_ready) + { + vlib_buffer_free_one (vm, buffer_index); + return; + } + + /* Find peer, including ourselves. 
*/ + peer = get_or_create_peer_with_id (mcm, + s, mp->peer_id, + /* created */ 0); + + seq_cmp_result = mc_seq_cmp (mp->local_sequence, + peer->last_sequence_received + 1); + + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "rx-msg: peer %s stream %d rx seq %d seq_cmp %d", + .format_args = "T4i4i4i4", + }; + struct { u32 peer, stream_index, rx_sequence; i32 seq_cmp_result; } * ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64); + ed->stream_index = mp->stream_index; + ed->rx_sequence = mp->local_sequence; + ed->seq_cmp_result = seq_cmp_result; + } + + if (0 && mp->stream_index == 1 && once == 0) + { + once = 1; + ELOG_TYPE (e, "FAKE lost msg on stream 1"); + ELOG (mcm->elog_main,e,0); + return; + } + + peer->last_sequence_received += seq_cmp_result == 0; + s->user_requests_received++; + + if (seq_cmp_result > 0) + peer->stats.n_msgs_from_future += 1; + + /* Send ack even if msg from future */ + if (1) + { + mc_msg_user_ack_t * rp; + u32 bi; + + rp = mc_get_vlib_buffer (vm, sizeof (rp[0]), &bi); + rp->peer_id = mcm->transport.our_ack_peer_id; + rp->stream_index = s->index; + rp->local_sequence = mp->local_sequence; + rp->seq_cmp_result = seq_cmp_result; + + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "tx-ack: stream %d local seq %d", + .format_args = "i4i4", + }; + struct { u32 stream_index; u32 local_sequence; } * ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->stream_index = rp->stream_index; + ed->local_sequence = rp->local_sequence; + } + + mc_byte_swap_msg_user_ack (rp); + + mcm->transport.tx_ack (mcm->transport.opaque, mp->peer_id, bi); + /* Msg from past? If so, free the buffer... 
*/ + if (seq_cmp_result < 0) + { + vlib_buffer_free_one (vm, buffer_index); + peer->stats.n_msgs_from_past += 1; + } + } + + if (seq_cmp_result == 0) + { + vlib_buffer_t * b = vlib_get_buffer (vm, buffer_index); + switch (s->state) + { + case MC_STREAM_STATE_ready: + vlib_buffer_advance (b, sizeof (mp[0])); + s->config.rx_buffer(mcm, s, mp->peer_id, buffer_index); + + /* Stream vector can change address via rx callback for mc-internal + stream. */ + s = mc_stream_by_index (mcm, mp->stream_index); + ASSERT (s != 0); + s->last_global_sequence_processed = mp->global_sequence; + break; + + case MC_STREAM_STATE_catchup: + clib_fifo_add1 (s->catchup_fifo, buffer_index); + break; + + default: + clib_warning ("stream in unknown state %U", + format_mc_stream_state, s->state); + break; + } + } +} + +void mc_msg_user_ack_handler (mc_main_t * mcm, mc_msg_user_ack_t * mp, u32 buffer_index) +{ + vlib_main_t * vm = mcm->vlib_main; + uword *p; + mc_stream_t * s; + mc_stream_peer_t * peer; + mc_retry_t * r; + int peer_created = 0; + + mc_byte_swap_msg_user_ack (mp); + + s = mc_stream_by_index (mcm, mp->stream_index); + + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (t) = { + .format = "rx-ack: local seq %d peer %s seq_cmp_result %d", + .format_args = "i4T4i4", + }; + struct { u32 local_sequence; u32 peer; i32 seq_cmp_result;} * ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->local_sequence = mp->local_sequence; + ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64); + ed->seq_cmp_result = mp->seq_cmp_result; + } + + /* Unknown stream? */ + if (! s) + return; + + /* Find the peer which just ack'ed. */ + peer = get_or_create_peer_with_id (mcm, s, mp->peer_id, + /* created */ &peer_created); + + /* + * Peer reports message from the future. If it's not in the retry + * fifo, look for a retired message. 
+ */ + if (mp->seq_cmp_result > 0) + { + p = hash_get (s->retry_index_by_local_sequence, mp->local_sequence - + mp->seq_cmp_result); + if (p == 0) + mc_resend_retired (mcm, s, mp->local_sequence - mp->seq_cmp_result); + + /* Normal retry should fix it... */ + return; + } + + /* + * Pointer to the indicated retry fifo entry. + * Worth hashing because we could use a window size of 100 or 1000. + */ + p = hash_get (s->retry_index_by_local_sequence, mp->local_sequence); + + /* + * Is this a duplicate ACK, received after we've retired the + * fifo entry. This can happen when learning about new + * peers. + */ + if (p == 0) + { + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (t) = + { + .format = "ack: for seq %d from peer %s no fifo elt", + .format_args = "i4T4", + }; + struct { u32 seq; u32 peer; } * ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->seq = mp->local_sequence; + ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64); + } + + return; + } + + r = pool_elt_at_index (s->retry_pool, p[0]); + + /* Make sure that this new peer ACKs our msgs from now on */ + if (peer_created) + { + mc_retry_t *later_retry = next_retry (s, r); + + while (later_retry) + { + later_retry->unacked_by_peer_bitmap = + clib_bitmap_ori (later_retry->unacked_by_peer_bitmap, + peer - s->peers); + later_retry = next_retry (s, later_retry); + } + } + + ASSERT (mp->local_sequence == r->local_sequence); + + /* If we weren't expecting to hear from this peer */ + if (!peer_created && + ! clib_bitmap_get (r->unacked_by_peer_bitmap, peer - s->peers)) + { + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (t) = + { + .format = "dup-ack: for seq %d from peer %s", + .format_args = "i4T4", + }; + struct { u32 seq; u32 peer; } * ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->seq = r->local_sequence; + ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64); + } + if (! 
clib_bitmap_is_zero (r->unacked_by_peer_bitmap)) + return; + } + + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (t) = + { + .format = "ack: for seq %d from peer %s", + .format_args = "i4T4", + }; + struct { u32 seq; u32 peer; } * ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->seq = mp->local_sequence; + ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64); + } + + r->unacked_by_peer_bitmap = + clib_bitmap_andnoti (r->unacked_by_peer_bitmap, peer - s->peers); + + /* Not all clients have ack'ed */ + if (! clib_bitmap_is_zero (r->unacked_by_peer_bitmap)) + { + return; + } + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (t) = + { + .format = "ack: retire fifo elt loc seq %d after %d acks", + .format_args = "i4i4", + }; + struct { u32 seq; u32 npeers; } * ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->seq = r->local_sequence; + ed->npeers = pool_elts (s->peers); + } + + hash_unset (s->retry_index_by_local_sequence, mp->local_sequence); + mc_retry_free (mcm, s, r); + remove_retry_from_pool (s, r); + maybe_send_window_open_event (vm, s); +} + +#define EVENT_MC_SEND_CATCHUP_DATA 0 + +static uword +mc_catchup_process (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * f) +{ + mc_main_t * mcm = mc_node_get_main (node); + uword *event_data = 0; + mc_catchup_process_arg_t * args; + int i; + + while (1) + { + if (event_data) + _vec_len(event_data) = 0; + vlib_process_wait_for_event_with_type (vm, &event_data, EVENT_MC_SEND_CATCHUP_DATA); + + for (i = 0; i < vec_len(event_data); i++) + { + args = pool_elt_at_index (mcm->catchup_process_args, + event_data[i]); + + mcm->transport.catchup_send_fun (mcm->transport.opaque, + args->catchup_opaque, + args->catchup_snapshot); + + /* Send function will free snapshot data vector. 
*/ + pool_put (mcm->catchup_process_args, args); + } + } + + return 0; /* not likely */ +} + +static void serialize_mc_stream (serialize_main_t * m, va_list * va) +{ + mc_stream_t * s = va_arg (*va, mc_stream_t *); + mc_stream_peer_t * p; + + serialize_integer (m, pool_elts (s->peers), sizeof (u32)); + pool_foreach (p, s->peers, ({ + u8 * x = serialize_get (m, sizeof (p->id)); + memcpy (x, p->id.as_u8, sizeof (p->id)); + serialize_integer (m, p->last_sequence_received, + sizeof (p->last_sequence_received)); + })); + serialize_bitmap (m, s->all_peer_bitmap); +} + +void unserialize_mc_stream (serialize_main_t * m, va_list * va) +{ + mc_stream_t * s = va_arg (*va, mc_stream_t *); + u32 i, n_peers; + mc_stream_peer_t * p; + + unserialize_integer (m, &n_peers, sizeof (u32)); + mhash_init (&s->peer_index_by_id, sizeof (uword), sizeof (mc_peer_id_t)); + for (i = 0; i < n_peers; i++) + { + u8 * x; + pool_get (s->peers, p); + x = unserialize_get (m, sizeof (p->id)); + memcpy (p->id.as_u8, x, sizeof (p->id)); + unserialize_integer (m, &p->last_sequence_received, sizeof (p->last_sequence_received)); + mhash_set (&s->peer_index_by_id, &p->id, p - s->peers, /* old_value */ 0); + } + s->all_peer_bitmap = unserialize_bitmap (m); + + /* This is really bad. */ + if (!s->all_peer_bitmap) + clib_warning ("BUG: stream %s all_peer_bitmap NULL", s->config.name); +} + +void mc_msg_catchup_request_handler (mc_main_t * mcm, mc_msg_catchup_request_t * req, u32 catchup_opaque) +{ + vlib_main_t * vm = mcm->vlib_main; + mc_stream_t * s; + mc_catchup_process_arg_t * args; + + mc_byte_swap_msg_catchup_request (req); + + s = mc_stream_by_index (mcm, req->stream_index); + if (! 
s || s->state != MC_STREAM_STATE_ready) + return; + + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (t) = + { + .format = "catchup-request: from %s stream %d", + .format_args = "T4i4", + }; + struct { u32 peer, stream; } * ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->peer = elog_id_for_peer_id (mcm, req->peer_id.as_u64); + ed->stream = req->stream_index; + } + + /* + * The application has to snapshoot its data structures right + * here, right now. If we process any messages after + * noting the last global sequence we've processed, the client + * won't be able to accurately reconstruct our data structures. + * + * Once the data structures are e.g. vec_dup()'ed, we + * send the resulting messages from a separate process, to + * make sure that we don't cause a bunch of message retransmissions + */ + pool_get (mcm->catchup_process_args, args); + + args->stream_index = s - mcm->stream_vector; + args->catchup_opaque = catchup_opaque; + args->catchup_snapshot = 0; + + /* Construct catchup reply and snapshot state for stream to send as + catchup reply payload. */ + { + mc_msg_catchup_reply_t * rep; + serialize_main_t m; + + vec_resize (args->catchup_snapshot, sizeof (rep[0])); + + rep = (void *) args->catchup_snapshot; + + rep->peer_id = req->peer_id; + rep->stream_index = req->stream_index; + rep->last_global_sequence_included = s->last_global_sequence_processed; + + /* Setup for serialize to append to catchup snapshot. 
*/ + serialize_open_vector (&m, args->catchup_snapshot); + m.stream.current_buffer_index = vec_len (m.stream.buffer); + + serialize (&m, serialize_mc_stream, s); + + args->catchup_snapshot = serialize_close_vector (&m); + + /* Actually copy internal state */ + args->catchup_snapshot = s->config.catchup_snapshot + (mcm, + args->catchup_snapshot, + rep->last_global_sequence_included); + + rep = (void *) args->catchup_snapshot; + rep->n_data_bytes = vec_len (args->catchup_snapshot) - sizeof (rep[0]); + + mc_byte_swap_msg_catchup_reply (rep); + } + + /* now go send it... */ + vlib_process_signal_event (vm, mcm->catchup_process, + EVENT_MC_SEND_CATCHUP_DATA, + args - mcm->catchup_process_args); +} + +#define EVENT_MC_UNSERIALIZE_BUFFER 0 +#define EVENT_MC_UNSERIALIZE_CATCHUP 1 + +void mc_msg_catchup_reply_handler (mc_main_t * mcm, mc_msg_catchup_reply_t * mp, u32 catchup_opaque) +{ + vlib_process_signal_event (mcm->vlib_main, + mcm->unserialize_process, + EVENT_MC_UNSERIALIZE_CATCHUP, + pointer_to_uword (mp)); +} + +static void perform_catchup (mc_main_t * mcm, mc_msg_catchup_reply_t * mp) +{ + mc_stream_t * s; + i32 seq_cmp_result; + + mc_byte_swap_msg_catchup_reply (mp); + + s = mc_stream_by_index (mcm, mp->stream_index); + + /* Never heard of this stream or already caught up. */ + if (! s || s->state == MC_STREAM_STATE_ready) + return; + + { + serialize_main_t m; + mc_stream_peer_t * p; + u32 n_stream_bytes; + + /* For offline sim replay: save the entire catchup snapshot... 
*/ + if (s->config.save_snapshot) + s->config.save_snapshot (mcm, /* is_catchup */ 1, mp->data, mp->n_data_bytes); + + unserialize_open_data (&m, mp->data, mp->n_data_bytes); + unserialize (&m, unserialize_mc_stream, s); + + /* Make sure we start numbering our messages as expected */ + pool_foreach (p, s->peers, ({ + if (p->id.as_u64 == mcm->transport.our_ack_peer_id.as_u64) + s->our_local_sequence = p->last_sequence_received + 1; + })); + + n_stream_bytes = m.stream.current_buffer_index; + + /* No need to unserialize close; nothing to free. */ + + /* After serialized stream is user's catchup data. */ + s->config.catchup (mcm, mp->data + n_stream_bytes, + mp->n_data_bytes - n_stream_bytes); + } + + /* Vector could have been moved by catchup. + This can only happen for mc-internal stream. */ + s = mc_stream_by_index (mcm, mp->stream_index); + + s->last_global_sequence_processed = mp->last_global_sequence_included; + + while (clib_fifo_elts (s->catchup_fifo)) + { + mc_msg_user_request_t * gp; + u32 bi; + vlib_buffer_t * b; + + clib_fifo_sub1(s->catchup_fifo, bi); + + b = vlib_get_buffer (mcm->vlib_main, bi); + gp = vlib_buffer_get_current (b); + + /* Make sure we're replaying "new" news */ + seq_cmp_result = mc_seq_cmp (gp->global_sequence, + mp->last_global_sequence_included); + + if (seq_cmp_result > 0) + { + vlib_buffer_advance (b, sizeof (gp[0])); + s->config.rx_buffer (mcm, s, gp->peer_id, bi); + s->last_global_sequence_processed = gp->global_sequence; + + if (MC_EVENT_LOGGING) + { + ELOG_TYPE_DECLARE (t) = { + .format = "catchup replay local sequence 0x%x", + .format_args = "i4", + }; + struct { u32 local_sequence; } * ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->local_sequence = gp->local_sequence; + } + } + else + { + if (MC_EVENT_LOGGING) + { + ELOG_TYPE_DECLARE (t) = { + .format = "catchup discard local sequence 0x%x", + .format_args = "i4", + }; + struct { u32 local_sequence; } * ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->local_sequence = 
gp->local_sequence; + } + + vlib_buffer_free_one (mcm->vlib_main, bi); + } + } + + s->state = MC_STREAM_STATE_ready; + + /* Now that we are caught up wake up joining process. */ + { + vlib_one_time_waiting_process_t * wp; + vec_foreach (wp, s->procs_waiting_for_join_done) + vlib_signal_one_time_waiting_process (mcm->vlib_main, wp); + if (s->procs_waiting_for_join_done) + _vec_len (s->procs_waiting_for_join_done) = 0; + } +} + +static void this_node_maybe_master (mc_main_t * mcm) +{ + vlib_main_t * vm = mcm->vlib_main; + mc_msg_master_assert_t * mp; + uword event_type; + int timeouts = 0; + int is_master = mcm->relay_state == MC_RELAY_STATE_MASTER; + clib_error_t * error; + f64 now, time_last_master_assert = -1; + u32 bi; + + while (1) + { + if (! mcm->we_can_be_relay_master) + { + mcm->relay_state = MC_RELAY_STATE_SLAVE; + if (MC_EVENT_LOGGING) + { + ELOG_TYPE (e, "become slave (config)"); + ELOG (mcm->elog_main, e, 0); + } + return; + } + + now = vlib_time_now (vm); + if (now >= time_last_master_assert + 1) + { + time_last_master_assert = now; + mp = mc_get_vlib_buffer (mcm->vlib_main, sizeof (mp[0]), &bi); + + mp->peer_id = mcm->transport.our_ack_peer_id; + mp->global_sequence = mcm->relay_global_sequence; + + /* + * these messages clog the event log, set MC_EVENT_LOGGING higher + * if you want them + */ + if (MC_EVENT_LOGGING > 1) + { + ELOG_TYPE_DECLARE (e) = { + .format = "tx-massert: peer %s global seq %u", + .format_args = "T4i4", + }; + struct { u32 peer, global_sequence; } * ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64); + ed->global_sequence = mp->global_sequence; + } + + mc_byte_swap_msg_master_assert (mp); + + error = mcm->transport.tx_buffer (mcm->transport.opaque, MC_TRANSPORT_MASTERSHIP, bi); + if (error) + clib_error_report (error); + } + + vlib_process_wait_for_event_or_clock (vm, 1.0); + event_type = vlib_process_get_events (vm, /* no event data */ 0); + + switch (event_type) + { + case ~0: + 
if (! is_master && timeouts++ > 2) + { + mcm->relay_state = MC_RELAY_STATE_MASTER; + mcm->relay_master_peer_id = mcm->transport.our_ack_peer_id.as_u64; + if (MC_EVENT_LOGGING) + { + ELOG_TYPE (e, "become master (was maybe_master)"); + ELOG (mcm->elog_main, e, 0); + } + return; + } + break; + + case MC_RELAY_STATE_SLAVE: + mcm->relay_state = MC_RELAY_STATE_SLAVE; + if (MC_EVENT_LOGGING && mcm->relay_state != MC_RELAY_STATE_SLAVE) + { + ELOG_TYPE (e, "become slave (was maybe_master)"); + ELOG (mcm->elog_main, e, 0); + } + return; + } + } +} + +static void this_node_slave (mc_main_t * mcm) +{ + vlib_main_t * vm = mcm->vlib_main; + uword event_type; + int timeouts = 0; + + if (MC_EVENT_LOGGING) + { + ELOG_TYPE (e, "become slave"); + ELOG (mcm->elog_main, e, 0); + } + + while (1) + { + vlib_process_wait_for_event_or_clock (vm, 1.0); + event_type = vlib_process_get_events (vm, /* no event data */ 0); + + switch (event_type) + { + case ~0: + if (timeouts++ > 2) + { + mcm->relay_state = MC_RELAY_STATE_NEGOTIATE; + mcm->relay_master_peer_id = ~0ULL; + if (MC_EVENT_LOGGING) + { + ELOG_TYPE (e, "timeouts; negoitate mastership"); + ELOG (mcm->elog_main, e, 0); + } + return; + } + break; + + case MC_RELAY_STATE_SLAVE: + mcm->relay_state = MC_RELAY_STATE_SLAVE; + timeouts = 0; + break; + } + } +} + +static uword +mc_mastership_process (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * f) +{ + mc_main_t * mcm = mc_node_get_main (node); + + while (1) + { + switch (mcm->relay_state) + { + case MC_RELAY_STATE_NEGOTIATE: + case MC_RELAY_STATE_MASTER: + this_node_maybe_master(mcm); + break; + + case MC_RELAY_STATE_SLAVE: + this_node_slave (mcm); + break; + } + } + return 0; /* not likely */ +} + +void mc_enable_disable_mastership (mc_main_t * mcm, int we_can_be_master) +{ + if (we_can_be_master != mcm->we_can_be_relay_master) + { + mcm->we_can_be_relay_master = we_can_be_master; + vlib_process_signal_event (mcm->vlib_main, + mcm->mastership_process, + 
MC_RELAY_STATE_NEGOTIATE, 0); + } +} + +void mc_msg_master_assert_handler (mc_main_t * mcm, mc_msg_master_assert_t * mp, u32 buffer_index) +{ + mc_peer_id_t his_peer_id, our_peer_id; + i32 seq_cmp_result; + u8 signal_slave = 0; + u8 update_global_sequence = 0; + + mc_byte_swap_msg_master_assert (mp); + + his_peer_id = mp->peer_id; + our_peer_id = mcm->transport.our_ack_peer_id; + + /* compare the incoming global sequence with ours */ + seq_cmp_result = mc_seq_cmp (mp->global_sequence, + mcm->relay_global_sequence); + + /* If the sender has a lower peer id and the sender's sequence >= + our global sequence, we become a slave. Otherwise we are master. */ + if (mc_peer_id_compare (his_peer_id, our_peer_id) < 0 && seq_cmp_result >= 0) + { + vlib_process_signal_event (mcm->vlib_main, + mcm->mastership_process, + MC_RELAY_STATE_SLAVE, 0); + signal_slave = 1; + } + + /* Update our global sequence. */ + if (seq_cmp_result > 0) + { + mcm->relay_global_sequence = mp->global_sequence; + update_global_sequence = 1; + } + + { + uword * q = mhash_get (&mcm->mastership_peer_index_by_id, &his_peer_id); + mc_mastership_peer_t * p; + + if (q) + p = vec_elt_at_index (mcm->mastership_peers, q[0]); + else + { + vec_add2 (mcm->mastership_peers, p, 1); + p->peer_id = his_peer_id; + mhash_set (&mcm->mastership_peer_index_by_id, &p->peer_id, p - mcm->mastership_peers, + /* old_value */ 0); + } + p->time_last_master_assert_received = vlib_time_now (mcm->vlib_main); + } + + /* + * these messages clog the event log, set MC_EVENT_LOGGING higher + * if you want them. 
+ */ + if (MC_EVENT_LOGGING > 1) + { + ELOG_TYPE_DECLARE (e) = { + .format = "rx-massert: peer %s global seq %u upd %d slave %d", + .format_args = "T4i4i1i1", + }; + struct { + u32 peer; + u32 global_sequence; + u8 update_sequence; + u8 slave; + } * ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->peer = elog_id_for_peer_id (mcm, his_peer_id.as_u64); + ed->global_sequence = mp->global_sequence; + ed->update_sequence = update_global_sequence; + ed->slave = signal_slave; + } +} + +static void +mc_serialize_init (mc_main_t * mcm) +{ + mc_serialize_msg_t * m; + vlib_main_t * vm = vlib_get_main(); + + mcm->global_msg_index_by_name + = hash_create_string (/* elts */ 0, sizeof (uword)); + + m = vm->mc_msg_registrations; + + while (m) + { + m->global_index = vec_len (mcm->global_msgs); + hash_set_mem (mcm->global_msg_index_by_name, + m->name, + m->global_index); + vec_add1 (mcm->global_msgs, m); + m = m->next_registration; + } +} + +clib_error_t * +mc_serialize_va (mc_main_t * mc, + u32 stream_index, + u32 multiple_messages_per_vlib_buffer, + mc_serialize_msg_t * msg, + va_list * va) +{ + mc_stream_t * s; + clib_error_t * error; + serialize_main_t * m = &mc->serialize_mains[VLIB_TX]; + vlib_serialize_buffer_main_t * sbm = &mc->serialize_buffer_mains[VLIB_TX]; + u32 bi, n_before, n_after, n_total, n_this_msg; + u32 si, gi; + + if (! sbm->vlib_main) + { + sbm->tx.max_n_data_bytes_per_chain = 4096; + sbm->tx.free_list_index = VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX; + } + + if (sbm->first_buffer == 0) + serialize_open_vlib_buffer (m, mc->vlib_main, sbm); + + n_before = serialize_vlib_buffer_n_bytes (m); + + s = mc_stream_by_index (mc, stream_index); + gi = msg->global_index; + ASSERT (msg == vec_elt (mc->global_msgs, gi)); + + si = ~0; + if (gi < vec_len (s->stream_msg_index_by_global_index)) + si = s->stream_msg_index_by_global_index[gi]; + + serialize_likely_small_unsigned_integer (m, si); + + /* For first time message is sent, use name to identify message. 
*/ + if (si == ~0 || MSG_ID_DEBUG) + serialize_cstring (m, msg->name); + + if (MSG_ID_DEBUG && MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "serialize-msg: %s index %d", + .format_args = "T4i4", + }; + struct { u32 c[2]; } * ed; + ed = ELOG_DATA (mc->elog_main, e); + ed->c[0] = elog_id_for_msg_name (mc, msg->name); + ed->c[1] = si; + } + + error = va_serialize (m, va); + + n_after = serialize_vlib_buffer_n_bytes (m); + n_this_msg = n_after - n_before; + n_total = n_after + sizeof (mc_msg_user_request_t); + + /* For max message size ignore first message where string name is sent. */ + if (si != ~0) + msg->max_n_bytes_serialized = clib_max (msg->max_n_bytes_serialized, n_this_msg); + + if (! multiple_messages_per_vlib_buffer + || si == ~0 + || n_total + msg->max_n_bytes_serialized > mc->transport.max_packet_size) + { + bi = serialize_close_vlib_buffer (m); + sbm->first_buffer = 0; + if (! error) + mc_stream_send (mc, stream_index, bi); + else if (bi != ~0) + vlib_buffer_free_one (mc->vlib_main, bi); + } + + return error; +} + +clib_error_t * +mc_serialize_internal (mc_main_t * mc, + u32 stream_index, + u32 multiple_messages_per_vlib_buffer, + mc_serialize_msg_t * msg, + ...) +{ + vlib_main_t * vm = mc->vlib_main; + va_list va; + clib_error_t * error; + + if (stream_index == ~0) + { + if (vm->mc_main && vm->mc_stream_index == ~0) + vlib_current_process_wait_for_one_time_event_vector + (vm, &vm->procs_waiting_for_mc_stream_join); + stream_index = vm->mc_stream_index; + } + + va_start (va, msg); + error = mc_serialize_va (mc, stream_index, + multiple_messages_per_vlib_buffer, + msg, &va); + va_end (va); + return error; +} + +uword mc_unserialize_message (mc_main_t * mcm, + mc_stream_t * s, + serialize_main_t * m) +{ + mc_serialize_stream_msg_t * sm; + u32 gi, si; + + si = unserialize_likely_small_unsigned_integer (m); + + if (! 
(si == ~0 || MSG_ID_DEBUG)) + { + sm = vec_elt_at_index (s->stream_msgs, si); + gi = sm->global_index; + } + else + { + char * name; + + unserialize_cstring (m, &name); + + if (MSG_ID_DEBUG && MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "unserialize-msg: %s rx index %d", + .format_args = "T4i4", + }; + struct { u32 c[2]; } * ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->c[0] = elog_id_for_msg_name (mcm, name); + ed->c[1] = si; + } + + { + uword * p = hash_get_mem (mcm->global_msg_index_by_name, name); + gi = p ? p[0] : ~0; + } + + /* Unknown message? */ + if (gi == ~0) + { + vec_free (name); + goto done; + } + + vec_validate_init_empty (s->stream_msg_index_by_global_index, gi, ~0); + si = s->stream_msg_index_by_global_index[gi]; + + /* Stream local index unknown? Create it. */ + if (si == ~0) + { + vec_add2 (s->stream_msgs, sm, 1); + + si = sm - s->stream_msgs; + sm->global_index = gi; + s->stream_msg_index_by_global_index[gi] = si; + + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "msg-bind: stream %d %s to index %d", + .format_args = "i4T4i4", + }; + struct { u32 c[3]; } * ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->c[0] = s->index; + ed->c[1] = elog_id_for_msg_name (mcm, name); + ed->c[2] = si; + } + } + else + { + sm = vec_elt_at_index (s->stream_msgs, si); + if (gi != sm->global_index && MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "msg-id-ERROR: %s index %d expected %d", + .format_args = "T4i4i4", + }; + struct { u32 c[3]; } * ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->c[0] = elog_id_for_msg_name (mcm, name); + ed->c[1] = si; + ed->c[2] = ~0; + if (sm->global_index < vec_len (s->stream_msg_index_by_global_index)) + ed->c[2] = s->stream_msg_index_by_global_index[sm->global_index]; + } + } + + vec_free (name); + } + + if (gi != ~0) + { + mc_serialize_msg_t * msg; + msg = vec_elt (mcm->global_msgs, gi); + unserialize (m, msg->unserialize, mcm); + } + + done: + return gi != ~0; +} + 
+void +mc_unserialize_internal (mc_main_t * mcm, u32 stream_and_buffer_index) +{ + vlib_main_t * vm = mcm->vlib_main; + serialize_main_t * m = &mcm->serialize_mains[VLIB_RX]; + vlib_serialize_buffer_main_t * sbm = &mcm->serialize_buffer_mains[VLIB_RX]; + mc_stream_and_buffer_t * sb; + mc_stream_t * stream; + u32 buffer_index; + + sb = pool_elt_at_index (mcm->mc_unserialize_stream_and_buffers, stream_and_buffer_index); + buffer_index = sb->buffer_index; + stream = vec_elt_at_index (mcm->stream_vector, sb->stream_index); + pool_put (mcm->mc_unserialize_stream_and_buffers, sb); + + if (stream->config.save_snapshot) + { + u32 n_bytes = vlib_buffer_index_length_in_chain (vm, buffer_index); + static u8 * contents; + vec_reset_length (contents); + vec_validate (contents, n_bytes - 1); + vlib_buffer_contents (vm, buffer_index, contents); + stream->config.save_snapshot (mcm, /* is_catchup */ 0, contents, n_bytes); + } + + ASSERT (vlib_in_process_context (vm)); + + unserialize_open_vlib_buffer (m, vm, sbm); + + clib_fifo_add1 (sbm->rx.buffer_fifo, buffer_index); + + while (unserialize_vlib_buffer_n_bytes (m) > 0) + mc_unserialize_message (mcm, stream, m); + + /* Frees buffer. 
*/ + unserialize_close_vlib_buffer (m); +} + +void +mc_unserialize (mc_main_t * mcm, mc_stream_t * s, u32 buffer_index) +{ + vlib_main_t * vm = mcm->vlib_main; + mc_stream_and_buffer_t * sb; + pool_get (mcm->mc_unserialize_stream_and_buffers, sb); + sb->stream_index = s->index; + sb->buffer_index = buffer_index; + vlib_process_signal_event (vm, mcm->unserialize_process, + EVENT_MC_UNSERIALIZE_BUFFER, sb - mcm->mc_unserialize_stream_and_buffers); +} + +static uword +mc_unserialize_process (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * f) +{ + mc_main_t * mcm = mc_node_get_main (node); + uword event_type, * event_data = 0; + int i; + + while (1) + { + if (event_data) + _vec_len(event_data) = 0; + + vlib_process_wait_for_event (vm); + event_type = vlib_process_get_events (vm, &event_data); + switch (event_type) + { + case EVENT_MC_UNSERIALIZE_BUFFER: + for (i = 0; i < vec_len (event_data); i++) + mc_unserialize_internal (mcm, event_data[i]); + break; + + case EVENT_MC_UNSERIALIZE_CATCHUP: + for (i = 0; i < vec_len (event_data); i++) + { + u8 * mp = uword_to_pointer (event_data[i], u8 *); + perform_catchup (mcm, (void *) mp); + vec_free (mp); + } + break; + + default: + break; + } + } + + return 0; /* not likely */ +} + +void serialize_mc_main (serialize_main_t * m, va_list * va) +{ + mc_main_t * mcm = va_arg (*va, mc_main_t *); + mc_stream_t * s; + mc_serialize_stream_msg_t * sm; + mc_serialize_msg_t * msg; + + serialize_integer (m, vec_len (mcm->stream_vector), sizeof (u32)); + vec_foreach (s, mcm->stream_vector) + { + /* Stream name. */ + serialize_cstring (m, s->config.name); + + /* Serialize global names for all sent messages. 
*/ + serialize_integer (m, vec_len (s->stream_msgs), sizeof (u32)); + vec_foreach (sm, s->stream_msgs) + { + msg = vec_elt (mcm->global_msgs, sm->global_index); + serialize_cstring (m, msg->name); + } + } +} + +void unserialize_mc_main (serialize_main_t * m, va_list * va) +{ + mc_main_t * mcm = va_arg (*va, mc_main_t *); + u32 i, n_streams, n_stream_msgs; + char * name; + mc_stream_t * s; + mc_serialize_stream_msg_t * sm; + + unserialize_integer (m, &n_streams, sizeof (u32)); + for (i = 0; i < n_streams; i++) + { + unserialize_cstring (m, &name); + if (i != MC_STREAM_INDEX_INTERNAL + && ! mc_stream_by_name (mcm, name)) + { + vec_validate (mcm->stream_vector, i); + s = vec_elt_at_index (mcm->stream_vector, i); + mc_stream_init (s); + s->index = s - mcm->stream_vector; + s->config.name = name; + s->state = MC_STREAM_STATE_name_known; + hash_set_mem (mcm->stream_index_by_name, s->config.name, s->index); + } + else + vec_free (name); + + s = vec_elt_at_index (mcm->stream_vector, i); + + vec_free (s->stream_msgs); + vec_free (s->stream_msg_index_by_global_index); + + unserialize_integer (m, &n_stream_msgs, sizeof (u32)); + vec_resize (s->stream_msgs, n_stream_msgs); + vec_foreach (sm, s->stream_msgs) + { + uword * p; + u32 si, gi; + + unserialize_cstring (m, &name); + p = hash_get (mcm->global_msg_index_by_name, name); + gi = p ? 
p[0] : ~0; + si = sm - s->stream_msgs; + + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "catchup-bind: %s to %d global index %d stream %d", + .format_args = "T4i4i4i4", + }; + struct { u32 c[4]; } * ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->c[0] = elog_id_for_msg_name (mcm, name); + ed->c[1] = si; + ed->c[2] = gi; + ed->c[3] = s->index; + } + + vec_free (name); + + sm->global_index = gi; + if (gi != ~0) + { + vec_validate_init_empty (s->stream_msg_index_by_global_index, + gi, ~0); + s->stream_msg_index_by_global_index[gi] = si; + } + } + } +} + +void mc_main_init (mc_main_t * mcm, char * tag) +{ + vlib_main_t * vm = vlib_get_main(); + + mcm->vlib_main = vm; + mcm->elog_main = &vm->elog_main; + + mcm->relay_master_peer_id = ~0ULL; + mcm->relay_state = MC_RELAY_STATE_NEGOTIATE; + + mcm->stream_index_by_name + = hash_create_string (/* elts */ 0, /* value size */ sizeof (uword)); + + { + vlib_node_registration_t r; + + memset (&r, 0, sizeof (r)); + + r.type = VLIB_NODE_TYPE_PROCESS; + + /* Point runtime data to main instance. 
*/ + r.runtime_data = &mcm; + r.runtime_data_bytes = sizeof (&mcm); + + r.name = (char *) format (0, "mc-mastership-%s", tag); + r.function = mc_mastership_process; + mcm->mastership_process = vlib_register_node (vm, &r); + + r.name = (char *) format (0, "mc-join-ager-%s", tag); + r.function = mc_join_ager_process; + mcm->join_ager_process = vlib_register_node (vm, &r); + + r.name = (char *) format (0, "mc-retry-%s", tag); + r.function = mc_retry_process; + mcm->retry_process = vlib_register_node (vm, &r); + + r.name = (char *) format (0, "mc-catchup-%s", tag); + r.function = mc_catchup_process; + mcm->catchup_process = vlib_register_node (vm, &r); + + r.name = (char *) format (0, "mc-unserialize-%s", tag); + r.function = mc_unserialize_process; + mcm->unserialize_process = vlib_register_node (vm, &r); + } + + if (MC_EVENT_LOGGING > 0) + mhash_init (&mcm->elog_id_by_peer_id, sizeof (uword), sizeof (mc_peer_id_t)); + + mhash_init (&mcm->mastership_peer_index_by_id, sizeof (uword), sizeof (mc_peer_id_t)); + mc_serialize_init (mcm); +} + +static u8 * format_mc_relay_state (u8 * s, va_list * args) +{ + mc_relay_state_t state = va_arg (*args, mc_relay_state_t); + char * t = 0; + switch (state) + { + case MC_RELAY_STATE_NEGOTIATE: + t = "negotiate"; + break; + case MC_RELAY_STATE_MASTER: + t = "master"; + break; + case MC_RELAY_STATE_SLAVE: + t = "slave"; + break; + default: + return format (s, "unknown 0x%x", state); + } + + return format (s, "%s", t); +} + +static u8 * format_mc_stream_state (u8 * s, va_list * args) +{ + mc_stream_state_t state = va_arg (*args, mc_stream_state_t); + char * t = 0; + switch (state) + { +#define _(f) case MC_STREAM_STATE_##f: t = #f; break; + foreach_mc_stream_state +#undef _ + default: + return format (s, "unknown 0x%x", state); + } + + return format (s, "%s", t); +} + +u8 * format_mc_main (u8 * s, va_list * args) +{ + mc_main_t * mcm = va_arg (*args, mc_main_t *); + mc_stream_t * t; + mc_stream_peer_t * p, * ps; + uword indent = 
format_get_indent (s); + + s = format (s, "MC state %U, %d streams joined, global sequence 0x%x", + format_mc_relay_state, mcm->relay_state, + vec_len (mcm->stream_vector), + mcm->relay_global_sequence); + + { + mc_mastership_peer_t * mp; + f64 now = vlib_time_now (mcm->vlib_main); + s = format (s, "\n%UMost recent mastership peers:", + format_white_space, indent + 2); + vec_foreach (mp, mcm->mastership_peers) + { + s = format (s, "\n%U%-30U%.4e", + format_white_space, indent + 4, + mcm->transport.format_peer_id, mp->peer_id, + now - mp->time_last_master_assert_received); + } + } + + vec_foreach (t, mcm->stream_vector) + { + s = format (s, "\n%Ustream `%s' index %d", + format_white_space, indent + 2, + t->config.name, t->index); + + s = format (s, "\n%Ustate %U", + format_white_space, indent + 4, + format_mc_stream_state, t->state); + + s = format (s, "\n%Uretries: interval %.0f sec, limit %d, pool elts %d, %Ld sent", + format_white_space, indent + 4, t->config.retry_interval, + t->config.retry_limit, + pool_elts (t->retry_pool), + t->stats.n_retries - t->stats_last_clear.n_retries); + + s = format (s, "\n%U%Ld/%Ld user requests sent/received", + format_white_space, indent + 4, + t->user_requests_sent, t->user_requests_received); + + s = format (s, "\n%U%d peers, local/global sequence 0x%x/0x%x", + format_white_space, indent + 4, + pool_elts (t->peers), + t->our_local_sequence, + t->last_global_sequence_processed); + + ps = 0; + pool_foreach (p, t->peers, + ({ + if (clib_bitmap_get (t->all_peer_bitmap, p - t->peers)) + vec_add1 (ps, p[0]); + })); + vec_sort (ps, p1, p2, mc_peer_id_compare (p1->id, p2->id)); + s = format (s, "\n%U%=30s%10s%16s%16s", + format_white_space, indent + 6, + "Peer", "Last seq", "Retries", "Future"); + + vec_foreach (p, ps) + { + s = format (s, "\n%U%-30U0x%08x%16Ld%16Ld%s", + format_white_space, indent + 6, + mcm->transport.format_peer_id, p->id.as_u64, + p->last_sequence_received, + p->stats.n_msgs_from_past - 
p->stats_last_clear.n_msgs_from_past, + p->stats.n_msgs_from_future - p->stats_last_clear.n_msgs_from_future, + (mcm->transport.our_ack_peer_id.as_u64 == p->id.as_u64 + ? " (self)" : "")); + } + vec_free (ps); + } + + return s; +} diff --git a/vlib/vlib/mc.h b/vlib/vlib/mc.h new file mode 100644 index 00000000000..55dce2822c6 --- /dev/null +++ b/vlib/vlib/mc.h @@ -0,0 +1,674 @@ +/* + * mc.h: vlib reliable sequenced multicast distributed applications + * + * Copyright (c) 2010 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_vlib_mc_h +#define included_vlib_mc_h + +#include <vppinfra/elog.h> +#include <vppinfra/fifo.h> +#include <vppinfra/mhash.h> +#include <vlib/node.h> + +#ifndef MC_EVENT_LOGGING +#define MC_EVENT_LOGGING 1 +#endif + +always_inline uword +mc_need_byte_swap (void) +{ return CLIB_ARCH_IS_LITTLE_ENDIAN; } + +/* + * Used to uniquely identify hosts. + * For IP4 this would be ip4_address plus tcp/udp port. + */ +typedef union { + u8 as_u8[8]; + u64 as_u64; +} mc_peer_id_t; + +always_inline mc_peer_id_t +mc_byte_swap_peer_id (mc_peer_id_t i) +{ + /* Peer id is already in network byte order. */ + return i; +} + +always_inline int +mc_peer_id_compare (mc_peer_id_t a, mc_peer_id_t b) +{ + return memcmp (a.as_u8, b.as_u8, sizeof (a.as_u8)); +} + +/* Assert mastership. Lowest peer_id amount all peers wins mastership. + Only sent/received over mastership channel (MC_TRANSPORT_MASTERSHIP). 
+ So, we don't need a message opcode. */ +typedef CLIB_PACKED (struct { + /* Peer id asserting mastership. */ + mc_peer_id_t peer_id; + + /* Global sequence number asserted. */ + u32 global_sequence; +}) mc_msg_master_assert_t; + +always_inline void +mc_byte_swap_msg_master_assert (mc_msg_master_assert_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->global_sequence = clib_byte_swap_u32 (r->global_sequence); + } +} + +#define foreach_mc_msg_type \ + _ (master_assert) \ + _ (join_or_leave_request) \ + _ (join_reply) \ + _ (user_request) \ + _ (user_ack) \ + _ (catchup_request) \ + _ (catchup_reply) + +typedef enum { +#define _(f) MC_MSG_TYPE_##f, + foreach_mc_msg_type +#undef _ +} mc_relay_msg_type_t; + +/* Request to join a given stream. Multicast over MC_TRANSPORT_JOIN. */ +typedef CLIB_PACKED (struct { + mc_peer_id_t peer_id; + + mc_relay_msg_type_t type : 32; /* MC_MSG_TYPE_join_or_leave_request */ + + /* Stream to join or leave. */ + u32 stream_index; + + /* join = 1, leave = 0 */ + u8 is_join; +}) mc_msg_join_or_leave_request_t; + +always_inline void +mc_byte_swap_msg_join_or_leave_request (mc_msg_join_or_leave_request_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->type = clib_byte_swap_u32 (r->type); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + } +} + +/* Join reply. Multicast over MC_TRANSPORT_JOIN. */ +typedef CLIB_PACKED (struct { + mc_peer_id_t peer_id; + + mc_relay_msg_type_t type : 32; /* MC_MSG_TYPE_join_reply */ + + u32 stream_index; + + /* Peer ID to contact to catchup with this stream. 
*/ + mc_peer_id_t catchup_peer_id; +}) mc_msg_join_reply_t; + +always_inline void +mc_byte_swap_msg_join_reply (mc_msg_join_reply_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->type = clib_byte_swap_u32 (r->type); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + r->catchup_peer_id = mc_byte_swap_peer_id (r->catchup_peer_id); + } +} + +/* Generic (application) request. Multicast over MC_TRANSPORT_USER_REQUEST_TO_RELAY and then + relayed by relay master after filling in global sequence number. */ +typedef CLIB_PACKED (struct { + mc_peer_id_t peer_id; + + u32 stream_index; + + /* Global sequence number as filled in by relay master. */ + u32 global_sequence; + + /* Local sequence number as filled in by peer sending message. */ + u32 local_sequence; + + /* Size of request data. */ + u32 n_data_bytes; + + /* Opaque request data. */ + u8 data[0]; +}) mc_msg_user_request_t; + +always_inline void +mc_byte_swap_msg_user_request (mc_msg_user_request_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + r->global_sequence = clib_byte_swap_u32 (r->global_sequence); + r->local_sequence = clib_byte_swap_u32 (r->local_sequence); + r->n_data_bytes = clib_byte_swap_u32 (r->n_data_bytes); + } +} + +/* Sent unicast over ACK channel. 
*/ +typedef CLIB_PACKED (struct { + mc_peer_id_t peer_id; + u32 global_sequence; + u32 stream_index; + u32 local_sequence; + i32 seq_cmp_result; +}) mc_msg_user_ack_t; + +always_inline void +mc_byte_swap_msg_user_ack (mc_msg_user_ack_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + r->global_sequence = clib_byte_swap_u32 (r->global_sequence); + r->local_sequence = clib_byte_swap_u32 (r->local_sequence); + r->seq_cmp_result = clib_byte_swap_i32 (r->seq_cmp_result); + } +} + +/* Sent/received unicast over catchup channel (e.g. using TCP). */ +typedef CLIB_PACKED (struct { + mc_peer_id_t peer_id; + u32 stream_index; +}) mc_msg_catchup_request_t; + +always_inline void +mc_byte_swap_msg_catchup_request (mc_msg_catchup_request_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + } +} + +/* Sent/received unicast over catchup channel. */ +typedef CLIB_PACKED (struct { + mc_peer_id_t peer_id; + + u32 stream_index; + + /* Last global sequence number included in catchup data. */ + u32 last_global_sequence_included; + + /* Size of catchup data. */ + u32 n_data_bytes; + + /* Catchup data. */ + u8 data[0]; +}) mc_msg_catchup_reply_t; + +always_inline void +mc_byte_swap_msg_catchup_reply (mc_msg_catchup_reply_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + r->last_global_sequence_included = clib_byte_swap_u32 (r->last_global_sequence_included); + r->n_data_bytes = clib_byte_swap_u32 (r->n_data_bytes); + } +} + +typedef struct _mc_serialize_msg { + /* Name for this type. */ + char * name; + + /* Functions to serialize/unserialize data. */ + serialize_function_t * serialize; + serialize_function_t * unserialize; + + /* Maximum message size in bytes when serialized. 
+ If zero then this will be set to the largest sent message. */ + u32 max_n_bytes_serialized; + + /* Opaque to use for first argument to serialize/unserialize function. */ + u32 opaque; + + /* Index in global message vector. */ + u32 global_index; + + /* Registration list */ + struct _mc_serialize_msg * next_registration; +} mc_serialize_msg_t; + +typedef struct { + /* Index into global message vector. */ + u32 global_index; +} mc_serialize_stream_msg_t; + +#define MC_SERIALIZE_MSG(x,...) \ + __VA_ARGS__ mc_serialize_msg_t x; \ +static void __mc_serialize_msg_registration_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __mc_serialize_msg_registration_##x (void) \ +{ \ + vlib_main_t * vm = vlib_get_main(); \ + x.next_registration = vm->mc_msg_registrations; \ + vm->mc_msg_registrations = &x; \ +} \ +__VA_ARGS__ mc_serialize_msg_t x + +typedef enum { + MC_TRANSPORT_MASTERSHIP, + MC_TRANSPORT_JOIN, + MC_TRANSPORT_USER_REQUEST_TO_RELAY, + MC_TRANSPORT_USER_REQUEST_FROM_RELAY, + MC_N_TRANSPORT_TYPE, +} mc_transport_type_t; + +typedef struct { + clib_error_t * (* tx_buffer) (void * opaque, mc_transport_type_t type, u32 buffer_index); + + clib_error_t * (* tx_ack) (void * opaque, mc_peer_id_t peer_id, u32 buffer_index); + + /* Returns catchup opaque. */ + uword (* catchup_request_fun) (void * opaque, u32 stream_index, mc_peer_id_t catchup_peer_id); + + void (* catchup_send_fun) (void * opaque, uword catchup_opaque, u8 * data_vector); + + /* Opaque passed to callbacks. */ + void * opaque; + + mc_peer_id_t our_ack_peer_id; + mc_peer_id_t our_catchup_peer_id; + + /* Max packet size (MTU) for this transport. + For IP this is interface MTU less IP + UDP header size. */ + u32 max_packet_size; + + format_function_t * format_peer_id; +} mc_transport_t; + +typedef struct { + /* Count of messages received from this peer from the past/future + (with seq_cmp != 0). 
*/ + u64 n_msgs_from_past; + u64 n_msgs_from_future; +} mc_stream_peer_stats_t; + +typedef struct { + /* ID of this peer. */ + mc_peer_id_t id; + + /* The last sequence we received from this peer. */ + u32 last_sequence_received; + + mc_stream_peer_stats_t stats, stats_last_clear; +} mc_stream_peer_t; + +typedef struct { + u32 buffer_index; + + /* Cached copy of local sequence number from buffer. */ + u32 local_sequence; + + /* Number of times this buffer has been sent (retried). */ + u32 n_retries; + + /* Previous/next retries in doubly-linked list. */ + u32 prev_index, next_index; + + /* Bitmap of all peers which have acked this msg */ + uword * unacked_by_peer_bitmap; + + /* Message send or resend time */ + f64 sent_at; +} mc_retry_t; + +typedef struct { + /* Number of retries sent for this stream. */ + u64 n_retries; +} mc_stream_stats_t; + +struct mc_main_t; +struct mc_stream_t; + +typedef struct { + /* Stream name. */ + char * name; + + /* Number of outstanding messages. */ + u32 window_size; + + /* Retry interval, in seconds */ + f64 retry_interval; + + /* Retry limit */ + u32 retry_limit; + + /* User rx buffer callback */ + void (* rx_buffer) (struct mc_main_t * mc_main, + struct mc_stream_t * stream, + mc_peer_id_t peer_id, + u32 buffer_index); + + /* User callback to create a snapshot */ + u8 * (* catchup_snapshot) (struct mc_main_t *mc_main, + u8 * snapshot_vector, + u32 last_global_sequence_included); + + /* User callback to replay a snapshot */ + void (* catchup) (struct mc_main_t *mc_main, + u8 * snapshot_data, + u32 n_snapshot_data_bytes); + + /* Callback to save a snapshot for offline replay */ + void (* save_snapshot) (struct mc_main_t *mc_main, + u32 is_catchup, + u8 * snapshot_data, + u32 n_snapshot_data_bytes); + + /* Called when a peer dies */ + void (* peer_died) (struct mc_main_t * mc_main, + struct mc_stream_t * stream, + mc_peer_id_t peer_id); +} mc_stream_config_t; + +#define foreach_mc_stream_state \ + _ (invalid) \ + _ (name_known) \ + _ 
(join_in_progress) \ + _ (catchup) \ + _ (ready) + +typedef enum { +#define _(f) MC_STREAM_STATE_##f, + foreach_mc_stream_state +#undef _ +} mc_stream_state_t; + +typedef struct mc_stream_t { + mc_stream_config_t config; + + mc_stream_state_t state; + + /* Index in stream pool. */ + u32 index; + + /* Stream index 0 is always for MC internal use. */ +#define MC_STREAM_INDEX_INTERNAL 0 + + mc_retry_t * retry_pool; + + /* Head and tail index of retry pool. */ + u32 retry_head_index, retry_tail_index; + + /* + * Country club for recently retired messages + * If the set of peers is expanding and a new peer + * misses a message, we can easily retire the FIFO + * element before we even know about the new peer + */ + mc_retry_t * retired_fifo; + + /* Hash mapping local sequence to retry pool index. */ + uword * retry_index_by_local_sequence; + + /* catch-up fifo of VLIB buffer indices. + start recording when catching up. */ + u32 * catchup_fifo; + + mc_stream_stats_t stats, stats_last_clear; + + /* Peer pool. */ + mc_stream_peer_t * peers; + + /* Bitmap with ones for all peers in peer pool. */ + uword * all_peer_bitmap; + + /* Map of 64 bit id to index in stream pool. */ + mhash_t peer_index_by_id; + + /* Timeout, in case we're alone in the world */ + f64 join_timeout; + + vlib_one_time_waiting_process_t * procs_waiting_for_join_done; + + vlib_one_time_waiting_process_t * procs_waiting_for_open_window; + + /* Next sequence number to use */ + u32 our_local_sequence; + + /* + * Last global sequence we processed. + * When supplying catchup data, we need to tell + * the client precisely where to start replaying + */ + u32 last_global_sequence_processed; + + /* Vector of unique messages we've sent on this stream. */ + mc_serialize_stream_msg_t * stream_msgs; + + /* Vector global message index into per stream message index. */ + u32 * stream_msg_index_by_global_index; + + /* Hashed by message name. 
*/ + uword * stream_msg_index_by_name; + + u64 user_requests_sent; + u64 user_requests_received; +} mc_stream_t; + +always_inline void +mc_stream_free (mc_stream_t * s) +{ + pool_free (s->retry_pool); + hash_free (s->retry_index_by_local_sequence); + clib_fifo_free (s->catchup_fifo); + pool_free (s->peers); + mhash_free (&s->peer_index_by_id); + vec_free (s->procs_waiting_for_join_done); + vec_free (s->procs_waiting_for_open_window); +} + +always_inline void +mc_stream_init (mc_stream_t * s) +{ + memset (s, 0, sizeof (s[0])); + s->retry_head_index = s->retry_tail_index = ~0; +} + +typedef struct { + u32 stream_index; + u32 catchup_opaque; + u8 *catchup_snapshot; +} mc_catchup_process_arg_t; + +typedef enum { + MC_RELAY_STATE_NEGOTIATE, + MC_RELAY_STATE_MASTER, + MC_RELAY_STATE_SLAVE, +} mc_relay_state_t; + +typedef struct { + mc_peer_id_t peer_id; + + f64 time_last_master_assert_received; +} mc_mastership_peer_t; + +typedef struct { + u32 stream_index; + u32 buffer_index; +} mc_stream_and_buffer_t; + +typedef struct mc_main_t { + mc_relay_state_t relay_state; + + /* Mastership */ + u32 we_can_be_relay_master; + + u64 relay_master_peer_id; + + mc_mastership_peer_t * mastership_peers; + + /* Map of 64 bit id to index in stream pool. */ + mhash_t mastership_peer_index_by_id; + + /* The transport we're using. */ + mc_transport_t transport; + + /* Last-used global sequence number. */ + u32 relay_global_sequence; + + /* Vector of streams. */ + mc_stream_t * stream_vector; + + /* Hash table mapping stream name to pool index. */ + uword * stream_index_by_name; + + uword * procs_waiting_for_stream_name_by_name; + + vlib_one_time_waiting_process_t ** procs_waiting_for_stream_name_pool; + + int joins_in_progress; + + mc_catchup_process_arg_t * catchup_process_args; + + /* Node indices for mastership, join ager, + retry and catchup processes. 
*/ + u32 mastership_process; + u32 join_ager_process; + u32 retry_process; + u32 catchup_process; + u32 unserialize_process; + + /* Global vector of messages. */ + mc_serialize_msg_t ** global_msgs; + + /* Hash table mapping message name to index. */ + uword * global_msg_index_by_name; + + /* Shared serialize/unserialize main. */ + serialize_main_t serialize_mains[VLIB_N_RX_TX]; + + vlib_serialize_buffer_main_t serialize_buffer_mains[VLIB_N_RX_TX]; + + /* Convenience variables */ + struct vlib_main_t * vlib_main; + elog_main_t * elog_main; + + /* Maps 64 bit peer id to elog string table offset for this formatted peer id. */ + mhash_t elog_id_by_peer_id; + + uword *elog_id_by_msg_name; + + /* For mc_unserialize. */ + mc_stream_and_buffer_t * mc_unserialize_stream_and_buffers; +} mc_main_t; + +always_inline mc_stream_t * +mc_stream_by_name (mc_main_t * m, char * name) +{ + uword * p = hash_get (m->stream_index_by_name, name); + return p ? vec_elt_at_index (m->stream_vector, p[0]) : 0; +} + +always_inline mc_stream_t * +mc_stream_by_index (mc_main_t * m, u32 i) +{ + return i < vec_len (m->stream_vector) ? m->stream_vector + i : 0; +} + +always_inline void +mc_clear_stream_stats (mc_main_t * m) +{ + mc_stream_t * s; + mc_stream_peer_t * p; + vec_foreach (s, m->stream_vector) + { + s->stats_last_clear = s->stats; + pool_foreach (p, s->peers, ({ + p->stats_last_clear = p->stats; + })); + } +} + +/* Declare all message handlers. 
*/ +#define _(f) void mc_msg_##f##_handler (mc_main_t * mcm, mc_msg_##f##_t * msg, u32 buffer_index); +foreach_mc_msg_type +#undef _ + +u32 mc_stream_join (mc_main_t * mcm, mc_stream_config_t *); + +void mc_stream_leave (mc_main_t * mcm, u32 stream_index); + +void mc_wait_for_stream_ready (mc_main_t * m, char * stream_name); + +u32 mc_stream_send (mc_main_t * mcm, u32 stream_index, u32 buffer_index); + +void mc_main_init (mc_main_t * mcm, char * tag); + +void mc_enable_disable_mastership (mc_main_t * mcm, int we_can_be_master); + +void * mc_get_vlib_buffer (struct vlib_main_t * vm, u32 n_bytes, u32 * bi_return); + +format_function_t format_mc_main; + +clib_error_t * +mc_serialize_internal (mc_main_t * mc, + u32 stream_index, + u32 multiple_messages_per_vlib_buffer, + mc_serialize_msg_t * msg, + ...); + +clib_error_t * +mc_serialize_va (mc_main_t * mc, + u32 stream_index, + u32 multiple_messages_per_vlib_buffer, + mc_serialize_msg_t * msg, + va_list * va); + +#define mc_serialize_stream(mc,si,msg,args...) \ + mc_serialize_internal((mc),(si),(0),(msg),(msg)->serialize,args) + +#define mc_serialize(mc,msg,args...) \ + mc_serialize_internal((mc),(~0),(0),(msg),(msg)->serialize,args) + +#define mc_serialize2(mc,add,msg,args...) 
\ + mc_serialize_internal((mc),(~0),(add),(msg),(msg)->serialize,args) + +void mc_unserialize (mc_main_t * mcm, mc_stream_t * s, u32 buffer_index); +uword mc_unserialize_message (mc_main_t * mcm, mc_stream_t * s, + serialize_main_t * m); + +serialize_function_t serialize_mc_main, unserialize_mc_main; + +always_inline uword +mc_max_message_size_in_bytes (mc_main_t * mcm) +{ return mcm->transport.max_packet_size - sizeof (mc_msg_user_request_t); } + +always_inline word +mc_serialize_n_bytes_left (mc_main_t * mcm, serialize_main_t * m) +{ return mc_max_message_size_in_bytes (mcm) - serialize_vlib_buffer_n_bytes (m); } + +void unserialize_mc_stream (serialize_main_t * m, va_list * va); +void mc_stream_join_process_hold (void); + +#endif /* included_vlib_mc_h */ diff --git a/vlib/vlib/node.c b/vlib/vlib/node.c new file mode 100644 index 00000000000..4fb117e4f3e --- /dev/null +++ b/vlib/vlib/node.c @@ -0,0 +1,566 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * node.c: VLIB processing nodes + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> +#include <vlib/threads.h> + +/* Query node given name. */ +vlib_node_t * vlib_get_node_by_name (vlib_main_t * vm, u8 * name) +{ + vlib_node_main_t * nm = &vm->node_main; + uword * p; + u8 * key = name; + if (! clib_mem_is_heap_object (key)) + key = format (0, "%s", key); + p = hash_get (nm->node_by_name, key); + if (key != name) + vec_free (key); + return p ? 
vec_elt (nm->nodes, p[0]) : 0; +} + +static void node_set_elog_name (vlib_main_t * vm, uword node_index) +{ + vlib_node_t * n = vlib_get_node (vm, node_index); + elog_event_type_t * t; + + t = vec_elt_at_index (vm->node_call_elog_event_types, node_index); + vec_free (t->format); + t->format = (char *) format (0, "%v (%%d)", n->name); + + t = vec_elt_at_index (vm->node_return_elog_event_types, node_index); + vec_free (t->format); + t->format = (char *) format (0, "%v () = %%d", n->name); + + n->name_elog_string = elog_string (&vm->elog_main, "%v", n->name); +} + +void vlib_node_rename (vlib_main_t * vm, u32 node_index, char * fmt, ...) +{ + va_list va; + vlib_node_main_t * nm = &vm->node_main; + vlib_node_t * n = vlib_get_node (vm, node_index); + + va_start (va, fmt); + hash_unset (nm->node_by_name, n->name); + vec_free (n->name); + n->name = va_format (0, fmt, &va); + va_end (va); + hash_set (nm->node_by_name, n->name, n->index); + + node_set_elog_name (vm, node_index); +} + +static void +vlib_node_runtime_update (vlib_main_t * vm, + u32 node_index, + u32 next_index) +{ + vlib_node_main_t * nm = &vm->node_main; + vlib_node_runtime_t * r, * s; + vlib_node_t * node, * next_node; + vlib_next_frame_t * nf; + vlib_pending_frame_t * pf; + i32 i, j, n_insert; + + ASSERT(os_get_cpu_number() == 0); + + vlib_worker_thread_barrier_sync(vm); + + node = vec_elt (nm->nodes, node_index); + r = vlib_node_get_runtime (vm, node_index); + + n_insert = vec_len (node->next_nodes) - r->n_next_nodes; + if (n_insert > 0) + { + i = r->next_frame_index + r->n_next_nodes; + vec_insert (nm->next_frames, n_insert, i); + + /* Initialize newly inserted next frames. */ + for (j = 0; j < n_insert; j++) + vlib_next_frame_init (nm->next_frames + i + j); + + /* Relocate other next frames at higher indices. 
*/ + for (j = 0; j < vec_len (nm->nodes); j++) + { + s = vlib_node_get_runtime (vm, j); + if (j != node_index + && s->next_frame_index >= i) + s->next_frame_index += n_insert; + } + + /* Pending frames may need to be relocated also. */ + vec_foreach (pf, nm->pending_frames) + { + if (pf->next_frame_index != VLIB_PENDING_FRAME_NO_NEXT_FRAME + && pf->next_frame_index >= i) + pf->next_frame_index += n_insert; + } + pool_foreach (pf, nm->suspended_process_frames, ({ + if (pf->next_frame_index != ~0 && pf->next_frame_index >= i) + pf->next_frame_index += n_insert; + })); + + r->n_next_nodes = vec_len (node->next_nodes); + } + + /* Set frame's node runtime index. */ + next_node = vlib_get_node (vm, node->next_nodes[next_index]); + nf = nm->next_frames + r->next_frame_index + next_index; + nf->node_runtime_index = next_node->runtime_index; + + vlib_worker_thread_node_runtime_update(); + + vlib_worker_thread_barrier_release(vm); +} + +/* Add next node to given node in given slot. */ +uword +vlib_node_add_next_with_slot (vlib_main_t * vm, + uword node_index, + uword next_node_index, + uword slot) +{ + vlib_node_main_t * nm = &vm->node_main; + vlib_node_t * node, * next; + uword * p; + + node = vec_elt (nm->nodes, node_index); + next = vec_elt (nm->nodes, next_node_index); + + /* Fill in static next nodes if runtime has yet to be initialized. */ + if (slot == ~0 && ! (nm->flags & VLIB_NODE_MAIN_RUNTIME_STARTED)) + { + uword i; + for (i = 0; i < vec_len (node->next_node_names); i++) + { + char * a = node->next_node_names[i]; + if (a) + vlib_node_add_named_next_with_slot (vm, node->index, a, i); + } + } + + if ((p = hash_get (node->next_slot_by_node, next_node_index))) + { + /* Next already exists: slot must match. 
*/ + if (slot != ~0) + ASSERT (slot == p[0]); + return p[0]; + } + + if (slot == ~0) + slot = vec_len (node->next_nodes); + + vec_validate_init_empty (node->next_nodes, slot, ~0); + vec_validate (node->n_vectors_by_next_node, slot); + + node->next_nodes[slot] = next_node_index; + hash_set (node->next_slot_by_node, next_node_index, slot); + + vlib_node_runtime_update (vm, node_index, slot); + + next->prev_node_bitmap = clib_bitmap_ori (next->prev_node_bitmap, + node_index); + + /* Siblings all get same node structure. */ + { + uword sib_node_index, sib_slot; + vlib_node_t * sib_node; + clib_bitmap_foreach (sib_node_index, node->sibling_bitmap, ({ + sib_node = vec_elt (nm->nodes, sib_node_index); + if (sib_node != node) + { + sib_slot = vlib_node_add_next_with_slot (vm, sib_node_index, next_node_index, slot); + ASSERT (sib_slot == slot); + } + })); + } + + return slot; +} + +/* Add named next node to given node in given slot. */ +uword +vlib_node_add_named_next_with_slot (vlib_main_t * vm, + uword node, + char * name, + uword slot) +{ + vlib_node_main_t * nm; + vlib_node_t * n, * n_next; + + nm = &vm->node_main; + n = vlib_get_node (vm, node); + + n_next = vlib_get_node_by_name (vm, (u8 *) name); + if (! n_next) + { + if (nm->flags & VLIB_NODE_MAIN_RUNTIME_STARTED) + return ~0; + + if (slot == ~0) + slot = clib_max (vec_len (n->next_node_names), + vec_len (n->next_nodes)); + vec_validate (n->next_node_names, slot); + n->next_node_names[slot] = name; + return slot; + } + + return vlib_node_add_next_with_slot (vm, node, n_next->index, slot); +} + +static void node_elog_init (vlib_main_t * vm, uword ni) +{ + elog_event_type_t t; + + memset (&t, 0, sizeof (t)); + + /* 2 event types for this node: one when node function is called. + One when it returns. 
*/ + vec_validate (vm->node_call_elog_event_types, ni); + vm->node_call_elog_event_types[ni] = t; + + vec_validate (vm->node_return_elog_event_types, ni); + vm->node_return_elog_event_types[ni] = t; + + node_set_elog_name (vm, ni); +} + +#ifdef CLIB_UNIX +#define STACK_ALIGN 4096 +#else +#define STACK_ALIGN CLIB_CACHE_LINE_BYTES +#endif + +static void register_node (vlib_main_t * vm, + vlib_node_registration_t * r) +{ + vlib_node_main_t * nm = &vm->node_main; + vlib_node_t * n; + int i; + + if (CLIB_DEBUG > 0) + { + /* Default (0) type should match INTERNAL. */ + vlib_node_t zero = {0}; + ASSERT (VLIB_NODE_TYPE_INTERNAL == zero.type); + } + + ASSERT (r->function != 0); + + n = clib_mem_alloc_no_fail (sizeof (n[0])); + memset (n, 0, sizeof (n[0])); + n->index = vec_len (nm->nodes); + + vec_add1 (nm->nodes, n); + + /* Name is always a vector so it can be formatted with %v. */ + if (clib_mem_is_heap_object (vec_header (r->name, 0))) + n->name = vec_dup ((u8 *) r->name); + else + n->name = format (0, "%s", r->name); + + if (! nm->node_by_name) + nm->node_by_name = hash_create_vec (/* size */ 32, + sizeof (n->name[0]), + sizeof (uword)); + + /* Node names must be unique. */ + { + vlib_node_t * o = vlib_get_node_by_name (vm, n->name); + if (o) + clib_error ("more than one node named `%v'", n->name); + } + + hash_set (nm->node_by_name, n->name, n->index); + + r->index = n->index; /* save index in registration */ + n->function = r->function; + + /* Node index of next sibling will be filled in by vlib_node_main_init. */ + n->sibling_of = r->sibling_of; + + if (r->type == VLIB_NODE_TYPE_INTERNAL) + ASSERT (r->vector_size > 0); + +#define _(f) n->f = r->f + + _ (type); + _ (flags); + _ (state); + _ (scalar_size); + _ (vector_size); + _ (format_buffer); + _ (unformat_buffer); + _ (format_trace); + _ (validate_frame); + + /* Register error counters. 
*/ + vlib_register_errors (vm, n->index, r->n_errors, r->error_strings); + node_elog_init (vm, n->index); + + _ (runtime_data_bytes); + if (r->runtime_data_bytes > 0) + { + vec_resize (n->runtime_data, r->runtime_data_bytes); + if (r->runtime_data) + memcpy (n->runtime_data, r->runtime_data, r->runtime_data_bytes); + } + + vec_resize (n->next_node_names, r->n_next_nodes); + for (i = 0; i < r->n_next_nodes; i++) + n->next_node_names[i] = r->next_nodes[i]; + + vec_validate_init_empty (n->next_nodes, r->n_next_nodes - 1, ~0); + vec_validate (n->n_vectors_by_next_node, r->n_next_nodes - 1); + + n->owner_node_index = n->owner_next_index = ~0; + + /* Initialize node runtime. */ + { + vlib_node_runtime_t * rt; + u32 i; + + if (n->type == VLIB_NODE_TYPE_PROCESS) + { + vlib_process_t * p; + uword log2_n_stack_bytes; + + log2_n_stack_bytes = clib_max (r->process_log2_n_stack_bytes, 15); + + p = clib_mem_alloc_aligned_no_fail + (sizeof (p[0]) + (1 << log2_n_stack_bytes), + STACK_ALIGN); + + memset (p, 0, sizeof (p[0])); + p->log2_n_stack_bytes = log2_n_stack_bytes; + + /* Process node's runtime index is really index into process + pointer vector. */ + n->runtime_index = vec_len (nm->processes); + + vec_add1 (nm->processes, p); + + /* Paint first stack word with magic number so we can at least + detect process stack overruns. */ + p->stack[0] = VLIB_PROCESS_STACK_MAGIC; + + /* Node runtime is stored inside of process. */ + rt = &p->node_runtime; + +#ifdef CLIB_UNIX + /* + * Disallow writes to the bottom page of the stack, to + * catch stack overflows. 
+ */ + if (mprotect (p->stack, 4096, PROT_READ) < 0) + clib_unix_warning ("process stack"); +#endif + + } + else + { + vec_add2_aligned (nm->nodes_by_type[n->type], rt, 1, + /* align */ CLIB_CACHE_LINE_BYTES); + n->runtime_index = rt - nm->nodes_by_type[n->type]; + } + + if (n->type == VLIB_NODE_TYPE_INPUT) + nm->input_node_counts_by_state[n->state] += 1; + + rt->function = n->function; + rt->flags = n->flags; + rt->state = n->state; + rt->node_index = n->index; + + rt->n_next_nodes = r->n_next_nodes; + rt->next_frame_index = vec_len (nm->next_frames); + + vec_resize (nm->next_frames, rt->n_next_nodes); + for (i = 0; i < rt->n_next_nodes; i++) + vlib_next_frame_init (nm->next_frames + rt->next_frame_index + i); + + vec_resize (rt->errors, r->n_errors); + for (i = 0; i < vec_len (rt->errors); i++) + rt->errors[i] = vlib_error_set (n->index, i); + + ASSERT (vec_len (n->runtime_data) <= sizeof (rt->runtime_data)); + if (vec_len (n->runtime_data) > 0) + memcpy (rt->runtime_data, n->runtime_data, vec_len (n->runtime_data)); + + vec_free (n->runtime_data); + } +} + +/* Register new packet processing node. */ +u32 vlib_register_node (vlib_main_t * vm, vlib_node_registration_t * r) +{ + register_node (vm, r); + return r->index; +} + +void vlib_register_all_static_nodes (vlib_main_t * vm) +{ + vlib_node_registration_t * r; + + r = vm->node_main.node_registrations; + while (r) { + register_node (vm, r); + r = r->next_registration; + } +} + +clib_error_t * +vlib_node_main_init (vlib_main_t * vm) +{ + vlib_node_main_t * nm = &vm->node_main; + clib_error_t * error = 0; + vlib_node_t * n; + uword ni; + + nm->flags |= VLIB_NODE_MAIN_RUNTIME_STARTED; + + /* Resolve next names into next indices. */ + for (ni = 0; ni < vec_len (nm->nodes); ni++) + { + uword i; + + n = vec_elt (nm->nodes, ni); + + for (i = 0; i < vec_len (n->next_node_names); i++) + { + char * a = n->next_node_names[i]; + + if (! 
a) + continue; + + if (~0 == vlib_node_add_named_next_with_slot (vm, n->index, a, i)) + { + error = clib_error_create + ("node `%v' refers to unknown node `%s'", n->name, a); + goto done; + } + } + + vec_free (n->next_node_names); + } + + /* Set previous node pointers. */ + for (ni = 0; ni < vec_len (nm->nodes); ni++) + { + vlib_node_t * n_next; + uword i; + + n = vec_elt (nm->nodes, ni); + + for (i = 0; i < vec_len (n->next_nodes); i++) + { + if (n->next_nodes[i] >= vec_len (nm->nodes)) + continue; + + n_next = vec_elt (nm->nodes, n->next_nodes[i]); + n_next->prev_node_bitmap = + clib_bitmap_ori (n_next->prev_node_bitmap, n->index); + } + } + + { + vlib_next_frame_t * nf; + vlib_node_runtime_t * r; + vlib_node_t * next; + uword i; + + vec_foreach (r, nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]) + { + if (r->n_next_nodes == 0) + continue; + + n = vlib_get_node (vm, r->node_index); + nf = vec_elt_at_index (nm->next_frames, r->next_frame_index); + + for (i = 0; i < vec_len (n->next_nodes); i++) + { + next = vlib_get_node (vm, n->next_nodes[i]); + + /* Validate node runtime indices are correctly initialized. */ + ASSERT (nf[i].node_runtime_index == next->runtime_index); + + nf[i].flags = 0; + if (next->flags & VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH) + nf[i].flags |= VLIB_FRAME_NO_FREE_AFTER_DISPATCH; + } + } + } + + /* Generate node sibling relationships. */ + { + vlib_node_t * n, * sib; + uword si; + + for (ni = 0; ni < vec_len (nm->nodes); ni++) + { + n = vec_elt (nm->nodes, ni); + + if (! n->sibling_of) + continue; + + sib = vlib_get_node_by_name (vm, (u8 *) n->sibling_of); + if (! sib) + clib_error ("sibling `%s' not found for node `%v'", n->sibling_of, n->name); + + clib_bitmap_foreach (si, sib->sibling_bitmap, ({ + vlib_node_t * m = vec_elt (nm->nodes, si); + + /* Connect all of sibling's siblings to us. */ + m->sibling_bitmap = clib_bitmap_ori (m->sibling_bitmap, n->index); + + /* Connect us to all of sibling's siblings. 
*/ + n->sibling_bitmap = clib_bitmap_ori (n->sibling_bitmap, si); + })); + + /* Connect sibling to us. */ + sib->sibling_bitmap = clib_bitmap_ori (sib->sibling_bitmap, n->index); + + /* Connect us to sibling. */ + n->sibling_bitmap = clib_bitmap_ori (n->sibling_bitmap, sib->index); + } + } + + done: + return error; +} diff --git a/vlib/vlib/node.h b/vlib/vlib/node.h new file mode 100644 index 00000000000..806a9dae1b5 --- /dev/null +++ b/vlib/vlib/node.h @@ -0,0 +1,621 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * node.h: VLIB processing nodes + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_node_h +#define included_vlib_node_h + +#include <vppinfra/longjmp.h> +#include <vppinfra/timing_wheel.h> +#include <vlib/trace.h> /* for vlib_trace_filter_t */ + +/* Forward declaration. */ +struct vlib_node_runtime_t; +struct vlib_frame_t; + +/* Internal nodes (including output nodes) move data from node to + node (or out of the graph for output nodes). */ +typedef uword (vlib_node_function_t) (struct vlib_main_t * vm, + struct vlib_node_runtime_t * node, + struct vlib_frame_t * frame); + +typedef enum { + /* An internal node on the call graph (could be output). */ + VLIB_NODE_TYPE_INTERNAL, + + /* Nodes which input data into the processing graph. + Input nodes are called for each iteration of main loop. */ + VLIB_NODE_TYPE_INPUT, + + /* Nodes to be called before all input nodes. + Used, for example, to clean out driver TX rings before + processing input. */ + VLIB_NODE_TYPE_PRE_INPUT, + + /* "Process" nodes which can be suspended and later resumed. */ + VLIB_NODE_TYPE_PROCESS, + + VLIB_N_NODE_TYPE, +} vlib_node_type_t; + +typedef struct _vlib_node_registration { + /* Vector processing function for this node. */ + vlib_node_function_t * function; + + /* Node name. */ + char * name; + + /* Name of sibling (if applicable). */ + char * sibling_of; + + /* Node index filled in by registration. */ + u32 index; + + /* Type of this node. */ + vlib_node_type_t type; + + /* Error strings indexed by error code for this node. 
*/ + char ** error_strings; + + /* Buffer format/unformat for this node. */ + format_function_t * format_buffer; + unformat_function_t * unformat_buffer; + + /* Trace format/unformat for this node. */ + format_function_t * format_trace; + unformat_function_t * unformat_trace; + + /* Function to validate incoming frames. */ + u8 * (* validate_frame) (struct vlib_main_t * vm, + struct vlib_node_runtime_t *, + struct vlib_frame_t * f); + + /* Per-node runtime data. */ + void * runtime_data; + + /* Process stack size. */ + u16 process_log2_n_stack_bytes; + + /* Number of bytes of per-node run time data. */ + u8 runtime_data_bytes; + + /* State for input nodes. */ + u8 state; + + /* Node flags. */ + u16 flags; + + /* Size of scalar and vector arguments in bytes. */ + u16 scalar_size, vector_size; + + /* Number of error codes used by this node. */ + u16 n_errors; + + /* Number of next node names that follow. */ + u16 n_next_nodes; + + /* Constructor link-list, don't ask... */ + struct _vlib_node_registration * next_registration; + + /* Names of next nodes which this node feeds into. */ + char * next_nodes[]; + +} vlib_node_registration_t; + +#define VLIB_REGISTER_NODE(x,...) \ + __VA_ARGS__ vlib_node_registration_t x; \ +static void __vlib_add_node_registration_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_add_node_registration_##x (void) \ +{ \ + vlib_main_t * vm = vlib_get_main(); \ + x.next_registration = vm->node_main.node_registrations; \ + vm->node_main.node_registrations = &x; \ +} \ +__VA_ARGS__ vlib_node_registration_t x + +always_inline vlib_node_registration_t * +vlib_node_next_registered (vlib_node_registration_t * c) +{ + c = clib_elf_section_data_next (c, c->n_next_nodes * sizeof (c->next_nodes[0])); + return c; +} + +typedef struct { + /* Total calls, clock ticks and vector elements processed for this node. 
*/ + u64 calls, vectors, clocks, suspends; + u64 max_clock; + u64 max_clock_n; +} vlib_node_stats_t; + +#define foreach_vlib_node_state \ + /* Input node is called each iteration of main loop. \ + This is the default (zero). */ \ + _ (POLLING) \ + /* Input node is called when device signals an interrupt. */ \ + _ (INTERRUPT) \ + /* Input node is never called. */ \ + _ (DISABLED) + +typedef enum { +#define _(f) VLIB_NODE_STATE_##f, + foreach_vlib_node_state +#undef _ + VLIB_N_NODE_STATE, +} vlib_node_state_t; + +typedef struct vlib_node_t { + /* Vector processing function for this node. */ + vlib_node_function_t * function; + + /* Node name. */ + u8 * name; + + /* Node name index in elog string table. */ + u32 name_elog_string; + + /* Total statistics for this node. */ + vlib_node_stats_t stats_total; + + /* Saved values as of last clear (or zero if never cleared). + Current values are always stats_total - stats_last_clear. */ + vlib_node_stats_t stats_last_clear; + + /* Type of this node. */ + vlib_node_type_t type; + + /* Node index. */ + u32 index; + + /* Index of corresponding node runtime. */ + u32 runtime_index; + + /* Runtime data for this node. */ + void * runtime_data; + + /* Node flags. */ + u16 flags; + + /* Processing function keeps frame. Tells node dispatching code not + to free frame after dispatch is done. */ +#define VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH (1 << 0) + + /* Node counts as output/drop/punt node for stats purposes. */ +#define VLIB_NODE_FLAG_IS_OUTPUT (1 << 1) +#define VLIB_NODE_FLAG_IS_DROP (1 << 2) +#define VLIB_NODE_FLAG_IS_PUNT (1 << 3) +#define VLIB_NODE_FLAG_IS_HANDOFF (1 << 4) + + /* Set if current node runtime has traced vectors. */ +#define VLIB_NODE_FLAG_TRACE (1 << 5) + +#define VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE (1 << 6) +#define VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE (1 << 7) + + /* State for input nodes. */ + u8 state; + + /* Number of bytes of run time data. 
*/ + u8 runtime_data_bytes; + + /* Number of error codes used by this node. */ + u16 n_errors; + + /* Size of scalar and vector arguments in bytes. */ + u16 scalar_size, vector_size; + + /* Handle/index in error heap for this node. */ + u32 error_heap_handle; + u32 error_heap_index; + + /* Error strings indexed by error code for this node. */ + char ** error_strings; + + /* Vector of next node names. + Only used before next_nodes array is initialized. */ + char ** next_node_names; + + /* Next node indices for this node. */ + u32 * next_nodes; + + /* Name of node that we are sibling of. */ + char * sibling_of; + + /* Bitmap of all of this node's siblings. */ + uword * sibling_bitmap; + + /* Total number of vectors sent to each next node. */ + u64 * n_vectors_by_next_node; + + /* Hash table mapping next node index into slot in + next_nodes vector. Quickly determines whether this node + is connected to given next node and, if so, with which slot. */ + uword * next_slot_by_node; + + /* Bitmap of node indices which feed this node. */ + uword * prev_node_bitmap; + + /* Node/next-index which own enqueue rights with to this node. */ + u32 owner_node_index, owner_next_index; + + /* Buffer format/unformat for this node. */ + format_function_t * format_buffer; + unformat_function_t * unformat_buffer; + + /* Trace buffer format/unformat for this node. */ + format_function_t * format_trace; + + /* Function to validate incoming frames. */ + u8 * (* validate_frame) (struct vlib_main_t * vm, + struct vlib_node_runtime_t *, + struct vlib_frame_t * f); +} vlib_node_t; + +#define VLIB_INVALID_NODE_INDEX ((u32) ~0) + +/* Max number of vector elements to process at once per node. */ +#define VLIB_FRAME_SIZE 256 + +/* Calling frame (think stack frame) for a node. */ +typedef struct vlib_frame_t { + /* Frame flags. */ + u16 flags; + + /* Number of scalar bytes in arguments. */ + u8 scalar_size; + + /* Number of bytes per vector argument. 
*/ + u8 vector_size; + + /* Number of vector elements currently in frame. */ + u16 n_vectors; + + /* Owner cpuid / heap id */ + u16 cpu_index; + + /* Scalar and vector arguments to next node. */ + u8 arguments[0]; +} vlib_frame_t; + +typedef struct { + /* Frame index. */ + u32 frame_index; + + /* Node runtime for this next. */ + u32 node_runtime_index; + + /* Next frame flags. */ + u32 flags; + + /* Reflects node frame-used flag for this next. */ +#define VLIB_FRAME_NO_FREE_AFTER_DISPATCH \ + VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH + + /* This next frame owns enqueue to node + corresponding to node_runtime_index. */ +#define VLIB_FRAME_OWNER (1 << 15) + + /* Set when frame has been allocated for this next. */ +#define VLIB_FRAME_IS_ALLOCATED VLIB_NODE_FLAG_IS_OUTPUT + + /* Set when frame has been added to pending vector. */ +#define VLIB_FRAME_PENDING VLIB_NODE_FLAG_IS_DROP + + /* Set when frame is to be freed after dispatch. */ +#define VLIB_FRAME_FREE_AFTER_DISPATCH VLIB_NODE_FLAG_IS_PUNT + + /* Set when frame has traced packets. */ +#define VLIB_FRAME_TRACE VLIB_NODE_FLAG_TRACE + + /* Number of vectors enqueue to this next since last overflow. */ + u32 vectors_since_last_overflow; +} vlib_next_frame_t; + +always_inline void +vlib_next_frame_init (vlib_next_frame_t * nf) +{ + memset (nf, 0, sizeof (nf[0])); + nf->frame_index = ~0; + nf->node_runtime_index = ~0; +} + +/* A frame pending dispatch by main loop. */ +typedef struct { + /* Node and runtime for this frame. */ + u32 node_runtime_index; + + /* Frame index (in the heap). */ + u32 frame_index; + + /* Start of next frames for this node. */ + u32 next_frame_index; + + /* Special value for next_frame_index when there is no next frame. */ +#define VLIB_PENDING_FRAME_NO_NEXT_FRAME ((u32) ~0) +} vlib_pending_frame_t; + +typedef struct vlib_node_runtime_t { + /* Node function to call. */ + vlib_node_function_t * function; + + /* Vector of errors for this node. 
 */
  vlib_error_t * errors;

  /* Number of clock cycles. */
  u32 clocks_since_last_overflow;

  /* Maximum clock cycle for an invocation. */
  u32 max_clock;

  /* Number of vectors in the recorded max_clock. */
  u32 max_clock_n;

  /* Number of calls. */
  u32 calls_since_last_overflow;

  /* Number of vector elements processed by this node. */
  u32 vectors_since_last_overflow;

  /* Start of next frames for this node. */
  u32 next_frame_index;

  /* Node index. */
  u32 node_index;

  /* For input nodes: decremented on each main loop iteration until it reaches zero
     and function is called.  Allows some input nodes to be called
     more than others. */
  u32 input_main_loops_per_call;

  /* Saved main loop counter of last dispatch of this node. */
  u32 main_loop_count_last_dispatch;

  /* NOTE(review): semantics not evident from this header alone --
     presumably rolling per-main-loop vector statistics; confirm
     against the dispatcher before documenting further. */
  u32 main_loop_vector_stats[2];

  /* Copy of main node flags. */
  u16 flags;

  /* Input node state. */
  u16 state;

  u16 n_next_nodes;

  /* Next frame index that vector arguments were last enqueued to
     last time this node ran.  Set to zero before first run
     of this node. */
  u16 cached_next_index;

  /* CPU this node runs on */
  u16 cpu_index;

  /* Function dependent node-runtime.  The sizeof arithmetic sizes the
     array so the whole structure occupies 128 bytes; the subtracted
     field counts must be kept in sync with the members above. */
  uword runtime_data[(128
		      - 1 * sizeof (vlib_node_function_t *)
		      - 1 * sizeof (vlib_error_t *)
		      - 11 * sizeof (u32)
		      - 5 * sizeof (u16)) / sizeof (uword)];
} vlib_node_runtime_t;

/* Per (scalar size, vector size) frame allocation bookkeeping. */
typedef struct {
  /* Number of allocated frames for this scalar/vector size. */
  u32 n_alloc_frames;

  /* Vector of free frame indices for this scalar/vector size. */
  u32 * free_frame_indices;
} vlib_frame_size_t;

typedef struct {
  /* Users opaque value for event type. */
  uword opaque;
} vlib_process_event_type_t;

/* State for a cooperative process node (VLIB_NODE_TYPE_PROCESS). */
typedef struct {
  /* Node runtime for this process. */
  vlib_node_runtime_t node_runtime;

  /* Where to longjmp when process is done. */
  clib_longjmp_t return_longjmp;

#define VLIB_PROCESS_RETURN_LONGJMP_RETURN ((uword) ~0 - 0)
#define VLIB_PROCESS_RETURN_LONGJMP_SUSPEND ((uword) ~0 - 1)

  /* Where to longjmp to resume node after suspend. */
  clib_longjmp_t resume_longjmp;
#define VLIB_PROCESS_RESUME_LONGJMP_SUSPEND 0
#define VLIB_PROCESS_RESUME_LONGJMP_RESUME 1

  u16 flags;
#define VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK (1 << 0)
#define VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT (1 << 1)
  /* Set to indicate that this process has been added to resume vector. */
#define VLIB_PROCESS_RESUME_PENDING (1 << 2)

  /* Process function is currently running. */
#define VLIB_PROCESS_IS_RUNNING (1 << 3)

  /* Size of process stack. */
  u16 log2_n_stack_bytes;

  u32 suspended_process_frame_index;

  /* Number of times this process was suspended. */
  u32 n_suspends;

  /* Vectors of pending event data indexed by event type index. */
  void ** pending_event_data_by_type_index;

  /* Bitmap of event type-indices with non-empty vectors. */
  uword * non_empty_event_type_bitmap;

  /* Bitmap of event type-indices which are one time events. */
  uword * one_time_event_type_bitmap;

  /* Type is opaque pointer -- typically a pointer to an event handler
     function.  Hash table to map opaque to a type index. */
  uword * event_type_index_by_type_opaque;

  /* Pool of currently valid event types. */
  vlib_process_event_type_t * event_type_pool;

  /* When suspending saves cpu cycle counter when process is to be resumed. */
  u64 resume_cpu_time;

#ifdef CLIB_UNIX
  /* Pad to a multiple of the page size so we can mprotect process stacks */
  CLIB_PAD_FROM_TO (0x140, 0x1000);
#endif

  /* Process stack.  Starts here and extends 2^log2_n_stack_bytes
     bytes. */

#define VLIB_PROCESS_STACK_MAGIC (0xdead7ead)
  u32 stack[0];
} vlib_process_t __attribute__ ((aligned (CLIB_CACHE_LINE_BYTES)));

/* Identifies a process waiting on a one-time event. */
typedef struct {
  u32 node_index;

  u32 one_time_event;
} vlib_one_time_waiting_process_t;

/* Event to be signaled at a future time via the timing wheel. */
typedef struct {
  u16 n_data_elts;

  u16 n_data_elt_bytes;

  /* n_data_elts * n_data_elt_bytes */
  u32 n_data_bytes;

  /* Process node & event type to be used to signal event. */
  u32 process_node_index;

  u32 event_type_index;

  union {
    u8 inline_event_data[64 - 3 * sizeof (u32) - 2 * sizeof (u16)];

    /* Vector of event data used only when data does not fit inline. */
    u8 * event_data_as_vector;
  };
} vlib_signal_timed_event_data_t;

/* Timing wheel entries multiplex suspended processes and timed events:
   the low bit tags the kind, the remaining bits carry the index. */
always_inline uword
vlib_timing_wheel_data_is_timed_event (u32 d)
{ return d & 1; }

always_inline u32
vlib_timing_wheel_data_set_suspended_process (u32 i)
{ return 0 + 2*i; }

always_inline u32
vlib_timing_wheel_data_set_timed_event (u32 i)
{ return 1 + 2*i; }

always_inline uword
vlib_timing_wheel_data_get_index (u32 d)
{ return d / 2; }

/* Top-level node graph state, embedded in vlib_main_t. */
typedef struct {
  /* Public nodes. */
  vlib_node_t ** nodes;

  /* Node index hashed by node name. */
  uword * node_by_name;

  u32 flags;
#define VLIB_NODE_MAIN_RUNTIME_STARTED (1 << 0)

  /* Nodes segregated by type for cache locality.
     Does not apply to nodes of type VLIB_NODE_TYPE_INTERNAL. */
  vlib_node_runtime_t * nodes_by_type[VLIB_N_NODE_TYPE];

  /* Node runtime indices for input nodes with pending interrupts. */
  u32 * pending_interrupt_node_runtime_indices;

  /* Input nodes are switched from/to interrupt to/from polling mode
     when average vector length goes above/below polling/interrupt
     thresholds. */
  u32 polling_threshold_vector_length;
  u32 interrupt_threshold_vector_length;

  /* Vector of next frames. */
  vlib_next_frame_t * next_frames;

  /* Vector of internal node's frames waiting to be called. */
  vlib_pending_frame_t * pending_frames;

  /* Timing wheel for scheduling time-based node dispatch. */
  timing_wheel_t timing_wheel;

  vlib_signal_timed_event_data_t * signal_timed_event_data_pool;

  /* Opaque data vector added via timing_wheel_advance. */
  u32 * data_from_advancing_timing_wheel;

  /* CPU time of next process to be ready on timing wheel. */
  u64 cpu_time_next_process_ready;

  /* Vector of process nodes.
     One for each node of type VLIB_NODE_TYPE_PROCESS. */
  vlib_process_t ** processes;

  /* Current running process or ~0 if no process running. */
  u32 current_process_index;

  /* Pool of pending process frames. */
  vlib_pending_frame_t * suspended_process_frames;

  /* Vector of event data vectors pending recycle. */
  void ** recycled_event_data_vectors;

  /* Current counts of nodes in each state. */
  u32 input_node_counts_by_state[VLIB_N_NODE_STATE];

  /* Hash of (scalar_size,vector_size) to frame_sizes index. */
  uword * frame_size_hash;

  /* Per-size frame allocation information. */
  vlib_frame_size_t * frame_sizes;

  /* Time of last node runtime stats clear. */
  f64 time_last_runtime_stats_clear;

  /* Node registrations added by constructors */
  vlib_node_registration_t * node_registrations;
} vlib_node_main_t;

#endif /* included_vlib_node_h */
diff --git a/vlib/vlib/node_cli.c b/vlib/vlib/node_cli.c new file mode 100644 index 00000000000..58c3776a67b --- /dev/null +++ b/vlib/vlib/node_cli.c @@ -0,0 +1,441 @@
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * node_cli.c: node CLI + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vlib/vlib.h> +#include <vlib/threads.h> + +static clib_error_t * +show_node_graph (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_node_main_t * nm = &vm->node_main; + vlib_node_t * n; + u32 node_index; + + vlib_cli_output (vm, "%U\n", format_vlib_node_graph, nm, 0); + + if (unformat (input, "%U", unformat_vlib_node, vm, &node_index)) + { + n = vlib_get_node (vm, node_index); + vlib_cli_output (vm, "%U\n", format_vlib_node_graph, nm, n); + } + else + { + vlib_node_t ** nodes = vec_dup (nm->nodes); + uword i; + + vec_sort (nodes, n1, n2, + vec_cmp (n1[0]->name, n2[0]->name)); + + for (i = 0; i < vec_len (nodes); i++) + vlib_cli_output (vm, "%U\n\n", format_vlib_node_graph, nm, nodes[i]); + + vec_free (nodes); + } + + return 0; +} + +VLIB_CLI_COMMAND (show_node_graph_command, static) = { + .path = "show vlib graph", + .short_help = "Show packet processing node graph", + .function = show_node_graph, +}; + +static u8 * format_vlib_node_stats (u8 * s, va_list * va) +{ + vlib_main_t * vm = va_arg (*va, vlib_main_t *); + vlib_node_t * n = va_arg (*va, vlib_node_t *); + int max = va_arg (*va, int); + f64 v; + char * state; + u8 * ns; + u8 * misc_info = 0; + u64 c, p, l, d; + f64 x; + f64 maxc, maxcn; + u32 maxn; + uword indent; + + if (! 
n) + { + if (max) + return format (s, + "%=30s%=17s%=16s%=16s%=16s%=16s", + "Name", "Max Node Clocks", "Vectors at Max", "Max Clocks", "Avg Clocks", "Avg Vectors/Call"); + else + return format (s, + "%=30s%=12s%=16s%=16s%=16s%=16s%=16s", + "Name", "State", "Calls", "Vectors", "Suspends", "Clocks", "Vectors/Call"); + } + + indent = format_get_indent (s); + + l = n->stats_total.clocks - n->stats_last_clear.clocks; + c = n->stats_total.calls - n->stats_last_clear.calls; + p = n->stats_total.vectors - n->stats_last_clear.vectors; + d = n->stats_total.suspends - n->stats_last_clear.suspends; + maxc = (f64)n->stats_total.max_clock; + maxn = n->stats_total.max_clock_n; + if (n->stats_total.max_clock_n) + maxcn = (f64)n->stats_total.max_clock / (f64)maxn; + else + maxcn = 0.0; + + /* Clocks per packet, per call or per suspend. */ + x = 0; + if (p > 0) + x = (f64) l / (f64) p; + else if (c > 0) + x = (f64) l / (f64) c; + else if (d > 0) + x = (f64) l / (f64) d; + + if (c > 0) + v = (double)p / (double)c; + else + v = 0; + + state = "active"; + if (n->type == VLIB_NODE_TYPE_PROCESS) + { + vlib_process_t * p = vlib_get_process_from_node (vm, n); + + /* Show processes with events pending. This helps spot bugs where events are not + being handled. */ + if (! clib_bitmap_is_zero (p->non_empty_event_type_bitmap)) + misc_info = format (misc_info, "events pending, "); + + switch (p->flags & (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK + | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT)) + { + default: + if (! 
(p->flags & VLIB_PROCESS_IS_RUNNING)) + state = "done"; + break; + + case VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK: + state = "time wait"; + break; + + case VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT: + state = "event wait"; + break; + + case (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT + | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK): + state = "any wait"; + break; + } + } + else if (n->type != VLIB_NODE_TYPE_INTERNAL) + { + state = "polling"; + if (n->state == VLIB_NODE_STATE_DISABLED) + state = "disabled"; + else if (n->state == VLIB_NODE_STATE_INTERRUPT) + state = "interrupt wait"; + } + + ns = n->name; + + if (max) + s = format (s, "%-30v%=17.2e%=16d%=16.2e%=16.2e%=16.2e", + ns, maxc, maxn, maxcn, x, v); + else + s = format (s, "%-30v%=12s%16Ld%16Ld%16Ld%16.2e%16.2f", ns, state, + c, p, d, x, v); + + if (ns != n->name) + vec_free (ns); + + if (misc_info) + { + s = format (s, "\n%U%v", format_white_space, indent + 4, misc_info); + vec_free (misc_info); + } + + return s; +} + +static clib_error_t * +show_node_runtime (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_node_main_t * nm = &vm->node_main; + vlib_node_t * n; + f64 time_now; + u32 node_index; + vlib_node_t *** node_dups = 0; + f64 * vectors_per_main_loop = 0; + f64 * last_vector_length_per_node = 0; + + time_now = vlib_time_now (vm); + + if (unformat (input, "%U", unformat_vlib_node, vm, &node_index)) + { + n = vlib_get_node (vm, node_index); + vlib_node_sync_stats (vm, n); + vlib_cli_output (vm, "%U\n", format_vlib_node_stats, vm, 0, 0); + vlib_cli_output (vm, "%U\n", format_vlib_node_stats, vm, n, 0); + } + else + { + vlib_node_t ** nodes; + uword i, j; + f64 dt; + u64 n_input, n_output, n_drop, n_punt; + u64 n_internal_vectors, n_internal_calls; + u64 n_clocks, l, v, c, d; + int brief = 1; + int max = 0; + vlib_main_t ** stat_vms = 0, *stat_vm; + + /* Suppress nodes with zero calls since last clear */ + if (unformat (input, "brief") || unformat (input, "b")) + 
brief = 1; + if (unformat (input, "verbose") || unformat(input, "v")) + brief = 0; + if (unformat (input, "max") || unformat(input, "m")) + max = 1; + + if (vec_len(vlib_mains) == 0) + vec_add1 (stat_vms, vm); + else + { + for (i = 0; i < vec_len (vlib_mains); i++) + { + stat_vm = vlib_mains[i]; + if (stat_vm) + vec_add1 (stat_vms, stat_vm); + } + } + + /* + * Barrier sync across stats scraping. + * Otherwise, the counts will be grossly inaccurate. + */ + vlib_worker_thread_barrier_sync(vm); + + for (j = 0; j < vec_len (stat_vms); j++) + { + stat_vm = stat_vms[j]; + nm = &stat_vm->node_main; + + for (i = 0; i < vec_len (nm->nodes); i++) + { + n = nm->nodes[i]; + vlib_node_sync_stats (stat_vm, n); + } + + nodes = vec_dup (nm->nodes); + + vec_add1(node_dups, nodes); + vec_add1 (vectors_per_main_loop, + vlib_last_vectors_per_main_loop_as_f64 (stat_vm)); + vec_add1 (last_vector_length_per_node, + vlib_last_vector_length_per_node (stat_vm)); + } + vlib_worker_thread_barrier_release(vm); + + + for (j = 0; j < vec_len (stat_vms); j++) + { + stat_vm = stat_vms[j]; + nodes = node_dups[j]; + + vec_sort (nodes, n1, n2, + vec_cmp (n1[0]->name, n2[0]->name)); + + n_input = n_output = n_drop = n_punt = n_clocks = 0; + n_internal_vectors = n_internal_calls = 0; + for (i = 0; i < vec_len (nodes); i++) + { + n = nodes[i]; + + l = n->stats_total.clocks - n->stats_last_clear.clocks; + n_clocks += l; + + v = n->stats_total.vectors - n->stats_last_clear.vectors; + c = n->stats_total.calls - n->stats_last_clear.calls; + + switch (n->type) + { + default: + continue; + + case VLIB_NODE_TYPE_INTERNAL: + n_output += (n->flags & VLIB_NODE_FLAG_IS_OUTPUT) ? v : 0; + n_drop += (n->flags & VLIB_NODE_FLAG_IS_DROP) ? v : 0; + n_punt += (n->flags & VLIB_NODE_FLAG_IS_PUNT) ? v : 0; + if (! 
(n->flags & VLIB_NODE_FLAG_IS_OUTPUT)) + { + n_internal_vectors += v; + n_internal_calls += c; + } + if (n->flags & VLIB_NODE_FLAG_IS_HANDOFF) + n_input += v; + break; + + case VLIB_NODE_TYPE_INPUT: + n_input += v; + break; + } + } + + if (vec_len (vlib_mains)) + { + vlib_worker_thread_t *w = vlib_worker_threads + j; + if (j > 0) + vlib_cli_output (vm, "---------------"); + + if ( w->dpdk_lcore_id > -1) + vlib_cli_output (vm, "Thread %d %v (lcore %u)", j, w->name, + w->dpdk_lcore_id); + else + vlib_cli_output (vm, "Thread %d %v", j, + w->name); + } + + dt = time_now - nm->time_last_runtime_stats_clear; + vlib_cli_output + (vm, + "Time %.1f, average vectors/node %.2f, last %d main loops %.2f per node %.2f" + "\n vector rates in %.4e, out %.4e, drop %.4e, punt %.4e", + dt, + (n_internal_calls > 0 + ? (f64) n_internal_vectors / (f64) n_internal_calls + : 0), + 1 << VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE, + vectors_per_main_loop [j], + last_vector_length_per_node [j], + (f64) n_input / dt, + (f64) n_output / dt, + (f64) n_drop / dt, + (f64) n_punt / dt); + + vlib_cli_output (vm, "%U", format_vlib_node_stats, stat_vm, 0, max); + for (i = 0; i < vec_len (nodes); i++) + { + c = nodes[i]->stats_total.calls - nodes[i]->stats_last_clear.calls; + d = nodes[i]->stats_total.suspends - nodes[i]->stats_last_clear.suspends; + if (c || d || ! 
brief) + { + vlib_cli_output (vm, "%U", format_vlib_node_stats, stat_vm, + nodes[i], max); + } + } + vec_free (nodes); + } + vec_free (stat_vms); + vec_free (node_dups); + vec_free (vectors_per_main_loop); + vec_free (last_vector_length_per_node); + } + + return 0; +} + +VLIB_CLI_COMMAND (show_node_runtime_command, static) = { + .path = "show runtime", + .short_help = "Show packet processing runtime", + .function = show_node_runtime, + .is_mp_safe = 1, +}; + +static clib_error_t * +clear_node_runtime (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_node_main_t * nm; + vlib_node_t * n; + int i, j; + vlib_main_t ** stat_vms = 0, *stat_vm; + vlib_node_runtime_t * r; + + if (vec_len(vlib_mains) == 0) + vec_add1 (stat_vms, vm); + else + { + for (i = 0; i < vec_len (vlib_mains); i++) + { + stat_vm = vlib_mains[i]; + if (stat_vm) + vec_add1 (stat_vms, stat_vm); + } + } + + vlib_worker_thread_barrier_sync(vm); + + for (j = 0; j < vec_len (stat_vms); j++) + { + stat_vm = stat_vms[j]; + nm = &stat_vm->node_main; + + for (i = 0; i < vec_len (nm->nodes); i++) + { + n = nm->nodes[i]; + vlib_node_sync_stats (stat_vm, n); + n->stats_last_clear = n->stats_total; + + r = vlib_node_get_runtime (stat_vm, n->index); + r->max_clock = 0; + } + /* Note: input/output rates computed using vlib_global_main */ + nm->time_last_runtime_stats_clear = vlib_time_now (vm); + } + + vlib_worker_thread_barrier_release(vm); + + vec_free (stat_vms); + + return 0; +} + +VLIB_CLI_COMMAND (clear_node_runtime_command, static) = { + .path = "clear runtime", + .short_help = "Clear packet processing runtime statistics", + .function = clear_node_runtime, +}; + +/* Dummy function to get us linked in. */ +void vlib_node_cli_reference (void) {} diff --git a/vlib/vlib/node_format.c b/vlib/vlib/node_format.c new file mode 100644 index 00000000000..d1d415e1376 --- /dev/null +++ b/vlib/vlib/node_format.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * node_format.c: node formatting + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
 */

#include <vlib/vlib.h>

/* Format one row of the node-graph table: the node's name, its next
   nodes (with next-slot indices) and its previous nodes.  Called with
   n == 0 to format the header row. */
u8 * format_vlib_node_graph (u8 * s, va_list * va)
{
  vlib_node_main_t * nm = va_arg (*va, vlib_node_main_t *);
  vlib_node_t * n = va_arg (*va, vlib_node_t *);
  int i, j;
  uword indent;
  /* One row per next/previous pairing; ~0 marks an empty cell. */
  typedef struct {
    u32 next_node;
    u32 next_slot;
    u32 prev_node;
  } tmp_t;
  tmp_t * tmps = 0;
  tmp_t empty = { .next_node = ~0, .prev_node = ~0 };

  if (! n)
    return format (s,
		   "%=26s%=26s%=26s",
		   "Name", "Next", "Previous");

  s = format (s, "%-26v", n->name);

  indent = format_get_indent (s);

  /* Collect valid next nodes into the first column. */
  for (i = j = 0; i < vec_len (n->next_nodes); i++)
    {
      if (n->next_nodes[i] == VLIB_INVALID_NODE_INDEX)
	continue;
      vec_validate_init_empty (tmps, j, empty);
      tmps[j].next_node = n->next_nodes[i];
      tmps[j].next_slot = i;
      j++;
    }

  /* Fill the previous-node column, reusing rows from the top. */
  j = 0;
  clib_bitmap_foreach (i, n->prev_node_bitmap, ({
    vec_validate_init_empty (tmps, j, empty);
    tmps[j].prev_node = i;
    j++;
  }));

  for (i = 0; i < vec_len (tmps); i++)
    {
      if (i > 0)
	s = format (s, "\n%U", format_white_space, indent);

      if (tmps[i].next_node != ~0)
	{
	  vlib_node_t * x;
	  u8 * t = 0;

	  x = vec_elt (nm->nodes, tmps[i].next_node);
	  t = format (t, "%v [%d]", x->name, tmps[i].next_slot);
	  s = format (s, "%=26v", t);
	  vec_free (t);
	}
      else
	s = format (s, "%26s", "");

      if (tmps[i].prev_node != ~0)
	{
	  vlib_node_t * x;
	  x = vec_elt (nm->nodes, tmps[i].prev_node);
	  s = format (s, "%=26v", x->name);
	}
    }

  vec_free (tmps);

  return s;
}

/* Format "node -> next-node" for a (node, next_index) pair. */
u8 * format_vlib_node_and_next (u8 * s, va_list * va)
{
  vlib_main_t * vm = va_arg (*va, vlib_main_t *);
  vlib_node_t * n = va_arg (*va, vlib_node_t *);
  u32 next_index = va_arg (*va, u32);
  vlib_node_t * n_next;
  u32 * ni;

  ni = vec_elt_at_index (n->next_nodes, next_index);
  n_next = vlib_get_node (vm, ni[0]);
  return format (s, "%v -> %v", n->name, n_next->name);
}

/* Format a node's name given its index. */
u8 * format_vlib_node_name (u8 * s, va_list * va)
{
  vlib_main_t * vm = va_arg (*va, vlib_main_t *);
  u32 node_index = va_arg (*va, u32);
  vlib_node_t * n = vlib_get_node (vm, node_index);

  return format (s, "%v", n->name);
}

/* Format the name of next_index'th next node of node_index. */
u8 * format_vlib_next_node_name (u8 * s, va_list * va)
{
  vlib_main_t * vm = va_arg (*va, vlib_main_t *);
  u32 node_index = va_arg (*va, u32);
  u32 next_index = va_arg (*va, u32);
  vlib_node_t * next = vlib_get_next_node (vm, node_index, next_index);
  return format (s, "%v", next->name);
}

/* Parse node name -> node index. */
uword unformat_vlib_node (unformat_input_t * input, va_list * args)
{
  vlib_main_t * vm = va_arg (*args, vlib_main_t *);
  u32 * result = va_arg (*args, u32 *);

  return unformat_user (input, unformat_hash_vec_string,
			vm->node_main.node_by_name, result);
}

/* Format a time (seconds) with fixed width; vm argument is unused. */
u8 * format_vlib_time (u8 * s, va_list * va)
{
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
  f64 time = va_arg (*va, f64);
  return format (s, "%12.4f", time);
}

/* Format a cpu cycle count as seconds since clib time was initialized. */
u8 * format_vlib_cpu_time (u8 * s, va_list * va)
{
  vlib_main_t * vm = va_arg (*va, vlib_main_t *);
  u64 cpu_time = va_arg (*va, u64);
  f64 dt;

  dt = (cpu_time - vm->clib_time.init_cpu_time) * vm->clib_time.seconds_per_clock;
  return format (s, "%U", format_vlib_time, vm, dt);
}
diff --git a/vlib/vlib/node_funcs.h b/vlib/vlib/node_funcs.h new file mode 100644 index 00000000000..80dc3c602a1 --- /dev/null +++ b/vlib/vlib/node_funcs.h @@ -0,0 +1,979 @@
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ +/* + * node_funcs.h: processing nodes global functions/inlines + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
 */

#ifndef included_vlib_node_funcs_h
#define included_vlib_node_funcs_h

#include <vppinfra/fifo.h>

/* Return node object for given index. */
always_inline vlib_node_t *
vlib_get_node (vlib_main_t * vm, u32 i)
{ return vec_elt (vm->node_main.nodes, i); }

/* Return node's next_index'th next node. */
always_inline vlib_node_t *
vlib_get_next_node (vlib_main_t * vm, u32 node_index, u32 next_index)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_node_t * n;

  n = vec_elt (nm->nodes, node_index);
  ASSERT (next_index < vec_len (n->next_nodes));
  return vlib_get_node (vm, n->next_nodes[next_index]);
}

/* Return a node's runtime; process nodes keep their runtime embedded
   in the vlib_process_t rather than in nodes_by_type. */
always_inline vlib_node_runtime_t *
vlib_node_get_runtime (vlib_main_t * vm, u32 node_index)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_node_t * n = vec_elt (nm->nodes, node_index);
  vlib_process_t * p;
  if (n->type != VLIB_NODE_TYPE_PROCESS)
    return vec_elt_at_index (nm->nodes_by_type[n->type], n->runtime_index);
  else
    {
      p = vec_elt (nm->processes, n->runtime_index);
      return &p->node_runtime;
    }
}

/* Return pointer to a node's function-dependent runtime data. */
always_inline void *
vlib_node_get_runtime_data (vlib_main_t * vm, u32 node_index)
{
  vlib_node_runtime_t * r = vlib_node_get_runtime (vm, node_index);
  return r->runtime_data;
}

/* Replace a node's runtime data, copying it both into the node and
   into the fixed-size runtime_data array (must fit -- see ASSERT). */
always_inline void
vlib_node_set_runtime_data (vlib_main_t * vm, u32 node_index,
			    void * runtime_data,
			    u32 n_runtime_data_bytes)
{
  vlib_node_t * n = vlib_get_node (vm, node_index);
  vlib_node_runtime_t * r = vlib_node_get_runtime (vm, node_index);

  n->runtime_data_bytes = n_runtime_data_bytes;
  vec_free (n->runtime_data);
  vec_add (n->runtime_data, runtime_data, n_runtime_data_bytes);

  ASSERT (vec_len (n->runtime_data) <= sizeof (r->runtime_data));
  if (vec_len (n->runtime_data) > 0)
    memcpy (r->runtime_data, n->runtime_data, vec_len (n->runtime_data));
}

/* Change a node's dispatch state, keeping per-state input node counts
   and the runtime's copy of the state in sync. */
always_inline void
vlib_node_set_state (vlib_main_t * vm, u32 node_index, vlib_node_state_t new_state)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_node_t * n;
  vlib_node_runtime_t * r;

  n = vec_elt (nm->nodes, node_index);
  if (n->type == VLIB_NODE_TYPE_PROCESS)
    {
      vlib_process_t * p = vec_elt (nm->processes, n->runtime_index);
      r = &p->node_runtime;

      /* When disabling make sure flags are cleared. */
      p->flags &= ~(VLIB_PROCESS_RESUME_PENDING
		    | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK
		    | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT);
    }
  else
    r = vec_elt_at_index (nm->nodes_by_type[n->type], n->runtime_index);

  ASSERT (new_state < VLIB_N_NODE_STATE);

  if (n->type == VLIB_NODE_TYPE_INPUT)
    {
      ASSERT (nm->input_node_counts_by_state[n->state] > 0);
      nm->input_node_counts_by_state[n->state] -= 1;
      nm->input_node_counts_by_state[new_state] += 1;
    }

  n->state = new_state;
  r->state = new_state;
}

/* Queue an input node for interrupt-mode dispatch. */
always_inline void
vlib_node_set_interrupt_pending (vlib_main_t * vm, u32 node_index)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_node_t * n = vec_elt (nm->nodes, node_index);
  ASSERT (n->type == VLIB_NODE_TYPE_INPUT);
  vec_add1 (nm->pending_interrupt_node_runtime_indices, n->runtime_index);
}

/* Return the vlib_process_t behind a process-type node. */
always_inline vlib_process_t *
vlib_get_process_from_node (vlib_main_t * vm, vlib_node_t * node)
{
  vlib_node_main_t * nm = &vm->node_main;
  ASSERT (node->type == VLIB_NODE_TYPE_PROCESS);
  return vec_elt (nm->processes, node->runtime_index);
}

/* Fetches frame with given handle.  The handle packs the owning cpu
   index (VLIB_CPU_MASK bits) with the byte offset into that cpu's
   frame heap (VLIB_OFFSET_MASK bits). */
always_inline vlib_frame_t *
vlib_get_frame_no_check (vlib_main_t * vm, uword frame_index)
{
  vlib_frame_t * f;
  u32 cpu_index = frame_index & VLIB_CPU_MASK;
  u32 offset = frame_index & VLIB_OFFSET_MASK;
  vm = vlib_mains ? vlib_mains[cpu_index] : vm;
  f = vm->heap_base + offset;
  return f;
}

/* Inverse of vlib_get_frame_no_check: frame pointer -> handle.  Frame
   alignment guarantees the cpu-index bits of the offset are zero. */
always_inline u32
vlib_frame_index_no_check (vlib_main_t * vm, vlib_frame_t * f)
{
  u32 i;

  ASSERT (((uword) f & VLIB_CPU_MASK) == 0);

  vm = vlib_mains ? vlib_mains[f->cpu_index] : vm;

  i = ((u8 *) f - (u8 *) vm->heap_base);
  return i | f->cpu_index;
}

/* As vlib_get_frame_no_check but asserts the frame is allocated. */
always_inline vlib_frame_t *
vlib_get_frame (vlib_main_t * vm, uword frame_index)
{
  vlib_frame_t * f = vlib_get_frame_no_check (vm, frame_index);
  ASSERT (f->flags & VLIB_FRAME_IS_ALLOCATED);
  return f;
}

/* As vlib_frame_index_no_check but asserts round-trip consistency. */
always_inline u32
vlib_frame_index (vlib_main_t * vm, vlib_frame_t * f)
{
  uword i = vlib_frame_index_no_check (vm, f);
  ASSERT (vlib_get_frame (vm, i) == f);
  return i;
}

/* Byte alignment for vector arguments. */
#define VLIB_FRAME_VECTOR_ALIGN (1 << 4)

/* Offset of the vector arguments: frame header plus scalar arguments,
   rounded up to VLIB_FRAME_VECTOR_ALIGN. */
always_inline u32
vlib_frame_vector_byte_offset (u32 scalar_size)
{
  return round_pow2 (sizeof (vlib_frame_t) + scalar_size,
		     VLIB_FRAME_VECTOR_ALIGN);
}

/* Pointer to a frame's vector arguments. */
always_inline void *
vlib_frame_vector_args (vlib_frame_t * f)
{
  return (void *) f + vlib_frame_vector_byte_offset (f->scalar_size);
}

/* Scalar data lies before aligned vector data. */
always_inline void *
vlib_frame_args (vlib_frame_t * f)
{ return vlib_frame_vector_args (f) - f->scalar_size; }

/* Next-frame state for a runtime's next_index; debug builds verify the
   runtime index matches the graph. */
always_inline vlib_next_frame_t *
vlib_node_runtime_get_next_frame (vlib_main_t * vm,
				  vlib_node_runtime_t * n,
				  u32 next_index)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_next_frame_t * nf;

  ASSERT (next_index < n->n_next_nodes);
  nf = vec_elt_at_index (nm->next_frames,
			 n->next_frame_index + next_index);

  if (CLIB_DEBUG > 0)
    {
      vlib_node_t * node, * next;
      node = vec_elt (nm->nodes, n->node_index);
      next = vec_elt (nm->nodes, node->next_nodes[next_index]);
      ASSERT (nf->node_runtime_index == next->runtime_index);
    }

  return nf;
}

/* As above, looked up by node index instead of runtime pointer. */
always_inline vlib_next_frame_t *
vlib_node_get_next_frame (vlib_main_t * vm,
			  u32 node_index,
			  u32 next_index)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_node_t * n;
  vlib_node_runtime_t * r;

  n = vec_elt (nm->nodes, node_index);
  r = vec_elt_at_index (nm->nodes_by_type[n->type], n->runtime_index);
  return vlib_node_runtime_get_next_frame (vm, r, next_index);
}

vlib_frame_t *
vlib_get_next_frame_internal (vlib_main_t * vm,
			      vlib_node_runtime_t * node,
			      u32 next_index,
			      u32 alloc_new_frame);

/* Common body of vlib_get_next_frame/vlib_get_new_next_frame: yields a
   pointer to the first free vector slot plus the free slot count. */
#define vlib_get_next_frame_macro(vm,node,next_index,vectors,n_vectors_left,alloc_new_frame) \
do {									\
  vlib_frame_t * _f							\
    = vlib_get_next_frame_internal ((vm), (node), (next_index),		\
				    (alloc_new_frame));			\
  u32 _n = _f->n_vectors;						\
  (vectors) = vlib_frame_vector_args (_f) + _n * sizeof ((vectors)[0]); \
  (n_vectors_left) = VLIB_FRAME_SIZE - _n;				\
} while (0)

#define vlib_get_next_frame(vm,node,next_index,vectors,n_vectors_left)	\
  vlib_get_next_frame_macro (vm, node, next_index,			\
			     vectors, n_vectors_left,			\
			     /* alloc new frame */ 0)

#define vlib_get_new_next_frame(vm,node,next_index,vectors,n_vectors_left) \
  vlib_get_next_frame_macro (vm, node, next_index,			\
			     vectors, n_vectors_left,			\
			     /* alloc new frame */ 1)

void
vlib_put_next_frame (vlib_main_t * vm,
		     vlib_node_runtime_t * r,
		     u32 next_index,
		     u32 n_packets_left);

/* Combination get plus put.  Returns vector argument just added. */
#define vlib_set_next_frame(vm,node,next_index,v)			\
({									\
  uword _n_left;							\
  vlib_get_next_frame ((vm), (node), (next_index), (v), _n_left);	\
  ASSERT (_n_left > 0);							\
  vlib_put_next_frame ((vm), (node), (next_index), _n_left - 1);	\
  (v);									\
})

/* Enqueue a single buffer index to the given next. */
always_inline void
vlib_set_next_frame_buffer (vlib_main_t * vm,
			    vlib_node_runtime_t * node,
			    u32 next_index,
			    u32 buffer_index)
{
  u32 * p;
  p = vlib_set_next_frame (vm, node, next_index, p);
  p[0] = buffer_index;
}

vlib_frame_t * vlib_get_frame_to_node (vlib_main_t * vm, u32 to_node_index);
void vlib_put_frame_to_node (vlib_main_t * vm, u32 to_node_index, vlib_frame_t * f);

/* Return the currently running process (caller must be in process
   context -- see vlib_in_process_context). */
always_inline vlib_process_t *
vlib_get_current_process (vlib_main_t * vm)
{
  vlib_node_main_t * nm = &vm->node_main;
  return vec_elt (nm->processes, nm->current_process_index);
}

always_inline uword
vlib_in_process_context (vlib_main_t * vm)
{ return vm->node_main.current_process_index != ~0; }

always_inline uword
vlib_current_process (vlib_main_t * vm)
{ return vlib_get_current_process (vm)->node_runtime.node_index; }

/* Anything less than 1e-6 is considered zero. */
always_inline uword
vlib_process_suspend_time_is_zero (f64 dt)
{ return dt < 1e-6; }

/* Suspend the current process for dt seconds.  Implemented with
   setjmp/longjmp: the SUSPEND return of clib_setjmp longjmps back to
   the dispatcher; the dispatcher longjmps here again when the timer
   fires, making clib_setjmp return RESUME. */
always_inline uword
vlib_process_suspend (vlib_main_t * vm, f64 dt)
{
  uword r;
  vlib_node_main_t * nm = &vm->node_main;
  vlib_process_t * p = vec_elt (nm->processes, nm->current_process_index);
  u64 dt_cpu = dt * vm->clib_time.clocks_per_second;

  if (vlib_process_suspend_time_is_zero (dt))
    return VLIB_PROCESS_RESUME_LONGJMP_RESUME;

  p->flags |= VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK;
  r = clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND);
  if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND)
    {
      p->resume_cpu_time = clib_cpu_time_now () + dt_cpu;
      clib_longjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_SUSPEND);
    }

  return r;
}

/* Return an event type index to the pool; one-time events also clear
   their bit in the one-time bitmap. */
always_inline void
vlib_process_free_event_type (vlib_process_t * p, uword t, uword is_one_time_event)
{
  ASSERT (! pool_is_free_index (p->event_type_pool, t));
  pool_put_index (p->event_type_pool, t);
  if (is_one_time_event)
    p->one_time_event_type_bitmap =
      clib_bitmap_andnoti (p->one_time_event_type_bitmap, t);
}

/* Free the event type iff it was registered as a one-time event. */
always_inline void
vlib_process_maybe_free_event_type (vlib_process_t * p, uword t)
{
  ASSERT (! pool_is_free_index (p->event_type_pool, t));
  if (clib_bitmap_get (p->one_time_event_type_bitmap, t))
    vlib_process_free_event_type (p, t, /* is_one_time_event */ 1);
}

/* Dequeue the pending event-data vector for the first ready event
   type; ownership of the vector passes to the caller.  Returns 0 when
   nothing is pending. */
always_inline void *
vlib_process_get_event_data (vlib_main_t * vm, uword * return_event_type_opaque)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_process_t * p;
  vlib_process_event_type_t * et;
  uword t, l;
  void * event_data_vector;

  p = vec_elt (nm->processes, nm->current_process_index);

  /* Find first type with events ready.
     Return invalid type when there's nothing there. */
  t = clib_bitmap_first_set (p->non_empty_event_type_bitmap);
  if (t == ~0)
    return 0;

  p->non_empty_event_type_bitmap = clib_bitmap_andnoti (p->non_empty_event_type_bitmap, t);

  l = _vec_len (p->pending_event_data_by_type_index[t]);
  ASSERT (l > 0);
  event_data_vector = p->pending_event_data_by_type_index[t];
  p->pending_event_data_by_type_index[t] = 0;

  et = pool_elt_at_index (p->event_type_pool, t);

  /* Return user's opaque value and possibly index. */
  *return_event_type_opaque = et->opaque;

  vlib_process_maybe_free_event_type (p, t);

  return event_data_vector;
}

/* Return event data vector for later reuse.  We reuse event data to avoid
   repeatedly allocating event vectors in cases where we care about speed. */
always_inline void
vlib_process_put_event_data (vlib_main_t * vm, void * event_data)
{
  vlib_node_main_t * nm = &vm->node_main;
  vec_add1 (nm->recycled_event_data_vectors, event_data);
}

/* Return type & add any events to data vector. */
always_inline uword
vlib_process_get_events (vlib_main_t * vm, uword ** data_vector)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_process_t * p;
  vlib_process_event_type_t * et;
  uword r, t, l;

  p = vec_elt (nm->processes, nm->current_process_index);

  /* Find first type with events ready.
     Return invalid type when there's nothing there. */
  t = clib_bitmap_first_set (p->non_empty_event_type_bitmap);
  if (t == ~0)
    return t;

  p->non_empty_event_type_bitmap = clib_bitmap_andnoti (p->non_empty_event_type_bitmap, t);

  l = _vec_len (p->pending_event_data_by_type_index[t]);
  if (data_vector)
    vec_add (*data_vector, p->pending_event_data_by_type_index[t], l);
  _vec_len (p->pending_event_data_by_type_index[t]) = 0;

  et = pool_elt_at_index (p->event_type_pool, t);

  /* Return user's opaque value. */
  r = et->opaque;

  vlib_process_maybe_free_event_type (p, t);

  return r;
}

/* Drain pending events of type t into data_vector (the pending vector
   itself is retained for reuse); returns the number of events. */
always_inline uword
vlib_process_get_events_helper (vlib_process_t * p, uword t, uword ** data_vector)
{
  uword l;

  p->non_empty_event_type_bitmap = clib_bitmap_andnoti (p->non_empty_event_type_bitmap, t);

  l = _vec_len (p->pending_event_data_by_type_index[t]);
  if (data_vector)
    vec_add (*data_vector, p->pending_event_data_by_type_index[t], l);
  _vec_len (p->pending_event_data_by_type_index[t]) = 0;

  vlib_process_maybe_free_event_type (p, t);

  return l;
}

/* As above but query as specified type of event.  Returns number of
   events found. */
always_inline uword
vlib_process_get_events_with_type (vlib_main_t * vm, uword ** data_vector,
				   uword with_type_opaque)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_process_t * p;
  uword t, * h;

  p = vec_elt (nm->processes, nm->current_process_index);
  h = hash_get (p->event_type_index_by_type_opaque, with_type_opaque);
  if (! h)
    /* This can happen when an event has not yet been
       signaled with given opaque type. */
    return 0;

  t = h[0];
  if (! clib_bitmap_get (p->non_empty_event_type_bitmap, t))
    return 0;

  return vlib_process_get_events_helper (p, t, data_vector);
}

/* Suspend until some event is pending; returns the bitmap of ready
   event type indices. */
always_inline uword *
vlib_process_wait_for_event (vlib_main_t * vm)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_process_t * p;
  uword r;

  p = vec_elt (nm->processes, nm->current_process_index);
  if (clib_bitmap_is_zero (p->non_empty_event_type_bitmap))
    {
      p->flags |= VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT;
      r = clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND);
      if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND)
	clib_longjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_SUSPEND);
    }

  return p->non_empty_event_type_bitmap;
}

/* Suspend until the given one-time event type index is signaled, then
   drain its events into data_vector. */
always_inline uword
vlib_process_wait_for_one_time_event (vlib_main_t * vm,
				      uword ** data_vector,
				      uword with_type_index)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_process_t * p;
  uword r;

  p = vec_elt (nm->processes, nm->current_process_index);
  ASSERT (! pool_is_free_index (p->event_type_pool, with_type_index));
  while (! clib_bitmap_get (p->non_empty_event_type_bitmap, with_type_index))
    {
      p->flags |= VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT;
      r = clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND);
      if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND)
	clib_longjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_SUSPEND);
    }

  return vlib_process_get_events_helper (p, with_type_index, data_vector);
}

/* Suspend until an event with the given opaque type is signaled.
   NOTE(review): definition continues beyond this chunk of the file. */
always_inline uword
vlib_process_wait_for_event_with_type (vlib_main_t * vm,
				       uword ** data_vector,
				       uword with_type_opaque)
{
  vlib_node_main_t * nm = &vm->node_main;
  vlib_process_t * p;
  uword r, * h;

  p = vec_elt (nm->processes, nm->current_process_index);
  h = hash_get (p->event_type_index_by_type_opaque, with_type_opaque);
  while (! h || !
clib_bitmap_get (p->non_empty_event_type_bitmap, h[0])) + { + p->flags |= VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT; + r = clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND); + if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND) + clib_longjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_SUSPEND); + + /* See if unknown event type has been signaled now. */ + if (! h) + h = hash_get (p->event_type_index_by_type_opaque, with_type_opaque); + } + + return vlib_process_get_events_helper (p, h[0], data_vector); +} + +always_inline f64 +vlib_process_wait_for_event_or_clock (vlib_main_t * vm, f64 dt) +{ + vlib_node_main_t * nm = &vm->node_main; + vlib_process_t * p; + f64 wakeup_time; + uword r; + + p = vec_elt (nm->processes, nm->current_process_index); + + if (vlib_process_suspend_time_is_zero (dt) + || ! clib_bitmap_is_zero (p->non_empty_event_type_bitmap)) + return dt; + + wakeup_time = vlib_time_now (vm) + dt; + + /* Suspend waiting for both clock and event to occur. */ + p->flags |= (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT + | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK); + + r = clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND); + if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND) + { + p->resume_cpu_time = (clib_cpu_time_now () + + (dt * vm->clib_time.clocks_per_second)); + clib_longjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_SUSPEND); + } + + /* Return amount of time still left to sleep. + If <= 0 then we've been waken up by the clock (and not an event). 
*/ + return wakeup_time - vlib_time_now (vm); +} + +always_inline vlib_process_event_type_t * +vlib_process_new_event_type (vlib_process_t * p, uword with_type_opaque) +{ + vlib_process_event_type_t * et; + pool_get (p->event_type_pool, et); + et->opaque = with_type_opaque; + return et; +} + +always_inline uword +vlib_process_create_one_time_event (vlib_main_t * vm, uword node_index, uword with_type_opaque) +{ + vlib_node_main_t * nm = &vm->node_main; + vlib_node_t * n = vlib_get_node (vm, node_index); + vlib_process_t * p = vec_elt (nm->processes, n->runtime_index); + vlib_process_event_type_t * et; + uword t; + + et = vlib_process_new_event_type (p, with_type_opaque); + t = et - p->event_type_pool; + p->one_time_event_type_bitmap = clib_bitmap_ori (p->one_time_event_type_bitmap, t); + return t; +} + +always_inline void +vlib_process_delete_one_time_event (vlib_main_t * vm, uword node_index, uword t) +{ + vlib_node_main_t * nm = &vm->node_main; + vlib_node_t * n = vlib_get_node (vm, node_index); + vlib_process_t * p = vec_elt (nm->processes, n->runtime_index); + + ASSERT (clib_bitmap_get (p->one_time_event_type_bitmap, t)); + vlib_process_free_event_type (p, t, /* is_one_time_event */ 1); +} + +always_inline void * +vlib_process_signal_event_helper (vlib_node_main_t * nm, + vlib_node_t * n, + vlib_process_t * p, + uword t, + uword n_data_elts, + uword n_data_elt_bytes) +{ + uword p_flags, add_to_pending, delete_from_wheel; + void * data_to_be_written_by_caller; + + ASSERT (! pool_is_free_index (p->event_type_pool, t)); + + vec_validate (p->pending_event_data_by_type_index, t); + + /* Resize data vector and return caller's data to be written. */ + { + void * data_vec = p->pending_event_data_by_type_index[t]; + uword l; + + if (! 
data_vec && vec_len (nm->recycled_event_data_vectors)) + { + data_vec = vec_pop (nm->recycled_event_data_vectors); + _vec_len (data_vec) = 0; + } + + l = vec_len (data_vec); + + data_vec = _vec_resize (data_vec, + /* length_increment */ n_data_elts, + /* total size after increment */ (l + n_data_elts) * n_data_elt_bytes, + /* header_bytes */ 0, /* data_align */ 0); + + p->pending_event_data_by_type_index[t] = data_vec; + data_to_be_written_by_caller = data_vec + l * n_data_elt_bytes; + } + + p->non_empty_event_type_bitmap = clib_bitmap_ori (p->non_empty_event_type_bitmap, t); + + p_flags = p->flags; + + /* Event was already signalled? */ + add_to_pending = (p_flags & VLIB_PROCESS_RESUME_PENDING) == 0; + + /* Process will resume when suspend time elapses? */ + delete_from_wheel = 0; + if (p_flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK) + { + /* Waiting for both event and clock? */ + if (p_flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT) + delete_from_wheel = 1; + else + /* Waiting only for clock. Event will be queue and may be + handled when timer expires. */ + add_to_pending = 0; + } + + /* Never add current process to pending vector since current process is + already running. 
*/ + add_to_pending &= nm->current_process_index != n->runtime_index; + + if (add_to_pending) + { + u32 x = vlib_timing_wheel_data_set_suspended_process (n->runtime_index); + p->flags = p_flags | VLIB_PROCESS_RESUME_PENDING; + vec_add1 (nm->data_from_advancing_timing_wheel, x); + if (delete_from_wheel) + timing_wheel_delete (&nm->timing_wheel, x); + } + + return data_to_be_written_by_caller; +} + +always_inline void * +vlib_process_signal_event_data (vlib_main_t * vm, + uword node_index, + uword type_opaque, + uword n_data_elts, + uword n_data_elt_bytes) +{ + vlib_node_main_t * nm = &vm->node_main; + vlib_node_t * n = vlib_get_node (vm, node_index); + vlib_process_t * p = vec_elt (nm->processes, n->runtime_index); + uword * h, t; + + h = hash_get (p->event_type_index_by_type_opaque, type_opaque); + if (! h) + { + vlib_process_event_type_t * et = vlib_process_new_event_type (p, type_opaque); + t = et - p->event_type_pool; + hash_set (p->event_type_index_by_type_opaque, type_opaque, t); + } + else + t = h[0]; + + return vlib_process_signal_event_helper (nm, n, p, t, n_data_elts, n_data_elt_bytes); +} + +always_inline void * +vlib_process_signal_event_at_time (vlib_main_t * vm, + f64 dt, + uword node_index, + uword type_opaque, + uword n_data_elts, + uword n_data_elt_bytes) +{ + vlib_node_main_t * nm = &vm->node_main; + vlib_node_t * n = vlib_get_node (vm, node_index); + vlib_process_t * p = vec_elt (nm->processes, n->runtime_index); + uword * h, t; + + h = hash_get (p->event_type_index_by_type_opaque, type_opaque); + if (! 
h) + { + vlib_process_event_type_t * et = vlib_process_new_event_type (p, type_opaque); + t = et - p->event_type_pool; + hash_set (p->event_type_index_by_type_opaque, type_opaque, t); + } + else + t = h[0]; + + if (vlib_process_suspend_time_is_zero (dt)) + return vlib_process_signal_event_helper (nm, n, p, t, n_data_elts, n_data_elt_bytes); + else + { + vlib_signal_timed_event_data_t * te; + u64 dt_cpu = dt * vm->clib_time.clocks_per_second; + + pool_get_aligned (nm->signal_timed_event_data_pool, te, sizeof (te[0])); + + te->n_data_elts = n_data_elts; + te->n_data_elt_bytes = n_data_elt_bytes; + te->n_data_bytes = n_data_elts * n_data_elt_bytes; + + /* Assert that structure fields are big enough. */ + ASSERT (te->n_data_elts == n_data_elts); + ASSERT (te->n_data_elt_bytes == n_data_elt_bytes); + ASSERT (te->n_data_bytes == n_data_elts * n_data_elt_bytes); + + te->process_node_index = n->runtime_index; + te->event_type_index = t; + + timing_wheel_insert (&nm->timing_wheel, clib_cpu_time_now () + dt_cpu, + vlib_timing_wheel_data_set_timed_event (te - nm->signal_timed_event_data_pool)); + + /* Inline data big enough to hold event? 
*/ + if (te->n_data_bytes < sizeof (te->inline_event_data)) + return te->inline_event_data; + else + { + te->event_data_as_vector = 0; + vec_resize (te->event_data_as_vector, te->n_data_bytes); + return te->event_data_as_vector; + } + } +} + +always_inline void * +vlib_process_signal_one_time_event_data (vlib_main_t * vm, + uword node_index, + uword type_index, + uword n_data_elts, + uword n_data_elt_bytes) +{ + vlib_node_main_t * nm = &vm->node_main; + vlib_node_t * n = vlib_get_node (vm, node_index); + vlib_process_t * p = vec_elt (nm->processes, n->runtime_index); + return vlib_process_signal_event_helper (nm, n, p, type_index, n_data_elts, n_data_elt_bytes); +} + +always_inline void +vlib_process_signal_event (vlib_main_t * vm, + uword node_index, + uword type_opaque, + uword data) +{ + uword * d = vlib_process_signal_event_data (vm, node_index, type_opaque, + 1 /* elts */, sizeof (uword)); + d[0] = data; +} + +always_inline void +vlib_process_signal_event_pointer (vlib_main_t * vm, + uword node_index, + uword type_opaque, + void * data) +{ + void ** d = vlib_process_signal_event_data (vm, node_index, type_opaque, + 1 /* elts */, sizeof (data)); + d[0] = data; +} + +always_inline void +vlib_process_signal_one_time_event (vlib_main_t * vm, + uword node_index, + uword type_index, + uword data) +{ + uword * d = vlib_process_signal_one_time_event_data (vm, node_index, type_index, + 1 /* elts */, sizeof (uword)); + d[0] = data; +} + +always_inline void +vlib_signal_one_time_waiting_process (vlib_main_t * vm, vlib_one_time_waiting_process_t * p) +{ + vlib_process_signal_one_time_event (vm, p->node_index, p->one_time_event, /* data */ ~0); + memset (p, ~0, sizeof (p[0])); +} + +always_inline void +vlib_signal_one_time_waiting_process_vector (vlib_main_t * vm, + vlib_one_time_waiting_process_t ** wps) +{ + vlib_one_time_waiting_process_t * wp; + vec_foreach (wp, *wps) + vlib_signal_one_time_waiting_process (vm, wp); + vec_free (*wps); +} + +always_inline void 
+vlib_current_process_wait_for_one_time_event (vlib_main_t * vm, vlib_one_time_waiting_process_t * p) +{ + p->node_index = vlib_current_process (vm); + p->one_time_event = + vlib_process_create_one_time_event (vm, p->node_index, /* type opaque */ ~0); + vlib_process_wait_for_one_time_event (vm, + /* don't care about data */ 0, + p->one_time_event); +} + +always_inline void +vlib_current_process_wait_for_one_time_event_vector (vlib_main_t * vm, + vlib_one_time_waiting_process_t ** wps) +{ + vlib_one_time_waiting_process_t * wp; + vec_add2 (*wps, wp, 1); + vlib_current_process_wait_for_one_time_event (vm, wp); +} + +always_inline u32 +vlib_node_runtime_update_main_loop_vector_stats (vlib_main_t * vm, + vlib_node_runtime_t * node, + uword n_vectors) +{ + u32 i, d, vi0, vi1; + u32 i0, i1; + + ASSERT (is_pow2 (ARRAY_LEN (node->main_loop_vector_stats))); + i = ((vm->main_loop_count >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE) + & (ARRAY_LEN (node->main_loop_vector_stats) - 1)); + i0 = i ^ 0; + i1 = i ^ 1; + d = ((vm->main_loop_count >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE) + - (node->main_loop_count_last_dispatch >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE)); + vi0 = node->main_loop_vector_stats[i0]; + vi1 = node->main_loop_vector_stats[i1]; + vi0 = d == 0 ? vi0 : 0; + vi1 = d <= 1 ? vi1 : 0; + vi0 += n_vectors; + node->main_loop_vector_stats[i0] = vi0; + node->main_loop_vector_stats[i1] = vi1; + node->main_loop_count_last_dispatch = vm->main_loop_count; + /* Return previous counter. 
*/ + return node->main_loop_vector_stats[i1]; +} + +always_inline f64 +vlib_node_vectors_per_main_loop_as_float (vlib_main_t * vm, u32 node_index) +{ + vlib_node_runtime_t * rt = vlib_node_get_runtime (vm, node_index); + u32 v; + + v = vlib_node_runtime_update_main_loop_vector_stats (vm, rt, /* n_vectors */ 0); + return (f64) v / (1 << VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE); +} + +always_inline u32 +vlib_node_vectors_per_main_loop_as_integer (vlib_main_t * vm, u32 node_index) +{ + vlib_node_runtime_t * rt = vlib_node_get_runtime (vm, node_index); + u32 v; + + v = vlib_node_runtime_update_main_loop_vector_stats (vm, rt, /* n_vectors */ 0); + return v >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE; +} + +void +vlib_frame_free (vlib_main_t * vm, + vlib_node_runtime_t * r, + vlib_frame_t * f); + +/* Add next node to given node in given slot. */ +uword +vlib_node_add_next_with_slot (vlib_main_t * vm, + uword node, + uword next_node, + uword slot); + +/* As above but adds to end of node's next vector. */ +always_inline uword +vlib_node_add_next (vlib_main_t * vm, uword node, uword next_node) +{ return vlib_node_add_next_with_slot (vm, node, next_node, ~0); } + +/* Add next node to given node in given slot. */ +uword +vlib_node_add_named_next_with_slot (vlib_main_t * vm, + uword node, + char * next_name, + uword slot); + +/* As above but adds to end of node's next vector. */ +always_inline uword +vlib_node_add_named_next (vlib_main_t * vm, + uword node, + char * name) +{ return vlib_node_add_named_next_with_slot (vm, node, name, ~0); } + +/* Query node given name. */ +vlib_node_t * vlib_get_node_by_name (vlib_main_t * vm, u8 * name); + +/* Rename a node. */ +void vlib_node_rename (vlib_main_t * vm, u32 node_index, char * fmt, ...); + +/* Register new packet processing node. Nodes can be registered + dynamically via this call or statically via the VLIB_REGISTER_NODE + macro. 
*/ +u32 vlib_register_node (vlib_main_t * vm, vlib_node_registration_t * r); + +/* Register all static nodes registered via VLIB_REGISTER_NODE. */ +void vlib_register_all_static_nodes (vlib_main_t * vm); + +/* Start a process. */ +void vlib_start_process (vlib_main_t * vm, uword process_index); + +/* Sync up runtime and main node stats. */ +void +vlib_node_sync_stats (vlib_main_t * vm, vlib_node_t * n); + +/* Node graph initialization function. */ +clib_error_t * vlib_node_main_init (vlib_main_t * vm); + +format_function_t format_vlib_node_graph; +format_function_t format_vlib_node_name; +format_function_t format_vlib_next_node_name; +format_function_t format_vlib_node_and_next; +format_function_t format_vlib_cpu_time; +format_function_t format_vlib_time; +/* Parse node name -> node index. */ +unformat_function_t unformat_vlib_node; + +always_inline void +vlib_node_increment_counter (vlib_main_t *vm, u32 node_index, + u32 counter_index, u64 increment) +{ + vlib_node_t * n = vlib_get_node (vm, node_index); + vlib_error_main_t * em = &vm->error_main; + u32 node_counter_base_index = n->error_heap_index; + em->counters[node_counter_base_index + counter_index] += increment; +} + +#endif /* included_vlib_node_funcs_h */ diff --git a/vlib/vlib/parse.c b/vlib/vlib/parse.c new file mode 100644 index 00000000000..844be8aafe3 --- /dev/null +++ b/vlib/vlib/parse.c @@ -0,0 +1,980 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vlib/parse.h> + +#define PARSE_DEBUG 0 + +u16 word_type_index, number_type_index, eof_type_index, rule_eof_type_index, + plus_type_index, minus_type_index, star_type_index, slash_type_index, + lpar_type_index, rpar_type_index; + +u8 * format_vlib_parse_value (u8 * s, va_list * args) +{ + vlib_parse_main_t *pm = va_arg (*args, vlib_parse_main_t *); + vlib_parse_type_t *type; + vlib_parse_value_t *v; + u16 type_index; + + s = format (s, "%d items:\n", vec_len (pm->parse_value)); + vec_foreach (v, pm->parse_value) + { + type_index = v->type; + type = pool_elt_at_index (pm->parse_types, type_index); + if (type->format_value) + s = format (s, "[%d]: %U\n", v - pm->parse_value, + type->format_value, v); + else + s = format (s, "[%d]: (nofun)\n", v - pm->parse_value); + } + return s; +} + +static u8 * format_vlib_parse_match (u8 * s, va_list * args) +{ + vlib_parse_match_t m = va_arg (*args, vlib_parse_match_t); + char * t = 0; + switch (m) + { +#define _(a) case VLIB_PARSE_##a: t = #a; break; + foreach_parse_match_type +#undef _ + default: t = 0; break; + } + + if (t) + return format (s, "%s", t); + else + return format (s, "unknown 0x%x", m); +} + +static u8 * format_vlib_parse_item (u8 * s, va_list * args) +{ + vlib_parse_main_t *pm = va_arg (*args, vlib_parse_main_t *); + vlib_parse_item_t *item = va_arg (*args, vlib_parse_item_t *); + vlib_parse_type_t *type = pool_elt_at_index (pm->parse_types, item->type); + + if (item->type == word_type_index) + s = format (s, "%s", item->value.as_pointer); + else + s = format (s, "<%s>", type->name); + return s; +} + +static u8 * format_vlib_parse_graph (u8 * s, va_list * args) +{ + vlib_parse_main_t *pm = va_arg (*args, vlib_parse_main_t *); + vlib_parse_graph_t *node = va_arg (*args, vlib_parse_graph_t *); + vlib_parse_item_t *item; + vlib_parse_type_t *type; + + /* $$$ hash table */ + pool_foreach (type, 
pm->parse_types, + ({ + if (type->rule_index == node - pm->parse_graph) + s = format (s, "\n<%s>\n", type->name); + })); + + if (pm->root_index == (node - pm->parse_graph)) + s = format (s, "\n<root>\n"); + + item = pool_elt_at_index (pm->parse_items, node->item); + + s = format (s, "[%d] %U ", node - pm->parse_graph, + format_vlib_parse_item, pm, item); + + if (node->peer == (u32)~0) + s = format (s, "peer nil "); + else + s = format (s, "peer %4u ", node->peer); + + if (node->deeper == (u32)~0) + s = format (s, "deeper nil "); + else + s = format (s, "deeper %4u ", node->deeper); + + return s; +} + +void dump_parse_graph (void) +{ + vlib_parse_main_t *pm = &vlib_parse_main; + vlib_parse_graph_t *node; + + pool_foreach (node, pm->parse_graph, ({ + fformat(stdout, "%U\n", format_vlib_parse_graph, pm, node); + })); +} + +always_inline void +parse_cleanup_value (vlib_parse_main_t *pm, vlib_parse_value_t *pv) +{ + vlib_parse_type_t *type = pool_elt_at_index (pm->parse_types, pv->type); + if (type->value_cleanup_function) + type->value_cleanup_function (pv); +} + +static void parse_reset (vlib_parse_main_t *pm, u8 *input) +{ + vlib_lex_token_t *t; + vlib_parse_value_t *pv; + + vlib_lex_reset (pm->lex_main, input); + + vec_foreach (t, pm->tokens) + vlib_lex_cleanup_token (t); + + vec_foreach (pv, pm->parse_value) + parse_cleanup_value (pm, pv); + + _vec_len (pm->parse_value) = 0; + _vec_len (pm->tokens) = 0; + pm->current_token_index = 0; +} + +static void parse_help (vlib_parse_main_t *pm, u32 index) +{ + vlib_parse_graph_t *node; + vlib_parse_item_t *item; + vlib_parse_type_t *type; + vlib_main_t *vm = pm->vlib_main; + u8 *help_input; + int i; + + help_input = vec_dup (pm->lex_main->input_vector); + + for (i = vec_len(help_input)-1; i >= 0; i--) + if (help_input[i] == '?') + { + help_input[i] = 0; + _vec_len(help_input) = i; + break; + } + + for (i = vec_len(help_input)-1; i >= 0; i--) + { + if (help_input[i] != ' ' && help_input[i] != '\t') + break; + help_input[i] = 
0; + break; + } + _vec_len(help_input) = i+1; + + while (index != (u32)~0) + { + node = pool_elt_at_index (pm->parse_graph, index); + item = pool_elt_at_index (pm->parse_items, node->item); + type = pool_elt_at_index (pm->parse_types, item->type); + + if (item->type == eof_type_index && vec_len (pm->match_items) == 0) + /* do nothing */; + else if (item->type == word_type_index) + vlib_cli_output (vm, "%s %s\n", help_input, item->value.as_pointer); + else + vlib_cli_output (vm, "%s <%s>\n", help_input, type->name); + index = node->peer; + } + vec_free (help_input); +} + +static vlib_parse_match_t +parse_eval_internal (vlib_parse_main_t *pm, u32 index) +{ + vlib_parse_graph_t *node; + vlib_parse_item_t *item; + vlib_parse_type_t *type; + vlib_parse_value_t value, *pv; + vlib_parse_match_t rv; + u32 *partial_matches = 0; + vlib_lex_token_t *t; + u32 save_token_index=(u32)~0, save_match_items=0; + int had_value = 0; + + if (pm->current_token_index >= vec_len(pm->tokens)) + return VLIB_PARSE_MATCH_FAIL; + + /* current token */ + t = vec_elt_at_index (pm->tokens, pm->current_token_index); + + /* Help ? */ + if (PREDICT_FALSE(t->token == VLIB_LEX_qmark)) + { + parse_help (pm, index); + _vec_len (pm->match_items) = 0; + return VLIB_PARSE_MATCH_DONE; + } + + /* Across all peers at this level of the parse graph */ + while (index != (u32)~0) + { + node = pool_elt_at_index (pm->parse_graph, index); + item = pool_elt_at_index (pm->parse_items, node->item); + type = pool_elt_at_index (pm->parse_types, item->type); + + /* + * Save the token index. We may have to back up several + * trie plies. 
Type-specific match functions can consume + * multiple tokens, and they may not be optimally careful + */ + save_token_index = pm->current_token_index; + save_match_items = vec_len (pm->match_items); + vec_add1 (pm->match_items, node->item); + + if (PARSE_DEBUG > 1) + clib_warning ("Try to match token %U against node %d", + format_vlib_lex_token, pm->lex_main, t, index); + + /* Call the type-specific match function */ + rv = type->match_function (pm, type, t, &value); + + if (PARSE_DEBUG > 1) + clib_warning ("returned %U", format_vlib_parse_match, rv); + + switch (rv) + { + case VLIB_PARSE_MATCH_VALUE: + /* + * Matched, and returned a value to append to the + * set of args passed to the action function + */ + value.type = item->type; + vec_add1 (pm->parse_value, value); + had_value = 1; + /* fallthrough */ + + case VLIB_PARSE_MATCH_FULL: + unambiguous_partial_match: + /* Consume the matched token */ + pm->current_token_index++; + + /* continue matching along this path */ + rv = parse_eval_internal (pm, node->deeper); + + /* this is not the right path */ + if (rv == VLIB_PARSE_MATCH_FAIL) + { + if (had_value) + { + /* Delete the value */ + value = pm->parse_value [vec_len (pm->parse_value)-1]; + parse_cleanup_value (pm, &value); + _vec_len (pm->parse_value) -= 1; + } + /* Continue with the next sibling */ + pm->current_token_index = save_token_index; + _vec_len (pm->match_items) = save_match_items; + index = node->peer; + break; + } + return rv; + + case VLIB_PARSE_MATCH_PARTIAL: + /* Partial (substring) match, remember it but keep going */ + vec_add1 (partial_matches, node - pm->parse_graph); + index = node->peer; + break; + + case VLIB_PARSE_MATCH_FAIL: + /* Continue with the next sibling */ + index = node->peer; + _vec_len (pm->match_items) = save_match_items; + break; + + case VLIB_PARSE_MATCH_DONE: + /* Parse complete, invoke the action function */ + if (PARSE_DEBUG > 0) + clib_warning ("parse_value: %U", format_vlib_parse_value, pm); + + { + 
vlib_parse_eval_function_t * f = item->value.as_pointer; + if (f) + rv = f (pm, item, pm->parse_value); + } + + vec_foreach (pv, pm->parse_value) + parse_cleanup_value (pm, pv); + _vec_len (pm->parse_value) = 0; + _vec_len (pm->match_items) = 0; + return rv; + + case VLIB_PARSE_MATCH_AMBIGUOUS: + case VLIB_PARSE_MATCH_EVAL_FAIL: + case VLIB_PARSE_MATCH_RULE: + _vec_len (pm->match_items) = save_match_items; + return rv; + } + } + + /* + * Out of siblings. If we have exactly one partial match + * we win + */ + if (vec_len (partial_matches) == 1) + { + index = partial_matches[0]; + node = pool_elt_at_index (pm->parse_graph, index); + vec_free (partial_matches); + goto unambiguous_partial_match; + } + + /* Ordinary loser */ + rv = VLIB_PARSE_MATCH_FAIL; + + /* Ambiguous loser */ + if (vec_len (partial_matches) > 1) + { + vec_free (partial_matches); + rv = VLIB_PARSE_MATCH_AMBIGUOUS; + } + + _vec_len (pm->match_items) = save_match_items; + return rv; +} + +vlib_parse_match_t rule_match (vlib_parse_main_t *pm, vlib_parse_type_t *type, + vlib_lex_token_t *t, + vlib_parse_value_t *valuep) +{ + vlib_parse_match_t rv; + static int recursion_level; + + if (PARSE_DEBUG > 1) + clib_warning ("[%d]: try to match type %s graph index %d", + recursion_level, + type->name, + type->rule_index); + recursion_level++; + rv = parse_eval_internal (pm, type->rule_index); + recursion_level--; + + /* Break the recusive unwind here... 
*/ + if (rv == VLIB_PARSE_MATCH_RULE) + { + if (PARSE_DEBUG > 1) + clib_warning ("[%d]: type %s matched", recursion_level, type->name); + + return VLIB_PARSE_MATCH_FULL; + } + else + { + if (PARSE_DEBUG > 1) + clib_warning ("[%d]: type %s returns %U", recursion_level, type->name, + format_vlib_parse_match, rv); + } + return rv; +} + +static int parse_eval (vlib_parse_main_t *pm, u8 *input) +{ + vlib_lex_token_t * t; + + parse_reset (pm, input); + + /* Tokenize the entire input vector */ + do { + vec_add2 (pm->tokens, t, 1); + vlib_lex_get_token (pm->lex_main, t); + } while (t->token != VLIB_LEX_eof); + + /* Feed it to the parser */ + return parse_eval_internal (pm, pm->root_index); +} + +/* Temporary vlib stub */ +vlib_parse_match_t vlib_parse_eval (u8 *input) +{ + return parse_eval (&vlib_parse_main, input); +} + +u16 parse_type_find_or_create (vlib_parse_main_t *pm, vlib_parse_type_t *t) +{ + uword *p; + vlib_parse_type_t *n; + u8 *name_copy; + + p = hash_get_mem (pm->parse_type_by_name_hash, t->name); + if (p) + return p[0]; + + pool_get (pm->parse_types, n); + *n = *t; + n->rule_index = (u32) ~0; + + name_copy = format (0, "%s%c", n->name, 0); + + hash_set_mem (pm->parse_type_by_name_hash, name_copy, n - pm->parse_types); + return n - pm->parse_types; +} + +u16 parse_type_find_by_name (vlib_parse_main_t *pm, char *name) +{ + uword *p; + + p = hash_get_mem (pm->parse_type_by_name_hash, name); + if (p) + return p[0]; + + return (u16) ~0; +} + +u32 parse_item_find_or_create (vlib_parse_main_t *pm, vlib_parse_item_t *item) + +{ + uword *p; + vlib_parse_item_t *i; + + /* Exact match the entire item */ + p = mhash_get (&pm->parse_item_hash, item); + if (p) + return p[0]; + + pool_get (pm->parse_items, i); + *i = *item; + + mhash_set (&pm->parse_item_hash, i, i - pm->parse_items, 0); + return i - pm->parse_items; +} + +static void parse_type_and_graph_init (vlib_parse_main_t *pm) +{ + u32 eof_index; + vlib_parse_type_t type; + vlib_parse_item_t item; + + memset 
(&type, 0, sizeof (type)); + +#define foreach_token_type \ + _ (eof) \ + _ (rule_eof) \ + _ (word) \ + _ (number) \ + _ (plus) \ + _ (minus) \ + _ (star) \ + _ (slash) \ + _ (lpar) \ + _ (rpar) + +#define _(a) a##_type_index = parse_type_find_by_name (pm, #a); + foreach_token_type +#undef _ + + memset (&item, 0, sizeof (item)); + item.type = eof_type_index; + + eof_index = parse_item_find_or_create (pm, &item); + pm->root_index = (u32)~0; + +#if 0 + pool_get (pm->parse_graph, g); + memset (g, 0xff, sizeof (*g)); + g->item = eof_index; + pm->root_index = 0; +#endif +} + + + +static void tokenize (vlib_parse_main_t *pm, parse_registration_t *pr) +{ + vlib_lex_token_t *t; + pm->register_input = format (pm->register_input, + "%s%c", pr->initializer, 0); + + parse_reset (pm, pm->register_input); + + do { + vec_add2 (pm->tokens, t, 1); + vlib_lex_get_token (pm->lex_main, t); + } while (t->token != VLIB_LEX_eof); + _vec_len (pm->register_input) = 0; +} + +static int is_typed_rule (vlib_parse_main_t *pm) +{ + vlib_lex_token_t *t = vec_elt_at_index (pm->tokens, 0); + + /* <mytype> = blah blah blah */ + if (vec_len(pm->tokens) >= 4 + && t[0].token == VLIB_LEX_lt + && t[1].token == VLIB_LEX_word + && t[2].token == VLIB_LEX_gt + && t[3].token == VLIB_LEX_equals) + return 1; + return 0; +} + +static int token_matches_graph_node (vlib_parse_main_t *pm, + vlib_lex_token_t *t, + vlib_parse_graph_t *node, + vlib_parse_item_t *item, + vlib_parse_type_t *type, + u32 *token_increment) +{ + /* EOFs don't match */ + if (t->token == VLIB_LEX_eof) + return 0; + + /* New chain element is a word */ + if (t->token == VLIB_LEX_word) + { + /* but the item in hand is not a word */ + if (item->type != word_type_index) + return 0; + + /* Or it's not this particular word */ + if (strcmp (t->value.as_pointer, item->value.as_pointer)) + return 0; + *token_increment = 1; + return 1; + } + /* New chain element is a type-name: < TYPE-NAME > */ + if (t->token == VLIB_LEX_lt) + { + u16 token_type_index; 
+ + /* < TYPE > */ + if (t[1].token != VLIB_LEX_word || + t[2].token != VLIB_LEX_gt) + { + clib_warning (0, "broken type name in '%s'", pm->register_input); + return 0; + } + + token_type_index = parse_type_find_by_name (pm, t[1].value.as_pointer); + if (token_type_index == (u16)~0) + { + clib_warning (0, "unknown type '%s'", t[1].value.as_pointer); + return 0; + } + + /* Its a known type but does not match. */ + if (item->type != token_type_index) + return 0; + + *token_increment = 3; + return 1; + } + clib_warning ("BUG: t->token = %d", t->token); + return 0; +} + +u32 generate_subgraph_from_tokens (vlib_parse_main_t *pm, + vlib_lex_token_t *t, + u32 *new_subgraph_depth, + parse_registration_t *pr, + int not_a_rule) +{ + vlib_parse_graph_t *g, *last_g; + vlib_parse_item_t new_item; + u32 rv = (u32)~0, new_item_index, last_index = (u32)~0; + u16 token_type_index; + u32 depth = 0; + + while (t < pm->tokens + vec_len (pm->tokens)) + { + memset (&new_item, 0, sizeof (new_item)); + + if (t->token == VLIB_LEX_word) + { + new_item.type = word_type_index; + new_item.value.as_pointer = vec_dup ((u8 *) t->value.as_pointer); + new_item_index = parse_item_find_or_create (pm, &new_item); + t++; + } + else if (t->token == VLIB_LEX_lt) + { + if (t[1].token != VLIB_LEX_word || + t[2].token != VLIB_LEX_gt) + { + clib_warning ("broken type name in '%s'", pm->register_input); + goto screwed; + } + token_type_index = parse_type_find_by_name (pm, + t[1].value.as_pointer); + if (token_type_index == (u16)~0) + { + clib_warning ("unknown type 2 '%s'", t[1].value.as_pointer); + goto screwed; + } + + new_item.type = token_type_index; + new_item.value.as_pointer = 0; + new_item_index = parse_item_find_or_create (pm, &new_item); + t += 3; /* skip < <type-name> and > */ + } + else if (t->token == VLIB_LEX_eof) + { + screwed: + new_item.type = not_a_rule ? 
eof_type_index : rule_eof_type_index; + new_item.value.as_pointer = pr->eof_match; + new_item_index = parse_item_find_or_create (pm, &new_item); + t++; + } + else + { + clib_warning ("unexpected token %U index %d in '%s'", + format_vlib_lex_token, pm->lex_main, t, + t - pm->tokens, pm->register_input); + goto screwed; + } + + pool_get (pm->parse_graph, g); + memset (g, 0xff, sizeof (*g)); + g->item = new_item_index; + depth++; + + if (rv == (u32)~0) + { + rv = g - pm->parse_graph; + last_index = rv; + } + else + { + last_g = pool_elt_at_index (pm->parse_graph, last_index); + last_index = last_g->deeper = g - pm->parse_graph; + } + } + *new_subgraph_depth = depth; + return rv; +} + +static u32 measure_depth (vlib_parse_main_t *pm, u32 index) +{ + vlib_parse_graph_t *node; + vlib_parse_item_t *item; + u32 max=0; + u32 depth; + + if (index == (u32)~0) + return 0; + + node = pool_elt_at_index (pm->parse_graph, index); + item = pool_elt_at_index (pm->parse_items, node->item); + + if (item->type == eof_type_index) + return 1; + + while (index != (u32)~0) + { + node = pool_elt_at_index (pm->parse_graph, index); + depth = measure_depth (pm, node->deeper); + if (max < depth) + max = depth; + index = node->peer; + } + + return max + 1; +} + +static void add_subgraph_to_graph (vlib_parse_main_t *pm, + u32 last_matching_index, + u32 graph_root_index, + u32 new_subgraph_index, + u32 new_subgraph_depth) +{ + vlib_parse_graph_t *parent_node; + int new_subgraph_longest = 1; + u32 current_peer_index; + u32 current_depth; + vlib_parse_graph_t *current_peer = 0; + vlib_parse_graph_t *new_subgraph_node = + pool_elt_at_index (pm->parse_graph, new_subgraph_index); + + /* + * Case 1: top-level peer. 
Splice into the top-level + * peer chain according to rule depth + */ + if (last_matching_index == (u32)~0) + { + u32 index = graph_root_index; + while (1) { + current_peer = pool_elt_at_index (pm->parse_graph, index); + current_depth = measure_depth (pm, index); + if (current_depth < new_subgraph_depth + || current_peer->peer == (u32)~0) + break; + index = current_peer->peer; + } + new_subgraph_node->peer = current_peer->peer; + current_peer->peer = new_subgraph_index; + return; + } + + parent_node = pool_elt_at_index (pm->parse_graph, last_matching_index); + current_peer_index = parent_node->deeper; + + while (current_peer_index != (u32)~0) + { + current_peer = pool_elt_at_index (pm->parse_graph, current_peer_index); + current_depth = measure_depth (pm, current_peer_index); + if (current_depth < new_subgraph_depth) + break; + new_subgraph_longest = 0; + current_peer_index = current_peer->peer; + } + + ASSERT (current_peer); + + if (new_subgraph_longest) + { + new_subgraph_node->peer = parent_node->deeper; + parent_node->deeper = new_subgraph_index; + } + else + { + new_subgraph_node->peer = current_peer->peer; + current_peer->peer = new_subgraph_index; + } +} + +static clib_error_t * +parse_register_one (vlib_parse_main_t *pm, parse_registration_t *pr) +{ + u32 graph_root_index; + u16 subgraph_type_index = (u16)~0; + vlib_parse_type_t *subgraph_type = 0; + vlib_lex_token_t *t; + vlib_parse_graph_t *node; + u32 node_index, last_index, token_increment, new_subgraph_index; + u32 new_subgraph_depth, last_matching_index; + vlib_parse_item_t *item; + vlib_parse_type_t *type; + + int use_main_graph = 1; + + tokenize (pm, pr); + + /* A typed rule? 
*/ + if (is_typed_rule (pm)) + { + /* Get the type and its current subgraph root, if any */ + t = vec_elt_at_index (pm->tokens, 1); + subgraph_type_index = parse_type_find_by_name (pm, t->value.as_pointer); + if (subgraph_type_index == (u16)~0) + return clib_error_return (0, "undeclared type '%s'", + t->value.as_pointer); + subgraph_type = pool_elt_at_index (pm->parse_types, subgraph_type_index); + graph_root_index = subgraph_type->rule_index; + /* Skip "mytype> = */ + t += 3; + use_main_graph = 0; + } + else + { + /* top-level graph */ + graph_root_index = pm->root_index; + t = vec_elt_at_index (pm->tokens, 0); + } + + last_matching_index = (u32)~0; + last_index = node_index = graph_root_index; + + /* Find the first token which isn't already being parsed */ + while (t < pm->tokens + vec_len (pm->tokens) && node_index != (u32) ~0) + { + node = pool_elt_at_index (pm->parse_graph, node_index); + item = pool_elt_at_index (pm->parse_items, node->item); + type = pool_elt_at_index (pm->parse_types, item->type); + last_index = node_index; + + if (token_matches_graph_node (pm, t, node, item, type, &token_increment)) + { + t += token_increment; + last_matching_index = node_index; + node_index = node->deeper; + } + else + node_index = node->peer; + } + + new_subgraph_index = + generate_subgraph_from_tokens (pm, t, &new_subgraph_depth, pr, + use_main_graph); + + /* trivial cases: first graph node or first type rule */ + if (graph_root_index == (u32)~0) + { + if (use_main_graph) + pm->root_index = new_subgraph_index; + else + subgraph_type->rule_index = new_subgraph_index; + return 0; + } + + add_subgraph_to_graph (pm, last_matching_index, graph_root_index, + new_subgraph_index, + new_subgraph_depth); + return 0; +} + +static clib_error_t * +parse_register (vlib_main_t * vm, + parse_registration_t * lo, + parse_registration_t * hi, + vlib_parse_main_t *pm) +{ + parse_registration_t * pr; + + for (pr = lo; pr < hi; pr = vlib_elf_section_data_next (pr, 0)) + vec_add1 
(pm->parse_registrations, pr); + + return 0; +} + +static clib_error_t * +parse_register_one_type (vlib_parse_main_t *pm, vlib_parse_type_t *rp) +{ + (void) parse_type_find_or_create (pm, (vlib_parse_type_t *)rp); + return 0; +} + +static clib_error_t * +parse_type_register (vlib_main_t * vm, + vlib_parse_type_t * lo, + vlib_parse_type_t * hi, + vlib_parse_main_t *pm) +{ + clib_error_t * error = 0; + vlib_parse_type_t * ptr; + + for (ptr = lo; ptr < hi; ptr = vlib_elf_section_data_next (ptr, 0)) { + error = parse_register_one_type (pm, ptr); + if (error) + goto done; + } + + done: + return error; +} + +clib_error_t *vlib_stdlex_init (vlib_main_t *vm) __attribute__((weak)); +clib_error_t *vlib_stdlex_init (vlib_main_t *vm) +{ + (void) vlib_lex_add_table ("ignore_everything"); + return 0; +} + +static int compute_rule_length (parse_registration_t *r) +{ + int length, i; + vlib_parse_main_t *pm = &vlib_parse_main; + + if (r->rule_length) + return r->rule_length; + + length = 0; + + tokenize (pm, r); + length = vec_len (pm->tokens); + + /* Account for "<foo> = " in "<foo> = bar" etc. 
*/ + if (is_typed_rule (pm)) + length -= 2; + + for (i = 0; i < vec_len (pm->tokens); i++) + { + switch (pm->tokens[i].token) + { + case VLIB_LEX_lt: + case VLIB_LEX_gt: + length -= 1; + + default: + break; + } + } + + ASSERT (length > 0); + r->rule_length = length; + return length; +} + +static int rule_length_compare (parse_registration_t *r1, + parse_registration_t *r2) +{ + compute_rule_length (r1); + compute_rule_length (r2); + /* Descending sort */ + return r2->rule_length - r1->rule_length; +} + + +static clib_error_t * parse_init (vlib_main_t *vm) +{ + vlib_parse_main_t *pm = &vlib_parse_main; + vlib_lex_main_t *lm = &vlib_lex_main; + vlib_elf_section_bounds_t * b, * bounds; + clib_error_t * error = 0; + parse_registration_t *rule; + int i; + + if ((error = vlib_call_init_function (vm, lex_onetime_init))) + return error; + + if ((error = vlib_stdlex_init(vm))) + return error; + + if ((error = vlib_call_init_function (vm, parse_builtin_init))) + return error; + + pm->vlib_main = vm; + pm->lex_main = lm; + + mhash_init (&pm->parse_item_hash, sizeof (u32), sizeof (vlib_parse_item_t)); + pm->parse_type_by_name_hash = hash_create_string (0, sizeof (u32)); + + vec_validate (pm->parse_value, 16); + vec_validate (pm->tokens, 16); + vec_validate (pm->register_input, 32); + vec_validate (pm->match_items, 16); + + _vec_len (pm->parse_value) = 0; + _vec_len (pm->tokens) = 0; + _vec_len (pm->register_input) = 0; + _vec_len (pm->match_items) = 0; + + bounds = vlib_get_elf_section_bounds (vm, "parse_type_registrations"); + vec_foreach (b, bounds) + { + error = parse_type_register (vm, b->lo, b->hi, pm); + if (error) + break; + } + vec_free (bounds); + + parse_type_and_graph_init (pm); + + bounds = vlib_get_elf_section_bounds (vm, "parse_registrations"); + vec_foreach (b, bounds) + { + error = parse_register (vm, b->lo, b->hi, pm); + if (error) + break; + } + vec_free (bounds); + + vec_sort (pm->parse_registrations, r1, r2, + rule_length_compare (r1[0], r2[0])); + + for (i 
= 0; i < vec_len (pm->parse_registrations); i++) + { + rule = pm->parse_registrations[i]; + parse_register_one (pm, rule); + } + + return error; +} + +VLIB_INIT_FUNCTION (parse_init); diff --git a/vlib/vlib/parse.h b/vlib/vlib/parse.h new file mode 100644 index 00000000000..5b9acebf774 --- /dev/null +++ b/vlib/vlib/parse.h @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_vlib_parse_h +#define included_vlib_parse_h + +#include <vlib/vlib.h> +#include <vlib/lex.h> +#include <vppinfra/mhash.h> + +typedef struct { + /* Word aligned value. */ + union { + u8 as_u8[32 - 1 * sizeof (u16)]; + void * as_pointer; + uword as_uword; + word as_word; + u64 as_u64; + } value; + + /* 16 bit type at end so that 30 bytes of value are aligned. */ + u16 type; +} __attribute ((packed)) vlib_parse_value_t; + +/* Instance of a type. */ +typedef struct { + u32 type; + + u32 origin; + + u32 help_index; + + union { + void * as_pointer; + uword as_uword; + } value; +} vlib_parse_item_t; + +typedef struct { + /* Index of item for this node. */ + u32 item; + + /* Graph index of peer (sibling) node (linked list of peers). */ + u32 peer; + + /* Graph index of deeper (child) node (linked list of children). 
*/ + u32 deeper; +} vlib_parse_graph_t; + +#define foreach_parse_match_type \ + _(MATCH_DONE) \ + _(MATCH_RULE) \ + _(MATCH_FAIL) \ + _(MATCH_FULL) \ + _(MATCH_VALUE) \ + _(MATCH_PARTIAL) \ + _(MATCH_AMBIGUOUS) \ + _(MATCH_EVAL_FAIL) + +typedef enum { +#define _(a) VLIB_PARSE_##a, + foreach_parse_match_type +#undef _ +} vlib_parse_match_t; + +struct vlib_parse_type; +struct vlib_parse_main; + +typedef vlib_parse_match_t (vlib_parse_match_function_t) + (struct vlib_parse_main *, + struct vlib_parse_type *, + vlib_lex_token_t *, + vlib_parse_value_t *); +typedef void (vlib_parse_value_cleanup_function_t) (vlib_parse_value_t *); + +typedef struct vlib_parse_type { + /* Type name. */ + char * name; + + vlib_parse_match_function_t * match_function; + + vlib_parse_value_cleanup_function_t * value_cleanup_function; + + format_function_t * format_value; + + u32 rule_index; +} vlib_parse_type_t; + +typedef struct { + char *initializer; + void * eof_match; + int rule_length; +} parse_registration_t; + +typedef struct vlib_parse_main { + /* (type, origin, help, value) tuples */ + vlib_parse_item_t *parse_items; + mhash_t parse_item_hash; + + /* (item, peer, deeper) tuples */ + vlib_parse_graph_t *parse_graph; + u32 root_index; + + u8 *register_input; + + /* parser types */ + vlib_parse_type_t * parse_types; + uword *parse_type_by_name_hash; + + /* Vector of MATCH_VALUEs */ + vlib_parse_value_t * parse_value; + u32 * match_items; + + /* Parse registrations */ + parse_registration_t **parse_registrations; + + /* Token vector */ + vlib_lex_token_t *tokens; + u32 current_token_index; + + vlib_lex_main_t *lex_main; + vlib_main_t *vlib_main; +} vlib_parse_main_t; + +vlib_parse_main_t vlib_parse_main; + +typedef vlib_parse_match_t (vlib_parse_eval_function_t) + (vlib_parse_main_t *, + vlib_parse_item_t *, + vlib_parse_value_t *); + +vlib_parse_match_t vlib_parse_eval (u8 * input); + +format_function_t format_vlib_parse_value; + +/* FIXME need these to be global? 
*/ +vlib_parse_match_function_t rule_match, eof_match, word_match, number_match; + +#define _PARSE_REGISTRATION_DATA(x) \ +VLIB_ELF_SECTION_DATA(x##_registration,parse_registration_t,parse_registrations) + +#define PARSE_INIT(x, s, e) \ +static _PARSE_REGISTRATION_DATA(x) = { \ + .initializer = s, \ + .eof_match = e, \ +}; + +#define _PARSE_TYPE_REGISTRATION_DATA(x) \ +VLIB_ELF_SECTION_DATA(x##_type_registration,vlib_parse_type_t, \ +parse_type_registrations) + +#define PARSE_TYPE_INIT(n, m, c, f) \ +static _PARSE_TYPE_REGISTRATION_DATA(n) = { \ + .name = #n, \ + .match_function = m, \ + .value_cleanup_function = c, \ + .format_value = f, \ +}; + +#endif /* included_vlib_parse_h */ diff --git a/vlib/vlib/parse_builtin.c b/vlib/vlib/parse_builtin.c new file mode 100644 index 00000000000..df830db4e21 --- /dev/null +++ b/vlib/vlib/parse_builtin.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vlib/parse.h> + +always_inline void * +parse_last_match_value (vlib_parse_main_t * pm) +{ + vlib_parse_item_t * i; + i = pool_elt_at_index (pm->parse_items, + vec_elt (pm->match_items, vec_len (pm->match_items) - 1)); + return i->value.as_pointer; +} + +vlib_parse_match_t eof_match (vlib_parse_main_t *pm, vlib_parse_type_t *type, + vlib_lex_token_t *t, vlib_parse_value_t *valuep) +{ return t->token == VLIB_LEX_eof ? 
VLIB_PARSE_MATCH_DONE : VLIB_PARSE_MATCH_FAIL; } + +PARSE_TYPE_INIT (eof, eof_match, 0 /* cleanup value */, 0 /* format value */); + +vlib_parse_match_t rule_eof_match (vlib_parse_main_t *pm, vlib_parse_type_t *type, + vlib_lex_token_t *t, vlib_parse_value_t *valuep) +{ + vlib_parse_match_function_t * fp = parse_last_match_value (pm); + pm->current_token_index--; + return fp ? fp (pm, type, t, valuep) : VLIB_PARSE_MATCH_RULE; +} + +PARSE_TYPE_INIT (rule_eof, rule_eof_match, 0, 0); + +vlib_parse_match_t word_match (vlib_parse_main_t *pm, vlib_parse_type_t *type, + vlib_lex_token_t *t, vlib_parse_value_t *valuep) +{ + u8 * tv, * iv; + int i; + + if (t->token != VLIB_LEX_word) + return VLIB_PARSE_MATCH_FAIL; + + tv = t->value.as_pointer; + iv = parse_last_match_value (pm); + + for (i = 0; tv[i]; i++) + { + if (tv[i] != iv[i]) + return VLIB_PARSE_MATCH_FAIL; + } + + return iv[i] == 0 ? VLIB_PARSE_MATCH_FULL : VLIB_PARSE_MATCH_PARTIAL; +} + +PARSE_TYPE_INIT (word, word_match, 0 /* clnup value */, 0 /* format value */); + +vlib_parse_match_t number_match (vlib_parse_main_t *pm, vlib_parse_type_t *type, + vlib_lex_token_t *t, vlib_parse_value_t *valuep) +{ + if (t->token == VLIB_LEX_number) + { + valuep->value.as_uword = t->value.as_uword; + return VLIB_PARSE_MATCH_VALUE; + } + return VLIB_PARSE_MATCH_FAIL; +} + +static u8 * format_value_number (u8 * s, va_list * args) +{ + vlib_parse_value_t * v = va_arg (*args, vlib_parse_value_t *); + uword a = v->value.as_uword; + + if (BITS(uword) == 64) + s = format (s, "%lld(0x%llx)", a, a); + else + s = format (s, "%ld(0x%lx)", a, a); + return s; +} + +PARSE_TYPE_INIT (number, number_match, 0 /* cln value */, + format_value_number /* fmt value */); + + +#define foreach_vanilla_lex_match_function \ + _(plus) \ + _(minus) \ + _(star) \ + _(slash) \ + _(lpar) \ + _(rpar) + +#define LEX_MATCH_DEBUG 0 + +#define _(name) \ +vlib_parse_match_t name##_match (vlib_parse_main_t *pm, \ + vlib_parse_type_t *type, \ + vlib_lex_token_t *t, \ + 
vlib_parse_value_t *valuep) \ +{ \ + if (LEX_MATCH_DEBUG > 0) \ + clib_warning ("against %U returns %s", \ + format_vlib_lex_token, pm->lex_main, t, \ + (t->token == VLIB_LEX_##name) \ + ? "VLIB_PARSE_MATCH_FULL" : \ + "VLIB_PARSE_MATCH_FAIL"); \ + if (t->token == VLIB_LEX_##name) \ + return VLIB_PARSE_MATCH_FULL; \ + return VLIB_PARSE_MATCH_FAIL; \ +} \ + \ +PARSE_TYPE_INIT (name, name##_match, 0 /* cln value */, \ + 0 /* fmt val */); + +foreach_vanilla_lex_match_function +#undef _ + +/* So we're linked in. */ +static clib_error_t * +parse_builtin_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (parse_builtin_init); diff --git a/vlib/vlib/physmem.h b/vlib/vlib/physmem.h new file mode 100644 index 00000000000..6e70291c1d9 --- /dev/null +++ b/vlib/vlib/physmem.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * physmem.h: virtual <-> physical memory mapping for VLIB buffers + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_physmem_h +#define included_vlib_physmem_h + +typedef struct { + uword start, end, size; +} vlib_physmem_region_t; + +typedef struct { + vlib_physmem_region_t virtual; + + uword log2_n_bytes_per_page; + + /* 1 << log2_n_bytes_per_page - 1. 
*/ + uword page_mask; + + u64 * page_table; +} vlib_physmem_main_t; + +always_inline u64 +vlib_physmem_offset_to_physical (vlib_physmem_main_t * pm, uword o) +{ + uword page_index = o >> pm->log2_n_bytes_per_page; + ASSERT (o < pm->virtual.size); + ASSERT (pm->page_table[page_index] != 0); + return (vec_elt (pm->page_table, page_index) + (o & pm->page_mask)); +} + +always_inline int +vlib_physmem_is_virtual (vlib_physmem_main_t * pm, uword p) +{ return p >= pm->virtual.start && p < pm->virtual.end; } + +always_inline uword +vlib_physmem_offset_of (vlib_physmem_main_t * pm, void * p) +{ + uword a = pointer_to_uword (p); + uword o; + + ASSERT (vlib_physmem_is_virtual (pm, a)); + o = a - pm->virtual.start; + + /* Offset must fit in 32 bits. */ + ASSERT ((uword) o == a - pm->virtual.start); + + return o; +} + +always_inline void * +vlib_physmem_at_offset (vlib_physmem_main_t * pm, uword offset) +{ + ASSERT (offset < pm->virtual.size); + return uword_to_pointer (pm->virtual.start + offset, void *); +} + +#endif /* included_vlib_physmem_h */ diff --git a/vlib/vlib/threads.c b/vlib/vlib/threads.c new file mode 100644 index 00000000000..4621f843dd5 --- /dev/null +++ b/vlib/vlib/threads.c @@ -0,0 +1,1166 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <signal.h> +#include <math.h> +#include <vppinfra/format.h> +#include <vlib/vlib.h> + +#include <vlib/threads.h> +#include <vlib/unix/physmem.h> + +#include <vlib/unix/cj.h> + +#if DPDK==1 +#include <rte_config.h> +#include <rte_common.h> +#include <rte_eal.h> +#include <rte_launch.h> +#include <rte_lcore.h> +#endif +DECLARE_CJ_GLOBAL_LOG; + +#define FRAME_QUEUE_NELTS 32 + + +#if DPDK==1 +/* + * Weak definitions of DPDK symbols used in this file. + * Needed for linking test programs without DPDK libs. + */ +unsigned __thread __attribute__((weak)) RTE_PER_LCORE(_lcore_id); +struct lcore_config __attribute__((weak)) lcore_config[]; +unsigned __attribute__((weak)) rte_socket_id(); +int __attribute__((weak)) rte_eal_remote_launch(); +#endif +u32 vl(void *p) +{ + return vec_len (p); +} + +void debug_hex_bytes (u8 *s, u32 n) +{ + fformat (stderr, "%U\n", format_hex_bytes, s, n); +} + +vlib_thread_main_t vlib_thread_main; + +uword +os_get_cpu_number (void) +{ + void * sp; + uword n; + u32 len; + + len = vec_len (vlib_thread_stacks); + if (len == 0) + return 0; + + /* Get any old stack address. */ + sp = &sp; + + n = ((uword)sp - (uword)vlib_thread_stacks[0]) + >> VLIB_LOG2_THREAD_STACK_SIZE; + + /* "processes" have their own stacks, and they always run in thread 0 */ + n = n >= len ? 
0 : n; + + return n; +} + +void +vlib_set_thread_name (char *name) +{ + int pthread_setname_np (pthread_t __target_thread, const char *__name); + pthread_t thread = pthread_self(); + + if (thread) + pthread_setname_np(thread, name); +} + +static int sort_registrations_by_no_clone (void *a0, void * a1) +{ + vlib_thread_registration_t ** tr0 = a0; + vlib_thread_registration_t ** tr1 = a1; + + return ((i32)((*tr0)->no_data_structure_clone) + - ((i32)((*tr1)->no_data_structure_clone))); +} + +static uword * +vlib_sysfs_list_to_bitmap(char * filename) +{ + FILE *fp; + uword *r = 0; + + fp = fopen (filename, "r"); + + if (fp != NULL) + { + u8 * buffer = 0; + vec_validate (buffer, 256-1); + if (fgets ((char *)buffer, 256, fp)) + { + unformat_input_t in; + unformat_init_string (&in, (char *) buffer, strlen ((char *) buffer)); + unformat(&in, "%U", unformat_bitmap_list, &r); + unformat_free (&in); + } + vec_free(buffer); + fclose(fp); + } + return r; +} + + +/* Called early in the init sequence */ + +clib_error_t * +vlib_thread_init (vlib_main_t * vm) +{ + vlib_thread_main_t * tm = &vlib_thread_main; + vlib_worker_thread_t * w; + vlib_thread_registration_t * tr; + u32 n_vlib_mains = 1; + u32 first_index = 1; + u32 i; + uword * avail_cpu; + + /* get bitmaps of active cpu cores and sockets */ + tm->cpu_core_bitmap = + vlib_sysfs_list_to_bitmap("/sys/devices/system/cpu/online"); + tm->cpu_socket_bitmap = + vlib_sysfs_list_to_bitmap("/sys/devices/system/node/online"); + + avail_cpu = clib_bitmap_dup(tm->cpu_core_bitmap); + + /* skip cores */ + for (i=0; i < tm->skip_cores; i++) + { + uword c = clib_bitmap_first_set(avail_cpu); + if (c == ~0) + return clib_error_return (0, "no available cpus to skip"); + + avail_cpu = clib_bitmap_set(avail_cpu, c, 0); + } + + /* grab cpu for main thread */ + if (!tm->main_lcore) + { + tm->main_lcore = clib_bitmap_first_set(avail_cpu); + if (tm->main_lcore == ~0) + return clib_error_return (0, "no available cpus to be used for the" + " main 
thread"); + } + else + { + if (clib_bitmap_get(avail_cpu, tm->main_lcore) == 0) + return clib_error_return (0, "cpu %u is not available to be used" + " for the main thread", tm->main_lcore); + } + avail_cpu = clib_bitmap_set(avail_cpu, tm->main_lcore, 0); + + /* assume that there is socket 0 only if there is no data from sysfs */ + if (!tm->cpu_socket_bitmap) + tm->cpu_socket_bitmap = clib_bitmap_set(0, 0, 1); + + /* as many threads as stacks... */ + vec_validate_aligned (vlib_worker_threads, vec_len(vlib_thread_stacks)-1, + CLIB_CACHE_LINE_BYTES); + + /* Preallocate thread 0 */ + _vec_len(vlib_worker_threads) = 1; + w = vlib_worker_threads; + w->thread_mheap = clib_mem_get_heap(); + w->thread_stack = vlib_thread_stacks[0]; + w->dpdk_lcore_id = -1; + w->lwp = syscall(SYS_gettid); + tm->n_vlib_mains = 1; + + /* assign threads to cores and set n_vlib_mains */ + tr = tm->next; + + while (tr) + { + vec_add1 (tm->registrations, tr); + tr = tr->next; + } + + vec_sort_with_function + (tm->registrations, sort_registrations_by_no_clone); + + for (i = 0; i < vec_len (tm->registrations); i++) + { + int j; + tr = tm->registrations[i]; + tr->first_index = first_index; + first_index += tr->count; + n_vlib_mains += (tr->no_data_structure_clone == 0) ? 
tr->count : 0; + + /* construct coremask */ + if (tr->use_pthreads || !tr->count) + continue; + + if (tr->coremask) + { + uword c; + clib_bitmap_foreach (c, tr->coremask, ({ + if (clib_bitmap_get(avail_cpu, c) == 0) + return clib_error_return (0, "cpu %u is not available to be used" + " for the '%s' thread",c, tr->name); + + avail_cpu = clib_bitmap_set(avail_cpu, c, 0); + })); + + } + else + { + for (j=0; j < tr->count; j++) + { + uword c = clib_bitmap_first_set(avail_cpu); + if (c == ~0) + return clib_error_return (0, "no available cpus to be used for" + " the '%s' thread", tr->name); + + avail_cpu = clib_bitmap_set(avail_cpu, c, 0); + tr->coremask = clib_bitmap_set(tr->coremask, c, 1); + } + } + } + + clib_bitmap_free(avail_cpu); + + tm->n_vlib_mains = n_vlib_mains; + + vec_validate_aligned (vlib_worker_threads, first_index-1, + CLIB_CACHE_LINE_BYTES); + + + tm->efd.enabled = VLIB_EFD_DISABLED; + tm->efd.queue_hi_thresh = ((VLIB_EFD_DEF_WORKER_HI_THRESH_PCT * + FRAME_QUEUE_NELTS)/100); + return 0; +} + +vlib_worker_thread_t * +vlib_alloc_thread (vlib_main_t * vm) +{ + vlib_worker_thread_t * w; + + if (vec_len(vlib_worker_threads) >= vec_len (vlib_thread_stacks)) + { + clib_warning ("out of worker threads... 
Quitting..."); + exit(1); + } + vec_add2 (vlib_worker_threads, w, 1); + w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; + return w; +} + +vlib_frame_queue_t * vlib_frame_queue_alloc (int nelts) +{ + vlib_frame_queue_t * fq; + + fq = clib_mem_alloc_aligned(sizeof (*fq), CLIB_CACHE_LINE_BYTES); + memset (fq, 0, sizeof (*fq)); + fq->nelts = nelts; + fq->vector_threshold = 128; // packets + vec_validate_aligned (fq->elts, nelts-1, CLIB_CACHE_LINE_BYTES); + + if (1) + { + if (((uword)&fq->tail) & (CLIB_CACHE_LINE_BYTES - 1)) + fformat(stderr, "WARNING: fq->tail unaligned\n"); + if (((uword)&fq->head) & (CLIB_CACHE_LINE_BYTES - 1)) + fformat(stderr, "WARNING: fq->head unaligned\n"); + if (((uword)fq->elts) & (CLIB_CACHE_LINE_BYTES - 1)) + fformat(stderr, "WARNING: fq->elts unaligned\n"); + + if (sizeof (fq->elts[0]) % CLIB_CACHE_LINE_BYTES) + fformat(stderr, "WARNING: fq->elts[0] size %d\n", + sizeof (fq->elts[0])); + if (nelts & (nelts -1)) + { + fformat (stderr, "FATAL: nelts MUST be a power of 2\n"); + abort(); + } + } + + return (fq); +} + +void vl_msg_api_handler_no_free (void *) __attribute__ ((weak)); +void vl_msg_api_handler_no_free (void *v) { } + +/* Turned off, save as reference material... 
*/ +#if 0 +static inline int vlib_frame_queue_dequeue_internal (int thread_id, + vlib_main_t *vm, + vlib_node_main_t *nm) +{ + vlib_frame_queue_t *fq = vlib_frame_queues[thread_id]; + vlib_frame_queue_elt_t *elt; + vlib_frame_t *f; + vlib_pending_frame_t *p; + vlib_node_runtime_t *r; + u32 node_runtime_index; + int msg_type; + u64 before; + int processed = 0; + + ASSERT(vm == vlib_mains[thread_id]); + + while (1) + { + if (fq->head == fq->tail) + return processed; + + elt = fq->elts + ((fq->head+1) & (fq->nelts-1)); + + if (!elt->valid) + return processed; + + before = clib_cpu_time_now(); + + f = elt->frame; + node_runtime_index = elt->node_runtime_index; + msg_type = elt->msg_type; + + switch (msg_type) + { + case VLIB_FRAME_QUEUE_ELT_FREE_BUFFERS: + vlib_buffer_free (vm, vlib_frame_vector_args (f), f->n_vectors); + /* note fallthrough... */ + case VLIB_FRAME_QUEUE_ELT_FREE_FRAME: + r = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL], + node_runtime_index); + vlib_frame_free (vm, r, f); + break; + case VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME: + vec_add2 (vm->node_main.pending_frames, p, 1); + f->flags |= (VLIB_FRAME_PENDING | VLIB_FRAME_FREE_AFTER_DISPATCH); + p->node_runtime_index = elt->node_runtime_index; + p->frame_index = vlib_frame_index (vm, f); + p->next_frame_index = VLIB_PENDING_FRAME_NO_NEXT_FRAME; + fq->dequeue_vectors += (u64) f->n_vectors; + break; + case VLIB_FRAME_QUEUE_ELT_API_MSG: + vl_msg_api_handler_no_free (f); + break; + default: + clib_warning ("bogus frame queue message, type %d", msg_type); + break; + } + elt->valid = 0; + fq->dequeues++; + fq->dequeue_ticks += clib_cpu_time_now() - before; + CLIB_MEMORY_BARRIER(); + fq->head++; + processed++; + } + ASSERT(0); + return processed; +} + +int vlib_frame_queue_dequeue (int thread_id, + vlib_main_t *vm, + vlib_node_main_t *nm) +{ + return vlib_frame_queue_dequeue_internal (thread_id, vm, nm); +} + +int vlib_frame_queue_enqueue (vlib_main_t *vm, u32 node_runtime_index, + u32 
frame_queue_index, vlib_frame_t *frame, + vlib_frame_queue_msg_type_t type) +{ + vlib_frame_queue_t *fq = vlib_frame_queues[frame_queue_index]; + vlib_frame_queue_elt_t *elt; + u32 save_count; + u64 new_tail; + u64 before = clib_cpu_time_now(); + + ASSERT (fq); + + new_tail = __sync_add_and_fetch (&fq->tail, 1); + + /* Wait until a ring slot is available */ + while (new_tail >= fq->head + fq->nelts) + { + f64 b4 = vlib_time_now_ticks (vm, before); + vlib_worker_thread_barrier_check (vm, b4); + /* Bad idea. Dequeue -> enqueue -> dequeue -> trouble */ + // vlib_frame_queue_dequeue (vm->cpu_index, vm, nm); + } + + elt = fq->elts + (new_tail & (fq->nelts-1)); + + /* this would be very bad... */ + while (elt->valid) + { + } + + /* Once we enqueue the frame, frame->n_vectors is owned elsewhere... */ + save_count = frame->n_vectors; + + elt->frame = frame; + elt->node_runtime_index = node_runtime_index; + elt->msg_type = type; + CLIB_MEMORY_BARRIER(); + elt->valid = 1; + + return save_count; +} +#endif /* 0 */ + +/* To be called by vlib worker threads upon startup */ +void vlib_worker_thread_init (vlib_worker_thread_t * w) +{ + vlib_thread_main_t *tm = vlib_get_thread_main(); + + /* worker threads wants no signals. 
*/ + { + sigset_t s; + sigfillset (&s); + pthread_sigmask (SIG_SETMASK, &s, 0); + } + + clib_mem_set_heap (w->thread_mheap); + + if (vec_len(tm->thread_prefix) && w->registration->short_name) + { + w->name = format(0, "%v_%s_%d%c", tm->thread_prefix, + w->registration->short_name, + w->instance_id, + '\0'); + vlib_set_thread_name((char *)w->name); + } + + if (!w->registration->use_pthreads) + { + + /* Initial barrier sync, for both worker and i/o threads */ + clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, 1); + + while (*vlib_worker_threads->wait_at_barrier) + ; + + clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, -1); + } +} + +void *vlib_worker_thread_bootstrap_fn (void *arg) +{ + void *rv; + vlib_worker_thread_t *w = arg; + + w->lwp = syscall(SYS_gettid); + w->dpdk_lcore_id = -1; +#if DPDK==1 + if (w->registration && !w->registration->use_pthreads && + rte_socket_id) /* do we really have dpdk linked */ + { + unsigned lcore = rte_lcore_id(); + lcore = lcore < RTE_MAX_LCORE ? 
lcore : -1; + w->dpdk_lcore_id = lcore; + } +#endif + + rv = (void *) clib_calljmp + ((uword (*)(uword)) w->thread_function, + (uword) arg, w->thread_stack + VLIB_THREAD_STACK_SIZE); + /* NOTREACHED, we hope */ + return rv; +} + +static int +vlib_launch_thread (void *fp, vlib_worker_thread_t *w, unsigned lcore_id) +{ + pthread_t dummy; + void *(*fp_arg)(void *) = fp; + +#if DPDK==1 + if (!w->registration->use_pthreads) + if (rte_eal_remote_launch) /* do we have dpdk linked */ + return rte_eal_remote_launch (fp, (void *)w, lcore_id); + else + return -1; + else +#endif + return pthread_create (&dummy, NULL /* attr */, fp_arg, (void *)w); +} + +static clib_error_t * start_workers (vlib_main_t * vm) +{ + int i, j; + vlib_worker_thread_t *w; + vlib_main_t *vm_clone; + void *oldheap; + vlib_frame_queue_t *fq; + vlib_thread_main_t * tm = &vlib_thread_main; + vlib_thread_registration_t * tr; + vlib_node_runtime_t * rt; + u32 n_vlib_mains = tm->n_vlib_mains; + u32 worker_thread_index; + + vec_reset_length (vlib_worker_threads); + + /* Set up the main thread */ + vec_add2_aligned (vlib_worker_threads, w, 1, CLIB_CACHE_LINE_BYTES); + w->elog_track.name = "thread 0"; + elog_track_register (&vm->elog_main, &w->elog_track); + + if (vec_len(tm->thread_prefix)) + { + w->name = format(0, "%v_main%c", tm->thread_prefix, '\0'); + vlib_set_thread_name((char *)w->name); + } + +#if DPDK==1 + w->dpdk_lcore_id = -1; + if (rte_socket_id) /* do we really have dpdk linked */ + { + unsigned lcore = rte_lcore_id(); + w->dpdk_lcore_id = lcore < RTE_MAX_LCORE ? 
lcore : -1;; + } +#endif + + if (n_vlib_mains > 1) + { + u8 * heap = clib_mem_get_per_cpu_heap(); + mheap_t * h = mheap_header (heap); + + /* make the main heap thread-safe */ + h->flags |= MHEAP_FLAG_THREAD_SAFE; + + /* Make the event-log MP-safe */ + vm->elog_main.lock = + clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, + CLIB_CACHE_LINE_BYTES); + + vm->elog_main.lock[0] = 0; + + vec_validate (vlib_mains, tm->n_vlib_mains - 1); + _vec_len (vlib_mains) = 0; + vec_add1 (vlib_mains, vm); + + vec_validate (vlib_frame_queues, tm->n_vlib_mains - 1); + _vec_len (vlib_frame_queues) = 0; + fq = vlib_frame_queue_alloc (FRAME_QUEUE_NELTS); + vec_add1 (vlib_frame_queues, fq); + + vlib_worker_threads->wait_at_barrier = + clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES); + vlib_worker_threads->workers_at_barrier = + clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES); + + /* Ask for an initial barrier sync */ + *vlib_worker_threads->workers_at_barrier = 0; + *vlib_worker_threads->wait_at_barrier = 1; + + worker_thread_index = 1; + + for (i = 0; i < vec_len(tm->registrations); i++) + { + vlib_node_main_t *nm, *nm_clone; + vlib_buffer_main_t *bm_clone; + vlib_buffer_free_list_t *fl_clone, *fl_orig; + vlib_buffer_free_list_t *orig_freelist_pool; + int k; + + tr = tm->registrations[i]; + + if (tr->count == 0) + continue; + + for (k = 0; k < tr->count; k++) + { + vec_add2 (vlib_worker_threads, w, 1); + /* + * Share the main heap which is now thread-safe. 
+ * + * To allocate separate heaps, code: + * mheap_alloc (0 / * use VM * /, tr->mheap_size); + */ + w->thread_mheap = heap; + w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; + w->thread_function = tr->function; + w->thread_function_arg = w; + w->instance_id = k; + w->registration = tr; + + w->elog_track.name = (char *) format (0, "thread %d", i+1); + vec_add1 (w->elog_track.name, 0); + elog_track_register (&vm->elog_main, &w->elog_track); + + if (tr->no_data_structure_clone) + continue; + + /* Allocate "to-worker-N" frame queue */ + fq = vlib_frame_queue_alloc (FRAME_QUEUE_NELTS); + vec_validate (vlib_frame_queues, worker_thread_index); + vlib_frame_queues[worker_thread_index] = fq; + + /* Fork vlib_global_main et al. Look for bugs here */ + oldheap = clib_mem_set_heap (w->thread_mheap); + + vm_clone = clib_mem_alloc (sizeof (*vm_clone)); + memcpy (vm_clone, vlib_mains[0], sizeof (*vm_clone)); + + vm_clone->cpu_index = worker_thread_index; + vm_clone->heap_base = w->thread_mheap; + vm_clone->mbuf_alloc_list = 0; + memset (&vm_clone->random_buffer, 0, sizeof (vm_clone->random_buffer)); + + nm = &vlib_mains[0]->node_main; + nm_clone = &vm_clone->node_main; + /* fork next frames array, preserving node runtime indices */ + nm_clone->next_frames = vec_dup (nm->next_frames); + for (j = 0; j < vec_len (nm_clone->next_frames); j++) + { + vlib_next_frame_t *nf = &nm_clone->next_frames[j]; + u32 save_node_runtime_index; + + save_node_runtime_index = nf->node_runtime_index; + vlib_next_frame_init (nf); + nf->node_runtime_index = save_node_runtime_index; + } + + /* fork the frame dispatch queue */ + nm_clone->pending_frames = 0; + vec_validate (nm_clone->pending_frames, 10); /* $$$$$?????? 
*/ + _vec_len (nm_clone->pending_frames) = 0; + + /* fork nodes */ + nm_clone->nodes = 0; + for (j = 0; j < vec_len (nm->nodes); j++) + { + vlib_node_t *n; + n = clib_mem_alloc_no_fail (sizeof(*n)); + memcpy (n, nm->nodes[j], sizeof (*n)); + /* none of the copied nodes have enqueue rights given out */ + n->owner_node_index = VLIB_INVALID_NODE_INDEX; + memset (&n->stats_total, 0, sizeof (n->stats_total)); + memset (&n->stats_last_clear, 0, sizeof (n->stats_last_clear)); + vec_add1 (nm_clone->nodes, n); + } + nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] = + vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]); + + nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] = + vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]); + vec_foreach(rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) + rt->cpu_index = vm_clone->cpu_index; + + nm_clone->processes = vec_dup (nm->processes); + + /* zap the (per worker) frame freelists, etc */ + nm_clone->frame_sizes = 0; + nm_clone->frame_size_hash = 0; + + /* Packet trace buffers are guaranteed to be empty, nothing to do here */ + + clib_mem_set_heap (oldheap); + vec_add1 (vlib_mains, vm_clone); + + unix_physmem_init (vm_clone, 0 /* physmem not required */); + + /* Fork the vlib_buffer_main_t free lists, etc. */ + bm_clone = vec_dup (vm_clone->buffer_main); + vm_clone->buffer_main = bm_clone; + + orig_freelist_pool = bm_clone->buffer_free_list_pool; + bm_clone->buffer_free_list_pool = 0; + + pool_foreach (fl_orig, orig_freelist_pool, + ({ + pool_get_aligned (bm_clone->buffer_free_list_pool, + fl_clone, CLIB_CACHE_LINE_BYTES); + ASSERT (fl_orig - orig_freelist_pool + == fl_clone - bm_clone->buffer_free_list_pool); + + fl_clone[0] = fl_orig[0]; + fl_clone->aligned_buffers = 0; + fl_clone->unaligned_buffers = 0; + fl_clone->n_alloc = 0; + })); + + worker_thread_index++; + } + } + } + else + { + /* only have non-data-structure copy threads to create... 
*/ + for (i = 0; i < vec_len(tm->registrations); i++) + { + tr = tm->registrations[i]; + + for (j = 0; j < tr->count; j++) + { + vec_add2 (vlib_worker_threads, w, 1); + w->thread_mheap = mheap_alloc (0 /* use VM */, tr->mheap_size); + w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; + w->thread_function = tr->function; + w->thread_function_arg = w; + w->instance_id = j; + w->elog_track.name = (char *) format (0, "thread %d", i+1); + w->registration = tr; + vec_add1 (w->elog_track.name, 0); + elog_track_register (&vm->elog_main, &w->elog_track); + } + } + } + + worker_thread_index = 1; + + for (i = 0; i < vec_len (tm->registrations); i++) + { + int j; + + tr = tm->registrations[i]; + + if (tr->use_pthreads || tm->use_pthreads) + { + for (j = 0; j < tr->count; j++) + { + w = vlib_worker_threads + worker_thread_index++; + if (vlib_launch_thread (vlib_worker_thread_bootstrap_fn, w, 0) < 0) + clib_warning ("Couldn't start '%s' pthread ", tr->name); + } + } + else + { + uword c; + clib_bitmap_foreach (c, tr->coremask, ({ + w = vlib_worker_threads + worker_thread_index++; + if (vlib_launch_thread (vlib_worker_thread_bootstrap_fn, w, c) < 0) + clib_warning ("Couldn't start DPDK lcore %d", c); + + })); + } + } + vlib_worker_thread_barrier_sync(vm); + vlib_worker_thread_barrier_release(vm); + return 0; +} + +VLIB_MAIN_LOOP_ENTER_FUNCTION (start_workers); + +void vlib_worker_thread_node_runtime_update(void) +{ + int i, j; + vlib_worker_thread_t *w; + vlib_main_t *vm; + vlib_node_main_t *nm, *nm_clone; + vlib_node_t ** old_nodes_clone; + vlib_main_t *vm_clone; + vlib_node_runtime_t * rt, * old_rt; + void *oldheap; + never_inline void + vlib_node_runtime_sync_stats (vlib_main_t * vm, + vlib_node_runtime_t * r, + uword n_calls, + uword n_vectors, + uword n_clocks); + + ASSERT (os_get_cpu_number() == 0); + + if (vec_len (vlib_mains) == 0) + return; + + vm = vlib_mains[0]; + nm = &vm->node_main; + + ASSERT (os_get_cpu_number() == 0); + ASSERT 
(*vlib_worker_threads->wait_at_barrier == 1); + + /* + * Scrape all runtime stats, so we don't lose node runtime(s) with + * pending counts, or throw away worker / io thread counts. + */ + for (j = 0; j < vec_len (nm->nodes); j++) + { + vlib_node_t * n; + n = nm->nodes[j]; + vlib_node_sync_stats (vm, n); + } + + for (i = 1; i < vec_len (vlib_mains); i++) + { + vlib_node_t * n; + + vm_clone = vlib_mains[i]; + nm_clone = &vm_clone->node_main; + + for (j = 0; j < vec_len (nm_clone->nodes); j++) + { + n = nm_clone->nodes[j]; + + rt = vlib_node_get_runtime (vm_clone, n->index); + vlib_node_runtime_sync_stats (vm_clone, rt, 0, 0, 0); + } + } + + for (i = 1; i < vec_len (vlib_mains); i++) + { + vlib_node_runtime_t * rt; + w = vlib_worker_threads + i; + oldheap = clib_mem_set_heap (w->thread_mheap); + + vm_clone = vlib_mains[i]; + + /* Re-clone error heap */ + memcpy (&vm_clone->error_main, &vm->error_main, sizeof (vm->error_main)); + + nm_clone = &vm_clone->node_main; + vec_free (nm_clone->next_frames); + nm_clone->next_frames = vec_dup (nm->next_frames); + + for (j = 0; j < vec_len (nm_clone->next_frames); j++) + { + vlib_next_frame_t *nf = &nm_clone->next_frames[j]; + u32 save_node_runtime_index; + + save_node_runtime_index = nf->node_runtime_index; + vlib_next_frame_init (nf); + nf->node_runtime_index = save_node_runtime_index; + } + + old_nodes_clone = nm_clone->nodes; + nm_clone->nodes = 0; + + /* re-fork nodes */ + for (j = 0; j < vec_len (nm->nodes); j++) { + vlib_node_t *old_n_clone; + vlib_node_t *new_n, *new_n_clone; + + new_n = nm->nodes[j]; + old_n_clone = old_nodes_clone[j]; + + new_n_clone = clib_mem_alloc_no_fail (sizeof(*new_n_clone)); + memcpy (new_n_clone, new_n, sizeof (*new_n)); + /* none of the copied nodes have enqueue rights given out */ + new_n_clone->owner_node_index = VLIB_INVALID_NODE_INDEX; + + if (j >= vec_len (old_nodes_clone)) + { + /* new node, set to zero */ + memset (&new_n_clone->stats_total, 0, + sizeof (new_n_clone->stats_total)); + 
memset (&new_n_clone->stats_last_clear, 0, + sizeof (new_n_clone->stats_last_clear)); + } + else + { + /* Copy stats if the old data is valid */ + memcpy (&new_n_clone->stats_total, + &old_n_clone->stats_total, + sizeof (new_n_clone->stats_total)); + memcpy (&new_n_clone->stats_last_clear, + &old_n_clone->stats_last_clear, + sizeof (new_n_clone->stats_last_clear)); + + /* keep previous node state */ + new_n_clone->state = old_n_clone->state; + } + vec_add1 (nm_clone->nodes, new_n_clone); + } + /* Free the old node clone */ + for (j = 0; j < vec_len(old_nodes_clone); j++) + clib_mem_free (old_nodes_clone[j]); + vec_free (old_nodes_clone); + + vec_free (nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]); + + nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] = + vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]); + + /* clone input node runtime */ + old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]; + + nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] = + vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]); + + vec_foreach(rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) + { + rt->cpu_index = vm_clone->cpu_index; + } + + for (j=0; j < vec_len(old_rt); j++) + { + rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index); + rt->state = old_rt[j].state; + } + + vec_free(old_rt); + + nm_clone->processes = vec_dup (nm->processes); + + clib_mem_set_heap (oldheap); + + // vnet_main_fork_fixup (i); + } +} + +static clib_error_t * +cpu_config (vlib_main_t * vm, unformat_input_t * input) +{ + vlib_thread_registration_t *tr; + uword * p; + vlib_thread_main_t * tm = &vlib_thread_main; + u8 * name; + u64 coremask; + uword * bitmap; + u32 count; + + tm->thread_registrations_by_name = hash_create_string (0, sizeof (uword)); + tm->n_thread_stacks = 1; /* account for main thread */ + + tr = tm->next; + + while (tr) + { + hash_set_mem (tm->thread_registrations_by_name, tr->name, (uword)tr); + tr = tr->next; + } + + while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT) + { + 
if (unformat (input, "main-thread-io")) + tm->main_thread_is_io_node = 1; + else if (unformat (input, "use-pthreads")) + tm->use_pthreads = 1; + else if (unformat (input, "thread-prefix %v", &tm->thread_prefix)) + ; + else if (unformat (input, "main-core %u", &tm->main_lcore)) + ; + else if (unformat (input, "skip-cores %u", &tm->skip_cores)) + ; + else if (unformat (input, "coremask-%s %llx", &name, &coremask)) + { + p = hash_get_mem (tm->thread_registrations_by_name, name); + if (p == 0) + return clib_error_return (0, "no such thread type '%s'", name); + + tr = (vlib_thread_registration_t *)p[0]; + + if (tr->use_pthreads) + return clib_error_return (0, "coremask cannot be set for '%s' threads", + name); + + tr->coremask = clib_bitmap_set_multiple + (tr->coremask, 0, coremask, BITS(coremask)); + tr->count = clib_bitmap_count_set_bits (tr->coremask); + } + else if (unformat (input, "corelist-%s %U", &name, unformat_bitmap_list, + &bitmap)) + { + p = hash_get_mem (tm->thread_registrations_by_name, name); + if (p == 0) + return clib_error_return (0, "no such thread type '%s'", name); + + tr = (vlib_thread_registration_t *)p[0]; + + if (tr->use_pthreads) + return clib_error_return (0, "corelist cannot be set for '%s' threads", + name); + + tr->coremask = bitmap; + tr->count = clib_bitmap_count_set_bits (tr->coremask); + } + else if (unformat (input, "%s %u", &name, &count)) + { + p = hash_get_mem (tm->thread_registrations_by_name, name); + if (p == 0) + return clib_error_return (0, "no such thread type '%s'", name); + + tr = (vlib_thread_registration_t *)p[0]; + if (tr->fixed_count) + return clib_error_return + (0, "number of %s threads not configurable", tr->name); + tr->count = count; + } + else + break; + } + + tr = tm->next; + + if (!tm->thread_prefix) + tm->thread_prefix = format(0, "vpp"); + + while (tr) + { + tm->n_thread_stacks += tr->count; + tm->n_pthreads += tr->count * tr->use_pthreads; + tm->n_eal_threads += tr->count * (tr->use_pthreads == 0); + tr = 
tr->next;
    }

  return 0;
}

/* Hook cpu_config up as the parser for the "cpu" startup-config section. */
VLIB_EARLY_CONFIG_FUNCTION (cpu_config, "cpu");

#if !defined (__x86_64__)
/*
 * Trap stubs: on non-x86_64 targets gcc may emit out-of-line calls for
 * 8-byte __sync atomics; these paths must never actually be reached.
 */
void __sync_fetch_and_add_8 (void)
{
  fformat(stderr, "%s called\n", __FUNCTION__);
  abort();
}
void __sync_add_and_fetch_8 (void)
{
  fformat(stderr, "%s called\n", __FUNCTION__);
  abort();
}
#endif

/* Weak no-op default; a linked-in vnet provides the real fixup. */
void vnet_main_fixup (vlib_fork_fixup_t which) __attribute__ ((weak));
void vnet_main_fixup (vlib_fork_fixup_t which) { }

/*
 * Run a fork-fixup under a worker barrier: stop all workers, let the
 * (weak) vnet hook patch the cloned data structures, then release.
 * Must be called from thread 0.
 */
void vlib_worker_thread_fork_fixup (vlib_fork_fixup_t which)
{
  vlib_main_t * vm = vlib_get_main();

  /* Nothing to fix up in single-threaded operation */
  if (vlib_mains == 0)
    return;

  ASSERT(os_get_cpu_number() == 0);
  vlib_worker_thread_barrier_sync(vm);

  switch (which)
    {
    case VLIB_WORKER_THREAD_FORK_FIXUP_NEW_SW_IF_INDEX:
      vnet_main_fixup (VLIB_WORKER_THREAD_FORK_FIXUP_NEW_SW_IF_INDEX);
      break;

    default:
      ASSERT(0);
    }
  vlib_worker_thread_barrier_release(vm);
}

/*
 * Thread-0 only: raise wait_at_barrier and spin until every worker has
 * checked in (workers spin in vlib_worker_thread_barrier_check, see
 * threads.h).  Panics if workers fail to arrive within
 * BARRIER_SYNC_TIMEOUT seconds.
 */
void vlib_worker_thread_barrier_sync(vlib_main_t *vm)
{
  f64 deadline;
  u32 count;

  if (!vlib_mains)
    return;

  /* Workers to wait for: everyone except the main thread */
  count = vec_len (vlib_mains) - 1;

  /* Tolerate recursive calls */
  if (++vlib_worker_threads[0].recursion_level > 1)
    return;

  ASSERT (os_get_cpu_number() == 0);

  deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT;

  *vlib_worker_threads->wait_at_barrier = 1;
  while (*vlib_worker_threads->workers_at_barrier != count)
    {
      /* Busy-wait; tripping the deadline means a worker wedged or died */
      if (vlib_time_now(vm) > deadline)
        {
          fformat(stderr, "%s: worker thread deadlock\n", __FUNCTION__);
          os_panic();
        }
    }
}

/*
 * Thread-0 only: drop wait_at_barrier and spin until every worker has
 * left the barrier.  Balances vlib_worker_thread_barrier_sync,
 * including the recursion-level accounting.
 */
void vlib_worker_thread_barrier_release(vlib_main_t * vm)
{
  f64 deadline;

  if (!vlib_mains)
    return;

  /* Only the outermost sync/release pair does real work */
  if (--vlib_worker_threads[0].recursion_level > 0)
    return;

  deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT;

  *vlib_worker_threads->wait_at_barrier = 0;

  while (*vlib_worker_threads->workers_at_barrier > 0)
    {
      if (vlib_time_now(vm) > deadline)
        {
          fformat(stderr, "%s: worker thread deadlock\n", __FUNCTION__);
          os_panic();
        }
    }
}

/* CLI: list all vlib threads with their lcore / LWP / state info. */
static clib_error_t *
show_threads_fn
(vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_worker_thread_t * w; + int i; + + vlib_cli_output (vm, "%-7s%-20s%-12s%-8s%-7s%-7s%-7s%-10s", + "ID", "Name", "Type", "LWP", + "lcore", "Core", "Socket", "State"); + + for (i = 0; i < vec_len(vlib_worker_threads); i++) + { + w = vlib_worker_threads + i; + u8 * line = NULL; + + line = format(line, "%-7d%-20s%-12s%-8d", + i, + w->name ? w->name : (u8 *) "", + w->registration ? w->registration->name : "", + w->lwp); + + int lcore = w->dpdk_lcore_id; + if (lcore > -1) + { + line = format(line, "%-7u%-7u%-7u", + lcore, + lcore_config[lcore].core_id, + lcore_config[lcore].socket_id); + + switch(lcore_config[lcore].state) + { + case WAIT: + line = format(line, "wait"); + break; + case RUNNING: + line = format(line, "running"); + break; + case FINISHED: + line = format(line, "finished"); + break; + default: + line = format(line, "unknown"); + } + } + + vlib_cli_output(vm, "%v", line); + vec_free(line); + } + + return 0; +} + + +VLIB_CLI_COMMAND (show_threads_command, static) = { + .path = "show threads", + .short_help = "Show threads", + .function = show_threads_fn, +}; diff --git a/vlib/vlib/threads.h b/vlib/vlib/threads.h new file mode 100644 index 00000000000..9ce42a1367d --- /dev/null +++ b/vlib/vlib/threads.h @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef included_vlib_threads_h +#define included_vlib_threads_h + +#include <vlib/main.h> + +vlib_main_t **vlib_mains; + +static inline uword +vlib_get_cpu_number_inline (void) +{ + void * sp; + uword n; + u32 len; + + /* Get any old stack address. */ + sp = &sp; + + n = ((uword)sp - (uword)vlib_thread_stacks[0]) >> 20; + + /* "processes" have their own stacks, and they always run in thread 0 */ + n = n >= len ? 0 : n; + + return n; +} + +void +vlib_set_thread_name (char *name); + +/* arg is actually a vlib__thread_t * */ +typedef void (vlib_thread_function_t) (void * arg); + +typedef struct vlib_thread_registration_ { + /* constructor generated list of thread registrations */ + struct vlib_thread_registration_ * next; + + /* config parameters */ + char * name; + char * short_name; + vlib_thread_function_t * function; + uword mheap_size; + int fixed_count; + u32 count; + int no_data_structure_clone; + /* All threads of this type run on pthreads */ + int use_pthreads; + u32 first_index; + uword * coremask; +} vlib_thread_registration_t; + +#define VLIB_MAX_CPUS 32 + +/* + * Objects passed around by "index" are cache-line aligned. + * We can stick the owner CPU into the low 6 bits. 
 */
#if VLIB_MAX_CPUS > 64
#error VLIB_MAX_CPUS must be <= 64
#endif

#define VLIB_CPU_MASK (VLIB_MAX_CPUS - 1) /* 0x3f, max */
#define VLIB_OFFSET_MASK (~VLIB_CPU_MASK)

/* Per-thread stack size: 1 MB */
#define VLIB_LOG2_THREAD_STACK_SIZE (20)
#define VLIB_THREAD_STACK_SIZE (1<<VLIB_LOG2_THREAD_STACK_SIZE)

typedef enum {
  VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME,
} vlib_frame_queue_msg_type_t;

/* One inter-thread frame-queue ring slot. */
typedef struct {
  /* Written last (after a memory barrier) by the enqueuer; the
     dequeuer spins on it.  Cleared when the slot is consumed. */
  volatile u32 valid;
  u32 msg_type;
  u32 n_vectors;
  u32 last_n_vectors;

  /* 256 * 4 = 1024 bytes, even mult of cache line size */
  u32 buffer_index[VLIB_FRAME_SIZE];

  /* Pad to a cache line boundary */
  u8 pad[CLIB_CACHE_LINE_BYTES - 4 * sizeof(u32)];
} vlib_frame_queue_elt_t;

/* Per-thread bookkeeping; cache-line padded to avoid false sharing. */
typedef struct {
  /* First cache line */
  volatile u32 *wait_at_barrier;      /* set by thread 0 to request a sync */
  volatile u32 *workers_at_barrier;   /* count of workers checked in */
  u8 pad0[CLIB_CACHE_LINE_BYTES - (2 * sizeof (u32 *))];

  /* Second Cache Line */
  void *thread_mheap;                 /* heap this thread allocates from */
  u8 * thread_stack;
  void (*thread_function)(void *);
  void * thread_function_arg;
  i64 recursion_level;                /* barrier sync/release nesting depth */
  elog_track_t elog_track;
  u32 instance_id;                    /* index within its registration */
  vlib_thread_registration_t *registration;
  u8 *name;

  long lwp;                           /* kernel thread id (gettid) */
  int dpdk_lcore_id;                  /* -1 when not a dpdk lcore */
} vlib_worker_thread_t;

vlib_worker_thread_t *vlib_worker_threads;

/* SPSC frame queue; enqueue and dequeue state live on separate cache
   lines so producer and consumer do not false-share. */
typedef struct {
  /* enqueue side */
  volatile u64 tail;
  u64 enqueues;
  u64 enqueue_ticks;
  u64 enqueue_vectors;
  u32 enqueue_full_events;
  u32 enqueue_efd_discards;
  u8 pad2[CLIB_CACHE_LINE_BYTES
          - (2 * sizeof(u32))
          - (4 * sizeof(u64))];

  /* dequeue side */
  volatile u64 head;
  u64 dequeues;
  u64 dequeue_ticks;
  u64 dequeue_vectors;
  u64 trace;
  u64 vector_threshold;
  u8 pad4[CLIB_CACHE_LINE_BYTES
          - (6 * sizeof(u64))];

  /* dequeue hint to enqueue side */
  volatile u64 head_hint;
  u8 pad5 [CLIB_CACHE_LINE_BYTES - sizeof(u64)];

  /* read-only, constant, shared */
  vlib_frame_queue_elt_t *elts;
  u32 nelts;                          /* ring size; power of two */
} vlib_frame_queue_t;

vlib_frame_queue_t **vlib_frame_queues;

/* Called early, in thread 0's context */
clib_error_t
* vlib_thread_init (vlib_main_t * vm);

vlib_worker_thread_t * vlib_alloc_thread (vlib_main_t * vm);

/* Hand a frame to another thread's input frame queue. */
int vlib_frame_queue_enqueue (vlib_main_t *vm, u32 node_runtime_index,
                              u32 frame_queue_index, vlib_frame_t *frame,
                              vlib_frame_queue_msg_type_t type);

int vlib_frame_queue_dequeue (int thread_id,
                              vlib_main_t *vm,
                              vlib_node_main_t *nm);

u64 dispatch_node (vlib_main_t * vm,
                   vlib_node_runtime_t * node,
                   vlib_node_type_t type,
                   vlib_node_state_t dispatch_state,
                   vlib_frame_t * frame,
                   u64 last_time_stamp);

u64 dispatch_pending_node (vlib_main_t * vm,
                           vlib_pending_frame_t * p,
                           u64 last_time_stamp);

void vlib_worker_thread_node_runtime_update(void);

void vlib_create_worker_threads (vlib_main_t *vm, int n,
                                 void (*thread_function)(void *));

void vlib_worker_thread_init (vlib_worker_thread_t * w);

/* Check for a barrier sync request every 30ms */
#define BARRIER_SYNC_DELAY (0.030000)

#if CLIB_DEBUG > 0
/* long barrier timeout, for gdb... */
#define BARRIER_SYNC_TIMEOUT (600.1)
#else
#define BARRIER_SYNC_TIMEOUT (1.0)
#endif

void vlib_worker_thread_barrier_sync(vlib_main_t *vm);
void vlib_worker_thread_barrier_release(vlib_main_t *vm);

/* Debug-build nag when an SMP-unsafe routine runs on a worker thread. */
always_inline void vlib_smp_unsafe_warning (void)
{
  if (CLIB_DEBUG > 0)
    {
      if (os_get_cpu_number())
        fformat(stderr, "%s: SMP unsafe warning...\n", __FUNCTION__);
    }
}

typedef enum {
  VLIB_WORKER_THREAD_FORK_FIXUP_ILLEGAL = 0,
  VLIB_WORKER_THREAD_FORK_FIXUP_NEW_SW_IF_INDEX,
} vlib_fork_fixup_t;

void vlib_worker_thread_fork_fixup (vlib_fork_fixup_t which);

/*
 * Worker side of the barrier protocol: if thread 0 has raised
 * wait_at_barrier, check in, busy-wait until it drops, then check out.
 * Counterpart of vlib_worker_thread_barrier_sync/release in threads.c.
 */
static inline void vlib_worker_thread_barrier_check (void)
{
  if (PREDICT_FALSE(*vlib_worker_threads->wait_at_barrier))
    {
      clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, 1);
      while (*vlib_worker_threads->wait_at_barrier)
        ;
      clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, -1);
    }
}

/* Run 'body' once per active vlib_main_t, bound to this_vlib_main;
   falls back to vlib_global_main when no per-thread mains exist. */
#define foreach_vlib_main(body)                 \
do {                                            \
  vlib_main_t ** __vlib_mains = 0,
*this_vlib_main; \ + int ii; \ + \ + if (vec_len (vlib_mains) == 0) \ + vec_add1 (__vlib_mains, &vlib_global_main); \ + else \ + { \ + for (ii = 0; ii < vec_len (vlib_mains); ii++) \ + { \ + this_vlib_main = vlib_mains[ii]; \ + if (this_vlib_main) \ + vec_add1 (__vlib_mains, this_vlib_main); \ + } \ + } \ + \ + for (ii = 0; ii < vec_len (__vlib_mains); ii++) \ + { \ + this_vlib_main = __vlib_mains[ii]; \ + /* body uses this_vlib_main... */ \ + (body); \ + } \ + vec_free (__vlib_mains); \ +} while (0); + + +/* Early-Fast-Discard (EFD) */ +#define VLIB_EFD_DISABLED 0 +#define VLIB_EFD_DISCARD_ENABLED (1 << 0) +#define VLIB_EFD_MONITOR_ENABLED (1 << 1) + +#define VLIB_EFD_DEF_WORKER_HI_THRESH_PCT 90 + +/* EFD worker thread settings */ +typedef struct vlib_efd_t { + u16 enabled; + u16 queue_hi_thresh; + u8 ip_prec_bitmap; + u8 mpls_exp_bitmap; + u8 vlan_cos_bitmap; + u8 pad; +} vlib_efd_t; + +typedef struct { + /* Link list of registrations, built by constructors */ + vlib_thread_registration_t * next; + + /* Vector of registrations, w/ non-data-structure clones at the top */ + vlib_thread_registration_t ** registrations; + + uword * thread_registrations_by_name; + + vlib_worker_thread_t * worker_threads; + + /* thread / cpu / io thread parameters */ + u32 main_thread_is_io_node; + + /* + * Launch all threads as pthreads, + * not eal_rte_launch (strict affinity) threads + */ + int use_pthreads; + + /* Number of vlib_main / vnet_main clones */ + u32 n_vlib_mains; + + /* Number of thread stacks to create */ + u32 n_thread_stacks; + + /* Number of pthreads */ + u32 n_pthreads; + + /* Number of DPDK eal threads */ + u32 n_eal_threads; + + /* Number of cores to skip, must match the core mask */ + u32 skip_cores; + + /* Thread prefix name */ + u8 *thread_prefix; + + /* main thread lcore */ + u8 main_lcore; + + /* Bitmap of available CPU cores */ + uword * cpu_core_bitmap; + + /* Bitmap of available CPU sockets (NUMA nodes) */ + uword * cpu_socket_bitmap; + + vlib_efd_t efd; 
+ +} vlib_thread_main_t; + +vlib_thread_main_t vlib_thread_main; + +#define VLIB_REGISTER_THREAD(x,...) \ + __VA_ARGS__ vlib_thread_registration_t x; \ +static void __vlib_add_thread_registration_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_add_thread_registration_##x (void) \ +{ \ + vlib_thread_main_t * tm = &vlib_thread_main; \ + x.next = tm->next; \ + tm->next = &x; \ +} \ +__VA_ARGS__ vlib_thread_registration_t x + +#endif /* included_vlib_threads_h */ diff --git a/vlib/vlib/trace.c b/vlib/vlib/trace.c new file mode 100644 index 00000000000..6272d853145 --- /dev/null +++ b/vlib/vlib/trace.c @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * trace.c: VLIB trace buffer. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include <vlib/vlib.h>
#include <vlib/threads.h>

/*
 * Helper function for nodes which only trace buffer data: for each
 * buffer in the frame that has VLIB_BUFFER_IS_TRACED set, copy its
 * first n_buffer_data_bytes_in_trace bytes of current data into a new
 * trace record for this node.
 *
 * NOTE(review): next_buffer_stride is accepted but never used --
 * buffers are always walked with stride 1; confirm with callers.
 */
void
vlib_trace_frame_buffers_only (vlib_main_t * vm,
                               vlib_node_runtime_t * node,
                               u32 * buffers,
                               uword n_buffers,
                               uword next_buffer_stride,
                               uword n_buffer_data_bytes_in_trace)
{
  u32 n_left, * from;

  n_left = n_buffers;
  from = buffers;

  /*
   * Process two buffers per iteration, but only while at least four
   * remain so the from[2]/from[3] prefetches stay inside the frame.
   */
  while (n_left >= 4)
    {
      u32 bi0, bi1;
      vlib_buffer_t * b0, * b1;
      u8 * t0, * t1;

      /* Prefetch next iteration. */
      vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
      vlib_prefetch_buffer_with_index (vm, from[3], LOAD);

      bi0 = from[0];
      bi1 = from[1];

      b0 = vlib_get_buffer (vm, bi0);
      b1 = vlib_get_buffer (vm, bi1);

      if (b0->flags & VLIB_BUFFER_IS_TRACED)
        {
          t0 = vlib_add_trace (vm, node, b0, n_buffer_data_bytes_in_trace);
          memcpy (t0, b0->data + b0->current_data,
                  n_buffer_data_bytes_in_trace);
        }
      if (b1->flags & VLIB_BUFFER_IS_TRACED)
        {
          t1 = vlib_add_trace (vm, node, b1, n_buffer_data_bytes_in_trace);
          memcpy (t1, b1->data + b1->current_data,
                  n_buffer_data_bytes_in_trace);
        }
      from += 2;
      n_left -= 2;
    }

  /* Singles: the 1..3 buffers left over from the loop above. */
  while (n_left >= 1)
    {
      u32 bi0;
      vlib_buffer_t * b0;
      u8 * t0;

      bi0 = from[0];

      b0 = vlib_get_buffer (vm, bi0);

      if (b0->flags & VLIB_BUFFER_IS_TRACED)
        {
          t0 = vlib_add_trace (vm, node, b0, n_buffer_data_bytes_in_trace);
          memcpy (t0, b0->data + b0->current_data,
                  n_buffer_data_bytes_in_trace);
        }
      from += 1;
      n_left -= 1;
    }
}

/* Free
up all trace buffer memory. */
always_inline void
clear_trace_buffer (vlib_trace_main_t * tm)
{
  int i;

  /* Note: the tm argument is immediately shadowed per thread below;
     each thread's trace pool is freed on that thread's own heap. */
  foreach_vlib_main (
  ({
    void *mainheap;

    tm = &this_vlib_main->trace_main;
    mainheap = clib_mem_set_heap (this_vlib_main->heap_base);

    for (i = 0; i < vec_len (tm->trace_buffer_pool); i++)
      if (! pool_is_free_index (tm->trace_buffer_pool, i))
        vec_free (tm->trace_buffer_pool[i]);
    pool_free (tm->trace_buffer_pool);
    clib_mem_set_heap (mainheap);
  }));
}

/*
 * Render one packet's trace records: a timestamped node-name header
 * each time the node changes, then each record via the node's
 * format_trace (or format_buffer) callback.
 */
static u8 * format_vlib_trace (u8 * s, va_list * va)
{
  vlib_main_t * vm = va_arg (*va, vlib_main_t *);
  vlib_trace_header_t * h = va_arg (*va, vlib_trace_header_t *);
  vlib_trace_header_t * e = vec_end (h);
  vlib_node_t * node, * prev_node;
  clib_time_t * ct = &vm->clib_time;
  f64 t;

  prev_node = 0;
  while (h < e)
    {
      node = vlib_get_node (vm, h->node_index);

      if (node != prev_node)
        {
          /* Timestamp relative to main-loop start, in wall-clock units */
          t = (h->time - vm->cpu_time_main_loop_start) * ct->seconds_per_clock;
          s = format (s, "\n%U: %v",
                      format_time_interval, "h:m:s:u", t,
                      node->name);
        }
      prev_node = node;

      if (node->format_trace)
        s = format (s, "\n  %U",
                    node->format_trace, vm, node, h->data);
      else
        s = format (s, "\n  %U",
                    node->format_buffer, h->data);

      h = vlib_trace_header_next (h);
    }

  return s;
}

/* Root of all trace cli commands. */
VLIB_CLI_COMMAND (trace_cli_command,static) = {
  .path = "trace",
  .short_help = "Packet tracer commands",
};

/* CLI "show trace": dump every thread's trace buffer, time-sorted. */
static clib_error_t *
cli_show_trace_buffer (vlib_main_t * vm,
                       unformat_input_t * input,
                       vlib_cli_command_t * cmd)
{
  vlib_trace_main_t * tm = &vm->trace_main;
  vlib_trace_header_t ** h, ** traces;
  u32 i, index = 0;
  char * fmt;

  /* Get active traces from pool.
*/ + + foreach_vlib_main ( + ({ + void *mainheap; + + fmt = "------------------- Start of thread %d %v -------------------"; + vlib_cli_output (vm, fmt, index, vlib_worker_threads[index].name); + + tm = &this_vlib_main->trace_main; + + mainheap = clib_mem_set_heap (this_vlib_main->heap_base); + traces = 0; + pool_foreach (h, tm->trace_buffer_pool, + ({ + vec_add1 (traces, h[0]); + })); + + if (vec_len (traces) == 0) + { + clib_mem_set_heap (mainheap); + vlib_cli_output (vm, "No packets in trace buffer"); + goto done; + } + + /* Sort them by increasing time. */ + vec_sort (traces, t0, t1, ({ + i64 dt = t0[0]->time - t1[0]->time; + dt < 0 ? -1 : (dt > 0 ? +1 : 0); + })); + + for (i = 0; i < vec_len (traces); i++) + { + clib_mem_set_heap (mainheap); + + vlib_cli_output (vm, "Packet %d\n%U\n\n", i + 1, + format_vlib_trace, vm, traces[i]); + + mainheap = clib_mem_set_heap (this_vlib_main->heap_base); + } + + done: + vec_free (traces); + clib_mem_set_heap (mainheap); + + index++; + })); + + return 0; +} + +VLIB_CLI_COMMAND (show_trace_cli,static) = { + .path = "show trace", + .short_help = "Show trace buffer", + .function = cli_show_trace_buffer, +}; + +static clib_error_t * +cli_add_trace_buffer (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_trace_main_t * tm; + vlib_trace_node_t * tn; + u32 node_index, add; + + if (unformat (input, "%U %d", unformat_vlib_node, vm, &node_index, &add)) + ; + else + return clib_error_create ("expected NODE COUNT, got `%U'", + format_unformat_error, input); + + foreach_vlib_main ( + ({ + void *oldheap; + tm = &this_vlib_main->trace_main; + + oldheap = clib_mem_set_heap (this_vlib_main->heap_base); + + vec_validate (tm->nodes, node_index); + tn = tm->nodes + node_index; + tn->limit += add; + clib_mem_set_heap (oldheap); + })); + + return 0; +} + +VLIB_CLI_COMMAND (add_trace_cli,static) = { + .path = "trace add", + .short_help = "Trace given number of packets", + .function = cli_add_trace_buffer, +}; + 
+static clib_error_t * +cli_clear_trace_buffer (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_trace_main_t * tm = &vm->trace_main; + clear_trace_buffer (tm); + return 0; +} + +VLIB_CLI_COMMAND (clear_trace_cli,static) = { + .path = "clear trace", + .short_help = "Clear trace buffer and free memory", + .function = cli_clear_trace_buffer, +}; + +/* Dummy function to get us linked in. */ +void vlib_trace_cli_reference (void) {} diff --git a/vlib/vlib/trace.h b/vlib/vlib/trace.h new file mode 100644 index 00000000000..228a22abb95 --- /dev/null +++ b/vlib/vlib/trace.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * trace.h: VLIB trace buffer. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_trace_h +#define included_vlib_trace_h + +#include <vppinfra/pool.h> + +typedef struct { + /* CPU time stamp trace was made. */ + u64 time; + + /* Node which generated this trace. */ + u32 node_index; + + /* Number of data words in this trace. */ + u32 n_data; + + /* Trace data follows. */ + u8 data[0]; +} vlib_trace_header_t; + +typedef struct { + /* Current number of traces in buffer. */ + u32 count; + + /* Max. number of traces to be added to buffer. */ + u32 limit; +} vlib_trace_node_t; + +typedef struct { + /* Pool of trace buffers. */ + vlib_trace_header_t ** trace_buffer_pool; + + /* Per node trace counts. */ + vlib_trace_node_t * nodes; +} vlib_trace_main_t; + +#endif /* included_vlib_trace_h */ diff --git a/vlib/vlib/trace_funcs.h b/vlib/vlib/trace_funcs.h new file mode 100644 index 00000000000..3dc7471e152 --- /dev/null +++ b/vlib/vlib/trace_funcs.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * trace_funcs.h: VLIB trace buffer. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_trace_funcs_h +#define included_vlib_trace_funcs_h + +always_inline void +vlib_validate_trace (vlib_trace_main_t * tm, vlib_buffer_t * b) +{ + /* + * this assert seems right, but goes off constantly. + * disabling it appears to make the pain go away + */ + ASSERT (1 || b->flags & VLIB_BUFFER_IS_TRACED); + ASSERT (! 
pool_is_free_index (tm->trace_buffer_pool, b->trace_index)); +} + +always_inline void * +vlib_add_trace (vlib_main_t * vm, + vlib_node_runtime_t * r, + vlib_buffer_t * b, + u32 n_data_bytes) +{ + vlib_trace_main_t * tm = &vm->trace_main; + vlib_trace_header_t * h; + u32 n_data_words; + + vlib_validate_trace (tm, b); + + n_data_bytes = round_pow2 (n_data_bytes, sizeof (h[0])); + n_data_words = n_data_bytes / sizeof (h[0]); + vec_add2_aligned (tm->trace_buffer_pool[b->trace_index], h, + 1 + n_data_words, + sizeof (h[0])); + + h->time = vm->cpu_time_last_node_dispatch; + h->n_data = n_data_words; + h->node_index = r->node_index; + + return h->data; +} + +always_inline vlib_trace_header_t * +vlib_trace_header_next (vlib_trace_header_t * h) +{ return h + 1 + h->n_data; } + +always_inline void +vlib_free_trace (vlib_main_t * vm, vlib_buffer_t * b) +{ + vlib_trace_main_t * tm = &vm->trace_main; + vlib_validate_trace (tm, b); + _vec_len (tm->trace_buffer_pool[b->trace_index]) = 0; + pool_put_index (tm->trace_buffer_pool, b->trace_index); +} + +always_inline void +vlib_trace_next_frame (vlib_main_t * vm, + vlib_node_runtime_t * r, + u32 next_index) +{ + vlib_next_frame_t * nf; + nf = vlib_node_runtime_get_next_frame (vm, r, next_index); + nf->flags |= VLIB_FRAME_TRACE; +} + +/* Mark buffer as traced and allocate trace buffer. 
*/ +always_inline void +vlib_trace_buffer (vlib_main_t * vm, + vlib_node_runtime_t * r, + u32 next_index, + vlib_buffer_t * b, + int follow_chain) +{ + vlib_trace_main_t * tm = &vm->trace_main; + vlib_trace_header_t ** h; + + vlib_trace_next_frame (vm, r, next_index); + + pool_get (tm->trace_buffer_pool, h); + + do { + b->flags |= VLIB_BUFFER_IS_TRACED; + b->trace_index = h - tm->trace_buffer_pool; + } while (follow_chain && (b = vlib_get_next_buffer (vm, b))); +} + +always_inline void +vlib_buffer_copy_trace_flag (vlib_main_t * vm, vlib_buffer_t * b, u32 bi_target) +{ + vlib_buffer_t * b_target = vlib_get_buffer (vm, bi_target); + b_target->flags |= b->flags & VLIB_BUFFER_IS_TRACED; + b_target->trace_index = b->trace_index; +} + +always_inline u32 +vlib_get_trace_count (vlib_main_t * vm, vlib_node_runtime_t * rt) +{ + vlib_trace_main_t * tm = &vm->trace_main; + vlib_trace_node_t * tn; + int n; + + if (rt->node_index >= vec_len (tm->nodes)) + return 0; + tn = tm->nodes + rt->node_index; + n = tn->limit - tn->count; + ASSERT (n >= 0); + + return n; +} + +always_inline void +vlib_set_trace_count (vlib_main_t * vm, vlib_node_runtime_t * rt, + u32 count) +{ + vlib_trace_main_t * tm = &vm->trace_main; + vlib_trace_node_t * tn = vec_elt_at_index (tm->nodes, rt->node_index); + + ASSERT (count <= tn->limit); + tn->count = tn->limit - count; +} + +/* Helper function for nodes which only trace buffer data. */ +void +vlib_trace_frame_buffers_only (vlib_main_t * vm, + vlib_node_runtime_t * node, + u32 * buffers, + uword n_buffers, + uword next_buffer_stride, + uword n_buffer_data_bytes_in_trace); + +#endif /* included_vlib_trace_funcs_h */ diff --git a/vlib/vlib/unix/cj.c b/vlib/vlib/unix/cj.c new file mode 100644 index 00000000000..665a13fa4f5 --- /dev/null +++ b/vlib/vlib/unix/cj.c @@ -0,0 +1,218 @@ +/* + *------------------------------------------------------------------ + * cj.c + * + * Copyright (c) 2013 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#include <stdio.h> +#include <vlib/vlib.h> + +#include <vlib/unix/cj.h> + +cj_main_t cj_main; + +void +cj_log (u32 type, void * data0, void * data1) +{ + u64 new_tail; + cj_main_t * cjm = &cj_main; + cj_record_t * r; + + if (cjm->enable == 0) + return; + + new_tail = __sync_add_and_fetch (&cjm->tail, 1); + + r = (cj_record_t *) &(cjm->records[new_tail & (cjm->num_records - 1)]); + r->time = vlib_time_now (cjm->vlib_main); + r->cpu = os_get_cpu_number(); + r->type = type; + r->data[0] = (u64) data0; + r->data[1] = (u64) data1; +} + +void cj_stop(void) +{ + cj_main_t * cjm = &cj_main; + + cjm->enable = 0; +} + + +clib_error_t * cj_init (vlib_main_t * vm) +{ + cj_main_t * cjm = &cj_main; + + cjm->vlib_main = vm; + return 0; +} +VLIB_INIT_FUNCTION (cj_init); + +static clib_error_t * +cj_config (vlib_main_t * vm, unformat_input_t * input) +{ + cj_main_t * cjm = &cj_main; + int matched = 0; + int enable = 0; + + while (unformat_check_input(input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "records %d", &cjm->num_records)) + matched = 1; + else if (unformat (input, "on")) + enable = 1; + else + return clib_error_return (0, "cj_config: unknown input '%U'", + format_unformat_error, input); + } + + if (matched == 0) + return 0; + + cjm->num_records = max_pow2 (cjm->num_records); + vec_validate (cjm->records, cjm->num_records-1); + 
memset (cjm->records, 0xff, cjm->num_records * sizeof (cj_record_t)); + cjm->tail = ~0; + cjm->enable = enable; + + return 0; +} + +VLIB_CONFIG_FUNCTION (cj_config, "cj"); + +void cj_enable_disable (int is_enable) +{ + cj_main_t * cjm = &cj_main; + + if (cjm->num_records) + cjm->enable = is_enable; + else + vlib_cli_output (cjm->vlib_main, "CJ not configured..."); +} + +static inline void cj_dump_one_record (cj_record_t * r) +{ + fprintf (stderr, "[%d]: %10.6f T%02d %llx %llx\n", + r->cpu, r->time, r->type, (long long unsigned int) r->data[0], + (long long unsigned int) r->data[1]); +} + +static void cj_dump_internal (u8 filter0_enable, u64 filter0, + u8 filter1_enable, u64 filter1) +{ + cj_main_t * cjm = &cj_main; + cj_record_t * r; + u32 i, index; + + if (cjm->num_records == 0) + { + fprintf (stderr, "CJ not configured...\n"); + return; + } + + if (cjm->tail == (u64)~0) + { + fprintf (stderr, "No data collected...\n"); + return; + } + + /* Has the trace wrapped? */ + index = (cjm->tail+1) & (cjm->num_records - 1); + r = &(cjm->records[index]); + + if (r->cpu != (u32)~0) + { + /* Yes, dump from tail + 1 to the end */ + for (i = index; i < cjm->num_records; i++) + { + if (filter0_enable && (r->data[0] != filter0)) + goto skip; + if (filter1_enable && (r->data[1] != filter1)) + goto skip; + cj_dump_one_record (r); + skip: + r++; + } + } + /* dump from the beginning through the final tail */ + r = cjm->records; + for (i = 0; i <= cjm->tail; i++) + { + if (filter0_enable && (r->data[0] != filter0)) + goto skip2; + if (filter1_enable && (r->data[1] != filter1)) + goto skip2; + cj_dump_one_record (r); + skip2: + r++; + } +} + +void cj_dump (void) +{ + cj_dump_internal (0, 0, 0, 0); +} + +void cj_dump_filter_data0 (u64 filter0) +{ + cj_dump_internal (1/* enable f0 */, filter0, 0, 0); +} + +void cj_dump_filter_data1 (u64 filter1) +{ + cj_dump_internal (0, 0, 1 /* enable f1 */, filter1); +} + +void cj_dump_filter_data12 (u64 filter0, u64 filter1) +{ + cj_dump_internal (1, 
filter0, 1, filter1); +} + +static clib_error_t * +cj_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + int is_enable = -1; + int is_dump = -1; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { + if (unformat (input, "enable") || unformat (input, "on")) + is_enable = 1; + else if (unformat (input, "disable") || unformat (input, "off")) + is_enable = 0; + else if (unformat (input, "dump")) + is_dump = 1; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (is_enable >= 0) + cj_enable_disable (is_enable); + + if (is_dump > 0) + cj_dump (); + + return 0; +} + +VLIB_CLI_COMMAND (cj_command,static) = { + .path = "cj", + .short_help = "cj", + .function = cj_command_fn, +}; + diff --git a/vlib/vlib/unix/cj.h b/vlib/vlib/unix/cj.h new file mode 100644 index 00000000000..3c37f2bf22f --- /dev/null +++ b/vlib/vlib/unix/cj.h @@ -0,0 +1,68 @@ +/* + *------------------------------------------------------------------ + * cj.h + * + * Copyright (c) 2013 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------ + */ + +#ifndef __included_cj_h__ +#define __included_cj_h__ + +typedef struct { + f64 time; + u32 cpu; + u32 type; + u64 data[2]; +} cj_record_t; + +typedef struct { + volatile u64 tail; + cj_record_t * records; + u32 num_records; + volatile u32 enable; + + vlib_main_t * vlib_main; +} cj_main_t; + +void cj_log (u32 type, void * data0, void * data1); + +/* + * Supply in application main, so we can log from any library... + * Declare a weak reference in the library, off you go. + */ + +#define DECLARE_CJ_GLOBAL_LOG \ +void cj_global_log (unsigned type, void * data0, void * data1) \ + __attribute__ ((weak)); \ + \ +unsigned __cj_type; \ +void * __cj_data0; \ +void * __cj_data1; \ + \ +void \ +cj_global_log (unsigned type, void * data0, void * data1) \ +{ \ + __cj_type = type; \ + __cj_data0 = data0; \ + __cj_data1 = data1; \ +} + +#define CJ_GLOBAL_LOG_PROTOTYPE +void cj_global_log (unsigned type, void * data0, void * data1) \ + __attribute__ ((weak)); \ + +void cj_stop(void); + +#endif /* __included_cj_h__ */ diff --git a/vlib/vlib/unix/cli.c b/vlib/vlib/unix/cli.c new file mode 100644 index 00000000000..3cb13fc8550 --- /dev/null +++ b/vlib/vlib/unix/cli.c @@ -0,0 +1,900 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * cli.c: Unix stdin/socket CLI. 
+ * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> + +#include <fcntl.h> +#include <sys/stat.h> +#include <termios.h> +#include <unistd.h> +#include <arpa/telnet.h> + +typedef struct { + u32 unix_file_index; + + /* Vector of output pending write to file descriptor. */ + u8 * output_vector; + + /* Vector of input saved by Unix input node to be processed by + CLI process. */ + u8 * input_vector; + + u8 has_history; + u8 ** command_history; + u8 * current_command; + i32 excursion; + u32 history_limit; + u8 * search_key; + int search_mode; + + u32 process_node_index; +} unix_cli_file_t; + +always_inline void +unix_cli_file_free (unix_cli_file_t * f) +{ + vec_free (f->output_vector); + vec_free (f->input_vector); +} + +typedef struct { + /* Prompt string for CLI. 
*/ + u8 * cli_prompt; + + unix_cli_file_t * cli_file_pool; + + u32 * unused_cli_process_node_indices; + + /* File pool index of current input. */ + u32 current_input_file_index; +} unix_cli_main_t; + +static unix_cli_main_t unix_cli_main; + +static void +unix_cli_add_pending_output (unix_file_t * uf, + unix_cli_file_t * cf, + u8 * buffer, + uword buffer_bytes) +{ + unix_main_t * um = &unix_main; + + vec_add (cf->output_vector, buffer, buffer_bytes); + if (vec_len (cf->output_vector) > 0) + { + int skip_update = 0 != (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); + uf->flags |= UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + if (! skip_update) + um->file_update (uf, UNIX_FILE_UPDATE_MODIFY); + } +} + +static void +unix_cli_del_pending_output (unix_file_t * uf, + unix_cli_file_t * cf, + uword n_bytes) +{ + unix_main_t * um = &unix_main; + + vec_delete (cf->output_vector, n_bytes, 0); + if (vec_len (cf->output_vector) <= 0) + { + int skip_update = 0 == (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); + uf->flags &= ~UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + if (! skip_update) + um->file_update (uf, UNIX_FILE_UPDATE_MODIFY); + } +} + +/* VLIB cli output function. 
*/ +static void unix_vlib_cli_output (uword cli_file_index, + u8 * buffer, + uword buffer_bytes) +{ + unix_main_t * um = &unix_main; + unix_cli_main_t * cm = &unix_cli_main; + unix_cli_file_t * cf; + unix_file_t * uf; + int n; + + cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); + uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + n = 0; + if (vec_len (cf->output_vector) == 0) + n = write (uf->file_descriptor, buffer, buffer_bytes); + if (n < 0 && errno != EAGAIN) + clib_unix_warning ("write"); + + else if ((word) n < (word) buffer_bytes) + { + if (n < 0) n = 0; + unix_cli_add_pending_output (uf, cf, buffer + n, buffer_bytes - n); + } +} + +static int unix_cli_line_edit (unix_main_t * um, unix_cli_file_t * cf) +{ + unix_file_t * uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + u8 * prev; + int i, j, delta; + + for (i = 0; i < vec_len (cf->input_vector); i++) + { + switch (cf->input_vector[i]) + { + case 0: + continue; + + case '?': + /* Erase the current command (if any) plus ?*/ + for (j = 0; j < (vec_len (cf->current_command)+1); j++) + unix_cli_add_pending_output (uf, cf, (u8 *) "\b \b", 3); + + unix_cli_add_pending_output (uf, cf, (u8 *) "\r\nHistory:\r\n", 12); + + for (j = 0; j < vec_len (cf->command_history); j++) + { + unix_cli_add_pending_output (uf, cf, cf->command_history[j], + vec_len(cf->command_history[j])); + unix_cli_add_pending_output (uf, cf, (u8 *) "\r\n", 2); + } + goto crlf; + + /* ^R - reverse search */ + case 'R' - '@': + case 'S' - '@': + if (cf->search_mode == 0) + { + /* Erase the current command (if any) plus ^R */ + for (j = 0; j < (vec_len (cf->current_command)+2); j++) + unix_cli_add_pending_output (uf, cf, (u8 *) "\b \b", 3); + + vec_reset_length (cf->search_key); + vec_reset_length (cf->current_command); + if (cf->input_vector[i] == 'R' - '@') + cf->search_mode = -1; + else + cf->search_mode = 1; + } + else + { + if (cf->input_vector[i] == 'R' - '@') + cf->search_mode = -1; + else + 
cf->search_mode = 1; + + cf->excursion += cf->search_mode; + unix_cli_add_pending_output (uf, cf, (u8 *) "\b \b", 3); + goto search_again; + } + break; + + /* ^U - line-kill */ + case 'U'-'@': + /* Erase the command, plus ^U */ + for (j = 0; j < (vec_len (cf->current_command)+2); j++) + unix_cli_add_pending_output (uf, cf, (u8 *) "\b \b", 3); + vec_reset_length (cf->current_command); + cf->search_mode = 0; + continue; + + /* ^P - previous, ^N - next */ + case 'P' - '@': + case 'N' - '@': + cf->search_mode = 0; + /* Erase the command, plus ^P */ + for (j = 0; j < (vec_len (cf->current_command)+2); j++) + unix_cli_add_pending_output (uf, cf, (u8 *) "\b \b", 3); + vec_reset_length (cf->current_command); + if (vec_len (cf->command_history)) + { + if (cf->input_vector[i] == 'P' - '@') + delta = -1; + else + delta = 1; + + cf->excursion += delta; + + if (cf->excursion > (i32) vec_len (cf->command_history) -1) + cf->excursion = 0; + else if (cf->excursion < 0) + cf->excursion = vec_len (cf->command_history) -1; + + prev = cf->command_history [cf->excursion]; + vec_validate (cf->current_command, vec_len(prev)-1); + + memcpy (cf->current_command, prev, vec_len(prev)); + _vec_len (cf->current_command) = vec_len(prev); + unix_cli_add_pending_output (uf, cf, cf->current_command, + vec_len (cf->current_command)); + break; + } + break; + + case 0x7f: + case 'H' - '@': + for (j = 0; j < 2; j++) + unix_cli_add_pending_output (uf, cf, (u8 *) "\b \b", 3); + if (vec_len (cf->current_command)) + { + unix_cli_add_pending_output (uf, cf, (u8 *) "\b \b", 3); + _vec_len (cf->current_command)--; + } + cf->search_mode = 0; + cf->excursion = 0; + cf->search_mode = 0; + vec_reset_length (cf->search_key); + break; + + case '\r': + case '\n': + crlf: + vec_add1 (cf->current_command, '\r'); + vec_add1 (cf->current_command, '\n'); + unix_cli_add_pending_output (uf, cf, (u8 *) "\b\b \b\b\r\n", 8); + + vec_validate (cf->input_vector, vec_len(cf->current_command)-1); + memcpy (cf->input_vector, 
cf->current_command, + vec_len(cf->current_command)); + _vec_len(cf->input_vector) = _vec_len (cf->current_command); + + if (vec_len(cf->command_history) >= cf->history_limit) + { + vec_free (cf->command_history[0]); + vec_delete (cf->command_history, 1, 0); + } + /* Don't add blank lines to the cmd history */ + if (vec_len (cf->current_command) > 2) + { + _vec_len (cf->current_command) -= 2; + vec_add1 (cf->command_history, cf->current_command); + cf->current_command = 0; + } + else + vec_reset_length (cf->current_command); + cf->excursion = 0; + cf->search_mode = 0; + vec_reset_length (cf->search_key); + return 0; + + /* telnet "mode character" blort, echo but don't process. */ + case 0xff: + unix_cli_add_pending_output (uf, cf, cf->input_vector + i, + 6); + i += 6; + continue; + + default: + if (cf->search_mode) + { + int j, k, limit, offset; + u8 * item; + + vec_add1 (cf->search_key, cf->input_vector[i]); + + search_again: + for (j = 0; j < vec_len(cf->command_history); j++) + { + if (cf->excursion > (i32) vec_len (cf->command_history) -1) + cf->excursion = 0; + else if (cf->excursion < 0) + cf->excursion = vec_len (cf->command_history) -1; + + item = cf->command_history[cf->excursion]; + + limit = (vec_len(cf->search_key) > vec_len (item)) ? 
+ vec_len(item) : vec_len (cf->search_key); + + for (offset = 0; offset <= vec_len(item) - limit; offset++) + { + for (k = 0; k < limit; k++) + { + if (item[k+offset] != cf->search_key[k]) + goto next_offset; + } + goto found_at_offset; + + next_offset: + ; + } + goto next; + + found_at_offset: + for (j = 0; j < vec_len (cf->current_command)+1; j++) + unix_cli_add_pending_output (uf, cf, (u8 *) "\b \b", 3); + + vec_validate (cf->current_command, vec_len(item)-1); + + memcpy (cf->current_command, item, vec_len(item)); + _vec_len (cf->current_command) = vec_len(item); + unix_cli_add_pending_output (uf, cf, cf->current_command, + vec_len (cf->current_command)); + goto found; + + next: + cf->excursion += cf->search_mode; + } + + unix_cli_add_pending_output (uf, cf, (u8 *)"\r\nno match..", 12); + vec_reset_length (cf->search_key); + vec_reset_length (cf->current_command); + cf->search_mode = 0; + goto crlf; + } + else + vec_add1 (cf->current_command, cf->input_vector[i]); + + found: + + break; + } + } + vec_reset_length(cf->input_vector); + return 1; +} + +static void unix_cli_process_input (unix_cli_main_t * cm, uword cli_file_index) +{ + unix_main_t * um = &unix_main; + unix_file_t * uf; + unix_cli_file_t * cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); + unformat_input_t input; + int vlib_parse_eval (u8 *); + + /* Try vlibplex first. Someday... */ + if (0 && vlib_parse_eval (cf->input_vector) == 0) + goto done; + + /* Line edit, echo, etc. */ + if (cf->has_history && unix_cli_line_edit (um, cf)) + return; + + if (um->log_fd) + { + static u8 * lv; + vec_reset_length (lv); + lv = format (lv, "%U[%d]: %v", + format_timeval, + 0 /* current bat-time */, + 0 /* current bat-format */, + cli_file_index, + cf->input_vector); + { + int rv __attribute__((unused)) = + write (um->log_fd, lv, vec_len(lv)); + } + } + + unformat_init_vector (&input, cf->input_vector); + + /* Remove leading white space from input. 
*/ + (void) unformat (&input, ""); + + cm->current_input_file_index = cli_file_index; + + if (unformat_check_input (&input) != UNFORMAT_END_OF_INPUT) + vlib_cli_input (um->vlib_main, &input, unix_vlib_cli_output, cli_file_index); + + /* Re-fetch pointer since pool may have moved. */ + cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); + + /* Zero buffer since otherwise unformat_free will call vec_free on it. */ + input.buffer = 0; + + unformat_free (&input); + + /* Re-use input vector. */ +done: + _vec_len (cf->input_vector) = 0; + + /* Prompt. */ + uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + unix_cli_add_pending_output (uf, cf, + cm->cli_prompt, + vec_len (cm->cli_prompt)); +} + +static void unix_cli_kill (unix_cli_main_t * cm, uword cli_file_index) +{ + unix_main_t * um = &unix_main; + unix_cli_file_t * cf; + unix_file_t * uf; + int i; + + cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); + uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + + /* Quit/EOF on stdin means quit program. 
*/ + if (uf->file_descriptor == 0) + clib_longjmp (&um->vlib_main->main_loop_exit, VLIB_MAIN_LOOP_EXIT_CLI); + + vec_free (cf->current_command); + vec_free (cf->search_key); + + for (i = 0; i < vec_len (cf->command_history); i++) + vec_free (cf->command_history[i]); + + vec_free (cf->command_history); + + unix_file_del (um, uf); + + unix_cli_file_free (cf); + pool_put (cm->cli_file_pool, cf); +} + +typedef enum { + UNIX_CLI_PROCESS_EVENT_READ_READY, + UNIX_CLI_PROCESS_EVENT_QUIT, +} unix_cli_process_event_type_t; + +static uword +unix_cli_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, + vlib_frame_t * f) +{ + unix_cli_main_t * cm = &unix_cli_main; + uword i, * data = 0; + + while (1) + { + unix_cli_process_event_type_t event_type; + vlib_process_wait_for_event (vm); + event_type = vlib_process_get_events (vm, &data); + + switch (event_type) + { + case UNIX_CLI_PROCESS_EVENT_READ_READY: + for (i = 0; i < vec_len (data); i++) + unix_cli_process_input (cm, data[i]); + break; + + case UNIX_CLI_PROCESS_EVENT_QUIT: + /* Kill this process. */ + for (i = 0; i < vec_len (data); i++) + unix_cli_kill (cm, data[i]); + goto done; + } + + if (data) + _vec_len (data) = 0; + } + + done: + vec_free (data); + + vlib_node_set_state (vm, rt->node_index, VLIB_NODE_STATE_DISABLED); + + /* Add node index so we can re-use this process later. */ + vec_add1 (cm->unused_cli_process_node_indices, rt->node_index); + + return 0; +} + +static clib_error_t * unix_cli_write_ready (unix_file_t * uf) +{ + unix_cli_main_t * cm = &unix_cli_main; + unix_cli_file_t * cf; + int n; + + cf = pool_elt_at_index (cm->cli_file_pool, uf->private_data); + + /* Flush output vector. 
*/ + n = write (uf->file_descriptor, + cf->output_vector, vec_len (cf->output_vector)); + + if (n < 0 && errno != EAGAIN) + return clib_error_return_unix (0, "write"); + + else if (n > 0) + unix_cli_del_pending_output (uf, cf, n); + + return /* no error */ 0; +} + +static clib_error_t * unix_cli_read_ready (unix_file_t * uf) +{ + unix_main_t * um = &unix_main; + unix_cli_main_t * cm = &unix_cli_main; + unix_cli_file_t * cf; + uword l; + int n, n_read, n_try; + + cf = pool_elt_at_index (cm->cli_file_pool, uf->private_data); + + n = n_try = 4096; + while (n == n_try) { + l = vec_len (cf->input_vector); + vec_resize (cf->input_vector, l + n_try); + + n = read (uf->file_descriptor, cf->input_vector + l, n_try); + + /* Error? */ + if (n < 0 && errno != EAGAIN) + return clib_error_return_unix (0, "read"); + + n_read = n < 0 ? 0 : n; + _vec_len (cf->input_vector) = l + n_read; + } + + if (! (n < 0)) + vlib_process_signal_event (um->vlib_main, + cf->process_node_index, + (n_read == 0 + ? UNIX_CLI_PROCESS_EVENT_QUIT + : UNIX_CLI_PROCESS_EVENT_READ_READY), + /* event data */ uf->private_data); + + return /* no error */ 0; +} + +static u32 unix_cli_file_add (unix_cli_main_t * cm, char * name, int fd) +{ + unix_main_t * um = &unix_main; + unix_cli_file_t * cf; + unix_file_t * uf, template = {0}; + vlib_main_t * vm = um->vlib_main; + vlib_node_t * n; + + name = (char *) format (0, "unix-cli-%s", name); + + if (vec_len (cm->unused_cli_process_node_indices) > 0) + { + uword l = vec_len (cm->unused_cli_process_node_indices); + + /* Find node and give it new name. 
*/ + n = vlib_get_node (vm, cm->unused_cli_process_node_indices[l - 1]); + vec_free (n->name); + n->name = (u8 *) name; + + vlib_node_set_state (vm, n->index, VLIB_NODE_STATE_POLLING); + + _vec_len (cm->unused_cli_process_node_indices) = l - 1; + } + else + { + static vlib_node_registration_t r = { + .function = unix_cli_process, + .type = VLIB_NODE_TYPE_PROCESS, + .process_log2_n_stack_bytes = 14, + }; + + r.name = name; + vlib_register_node (vm, &r); + vec_free (name); + + n = vlib_get_node (vm, r.index); + } + + pool_get (cm->cli_file_pool, cf); + memset (cf, 0, sizeof (*cf)); + + template.read_function = unix_cli_read_ready; + template.write_function = unix_cli_write_ready; + template.file_descriptor = fd; + template.private_data = cf - cm->cli_file_pool; + + cf->process_node_index = n->index; + cf->unix_file_index = unix_file_add (um, &template); + cf->output_vector = 0; + cf->input_vector = 0; + + uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + + /* Prompt. */ + unix_cli_add_pending_output (uf, cf, + cm->cli_prompt, vec_len (cm->cli_prompt)); + + vlib_start_process (vm, n->runtime_index); + return cf - cm->cli_file_pool; +} + +static clib_error_t * unix_cli_listen_read_ready (unix_file_t * uf) +{ + unix_main_t * um = &unix_main; + unix_cli_main_t * cm = &unix_cli_main; + clib_socket_t * s = &um->cli_listen_socket; + clib_socket_t client; + char * client_name; + clib_error_t * error; + unix_cli_file_t * cf; + u32 cf_index; + + error = clib_socket_accept (s, &client); + if (error) + return error; + + client_name = (char *) format (0, "%U%c", format_sockaddr, &client.peer, 0); + + cf_index = unix_cli_file_add (cm, client_name, client.fd); + cf = pool_elt_at_index (cm->cli_file_pool, cf_index); + + /* No longer need CLIB version of socket. 
*/ + clib_socket_free (&client); + + vec_free (client_name); + + /* if we're supposed to run telnet session in character mode (default) */ + if (um->cli_line_mode == 0) + { + u8 charmode_option[6]; + + cf->has_history = 1; + cf->history_limit = um->cli_history_limit ? um->cli_history_limit : 50; + + /* + * Set telnet client character mode, echo on, suppress "go-ahead" + * Empirically, this sequence works. YMMV. + */ + + /* Tell the client no linemode, echo */ + charmode_option[0] = IAC; + charmode_option[1] = DONT; + charmode_option[2] = TELOPT_LINEMODE; + charmode_option[3] = IAC; + charmode_option[4] = DO; + charmode_option[5] = TELOPT_SGA; + + uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + + unix_cli_add_pending_output (uf, cf, charmode_option, + ARRAY_LEN(charmode_option)); + } + + return error; +} + +static clib_error_t * +unix_cli_config (vlib_main_t * vm, unformat_input_t * input) +{ + unix_main_t * um = &unix_main; + unix_cli_main_t * cm = &unix_cli_main; + int flags, standard_input_fd; + clib_error_t * error; + + /* We depend on unix flags being set. */ + if ((error = vlib_call_config_function (vm, unix_config))) + return error; + + if (um->flags & UNIX_FLAG_INTERACTIVE) + { + standard_input_fd = 0; + + /* Set stdin to be non-blocking. */ + if ((flags = fcntl (standard_input_fd, F_GETFL, 0)) < 0) + flags = 0; + fcntl (standard_input_fd, F_SETFL, flags | O_NONBLOCK); + + unix_cli_file_add (cm, "stdin", standard_input_fd); + } + + { + /* CLI listen. */ + clib_socket_t * s = &um->cli_listen_socket; + unix_file_t template = {0}; + + s->flags = SOCKET_IS_SERVER; /* listen, don't connect */ + + error = clib_socket_init (s); + if (error) + return error; + + template.read_function = unix_cli_listen_read_ready; + template.file_descriptor = s->fd; + + unix_file_add (um, &template); + } + + /* Set CLI prompt. */ + if (! 
cm->cli_prompt) + cm->cli_prompt = format (0, "VLIB: "); + + return 0; +} + +VLIB_CONFIG_FUNCTION (unix_cli_config, "unix-cli"); + +void vlib_unix_cli_set_prompt (char * prompt) +{ + char * fmt = (prompt[strlen(prompt)-1] == ' ') ? "%s" : "%s "; + unix_cli_main_t * cm = &unix_cli_main; + if (cm->cli_prompt) + vec_free (cm->cli_prompt); + cm->cli_prompt = format (0, fmt, prompt); +} + +static clib_error_t * +unix_cli_quit (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unix_cli_main_t * cm = &unix_cli_main; + + vlib_process_signal_event (vm, + vlib_current_process (vm), + UNIX_CLI_PROCESS_EVENT_QUIT, + cm->current_input_file_index); + return 0; +} + +VLIB_CLI_COMMAND (unix_cli_quit_command, static) = { + .path = "quit", + .short_help = "Exit CLI", + .function = unix_cli_quit, +}; + +static clib_error_t * +unix_cli_exec (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + char * file_name; + int fd; + unformat_input_t sub_input; + clib_error_t * error; + + file_name = 0; + fd = -1; + error = 0; + + if (! unformat (input, "%s", &file_name)) + { + error = clib_error_return (0, "expecting file name, got `%U'", + format_unformat_error, input); + goto done; + } + + fd = open (file_name, O_RDONLY); + if (fd < 0) + { + error = clib_error_return_unix (0, "failed to open `%s'", file_name); + goto done; + } + + /* Make sure its a regular file. */ + { + struct stat s; + + if (fstat (fd, &s) < 0) + { + error = clib_error_return_unix (0, "failed to stat `%s'", file_name); + goto done; + } + + if (! 
(S_ISREG (s.st_mode) || S_ISLNK (s.st_mode))) + { + error = clib_error_return (0, "not a regular file `%s'", file_name); + goto done; + } + } + + unformat_init_unix_file (&sub_input, fd); + + vlib_cli_input (vm, &sub_input, 0, 0); + unformat_free (&sub_input); + + done: + if (fd > 0) + close (fd); + vec_free (file_name); + + return error; +} + +VLIB_CLI_COMMAND (cli_exec, static) = { + .path = "exec", + .short_help = "Execute commands from file", + .function = unix_cli_exec, +}; + +static clib_error_t * +unix_show_errors (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unix_main_t * um = &unix_main; + clib_error_t * error = 0; + int i, n_errors_to_show; + unix_error_history_t * unix_errors = 0; + + n_errors_to_show = 1 << 30; + + if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (! unformat (input, "%d", &n_errors_to_show)) + { + error = clib_error_return (0, "expecting integer number of errors to show, got `%U'", + format_unformat_error, input); + goto done; + } + } + + n_errors_to_show = clib_min (ARRAY_LEN (um->error_history), n_errors_to_show); + + i = um->error_history_index > 0 ? um->error_history_index - 1 : ARRAY_LEN (um->error_history) - 1; + + while (n_errors_to_show > 0) + { + unix_error_history_t * eh = um->error_history + i; + + if (! 
eh->error) + break; + + vec_add1 (unix_errors, eh[0]); + n_errors_to_show -= 1; + if (i == 0) + i = ARRAY_LEN (um->error_history) - 1; + else + i--; + } + + if (vec_len (unix_errors) == 0) + vlib_cli_output (vm, "no Unix errors so far"); + else + { + vlib_cli_output (vm, "%Ld total errors seen", um->n_total_errors); + for (i = vec_len (unix_errors) - 1; i >= 0; i--) + { + unix_error_history_t * eh = vec_elt_at_index (unix_errors, i); + vlib_cli_output (vm, "%U: %U", + format_time_interval, "h:m:s:u", eh->time, + format_clib_error, eh->error); + } + vlib_cli_output (vm, "%U: time now", + format_time_interval, "h:m:s:u", vlib_time_now (vm)); + } + + done: + vec_free (unix_errors); + return error; +} + +VLIB_CLI_COMMAND (cli_unix_show_errors, static) = { + .path = "show unix-errors", + .short_help = "Show Unix system call error history", + .function = unix_show_errors, +}; + +static clib_error_t * +unix_cli_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (unix_cli_init); diff --git a/vlib/vlib/unix/input.c b/vlib/vlib/unix/input.c new file mode 100644 index 00000000000..ea10e4fc354 --- /dev/null +++ b/vlib/vlib/unix/input.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * input.c: Unix file input + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <signal.h> + +/* FIXME autoconf */ +#define HAVE_LINUX_EPOLL + +#ifdef HAVE_LINUX_EPOLL + +#include <sys/epoll.h> + +typedef struct { + int epoll_fd; + struct epoll_event * epoll_events; + + /* Statistics. 
*/ + u64 epoll_files_ready; + u64 epoll_waits; +} linux_epoll_main_t; + +static linux_epoll_main_t linux_epoll_main; + +static void +linux_epoll_file_update (unix_file_t * f, + unix_file_update_type_t update_type) +{ + unix_main_t * um = &unix_main; + linux_epoll_main_t * em = &linux_epoll_main; + struct epoll_event e; + + memset (&e, 0, sizeof (e)); + + e.events = EPOLLIN; + if (f->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE) + e.events |= EPOLLOUT; + e.data.u32 = f - um->file_pool; + + if (epoll_ctl (em->epoll_fd, + (update_type == UNIX_FILE_UPDATE_ADD + ? EPOLL_CTL_ADD + : (update_type == UNIX_FILE_UPDATE_MODIFY + ? EPOLL_CTL_MOD + : EPOLL_CTL_DEL)), + f->file_descriptor, + &e) < 0) + clib_warning ("epoll_ctl"); +} + +static uword +linux_epoll_input (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + unix_main_t * um = &unix_main; + linux_epoll_main_t * em = &linux_epoll_main; + struct epoll_event * e; + int n_fds_ready; + + { + vlib_node_main_t * nm = &vm->node_main; + u64 t = nm->cpu_time_next_process_ready; + f64 timeout; + int timeout_ms, max_timeout_ms = 10; + f64 vector_rate = vlib_last_vectors_per_main_loop (vm); + + if (t == ~0ULL) + { + timeout = 10e-3; + timeout_ms = max_timeout_ms; + } + else + { + timeout = + (((i64) t - (i64) clib_cpu_time_now ()) + * vm->clib_time.seconds_per_clock) + /* subtract off some slop time */ - 50e-6; + timeout_ms = timeout * 1e3; + + /* Must be between 1 and 10 ms. */ + timeout_ms = clib_max (1, timeout_ms); + timeout_ms = clib_min (max_timeout_ms, timeout_ms); + } + + /* If we still have input nodes polling (e.g. vnet packet generator) + don't sleep. */ + if (nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] > 0) + timeout_ms = 0; + + if (vector_rate > 1) + { + /* When busy don't wait & only epoll for input every 8 times + through main loop. */ + timeout_ms = 0; + node->input_main_loops_per_call = 1024; + } + else + /* We're not busy; go to sleep for a while. 
*/ + node->input_main_loops_per_call = 0; + + /* Allow any signal to wakeup our sleep. */ + { + static sigset_t unblock_all_signals; + n_fds_ready = epoll_pwait (em->epoll_fd, + em->epoll_events, + vec_len (em->epoll_events), + timeout_ms, + &unblock_all_signals); + + /* This kludge is necessary to run over absurdly old kernels */ + if (n_fds_ready < 0 && errno == ENOSYS) + { + n_fds_ready = epoll_wait (em->epoll_fd, + em->epoll_events, + vec_len (em->epoll_events), + timeout_ms); + } + } + } + + if (n_fds_ready < 0) + { + if (unix_error_is_fatal (errno)) + vlib_panic_with_error (vm, clib_error_return_unix (0, "epoll_wait")); + + /* non fatal error (e.g. EINTR). */ + return 0; + } + + em->epoll_waits += 1; + em->epoll_files_ready += n_fds_ready; + + for (e = em->epoll_events; e < em->epoll_events + n_fds_ready; e++) + { + u32 i = e->data.u32; + unix_file_t * f = pool_elt_at_index (um->file_pool, i); + clib_error_t * errors[4]; + int n_errors = 0; + + if (PREDICT_TRUE (! (e->events & EPOLLERR))) + { + if (e->events & EPOLLIN) + { + errors[n_errors] = f->read_function (f); + n_errors += errors[n_errors] != 0; + } + if (e->events & EPOLLOUT) + { + errors[n_errors] = f->write_function (f); + n_errors += errors[n_errors] != 0; + } + } + else + { + if (f->error_function) + { + errors[n_errors] = f->error_function (f); + n_errors += errors[n_errors] != 0; + } + } + + ASSERT (n_errors < ARRAY_LEN (errors)); + for (i = 0; i < n_errors; i++) + { + unix_save_error (um, errors[i]); + } + } + + return 0; +} + +VLIB_REGISTER_NODE (linux_epoll_input_node,static) = { + .function = linux_epoll_input, + .type = VLIB_NODE_TYPE_PRE_INPUT, + .name = "unix-epoll-input", +}; + +clib_error_t * +linux_epoll_input_init (vlib_main_t * vm) +{ + linux_epoll_main_t * em = &linux_epoll_main; + unix_main_t * um = &unix_main; + + /* Allocate some events. 
*/ + vec_resize (em->epoll_events, VLIB_FRAME_SIZE); + + em->epoll_fd = epoll_create (vec_len (em->epoll_events)); + if (em->epoll_fd < 0) + return clib_error_return_unix (0, "epoll_create"); + + um->file_update = linux_epoll_file_update; + + return 0; +} + +VLIB_INIT_FUNCTION (linux_epoll_input_init); + +#endif /* HAVE_LINUX_EPOLL */ + +static clib_error_t * +unix_input_init (vlib_main_t * vm) +{ + return vlib_call_init_function (vm, linux_epoll_input_init); +} + +VLIB_INIT_FUNCTION (unix_input_init); diff --git a/vlib/vlib/unix/main.c b/vlib/vlib/unix/main.c new file mode 100644 index 00000000000..b85f3e73326 --- /dev/null +++ b/vlib/vlib/unix/main.c @@ -0,0 +1,471 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * main.c: Unix main routine + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vlib/unix/plugin.h> + +#include <signal.h> +#include <sys/ucontext.h> +#include <syslog.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +unix_main_t unix_main; + +static clib_error_t * +unix_main_init (vlib_main_t * vm) +{ + unix_main_t * um = &unix_main; + um->vlib_main = vm; + return vlib_call_init_function (vm, unix_input_init); +} + +VLIB_INIT_FUNCTION (unix_main_init); + +static void unix_signal_handler (int signum, siginfo_t * si, ucontext_t * uc) +{ + uword fatal; + u8 * msg = 0; + + msg = format (msg, "received signal %U, PC %U", + format_signal, signum, + format_ucontext_pc, uc); + + if (signum == SIGSEGV) + msg = format (msg, ", faulting address %p", si->si_addr); + + switch (signum) + { + /* these (caught) signals cause the application to exit */ + case SIGTERM: + if (unix_main.vlib_main->main_loop_exit_set) + { + syslog (LOG_ERR | LOG_DAEMON, "received SIGTERM, exiting..."); + + clib_longjmp (&unix_main.vlib_main->main_loop_exit, + VLIB_MAIN_LOOP_EXIT_CLI); + } + case SIGQUIT: + case SIGINT: + case SIGILL: + case SIGBUS: + case SIGSEGV: + case SIGHUP: + case SIGFPE: + fatal = 1; + break; + + /* by default, print a message and continue */ + default: + fatal = 0; + break; + } + + /* Null terminate. 
*/ + vec_add1 (msg, 0); + + if (fatal) + { + syslog (LOG_ERR | LOG_DAEMON, "%s", msg); + os_exit (1); + } + else + clib_warning ("%s", msg); + + vec_free (msg); +} + +static clib_error_t * +setup_signal_handlers (unix_main_t * um) +{ + uword i; + struct sigaction sa; + + for (i = 1; i < 32; i++) + { + memset (&sa, 0, sizeof (sa)); + sa.sa_sigaction = (void *) unix_signal_handler; + sa.sa_flags = SA_SIGINFO; + + switch (i) + { + /* these signals take the default action */ + case SIGABRT: + case SIGKILL: + case SIGSTOP: + case SIGUSR1: + case SIGUSR2: + continue; + + /* ignore SIGPIPE, SIGCHLD */ + case SIGPIPE: + case SIGCHLD: + sa.sa_sigaction = (void *) SIG_IGN; + break; + + /* catch and handle all other signals */ + default: + break; + } + + if (sigaction (i, &sa, 0) < 0) + return clib_error_return_unix (0, "sigaction %U", format_signal, i); + } + + return 0; +} + +static void unix_error_handler (void * arg, u8 * msg, int msg_len) +{ + unix_main_t * um = arg; + + /* Echo to stderr when interactive. */ + if (um->flags & UNIX_FLAG_INTERACTIVE) + { + CLIB_UNUSED (int r) = write (2, msg, msg_len); + } + else + { + char save = msg[msg_len - 1]; + + /* Null Terminate. */ + msg[msg_len-1] = 0; + + syslog (LOG_ERR | LOG_DAEMON, "%s", msg); + + msg[msg_len-1] = save; + } +} + +void vlib_unix_error_report (vlib_main_t * vm, clib_error_t * error) +{ + unix_main_t * um = &unix_main; + + if (um->flags & UNIX_FLAG_INTERACTIVE || error == 0) + return; + + { + char save; + u8 * msg; + u32 msg_len; + + msg = error->what; + msg_len = vec_len(msg); + + /* Null Terminate. 
*/ + save = msg[msg_len-1]; + msg[msg_len-1] = 0; + + syslog (LOG_ERR | LOG_DAEMON, "%s", msg); + + msg[msg_len-1] = save; + } +} + +static uword +startup_config_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, + vlib_frame_t * f) +{ + unix_main_t * um = &unix_main; + u8 * buf = 0; + uword l, n = 1; + + vlib_process_suspend (vm, 2.0); + + while (um->unix_config_complete == 0) + vlib_process_suspend (vm, 0.1); + + if (um->startup_config_filename) { + unformat_input_t sub_input; + int fd; + struct stat s; + char *fn = (char *)um->startup_config_filename; + + fd = open (fn, O_RDONLY); + if (fd < 0) { + clib_warning ("failed to open `%s'", fn); + return 0; + } + + if (fstat (fd, &s) < 0) { + clib_warning ("failed to stat `%s'", fn); + bail: + close(fd); + return 0; + } + + if (! (S_ISREG (s.st_mode) || S_ISLNK (s.st_mode))) { + clib_warning ("not a regular file: `%s'", fn); + goto bail; + } + + while (n > 0) + { + l = vec_len (buf); + vec_resize (buf, 4096); + n = read (fd, buf + l, 4096); + if (n > 0) + { + _vec_len (buf) = l + n; + if (n < 4096) + break; + } + else + break; + } + if (um->log_fd && vec_len (buf)) + { + u8 * lv = 0; + lv = format (lv, "%U: ***** Startup Config *****\n%v", + format_timeval, + 0 /* current bat-time */, + 0 /* current bat-format */, + buf); + { + int rv __attribute__((unused)) = + write (um->log_fd, lv, vec_len(lv)); + } + vec_reset_length (lv); + lv = format (lv, "%U: ***** End Startup Config *****\n", + format_timeval, + 0 /* current bat-time */, + 0 /* current bat-format */); + { + int rv __attribute__((unused)) = + write (um->log_fd, lv, vec_len(lv)); + } + vec_free (lv); + } + + if (vec_len(buf)) + { + unformat_init_vector (&sub_input, buf); + vlib_cli_input (vm, &sub_input, 0, 0); + /* frees buf for us */ + unformat_free (&sub_input); + } + close(fd); + } + return 0; +} + +VLIB_REGISTER_NODE (startup_config_node,static) = { + .function = startup_config_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = 
"startup-config-process", +}; + +static clib_error_t * +unix_config (vlib_main_t * vm, unformat_input_t * input) +{ + unix_main_t * um = &unix_main; + clib_error_t * error = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + char * cli_prompt; + if (unformat (input, "interactive")) + um->flags |= UNIX_FLAG_INTERACTIVE; + else if (unformat (input, "nodaemon")) + um->flags |= UNIX_FLAG_NODAEMON; + else if (unformat (input, "cli-prompt %s", &cli_prompt)) + vlib_unix_cli_set_prompt (cli_prompt); + else if (unformat (input, "cli-listen %s", &um->cli_listen_socket.config)) + ; + else if (unformat (input, "cli-line-mode")) + um->cli_line_mode = 1; + else if (unformat (input, "cli-history-limit %d", &um->cli_history_limit)) + ; + else if (unformat (input, "full-coredump")) + { + int fd; + + fd = open ("/proc/self/coredump_filter", O_WRONLY); + if (fd > 0) + { + if (write (fd, "0x6f\n", 5) != 5) + clib_unix_warning ("coredump filter write failed!"); + close(fd); + } + else + clib_unix_warning ("couldn't open /proc/self/coredump_filter"); + } + else if (unformat (input, "startup-config %s", + &um->startup_config_filename)) + ; + else if (unformat (input, "exec %s", + &um->startup_config_filename)) + ; + else if (unformat (input, "log %s", &um->log_filename)) + { + um->log_fd = open ((char *) um->log_filename, + O_CREAT | O_WRONLY | O_APPEND, 0644); + if (um->log_fd < 0) + { + clib_warning ("couldn't open log '%s'\n", um->log_filename); + um->log_fd = 0; + } + else + { + u8 * lv = 0; + lv = format (0, "%U: ***** Start: PID %d *****\n", + format_timeval, + 0 /* current bat-time */, + 0 /* current bat-format */, + getpid()); + { + int rv __attribute__((unused)) = + write (um->log_fd, lv, vec_len(lv)); + } + vec_free (lv); + } + } + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (! 
(um->flags & UNIX_FLAG_INTERACTIVE)) + { + error = setup_signal_handlers (um); + if (error) + return error; + + openlog (vm->name, LOG_CONS | LOG_PERROR | LOG_PID, LOG_DAEMON); + clib_error_register_handler (unix_error_handler, um); + + if (! (um->flags & UNIX_FLAG_NODAEMON) + && daemon (/* chdir to / */ 0, + /* stdin/stdout/stderr -> /dev/null */ 0) < 0) + clib_error_return (0, "daemon () fails"); + } + um->unix_config_complete = 1; + + return 0; +} + +/* unix { ... } configuration. */ +VLIB_CONFIG_FUNCTION (unix_config, "unix"); + +static clib_error_t * +unix_exit (vlib_main_t * vm) +{ + /* Close syslog connection. */ + closelog (); + return 0; +} + +VLIB_MAIN_LOOP_EXIT_FUNCTION (unix_exit); + +u8 **vlib_thread_stacks; + +static char **argv_global; + +static uword thread0 (uword arg) +{ + vlib_main_t * vm = (vlib_main_t *)arg; + unformat_input_t input; + int i; + + unformat_init_command_line (&input, argv_global); + i = vlib_main (vm, &input); + unformat_free (&input); + + return i; + } + +int vlib_unix_main (int argc, char * argv[]) +{ + vlib_main_t * vm = &vlib_global_main; /* one and only time for this! */ + + clib_smp_main_t * sm = &clib_smp_main; + vlib_thread_main_t * tm = &vlib_thread_main; + unformat_input_t input; + u8 * thread_stacks; + clib_error_t * e; + int i; + + argv_global = argv; + vm->name = argv[0]; + vm->heap_base = clib_mem_get_heap (); + ASSERT(vm->heap_base); + + i = vlib_plugin_early_init (vm); + if (i) + return i; + + unformat_init_command_line (&input, argv_global); + vm->init_functions_called = hash_create (0, /* value bytes */ 0); + e = vlib_call_all_config_functions (vm, &input, 1 /* early */); + if (e != 0) + { + clib_error_report(e); + return 1; + } + unformat_free (&input); + + /* allocate N x 1mb stacks, aligned e.g. 
to a 16mb boundary */ + thread_stacks = clib_mem_alloc_aligned + (tm->n_thread_stacks * VLIB_THREAD_STACK_SIZE, + (VLIB_MAX_CPUS << VLIB_LOG2_THREAD_STACK_SIZE)); + + sm->vm_base = thread_stacks; + sm->log2_n_per_cpu_vm_bytes = VLIB_LOG2_THREAD_STACK_SIZE; + + vec_validate (vlib_thread_stacks, tm->n_thread_stacks - 1); + for (i = 0; i < vec_len (vlib_thread_stacks); i++) + { + vlib_thread_stacks[i] = thread_stacks; + + /* + * Disallow writes to the bottom page of the stack, to + * catch stack overflows. + */ + if (mprotect (thread_stacks, 4096, PROT_READ) < 0) + clib_unix_warning ("thread stack"); + + thread_stacks += VLIB_THREAD_STACK_SIZE; + } + + i = clib_calljmp (thread0, (uword) vm, + (void *)(vlib_thread_stacks[0] + VLIB_THREAD_STACK_SIZE)); + return i; +} diff --git a/vlib/vlib/unix/mc_socket.c b/vlib/vlib/unix/mc_socket.c new file mode 100644 index 00000000000..1169203f855 --- /dev/null +++ b/vlib/vlib/unix/mc_socket.c @@ -0,0 +1,972 @@ +/* + * mc_socket.c: socket based multicast for vlib mc + * + * Copyright (c) 2010 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vlib/vlib.h> +#include <vlib/unix/mc_socket.h> + +#include <sys/ioctl.h> /* for FIONBIO */ +#include <netinet/tcp.h> /* for TCP_NODELAY */ +#include <net/if.h> /* for struct ifreq */ + +static u8 * format_socket_peer_id (u8 * s, va_list * args) +{ + u64 peer_id_as_u64 = va_arg (*args, u64); + mc_peer_id_t peer_id; + peer_id.as_u64 = peer_id_as_u64; + u32 a = mc_socket_peer_id_get_address (peer_id); + u32 p = mc_socket_peer_id_get_port (peer_id); + + s = format (s, "%U:%04x", format_network_address, AF_INET, &a, + ntohs (p)); + + return s; +} + +typedef void (mc_msg_handler_t) (mc_main_t * mcm, void * msg, u32 buffer_index); + +always_inline void msg_handler (mc_main_t * mcm, + u32 buffer_index, + u32 handler_frees_buffer, + void * _h) +{ + vlib_main_t * vm = mcm->vlib_main; + mc_msg_handler_t * h = _h; + vlib_buffer_t * b = vlib_get_buffer (vm, buffer_index); + void * the_msg = vlib_buffer_get_current (b); + + h (mcm, the_msg, buffer_index); + if (! handler_frees_buffer) + vlib_buffer_free_one (vm, buffer_index); +} + +static uword +append_buffer_index_to_iovec (vlib_main_t * vm, + u32 buffer_index, + struct iovec ** iovs_return) +{ + struct iovec * i; + vlib_buffer_t * b; + u32 bi = buffer_index; + u32 l = 0; + + while (1) + { + b = vlib_get_buffer (vm, bi); + vec_add2 (*iovs_return, i, 1); + i->iov_base = vlib_buffer_get_current (b); + i->iov_len = b->current_length; + l += i->iov_len; + if (! 
(b->flags & VLIB_BUFFER_NEXT_PRESENT)) + break; + bi = b->next_buffer; + } + + return l; +} + +static clib_error_t * +sendmsg_helper (mc_socket_main_t * msm, + int socket, + struct sockaddr_in * tx_addr, + u32 buffer_index) +{ + vlib_main_t * vm = msm->mc_main.vlib_main; + struct msghdr h; + word n_bytes, n_bytes_tx, n_retries; + + memset (&h, 0, sizeof (h)); + h.msg_name = tx_addr; + h.msg_namelen = sizeof (tx_addr[0]); + + if (msm->iovecs) + _vec_len (msm->iovecs) = 0; + + n_bytes = append_buffer_index_to_iovec (vm, buffer_index, &msm->iovecs); + ASSERT (n_bytes <= msm->mc_main.transport.max_packet_size); + if (n_bytes > msm->mc_main.transport.max_packet_size) + clib_error ("sending packet larger than interace MTU %d bytes", n_bytes); + + h.msg_iov = msm->iovecs; + h.msg_iovlen = vec_len (msm->iovecs); + + n_retries = 0; + while ((n_bytes_tx = sendmsg (socket, &h, /* flags */ 0)) != n_bytes + && errno == EAGAIN) + n_retries++; + if (n_bytes_tx != n_bytes) + { + clib_unix_warning ("sendmsg"); + return 0; + } + if (n_retries) + { + ELOG_TYPE_DECLARE (e) = { + .format = "sendmsg-helper: %d retries", + .format_args = "i4", + }; + struct { u32 retries; } * ed = 0; + + ed = ELOG_DATA (&vm->elog_main, e); + ed->retries = n_retries; + } + return 0; +} + +static clib_error_t * +tx_buffer (void * transport, mc_transport_type_t type, u32 buffer_index) +{ + mc_socket_main_t *msm = (mc_socket_main_t *)transport; + vlib_main_t * vm = msm->mc_main.vlib_main; + mc_multicast_socket_t * ms = &msm->multicast_sockets[type]; + clib_error_t * error; + error = sendmsg_helper (msm, ms->socket, &ms->tx_addr, buffer_index); + if (type != MC_TRANSPORT_USER_REQUEST_TO_RELAY) + vlib_buffer_free_one (vm, buffer_index); + return error; +} + +static clib_error_t * +tx_ack (void *transport, mc_peer_id_t dest_peer_id, u32 buffer_index) +{ + struct sockaddr_in tx_addr; + mc_socket_main_t *msm = (mc_socket_main_t *)transport; + vlib_main_t * vm = msm->mc_main.vlib_main; + clib_error_t * error; + + 
memset (&tx_addr, 0, sizeof (tx_addr)); + tx_addr.sin_family = AF_INET; + tx_addr.sin_addr.s_addr = mc_socket_peer_id_get_address (dest_peer_id); + tx_addr.sin_port = mc_socket_peer_id_get_port (dest_peer_id); + + error = sendmsg_helper (msm, msm->ack_socket, &tx_addr, buffer_index); + vlib_buffer_free_one (vm, buffer_index); + return error; +} + +static clib_error_t * +recvmsg_helper (mc_socket_main_t * msm, + int socket, + struct sockaddr_in * rx_addr, + u32 * buffer_index, + u32 drop_message) +{ + vlib_main_t * vm = msm->mc_main.vlib_main; + vlib_buffer_t * b; + uword n_left, n_alloc, n_mtu, i, i_rx; + const uword buffer_size = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES; + word n_bytes_left; + + /* Make sure we have at least a MTU worth of buffers. */ + n_mtu = msm->rx_mtu_n_buffers; + n_left = vec_len (msm->rx_buffers); + if (n_left < n_mtu) + { + uword max_alloc = 8 * n_mtu; + vec_validate (msm->rx_buffers, max_alloc - 1); + n_alloc = vlib_buffer_alloc (vm, msm->rx_buffers + n_left, max_alloc - n_left); + _vec_len (msm->rx_buffers) = n_left + n_alloc; + } + + ASSERT (vec_len (msm->rx_buffers) >= n_mtu); + vec_validate (msm->iovecs, n_mtu - 1); + + /* Allocate RX buffers from end of rx_buffers. + Turn them into iovecs to pass to readv. 
*/ + i_rx = vec_len (msm->rx_buffers) - 1; + for (i = 0; i < n_mtu; i++) + { + b = vlib_get_buffer (vm, msm->rx_buffers[i_rx - i]); + msm->iovecs[i].iov_base = b->data; + msm->iovecs[i].iov_len = buffer_size; + } + _vec_len (msm->iovecs) = n_mtu; + + { + struct msghdr h; + + memset (&h, 0, sizeof (h)); + if (rx_addr) + { + h.msg_name = rx_addr; + h.msg_namelen = sizeof (rx_addr[0]); + } + h.msg_iov = msm->iovecs; + h.msg_iovlen = vec_len (msm->iovecs); + + n_bytes_left = recvmsg (socket, &h, 0); + if (n_bytes_left < 0) + return clib_error_return_unix (0, "recvmsg"); + } + + if (drop_message) + { + *buffer_index = ~0; + return 0; + } + + *buffer_index = msm->rx_buffers[i_rx]; + while (1) + { + b = vlib_get_buffer (vm, msm->rx_buffers[i_rx]); + + b->flags = 0; + b->current_data = 0; + b->current_length = n_bytes_left < buffer_size ? n_bytes_left : buffer_size; + + n_bytes_left -= buffer_size; + + if (n_bytes_left <= 0) + break; + + i_rx--; + b->flags |= VLIB_BUFFER_NEXT_PRESENT; + b->next_buffer = msm->rx_buffers[i_rx]; + } + + _vec_len (msm->rx_buffers) = i_rx; + + return 0 /* no error */; +} + +static clib_error_t * mastership_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data; + mc_main_t * mcm = &msm->mc_main; + mc_multicast_socket_t * ms = &msm->multicast_sockets[MC_TRANSPORT_MASTERSHIP]; + clib_error_t * error; + u32 bi; + + error = recvmsg_helper (msm, ms->socket, /* rx_addr */ 0, &bi, /* drop_message */ 0); + if (! 
error) + msg_handler (mcm, bi, + /* handler_frees_buffer */ 0, + mc_msg_master_assert_handler); + + return error; +} + +static clib_error_t * to_relay_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data; + mc_main_t *mcm = &msm->mc_main; + vlib_main_t * vm = msm->mc_main.vlib_main; + mc_multicast_socket_t * ms_to_relay = &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_TO_RELAY]; + mc_multicast_socket_t * ms_from_relay = &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY]; + clib_error_t * error; + u32 bi; + u32 is_master = mcm->relay_state == MC_RELAY_STATE_MASTER; + + /* Not the ordering master? Turf the msg */ + error = recvmsg_helper (msm, ms_to_relay->socket, /* rx_addr */ 0, &bi, + /* drop_message */ ! is_master); + + /* If we are the master, number and rebroadcast the msg. */ + if (! error && is_master) + { + vlib_buffer_t * b = vlib_get_buffer (vm, bi); + mc_msg_user_request_t * mp = vlib_buffer_get_current (b); + mp->global_sequence = clib_host_to_net_u32 (mcm->relay_global_sequence); + mcm->relay_global_sequence++; + error = sendmsg_helper (msm, ms_from_relay->socket, &ms_from_relay->tx_addr, bi); + vlib_buffer_free_one (vm, bi); + } + + return error; +} + +static clib_error_t * from_relay_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data; + mc_main_t * mcm = &msm->mc_main; + mc_multicast_socket_t * ms = &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY]; + clib_error_t * error; + u32 bi; + + error = recvmsg_helper (msm, ms->socket, /* rx_addr */ 0, &bi, /* drop_message */ 0); + if (! 
 error)
    {
      /* handler_frees_buffer = 1: the user-request handler owns the buffer. */
      msg_handler (mcm, bi, /* handler_frees_buffer */ 1,
                   mc_msg_user_request_handler);
    }
  return error;
}

/* epoll read callback for the join/leave multicast socket: peek at the
   message type word and dispatch to the request or reply handler. */
static clib_error_t * join_socket_read_ready (unix_file_t * uf)
{
  mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data;
  mc_main_t * mcm = &msm->mc_main;
  vlib_main_t * vm = mcm->vlib_main;
  mc_multicast_socket_t * ms = &msm->multicast_sockets[MC_TRANSPORT_JOIN];
  clib_error_t * error;
  u32 bi;

  error = recvmsg_helper (msm, ms->socket, /* rx_addr */ 0, &bi, /* drop_message */ 0);
  if (! error)
    {
      vlib_buffer_t * b = vlib_get_buffer (vm, bi);
      mc_msg_join_or_leave_request_t * mp = vlib_buffer_get_current (b);

      /* NOTE(review): host_to_net is used here to un-swap a network-order
         field; correct only because the byte swap is an involution —
         net_to_host would read clearer. Confirm intent. */
      switch (clib_host_to_net_u32 (mp->type))
        {
        case MC_MSG_TYPE_join_or_leave_request:
          msg_handler (mcm, bi, /* handler_frees_buffer */ 0,
                       mc_msg_join_or_leave_request_handler);
          break;

        case MC_MSG_TYPE_join_reply:
          msg_handler (mcm, bi, /* handler_frees_buffer */ 0,
                       mc_msg_join_reply_handler);
          break;

        default:
          ASSERT (0);
          break;
        }
    }
  return error;
}

/* epoll read callback for the unicast ACK socket. */
static clib_error_t * ack_socket_read_ready (unix_file_t * uf)
{
  mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data;
  mc_main_t * mcm = &msm->mc_main;
  clib_error_t * error;
  u32 bi;

  error = recvmsg_helper (msm, msm->ack_socket, /* rx_addr */ 0, &bi, /* drop_message */ 0);
  if (! error)
    msg_handler (mcm, bi, /* handler_frees_buffer */ 0,
                 mc_msg_user_ack_handler);
  return error;
}

/* Tear down one catchup connection: drop the fd->catchup mapping, remove the
   epoll registration (which closes the fd), free buffers, return pool slot. */
static void catchup_cleanup (mc_socket_main_t *msm,
                             mc_socket_catchup_t *c,
                             unix_main_t *um, unix_file_t *uf)
{
  hash_unset (msm->catchup_index_by_file_descriptor, uf->file_descriptor);
  unix_file_del (um, uf);
  vec_free (c->input_vector);
  vec_free (c->output_vector);
  pool_put (msm->catchups, c);
}

/* Map a kernel file descriptor back to its catchup pool element, or 0 if
   the descriptor is unknown. */
static mc_socket_catchup_t *
find_catchup_from_file_descriptor (mc_socket_main_t * msm, int file_descriptor)
{
  uword * p = hash_get (msm->catchup_index_by_file_descriptor, file_descriptor);
  return p ?
 pool_elt_at_index (msm->catchups, p[0]) : 0;
}

/* Read side of a TCP catchup connection (both client and server roles).
   Data is accumulated into c->input_vector; the complete message is only
   processed once the peer shuts down its write side (EOF). */
static clib_error_t * catchup_socket_read_ready (unix_file_t * uf, int is_server)
{
  unix_main_t * um = &unix_main;
  mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data;
  mc_main_t *mcm = &msm->mc_main;
  mc_socket_catchup_t * c = find_catchup_from_file_descriptor (msm, uf->file_descriptor);
  word l, n, is_eof;

  l = vec_len (c->input_vector);
  vec_resize (c->input_vector, 4096);
  n = read (uf->file_descriptor, c->input_vector + l, vec_len (c->input_vector) - l);
  is_eof = n == 0;

  if (n < 0)
    {
      if (errno == EAGAIN)
        n = 0;              /* spurious wakeup; keep the connection */
      else
        {
          catchup_cleanup (msm, c, um, uf);
          return clib_error_return_unix (0, "read");
        }
    }

  /* Trim the vector back to the bytes actually received. */
  _vec_len (c->input_vector) = l + n;

  if (is_eof && vec_len (c->input_vector) > 0)
    {
      if (is_server)
        {
          mc_msg_catchup_request_handler (mcm, (void *) c->input_vector, c - msm->catchups);
          _vec_len (c->input_vector) = 0;
        }
      else
        {
          mc_msg_catchup_reply_handler (mcm, (void *) c->input_vector, c - msm->catchups);
          c->input_vector = 0; /* reply handler is responsible for freeing vector */
          catchup_cleanup (msm, c, um, uf);
        }
    }

  return 0 /* no error */;
}

static clib_error_t * catchup_server_read_ready (unix_file_t * uf)
{ return catchup_socket_read_ready (uf, /* is_server */ 1); }

static clib_error_t * catchup_client_read_ready (unix_file_t * uf)
{
  if (MC_EVENT_LOGGING)
    {
      mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data;
      vlib_main_t * vm = msm->mc_main.vlib_main;

      ELOG_TYPE (e, "catchup_client_read_ready");
      ELOG (&vm->elog_main, e, 0);
    }
  return catchup_socket_read_ready (uf, /* is_server */ 0);
}

/* Write side of a TCP catchup connection.  Finishes a non-blocking connect
   if one is pending, then drains c->output_vector in MTU-sized chunks. */
static clib_error_t *
catchup_socket_write_ready (unix_file_t * uf, int is_server)
{
  unix_main_t * um = &unix_main;
  mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data;
  mc_socket_catchup_t *c = find_catchup_from_file_descriptor (msm, uf->file_descriptor);
  clib_error_t * error = 0;
  int n;

  if (c->connect_in_progress)
    {
      u32 len, value;

      /* Writability after a non-blocking connect() means the connect
         finished; SO_ERROR tells us whether it succeeded. */
      c->connect_in_progress = 0;
      len = sizeof (value);
      if (getsockopt (c->socket, SOL_SOCKET,
                      SO_ERROR, &value, &len) < 0)
        {
          error = clib_error_return_unix (0, "getsockopt SO_ERROR");
          goto error_quit;
        }
      if (value != 0)
        {
          error = clib_error_return_code (0, value, CLIB_ERROR_ERRNO_VALID, "connect fails");
          goto error_quit;
        }
    }

  while (1)
    {
      u32 n_this_write;

      /* Cap each write below the MTU, leaving headroom for headers. */
      n_this_write =
        clib_min (vec_len (c->output_vector) - c->output_vector_n_written,
                  msm->rx_mtu_n_bytes - 64 /* ip + tcp + option allowance */);

      if (n_this_write <= 0)
        break;

      do {
        n = write (uf->file_descriptor,
                   c->output_vector + c->output_vector_n_written,
                   n_this_write);
      } while (n < 0 && errno == EAGAIN);

      if (n < 0)
        {
          error = clib_error_return_unix (0, "write");
          goto error_quit;
        }
      c->output_vector_n_written += n;
    }

  if (c->output_vector_n_written >= vec_len (c->output_vector))
    {
      if (! is_server)
        {
          /* Client: everything sent; stop polling for writability. */
          uf->flags &= ~UNIX_FILE_DATA_AVAILABLE_TO_WRITE;
          unix_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY);
          /* Send EOF to other side. */
          shutdown (uf->file_descriptor, SHUT_WR);
          return error;
        }
      else
        {
        error_quit:
          /* Server done (or either side failed): tear the connection down. */
          catchup_cleanup (msm, c, um, uf);
        }
    }
  return error;
}

static clib_error_t *
catchup_server_write_ready (unix_file_t * uf)
{ return catchup_socket_write_ready (uf, /* is_server */ 1); }

static clib_error_t *
catchup_client_write_ready (unix_file_t * uf)
{ return catchup_socket_write_ready (uf, /* is_server */ 0); }

/* epoll error callback: any socket error kills the catchup connection. */
static clib_error_t *catchup_socket_error_ready (unix_file_t *uf)
{
  unix_main_t *um = &unix_main;
  mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data;
  mc_socket_catchup_t *c = find_catchup_from_file_descriptor (msm, uf->file_descriptor);
  catchup_cleanup (msm, c, um, uf);
  return clib_error_return (0, "error");
}

/* Accept callback for the TCP catchup listen socket: allocate a catchup
   pool element and register the new connection with epoll. */
static clib_error_t *catchup_listen_read_ready (unix_file_t * uf)
{
  mc_socket_main_t *msm = (mc_socket_main_t *)uf->private_data;
  struct sockaddr_in client_addr;
  int client_len;
  mc_socket_catchup_t *c;
  unix_file_t template = {0};

  pool_get (msm->catchups, c);
  memset(c, 0, sizeof (c[0]));

  client_len = sizeof(client_addr);

  /* Acquires the non-blocking attrib from the server socket.
 */
  c->socket = accept (uf->file_descriptor,
                      (struct sockaddr *)&client_addr,
                      (socklen_t *)&client_len);

  if (c->socket < 0)
    {
      /* Give back the just-allocated pool slot on failure. */
      pool_put (msm->catchups, c);
      return clib_error_return_unix (0, "accept");
    }

  if (MC_EVENT_LOGGING)
    {
      mc_main_t * mcm = &msm->mc_main;
      vlib_main_t * vm = mcm->vlib_main;

      ELOG_TYPE_DECLARE (e) = {
        .format = "catchup accepted from 0x%lx",
        .format_args = "i4",
      };
      struct { u32 addr; } * ed = 0;

      ed = ELOG_DATA (&vm->elog_main, e);
      ed->addr = ntohl(client_addr.sin_addr.s_addr);
    }

  /* Disable the Nagle algorithm, ship catchup pkts immediately */
  {
    int one = 1;
    if ((setsockopt(c->socket, IPPROTO_TCP,
                    TCP_NODELAY, (void *)&one, sizeof(one))) < 0) {
      clib_unix_warning("catchup socket: set TCP_NODELAY");
    }
  }

  template.read_function = catchup_server_read_ready;
  template.write_function = catchup_server_write_ready;
  template.error_function = catchup_socket_error_ready;
  template.file_descriptor = c->socket;
  template.private_data = pointer_to_uword (msm);
  c->unix_file_index = unix_file_add (&unix_main, &template);
  hash_set (msm->catchup_index_by_file_descriptor, c->socket, c - msm->catchups);

  return 0;
}

/* Return and bind to an unused port. */
/* Probes ports upward from 'port'; returns the bound port or -1 if every
   port up to 65535 is taken. */
static word find_and_bind_to_free_port (word sock, word port)
{
  for (; port < 1 << 16; port++)
    {
      struct sockaddr_in a;

      memset (&a, 0, sizeof(a)); /* Warnings be gone */

      a.sin_family = PF_INET;
      a.sin_addr.s_addr = INADDR_ANY;
      a.sin_port = htons (port);

      if (bind (sock, (struct sockaddr *) &a, sizeof (a)) >= 0)
        break;
    }

  return port < 1 << 16 ? port : -1;
}

/* Create, bind and configure one multicast UDP socket (TTL, SO_REUSEADDR,
   group membership, non-blocking).  NOTE(review): "mutlicast" is a typo in
   the function name; left as-is since all callers use this spelling. */
static clib_error_t *
setup_mutlicast_socket (mc_socket_main_t * msm,
                        mc_multicast_socket_t * ms,
                        char * type,
                        uword udp_port)
{
  int one = 1;
  struct ip_mreq mcast_req;

  if (!
 msm->base_multicast_udp_port_host_byte_order)
    /* No port configured: pick a default range just below 0xffff, wide
       enough for MC_N_TRANSPORT_TYPE multicast ports + ack + catchup. */
    msm->base_multicast_udp_port_host_byte_order =
      0xffff - ((MC_N_TRANSPORT_TYPE + 2 /* ack socket, catchup socket */)
                - 1);

  port = msm->base_multicast_udp_port_host_byte_order;

  error = setup_mutlicast_socket (msm,
                                  &msm->multicast_sockets[MC_TRANSPORT_MASTERSHIP],
                                  "mastership",
                                  port++);
  if (error)
    return error;

  error = setup_mutlicast_socket (msm,
                                  &msm->multicast_sockets[MC_TRANSPORT_JOIN],
                                  "join",
                                  port++);
  if (error)
    return error;

  error = setup_mutlicast_socket (msm,
                                  &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_TO_RELAY],
                                  "to relay",
                                  port++);
  if (error)
    return error;

  error = setup_mutlicast_socket (msm,
                                  &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY],
                                  "from relay",
                                  port++);
  if (error)
    return error;

  /* ACK rx socket */
  msm->ack_socket = socket (PF_INET, SOCK_DGRAM, IPPROTO_UDP);
  if (msm->ack_socket < 0)
    return clib_error_return_unix(0, "ack socket");

  /* NOTE(review): find_and_bind_to_free_port can return -1 (no free port);
     that result is not checked here or below for the catchup socket. */
  msm->ack_udp_port = find_and_bind_to_free_port (msm->ack_socket, port++);

  if (ioctl (msm->ack_socket, FIONBIO, &one) < 0)
    return clib_error_return_unix (0, "ack socket FIONBIO");

  msm->catchup_server_socket = socket(AF_INET, SOCK_STREAM, 0);
  if (msm->catchup_server_socket < 0)
    return clib_error_return_unix (0, "catchup server socket");

  msm->catchup_tcp_port = find_and_bind_to_free_port (msm->catchup_server_socket, port++);

  if (ioctl (msm->catchup_server_socket, FIONBIO, &one) < 0)
    return clib_error_return_unix (0, "catchup server socket FIONBIO");

  if (listen(msm->catchup_server_socket, 5) < 0)
    return clib_error_return_unix (0, "catchup server socket listen");

  /* epoll setup for multicast mastership socket */
  {
    unix_file_t template = {0};

    template.read_function = mastership_socket_read_ready;
    template.file_descriptor = msm->multicast_sockets[MC_TRANSPORT_MASTERSHIP].socket;
    template.private_data = (uword) msm;
    unix_file_add (&unix_main, &template);

    /* epoll setup for multicast to_relay socket */
    template.read_function = to_relay_socket_read_ready;
    template.file_descriptor = msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_TO_RELAY].socket;
    template.private_data = (uword) msm;
    unix_file_add (&unix_main, &template);

    /* epoll setup for multicast from_relay socket */
    template.read_function = from_relay_socket_read_ready;
    template.file_descriptor = msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY].socket;
    template.private_data = (uword) msm;
    unix_file_add (&unix_main, &template);

    template.read_function = join_socket_read_ready;
    template.file_descriptor = msm->multicast_sockets[MC_TRANSPORT_JOIN].socket;
    template.private_data = (uword) msm;
    unix_file_add (&unix_main, &template);

    /* epoll setup for ack rx socket */
    template.read_function = ack_socket_read_ready;
    template.file_descriptor = msm->ack_socket;
    template.private_data = (uword) msm;
    unix_file_add (&unix_main, &template);

    /* epoll setup for TCP catchup server */
    template.read_function = catchup_listen_read_ready;
    template.file_descriptor = msm->catchup_server_socket;
    template.private_data = (uword) msm;
    unix_file_add (&unix_main, &template);
  }

  return 0;
}

/* Queue n_bytes of pending catchup output (or adopt set_output_vector
   wholesale) and arm the epoll write notification if needed.  Returns a
   pointer into the output vector for the caller to fill, or 0 when
   set_output_vector was supplied. */
static void *
catchup_add_pending_output (mc_socket_catchup_t * c, uword n_bytes, u8 * set_output_vector)
{
  unix_file_t * uf = pool_elt_at_index (unix_main.file_pool,
                                        c->unix_file_index);
  u8 * result=0;

  if (set_output_vector)
    c->output_vector = set_output_vector;
  else
    vec_add2 (c->output_vector, result, n_bytes);
  if (vec_len (c->output_vector) > 0)
    {
      /* Only poke epoll when the writable flag actually changes. */
      int skip_update = 0 != (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE);
      uf->flags |= UNIX_FILE_DATA_AVAILABLE_TO_WRITE;
      if (!
 skip_update)
        unix_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY);
    }
  return result;
}

/* Transport hook: open a non-blocking TCP connection to the catchup peer
   and queue a catchup request.  Returns the catchup pool index.
   NOTE(review): error paths return 0, which is also a valid pool index;
   they also leak the freshly allocated pool element and (after socket()
   succeeds) the socket fd — confirm whether callers can distinguish. */
static uword catchup_request_fun (void *transport_main,
                                  u32 stream_index,
                                  mc_peer_id_t catchup_peer_id)
{
  mc_socket_main_t *msm = (mc_socket_main_t *)transport_main;
  mc_main_t * mcm = &msm->mc_main;
  vlib_main_t * vm = mcm->vlib_main;
  mc_socket_catchup_t *c;
  struct sockaddr_in addr;
  unix_main_t *um = &unix_main;
  int one = 1;

  pool_get (msm->catchups, c);
  memset (c, 0, sizeof (*c));

  c->socket = socket(AF_INET, SOCK_STREAM, 0);
  if (c->socket < 0)
    {
      clib_unix_warning ("socket");
      return 0;
    }

  if (ioctl (c->socket, FIONBIO, &one) < 0)
    {
      clib_unix_warning ("FIONBIO");
      return 0;
    }

  memset(&addr, 0, sizeof(addr));
  addr.sin_family = AF_INET;
  addr.sin_addr.s_addr = mc_socket_peer_id_get_address (catchup_peer_id);
  addr.sin_port = mc_socket_peer_id_get_port (catchup_peer_id);

  c->connect_in_progress = 1;

  if (MC_EVENT_LOGGING)
    {
      ELOG_TYPE_DECLARE (e) = {
        .format = "connecting to peer 0x%Lx",
        .format_args = "i8",
      };
      struct { u64 peer; } * ed;
      ed = ELOG_DATA (&vm->elog_main, e);
      ed->peer = catchup_peer_id.as_u64;
    }

  /* Non-blocking connect: EINPROGRESS is the expected "in flight" result;
     completion is detected in catchup_socket_write_ready via SO_ERROR. */
  if (connect(c->socket, (const void *)&addr,sizeof(addr))
      < 0 && errno != EINPROGRESS)
    {
      clib_unix_warning ("connect to %U fails",
                         format_socket_peer_id, catchup_peer_id);
      return 0;
    }

  {
    unix_file_t template = {0};

    template.read_function = catchup_client_read_ready;
    template.write_function = catchup_client_write_ready;
    template.error_function = catchup_socket_error_ready;
    template.file_descriptor = c->socket;
    template.private_data = (uword) msm;
    c->unix_file_index = unix_file_add (um, &template);

    hash_set (msm->catchup_index_by_file_descriptor, c->socket, c - msm->catchups);
  }

  {
    mc_msg_catchup_request_t * mp;
    mp = catchup_add_pending_output (c, sizeof (mp[0]), /* set_output_vector */ 0);
    mp->peer_id = msm->mc_main.transport.our_catchup_peer_id;
    mp->stream_index = stream_index;
    mc_byte_swap_msg_catchup_request (mp);
  }

  return c - msm->catchups;
}

/* Transport hook: hand a complete catchup reply vector to the connection
   identified by 'opaque' (a catchup pool index). */
static void catchup_send_fun (void *transport_main, uword opaque, u8 * data)
{
  mc_socket_main_t *msm = (mc_socket_main_t *)transport_main;
  mc_socket_catchup_t *c = pool_elt_at_index (msm->catchups, opaque);
  catchup_add_pending_output (c, 0, data);
}

/* Look up an interface's IPv4 address and usable UDP payload MTU via ioctl.
   Returns 0 on success, -1 on failure. */
static int
find_interface_ip4_address (char * if_name, u32 * ip4_address, u32 * mtu)
{
  int fd;
  struct ifreq ifr;
  struct sockaddr_in * sa;

  /* Dig up our IP address */
  /* NOTE(review): the second argument of socket() is the socket type;
     passing AF_INET here is suspicious — confirm SOCK_DGRAM was intended. */
  fd = socket (PF_INET, AF_INET, 0);
  if (fd < 0) {
    clib_unix_error ("socket");
    return -1;
  }

  ifr.ifr_addr.sa_family = AF_INET;
  strncpy (ifr.ifr_name, if_name, sizeof(ifr.ifr_name)-1);
  /* NOTE(review): fd is leaked on this and the SIOCGIFMTU error return. */
  if (ioctl (fd, SIOCGIFADDR, &ifr) < 0) {
    clib_unix_error ("ioctl(SIOCFIGADDR)");
    return -1;
  }

  sa = (void *) &ifr.ifr_addr;
  memcpy (ip4_address, &sa->sin_addr.s_addr, sizeof (ip4_address[0]));

  if (ioctl (fd, SIOCGIFMTU, &ifr) < 0)
    return -1;
  if (mtu)
    *mtu = ifr.ifr_mtu - (/* IP4 header */ 20 + /* UDP header */ 8);

  close (fd);

  return 0;
}

/* Initialize the socket-based MC transport: find a usable interface/MTU,
   create all sockets, and register the transport hooks with mc_main. */
clib_error_t *
mc_socket_main_init (mc_socket_main_t * msm, char **intfc_probe_list,
                     int n_intfcs_to_probe)
{
  clib_error_t * error;
  mc_main_t * mcm;
  u32 mtu;

  mcm = &msm->mc_main;

  /* 239.255.0.7 */
  if (! msm->multicast_tx_ip4_address_host_byte_order)
    msm->multicast_tx_ip4_address_host_byte_order = 0xefff0007;

  {
    u32 i, a, win;

    win = 0;
    if (msm->multicast_interface_name)
      {
        win = ! find_interface_ip4_address (msm->multicast_interface_name, &a, &mtu);
      }
    else
      {
        /* No interface configured: take the first probe-list entry that
           has an IPv4 address. */
        for (i = 0; i < n_intfcs_to_probe; i++)
          if (! find_interface_ip4_address (intfc_probe_list[i], &a, &mtu))
            {
              win = 1;
              msm->multicast_interface_name = intfc_probe_list[i];
              break;
            }
      }

    if (! win)
      return clib_error_return (0, "can't find interface ip4 address");

    msm->if_ip4_address_net_byte_order = a;
  }

  msm->rx_mtu_n_bytes = mtu;
  /* Round up: number of VLIB buffers needed to hold one MTU. */
  msm->rx_mtu_n_buffers = msm->rx_mtu_n_bytes / VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES;
  msm->rx_mtu_n_buffers += (msm->rx_mtu_n_bytes % VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES) != 0;

  error = socket_setup (msm);
  if (error)
    return error;

  mcm->transport.our_ack_peer_id =
    mc_socket_set_peer_id (msm->if_ip4_address_net_byte_order, msm->ack_udp_port);

  mcm->transport.our_catchup_peer_id =
    mc_socket_set_peer_id (msm->if_ip4_address_net_byte_order, msm->catchup_tcp_port);

  mcm->transport.tx_buffer = tx_buffer;
  mcm->transport.tx_ack = tx_ack;
  mcm->transport.catchup_request_fun = catchup_request_fun;
  mcm->transport.catchup_send_fun = catchup_send_fun;
  mcm->transport.format_peer_id = format_socket_peer_id;
  mcm->transport.opaque = msm;
  mcm->transport.max_packet_size = mtu;

  mc_main_init (mcm, "socket");

  return error;
}
diff --git a/vlib/vlib/unix/mc_socket.h b/vlib/vlib/unix/mc_socket.h
new file mode 100644
index 00000000000..7dd6b5e27b1
--- /dev/null
+++ b/vlib/vlib/unix/mc_socket.h
@@ -0,0 +1,126 @@
/*
 * mc_socket.h: socket based multicast for vlib mc
 *
 * Copyright (c) 2010 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __included_mc_socket_h__
#define __included_mc_socket_h__

#include <vlib/unix/unix.h>
#include <netinet/in.h>

/* One multicast UDP socket plus the group address/port it transmits to. */
typedef struct {
  int socket;
  struct sockaddr_in tx_addr;
} mc_multicast_socket_t;

/* TCP catchup socket */
typedef struct {
  int socket;
  u32 unix_file_index;       /* index into unix_main.file_pool */

  u8 * input_vector;         /* bytes read so far (processed at EOF) */
  u8 * output_vector;        /* bytes queued for transmission */
  u32 output_vector_n_written;

  u32 connect_in_progress;   /* non-blocking connect() not yet confirmed */
} mc_socket_catchup_t;

typedef struct mc_socket_main_t {
  mc_main_t mc_main;

  /* Multicast mastership/to-relay/from-relay sockets. */
  mc_multicast_socket_t multicast_sockets[MC_N_TRANSPORT_TYPE];

  /* Unicast UDP ack sockets */
  int ack_socket;

  /* TCP catchup server socket */
  int catchup_server_socket;

  /* Pool of stream-private catchup sockets */
  mc_socket_catchup_t *catchups;

  uword * catchup_index_by_file_descriptor;

  u32 rx_mtu_n_bytes;

  /* Receive MTU in bytes and VLIB buffers. */
  u32 rx_mtu_n_buffers;

  /* Vector of RX VLIB buffers. */
  u32 * rx_buffers;
  /* Vector of scatter/gather descriptors for sending/receiving VLIB buffers
     via kernel. */
  struct iovec * iovecs;

  /* IP address of interface to use for multicast. */
  u32 if_ip4_address_net_byte_order;

  u32 ack_udp_port;
  u32 catchup_tcp_port;

  /* Interface on which to listen for multicasts. */
  char * multicast_interface_name;

  /* Multicast address to use (e.g. 0xefff0000).
     Host byte order. */
  u32 multicast_tx_ip4_address_host_byte_order;

  /* TTL to use for multicasts. */
  u32 multicast_ttl;

  /* Multicast ports for mastership, joins, etc. will be chosen
     starting at the given port in host byte order.
     A total of MC_N_TRANSPORT_TYPE ports will be used. */
  u32 base_multicast_udp_port_host_byte_order;
} mc_socket_main_t;

/* Extract the IPv4 address packed big-endian in peer-id bytes 0-3;
   the returned u32 is in network byte order. */
always_inline u32
mc_socket_peer_id_get_address (mc_peer_id_t i)
{
  u32 a = ((i.as_u8[0] << 24)
           | (i.as_u8[1] << 16)
           | (i.as_u8[2] << 8)
           | (i.as_u8[3] << 0));
  return clib_host_to_net_u32 (a);
}

/* Extract the UDP/TCP port packed big-endian in peer-id bytes 4-5;
   returned in network byte order (suitable for sockaddr_in.sin_port). */
always_inline u32
mc_socket_peer_id_get_port (mc_peer_id_t i)
{ return clib_host_to_net_u16 ((i.as_u8[4] << 8) | i.as_u8[5]); }

/* Pack a (network-order address, host-order port) pair into a peer id,
   big-endian so peer ids compare consistently across hosts. */
static_always_inline mc_peer_id_t
mc_socket_set_peer_id (u32 address_net_byte_order, u32 port_host_byte_order)
{
  mc_peer_id_t i;
  u32 a = ntohl (address_net_byte_order);
  u32 p = port_host_byte_order;
  i.as_u8[0] = (a >> 24) & 0xff;
  i.as_u8[1] = (a >> 16) & 0xff;
  i.as_u8[2] = (a >> 8) & 0xff;
  i.as_u8[3] = (a >> 0) & 0xff;
  i.as_u8[4] = (p >> 8) & 0xff;
  i.as_u8[5] = (p >> 0) & 0xff;
  i.as_u8[6] = 0;
  i.as_u8[7] = 0;
  return i;
}

clib_error_t *
mc_socket_main_init (mc_socket_main_t * msm, char **intfc_probe_list,
                     int n_intfcs_to_probe);
#endif /* __included_mc_socket_h__ */

diff --git a/vlib/vlib/unix/pci.c b/vlib/vlib/unix/pci.c
new file mode 100644
index 00000000000..02c37f72707
--- /dev/null
+++ b/vlib/vlib/unix/pci.c
@@ -0,0 +1,577 @@
/*
 * Copyright (c) 2015 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * pci.c: Linux user space PCI bus management.
+ * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> +#include <vlib/pci/pci.h> +#include <vlib/unix/unix.h> +#include <vlib/unix/pci.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <dirent.h> + +linux_pci_main_t linux_pci_main; + +static clib_error_t * +foreach_directory_file (char * dir_name, + clib_error_t * (* f) (void * arg, u8 * path_name, u8 * file_name), + void * arg, + int scan_dirs) +{ + DIR * d; + struct dirent * e; + clib_error_t * error = 0; + u8 * s, * t; + + d = opendir (dir_name); + if (! d) + { + /* System has no PCI bus. */ + if (errno == ENOENT) + return 0; + return clib_error_return_unix (0, "open `%s'", dir_name); + } + + s = t = 0; + while (1) + { + e = readdir (d); + if (! e) + break; + if (scan_dirs) + { + if (e->d_type == DT_DIR + && (! strcmp (e->d_name, ".") + || ! 
strcmp (e->d_name, ".."))) + continue; + } + else + { + if (e->d_type == DT_DIR) + continue; + } + + s = format (s, "%s/%s", dir_name, e->d_name); + t = format (t, "%s", e->d_name); + error = f (arg, s, t); + _vec_len (s) = 0; + _vec_len (t) = 0; + + if (error) + break; + } + + vec_free (s); + closedir (d); + + return error; +} + +static clib_error_t * +write_sys_fs (char * file_name, char * fmt, ...) +{ + u8 * s; + int fd; + + fd = open (file_name, O_WRONLY); + if (fd < 0) + return clib_error_return_unix (0, "open `%s'", file_name); + + va_list va; + va_start (va, fmt); + s = va_format (0, fmt, &va); + va_end (va); + + if (write (fd, s, vec_len (s)) < 0) + return clib_error_return_unix (0, "write `%s'", file_name); + + vec_free (s); + close (fd); + return 0; +} + +static clib_error_t * +scan_uio_dir (void * arg, u8 * path_name, u8 * file_name) +{ + linux_pci_device_t * l = arg; + unformat_input_t input; + + unformat_init_string (&input, (char *) file_name, vec_len (file_name)); + + if (! unformat (&input, "uio%d", &l->uio_minor)) + abort (); + + unformat_free (&input); + return 0; +} + +static clib_error_t * linux_pci_uio_read_ready (unix_file_t * uf) +{ + linux_pci_main_t * pm = &linux_pci_main; + vlib_main_t * vm = pm->vlib_main; + linux_pci_device_t * l; + u32 li = uf->private_data; + + l = pool_elt_at_index (pm->pci_devices, li); + vlib_node_set_interrupt_pending (vm, l->device_input_node_index); + + /* Let node know which device is interrupting. 
 */
  {
    vlib_node_runtime_t * rt = vlib_node_get_runtime (vm, l->device_input_node_index);
    rt->runtime_data[0] |= 1 << l->device_index;
  }

  return /* no error */ 0;
}

/* epoll error callback for a UIO fd: surface the error to the caller. */
static clib_error_t *linux_pci_uio_error_ready (unix_file_t *uf)
{
  u32 error_index = (u32) uf->private_data;

  return clib_error_return (0, "pci device %d: error", error_index);
}

/* Size of a PCI BAR, read from the sysfs resource file's stat size;
   0 when the resource does not exist. */
static uword pci_resource_size (uword os_handle, uword resource)
{
  linux_pci_main_t * pm = &linux_pci_main;
  linux_pci_device_t * p;
  u8 * file_name;
  struct stat b;
  uword result = 0;

  p = pool_elt_at_index (pm->pci_devices, os_handle);

  file_name = format (0, "%v/resource%d%c", p->dev_dir_name, resource, 0);
  if (stat ((char *) file_name, &b) >= 0)
    result = b.st_size;
  vec_free (file_name);
  return result;
}

/* Tell the uio_pci_dma kernel driver to write reg_value to reg_offset in
   BAR 0 when disabling interrupts for this device. */
void os_add_pci_disable_interrupts_reg (uword os_handle, u32 resource,
                                        u32 reg_offset, u32 reg_value)
{
  linux_pci_main_t * pm = &linux_pci_main;
  linux_pci_device_t * l;
  char * file_name;
  clib_error_t * error;

  l = pool_elt_at_index (pm->pci_devices, os_handle);
  ASSERT (resource == 0);
  ASSERT (reg_offset < pci_resource_size (os_handle, resource));
  file_name = (char *) format (0, "%s/disable_interrupt_regs%c", l->dev_dir_name, 0);
  error = write_sys_fs (file_name, "%x %x", reg_offset, reg_value);
  if (error)
    clib_error_report (error);
  vec_free (file_name);
}

/* Register a scanned PCI device: copy the descriptor into the device pool,
   bind it to the uio_pci_dma driver, open its /dev/uioN fd and hook that fd
   into epoll for interrupt delivery. */
static void add_device (pci_device_t * dev, linux_pci_device_t * pdev)
{
  linux_pci_main_t * pm = &linux_pci_main;
  linux_pci_device_t * l;
  pci_config_header_t * c;
  u32 x[4];
  clib_error_t * error;

  c = &dev->config0.header;

  pool_get (pm->pci_devices, l);
  l[0] = pdev[0];

  /* Pool element must own its own copy of the directory name. */
  l->dev_dir_name = vec_dup (l->dev_dir_name);

  /* Parse bus, dev, function from directory name. */
  {
    unformat_input_t input;

    unformat_init_string (&input, (char *) l->dev_dir_name,
                          vec_len (l->dev_dir_name));

    if (! unformat (&input, "/sys/bus/pci/devices/%x:%x:%x.%x",
                    &x[0], &x[1], &x[2], &x[3]))
      abort ();

    unformat_free (&input);

    l->bus_address.bus = x[1];
    l->bus_address.slot_function = (x[2] << 3) | x[3];
    dev->bus_address = l->bus_address;
  }

  dev->os_handle = l - pm->pci_devices;

  error = write_sys_fs ("/sys/bus/pci/drivers/uio_pci_dma/new_id",
                        "%x %x", c->vendor_id, c->device_id);
  if (error)
    clib_error_report (error);
  error = write_sys_fs ("/sys/bus/pci/drivers/uio_pci_dma/bind",
                        "%04x:%02x:%02x.%x", x[0], x[1], x[2], x[3]);
  /* Errors happen when re-binding so just ignore them. */
  if (error)
    clib_error_free (error);

  {
    u8 * uio_dir = format (0, "%s/uio", l->dev_dir_name);
    foreach_directory_file ((char *) uio_dir, scan_uio_dir, l, /* scan_dirs */ 1);
    vec_free (uio_dir);
  }

  {
    char * uio_name = (char *) format (0, "/dev/uio%d%c", l->uio_minor, 0);
    l->uio_fd = open (uio_name, O_RDWR);
    if (l->uio_fd < 0)
      clib_unix_error ("open `%s'", uio_name);
    vec_free (uio_name);
  }

  {
    unix_file_t template = {0};
    unix_main_t * um = &unix_main;

    template.read_function = linux_pci_uio_read_ready;
    template.file_descriptor = l->uio_fd;
    template.error_function = linux_pci_uio_error_ready;
    template.private_data = l - pm->pci_devices;

    /* To be filled in by driver. */
    l->device_input_node_index = ~0;
    l->device_index = 0;

    l->unix_file_index = unix_file_add (um, &template);
  }
}

/* Release all fds and vectors owned by one linux_pci_device_t. */
static void linux_pci_device_free (linux_pci_device_t * l)
{
  int i;
  for (i = 0; i < vec_len (l->resource_fds); i++)
    if (l->resource_fds[i] > 0)
      close (l->resource_fds[i]);
  if (l->config_fd > 0)
    close (l->config_fd);
  if (l->uio_fd > 0)
    close (l->uio_fd);
  vec_free (l->resource_fds);
  vec_free (l->dev_dir_name);
}

/* Configuration space read/write.
 */
/* Read or write n_bytes of PCI config space at 'address' through the sysfs
   config fd opened at scan time. */
clib_error_t *
os_read_write_pci_config (uword os_handle,
                          vlib_read_or_write_t read_or_write,
                          uword address,
                          void * data,
                          u32 n_bytes)
{
  linux_pci_main_t * pm = &linux_pci_main;
  linux_pci_device_t * p;
  int n;

  p = pool_elt_at_index (pm->pci_devices, os_handle);

  if (address != lseek (p->config_fd, address, SEEK_SET))
    return clib_error_return_unix (0, "seek offset %d", address);

  if (read_or_write == VLIB_READ)
    n = read (p->config_fd, data, n_bytes);
  else
    n = write (p->config_fd, data, n_bytes);

  /* Short read/write is treated as failure. */
  if (n != n_bytes)
    return clib_error_return_unix (0, "%s",
                                   read_or_write == VLIB_READ
                                   ? "read" : "write");

  return 0;
}

/* mmap a PCI BAR (sysfs resourceN file) into this process.  When addr is
   non-null, MAP_FIXED is used to pin the mapping there.  The fd stays open
   for the life of the mapping and is recorded in p->resource_fds.
   NOTE(review): if mmap fails, the fd was already stored in resource_fds
   and is then closed here, leaving a stale descriptor in the vector —
   confirm against linux_pci_device_free's close loop. */
static clib_error_t *
os_map_pci_resource_internal (uword os_handle,
                              u32 resource,
                              u8 *addr,
                              void ** result)
{
  linux_pci_main_t * pm = &linux_pci_main;
  linux_pci_device_t * p;
  struct stat stat_buf;
  u8 * file_name;
  int fd;
  clib_error_t * error;
  int flags = MAP_SHARED;

  error = 0;
  p = pool_elt_at_index (pm->pci_devices, os_handle);

  file_name = format (0, "%v/resource%d%c", p->dev_dir_name, resource, 0);
  fd = open ((char *) file_name, O_RDWR);
  if (fd < 0)
    {
      error = clib_error_return_unix (0, "open `%s'", file_name);
      goto done;
    }

  if (fstat (fd, &stat_buf) < 0)
    {
      error = clib_error_return_unix (0, "fstat `%s'", file_name);
      goto done;
    }

  vec_validate (p->resource_fds, resource);
  p->resource_fds[resource] = fd;
  if (addr != 0)
    flags |= MAP_FIXED;

  *result = mmap (addr,
                  /* size */ stat_buf.st_size,
                  PROT_READ | PROT_WRITE,
                  flags,
                  /* file */ fd,
                  /* offset */ 0);
  if (*result == (void *) -1)
    {
      error = clib_error_return_unix (0, "mmap `%s'", file_name);
      goto done;
    }

 done:
  if (error)
    {
      if (fd > 0)
        close (fd);
    }
  vec_free (file_name);
  return error;
}

/* Map a BAR at a kernel-chosen address. */
clib_error_t *
os_map_pci_resource (uword os_handle,
                     u32 resource,
                     void ** result)
{
  return (os_map_pci_resource_internal (os_handle, resource, 0 /* addr */,
                                        result));
}

/* Map a BAR at a caller-chosen fixed address. */
clib_error_t *
os_map_pci_resource_fixed (uword os_handle,
                           u32 resource,
                           u8 *addr,
                           void ** result)
{
  return (os_map_pci_resource_internal (os_handle, resource, addr, result));
}

/* Close all fds for a device and return its pool slot. */
void os_free_pci_device (uword os_handle)
{
  linux_pci_main_t * pm = &linux_pci_main;
  linux_pci_device_t * l;

  l = pool_elt_at_index (pm->pci_devices, os_handle);
  linux_pci_device_free (l);
  pool_put (pm->pci_devices, l);
}

/* Format "bus/slot/function" for a device handle. */
u8 * format_os_pci_handle (u8 * s, va_list * va)
{
  linux_pci_main_t * pm = &linux_pci_main;
  uword os_pci_handle = va_arg (*va, uword);
  linux_pci_device_t * l;

  l = pool_elt_at_index (pm->pci_devices, os_pci_handle);
  return format (s, "%x/%x/%x", l->bus_address.bus,
                 (l->bus_address.slot_function >> 3),
                 (l->bus_address.slot_function & 0x7));
}

/* Advance to the next registration in the ELF-section registration list;
   each entry's supported_devices array is terminated by vendor_id == 0. */
static inline pci_device_registration_t *
pci_device_next_registered (pci_device_registration_t * r)
{
  uword i;

  /* Null vendor id marks end of initialized list. */
  for (i = 0; r->supported_devices[i].vendor_id != 0; i++)
    ;

  return clib_elf_section_data_next (r, i * sizeof (r->supported_devices[0]));
}

/* Check (via /sys/bus/pci/drivers/<name>) whether the registration's kernel
   driver is loaded; also counts repeat sightings in kernel_driver_running. */
static inline u8 kernel_driver_installed (pci_device_registration_t *r)
{
  u8 * link_name;
  struct stat b;

  link_name = format (0, "/sys/bus/pci/drivers/%s", r->kernel_driver);
  if (stat ((char *)link_name, &b) >= 0)
    r->kernel_driver_running++;
  else
    r->kernel_driver_running=0;

  vec_free (link_name);
  return r->kernel_driver_running;
}

/* Match a scanned device against registered drivers; on the first
   vendor/device match whose kernel driver is not in the way, wire the
   device up and run the driver's init function. */
static clib_error_t *
init_device_from_registered (vlib_main_t * vm,
                             pci_device_t * dev,
                             linux_pci_device_t * pdev)
{
  unix_main_t * um = vlib_unix_get_main();
  pci_device_registration_t * r;
  pci_device_id_t * i;
  pci_config_header_t * c;

  c = &dev->config0.header;

  r = um->pci_device_registrations;

  while (r)
    {
      for (i = r->supported_devices; i->vendor_id != 0; i++)
        if (i->vendor_id == c->vendor_id && i->device_id == c->device_id)
          {
            if (r->kernel_driver &&
kernel_driver_installed(r)) + { + if (r->kernel_driver_running == 1) + { + clib_warning("PCI device type [%04x:%04x] is busy!\n" + "\tUninstall the associated linux kernel " + "driver: sudo rmmod %s", + c->vendor_id, c->device_id, r->kernel_driver); + } + continue; + } + add_device (dev, pdev); + return r->init_function (vm, dev); + } + r = r->next_registration; + } + /* No driver, close the PCI config-space FD */ + close (pdev->config_fd); + return 0; +} + +static clib_error_t * +init_device (vlib_main_t * vm, + pci_device_t * dev, + linux_pci_device_t * pdev) +{ + return init_device_from_registered (vm, dev, pdev); +} + +static clib_error_t * +scan_device (void * arg, u8 * dev_dir_name, u8 * ignored) +{ + vlib_main_t * vm = arg; + int fd; + u8 * f; + clib_error_t * error = 0; + pci_device_t dev = {0}; + linux_pci_device_t pdev = {0}; + + f = format (0, "%v/config%c", dev_dir_name, 0); + fd = open ((char *) f, O_RDWR); + + /* Try read-only access if write fails. */ + if (fd < 0) + fd = open ((char *) f, O_RDONLY); + + if (fd < 0) + { + error = clib_error_return_unix (0, "open `%s'", f); + goto done; + } + + /* You can only read more that 64 bytes of config space as root; so we try to + read the full space but fall back to just the first 64 bytes. */ + if (read (fd, &dev.config_data, sizeof (dev.config_data)) != sizeof (dev.config_data) + && read (fd, &dev.config0, sizeof (dev.config0)) != sizeof (dev.config0)) + { + error = clib_error_return_unix (0, "read `%s'", f); + goto done; + } + + { + static pci_config_header_t all_ones; + if (all_ones.vendor_id == 0) + memset (&all_ones, ~0, sizeof (all_ones)); + + if (! 
memcmp (&dev.config0.header, &all_ones, sizeof (all_ones))) + { + error = clib_error_return (0, "invalid PCI config for `%s'", f); + goto done; + } + } + + if (dev.config0.header.header_type == 0) + pci_config_type0_little_to_host (&dev.config0); + else + pci_config_type1_little_to_host (&dev.config1); + + pdev.config_fd = fd; + pdev.dev_dir_name = dev_dir_name; + + error = init_device (vm, &dev, &pdev); + + done: + vec_free (f); + return error; +} + +clib_error_t * pci_bus_init (vlib_main_t * vm) +{ + linux_pci_main_t * pm = &linux_pci_main; + clib_error_t * error; + + pm->vlib_main = vm; + + if ((error = vlib_call_init_function (vm, unix_input_init))) + return error; + + error = foreach_directory_file ("/sys/bus/pci/devices", scan_device, vm, /* scan_dirs */ 0); + + /* Complain and continue. might not be root, etc. */ + if (error) + clib_error_report (error); + + return error; +} + +VLIB_INIT_FUNCTION (pci_bus_init); diff --git a/vlib/vlib/unix/pci.h b/vlib/vlib/unix/pci.h new file mode 100644 index 00000000000..b384250eb47 --- /dev/null +++ b/vlib/vlib/unix/pci.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * unix/pci.h: Linux specific pci state + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_unix_pci_h +#define included_unix_pci_h + +#include <vlib/pci/pci.h> + +typedef struct { + /* /sys/bus/pci/devices/... directory name for this device. */ + u8 * dev_dir_name; + + /* Resource file descriptors. */ + int * resource_fds; + + /* File descriptor for config space read/write. */ + int config_fd; + + /* PCI bus address for this devices parsed from /sys/bus/pci/devices name. */ + pci_bus_address_t bus_address; + + /* File descriptor for /dev/uio%d */ + int uio_fd; + + /* Minor device for uio device. */ + u32 uio_minor; + + /* Index given by unix_file_add. */ + u32 unix_file_index; + + /* Input node to handle interrupts for this device. */ + u32 device_input_node_index; + + /* Node runtime will be a bitmap of device indices with pending interrupts. 
*/ + u32 device_index; +} linux_pci_device_t; + +/* Pool of PCI devices. */ +typedef struct { + vlib_main_t * vlib_main; + linux_pci_device_t * pci_devices; +} linux_pci_main_t; + +extern linux_pci_main_t linux_pci_main; + +always_inline linux_pci_device_t * +pci_dev_for_linux (pci_device_t * dev) +{ + linux_pci_main_t * pm = &linux_pci_main; + return pool_elt_at_index (pm->pci_devices, dev->os_handle); +} + +/* Call to allocate/initialize the pci subsystem. + This is not an init function so that users can explicitly enable + pci only when it's needed. */ +clib_error_t * pci_bus_init (vlib_main_t * vm); + +#endif /* included_unix_pci_h */ diff --git a/vlib/vlib/unix/physmem.c b/vlib/vlib/unix/physmem.c new file mode 100644 index 00000000000..83b40be6449 --- /dev/null +++ b/vlib/vlib/unix/physmem.c @@ -0,0 +1,472 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * physmem.c: Unix physical memory + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/unix/physmem.h> + +static physmem_main_t physmem_main; + +static void * +unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes, uword alignment) +{ + physmem_main_t * pm = &physmem_main; + uword lo_offset, hi_offset; + uword * to_free = 0; + +#if DPDK > 0 + clib_warning ("unsafe alloc!"); +#endif + + /* IO memory is always at least cache aligned. */ + alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES); + + while (1) + { + mheap_get_aligned (pm->heap, n_bytes, + /* align */ alignment, + /* align offset */ 0, + &lo_offset); + + /* Allocation failed? */ + if (lo_offset == ~0) + break; + + /* Make sure allocation does not span DMA physical chunk boundary. 
*/ + hi_offset = lo_offset + n_bytes - 1; + + if ((lo_offset >> vpm->log2_n_bytes_per_page) == + (hi_offset >> vpm->log2_n_bytes_per_page)) + break; + + /* Allocation would span chunk boundary, queue it to be freed as soon as + we find suitable chunk. */ + vec_add1 (to_free, lo_offset); + } + + if (to_free != 0) + { + uword i; + for (i = 0; i < vec_len (to_free); i++) + mheap_put (pm->heap, to_free[i]); + vec_free (to_free); + } + + return lo_offset != ~0 ? pm->heap + lo_offset : 0; +} + +static void unix_physmem_free (void * x) +{ + physmem_main_t * pm = &physmem_main; + + /* Return object to region's heap. */ + mheap_put (pm->heap, x - pm->heap); +} + +static void htlb_shutdown(void) +{ + physmem_main_t * pm = &physmem_main; + + if (! pm->shmid) + return; + shmctl (pm->shmid, IPC_RMID, 0); + pm->shmid = 0; +} + +/* try to use huge TLB pgs if possible */ +static int htlb_init (vlib_main_t * vm) +{ + vlib_physmem_main_t * vpm = &vm->physmem_main; + physmem_main_t * pm = &physmem_main; + u64 hugepagesize, pagesize; + u64 pfn, seek_loc; + u64 cur, physaddr, ptbits; + int fd, i; + + pm->shmid = shmget (11 /* key, my amp goes to 11 */, pm->mem_size, + IPC_CREAT | SHM_HUGETLB | SHM_R | SHM_W); + if (pm->shmid < 0) + { + clib_unix_warning ("shmget"); + return 0; + } + + pm->mem = shmat (pm->shmid, NULL, 0 /* flags */); + if (pm->mem == 0) + { + shmctl (pm->shmid, IPC_RMID, 0); + return 0; + } + + memset (pm->mem, 0, pm->mem_size); + + /* $$$ get page size info from /proc/meminfo */ + hugepagesize = 2<<20; + pagesize = 4<<10; + vpm->log2_n_bytes_per_page = min_log2 (hugepagesize); + vec_resize (vpm->page_table, pm->mem_size / hugepagesize); + + vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page); + vpm->virtual.start = pointer_to_uword (pm->mem); + vpm->virtual.size = pm->mem_size; + vpm->virtual.end = vpm->virtual.start + vpm->virtual.size; + + fd = open("/proc/self/pagemap", O_RDONLY); + + if (fd < 0) + { + (void) shmdt (pm->mem); + return 0; + } + + pm->heap = 
mheap_alloc_with_flags + (pm->mem, pm->mem_size, + /* Don't want mheap mmap/munmap with IO memory. */ + MHEAP_FLAG_DISABLE_VM); + + cur = (u64) pm->mem; + i = 0; + + while (cur < (u64) pm->mem + pm->mem_size) + { + pfn = (u64) cur / pagesize; + seek_loc = pfn * sizeof (u64); + if (lseek (fd, seek_loc, SEEK_SET) != seek_loc) + { + clib_unix_warning ("lseek to 0x%llx", seek_loc); + shmctl (pm->shmid, IPC_RMID, 0); + close(fd); + return 0; + } + if (read (fd, &ptbits, sizeof (ptbits)) != (sizeof(ptbits))) + { + clib_unix_warning ("read ptbits"); + shmctl (pm->shmid, IPC_RMID, 0); + close(fd); + return 0; + } + + /* bits 0-54 are the physical page number */ + physaddr = (ptbits & 0x7fffffffffffffULL) * pagesize; + if (CLIB_DEBUG > 1) + fformat(stderr, "pm: virtual 0x%llx physical 0x%llx\n", + cur, physaddr); + vpm->page_table[i++] = physaddr; + + cur += hugepagesize; + } + close(fd); + atexit (htlb_shutdown); + return 1; +} + +int vlib_app_physmem_init (vlib_main_t * vm, + physmem_main_t * pm, int) __attribute__ ((weak)); +int vlib_app_physmem_init (vlib_main_t * vm, physmem_main_t * pm, int x) +{ + return 0; +} + +clib_error_t * unix_physmem_init (vlib_main_t * vm, int physical_memory_required) +{ + vlib_physmem_main_t * vpm = &vm->physmem_main; + physmem_main_t * pm = &physmem_main; + clib_error_t * error = 0; + char * dev_uio_dma_file = "/dev/uio-dma"; + int using_fake_memory = 0; + + /* Avoid multiple calls. */ + if (vm->os_physmem_alloc_aligned) + return error; + + vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; + vm->os_physmem_free = unix_physmem_free; + pm->mem = MAP_FAILED; + + if (pm->mem_size == 0) + pm->mem_size = 16 << 20; + + /* OK, Mr. 
App, you tell us */ + if (vlib_app_physmem_init (vm, pm, physical_memory_required)) + return 0; + + if (physical_memory_required) + { + if (!pm->no_hugepages && htlb_init(vm)) + { + fformat(stderr, "%s: use huge pages\n", __FUNCTION__); + return 0; + } + pm->uio_dma_fd = open (dev_uio_dma_file, O_RDWR); + } + else + pm->uio_dma_fd = -1; + + if (pm->uio_dma_fd < 0) + { + if (physical_memory_required) + { + error = clib_error_return_unix (0, "open `%s'", dev_uio_dma_file); + goto done; + } + + using_fake_memory = 1; + pm->mem = mmap (0, pm->mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (pm->mem == MAP_FAILED) + { + error = clib_error_return_unix (0, "mmap"); + goto done; + } + + pm->heap = mheap_alloc (pm->mem, pm->mem_size); + + /* Identity map with a single page. */ + vpm->log2_n_bytes_per_page = min_log2 (pm->mem_size); + vec_add1 (vpm->page_table, pointer_to_uword (pm->mem)); + } + else + error = clib_error_return (0, "uio_dma deprecated"); + + if (using_fake_memory) + fformat(stderr, "%s: use fake dma pages\n", __FUNCTION__); + else + fformat(stderr, "%s: use uio dma pages\n", __FUNCTION__); + + done: + if (error) + { + if (pm->mem != MAP_FAILED) + munmap (pm->mem, pm->mem_size); + if (pm->uio_dma_fd >= 0) + { + close (pm->uio_dma_fd); + pm->uio_dma_fd = -1; + } + } + return error; +} + +static clib_error_t * +show_physmem (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ +#if DPDK > 0 + vlib_cli_output (vm, "Not supported with DPDK drivers."); +#else + physmem_main_t * pm = &physmem_main; + + if (pm->heap) + vlib_cli_output (vm, "%U", format_mheap, pm->heap, /* verbose */ 0); + else + vlib_cli_output (vm, "No physmem allocated."); +#endif + return 0; +} + +VLIB_CLI_COMMAND (show_physmem_command, static) = { + .path = "show physmem", + .short_help = "Show physical memory allocation", + .function = show_physmem, +}; + +static clib_error_t * +show_affinity (vlib_main_t * vm, + unformat_input_t * input, + 
vlib_cli_command_t * cmd) +{ + cpu_set_t set; + cpu_set_t *setp = &set; + int i, rv; + u8 *s = 0; + int first_set_bit_in_run = -1; + int last_set_bit_in_run = -1; + int output_done = 0; + + rv = sched_getaffinity (0 /* pid, 0 = this proc */, + sizeof (*setp), setp); + if (rv < 0) + { + vlib_cli_output (vm, "Couldn't get affinity mask: %s\n", + strerror(errno)); + return 0; + } + + for (i = 0; i < 64; i++) + { + if (CPU_ISSET(i, setp)) + { + if (first_set_bit_in_run == -1) + { + first_set_bit_in_run = i; + last_set_bit_in_run = i; + if (output_done) + s = format (s, ","); + s = format (s, "%d-", i); + output_done = 1; + } + else + { + if (i == (last_set_bit_in_run+1)) + last_set_bit_in_run = i; + } + } + else + { + if (first_set_bit_in_run != -1) + { + if (first_set_bit_in_run == (i-1)) + { + _vec_len (s) -= 2 + ((first_set_bit_in_run/10)); + } + s = format (s, "%d", last_set_bit_in_run); + first_set_bit_in_run = -1; + last_set_bit_in_run = -1; + } + } + } + + if (first_set_bit_in_run != -1) + s = format (s, "%d", first_set_bit_in_run); + + vlib_cli_output (vm, "Process runs on: %v", s); + return 0; +} + +VLIB_CLI_COMMAND (show_affinity_command, static) = { + .path = "show affinity", + .short_help = "Show process cpu affinity", + .function = show_affinity, +}; + +static clib_error_t * +set_affinity (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + cpu_set_t set; + cpu_set_t *setp = &set; + int i, rv; + int another_round; + u32 first, last; + + memset (setp, 0, sizeof (*setp)); + + do { + another_round = 0; + if (unformat (input, "%d-%d,", &first, &last)) + { + if (first > 64 || last > 64) + { + barf1: + vlib_cli_output (vm, "range %d-%d invalid", first, last); + return 0; + } + + for (i = first; i <= last; i++) + CPU_SET(i, setp); + another_round = 1; + } + else if (unformat (input, "%d-%d", &first, &last)) + { + if (first > 64 || last > 64) + goto barf1; + + for (i = first; i <= last; i++) + CPU_SET(i, setp); + } + else if (unformat 
(input, "%d,", &first)) + { + if (first > 64) + { + barf2: + vlib_cli_output (vm, "cpu %d invalid", first); + return 0; + } + CPU_SET(first, setp); + another_round = 1; + } + else if (unformat (input, "%d", &first)) + { + if (first > 64) + goto barf2; + + CPU_SET(first, setp); + } + } while (another_round); + + rv = sched_setaffinity (0 /* pid, 0 = this proc */, + sizeof (*setp), setp); + + if (rv < 0) + { + vlib_cli_output (vm, "Couldn't get affinity mask: %s\n", + strerror(errno)); + return 0; + } + return show_affinity (vm, input, cmd); +} + +VLIB_CLI_COMMAND (set_affinity_command, static) = { + .path = "set affinity", + .short_help = "Set process cpu affinity", + .function = set_affinity, +}; + +static clib_error_t * +vlib_physmem_configure (vlib_main_t * vm, unformat_input_t * input) +{ + physmem_main_t * pm = &physmem_main; + u32 size_in_mb; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "no-huge") || unformat (input, "no-huge-pages")) + pm->no_hugepages = 1; + + else if (unformat(input, "size-in-mb %d", &size_in_mb) || + unformat(input, "size %d", &size_in_mb)) + pm->mem_size = size_in_mb << 20; + else + return unformat_parse_error (input); + } + + unformat_free (input); + return 0; +} + +VLIB_EARLY_CONFIG_FUNCTION (vlib_physmem_configure, "physmem"); diff --git a/vlib/vlib/unix/physmem.h b/vlib/vlib/unix/physmem.h new file mode 100644 index 00000000000..a963be746d8 --- /dev/null +++ b/vlib/vlib/unix/physmem.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_physmem_h__ +#define __included_physmem_h__ + +/* Manage I/O physical memory. */ +#define _GNU_SOURCE +#include <sched.h> +#include <vppinfra/cache.h> +#include <vppinfra/error.h> +#include <vppinfra/mheap.h> +#include <vppinfra/os.h> + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> + +#include <sys/fcntl.h> /* for open */ +#include <sys/file.h> /* for flock */ +#include <sys/ioctl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/ipc.h> +#include <sys/shm.h> + +typedef struct { + /* File descriptor for /dev/uio-dma. */ + int uio_dma_fd; + + /* Virtual memory via mmaped. */ + void * mem; + + /* Size in bytes. */ + uword mem_size; + + /* Heap allocated out of virtual memory. */ + void * heap; + + /* huge TLB segment id */ + int shmid; + + /* should we try to use htlb ? */ + int no_hugepages; + +} physmem_main_t; + +#endif /* __included_physmem_h__ */ diff --git a/vlib/vlib/unix/plugin.c b/vlib/vlib/unix/plugin.c new file mode 100644 index 00000000000..3411ef340af --- /dev/null +++ b/vlib/vlib/unix/plugin.c @@ -0,0 +1,210 @@ +/* + * plugin.c: plugin handling + * + * Copyright (c) 2011 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vlib/unix/plugin.h> +#include <dlfcn.h> +#include <dirent.h> + +plugin_main_t vlib_plugin_main; + +void vlib_set_get_handoff_structure_cb (void *cb) +{ + plugin_main_t * pm = &vlib_plugin_main; + pm->handoff_structure_get_cb = cb; +} + +static void * vnet_get_handoff_structure (void) +{ + void * (*fp)(void); + + fp = vlib_plugin_main.handoff_structure_get_cb; + if (fp == 0) + return 0; + else + return (*fp)(); +} + +static int +load_one_plugin (plugin_main_t *pm, plugin_info_t *pi, int from_early_init) +{ + void *handle, *register_handle; + clib_error_t * (*fp)(vlib_main_t *, void *, int); + clib_error_t * error; + void *handoff_structure; + + handle = dlopen ((char *)pi->name, RTLD_LAZY); + + /* + * Note: this can happen if the plugin has an undefined symbol reference, + * so print a warning. Otherwise, the poor slob won't know what happened. + * Ask me how I know that... 
+ */ + if (handle == 0) + { + clib_warning ("%s", dlerror()); + return -1; + } + + pi->handle = handle; + + register_handle = dlsym (pi->handle, "vlib_plugin_register"); + if (register_handle == 0) + { + dlclose (handle); + return 0; + } + + fp = register_handle; + + handoff_structure = vnet_get_handoff_structure(); + + if (handoff_structure == 0) + error = clib_error_return (0, "handoff structure callback returned 0"); + else + error = (*fp)(pm->vlib_main, handoff_structure, from_early_init); + + if (error) + { + clib_error_report (error); + dlclose (handle); + return 1; + } + + clib_warning ("Loaded plugin: %s", pi->name); + + return 0; +} + +static u8 **split_plugin_path (plugin_main_t *pm) +{ + int i; + u8 **rv = 0; + u8 *path = pm->plugin_path; + u8 *this = 0; + + for (i = 0; i < vec_len (pm->plugin_path); i++) + { + if (path[i] != ':') + { + vec_add1(this, path[i]); + continue; + } + vec_add1(this, 0); + vec_add1 (rv, this); + this = 0; + } + if (this) + { + vec_add1 (this, 0); + vec_add1 (rv, this); + } + return rv; +} + +int vlib_load_new_plugins (plugin_main_t *pm, int from_early_init) +{ + DIR *dp; + struct dirent *entry; + struct stat statb; + uword *p; + plugin_info_t *pi; + u8 **plugin_path; + int i; + + plugin_path = split_plugin_path (pm); + + for (i = 0; i < vec_len (plugin_path); i++) + { + dp = opendir ((char *)plugin_path[i]); + + if (dp == 0) + continue; + + while ((entry = readdir (dp))) + { + u8 *plugin_name; + + if (pm->plugin_name_filter) + { + int j; + for (j = 0; j < vec_len (pm->plugin_name_filter); j++) + if (entry->d_name[j] != pm->plugin_name_filter[j]) + goto next; + } + + plugin_name = format (0, "%s/%s%c", plugin_path[i], + entry->d_name, 0); + + /* unreadable */ + if (stat ((char *)plugin_name, &statb) < 0) + { + ignore: + vec_free (plugin_name); + continue; + } + + /* a dir or other things which aren't plugins */ + if (!S_ISREG(statb.st_mode)) + goto ignore; + + p = hash_get_mem (pm->plugin_by_name_hash, plugin_name); + if (p == 
0) + { + vec_add2 (pm->plugin_info, pi, 1); + pi->name = plugin_name; + pi->file_info = statb; + + if (load_one_plugin (pm, pi, from_early_init)) + { + vec_free (plugin_name); + _vec_len (pm->plugin_info) = vec_len (pm->plugin_info) - 1; + continue; + } + memset (pi, 0, sizeof (*pi)); + hash_set_mem (pm->plugin_by_name_hash, plugin_name, + pi - pm->plugin_info); + } + next: + ; + } + closedir (dp); + vec_free (plugin_path[i]); + } + vec_free (plugin_path); + return 0; +} +char *vlib_plugin_path __attribute__((weak)); +char *vlib_plugin_path = ""; +char *vlib_plugin_name_filter __attribute__((weak)); +char *vlib_plugin_name_filter = 0; + +int vlib_plugin_early_init (vlib_main_t *vm) +{ + plugin_main_t *pm = &vlib_plugin_main; + + pm->plugin_path = format (0, "%s%c", vlib_plugin_path, 0); + + clib_warning ("plugin path %s", pm->plugin_path); + + if (vlib_plugin_name_filter) + pm->plugin_name_filter = format (0, "%s%c", vlib_plugin_name_filter, 0); + + pm->plugin_by_name_hash = hash_create_string (0, sizeof (uword)); + pm->vlib_main = vm; + + return vlib_load_new_plugins (pm, 1 /* from_early_init */); +} diff --git a/vlib/vlib/unix/plugin.h b/vlib/vlib/unix/plugin.h new file mode 100644 index 00000000000..e7d75099ed9 --- /dev/null +++ b/vlib/vlib/unix/plugin.h @@ -0,0 +1,88 @@ +/* + * plugin.h: plugin handling + * + * Copyright (c) 2011 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __included_plugin_h__ +#define __included_plugin_h__ + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +/* + * vlib plugin scheme + * + * Almost anything which can be made to work in a vlib unix + * application will also work in a vlib plugin. + * + * The elf-section magic which registers static objects + * works so long as plugins are preset when the vlib unix process + * starts. But wait: there's more... + * + * If an application calls vlib_load_new_plugins() -- possibly after + * changing vlib_plugin_main.plugin_path / vlib_plugin_main.plugin_name_filter, + * -- new plugins will be loaded. That, in turn, allows considerable + * flexibility in terms of adding feature code or fixing bugs without + * requiring the data-plane process to restart. + * + * When the plugin mechanism loads a plugin, it uses dlsym to locate + * and call the plugin's function vlib_plugin_register() if it exists. + * A plugin which expects to be loaded after the vlib application + * starts uses this callback to modify the application. If vlib_plugin_register + * returns non-zero, the plugin mechanism dlclose()'s the plugin. + * + * Applications control the plugin search path and name filter by + * declaring the variables vlib_plugin_path and vlib_plugin_name_filter. + * libvlib_unix.la supplies weak references for these symbols which + * effectively disable the scheme. In order for the elf-section magic to + * work, static plugins must be loaded at the earliest possible moment. + * + * An application can change these parameters at any time and call + * vlib_load_new_plugins(). 
+ */ + + + +typedef struct { + u8 *name; + struct stat file_info; + void *handle; +} plugin_info_t; + +typedef struct { + /* loaded plugin info */ + plugin_info_t *plugin_info; + uword *plugin_by_name_hash; + + /* path and name filter */ + u8 *plugin_path; + u8 *plugin_name_filter; + + /* handoff structure get callback */ + void *handoff_structure_get_cb; + + /* usual */ + vlib_main_t *vlib_main; +} plugin_main_t; + +plugin_main_t vlib_plugin_main; + +int vlib_plugin_early_init (vlib_main_t *vm); +int vlib_load_new_plugins (plugin_main_t *pm, int from_early_init); + +#endif /* __included_plugin_h__ */ diff --git a/vlib/vlib/unix/unix.h b/vlib/vlib/unix/unix.h new file mode 100644 index 00000000000..0802a93baa3 --- /dev/null +++ b/vlib/vlib/unix/unix.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * unix.h: Unix specific main state + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_unix_unix_h +#define included_unix_unix_h + +#include <vppinfra/socket.h> + +struct unix_file; +typedef clib_error_t * (unix_file_function_t) (struct unix_file * f); + +typedef struct unix_file { + /* Unix file descriptor from open/socket. */ + u32 file_descriptor; + + u32 flags; +#define UNIX_FILE_DATA_AVAILABLE_TO_WRITE (1 << 0) + + /* Data available for function's use. */ + uword private_data; + + /* Functions to be called when read/write data becomes ready. 
*/ + unix_file_function_t * read_function, * write_function, * error_function; +} unix_file_t; + +typedef struct { + f64 time; + clib_error_t * error; +} unix_error_history_t; + +typedef enum { + UNIX_FILE_UPDATE_ADD, + UNIX_FILE_UPDATE_MODIFY, + UNIX_FILE_UPDATE_DELETE, +} unix_file_update_type_t; + +typedef struct { + /* Back pointer to main structure. */ + vlib_main_t * vlib_main; + + u32 flags; + /* Run interactively or as daemon (background process). */ +#define UNIX_FLAG_INTERACTIVE (1 << 0) +#define UNIX_FLAG_NODAEMON (1 << 1) + + /* Pool of files to poll for input/output. */ + unix_file_t * file_pool; + + /* CLI listen socket. */ + clib_socket_t cli_listen_socket; + + void (* file_update) (unix_file_t * file, unix_file_update_type_t update_type); + + /* Circular buffer of last unix errors. */ + unix_error_history_t error_history[128]; + u32 error_history_index; + u64 n_total_errors; + + /* startup-config filename */ + u8 *startup_config_filename; + + /* unix config complete */ + volatile int unix_config_complete; + + /* CLI log file. GIGO. */ + u8 *log_filename; + int log_fd; + /* Don't put telnet connections into character mode */ + int cli_line_mode; + u32 cli_history_limit; + +} unix_main_t; + +/* Global main structure. 
*/ +extern unix_main_t unix_main; + +always_inline uword +unix_file_add (unix_main_t * um, unix_file_t * template) +{ + unix_file_t * f; + pool_get (um->file_pool, f); + f[0] = template[0]; + um->file_update (f, UNIX_FILE_UPDATE_ADD); + return f - um->file_pool; +} + +always_inline void +unix_file_del (unix_main_t * um, unix_file_t * f) +{ + um->file_update (f, UNIX_FILE_UPDATE_DELETE); + close (f->file_descriptor); + f->file_descriptor = ~0; + pool_put (um->file_pool, f); +} + +always_inline uword +unix_file_set_data_available_to_write (u32 unix_file_index, uword is_available) +{ + unix_file_t * uf = pool_elt_at_index (unix_main.file_pool, unix_file_index); + uword was_available = (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); + if ((was_available != 0) != (is_available != 0)) + { + uf->flags ^= UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + unix_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY); + } + return was_available != 0; +} + +always_inline void +unix_save_error (unix_main_t * um, clib_error_t * error) +{ + unix_error_history_t * eh = um->error_history + um->error_history_index; + clib_error_free_vector (eh->error); + eh->error = error; + eh->time = vlib_time_now (um->vlib_main); + um->n_total_errors += 1; + if (++um->error_history_index >= ARRAY_LEN (um->error_history)) + um->error_history_index = 0; +} + +/* Main function for Unix VLIB. */ +int vlib_unix_main (int argc, char * argv[]); + +/* Call to allocate/initialize physical DMA memory subsystem. + This is not an init function so that users can explicitly enable/disable + physmem when its not needed. */ +clib_error_t * unix_physmem_init (vlib_main_t * vm, + int fail_if_physical_memory_not_present); + +/* Set prompt for CLI. 
*/ +void vlib_unix_cli_set_prompt (char * prompt); + +static inline unix_main_t * vlib_unix_get_main (void) +{ + return &unix_main; +} + +/* thread stack array; vec_len = max number of threads */ +u8 **vlib_thread_stacks; + +#endif /* included_unix_unix_h */ diff --git a/vlib/vlib/vlib.h b/vlib/vlib/vlib.h new file mode 100644 index 00000000000..74101f8d297 --- /dev/null +++ b/vlib/vlib/vlib.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * vlib.h: top-level include file + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_h +#define included_vlib_h + +#include <vppinfra/clib.h> +#include <vppinfra/elf_clib.h> + +/* Generic definitions. */ +#include <vlib/defs.h> + +/* Forward declarations of structs to avoid circular dependencies. */ +struct vlib_main_t; + +/* All includes in alphabetical order. */ +#include <vlib/buffer.h> +#include <vlib/cli.h> +#include <vlib/counter.h> +#include <vlib/error.h> +#include <vlib/init.h> +#include <vlib/mc.h> +#include <vlib/node.h> +#include <vlib/physmem.h> +#include <vlib/trace.h> + +/* Main include depends on other vlib/ includes so we put it last. */ +#include <vlib/main.h> + +/* Inline/extern function declarations. */ +#include <vlib/threads.h> +#include <vlib/buffer_funcs.h> +#include <vlib/cli_funcs.h> +#include <vlib/error_funcs.h> +#include <vlib/format_funcs.h> +#include <vlib/node_funcs.h> +#include <vlib/trace_funcs.h> +#include <vlib/global_funcs.h> + +#include <vlib/buffer_node.h> + +#endif /* included_vlib_h */ |