64 files changed, 30997 insertions, 0 deletions
diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c
new file mode 100644
index 00000000..7399b618
--- /dev/null
+++ b/src/vlib/buffer.c
@@ -0,0 +1,1134 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * buffer.c: allocate/free network buffers.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file
+ *
+ * Allocate/free network buffers.
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+
+/* Externally supplied buffer callback table, 0 by default.  When it is
+   non-zero, vlib_buffer_main_init() copies it into the buffer main
+   instead of installing the built-in handlers from this file. */
+vlib_buffer_callbacks_t *vlib_buffer_callbacks = 0;
+/* Size in bytes requested for the buffer physmem region (32 MB unless
+   overridden by "memory-size-in-mb" in the "buffers" config section). */
+static u32 vlib_buffer_physmem_sz = 32 << 20;
+
+/* Walk the chain starting at b_first, cache the summed length of every
+   buffer after the first one in b_first, mark that cached value valid
+   and return the length of the whole chain. */
+uword
+vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm,
+				       vlib_buffer_t * b_first)
+{
+  vlib_buffer_t *cur;
+  uword head_bytes = b_first->current_length;
+  uword tail_bytes = 0;
+
+  for (cur = b_first; cur->flags & VLIB_BUFFER_NEXT_PRESENT;)
+    {
+      cur = vlib_get_buffer (vm, cur->next_buffer);
+      tail_bytes += cur->current_length;
+    }
+
+  b_first->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID;
+  b_first->total_length_not_including_first_buffer = tail_bytes;
+
+  return head_bytes + tail_bytes;
+}
+
+/* format() callback taking a vlib_buffer_t *.  Appends a one-line summary
+   of the buffer, then one indented line per buffer chained behind it. */
+u8 *
+format_vlib_buffer (u8 * s, va_list * args)
+{
+  vlib_buffer_t *buf = va_arg (*args, vlib_buffer_t *);
+  uword indent = format_get_indent (s);
+
+  s = format (s, "current data %d, length %d, free-list %d, clone-count %u",
+              buf->current_data, buf->current_length,
+              vlib_buffer_get_free_list_index (buf), buf->n_add_refs);
+
+  /* Optional fields, printed only when the matching flag is set. */
+  if (buf->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID)
+    s = format (s, ", totlen-nifb %d",
+                buf->total_length_not_including_first_buffer);
+
+  if (buf->flags & VLIB_BUFFER_IS_TRACED)
+    s = format (s, ", trace 0x%x", buf->trace_index);
+
+  for (; buf->flags & VLIB_BUFFER_NEXT_PRESENT;)
+    {
+      u32 next_index = buf->next_buffer;
+
+      buf = vlib_get_buffer (vlib_get_main (), next_index);
+      s = format (s,
+                  "\n%Unext-buffer 0x%x, segment length %d, clone-count %u",
+                  format_white_space, indent, next_index,
+                  buf->current_length, buf->n_add_refs);
+    }
+
+  return s;
+}
+
+/* format() callback taking a vlib_buffer_t *.  Appends the
+   format_vlib_buffer summary followed by a hex dump of 64 bytes starting
+   at the buffer's current data. */
+u8 *
+format_vlib_buffer_and_data (u8 * s, va_list * args)
+{
+  vlib_buffer_t *buf = va_arg (*args, vlib_buffer_t *);
+
+  return format (s, "%U, %U",
+                 format_vlib_buffer, buf,
+                 format_hex_bytes, vlib_buffer_get_current (buf), 64);
+}
+
+/* format() callback taking a vlib_buffer_known_state_t.  Appends its
+   textual name, or "invalid" for any value that is not recognised. */
+static u8 *
+format_vlib_buffer_known_state (u8 * s, va_list * args)
+{
+  vlib_buffer_known_state_t state = va_arg (*args, vlib_buffer_known_state_t);
+  char *label = "invalid";
+
+  if (state == VLIB_BUFFER_UNKNOWN)
+    label = "unknown";
+  else if (state == VLIB_BUFFER_KNOWN_ALLOCATED)
+    label = "known-allocated";
+  else if (state == VLIB_BUFFER_KNOWN_FREE)
+    label = "known-free";
+
+  return format (s, "%s", label);
+}
+
+/* format() callback taking (vlib_main_t *, vlib_buffer_t *).  Appends the
+   current data of the buffer and of every buffer chained behind it to s. */
+u8 *
+format_vlib_buffer_contents (u8 * s, va_list * va)
+{
+  vlib_main_t *vm = va_arg (*va, vlib_main_t *);
+  vlib_buffer_t *seg = va_arg (*va, vlib_buffer_t *);
+
+  /* The first segment is always copied, chained ones follow. */
+  vec_add (s, vlib_buffer_get_current (seg), seg->current_length);
+
+  while (seg->flags & VLIB_BUFFER_NEXT_PRESENT)
+    {
+      seg = vlib_get_buffer (vm, seg->next_buffer);
+      vec_add (s, vlib_buffer_get_current (seg), seg->current_length);
+    }
+
+  return s;
+}
+
+/* Validate one buffer and, optionally, the chain hanging off it.
+ *
+ * vm                 - vlib main used to resolve buffer indices.
+ * bi                 - index of the buffer to check.
+ * follow_buffer_next - when non-zero, also validate every chained buffer.
+ * unique_hash        - optional hash of buffer indices already seen; each
+ *                      chained buffer is looked up in it and added to it.
+ *
+ * Returns 0 when everything checks out, otherwise a freshly formatted
+ * vector describing the first problem found (callers vec_free it).
+ */
+static u8 *
+vlib_validate_buffer_helper (vlib_main_t * vm,
+			     u32 bi,
+			     uword follow_buffer_next, uword ** unique_hash)
+{
+  vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  vlib_buffer_free_list_t *fl;
+
+  /* The buffer must name a live entry of the free list pool. */
+  if (pool_is_free_index
+      (bm->buffer_free_list_pool, vlib_buffer_get_free_list_index (b)))
+    return format (0, "unknown free list 0x%x",
+		   vlib_buffer_get_free_list_index (b));
+
+  fl =
+    pool_elt_at_index (bm->buffer_free_list_pool,
+		       vlib_buffer_get_free_list_index (b));
+
+  /* Current data may start inside the pre-data area but not before it. */
+  if ((signed) b->current_data < (signed) -VLIB_BUFFER_PRE_DATA_SIZE)
+    return format (0, "current data %d before pre-data", b->current_data);
+
+  /* ... and must end within the data bytes of the owning free list. */
+  if (b->current_data + b->current_length > fl->n_data_bytes)
+    return format (0, "%d-%d beyond end of buffer %d",
+		   b->current_data, b->current_length, fl->n_data_bytes);
+
+  if (follow_buffer_next && (b->flags & VLIB_BUFFER_NEXT_PRESENT))
+    {
+      vlib_buffer_known_state_t k;
+      u8 *msg, *result;
+
+      /* A chained buffer has to be known-allocated. */
+      k = vlib_buffer_is_known (vm, b->next_buffer);
+      if (k != VLIB_BUFFER_KNOWN_ALLOCATED)
+	return format (0, "next 0x%x: %U",
+		       b->next_buffer, format_vlib_buffer_known_state, k);
+
+      if (unique_hash)
+	{
+	  if (hash_get (*unique_hash, b->next_buffer))
+	    return format (0, "duplicate buffer 0x%x", b->next_buffer);
+
+	  hash_set1 (*unique_hash, b->next_buffer);
+	}
+
+      /* Recurse through the helper rather than vlib_validate_buffer():
+         the latter passes a null unique_hash, which would stop duplicate
+         detection after the first chained buffer. */
+      msg = vlib_validate_buffer_helper (vm, b->next_buffer,
+					 follow_buffer_next, unique_hash);
+      if (msg)
+	{
+	  result = format (0, "next 0x%x: %v", b->next_buffer, msg);
+	  vec_free (msg);
+	  return result;
+	}
+    }
+
+  return 0;
+}
+
+/* Validate buffer bi (and its chain when follow_buffer_next is non-zero)
+   without any uniqueness tracking.  Returns 0 when the buffer is valid,
+   otherwise a formatted vector describing the problem. */
+u8 *
+vlib_validate_buffer (vlib_main_t * vm, u32 bi, uword follow_buffer_next)
+{
+  uword **no_unique_hash = 0;
+
+  return vlib_validate_buffer_helper (vm, bi, follow_buffer_next,
+                                      no_unique_hash);
+}
+
+/* Validate n_buffers buffer indices read from buffers[] with a stride of
+ * next_buffer_stride entries.  Each index must not repeat, must be in
+ * known_state and must pass vlib_validate_buffer_helper().
+ *
+ * Returns 0 when all buffers pass, otherwise a formatted vector of the
+ * form "0x<index>: <reason>" for the first offending buffer.
+ */
+u8 *
+vlib_validate_buffers (vlib_main_t * vm,
+		       u32 * buffers,
+		       uword next_buffer_stride,
+		       uword n_buffers,
+		       vlib_buffer_known_state_t known_state,
+		       uword follow_buffer_next)
+{
+  uword *seen = hash_create (0, 0);
+  u32 *cursor = buffers;
+  u32 bi;
+  u8 *msg = 0, *result = 0;
+  uword n;
+
+  for (n = 0; n < n_buffers; n++)
+    {
+      vlib_buffer_known_state_t k;
+
+      bi = cursor[0];
+      cursor += next_buffer_stride;
+
+      if (hash_get (seen, bi))
+        /* Buffer is not unique. */
+        msg = format (0, "not unique");
+      else if ((k = vlib_buffer_is_known (vm, bi)) != known_state)
+        msg = format (0, "is %U; expected %U",
+                      format_vlib_buffer_known_state, k,
+                      format_vlib_buffer_known_state, known_state);
+      else
+        msg = vlib_validate_buffer_helper (vm, bi, follow_buffer_next,
+                                           &seen);
+
+      if (msg)
+        break;
+
+      hash_set1 (seen, bi);
+    }
+
+  /* Prefix the failure, if any, with the index that triggered it. */
+  if (msg)
+    {
+      result = format (0, "0x%x: %v", bi, msg);
+      vec_free (msg);
+    }
+
+  hash_free (seen);
+  return result;
+}
+
+/*
+ * Hand-craft a static vector w/ length 1, so vec_len(vlib_mains) = 1
+ * and vlib_mains[0] = &vlib_global_main from the beginning of time.
+ *
+ * The struct below is a vec_header_t followed by the single element; it
+ * is declared packed and cache-line aligned, and vlib_mains points at the
+ * element (.vm) rather than at the header.
+ *
+ * The only place which should ever expand vlib_mains is start_workers()
+ * in threads.c. It knows about the bootstrap vector.
+ */
+/* *INDENT-OFF* */
+static struct
+{
+  vec_header_t h;
+  vlib_main_t *vm;
+} __attribute__ ((packed)) __bootstrap_vlib_main_vector
+  __attribute__ ((aligned (CLIB_CACHE_LINE_BYTES))) =
+{
+  .h.len = 1,
+  .vm = &vlib_global_main,
+};
+/* *INDENT-ON* */
+
+vlib_main_t **vlib_mains = &__bootstrap_vlib_main_vector.vm;
+
+
+/* When debugging, check that each given buffer is in expected_state
+   (known allocated when freeing, known free when allocating) and then
+   flip it to the opposite known state.  Does nothing when CLIB_DEBUG
+   is 0. */
+static void
+vlib_buffer_validate_alloc_free (vlib_main_t * vm,
+				 u32 * buffers,
+				 uword n_buffers,
+				 vlib_buffer_known_state_t expected_state)
+{
+  vlib_buffer_known_state_t new_state;
+  char *action;
+  uword n;
+
+  if (CLIB_DEBUG == 0)
+    return;
+
+  /* Buffers expected to be allocated are the ones being freed. */
+  if (expected_state == VLIB_BUFFER_KNOWN_ALLOCATED)
+    {
+      action = "freeing";
+      new_state = VLIB_BUFFER_KNOWN_FREE;
+    }
+  else
+    {
+      action = "allocating";
+      new_state = VLIB_BUFFER_KNOWN_ALLOCATED;
+    }
+
+  for (n = 0; n < n_buffers; n++)
+    {
+      uword bi = buffers[n];
+      vlib_buffer_known_state_t known = vlib_buffer_is_known (vm, bi);
+
+      if (known != expected_state)
+        {
+          ASSERT (0);
+          vlib_panic_with_msg
+            (vm, "%s %U buffer 0x%x",
+             action, format_vlib_buffer_known_state, known, bi);
+        }
+
+      vlib_buffer_set_known_state (vm, bi, new_state);
+    }
+}
+
+/* Append every buffer index held by src to dst and release src's index
+   vector.  Nothing is touched when src holds no buffers. */
+void
+vlib_buffer_merge_free_lists (vlib_buffer_free_list_t * dst,
+			      vlib_buffer_free_list_t * src)
+{
+  uword n_src = vec_len (src->buffers);
+  u32 *tail;
+
+  if (n_src == 0)
+    return;
+
+  vec_add2_aligned (dst->buffers, tail, n_src, CLIB_CACHE_LINE_BYTES);
+  clib_memcpy (tail, src->buffers, n_src * sizeof (tail[0]));
+  vec_free (src->buffers);
+}
+
+/* Add buffer free list.
+ *
+ * vm           - vlib main; must be called on thread index 0.
+ * n_data_bytes - requested data size, rounded via vlib_buffer_round_size.
+ * is_public    - when non-zero the list is recorded in free_list_by_size
+ *                unless a list of that size is already recorded.
+ * is_default   - non-zero only for the recursive call that creates the
+ *                default free list.
+ * name         - either a heap vector, which is stored as the list name,
+ *                or a C string, which is copied into a new vector.
+ *
+ * When the pool is still empty the default free list is created first; a
+ * public request for the default size then simply returns that list.
+ * The new entry is mirrored into the free list pool of every other
+ * vlib_main at the same pool index.
+ *
+ * Returns the index of the free list.
+ */
+static u32
+vlib_buffer_create_free_list_helper (vlib_main_t * vm,
+				     u32 n_data_bytes,
+				     u32 is_public, u32 is_default, u8 * name)
+{
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  vlib_buffer_free_list_t *f;
+  int i;
+
+  ASSERT (vlib_get_thread_index () == 0);
+
+  if (!is_default && pool_elts (bm->buffer_free_list_pool) == 0)
+    {
+      u32 default_free_free_list_index;
+
+      /* *INDENT-OFF* */
+      default_free_free_list_index =
+        vlib_buffer_create_free_list_helper
+        (vm,
+         /* default buffer size */ VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES,
+         /* is_public */ 1,
+         /* is_default */ 1,
+         (u8 *) "default");
+      /* *INDENT-ON* */
+      ASSERT (default_free_free_list_index ==
+	      VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+      if (n_data_bytes == VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES && is_public)
+	{
+	  /* The caller is handed the default list, so the name it
+	     formatted is never stored anywhere: release it here instead
+	     of leaking it. */
+	  if (clib_mem_is_vec (name))
+	    vec_free (name);
+	  return default_free_free_list_index;
+	}
+    }
+
+  pool_get_aligned (bm->buffer_free_list_pool, f, CLIB_CACHE_LINE_BYTES);
+
+  memset (f, 0, sizeof (f[0]));
+  f->index = f - bm->buffer_free_list_pool;
+  f->n_data_bytes = vlib_buffer_round_size (n_data_bytes);
+  f->min_n_buffers_each_physmem_alloc = VLIB_FRAME_SIZE;
+  f->name = clib_mem_is_vec (name) ? name : format (0, "%s", name);
+
+  /* Setup free buffer template. */
+  vlib_buffer_set_free_list_index (&f->buffer_init_template, f->index);
+  f->buffer_init_template.n_add_refs = 0;
+
+  if (is_public)
+    {
+      /* First public list of a given size wins the by-size lookup. */
+      uword *p = hash_get (bm->free_list_by_size, f->n_data_bytes);
+      if (!p)
+	hash_set (bm->free_list_by_size, f->n_data_bytes, f->index);
+    }
+
+  clib_spinlock_init (&f->global_buffers_lock);
+
+  /* Mirror the entry into every other vlib_main; each copy starts with
+     no buffers of its own. */
+  for (i = 1; i < vec_len (vlib_mains); i++)
+    {
+      vlib_buffer_main_t *wbm = vlib_mains[i]->buffer_main;
+      vlib_buffer_free_list_t *wf;
+      pool_get_aligned (wbm->buffer_free_list_pool,
+			wf, CLIB_CACHE_LINE_BYTES);
+      ASSERT (f - bm->buffer_free_list_pool ==
+	      wf - wbm->buffer_free_list_pool);
+      wf[0] = f[0];
+      wf->buffers = 0;
+      wf->n_alloc = 0;
+    }
+
+  return f->index;
+}
+
+/* Create a non-public free list for buffers of n_data_bytes, named by
+   the printf-style fmt and arguments.  Returns the free list index. */
+u32
+vlib_buffer_create_free_list (vlib_main_t * vm, u32 n_data_bytes,
+			      char *fmt, ...)
+{
+  va_list ap;
+  u8 *list_name;
+
+  va_start (ap, fmt);
+  list_name = va_format (0, fmt, &ap);
+  va_end (ap);
+
+  return vlib_buffer_create_free_list_helper (vm, n_data_bytes,
+                                              0 /* is_public */ ,
+                                              0 /* is_default */ ,
+                                              list_name);
+}
+
+/* Return the index of the free list found by
+   vlib_buffer_get_free_list_with_size() for n_data_bytes; when that
+   lookup yields ~0, create a public free list named by fmt and its
+   arguments and return the new index instead. */
+u32
+vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes,
+				     char *fmt, ...)
+{
+  u32 index = vlib_buffer_get_free_list_with_size (vm, n_data_bytes);
+  va_list ap;
+  u8 *list_name;
+
+  if (index != ~0)
+    return index;
+
+  va_start (ap, fmt);
+  list_name = va_format (0, fmt, &ap);
+  va_end (ap);
+
+  return vlib_buffer_create_free_list_helper (vm, n_data_bytes,
+                                              1 /* is_public */ ,
+                                              0 /* is_default */ ,
+                                              list_name);
+}
+
+/* Give every physmem chunk recorded for free list f back through
+   vm->os_physmem_free, then free the vectors owned by f (name, chunk
+   list, buffer indices). */
+static void
+del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f)
+{
+  u32 chunk;
+
+  for (chunk = 0; chunk < vec_len (f->buffer_memory_allocated); chunk++)
+    {
+      vm->os_physmem_free (vm, vm->buffer_main->physmem_region,
+                           f->buffer_memory_allocated[chunk]);
+    }
+
+  vec_free (f->buffers);
+  vec_free (f->buffer_memory_allocated);
+  vec_free (f->name);
+}
+
+/* Delete buffer free list.
+ *
+ * Must run on thread index 0.  Buffers still held by the list are merged
+ * into another free list of the same data size when one exists, the
+ * list's resources are released, and the pool entry is poisoned and
+ * returned to the pool - here and in every other vlib_main.
+ */
+void
+vlib_buffer_delete_free_list_internal (vlib_main_t * vm, u32 free_list_index)
+{
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  vlib_buffer_free_list_t *f;
+  u32 merge_index;
+  int i;
+
+  ASSERT (vlib_get_thread_index () == 0);
+
+  f = vlib_buffer_get_free_list (vm, free_list_index);
+
+  ASSERT (vec_len (f->buffers) == f->n_alloc);
+
+  merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes);
+  if (merge_index != ~0 && merge_index != free_list_index)
+    {
+      vlib_buffer_free_list_t *merge_into =
+        pool_elt_at_index (bm->buffer_free_list_pool, merge_index);
+
+      vlib_buffer_merge_free_lists (merge_into, f);
+    }
+
+  del_free_list (vm, f);
+
+  /* Poison it. */
+  memset (f, 0xab, sizeof (f[0]));
+  pool_put (bm->buffer_free_list_pool, f);
+
+  /* Same poison-and-release for the mirrored entries. */
+  for (i = 1; i < vec_len (vlib_mains); i++)
+    {
+      vlib_buffer_main_t *wbm = vlib_mains[i]->buffer_main;
+      vlib_buffer_free_list_t *wf =
+        vlib_buffer_get_free_list (vlib_mains[i], free_list_index);
+
+      memset (wf, 0xab, sizeof (wf[0]));
+      pool_put (wbm->buffer_free_list_pool, wf);
+    }
+}
+
+/* Make sure free list has at least given number of free buffers.
+ *
+ * First tops fl up from the global_buffers vector of the same-index free
+ * list on vlib_mains[0] (taken under that list's spinlock), then
+ * allocates fresh buffer memory from the physmem region in chunks of at
+ * most 16 buffers.
+ *
+ * Returns min_free_buffers when fl already holds (or, after the top-up,
+ * now holds) that many buffers.  Otherwise returns the number of buffers
+ * newly allocated from physmem, which can fall short of what was needed
+ * - possibly 0 - when a physmem allocation fails part-way.
+ */
+static uword
+fill_free_list (vlib_main_t * vm,
+		vlib_buffer_free_list_t * fl, uword min_free_buffers)
+{
+  vlib_buffer_t *buffers, *b;
+  vlib_buffer_free_list_t *mfl;
+  int n, n_bytes, i;
+  u32 *bi;
+  u32 n_remaining, n_alloc, n_this_chunk;
+
+  /* Already have enough free buffers on free list? */
+  n = min_free_buffers - vec_len (fl->buffers);
+  if (n <= 0)
+    return min_free_buffers;
+
+  /* Try the global buffers of the matching list on vlib_mains[0].  The
+     emptiness test is done before the lock is taken; the lengths used
+     for the copy are read again while holding it. */
+  mfl = vlib_buffer_get_free_list (vlib_mains[0], fl->index);
+  if (vec_len (mfl->global_buffers) > 0)
+    {
+      int n_copy, n_left;
+      clib_spinlock_lock (&mfl->global_buffers_lock);
+      n_copy = clib_min (vec_len (mfl->global_buffers), n);
+      n_left = vec_len (mfl->global_buffers) - n_copy;
+      /* Take the last n_copy entries and shrink the global vector. */
+      vec_add_aligned (fl->buffers, mfl->global_buffers + n_left, n_copy,
+		       CLIB_CACHE_LINE_BYTES);
+      _vec_len (mfl->global_buffers) = n_left;
+      clib_spinlock_unlock (&mfl->global_buffers_lock);
+      n = min_free_buffers - vec_len (fl->buffers);
+      if (n <= 0)
+	return min_free_buffers;
+    }
+
+  /* Always allocate round number of buffers. */
+  n = round_pow2 (n, CLIB_CACHE_LINE_BYTES / sizeof (u32));
+
+  /* Always allocate new buffers in reasonably large sized chunks. */
+  n = clib_max (n, fl->min_n_buffers_each_physmem_alloc);
+
+  n_remaining = n;
+  n_alloc = 0;
+  while (n_remaining > 0)
+    {
+      n_this_chunk = clib_min (n_remaining, 16);
+
+      /* Each buffer is a vlib_buffer_t header plus the list's data bytes. */
+      n_bytes = n_this_chunk * (sizeof (b[0]) + fl->n_data_bytes);
+
+      /* drb: removed power-of-2 ASSERT */
+      buffers =
+	vm->os_physmem_alloc_aligned (vm, vm->buffer_main->physmem_region,
+				      n_bytes, sizeof (vlib_buffer_t));
+      /* Out of physmem: report only what was allocated so far. */
+      if (!buffers)
+	return n_alloc;
+
+      /* Record chunk as being allocated so we can free it later. */
+      vec_add1 (fl->buffer_memory_allocated, buffers);
+
+      fl->n_alloc += n_this_chunk;
+      n_alloc += n_this_chunk;
+      n_remaining -= n_this_chunk;
+
+      /* Append the indices of the new buffers to the free list and, in
+         debug images, record each of them as known free. */
+      b = buffers;
+      vec_add2_aligned (fl->buffers, bi, n_this_chunk, CLIB_CACHE_LINE_BYTES);
+      for (i = 0; i < n_this_chunk; i++)
+	{
+	  bi[i] = vlib_get_buffer_index (vm, b);
+
+	  if (CLIB_DEBUG > 0)
+	    vlib_buffer_set_known_state (vm, bi[i], VLIB_BUFFER_KNOWN_FREE);
+	  b = vlib_buffer_next_contiguous (b, fl->n_data_bytes);
+	}
+
+      memset (buffers, 0, n_bytes);
+
+      /* Initialize all new buffers. */
+      b = buffers;
+      for (i = 0; i < n_this_chunk; i++)
+	{
+	  vlib_buffer_init_for_free_list (b, fl);
+	  b = vlib_buffer_next_contiguous (b, fl->n_data_bytes);
+	}
+
+      /* Optional per-list hook, called with the indices just added. */
+      if (fl->buffer_init_function)
+	fl->buffer_init_function (vm, fl, bi, n_this_chunk);
+    }
+  return n_alloc;
+}
+
+/* Take n_alloc_buffers buffer indices off the end of free_list and copy
+ * them into alloc_buffers.
+ *
+ * The list is refilled first through fill_free_list().  Returns
+ * n_alloc_buffers on success and 0 - with alloc_buffers and the free
+ * list untouched - when the list cannot supply that many buffers.
+ */
+static u32
+alloc_from_free_list (vlib_main_t * vm,
+		      vlib_buffer_free_list_t * free_list,
+		      u32 * alloc_buffers, u32 n_alloc_buffers)
+{
+  u32 *dst, *src;
+  uword len;
+  uword n_filled;
+
+  dst = alloc_buffers;
+
+  n_filled = fill_free_list (vm, free_list, n_alloc_buffers);
+  if (n_filled == 0)
+    return 0;
+
+  len = vec_len (free_list->buffers);
+
+  /* fill_free_list() reports a non-zero count even when physmem ran out
+     after only some of the chunks were allocated.  Taking the buffers
+     anyway would compute a source pointer in front of the vector and
+     wrap its length, so treat a short list as an allocation failure. */
+  if (PREDICT_FALSE (len < n_alloc_buffers))
+    return 0;
+
+  src = free_list->buffers + len - n_alloc_buffers;
+  clib_memcpy (dst, src, n_alloc_buffers * sizeof (u32));
+
+  _vec_len (free_list->buffers) -= n_alloc_buffers;
+
+  /* Verify that buffers are known free. */
+  vlib_buffer_validate_alloc_free (vm, alloc_buffers,
+				   n_alloc_buffers, VLIB_BUFFER_KNOWN_FREE);
+
+  return n_alloc_buffers;
+}
+
+
+/* Allocate a given number of buffers from the default free list into the
+   given array.  Returns the number actually allocated, which will be
+   either zero or the number requested. */
+static u32
+vlib_buffer_alloc_internal (vlib_main_t * vm, u32 * buffers, u32 n_buffers)
+{
+  vlib_buffer_free_list_t *default_fl =
+    pool_elt_at_index (vm->buffer_main->buffer_free_list_pool,
+                       VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+  return alloc_from_free_list (vm, default_fl, buffers, n_buffers);
+}
+
+/* Same as vlib_buffer_alloc_internal, but allocating from the free list
+   stored at free_list_index of the free list pool. */
+static u32
+vlib_buffer_alloc_from_free_list_internal (vlib_main_t * vm,
+					   u32 * buffers,
+					   u32 n_buffers, u32 free_list_index)
+{
+  vlib_buffer_free_list_t *chosen_fl =
+    pool_elt_at_index (vm->buffer_main->buffer_free_list_pool,
+                       free_list_index);
+
+  return alloc_from_free_list (vm, chosen_fl, buffers, n_buffers);
+}
+
+/* Install fp as the buffer free callback of vm's buffer main and return
+   the callback that was installed before. */
+void *
+vlib_set_buffer_free_callback (vlib_main_t * vm, void *fp)
+{
+  void *previous = vm->buffer_main->buffer_free_callback;
+
+  vm->buffer_main->buffer_free_callback = fp;
+
+  return previous;
+}
+
+/* Common body of the buffer free entry points.
+ *
+ * buffers / n_buffers - indices of the buffers to free.
+ * follow_buffer_next  - when non-zero, buffers chained behind each given
+ *                       buffer are released as well.
+ *
+ * An installed buffer_free_callback gets the first look at the array and
+ * returns how many buffers are left for this function to handle.
+ */
+static_always_inline void
+vlib_buffer_free_inline (vlib_main_t * vm,
+			 u32 * buffers, u32 n_buffers, u32 follow_buffer_next)
+{
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  vlib_buffer_free_list_t *fl;
+  u32 fi;
+  int i;
+  u32 (*cb) (vlib_main_t * vm, u32 * buffers, u32 n_buffers,
+	     u32 follow_buffer_next);
+
+  cb = bm->buffer_free_callback;
+
+  if (PREDICT_FALSE (cb != 0))
+    n_buffers = (*cb) (vm, buffers, n_buffers, follow_buffer_next);
+
+  if (!n_buffers)
+    return;
+
+  for (i = 0; i < n_buffers; i++)
+    {
+      vlib_buffer_t *b;
+      u32 bi = buffers[i];
+
+      b = vlib_get_buffer (vm, bi);
+      VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b);
+      fl = vlib_buffer_get_buffer_free_list (vm, b, &fi);
+
+      /* The only current use of this callback: multicast recycle */
+      if (PREDICT_FALSE (fl->buffers_added_to_freelist_function != 0))
+	{
+	  int j;
+
+	  /* Only the given buffer is added here (the chain is not walked);
+	     the last argument is 1 when VLIB_BUFFER_RECYCLE is clear. */
+	  vlib_buffer_add_to_free_list
+	    (vm, fl, buffers[i], (b->flags & VLIB_BUFFER_RECYCLE) == 0);
+
+	  /* Remember each such free list once so that its function is
+	     called a single time after the loop. */
+	  for (j = 0; j < vec_len (bm->announce_list); j++)
+	    {
+	      if (fl == bm->announce_list[j])
+		goto already_announced;
+	    }
+	  vec_add1 (bm->announce_list, fl);
+	already_announced:
+	  ;
+	}
+      else
+	{
+	  /* Buffers flagged VLIB_BUFFER_RECYCLE are left alone on this
+	     path. */
+	  if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_RECYCLE) == 0))
+	    {
+	      u32 flags, next;
+
+	      /* Walk the chain: a buffer with outstanding extra
+	         references only loses one reference, any other buffer
+	         goes back on the free list.
+	         NOTE(review): every buffer of the chain is added to fl,
+	         the list looked up for the head buffer - confirm chains
+	         never mix buffers from different free lists. */
+	      do
+		{
+		  vlib_buffer_t *nb = vlib_get_buffer (vm, bi);
+		  flags = nb->flags;
+		  next = nb->next_buffer;
+		  if (nb->n_add_refs)
+		    nb->n_add_refs--;
+		  else
+		    {
+		      vlib_buffer_validate_alloc_free (vm, &bi, 1,
+						       VLIB_BUFFER_KNOWN_ALLOCATED);
+		      vlib_buffer_add_to_free_list (vm, fl, bi, 1);
+		    }
+		  bi = next;
+		}
+	      while (follow_buffer_next
+		     && (flags & VLIB_BUFFER_NEXT_PRESENT));
+
+	    }
+	}
+    }
+  /* Notify every free list collected above, then empty the announce
+     list (its storage is kept). */
+  if (vec_len (bm->announce_list))
+    {
+      vlib_buffer_free_list_t *fl;
+      for (i = 0; i < vec_len (bm->announce_list); i++)
+	{
+	  fl = bm->announce_list[i];
+	  fl->buffers_added_to_freelist_function (vm, fl);
+	}
+      _vec_len (bm->announce_list) = 0;
+    }
+}
+
+/* Free the given buffers together with any buffers chained to them. */
+static void
+vlib_buffer_free_internal (vlib_main_t * vm, u32 * buffers, u32 n_buffers)
+{
+  vlib_buffer_free_inline (vm, buffers, n_buffers,
+                           1 /* follow_buffer_next */ );
+}
+
+/* Free only the given buffers; chained buffers are not followed. */
+static void
+vlib_buffer_free_no_next_internal (vlib_main_t * vm, u32 * buffers,
+				   u32 n_buffers)
+{
+  vlib_buffer_free_inline (vm, buffers, n_buffers,
+                           0 /* follow_buffer_next */ );
+}
+
+/* Free list init hook: copy the template packet data into each of the
+   given buffers.  The template is recovered from the free list's
+   buffer_init_function_opaque value. */
+static void __attribute__ ((unused))
+vlib_packet_template_buffer_init (vlib_main_t * vm,
+				  vlib_buffer_free_list_t * fl,
+				  u32 * buffers, u32 n_buffers)
+{
+  vlib_packet_template_t *tmpl =
+    uword_to_pointer (fl->buffer_init_function_opaque,
+                      vlib_packet_template_t *);
+  u32 *bi;
+  u32 *end = buffers + n_buffers;
+
+  for (bi = buffers; bi < end; bi++)
+    {
+      vlib_buffer_t *b = vlib_get_buffer (vm, bi[0]);
+
+      ASSERT (b->current_length == vec_len (tmpl->packet_data));
+      clib_memcpy (vlib_buffer_get_current (b), tmpl->packet_data,
+                   b->current_length);
+    }
+}
+
+/* Initialize packet template t from n_packet_data_bytes of packet_data.
+ *
+ * The free list name is formatted from fmt and the variadic arguments.
+ * With the worker thread barrier held, t is zeroed, the packet data is
+ * copied into t->packet_data and a public free list sized for the packet
+ * is created whose buffers are pre-filled through
+ * vlib_packet_template_buffer_init.
+ */
+void
+vlib_packet_template_init (vlib_main_t * vm,
+			   vlib_packet_template_t * t,
+			   void *packet_data,
+			   uword n_packet_data_bytes,
+			   uword min_n_buffers_each_physmem_alloc,
+			   char *fmt, ...)
+{
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  va_list va;
+  u8 *name;
+  vlib_buffer_free_list_t *fl;
+
+  va_start (va, fmt);
+  name = va_format (0, fmt, &va);
+  va_end (va);
+
+  /* NOTE(review): when a vlib_packet_template_init_cb callback is
+     registered it is invoked here, and execution still continues with
+     the built-in initialization below - confirm that both are meant to
+     run. */
+  if (bm->cb.vlib_packet_template_init_cb)
+    bm->cb.vlib_packet_template_init_cb (vm, (void *) t, packet_data,
+					 n_packet_data_bytes,
+					 min_n_buffers_each_physmem_alloc,
+					 name);
+
+  vlib_worker_thread_barrier_sync (vm);
+
+  memset (t, 0, sizeof (t[0]));
+
+  vec_add (t->packet_data, packet_data, n_packet_data_bytes);
+  t->min_n_buffers_each_physmem_alloc = min_n_buffers_each_physmem_alloc;
+
+  t->free_list_index = vlib_buffer_create_free_list_helper
+    (vm, n_packet_data_bytes,
+     /* is_public */ 1,
+     /* is_default */ 0,
+     name);
+
+  /* presumably index 0 is the default free list, which a template must
+     not share - TODO confirm */
+  ASSERT (t->free_list_index != 0);
+  fl = vlib_buffer_get_free_list (vm, t->free_list_index);
+  fl->min_n_buffers_each_physmem_alloc = t->min_n_buffers_each_physmem_alloc;
+
+  /* New buffers of this list get the template data copied in; the
+     template pointer travels in the opaque value. */
+  fl->buffer_init_function = vlib_packet_template_buffer_init;
+  fl->buffer_init_function_opaque = pointer_to_uword (t);
+
+  /* Buffers start with current data at offset 0 and the full packet
+     length. */
+  fl->buffer_init_template.current_data = 0;
+  fl->buffer_init_template.current_length = n_packet_data_bytes;
+  fl->buffer_init_template.flags = 0;
+  fl->buffer_init_template.n_add_refs = 0;
+  vlib_worker_thread_barrier_release (vm);
+}
+
+/* Allocate one buffer with vlib_buffer_alloc(), copy the template packet
+   data to its current data position and set its current length to the
+   template length.  Stores the buffer index in *bi_result and returns
+   the buffer's data pointer, or returns 0 (leaving *bi_result untouched)
+   when no buffer could be allocated. */
+void *
+vlib_packet_template_get_packet (vlib_main_t * vm,
+				 vlib_packet_template_t * t, u32 * bi_result)
+{
+  vlib_buffer_t *pkt;
+  u32 pkt_index;
+
+  if (vlib_buffer_alloc (vm, &pkt_index, 1) != 1)
+    return 0;
+
+  *bi_result = pkt_index;
+  pkt = vlib_get_buffer (vm, pkt_index);
+
+  clib_memcpy (vlib_buffer_get_current (pkt),
+               t->packet_data, vec_len (t->packet_data));
+  pkt->current_length = vec_len (t->packet_data);
+
+  return pkt->data;
+}
+
+/* Refill the template's (empty) free_buffers vector with up to
+   min_n_buffers_each_physmem_alloc buffers taken from the template's
+   free list; the vector length ends up as the number actually
+   allocated. */
+void
+vlib_packet_template_get_packet_helper (vlib_main_t * vm,
+					vlib_packet_template_t * t)
+{
+  word n_wanted = t->min_n_buffers_each_physmem_alloc;
+  word n_template_bytes = vec_len (t->packet_data);
+  word n_got;
+
+  ASSERT (n_template_bytes > 0);
+  ASSERT (vec_len (t->free_buffers) == 0);
+
+  /* Make room for n_wanted indices, then trim to what was obtained. */
+  vec_validate (t->free_buffers, n_wanted - 1);
+  n_got = vlib_buffer_alloc_from_free_list (vm, t->free_buffers,
+                                            n_wanted, t->free_list_index);
+  _vec_len (t->free_buffers) = n_got;
+}
+
+/* Append given data to end of buffer, possibly allocating new buffers.
+ *
+ * A buffer_index of 0 means "no buffer yet": a first buffer is allocated
+ * from the free list at free_list_index.  Data is appended to the last
+ * buffer of the chain, and further buffers are allocated and chained
+ * while data remains.  Returns the index of the head buffer; reports
+ * "out of buffers" through clib_error() when an allocation fails.
+ */
+u32
+vlib_buffer_add_data (vlib_main_t * vm,
+		      u32 free_list_index,
+		      u32 buffer_index, void *data, u32 n_data_bytes)
+{
+  u32 head_bi = buffer_index;
+  u32 buffer_size, bytes_left, room;
+  vlib_buffer_t *tail;
+  void *src;
+
+  if (head_bi == 0
+      && 1 != vlib_buffer_alloc_from_free_list (vm, &head_bi, 1,
+                                                free_list_index))
+    goto out_of_buffers;
+
+  src = data;
+  bytes_left = n_data_bytes;
+  buffer_size = vlib_buffer_free_list_buffer_size (vm, free_list_index);
+
+  /* The cached chain length of the head is about to become stale. */
+  tail = vlib_get_buffer (vm, head_bi);
+  tail->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID;
+
+  /* Get to the end of the chain before we try to append data... */
+  while (tail->flags & VLIB_BUFFER_NEXT_PRESENT)
+    tail = vlib_get_buffer (vm, tail->next_buffer);
+
+  for (;;)
+    {
+      u32 chunk;
+
+      ASSERT (buffer_size >= tail->current_length);
+      room = buffer_size - (tail->current_data + tail->current_length);
+      chunk = clib_min (room, bytes_left);
+      clib_memcpy (vlib_buffer_get_current (tail) + tail->current_length,
+                   src, chunk);
+      tail->current_length += chunk;
+      bytes_left -= chunk;
+      if (bytes_left == 0)
+        return head_bi;
+
+      src += chunk;
+
+      /* Still data left: chain a fresh buffer and continue in it. */
+      if (1 !=
+          vlib_buffer_alloc_from_free_list (vm, &tail->next_buffer, 1,
+                                            free_list_index))
+        goto out_of_buffers;
+
+      tail->flags |= VLIB_BUFFER_NEXT_PRESENT;
+      tail = vlib_get_buffer (vm, tail->next_buffer);
+    }
+
+out_of_buffers:
+  clib_error ("out of buffers");
+  return head_bi;
+}
+
+/* Append data_len bytes of data to the chain headed by first, starting
+ * in *last and allocating further buffers from the free list at
+ * free_list_index whenever the current last buffer is full.  *last is
+ * updated to the new last buffer.
+ *
+ * Returns the number of bytes copied, which is less than data_len when a
+ * buffer allocation fails.
+ */
+u16
+vlib_buffer_chain_append_data_with_alloc (vlib_main_t * vm,
+					  u32 free_list_index,
+					  vlib_buffer_t * first,
+					  vlib_buffer_t ** last,
+					  void *data, u16 data_len)
+{
+  vlib_buffer_t *tail = *last;
+  u32 n_buffer_bytes =
+    vlib_buffer_free_list_buffer_size (vm, free_list_index);
+  u16 copied = 0;
+
+  ASSERT (n_buffer_bytes >= tail->current_length + tail->current_data);
+
+  while (data_len)
+    {
+      u16 room = n_buffer_bytes - tail->current_length - tail->current_data;
+      u16 chunk;
+
+      if (room == 0)
+        {
+          /* Last buffer is full: allocate and chain another one. */
+          if (1 !=
+              vlib_buffer_alloc_from_free_list (vm, &tail->next_buffer, 1,
+                                                free_list_index))
+            return copied;
+
+          tail = vlib_buffer_chain_buffer (vm, first, tail,
+                                           tail->next_buffer);
+          *last = tail;
+          room = n_buffer_bytes - tail->current_length - tail->current_data;
+        }
+
+      chunk = (data_len < room) ? data_len : room;
+      clib_memcpy (vlib_buffer_get_current (tail) + tail->current_length,
+                   data + copied, chunk);
+      vlib_buffer_chain_increase_length (first, tail, chunk);
+      copied += chunk;
+      data_len -= chunk;
+    }
+
+  return copied;
+}
+
+/* Grow the [buffer_mem_start, buffer_mem_start + buffer_mem_size) window
+ * kept in the buffer main so that it also covers [start, start + size).
+ * A buffer_mem_size of 0 means that no range has been recorded yet.
+ * Panics when the resulting window is larger than
+ * 2^(32 + CLIB_LOG2_CACHE_LINE_BYTES) bytes.
+ */
+void
+vlib_buffer_add_mem_range (vlib_main_t * vm, uword start, uword size)
+{
+  vlib_buffer_main_t *bm = vm->buffer_main;
+
+  if (bm->buffer_mem_size == 0)
+    {
+      /* First range: it becomes the window. */
+      bm->buffer_mem_start = start;
+      bm->buffer_mem_size = size;
+    }
+  else if (start < bm->buffer_mem_start)
+    {
+      /* Move the start down while keeping the old end covered, then
+         make sure the new range fits as well. */
+      bm->buffer_mem_size += bm->buffer_mem_start - start;
+      bm->buffer_mem_start = start;
+      if (size > bm->buffer_mem_size)
+	bm->buffer_mem_size = size;
+    }
+  else
+    {
+      /* start >= buffer_mem_start.  The equal case belongs here too: a
+         range that begins at the current start but is longer than the
+         window must still enlarge it. */
+      uword new_size = start - bm->buffer_mem_start + size;
+      if (new_size > bm->buffer_mem_size)
+	bm->buffer_mem_size = new_size;
+    }
+
+  if ((u64) bm->buffer_mem_size >
+      ((u64) 1 << (32 + CLIB_LOG2_CACHE_LINE_BYTES)))
+    {
+      clib_panic ("buffer memory size out of range!");
+    }
+}
+
+/* format() callback taking (vlib_buffer_free_list_t *, u32 thread).  A
+   null free list produces the column header line; otherwise one table
+   row is produced with the list's name, index, data size and its
+   allocated / free byte and buffer counts. */
+static u8 *
+format_vlib_buffer_free_list (u8 * s, va_list * va)
+{
+  vlib_buffer_free_list_t *fl = va_arg (*va, vlib_buffer_free_list_t *);
+  u32 thread = va_arg (*va, u32);
+  uword per_buffer_bytes, n_free;
+
+  if (!fl)
+    return format (s, "%=7s%=30s%=12s%=12s%=12s%=12s%=12s%=12s",
+                   "Thread", "Name", "Index", "Size", "Alloc", "Free",
+                   "#Alloc", "#Free");
+
+  /* A buffer occupies its header plus the list's data bytes. */
+  per_buffer_bytes = sizeof (vlib_buffer_t) + fl->n_data_bytes;
+  n_free = vec_len (fl->buffers);
+
+  return format (s, "%7d%30v%12d%12d%=12U%=12U%=12d%=12d", thread,
+                 fl->name, fl->index, fl->n_data_bytes,
+                 format_memory_size, per_buffer_bytes * fl->n_alloc,
+                 format_memory_size, per_buffer_bytes * n_free,
+                 fl->n_alloc, n_free);
+}
+
+/* CLI handler for "show buffers": prints a header line followed by one
+   row per free list of every entry in vlib_mains.  Always returns 0. */
+static clib_error_t *
+show_buffers (vlib_main_t * vm,
+	      unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  u32 thread = 0;
+
+  /* A null free list makes the formatter emit the column headers. */
+  vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, 0, 0);
+
+  do
+    {
+      vlib_buffer_main_t *bm = vlib_mains[thread]->buffer_main;
+      vlib_buffer_free_list_t *fl;
+
+    /* *INDENT-OFF* */
+    pool_foreach (fl, bm->buffer_free_list_pool, ({
+      vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, fl, thread);
+    }));
+    /* *INDENT-ON* */
+
+      thread++;
+    }
+  while (thread < vec_len (vlib_mains));
+
+  return 0;
+}
+
+/* Registers the "show buffers" CLI command; show_buffers() handles it. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_buffers_command, static) = {
+  .path = "show buffers",
+  .short_help = "Show packet buffer allocation",
+  .function = show_buffers,
+};
+/* *INDENT-ON* */
+
+/* Set up vm->buffer_main.
+ *
+ * When an external callback table has been registered in
+ * vlib_buffer_callbacks it is copied and nothing else is done.
+ * Otherwise the built-in callbacks are installed and the "buffers"
+ * physmem region is allocated, with a retry using fake physmem when the
+ * first attempt fails.  Returns 0 on success or the error from the last
+ * allocation attempt.
+ */
+clib_error_t *
+vlib_buffer_main_init (struct vlib_main_t * vm)
+{
+  vlib_buffer_main_t *bm;
+  clib_error_t *err;
+
+  vec_validate (vm->buffer_main, 0);
+  bm = vm->buffer_main;
+
+  if (vlib_buffer_callbacks)
+    {
+      /* An external plugin has registered its own buffer callbacks, so
+         we just copy them and quit. */
+      clib_memcpy (&bm->cb, vlib_buffer_callbacks,
+                   sizeof (vlib_buffer_callbacks_t));
+      bm->callbacks_registered = 1;
+      return 0;
+    }
+
+  /* Built-in implementations from this file. */
+  bm->cb.vlib_buffer_alloc_cb = &vlib_buffer_alloc_internal;
+  bm->cb.vlib_buffer_alloc_from_free_list_cb =
+    &vlib_buffer_alloc_from_free_list_internal;
+  bm->cb.vlib_buffer_free_cb = &vlib_buffer_free_internal;
+  bm->cb.vlib_buffer_free_no_next_cb = &vlib_buffer_free_no_next_internal;
+  bm->cb.vlib_buffer_delete_free_list_cb =
+    &vlib_buffer_delete_free_list_internal;
+  clib_spinlock_init (&bm->buffer_known_hash_lockp);
+
+  /* Allocate the default region. */
+  err = vlib_physmem_region_alloc (vm, "buffers",
+                                   vlib_buffer_physmem_sz, 0,
+                                   VLIB_PHYSMEM_F_INIT_MHEAP |
+                                   VLIB_PHYSMEM_F_HAVE_BUFFERS,
+                                   &bm->physmem_region);
+
+  if (err)
+    {
+      clib_error_free (err);
+
+      /* We may be running unprivileged, so try to allocate fake
+         physmem. */
+      err = vlib_physmem_region_alloc (vm, "buffers (fake)",
+                                       vlib_buffer_physmem_sz, 0,
+                                       VLIB_PHYSMEM_F_FAKE |
+                                       VLIB_PHYSMEM_F_INIT_MHEAP |
+                                       VLIB_PHYSMEM_F_HAVE_BUFFERS,
+                                       &bm->physmem_region);
+    }
+
+  return err;
+}
+
+/* Early config handler for the "buffers" section.
+ *
+ * Understands "memory-size-in-mb <n>", which sets the size of the buffer
+ * physmem region.  Returns 0 on success, a parse error for any other
+ * input, or an error when <n> does not fit the 32-bit byte count.
+ */
+static clib_error_t *
+vlib_buffers_configure (vlib_main_t * vm, unformat_input_t * input)
+{
+  u32 size_in_mb;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "memory-size-in-mb %d", &size_in_mb))
+	{
+	  /* vlib_buffer_physmem_sz is a u32 byte count: 4096 MB and up
+	     would silently wrap when shifted (4096 becomes 0). */
+	  if (size_in_mb >= (1 << 12))
+	    return clib_error_return (0,
+				      "memory-size-in-mb %u too large "
+				      "(must be below 4096)", size_in_mb);
+	  vlib_buffer_physmem_sz = size_in_mb << 20;
+	}
+      else
+	return unformat_parse_error (input);
+    }
+
+  unformat_free (input);
+  return 0;
+}
+
+VLIB_EARLY_CONFIG_FUNCTION (vlib_buffers_configure, "buffers");
+
+
+/** @endcond */
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h
new file mode 100644
index 00000000..e47dbc6d
--- /dev/null
+++ b/src/vlib/buffer.h
@@ -0,0 +1,540 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * buffer.h: VLIB buffers
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_buffer_h
+#define included_vlib_buffer_h
+
+#include <vppinfra/types.h>
+#include <vppinfra/cache.h>
+#include <vppinfra/serialize.h>
+#include <vppinfra/vector.h>
+#include <vppinfra/lock.h>
+#include <vlib/error.h>		/* for vlib_error_t */
+
+#include <vlib/config.h>	/* for __PRE_DATA_SIZE */
+#define VLIB_BUFFER_DATA_SIZE		(2048)
+#define VLIB_BUFFER_PRE_DATA_SIZE	__PRE_DATA_SIZE
+
+/** \file
+    vlib buffer structure definition and a few select
+    access methods. This structure and the buffer allocation
+    mechanism should perhaps live in vnet, but it would take a lot
+    of typing to make it so.
+*/
+
+/* VLIB buffer representation. */
+typedef struct
+{
+  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+  STRUCT_MARK (template_start);
+  /* Offset within data[] that we are currently processing.
+     If negative current header points into predata area. */
+  i16 current_data;  /**< signed offset in data[], pre_data[]
+                        that we are currently processing.
+                        If negative current header points into predata area.
+                     */
+  u16 current_length;  /**< Nbytes between current data and
+                          the end of this buffer.
+                       */
+  u32 flags; /**< buffer flags:
+                <br> VLIB_BUFFER_FREE_LIST_INDEX_MASK: bits used to store free list index,
+                <br> VLIB_BUFFER_IS_TRACED: trace this buffer.
+                <br> VLIB_BUFFER_NEXT_PRESENT: this is a multi-chunk buffer.
+                <br> VLIB_BUFFER_TOTAL_LENGTH_VALID: as it says
+                <br> VLIB_BUFFER_REPL_FAIL: packet replication failure
+                <br> VLIB_BUFFER_RECYCLE: as it says
+                <br> VLIB_BUFFER_FLOW_REPORT: buffer is a flow report,
+                set to avoid adding it to a flow report
+                <br> VLIB_BUFFER_EXT_HDR_VALID: buffer contains valid external buffer manager header,
+                <br> VLIB_BUFFER_FLAG_USER(n): user-defined bit N
+             */
+
+/* any change to the following line requires update of
+ * vlib_buffer_get_free_list_index(...) and
+ * vlib_buffer_set_free_list_index(...) functions */
+#define VLIB_BUFFER_FREE_LIST_INDEX_MASK ((1 << 5) - 1)
+
+#define VLIB_BUFFER_IS_TRACED (1 << 5)
+#define VLIB_BUFFER_LOG2_NEXT_PRESENT (6)
+#define VLIB_BUFFER_NEXT_PRESENT (1 << VLIB_BUFFER_LOG2_NEXT_PRESENT)
+#define VLIB_BUFFER_IS_RECYCLED (1 << 7)
+#define VLIB_BUFFER_TOTAL_LENGTH_VALID (1 << 8)
+#define VLIB_BUFFER_REPL_FAIL (1 << 9)
+#define VLIB_BUFFER_RECYCLE (1 << 10)
+#define VLIB_BUFFER_FLOW_REPORT (1 << 11)
+#define VLIB_BUFFER_EXT_HDR_VALID (1 << 12)
+
+  /* User defined buffer flags: user flag n is bit (32 - n), so n = 1 is bit 31. */
+#define LOG2_VLIB_BUFFER_FLAG_USER(n) (32 - (n))
+#define VLIB_BUFFER_FLAG_USER(n) (1 << LOG2_VLIB_BUFFER_FLAG_USER(n))
+
+    STRUCT_MARK (template_end);
+
+  u32 next_buffer;   /**< Next buffer for this linked-list of buffers.
+                        Only valid if VLIB_BUFFER_NEXT_PRESENT flag is set.
+                     */
+
+  vlib_error_t error;	/**< Error code for buffers to be enqueued
+                           to error handler.
+                        */
+  u32 current_config_index; /**< Used by feature subgraph arcs to
+                               visit enabled feature nodes
+                            */
+
+  u8 feature_arc_index;	/**< Used to identify feature arcs by intermediate
+                           feature node
+                        */
+
+  u8 n_add_refs; /**< Number of additional references to this buffer. */
+
+  u8 dont_waste_me[2]; /**< Available space in the (precious)
+                          first 32 octets of buffer metadata
+                          Before allocating any of it, discussion required!
+                       */
+
+  u32 opaque[10]; /**< Opaque data used by sub-graphs for their own purposes.
+                    See .../vnet/vnet/buffer.h
+                 */
+    CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
+
+  u32 trace_index; /**< Specifies index into trace buffer
+                      if VLIB_BUFFER_IS_TRACED flag is set.
+                   */
+  u32 recycle_count; /**< Used by L2 path recycle code */
+
+  u32 total_length_not_including_first_buffer;
+  /**< Only valid for first buffer in chain. Current length plus
+     total length given here give total number of bytes in buffer chain.
+  */
+  u32 opaque2[13];  /**< More opaque data, currently unused */
+
+  /***** end of second cache line */
+    CLIB_CACHE_LINE_ALIGN_MARK (cacheline2);
+  u8 pre_data[VLIB_BUFFER_PRE_DATA_SIZE];  /**< Space for inserting data
+                                               before buffer start.
+                                               Packet rewrite string will be
+                                               rewritten backwards and may extend
+                                               back before buffer->data[0].
+                                               Must come directly before packet data.
+                                            */
+
+  u8 data[0]; /**< Packet data. Hardware DMA here */
+} vlib_buffer_t;		/* Must be a multiple of 64B. */
+/* Size of vlib_buffer_t once its pre_data[] area is subtracted. */
+#define VLIB_BUFFER_HDR_SIZE  (sizeof(vlib_buffer_t) - VLIB_BUFFER_PRE_DATA_SIZE)
+
+/** \brief Prefetch buffer metadata.
+    Only the first 64 bytes, which hold most header information, are fetched
+
+    @param b - (vlib_buffer_t *) pointer to the buffer
+    @param type - LOAD, STORE. In most cases, STORE is the right answer
+*/
+
+#define vlib_prefetch_buffer_header(b,type) CLIB_PREFETCH (b, 64, type)
+
+always_inline vlib_buffer_t *
+vlib_buffer_next_contiguous (vlib_buffer_t * b, u32 buffer_bytes)
+{				/* address buffer_bytes past the end of *b */
+  return (vlib_buffer_t *) ((u8 *) (b + 1) + buffer_bytes);
+}
+
+always_inline void
+vlib_buffer_struct_is_sane (vlib_buffer_t * b)
+{
+  /* The struct size has to be a whole number of 64-byte units. */
+  ASSERT ((sizeof (*b) & 63) == 0);
+  /* pre_data[] must end exactly where data[] begins. */
+  ASSERT (&b->pre_data[VLIB_BUFFER_PRE_DATA_SIZE] == &b->data[0]);
+}
+
+/** \brief Return the address of the byte currently being processed
+
+    @param b - (vlib_buffer_t *) buffer to inspect
+    @return - (void *) &b->data[b->current_data]; this points into
+    pre_data[] when current_data is negative
+*/
+always_inline void *
+vlib_buffer_get_current (vlib_buffer_t * b)
+{
+  /* current_data may not reach back past the start of pre_data[]. */
+  ASSERT ((signed) b->current_data >= (signed) -VLIB_BUFFER_PRE_DATA_SIZE);
+  return &b->data[b->current_data];
+}
+
+/** \brief Move the current data offset by a signed number of bytes
+
+    @param b - (vlib_buffer_t *) buffer to adjust
+    @param l - (word) bytes to skip; a negative value moves backwards
+*/
+always_inline void
+vlib_buffer_advance (vlib_buffer_t * b, word l)
+{
+  ASSERT (l <= b->current_length);	/* cannot skip more than is left */
+  b->current_length -= l;
+  b->current_data += l;
+}
+
+/** \brief Tell whether the buffer holds at least 'l' bytes of current data
+
+    @param b - (vlib_buffer_t *) buffer to inspect
+    @param l - (word) number of bytes needed
+    @return - 1 if b->current_length >= l, otherwise 0
+*/
+always_inline u8
+vlib_buffer_has_space (vlib_buffer_t * b, word l)
+{
+  return l <= b->current_length;
+}
+
+/** \brief Rewind the buffer to data[0], giving back to the length any
+    bytes that had been skipped past it.
+
+    @param b - (vlib_buffer_t *) buffer to rewind
+*/
+always_inline void
+vlib_buffer_reset (vlib_buffer_t * b)
+{
+  if (b->current_data > 0)	/* a negative offset leaves the length as is */
+    b->current_length += b->current_data;
+  b->current_data = 0;
+}
+
+/** \brief Access the first opaque scratch area of a buffer
+
+    @param b - (vlib_buffer_t *) buffer to inspect
+    @return - (void *) address of b->opaque[0]
+*/
+always_inline void *
+vlib_get_buffer_opaque (vlib_buffer_t * b)
+{
+  return &b->opaque[0];
+}
+
+/** \brief Access the second opaque scratch area of a buffer
+
+    @param b - (vlib_buffer_t *) buffer to inspect
+    @return - (void *) address of b->opaque2[0]
+*/
+always_inline void *
+vlib_get_buffer_opaque2 (vlib_buffer_t * b)
+{
+  return &b->opaque2[0];
+}
+
+/** \brief Get pointer to the first byte after the buffer's current data
+ * @param b     buffer to inspect
+ * @return      b->data offset by current_data plus current_length
+ */
+always_inline u8 *
+vlib_buffer_get_tail (vlib_buffer_t * b)
+{
+  return &b->data[b->current_data + b->current_length];
+}
+
+/** \brief Append uninitialized data to buffer
+ * @param b     buffer to extend at its tail
+ * @param size  number of bytes to add to b->current_length
+ * @return      the previous tail, i.e. the first of the new bytes
+ */
+always_inline void *
+vlib_buffer_put_uninit (vlib_buffer_t * b, u8 size)
+{
+  void *old_tail = vlib_buffer_get_tail (b);
+  /* XXX nothing here verifies that the buffer has room for size bytes */
+  b->current_length += size;
+  return old_tail;
+}
+
+/** \brief Prepend uninitialized data to buffer
+ * @param b     buffer to extend at its head
+ * @param size  number of bytes to move current_data backwards by
+ * @return      the new current data pointer (start of the added bytes)
+ */
+always_inline void *
+vlib_buffer_push_uninit (vlib_buffer_t * b, u8 size)
+{
+  /* The new start must not fall in front of pre_data[]. */
+  ASSERT (size <= b->current_data + VLIB_BUFFER_PRE_DATA_SIZE);
+  b->current_length += size;
+  b->current_data -= size;
+  return vlib_buffer_get_current (b);
+}
+
+/** \brief Make head room, typically for packet headers
+ * @param b     buffer whose current data offset is moved forward
+ * @param size  number of bytes to skip; current_length is not changed
+ * @return      the new current data pointer
+ */
+always_inline void *
+vlib_buffer_make_headroom (vlib_buffer_t * b, u8 size)
+{				/* NOTE(review): ASSERT mirrors push_uninit; confirm */
+  ASSERT (size <= b->current_data + VLIB_BUFFER_PRE_DATA_SIZE);
+  b->current_data += size;
+  return vlib_buffer_get_current (b);
+}
+
+/** \brief Retrieve bytes from buffer head
+ * @param b     pointer to the buffer
+ * @param size  number of bytes to pull
+ * @return      pointer to the pulled bytes, or 0 if size > current_length
+ */
+always_inline void *
+vlib_buffer_pull (vlib_buffer_t * b, u8 size)
+{
+  /* Only current_length bytes can be consumed; pre_data[] is head room. */
+  if (b->current_length < size)
+    return 0;
+  void *data = vlib_buffer_get_current (b);
+  vlib_buffer_advance (b, size);
+  return data;
+}
+
+/* Forward declaration. */
+struct vlib_main_t;
+
+typedef struct vlib_buffer_free_list_t
+{
+  /* Template buffer used to initialize first 16 bytes of buffers
+     allocated on this free list. */
+  vlib_buffer_t buffer_init_template;
+
+  /* Our index into vlib_main_t's buffer_free_list_pool. */
+  u32 index;
+
+  /* Number of data bytes for buffers in this free list. */
+  u32 n_data_bytes;
+
+  /* Number of buffers to allocate when we need to allocate new buffers
+     from physmem heap. */
+  u32 min_n_buffers_each_physmem_alloc;
+
+  /* Total number of buffers allocated from this free list. */
+  u32 n_alloc;
+
+  /* Vector of free buffers.  Each element is a byte offset into I/O heap. */
+  u32 *buffers;
+
+  /* Global vector of free buffers, used only on main thread.
+     Buffers are returned to the global vector only when the number of
+     buffers on the free buffers list grows above a threshold. */
+  u32 *global_buffers;
+  clib_spinlock_t global_buffers_lock;
+
+  /* Memory chunks allocated for this free list
+     recorded here so they can be freed when free list
+     is deleted. */
+  void **buffer_memory_allocated;
+
+  /* Free list name. */
+  u8 *name;
+
+  /* Callback functions to initialize newly allocated buffers.
+     If null buffers are zeroed. */
+  void (*buffer_init_function) (struct vlib_main_t * vm,
+				struct vlib_buffer_free_list_t * fl,
+				u32 * buffers, u32 n_buffers);
+
+  /* Callback function to announce that buffers have been
+     added to the freelist */
+  void (*buffers_added_to_freelist_function)
+    (struct vlib_main_t * vm, struct vlib_buffer_free_list_t * fl);
+
+  uword buffer_init_function_opaque;
+} __attribute__ ((aligned (16))) vlib_buffer_free_list_t;
+
+typedef struct			/* buffer operations invoked through vlib_buffer_main_t.cb */
+{
+  u32 (*vlib_buffer_alloc_cb) (struct vlib_main_t * vm, u32 * buffers,
+			       u32 n_buffers);
+  u32 (*vlib_buffer_alloc_from_free_list_cb) (struct vlib_main_t * vm,
+					      u32 * buffers, u32 n_buffers,
+					      u32 free_list_index);
+  void (*vlib_buffer_free_cb) (struct vlib_main_t * vm, u32 * buffers,
+			       u32 n_buffers);
+  void (*vlib_buffer_free_no_next_cb) (struct vlib_main_t * vm, u32 * buffers,
+				       u32 n_buffers);
+  void (*vlib_packet_template_init_cb) (struct vlib_main_t * vm, void *t,
+					void *packet_data,
+					uword n_packet_data_bytes,
+					uword
+					min_n_buffers_each_physmem_alloc,
+					u8 * name);
+  void (*vlib_buffer_delete_free_list_cb) (struct vlib_main_t * vm,
+					   u32 free_list_index);
+} vlib_buffer_callbacks_t;	/* the wrappers in buffer_funcs.h ASSERT the slot they call is set */
+
+extern vlib_buffer_callbacks_t *vlib_buffer_callbacks;
+
+typedef struct
+{
+  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+  /* Virtual memory address and size of buffer memory, used for calculating
+     buffer index */
+  uword buffer_mem_start;
+  uword buffer_mem_size;
+  vlib_physmem_region_index_t physmem_region;
+
+  /* Buffer free callback, for subversive activities */
+    u32 (*buffer_free_callback) (struct vlib_main_t * vm,
+				 u32 * buffers,
+				 u32 n_buffers, u32 follow_buffer_next);
+  /* Pool of buffer free lists.
+     Multiple free lists exist for packet generator which uses
+     separate free lists for each packet stream --- so as to avoid
+     initializing static data for each packet generated. */
+  vlib_buffer_free_list_t *buffer_free_list_pool;
+#define VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX (0)
+#define VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES VLIB_BUFFER_DATA_SIZE
+
+  /* Hash table mapping buffer size (rounded to next unit of
+     sizeof (vlib_buffer_t)) to free list index. */
+  uword *free_list_by_size;
+
+  /* Hash table mapping buffer index to the vlib_buffer_known_state_t
+     stored by vlib_buffer_set_known_state (see buffer_funcs.h).
+     If buffer index is not in hash table then vlib_buffer_is_known
+     reports VLIB_BUFFER_UNKNOWN.  Guarded by buffer_known_hash_lockp. */
+  uword *buffer_known_hash;
+  clib_spinlock_t buffer_known_hash_lockp;
+
+  /* List of free-lists needing Blue Light Special announcements */
+  vlib_buffer_free_list_t **announce_list;
+
+  /* Callbacks */
+  vlib_buffer_callbacks_t cb;
+  int callbacks_registered;
+} vlib_buffer_main_t;
+
+void vlib_buffer_add_mem_range (struct vlib_main_t *vm, uword start,
+				uword size);
+clib_error_t *vlib_buffer_main_init (struct vlib_main_t *vm);
+
+typedef struct
+{
+  struct vlib_main_t *vlib_main;
+
+  u32 first_buffer, last_buffer;
+
+  union				/* tx and rx state share the same storage */
+  {
+    struct
+    {
+      /* Total accumulated bytes in chain starting with first_buffer. */
+      u32 n_total_data_bytes;
+
+      /* Max number of bytes to accumulate in chain starting with first_buffer.
+         As this limit is reached buffers are enqueued to next node. */
+      u32 max_n_data_bytes_per_chain;
+
+      /* Next node to enqueue buffers to relative to current process node. */
+      u32 next_index;
+
+      /* Free list to use to allocate new buffers. */
+      u32 free_list_index;
+    } tx;
+
+    struct
+    {
+      /* CLIB fifo of buffer indices waiting to be unserialized. */
+      u32 *buffer_fifo;
+
+      /* Event type used to signal that RX buffers have been added to fifo. */
+      uword ready_one_time_event;
+    } rx;
+  };
+} vlib_serialize_buffer_main_t;
+
+void serialize_open_vlib_buffer (serialize_main_t * m, struct vlib_main_t *vm,
+				 vlib_serialize_buffer_main_t * sm);
+void unserialize_open_vlib_buffer (serialize_main_t * m,
+				   struct vlib_main_t *vm,
+				   vlib_serialize_buffer_main_t * sm);
+
+u32 serialize_close_vlib_buffer (serialize_main_t * m);
+void unserialize_close_vlib_buffer (serialize_main_t * m);
+void *vlib_set_buffer_free_callback (struct vlib_main_t *vm, void *fp);
+
+always_inline u32
+serialize_vlib_buffer_n_bytes (serialize_main_t * m)
+{
+  /* tx byte total + stream's current buffer index + overflow vector length */
+  serialize_stream_t *st = &m->stream;
+  vlib_serialize_buffer_main_t *sbm =
+    uword_to_pointer (st->data_function_opaque, vlib_serialize_buffer_main_t *);
+  return sbm->tx.n_total_data_bytes + st->current_buffer_index +
+    vec_len (st->overflow_buffer);
+}
+
+/*
+ */
+
+/** \brief Compile time buffer trajectory tracing option
+    Turn this on if you run into "bad monkey" contexts,
+    and you want to know exactly which nodes they've visited...
+    See vlib/main.c...
+*/
+#define VLIB_BUFFER_TRACE_TRAJECTORY 0
+
+#if VLIB_BUFFER_TRACE_TRAJECTORY > 0
+#define VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b) (b)->pre_data[0]=0
+#else
+#define VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b)
+#endif /* VLIB_BUFFER_TRACE_TRAJECTORY */
+
+#endif /* included_vlib_buffer_h */
+/* Declares __<x>_buffer_callbacks and a constructor that registers it. */
+#define VLIB_BUFFER_REGISTER_CALLBACKS(x,...)                           \
+    __VA_ARGS__ vlib_buffer_callbacks_t __##x##_buffer_callbacks;       \
+static void __vlib_add_buffer_callbacks_t_##x (void)                    \
+    __attribute__((__constructor__)) ;                                  \
+static void __vlib_add_buffer_callbacks_t_##x (void)                    \
+{                                                                       \
+    if (vlib_buffer_callbacks)                                          \
+      clib_panic ("vlib buffer callbacks already registered");          \
+    vlib_buffer_callbacks = &__##x##_buffer_callbacks;                  \
+}                                                                       \
+__VA_ARGS__ vlib_buffer_callbacks_t __##x##_buffer_callbacks
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h
new file mode 100644
index 00000000..d51de6be
--- /dev/null
+++ b/src/vlib/buffer_funcs.h
@@ -0,0 +1,946 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * buffer_funcs.h: VLIB buffer related functions/inlines
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_buffer_funcs_h
+#define included_vlib_buffer_funcs_h
+
+#include <vppinfra/hash.h>
+
+/** \file
+    vlib buffer access methods.
+*/
+
+
+/** \brief Translate buffer index into buffer pointer
+
+    @param vm - (vlib_main_t *) vlib main data structure pointer
+    @param buffer_index - (u32) buffer index, counted in cache lines
+    @return - (vlib_buffer_t *) buffer_mem_start plus that byte offset
+*/
+always_inline vlib_buffer_t *
+vlib_get_buffer (vlib_main_t * vm, u32 buffer_index)
+{
+  vlib_buffer_main_t *bmain = vm->buffer_main;
+  /* Widen before shifting so the byte offset is computed as a uword. */
+  uword byte_offset = (uword) buffer_index << CLIB_LOG2_CACHE_LINE_BYTES;
+  ASSERT (byte_offset < bmain->buffer_mem_size);
+  return uword_to_pointer (bmain->buffer_mem_start + byte_offset, void *);
+}
+
+/** \brief Translate buffer pointer into buffer index
+
+    @param vm - (vlib_main_t *) vlib main data structure pointer
+    @param p - (void *) buffer pointer, inside buffer memory
+    @return - (u32) byte offset from buffer_mem_start, in cache lines
+*/
+always_inline u32
+vlib_get_buffer_index (vlib_main_t * vm, void *p)
+{
+  vlib_buffer_main_t *bmain = vm->buffer_main;
+  uword byte_offset = pointer_to_uword (p) - bmain->buffer_mem_start;
+  /* p must be a cache-line aligned address inside buffer memory. */
+  ASSERT (pointer_to_uword (p) >= bmain->buffer_mem_start);
+  ASSERT (byte_offset < bmain->buffer_mem_size);
+  ASSERT ((byte_offset & ((1 << CLIB_LOG2_CACHE_LINE_BYTES) - 1)) == 0);
+  return byte_offset >> CLIB_LOG2_CACHE_LINE_BYTES;
+}
+
+/** \brief Get next buffer in buffer linklist, or zero for end of list.
+    @param vm - (vlib_main_t *) vlib main data structure pointer
+    @param b - (vlib_buffer_t *) buffer whose successor is wanted
+    @return - (vlib_buffer_t *) next buffer, or 0 without NEXT_PRESENT
+*/
+always_inline vlib_buffer_t *
+vlib_get_next_buffer (vlib_main_t * vm, vlib_buffer_t * b)
+{
+  if ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0)
+    return 0;
+  return vlib_get_buffer (vm, b->next_buffer);
+}
+
+uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm,
+					     vlib_buffer_t * b_first);
+
+/** \brief Get length in bytes of the buffer chain
+
+    @param vm - (vlib_main_t *) vlib main data structure pointer
+    @param b - (vlib_buffer_t *) first buffer of the chain
+    @return - (uword) b->current_length plus the length of any chained buffers
+*/
+always_inline uword
+vlib_buffer_length_in_chain (vlib_main_t * vm, vlib_buffer_t * b)
+{
+  /* Common case: a single-segment packet. */
+  if (PREDICT_TRUE (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)))
+    return b->current_length;
+
+  /* Chained, and the first segment already caches the rest's length. */
+  if (PREDICT_TRUE (b->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID))
+    return (uword) b->current_length
+      + b->total_length_not_including_first_buffer;
+  return vlib_buffer_length_in_chain_slow_path (vm, b);
+}
+
+/** \brief Get length in bytes of the buffer index buffer chain
+
+    @param vm - (vlib_main_t *) vlib main data structure pointer
+    @param bi - (u32) index of the first buffer of the chain
+    @return - (uword) length of buffer chain
+*/
+always_inline uword
+vlib_buffer_index_length_in_chain (vlib_main_t * vm, u32 bi)
+{
+  /* Resolve the index, then reuse the pointer-based variant. */
+  return vlib_buffer_length_in_chain (vm, vlib_get_buffer (vm, bi));
+}
+
+/** \brief Copy the current data of every buffer in a chain to memory
+
+    @param vm - (vlib_main_t *) vlib main data structure pointer
+    @param buffer_index - (u32) index of the first buffer of the chain
+    @param contents - (u8 *) memory, <strong>must be large enough</strong>
+    @return - (uword) number of bytes copied
+*/
+always_inline uword
+vlib_buffer_contents (vlib_main_t * vm, u32 buffer_index, u8 * contents)
+{
+  uword n_copied = 0;
+
+  for (;;)
+    {
+      vlib_buffer_t *seg = vlib_get_buffer (vm, buffer_index);
+      uword seg_len = seg->current_length;
+
+      /* Append this segment's current data to what was copied so far. */
+      clib_memcpy (contents + n_copied, &seg->data[seg->current_data],
+		   seg_len);
+      n_copied += seg_len;
+
+      if ((seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0)
+	return n_copied;
+      buffer_index = seg->next_buffer;
+    }
+}
+
+/* Physical address of data[0] of the buffer with the given index. */
+always_inline u64
+vlib_get_buffer_data_physical_address (vlib_main_t * vm, u32 buffer_index)
+{
+  /* Offset of the buffer in buffer memory, then of data[] inside it. */
+  uword buffer_offset = (uword) buffer_index << CLIB_LOG2_CACHE_LINE_BYTES;
+  uword data_offset = buffer_offset + STRUCT_OFFSET_OF (vlib_buffer_t, data);
+  return vlib_physmem_offset_to_physical (vm, vm->buffer_main->physmem_region,
+					  data_offset);
+}
+
+/** \brief Prefetch buffer metadata by buffer index
+    The first 64 bytes of a buffer contain most header information
+
+    @param vm - (vlib_main_t *) vlib main data structure pointer
+    @param bi - (u32) buffer index
+    @param type - LOAD, STORE. In most cases, STORE is the right answer
+*/
+/* Looks the buffer up with vlib_get_buffer, then prefetches its header. */
+#define vlib_prefetch_buffer_with_index(vm,bi,type)	\
+  do {							\
+    vlib_buffer_t * _b = vlib_get_buffer (vm, bi);	\
+    vlib_prefetch_buffer_header (_b, type);		\
+  } while (0)
+
+#if 0
+/* Iterate over known allocated vlib bufs. You probably do not want
+ * to do this!
+ @param vm      the vlib_main_t
+ @param bi      found allocated buffer index
+ @param body    operation to perform on buffer index
+ function executes body for each allocated buffer index
+ */
+#define vlib_buffer_foreach_allocated(vm,bi,body)                \
+do {                                                             \
+  vlib_main_t * _vmain = (vm);                                   \
+  vlib_buffer_main_t * _bmain = &_vmain->buffer_main;            \
+  hash_pair_t * _vbpair;                                         \
+  hash_foreach_pair(_vbpair, _bmain->buffer_known_hash, ({       \
+    if (VLIB_BUFFER_KNOWN_ALLOCATED == _vbpair->value[0]) {      \
+      (bi) = _vbpair->key;                                       \
+      body;                                                      \
+    }                                                            \
+  }));                                                           \
+} while (0)
+#endif
+
+typedef enum
+{
+  /* Returned by vlib_buffer_is_known when the index has no hash entry. */
+  VLIB_BUFFER_UNKNOWN,
+
+  /* Index has a buffer_known_hash entry: currently free / allocated. */
+  VLIB_BUFFER_KNOWN_FREE,
+  VLIB_BUFFER_KNOWN_ALLOCATED,
+} vlib_buffer_known_state_t;
+
+always_inline vlib_buffer_known_state_t
+vlib_buffer_is_known (vlib_main_t * vm, u32 buffer_index)
+{
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  clib_spinlock_lock (&bm->buffer_known_hash_lockp);
+  uword *p = hash_get (bm->buffer_known_hash, buffer_index);
+  uword state = p ? p[0] : VLIB_BUFFER_UNKNOWN;	/* read while locked */
+  clib_spinlock_unlock (&bm->buffer_known_hash_lockp);
+  return state;
+}
+
+always_inline void
+vlib_buffer_set_known_state (vlib_main_t * vm, u32 buffer_index,
+			     vlib_buffer_known_state_t state)
+{
+  vlib_buffer_main_t *bmain = vm->buffer_main;
+  /* Record the state for buffer_index while holding the hash lock. */
+  clib_spinlock_lock (&bmain->buffer_known_hash_lockp);
+  hash_set (bmain->buffer_known_hash, buffer_index, state);
+  clib_spinlock_unlock (&bmain->buffer_known_hash_lockp);
+}
+
+/* Validates sanity of a single buffer.
+   Returns format'ed vector with error message if any. */
+u8 *vlib_validate_buffer (vlib_main_t * vm, u32 buffer_index,
+			  uword follow_chain);
+
+/** \brief Allocate buffers into supplied array
+
+    @param vm - (vlib_main_t *) vlib main data structure pointer
+    @param buffers - (u32 * ) array that receives the buffer indices
+    @param n_buffers - (u32) number of buffers requested
+    @return - (u32) number of buffers actually allocated, may be
+    less than the number requested or zero
+*/
+always_inline u32
+vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers)
+{
+  vlib_buffer_callbacks_t *cb = &vm->buffer_main->cb;
+
+  /* Allocation is delegated to whichever callback was installed. */
+  ASSERT (cb->vlib_buffer_alloc_cb);
+  return cb->vlib_buffer_alloc_cb (vm, buffers, n_buffers);
+}
+
+always_inline u32
+vlib_buffer_round_size (u32 size)
+{
+  return round_pow2 (size, sizeof (vlib_buffer_t));	/* unit: the struct size */
+}
+
+always_inline u32
+vlib_buffer_get_free_list_index (vlib_buffer_t * b)
+{
+  return b->flags & VLIB_BUFFER_FREE_LIST_INDEX_MASK;	/* index bits of flags */
+}
+
+always_inline void
+vlib_buffer_set_free_list_index (vlib_buffer_t * b, u32 index)
+{
+  /* Only VLIB_BUFFER_FREE_LIST_INDEX_MASK bits of flags hold the index; if
+     more free lists are ever needed, consider the 2nd cacheline instead. */
+  ASSERT (VLIB_BUFFER_FREE_LIST_INDEX_MASK & 1);
+  ASSERT (index <= VLIB_BUFFER_FREE_LIST_INDEX_MASK);
+  /* Replace the old index bits, leaving every other flag untouched. */
+  b->flags = ((b->flags & ~VLIB_BUFFER_FREE_LIST_INDEX_MASK)
+	      | (index & VLIB_BUFFER_FREE_LIST_INDEX_MASK));
+}
+
+/** \brief Allocate buffers from specific freelist into supplied array
+    @param vm - (vlib_main_t *) vlib main data structure pointer
+    @param buffers - (u32 * ) array that receives the buffer indices
+    @param n_buffers - (u32) number of buffers requested
+    @param free_list_index - (u32) free list to allocate from
+    @return - (u32) number of buffers actually allocated, may be
+    less than the number requested or zero
+*/
+always_inline u32
+vlib_buffer_alloc_from_free_list (vlib_main_t * vm, u32 * buffers,
+				  u32 n_buffers, u32 free_list_index)
+{
+  vlib_buffer_callbacks_t *cb = &vm->buffer_main->cb;
+
+  /* Allocation is delegated to whichever callback was installed. */
+  ASSERT (cb->vlib_buffer_alloc_from_free_list_cb);
+
+  return cb->vlib_buffer_alloc_from_free_list_cb (vm, buffers, n_buffers,
+						  free_list_index);
+}
+
+/** \brief Free buffers
+    Frees the entire buffer chain for each buffer
+
+    @param vm - (vlib_main_t *) vlib main data structure pointer
+    @param buffers - (u32 * ) buffer index array
+    @param n_buffers - (u32) number of buffers to free
+
+*/
+always_inline void
+vlib_buffer_free (vlib_main_t * vm,
+		  /* pointer to first buffer */
+		  u32 * buffers,
+		  /* number of buffers to free */
+		  u32 n_buffers)
+{
+  vlib_buffer_main_t *bm = vm->buffer_main;
+
+  ASSERT (bm->cb.vlib_buffer_free_cb);
+  /* Called as a plain statement: the callback returns void. */
+  bm->cb.vlib_buffer_free_cb (vm, buffers, n_buffers);
+}
+
+/** \brief Free buffers, does not free the buffer chain for each buffer
+
+    @param vm - (vlib_main_t *) vlib main data structure pointer
+    @param buffers - (u32 * ) buffer index array
+    @param n_buffers - (u32) number of buffers to free
+
+*/
+always_inline void
+vlib_buffer_free_no_next (vlib_main_t * vm,
+			  /* pointer to first buffer */
+			  u32 * buffers,
+			  /* number of buffers to free */
+			  u32 n_buffers)
+{
+  vlib_buffer_main_t *bm = vm->buffer_main;
+
+  ASSERT (bm->cb.vlib_buffer_free_no_next_cb);
+  /* Called as a plain statement: the callback returns void. */
+  bm->cb.vlib_buffer_free_no_next_cb (vm, buffers, n_buffers);
+}
+
+/** \brief Free one buffer
+    Convenience wrapper around vlib_buffer_free for a single index.
+
+    @param vm - (vlib_main_t *) vlib main data structure pointer
+    @param buffer_index - (u32) index of the first buffer of the chain
+*/
+always_inline void
+vlib_buffer_free_one (vlib_main_t * vm, u32 buffer_index)
+{
+  vlib_buffer_free (vm, &buffer_index, 1 /* n_buffers */ );
+}
+
+/* Add/delete buffer free lists. */
+u32 vlib_buffer_create_free_list (vlib_main_t * vm, u32 n_data_bytes,
+				  char *fmt, ...);
+always_inline void
+vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index)
+{
+  vlib_buffer_callbacks_t *cb = &vm->buffer_main->cb;
+
+  /* Deletion is delegated to whichever callback was installed. */
+  ASSERT (cb->vlib_buffer_delete_free_list_cb);
+  cb->vlib_buffer_delete_free_list_cb (vm, free_list_index);
+}
+
+/* Find already existing public free list with given size or create one. */
+u32 vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes,
+					 char *fmt, ...);
+
+/* Merge two free lists */
+void vlib_buffer_merge_free_lists (vlib_buffer_free_list_t * dst,
+				   vlib_buffer_free_list_t * src);
+
+/* Make sure we have at least given number of unaligned buffers. */
+void vlib_buffer_free_list_fill_unaligned (vlib_main_t * vm,
+					   vlib_buffer_free_list_t *
+					   free_list,
+					   uword n_unaligned_buffers);
+
+always_inline u32
+vlib_buffer_get_free_list_with_size (vlib_main_t * vm, u32 size)
+{
+  /* Look the rounded size up in the by-size hash; ~0 means no such list. */
+  u32 rounded = vlib_buffer_round_size (size);
+  uword *hit = hash_get (vm->buffer_main->free_list_by_size, rounded);
+
+  return hit == 0 ? ~0 : hit[0];
+}
+
+always_inline vlib_buffer_free_list_t *
+vlib_buffer_get_buffer_free_list (vlib_main_t * vm, vlib_buffer_t * b,
+				  u32 * index)
+{
+  /* Report the buffer's free list index through 'index', then resolve it. */
+  vlib_buffer_main_t *buffer_main = vm->buffer_main;
+  u32 fl_index = vlib_buffer_get_free_list_index (b);
+  *index = fl_index;
+  return pool_elt_at_index (buffer_main->buffer_free_list_pool, fl_index);
+}
+
+always_inline vlib_buffer_free_list_t *
+vlib_buffer_get_free_list (vlib_main_t * vm, u32 free_list_index)
+{
+  /* Fetch the pool element for this index from the buffer main's pool. */
+  vlib_buffer_main_t *buffer_main = vm->buffer_main;
+  vlib_buffer_free_list_t *fl =
+    pool_elt_at_index (buffer_main->buffer_free_list_pool, free_list_index);
+
+  /* The element is expected to record the very index it was found at. */
+  ASSERT (free_list_index == fl->index);
+
+  return fl;
+}
+
+always_inline u32
+vlib_buffer_free_list_buffer_size (vlib_main_t * vm, u32 free_list_index)
+{
+  /* Resolve the free list for this index and report the value it
+     keeps in its n_data_bytes field. */
+  return vlib_buffer_get_free_list (vm, free_list_index)->n_data_bytes;
+}
+
+void vlib_aligned_memcpy (void *_dst, void *_src, int n_bytes);
+
+/* Reasonably fast buffer copy routine. */
+always_inline void
+vlib_copy_buffers (u32 * dst, u32 * src, u32 n)
+{
+  u32 i = 0;
+
+  /* Bulk part: four indices per pass, each pass in ascending order so
+     the result matches a plain element-by-element forward copy. */
+  for (; n - i >= 4; i += 4)
+    {
+      dst[i + 0] = src[i + 0];
+      dst[i + 1] = src[i + 1];
+      dst[i + 2] = src[i + 2];
+      dst[i + 3] = src[i + 3];
+    }
+
+  /* Tail: the remaining zero to three indices. */
+  for (; i < n; i++)
+    {
+      dst[i] = src[i];
+    }
+}
+
+/* Append given data to end of buffer, possibly allocating new buffers. */
+u32 vlib_buffer_add_data (vlib_main_t * vm,
+			  u32 free_list_index,
+			  u32 buffer_index, void *data, u32 n_data_bytes);
+
+/* Duplicate every buffer in the chain starting at b; returns 0 on failure. */
+always_inline vlib_buffer_t *
+vlib_buffer_copy (vlib_main_t * vm, vlib_buffer_t * b)
+{
+  vlib_buffer_t *s, *d, *fd;
+  uword n_alloc, n_buffers = 1;
+  u32 flag_mask = VLIB_BUFFER_NEXT_PRESENT | VLIB_BUFFER_TOTAL_LENGTH_VALID;
+  int i;
+  /* Count the segments of the source chain (it sizes the array below). */
+  s = b;
+  while (s->flags & VLIB_BUFFER_NEXT_PRESENT)
+    {
+      n_buffers++;
+      s = vlib_get_buffer (vm, s->next_buffer);
+    }
+  u32 new_buffers[n_buffers];
+  /* Ask for one new buffer per source segment. */
+  n_alloc = vlib_buffer_alloc (vm, new_buffers, n_buffers);
+  /* No guarantee that we'll get all the buffers we asked for; on a short
+     allocation give back what was obtained and report failure with 0. */
+  if (PREDICT_FALSE (n_alloc < n_buffers))
+    {
+      if (n_alloc > 0)
+	vlib_buffer_free (vm, new_buffers, n_alloc);
+      return 0;
+    }
+
+  /* 1st segment: also receives the chained length and the opaque area */
+  s = b;
+  fd = d = vlib_get_buffer (vm, new_buffers[0]);
+  d->current_data = s->current_data;
+  d->current_length = s->current_length;
+  d->flags = s->flags & flag_mask;
+  d->total_length_not_including_first_buffer =
+    s->total_length_not_including_first_buffer;
+  clib_memcpy (d->opaque, s->opaque, sizeof (s->opaque));
+  clib_memcpy (vlib_buffer_get_current (d),
+	       vlib_buffer_get_current (s), s->current_length);
+
+  /* next segments: each copy is linked behind the previous copy */
+  for (i = 1; i < n_buffers; i++)
+    {
+      /* previous */
+      d->next_buffer = new_buffers[i];
+      /* current */
+      s = vlib_get_buffer (vm, s->next_buffer);
+      d = vlib_get_buffer (vm, new_buffers[i]);
+      d->current_data = s->current_data;
+      d->current_length = s->current_length;
+      clib_memcpy (vlib_buffer_get_current (d),
+		   vlib_buffer_get_current (s), s->current_length);
+      d->flags = s->flags & flag_mask;
+    }
+  /* Hand back the head of the new chain. */
+  return fd;
+}
+
+/** \brief Create multiple clones of buffer and store them in the supplied array
+
+    @param vm - (vlib_main_t *) vlib main data structure pointer
+    @param src_buffer - (u32) source buffer index
+    @param buffers - (u32 * ) buffer index array
+    @param n_buffers - (u8) number of buffer clones requested
+    @param head_end_offset - (u16) offset relative to current position
+           where packet head ends
+    @return - (u8) number of buffers actually cloned, may be
+    less than the number requested or zero
+*/
+
+always_inline u8
+vlib_buffer_clone (vlib_main_t * vm, u32 src_buffer, u32 * buffers,
+		   u8 n_buffers, u16 head_end_offset)
+{
+  u8 i;
+  vlib_buffer_t *s = vlib_get_buffer (vm, src_buffer);
+  /* Debug checks: source has no extra references, at least one clone asked. */
+  ASSERT (s->n_add_refs == 0);
+  ASSERT (n_buffers);
+  /* Small current_length: return the source itself plus full copies of it. */
+  if (s->current_length <= head_end_offset + CLIB_CACHE_LINE_BYTES * 2)
+    {
+      buffers[0] = src_buffer;
+      for (i = 1; i < n_buffers; i++)
+	{
+	  vlib_buffer_t *d;
+	  d = vlib_buffer_copy (vm, s);
+	  if (d == 0)
+	    return i;		/* copy failed: i entries were filled in */
+	  buffers[i] = vlib_get_buffer_index (vm, d);
+
+	}
+      return n_buffers;
+    }
+  /* Otherwise allocate the head buffers from the source's free list. */
+  n_buffers = vlib_buffer_alloc_from_free_list (vm, buffers, n_buffers,
+						vlib_buffer_get_free_list_index
+						(s));
+  if (PREDICT_FALSE (n_buffers == 0))
+    {
+      buffers[0] = src_buffer;
+      return 1;
+    }
+  /* Each head gets the first head_end_offset bytes and chains to the source. */
+  for (i = 0; i < n_buffers; i++)
+    {
+      vlib_buffer_t *d = vlib_get_buffer (vm, buffers[i]);
+      d->current_data = s->current_data;
+      d->current_length = head_end_offset;
+      vlib_buffer_set_free_list_index (d,
+				       vlib_buffer_get_free_list_index (s));
+      d->total_length_not_including_first_buffer =
+	s->total_length_not_including_first_buffer + s->current_length -
+	head_end_offset;
+      d->flags = s->flags | VLIB_BUFFER_NEXT_PRESENT;
+      d->flags &= ~VLIB_BUFFER_EXT_HDR_VALID;
+      clib_memcpy (d->opaque, s->opaque, sizeof (s->opaque));
+      clib_memcpy (vlib_buffer_get_current (d), vlib_buffer_get_current (s),
+		   head_end_offset);
+      d->next_buffer = src_buffer;
+    }
+  vlib_buffer_advance (s, head_end_offset);
+  s->n_add_refs = n_buffers - 1;
+  while (s->flags & VLIB_BUFFER_NEXT_PRESENT)
+    {
+      s = vlib_get_buffer (vm, s->next_buffer);
+      s->n_add_refs = n_buffers - 1;
+    }
+  /* n_buffers now holds what the allocator returned, not the request. */
+  return n_buffers;
+}
+
+/** \brief Attach cloned tail to the buffer
+
+    @param vm - (vlib_main_t *) vlib main data structure pointer
+    @param head - (vlib_buffer_t *) head buffer
+    @param tail - (vlib_buffer_t *) tail buffer to clone and attach to head
+*/
+
+always_inline void
+vlib_buffer_attach_clone (vlib_main_t * vm, vlib_buffer_t * head,
+			  vlib_buffer_t * tail)
+{
+  vlib_buffer_t *seg;
+  ASSERT (!(head->flags & VLIB_BUFFER_NEXT_PRESENT));
+  ASSERT (vlib_buffer_get_free_list_index (tail) ==
+	  vlib_buffer_get_free_list_index (head));
+
+  /* Link tail behind head; head inherits tail's total-length validity. */
+  head->flags |= VLIB_BUFFER_NEXT_PRESENT;
+  head->flags &= ~(VLIB_BUFFER_TOTAL_LENGTH_VALID | VLIB_BUFFER_EXT_HDR_VALID);
+  head->flags |= tail->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID;
+  head->next_buffer = vlib_get_buffer_index (vm, tail);
+  head->total_length_not_including_first_buffer =
+    tail->total_length_not_including_first_buffer + tail->current_length;
+
+  /* Atomically add one reference to every segment of the attached chain. */
+  for (seg = tail;; seg = vlib_get_buffer (vm, seg->next_buffer))
+    {
+      __sync_add_and_fetch (&seg->n_add_refs, 1);
+      if (!(seg->flags & VLIB_BUFFER_NEXT_PRESENT))
+	break;
+    }
+}
+
+/* Reset 'first' to an empty, unchained packet whose total length is valid. */
+always_inline void
+vlib_buffer_chain_init (vlib_buffer_t * first)
+{
+  first->flags = (first->flags & ~VLIB_BUFFER_NEXT_PRESENT)
+    | VLIB_BUFFER_TOTAL_LENGTH_VALID;
+  first->current_length = 0;
+  first->total_length_not_including_first_buffer = 0;
+}
+
+/* Chain buffer index next_bi after 'last'; returns it emptied, as new tail. */
+always_inline vlib_buffer_t *
+vlib_buffer_chain_buffer (vlib_main_t * vm,
+			  vlib_buffer_t * first,
+			  vlib_buffer_t * last, u32 next_bi)
+{
+  vlib_buffer_t *appended = vlib_get_buffer (vm, next_bi);
+  last->flags |= VLIB_BUFFER_NEXT_PRESENT;
+  last->next_buffer = next_bi;
+  appended->flags &= ~VLIB_BUFFER_NEXT_PRESENT;
+  appended->current_length = 0;
+  return appended;
+}
+
+/* Adjust the packet length by 'len' bytes (the value may be negative).
+ * No buffer is allocated or released, so the change has to be
+ * compatible with the last buffer.  The chained-length field of
+ * 'first' is only updated when 'last' is a different buffer. */
+always_inline void
+vlib_buffer_chain_increase_length (vlib_buffer_t * first,
+				   vlib_buffer_t * last, i32 len)
+{
+  if (last != first)
+    first->total_length_not_including_first_buffer += len;
+  last->current_length += len;
+}
+
+/* Append up to data_len bytes of 'data' behind the data held by 'last'
+ * and grow the packet length to match.  No new buffer is allocated.
+ * Returns how many bytes were actually copied. */
+always_inline u16
+vlib_buffer_chain_append_data (vlib_main_t * vm,
+			       u32 free_list_index,
+			       vlib_buffer_t * first,
+			       vlib_buffer_t * last, void *data, u16 data_len)
+{
+  /* Bytes left in 'last', given the buffer size of the free list. */
+  u32 buffer_size = vlib_buffer_free_list_buffer_size (vm, free_list_index);
+  ASSERT (buffer_size >= last->current_length + last->current_data);
+  u32 room = buffer_size - last->current_length - last->current_data;
+  /* Copy only what fits; the caller learns the amount from the result. */
+  u16 n_copied = clib_min (data_len, room);
+  clib_memcpy (vlib_buffer_get_current (last) + last->current_length, data,
+	       n_copied);
+  vlib_buffer_chain_increase_length (first, last, n_copied);
+  return n_copied;
+}
+
+/* Copy data to the end of the packet and increases its length.
+ * Allocates additional buffers from the free list if necessary.
+ * Returns the number of copied bytes.
+ * 'last' value is modified whenever new buffers are allocated and
+ * chained and points to the last buffer in the chain. */
+u16
+vlib_buffer_chain_append_data_with_alloc (vlib_main_t * vm,
+					  u32 free_list_index,
+					  vlib_buffer_t * first,
+					  vlib_buffer_t ** last,
+					  void *data, u16 data_len);
+void vlib_buffer_chain_validate (vlib_main_t * vm, vlib_buffer_t * first);
+
+format_function_t format_vlib_buffer, format_vlib_buffer_and_data,
+  format_vlib_buffer_contents;
+
+typedef struct
+{
+  /* Vector of packet data. */
+  u8 *packet_data;
+
+  /* Number of buffers to allocate in each call to physmem
+     allocator. */
+  u32 min_n_buffers_each_physmem_alloc;
+
+  /* Buffer free list for this template. */
+  u32 free_list_index;
+  /* Presumably buffer indices held for this template -- TODO confirm. */
+  u32 *free_buffers;
+} vlib_packet_template_t;
+
+void vlib_packet_template_get_packet_helper (vlib_main_t * vm,
+					     vlib_packet_template_t * t);
+
+void vlib_packet_template_init (vlib_main_t * vm,
+				vlib_packet_template_t * t,
+				void *packet_data,
+				uword n_packet_data_bytes,
+				uword min_n_buffers_each_physmem_alloc,
+				char *fmt, ...);
+
+void *vlib_packet_template_get_packet (vlib_main_t * vm,
+				       vlib_packet_template_t * t,
+				       u32 * bi_result);
+
+always_inline void
+vlib_packet_template_free (vlib_main_t * vm, vlib_packet_template_t * t)
+{
+  vec_free (t->packet_data);	/* NOTE(review): free_buffers is not released here -- confirm intended */
+}
+
+always_inline u32
+unserialize_vlib_buffer_n_bytes (serialize_main_t * m)
+{
+  serialize_stream_t *stream = &m->stream;
+  vlib_serialize_buffer_main_t *sbm
+    = uword_to_pointer (stream->data_function_opaque,
+			vlib_serialize_buffer_main_t *);
+  vlib_main_t *vm = sbm->vlib_main;
+  u32 total, *bi;
+  /* Stream part first, then segments chained after last_buffer (if set). */
+  total = stream->n_buffer_bytes - stream->current_buffer_index;
+  if (sbm->last_buffer != ~0)
+    {
+      vlib_buffer_t *seg = vlib_get_buffer (vm, sbm->last_buffer);
+      while (seg->flags & VLIB_BUFFER_NEXT_PRESENT)
+	{
+	  seg = vlib_get_buffer (vm, seg->next_buffer);
+	  total += seg->current_length;
+	}
+    }
+
+  /* *INDENT-OFF* */
+  clib_fifo_foreach (bi, sbm->rx.buffer_fifo, ({
+    total += vlib_buffer_index_length_in_chain (vm, bi[0]);
+  }));
+/* *INDENT-ON* */
+
+  return total;
+}
+
+/* Set a buffer quickly into "uninitialized" state.  We want this to
+   be extremely cheap and arrange for all fields that need to be
+   initialized to be in the first 128 bits of the buffer. */
+always_inline void
+vlib_buffer_init_for_free_list (vlib_buffer_t * dst,
+				vlib_buffer_free_list_t * fl)
+{
+  vlib_buffer_t *tmpl = &fl->buffer_init_template;
+  uword n_tmpl_bytes = STRUCT_OFFSET_OF (vlib_buffer_t, template_end) -
+    STRUCT_OFFSET_OF (vlib_buffer_t, template_start);
+
+  /* Layout checks: the three cacheline markers of vlib_buffer_t must sit
+     at consecutive multiples of the cache line size. */
+  ASSERT (0 == STRUCT_OFFSET_OF (vlib_buffer_t, cacheline0));
+  ASSERT (CLIB_CACHE_LINE_BYTES ==
+	  STRUCT_OFFSET_OF (vlib_buffer_t, cacheline1));
+  ASSERT (CLIB_CACHE_LINE_BYTES * 2 ==
+	  STRUCT_OFFSET_OF (vlib_buffer_t, cacheline2));
+
+  /* The template must carry the index of the free list that owns it. */
+  ASSERT (vlib_buffer_get_free_list_index (tmpl) == fl->index);
+
+  /* Copy the template_start..template_end span in one go. */
+  clib_memcpy (STRUCT_MARK_PTR (dst, template_start),
+	       STRUCT_MARK_PTR (tmpl, template_start), n_tmpl_bytes);
+
+  /* Not in the first 16 octets, so it is copied on its own. */
+  dst->n_add_refs = tmpl->n_add_refs;
+
+  /* Verify that the copy produced the template's values. */
+  ASSERT (dst->current_data == tmpl->current_data);
+  ASSERT (dst->current_length == tmpl->current_length);
+  ASSERT (dst->flags == tmpl->flags);
+
+  /* total_length_not_including_first_buffer is no longer part of the
+     template, so it may not be zero for some buffers.  Clearing it here
+     would come at a cost, so it is deliberately neither reset nor
+     asserted to be zero. */
+
+  ASSERT (dst->n_add_refs == 0);
+}
+
+always_inline void
+vlib_buffer_add_to_free_list (vlib_main_t * vm,
+			      vlib_buffer_free_list_t * f,
+			      u32 buffer_index, u8 do_init)
+{
+  vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index);
+  vlib_buffer_free_list_t *main_fl;
+  if (PREDICT_TRUE (do_init))
+    vlib_buffer_init_for_free_list (b, f);
+  vec_add1_aligned (f->buffers, buffer_index, CLIB_CACHE_LINE_BYTES);
+
+  /* Done unless the list has grown past four frames' worth of indices. */
+  if (vec_len (f->buffers) <= 4 * VLIB_FRAME_SIZE)
+    return;
+  /* Move the first VLIB_FRAME_SIZE entries to the list of vlib_mains[0];
+     the last stored buffers stay, as they are more likely hot in cache. */
+  main_fl = vlib_buffer_get_free_list (vlib_mains[0], f->index);
+  clib_spinlock_lock (&main_fl->global_buffers_lock);
+  vec_add_aligned (main_fl->global_buffers, f->buffers, VLIB_FRAME_SIZE,
+		   CLIB_CACHE_LINE_BYTES);
+  vec_delete (f->buffers, VLIB_FRAME_SIZE, 0);
+  clib_spinlock_unlock (&main_fl->global_buffers_lock);
+}
+/* Two-buffer form of vlib_buffer_init_for_free_list: stamps dst0 and dst1. */
+always_inline void
+vlib_buffer_init_two_for_free_list (vlib_buffer_t * dst0,
+				    vlib_buffer_t * dst1,
+				    vlib_buffer_free_list_t * fl)
+{
+  vlib_buffer_t *src = &fl->buffer_init_template;
+
+  /* Make sure buffer template is sane. */
+  ASSERT (fl->index == vlib_buffer_get_free_list_index (src));
+
+  /* Copy the template span (template_start..template_end) to both. */
+  clib_memcpy (STRUCT_MARK_PTR (dst0, template_start),
+	       STRUCT_MARK_PTR (src, template_start),
+	       STRUCT_OFFSET_OF (vlib_buffer_t, template_end) -
+	       STRUCT_OFFSET_OF (vlib_buffer_t, template_start));
+  clib_memcpy (STRUCT_MARK_PTR (dst1, template_start),
+	       STRUCT_MARK_PTR (src, template_start),
+	       STRUCT_OFFSET_OF (vlib_buffer_t, template_end) -
+	       STRUCT_OFFSET_OF (vlib_buffer_t, template_start));
+
+  /* Not in the first 16 octets. */
+  dst0->n_add_refs = src->n_add_refs;
+  dst1->n_add_refs = src->n_add_refs;
+
+  /* Make sure it really worked. */
+#define _(f) ASSERT (dst0->f == src->f);  ASSERT( dst1->f == src->f)
+  _(current_data);
+  _(current_length);
+  _(flags);
+#undef _
+
+  /* total_length_not_including_first_buffer is not in the template, so it
+     may be non-zero here; like the one-buffer variant, do not assert it. */
+  ASSERT (dst0->n_add_refs == 0);
+  ASSERT (dst1->n_add_refs == 0);
+}
+
+#if CLIB_DEBUG > 0
+extern u32 *vlib_buffer_state_validation_lock;
+extern uword *vlib_buffer_state_validation_hash;
+extern void *vlib_buffer_state_heap;
+#endif
+
+static inline void
+vlib_validate_buffer_in_use (vlib_buffer_t * b, u32 expected)
+{
+#if CLIB_DEBUG > 0
+  uword *p;
+  void *oldheap;
+  /* Debug builds only: check b's recorded busy/free state against expected. */
+  oldheap = clib_mem_set_heap (vlib_buffer_state_heap);
+  /* Spin until the validation lock word is acquired. */
+  while (__sync_lock_test_and_set (vlib_buffer_state_validation_lock, 1))
+    ;
+
+  p = hash_get (vlib_buffer_state_validation_hash, b);
+
+  /* If we don't know about b, declare it to be in the expected state */
+  if (!p)
+    {
+      hash_set (vlib_buffer_state_validation_hash, b, expected);
+      goto out;
+    }
+  /* Recorded state disagrees with the caller: report it and panic. */
+  if (p[0] != expected)
+    {
+      void cj_stop (void);
+      u32 bi;
+      vlib_main_t *vm = &vlib_global_main;
+
+      cj_stop ();
+
+      bi = vlib_get_buffer_index (vm, b);
+      /* Arguments: time, buffer pointer (%llx), buffer index (%d), states. */
+      clib_mem_set_heap (oldheap);
+      clib_warning ("%.6f buffer %llx (%d): %s, not %s",
+		    vlib_time_now (vm), (unsigned long long) (uword) b, bi,
+		    p[0] ? "busy" : "free", expected ? "busy" : "free");
+      os_panic ();
+    }
+out:
+  CLIB_MEMORY_BARRIER ();
+  *vlib_buffer_state_validation_lock = 0;
+  clib_mem_set_heap (oldheap);
+#endif
+}
+
+static inline void
+vlib_validate_buffer_set_in_use (vlib_buffer_t * b, u32 expected)
+{
+#if CLIB_DEBUG > 0
+  /* Switch to the validation heap while the hash is being touched. */
+  void *saved_heap = clib_mem_set_heap (vlib_buffer_state_heap);
+  /* Spin until the validation lock word is acquired. */
+  while (__sync_lock_test_and_set (vlib_buffer_state_validation_lock, 1) != 0)
+    continue;
+  /* Unconditionally record the given state for this buffer. */
+  hash_set (vlib_buffer_state_validation_hash, b, expected);
+
+  /* Release the lock, then switch back to the previous heap. */
+  CLIB_MEMORY_BARRIER ();
+  *vlib_buffer_state_validation_lock = 0;
+  clib_mem_set_heap (saved_heap);
+#endif
+}
+
+#endif /* included_vlib_buffer_funcs_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/buffer_node.h b/src/vlib/buffer_node.h
new file mode 100644
index 00000000..8a779049
--- /dev/null
+++ b/src/vlib/buffer_node.h
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * buffer_node.h: VLIB buffer handling node helper macros/inlines
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_buffer_node_h
+#define included_vlib_buffer_node_h
+
+/** \file
+    vlib buffer/node functions
+*/
+
+/** \brief Finish enqueueing two buffers forward in the graph.
+ Standard dual loop boilerplate element. This is a MACRO,
+ with MULTIPLE SIDE EFFECTS. In the ideal case,
+ <code>next_index == next0 == next1</code>,
+ which means that the speculative enqueue at the top of the dual loop
+ has correctly dealt with both packets. In that case, the macro does
+ nothing at all.
+
+ @param vm vlib_main_t pointer, varies by thread
+ @param node current node vlib_node_runtime_t pointer
+ @param next_index speculated next index used for both packets
+ @param to_next speculated vector pointer used for both packets
+ @param n_left_to_next number of slots left in speculated vector
+ @param bi0 first buffer index
+ @param bi1 second buffer index
+ @param next0 actual next index to be used for the first packet
+ @param next1 actual next index to be used for the second packet
+
+ @return @c next_index -- speculative next index to be used for future packets
+ @return @c to_next -- speculative frame to be used for future packets
+ @return @c n_left_to_next -- number of slots left in speculative frame
+*/
+
+#define vlib_validate_buffer_enqueue_x2(vm,node,next_index,to_next,n_left_to_next,bi0,bi1,next0,next1) \
+do {									\
+  int enqueue_code = (next0 != next_index) + 2*(next1 != next_index);	\
+  /* bit 0: bi0 was mis-speculated, bit 1: bi1 was mis-speculated */	\
+  if (PREDICT_FALSE (enqueue_code != 0))				\
+    {									\
+      switch (enqueue_code)						\
+	{								\
+	case 1:								\
+	  /* A B A: bi1 takes bi0's slot, bi0 goes to next0 */		\
+	  to_next[-2] = bi1;						\
+	  to_next -= 1;							\
+	  n_left_to_next += 1;						\
+	  vlib_set_next_frame_buffer (vm, node, next0, bi0);		\
+	  break;							\
+									\
+	case 2:								\
+	  /* A A B: bi1's slot is given back, bi1 goes to next1 */	\
+	  to_next -= 1;							\
+	  n_left_to_next += 1;						\
+	  vlib_set_next_frame_buffer (vm, node, next1, bi1);		\
+	  break;							\
+									\
+	case 3:								\
+	  /* A B B or A B C: both slots are given back */		\
+	  to_next -= 2;							\
+	  n_left_to_next += 2;						\
+	  vlib_set_next_frame_buffer (vm, node, next0, bi0);		\
+	  vlib_set_next_frame_buffer (vm, node, next1, bi1);		\
+	  if (next0 == next1)						\
+	    {								\
+	      vlib_put_next_frame (vm, node, next_index,		\
+				   n_left_to_next);			\
+	      next_index = next1;					\
+	      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); \
+	    }								\
+	}								\
+    }									\
+} while (0)
+
+
+/** \brief Finish enqueueing four buffers forward in the graph.
+ Standard quad loop boilerplate element. This is a MACRO,
+ with MULTIPLE SIDE EFFECTS. In the ideal case,
+ <code>next_index == next0 == next1 == next2 == next3</code>,
+ which means that the speculative enqueue at the top of the quad loop
+ has correctly dealt with all four packets. In that case, the macro does
+ nothing at all. The expansion carries no trailing semicolon.
+
+ @param vm vlib_main_t pointer, varies by thread
+ @param node current node vlib_node_runtime_t pointer
+ @param next_index speculated next index used for all four packets
+ @param to_next speculated vector pointer used for all four packets
+ @param n_left_to_next number of slots left in speculated vector
+ @param bi0 first buffer index
+ @param bi1 second buffer index
+ @param bi2 third buffer index
+ @param bi3 fourth buffer index
+ @param next0 actual next index to be used for the first packet
+ @param next1 actual next index to be used for the second packet
+ @param next2 actual next index to be used for the third packet
+ @param next3 actual next index to be used for the fourth packet
+
+ @return @c next_index -- speculative next index to be used for future packets
+ @return @c to_next -- speculative frame to be used for future packets
+ @return @c n_left_to_next -- number of slots left in speculative frame
+*/
+
+#define vlib_validate_buffer_enqueue_x4(vm,node,next_index,to_next,n_left_to_next,bi0,bi1,bi2,bi3,next0,next1,next2,next3) \
+do {                                                                    \
+  /* After the fact: check the [speculative] enqueue to "next" */       \
+  u32 fix_speculation = next_index != next0 || next_index != next1      \
+    || next_index != next2 || next_index != next3;                      \
+  if (PREDICT_FALSE(fix_speculation))                                   \
+    {                                                                   \
+      /* rewind... */                                                   \
+      to_next -= 4;                                                     \
+      n_left_to_next += 4;                                              \
+                                                                        \
+      /* If bi0 belongs to "next", send it there */                     \
+      if (next_index == next0)                                          \
+        {                                                               \
+          to_next[0] = bi0;                                             \
+          to_next++;                                                    \
+          n_left_to_next --;                                            \
+        }                                                               \
+      else              /* send it where it needs to go */              \
+        vlib_set_next_frame_buffer (vm, node, next0, bi0);              \
+                                                                        \
+      if (next_index == next1)                                          \
+        {                                                               \
+          to_next[0] = bi1;                                             \
+          to_next++;                                                    \
+          n_left_to_next --;                                            \
+        }                                                               \
+      else                                                              \
+        vlib_set_next_frame_buffer (vm, node, next1, bi1);              \
+                                                                        \
+      if (next_index == next2)                                          \
+        {                                                               \
+          to_next[0] = bi2;                                             \
+          to_next++;                                                    \
+          n_left_to_next --;                                            \
+        }                                                               \
+      else                                                              \
+        vlib_set_next_frame_buffer (vm, node, next2, bi2);              \
+                                                                        \
+      if (next_index == next3)                                          \
+        {                                                               \
+          to_next[0] = bi3;                                             \
+          to_next++;                                                    \
+          n_left_to_next --;                                            \
+        }                                                               \
+      else                                                              \
+        vlib_set_next_frame_buffer (vm, node, next3, bi3);              \
+                                                                        \
+      /* Change speculation: last 2 packets went to the same node */    \
+      if (next2 == next3)                                               \
+        {                                                               \
+          vlib_put_next_frame (vm, node, next_index, n_left_to_next);   \
+          next_index = next3;                                           \
+          vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); \
+        }                                                               \
+    }                                                                   \
+ } while (0)
+
+/** \brief Finish enqueueing one buffer forward in the graph.
+ Standard single loop boilerplate element. This is a MACRO,
+ with MULTIPLE SIDE EFFECTS. In the ideal case,
+ <code>next_index == next0</code>,
+ which means that the speculative enqueue at the top of the single loop
+ has correctly dealt with the packet in hand. In that case, the macro does
+ nothing at all.
+
+ @param vm vlib_main_t pointer, varies by thread
+ @param node current node vlib_node_runtime_t pointer
+ @param next_index speculated next index used for the packet
+ @param to_next speculated vector pointer used for the packet
+ @param n_left_to_next number of slots left in speculated vector
+ @param bi0 first buffer index
+ @param next0 actual next index to be used for the first packet
+
+ @return @c next_index -- speculative next index to be used for future packets
+ @return @c to_next -- speculative frame to be used for future packets
+ @return @c n_left_to_next -- number of slots left in speculative frame
+*/
+#define vlib_validate_buffer_enqueue_x1(vm,node,next_index,to_next,n_left_to_next,bi0,next0) \
+do {									\
+  if (PREDICT_FALSE (next0 != next_index))				\
+    {									\
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next + 1);	\
+      next_index = next0;						\
+      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); \
+      /* redo the enqueue of bi0 in the frame just obtained */		\
+      to_next[0] = bi0;							\
+      to_next += 1;							\
+      n_left_to_next -= 1;						\
+    }									\
+} while (0)
+
+always_inline uword
+generic_buffer_node_inline (vlib_main_t * vm,
+			    vlib_node_runtime_t * node,
+			    vlib_frame_t * frame,
+			    uword sizeof_trace,
+			    void *opaque1,
+			    uword opaque2,
+			    void (*two_buffers) (vlib_main_t * vm,
+						 void *opaque1,
+						 uword opaque2,
+						 vlib_buffer_t * b0,
+						 vlib_buffer_t * b1,
+						 u32 * next0, u32 * next1),
+			    void (*one_buffer) (vlib_main_t * vm,
+						void *opaque1, uword opaque2,
+						vlib_buffer_t * b0,
+						u32 * next0))
+{
+  u32 n_left_from, *from, *to_next;
+  u32 next_index;
+  /* Run one_buffer/two_buffers over the frame and enqueue by their verdict. */
+  from = vlib_frame_vector_args (frame);
+  n_left_from = frame->n_vectors;
+  next_index = node->cached_next_index;
+
+  if (node->flags & VLIB_NODE_FLAG_TRACE)
+    vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
+				   /* stride */ 1, sizeof_trace);
+
+  while (n_left_from > 0)
+    {
+      u32 n_left_to_next;
+
+      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
+      /* Dual loop: needs 4 inputs left (2 processed, 2 prefetched). */
+      while (n_left_from >= 4 && n_left_to_next >= 2)
+	{
+	  vlib_buffer_t *p0, *p1;
+	  u32 pi0, next0;
+	  u32 pi1, next1;
+
+	  /* Prefetch next iteration. */
+	  {
+	    vlib_buffer_t *p2, *p3;
+
+	    p2 = vlib_get_buffer (vm, from[2]);
+	    p3 = vlib_get_buffer (vm, from[3]);
+
+	    vlib_prefetch_buffer_header (p2, LOAD);
+	    vlib_prefetch_buffer_header (p3, LOAD);
+
+	    CLIB_PREFETCH (p2->data, 64, LOAD);
+	    CLIB_PREFETCH (p3->data, 64, LOAD);
+	  }
+	  /* Speculatively place both indices in the current next frame. */
+	  pi0 = to_next[0] = from[0];
+	  pi1 = to_next[1] = from[1];
+	  from += 2;
+	  to_next += 2;
+	  n_left_from -= 2;
+	  n_left_to_next -= 2;
+
+	  p0 = vlib_get_buffer (vm, pi0);
+	  p1 = vlib_get_buffer (vm, pi1);
+	  /* The callback writes the actual next index of each packet. */
+	  two_buffers (vm, opaque1, opaque2, p0, p1, &next0, &next1);
+	  /* Repair the speculation wherever next0/next1 differ from it. */
+	  vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+					   to_next, n_left_to_next,
+					   pi0, pi1, next0, next1);
+	}
+      /* Single loop: the remaining packets, one at a time. */
+      while (n_left_from > 0 && n_left_to_next > 0)
+	{
+	  vlib_buffer_t *p0;
+	  u32 pi0, next0;
+
+	  pi0 = from[0];
+	  to_next[0] = pi0;
+	  from += 1;
+	  to_next += 1;
+	  n_left_from -= 1;
+	  n_left_to_next -= 1;
+
+	  p0 = vlib_get_buffer (vm, pi0);
+
+	  one_buffer (vm, opaque1, opaque2, p0, &next0);
+
+	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+					   to_next, n_left_to_next,
+					   pi0, next0);
+	}
+      /* Done with this next frame for now; pass on the unused slot count. */
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+
+  return frame->n_vectors;
+}
+
+#endif /* included_vlib_buffer_node_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/buffer_serialize.c b/src/vlib/buffer_serialize.c
new file mode 100644
index 00000000..96a5f0a0
--- /dev/null
+++ b/src/vlib/buffer_serialize.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * buffer_serialize.c: serialize/unserialize data to and from vlib buffer chains.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+
+static void
+vlib_serialize_tx (serialize_main_header_t * m, serialize_stream_t * s)
+{
+  vlib_main_t *vm;
+  vlib_serialize_buffer_main_t *sm;
+  uword n, n_bytes_to_write;
+  vlib_buffer_t *last;
+  /* Write-side stream data function.  Accounts for the bytes written into
+     the current vlib buffer, then either hands the finished chain to the
+     next node or chains a fresh buffer, and finally re-points the stream
+     at the buffer to be filled next. */
+  n_bytes_to_write = s->current_buffer_index;
+  sm =
+    uword_to_pointer (s->data_function_opaque,
+		      vlib_serialize_buffer_main_t *);
+  vm = sm->vlib_main;
+
+  ASSERT (sm->tx.max_n_data_bytes_per_chain > 0);
+  if (serialize_stream_is_end_of_stream (s)
+      || sm->tx.n_total_data_bytes + n_bytes_to_write >
+      sm->tx.max_n_data_bytes_per_chain)
+    {
+      vlib_process_t *p = vlib_get_current_process (vm);
+      /* End of stream, or the chain would exceed its byte limit: record
+         the length of the tail buffer before the chain leaves. */
+      last = vlib_get_buffer (vm, sm->last_buffer);
+      last->current_length = n_bytes_to_write;
+      /* Pass the head of the chain to tx.next_index of the current process node. */
+      vlib_set_next_frame_buffer (vm, &p->node_runtime, sm->tx.next_index,
+				  sm->first_buffer);
+      /* Start over with an empty chain. */
+      sm->first_buffer = sm->last_buffer = ~0;
+      sm->tx.n_total_data_bytes = 0;
+    }
+  /* Nothing written and the stream has no buffer yet: allocate the first
+     buffer of a new chain and publish its size to the stream. */
+  else if (n_bytes_to_write == 0 && s->n_buffer_bytes == 0)
+    {
+      ASSERT (sm->first_buffer == ~0);
+      ASSERT (sm->last_buffer == ~0);
+      n =
+	vlib_buffer_alloc_from_free_list (vm, &sm->first_buffer, 1,
+					  sm->tx.free_list_index);
+      if (n != 1)
+	serialize_error (m,
+			 clib_error_create
+			 ("vlib_buffer_alloc_from_free_list fails"));
+      sm->last_buffer = sm->first_buffer;
+      s->n_buffer_bytes =
+	vlib_buffer_free_list_buffer_size (vm, sm->tx.free_list_index);
+    }
+  /* Bytes were written since the last call: allocate a new tail buffer and
+     link the previous tail to it.
+     NOTE(review): if the flush branch above ran, sm->last_buffer is ~0 at
+     this point, yet it is still passed to vlib_get_buffer and the result
+     is written through -- confirm this path is safe. */
+  if (n_bytes_to_write > 0)
+    {
+      vlib_buffer_t *prev = vlib_get_buffer (vm, sm->last_buffer);
+      n =
+	vlib_buffer_alloc_from_free_list (vm, &sm->last_buffer, 1,
+					  sm->tx.free_list_index);
+      if (n != 1)
+	serialize_error (m,
+			 clib_error_create
+			 ("vlib_buffer_alloc_from_free_list fails"));
+      sm->tx.n_total_data_bytes += n_bytes_to_write;
+      prev->current_length = n_bytes_to_write;
+      prev->next_buffer = sm->last_buffer;
+      prev->flags |= VLIB_BUFFER_NEXT_PRESENT;
+    }
+  /* If a tail buffer exists, make the stream write into it from offset 0. */
+  if (sm->last_buffer != ~0)
+    {
+      last = vlib_get_buffer (vm, sm->last_buffer);
+      s->buffer = vlib_buffer_get_current (last);
+      s->current_buffer_index = 0;
+      ASSERT (last->current_data == s->current_buffer_index);
+    }
+}
+
+/*
+ * Read-side stream data function.
+ *
+ * Does nothing once the stream is at end of stream.  Otherwise it moves
+ * to the next buffer of the chain being read; when the chain has no
+ * further buffer it frees the chain and waits on a one-time event until
+ * rx.buffer_fifo holds another chain head.  On return the stream points
+ * at the current data of the selected buffer.
+ */
+static void
+vlib_serialize_rx (serialize_main_header_t * m, serialize_stream_t * s)
+{
+  vlib_serialize_buffer_main_t *sm =
+    uword_to_pointer (s->data_function_opaque,
+		      vlib_serialize_buffer_main_t *);
+  vlib_main_t *vm = sm->vlib_main;
+  vlib_buffer_t *b;
+
+  if (serialize_stream_is_end_of_stream (s))
+    return;
+
+  if (sm->last_buffer != ~0)
+    {
+      b = vlib_get_buffer (vm, sm->last_buffer);
+
+      if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT))
+	{
+	  /* The whole chain has been consumed: release it. */
+	  vlib_buffer_free (vm, &sm->first_buffer, /* count */ 1);
+	  sm->first_buffer = ~0;
+	  sm->last_buffer = ~0;
+	}
+      else
+	sm->last_buffer = b->next_buffer;
+    }
+
+  if (sm->last_buffer == ~0)
+    {
+      /* No chain in progress: wait until one has been queued. */
+      while (!clib_fifo_elts (sm->rx.buffer_fifo))
+	{
+	  sm->rx.ready_one_time_event =
+	    vlib_process_create_one_time_event (vm, vlib_current_process (vm),
+						~0);
+	  vlib_process_wait_for_one_time_event (vm, /* no event data */ 0,
+						sm->rx.ready_one_time_event);
+	}
+
+      clib_fifo_sub1 (sm->rx.buffer_fifo, sm->first_buffer);
+      sm->last_buffer = sm->first_buffer;
+    }
+
+  ASSERT (sm->last_buffer != ~0);
+
+  b = vlib_get_buffer (vm, sm->last_buffer);
+  s->buffer = vlib_buffer_get_current (b);
+  s->n_buffer_bytes = b->current_length;
+  s->current_buffer_index = 0;
+}
+
+/*
+ * Common setup for opening a vlib-buffer backed (un)serialize stream.
+ *
+ * Zeroes the serialize main M while keeping its overflow buffer (emptied)
+ * so the allocation is re-used between calls, resets the buffer chain
+ * state in SM and installs the read or write data function according to
+ * IS_READ.
+ */
+static void
+serialize_open_vlib_helper (serialize_main_t * m,
+			    vlib_main_t * vm,
+			    vlib_serialize_buffer_main_t * sm, uword is_read)
+{
+  u8 *kept_overflow = m->stream.overflow_buffer;
+
+  memset (m, 0, sizeof (m[0]));
+  m->stream.overflow_buffer = kept_overflow;
+  if (kept_overflow)
+    _vec_len (kept_overflow) = 0;
+
+  sm->first_buffer = sm->last_buffer = ~0;
+
+  if (!is_read)
+    sm->tx.n_total_data_bytes = 0;
+  else
+    clib_fifo_reset (sm->rx.buffer_fifo);
+
+  sm->vlib_main = vm;
+
+  if (is_read)
+    m->header.data_function = vlib_serialize_rx;
+  else
+    m->header.data_function = vlib_serialize_tx;
+
+  m->stream.data_function_opaque = pointer_to_uword (sm);
+}
+
+/* Opens M for writing (serializing) into vlib buffers described by SM. */
+void
+serialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm,
+			    vlib_serialize_buffer_main_t * sm)
+{
+  const uword is_read = 0;
+
+  serialize_open_vlib_helper (m, vm, sm, is_read);
+}
+
+/* Opens M for reading (unserializing) from vlib buffers described by SM. */
+void
+unserialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm,
+			      vlib_serialize_buffer_main_t * sm)
+{
+  const uword is_read = 1;
+
+  serialize_open_vlib_helper (m, vm, sm, is_read);
+}
+
+/*
+ * Finishes a write stream opened with serialize_open_vlib_buffer.
+ *
+ * Records the number of bytes written into the tail buffer, appends any
+ * data left in the stream's overflow buffer to the chain and empties the
+ * overflow buffer.
+ *
+ * Returns sm->first_buffer, i.e. the head of the chain built so far
+ * (~0 when no chain head has been recorded).
+ */
+u32
+serialize_close_vlib_buffer (serialize_main_t * m)
+{
+  vlib_serialize_buffer_main_t *sm
+    = uword_to_pointer (m->stream.data_function_opaque,
+			vlib_serialize_buffer_main_t *);
+  vlib_buffer_t *last;
+  serialize_stream_t *s = &m->stream;
+
+  /* last_buffer is ~0 when no buffer is currently attached (the code below
+     already allows for that); only touch the tail buffer when it exists
+     instead of writing through an invalid buffer index. */
+  if (sm->last_buffer != ~0)
+    {
+      last = vlib_get_buffer (sm->vlib_main, sm->last_buffer);
+      last->current_length = s->current_buffer_index;
+    }
+
+  if (vec_len (s->overflow_buffer) > 0)
+    {
+      /* NOTE(review): when last_buffer was ~0 the data goes into a buffer
+         obtained via index 0, but first_buffer is not updated here --
+         confirm whether the returned head should then be that buffer. */
+      sm->last_buffer
+	= vlib_buffer_add_data (sm->vlib_main, sm->tx.free_list_index,
+				sm->last_buffer == ~0 ? 0 : sm->last_buffer,
+				s->overflow_buffer,
+				vec_len (s->overflow_buffer));
+      _vec_len (s->overflow_buffer) = 0;
+    }
+
+  return sm->first_buffer;
+}
+
+/*
+ * Finishes a read stream opened with unserialize_open_vlib_buffer.
+ *
+ * Frees the chain currently being read (if any), empties the rx fifo and
+ * the stream's overflow buffer.
+ */
+void
+unserialize_close_vlib_buffer (serialize_main_t * m)
+{
+  vlib_serialize_buffer_main_t *sm
+    = uword_to_pointer (m->stream.data_function_opaque,
+			vlib_serialize_buffer_main_t *);
+
+  if (sm->first_buffer != ~0)
+    {
+      vlib_buffer_free_one (sm->vlib_main, sm->first_buffer);
+      /* Forget the freed chain so that a repeated close (or any later use
+         of SM before it is re-opened) cannot free it a second time. */
+      sm->first_buffer = sm->last_buffer = ~0;
+    }
+
+  clib_fifo_reset (sm->rx.buffer_fifo);
+
+  if (m->stream.overflow_buffer)
+    _vec_len (m->stream.overflow_buffer) = 0;
+}
+
+/** @endcond */
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/cli.c b/src/vlib/cli.c
new file mode 100644
index 00000000..48cf0426
--- /dev/null
+++ b/src/vlib/cli.c
@@ -0,0 +1,1345 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * cli.c: command line interface
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vppinfra/cpu.h>
+#include <unistd.h>
+#include <ctype.h>
+
+/* "show": parent node for every "show ..." command. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (vlib_cli_show_command, static) = {
+  .short_help = "Show commands",
+  .path = "show",
+};
+/* *INDENT-ON* */
+
+/* "clear": parent node for every "clear ..." command. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (vlib_cli_clear_command, static) = {
+  .short_help = "Clear commands",
+  .path = "clear",
+};
+/* *INDENT-ON* */
+
+/* "set": parent node for every "set ..." command. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (vlib_cli_set_command, static) = {
+  .short_help = "Set commands",
+  .path = "set",
+};
+/* *INDENT-ON* */
+
+/* "test": parent node for every "test ..." command. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (vlib_cli_test_command, static) = {
+  .short_help = "Test commands",
+  .path = "test",
+};
+/* *INDENT-ON* */
+
+/* Returns bitmap of commands which match key.
+ *
+ * Reads one word from INPUT a character at a time and intersects, for
+ * each character position, the bitmap stored for that character in
+ * c->sub_command_positions.  Presumably bit N of a bitmap stands for the
+ * N'th entry of c->sub_commands (callers use set bits as sub-command
+ * indices).
+ *
+ * Returns 0 when no sub-command matches; otherwise a bitmap that the
+ * caller is expected to release with clib_bitmap_free (both callers in
+ * this file do so).
+ */
+static uword *
+vlib_cli_sub_command_match (vlib_cli_command_t * c, unformat_input_t * input)
+{
+  int i, n;
+  uword *match = 0;
+  vlib_cli_parse_position_t *p;
+
+  unformat_skip_white_space (input);
+
+  for (i = 0;; i++)		/* i is the character position within the word */
+    {
+      uword k;
+
+      k = unformat_get_input (input);
+      switch (k)
+	{
+	case 'a' ... 'z':
+	case 'A' ... 'Z':
+	case '0' ... '9':
+	case '-':
+	case '_':
+	  break;		/* part of the word: match it below */
+	  /* White space and end of input terminate the word. */
+	case ' ':
+	case '\t':
+	case '\r':
+	case '\n':
+	case UNFORMAT_END_OF_INPUT:
+	  /* White space or end of input removes any non-white
+	     matches that were before possible. */
+	  if (i < vec_len (c->sub_command_positions)
+	      && clib_bitmap_count_set_bits (match) > 1)
+	    {
+	      p = vec_elt_at_index (c->sub_command_positions, i);
+	      for (n = 0; n < vec_len (p->bitmaps); n++)
+		match = clib_bitmap_andnot (match, p->bitmaps[n]);
+	    }
+	  goto done;
+	  /* Any other character also ends the word but stays in the input. */
+	default:
+	  unformat_put_input (input);
+	  goto done;
+	}
+      /* The word is longer than any position recorded for C: no match. */
+      if (i >= vec_len (c->sub_command_positions))
+	{
+	no_match:
+	  clib_bitmap_free (match);
+	  return 0;
+	}
+
+      p = vec_elt_at_index (c->sub_command_positions, i);
+      if (vec_len (p->bitmaps) == 0)
+	goto no_match;
+      /* Bitmaps at a position are indexed by character offset from min_char. */
+      n = k - p->min_char;
+      if (n < 0 || n >= vec_len (p->bitmaps))
+	goto no_match;
+      /* The first character seeds the candidate set; later ones narrow it. */
+      if (i == 0)
+	match = clib_bitmap_dup (p->bitmaps[n]);
+      else
+	match = clib_bitmap_and (match, p->bitmaps[n]);
+
+      if (clib_bitmap_is_zero (match))
+	goto no_match;
+    }
+
+done:
+  return match;
+}
+
+/*
+ * unformat function: looks for a sub-input written as { SUB-INPUT }.
+ *
+ * Leading white space is consumed.  If the next character is '{' and the
+ * "%v" parse succeeds, the variadic unformat_input_t * argument is
+ * initialized with the parsed vector and 1 is returned.  Otherwise
+ * returns 0 (the first non-blank character is put back in the input).
+ */
+uword
+unformat_vlib_cli_sub_input (unformat_input_t * i, va_list * args)
+{
+  unformat_input_t *sub_input = va_arg (*args, unformat_input_t *);
+  u8 *body;
+  uword ch;
+
+  /* Skip blanks; stop on the first character that is not one. */
+  do
+    ch = unformat_get_input (i);
+  while (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '\f');
+
+  /* Whatever stopped the scan is put back (end of input cannot be). */
+  if (ch != UNFORMAT_END_OF_INPUT)
+    unformat_put_input (i);
+
+  if (ch != '{')
+    return 0;
+
+  if (!unformat (i, "%v", &body))
+    return 0;
+
+  unformat_init_vector (sub_input, body);
+  return 1;
+}
+
+/* Returns the command in cm->commands that slot SI of PARENT's
+   sub-command vector refers to. */
+static vlib_cli_command_t *
+get_sub_command (vlib_cli_main_t * cm, vlib_cli_command_t * parent, u32 si)
+{
+  vlib_cli_sub_command_t *slot;
+
+  slot = vec_elt_at_index (parent->sub_commands, si);
+  return vec_elt_at_index (cm->commands, slot->index);
+}
+
+/*
+ * unformat function: parses one sub-command of a parent command.
+ *
+ * Variadic arguments: vlib_main_t *, the parent vlib_cli_command_t *, and
+ * a vlib_cli_command_t ** that receives the matched command.
+ *
+ * The parent's sub-rules are tried first, in order; each attempt appends
+ * one data slot to cm->parse_rule_data.  If no rule parses, the next word
+ * is matched against the parent's sub-command names and accepted only
+ * when exactly one sub-command matches.  Returns non-zero on success.
+ */
+static uword
+unformat_vlib_cli_sub_command (unformat_input_t * i, va_list * args)
+{
+  vlib_main_t *vm = va_arg (*args, vlib_main_t *);
+  vlib_cli_command_t *c = va_arg (*args, vlib_cli_command_t *);
+  vlib_cli_command_t **result = va_arg (*args, vlib_cli_command_t **);
+  vlib_cli_main_t *cm = &vm->cli_main;
+  vlib_cli_sub_rule_t *sr;
+  uword *candidates;
+  uword exactly_one;
+
+  vec_foreach (sr, c->sub_rules)
+  {
+    vlib_cli_parse_rule_t *rule;
+    void **slot;
+
+    rule = vec_elt_at_index (cm->parse_rules, sr->rule_index);
+
+    /* Fresh per-attempt storage for whatever the rule parses. */
+    vec_add2 (cm->parse_rule_data, slot, 1);
+    vec_reset_length (slot[0]);
+    if (rule->data_size)
+      slot[0] = _vec_resize (slot[0],
+			     /* length increment */ 1,
+			     rule->data_size,
+			     /* header_bytes */ 0,
+			     /* data align */ sizeof (uword));
+
+    if (unformat_user (i, rule->unformat_function, vm, slot[0]))
+      {
+	*result = vec_elt_at_index (cm->commands, sr->command_index);
+	return 1;
+      }
+  }
+
+  /* No rule matched: fall back to matching by sub-command name. */
+  candidates = vlib_cli_sub_command_match (c, i);
+  exactly_one = clib_bitmap_count_set_bits (candidates) == 1;
+  if (exactly_one)
+    *result = get_sub_command (cm, c, clib_bitmap_first_set (candidates));
+  clib_bitmap_free (candidates);
+
+  return exactly_one;
+}
+
+/* Sort callback: A1 and A2 point at u8 vectors, ordered with vec_cmp. */
+static int
+vlib_cli_cmp_strings (void *a1, void *a2)
+{
+  u8 **lhs = (u8 **) a1;
+  u8 **rhs = (u8 **) a2;
+  u8 *left = lhs[0];
+  u8 *right = rhs[0];
+
+  return vec_cmp (left, right);
+}
+
+u8 **
+vlib_cli_get_possible_completions (u8 * str)
+{
+  vlib_cli_command_t *c;
+  vlib_cli_sub_command_t *sc;
+  vlib_main_t *vm = vlib_get_main ();
+  vlib_cli_main_t *vcm = &vm->cli_main;
+  uword *match_bitmap = 0;
+  uword index, is_unique, help_next_level;
+  u8 **result = 0;
+  unformat_input_t input;
+  unformat_init_vector (&input, vec_dup (str));	/* work on a private copy of STR */
+  c = vec_elt_at_index (vcm->commands, 0);	/* start at command 0, the root */
+  /* Returns a sorted vector of candidate sub-command names for the
+     partial command line STR, or 0 when nothing matches.  The entries
+     are the commands' own name pointers (not copies). */
+  /* remove trailing whitespace, except for one of them */
+  while (vec_len (input.buffer) >= 2 &&
+	 isspace (input.buffer[vec_len (input.buffer) - 1]) &&
+	 isspace (input.buffer[vec_len (input.buffer) - 2]))
+    {
+      vec_del1 (input.buffer, vec_len (input.buffer) - 1);
+    }
+
+  /* if input is empty, directly return list of root commands */
+  if (vec_len (input.buffer) == 0 ||
+      (vec_len (input.buffer) == 1 && isspace (input.buffer[0])))
+    {
+      vec_foreach (sc, c->sub_commands)
+      {
+	vec_add1 (result, (u8 *) sc->name);
+      }
+      goto done;
+    }
+
+  /* add a trailing '?' so that vlib_cli_sub_command_match can find
+   * all commands starting with the input string */
+  vec_add1 (input.buffer, '?');
+  /* Descend one command level per word for as long as each word
+     identifies exactly one sub-command. */
+  while (1)
+    {
+      match_bitmap = vlib_cli_sub_command_match (c, &input);
+      /* no match: return no result */
+      if (match_bitmap == 0)
+	{
+	  goto done;
+	}
+      is_unique = clib_bitmap_count_set_bits (match_bitmap) == 1;
+      /* unique match: try to step one subcommand level further */
+      if (is_unique)
+	{
+	  /* stop if no more input */
+	  if (input.index >= vec_len (input.buffer) - 1)
+	    {
+	      break;
+	    }
+
+	  index = clib_bitmap_first_set (match_bitmap);
+	  c = get_sub_command (vcm, c, index);
+	  clib_bitmap_free (match_bitmap);
+	  continue;
+	}
+      /* multiple matches: stop here, return all matches */
+      break;
+    }
+
+  /* remove trailing '?' */
+  vec_del1 (input.buffer, vec_len (input.buffer) - 1);
+
+  /* if we have a space at the end of input, and a unique match,
+   * autocomplete the next level of subcommands */
+  help_next_level = (vec_len (str) == 0) || isspace (str[vec_len (str) - 1]);
+  /* *INDENT-OFF* */
+  clib_bitmap_foreach(index, match_bitmap, {
+    if (help_next_level && is_unique) {
+	c = get_sub_command (vcm, c, index);
+	vec_foreach (sc, c->sub_commands) {
+	  vec_add1 (result, (u8*) sc->name);
+	}
+	goto done; /* break doesn't work in this macro-loop */
+    }
+    sc = &c->sub_commands[index];
+    vec_add1(result, (u8*) sc->name);
+  });
+  /* *INDENT-ON* */
+
+done:
+  clib_bitmap_free (match_bitmap);
+  unformat_free (&input);
+  /* Present the candidates in vec_cmp order. */
+  if (result)
+    vec_sort_with_function (result, vlib_cli_cmp_strings);
+  return result;
+}
+
+/*
+ * format function: help text of a command.
+ *
+ * Variadic arguments: vlib_cli_command_t * and an int IS_LONG.  Uses the
+ * long help when requested and present, else the short help, else a
+ * generic "<path> commands" line.
+ */
+static u8 *
+format_vlib_cli_command_help (u8 * s, va_list * args)
+{
+  vlib_cli_command_t *c = va_arg (*args, vlib_cli_command_t *);
+  int is_long = va_arg (*args, int);
+
+  if (is_long && c->long_help)
+    return format (s, "%s", c->long_help);
+
+  if (c->short_help)
+    return format (s, "%s", c->short_help);
+
+  return format (s, "%v commands", c->path);
+}
+
+/* format function: name of a parse rule (vlib_cli_parse_rule_t *),
+   rendered as a C identifier inside angle brackets. */
+static u8 *
+format_vlib_cli_parse_rule_name (u8 * s, va_list * args)
+{
+  vlib_cli_parse_rule_t *rule = va_arg (*args, vlib_cli_parse_rule_t *);
+
+  s = format (s, "<%U>", format_c_identifier, rule->name);
+  return s;
+}
+
+/*
+ * format function: printable form of a command path (u8 vector).
+ *
+ * A '%' starts a rule reference, shown as "<...>" up to the next space
+ * (or the end of the path); inside a rule '_' is shown as a space.
+ * Everything else is copied unchanged.
+ */
+static u8 *
+format_vlib_cli_path (u8 * s, va_list * args)
+{
+  u8 *path = va_arg (*args, u8 *);
+  int inside_rule = 0;
+  int pos;
+
+  for (pos = 0; pos < vec_len (path); pos++)
+    {
+      u8 ch = path[pos];
+
+      if (ch == '%')
+	{
+	  /* opening bracket of <RULE> */
+	  inside_rule = 1;
+	  vec_add1 (s, '<');
+	}
+      else if (ch == '_')
+	{
+	  /* underscores read as spaces only within a rule name */
+	  if (inside_rule)
+	    {
+	      vec_add1 (s, ' ');
+	    }
+	  else
+	    {
+	      vec_add1 (s, '_');
+	    }
+	}
+      else if (ch == ' ')
+	{
+	  /* a space closes a pending <RULE> */
+	  if (inside_rule)
+	    {
+	      vec_add1 (s, '>');
+	      inside_rule = 0;
+	    }
+	  vec_add1 (s, ' ');
+	}
+      else
+	{
+	  vec_add1 (s, ch);
+	}
+    }
+
+  /* path ended while still inside a rule name */
+  if (inside_rule)
+    {
+      vec_add1 (s, '>');
+    }
+
+  return s;
+}
+
+/*
+ * Appends to SUBS a copy of every command at or below COMMAND_INDEX that
+ * has a handler function, visiting rule children before named children.
+ * Returns the (possibly reallocated) vector.
+ */
+static vlib_cli_command_t *
+all_subs (vlib_cli_main_t * cm, vlib_cli_command_t * subs, u32 command_index)
+{
+  vlib_cli_command_t *cmd = vec_elt_at_index (cm->commands, command_index);
+  uword k;
+
+  if (cmd->function)
+    vec_add1 (subs, cmd[0]);
+
+  for (k = 0; k < vec_len (cmd->sub_rules); k++)
+    subs = all_subs (cm, subs, cmd->sub_rules[k].command_index);
+
+  for (k = 0; k < vec_len (cmd->sub_commands); k++)
+    subs = all_subs (cm, subs, cmd->sub_commands[k].index);
+
+  return subs;
+}
+
+/* Sort callback: orders vlib_cli_sub_rule_t entries by name. */
+static int
+vlib_cli_cmp_rule (void *a1, void *a2)
+{
+  vlib_cli_sub_rule_t *lhs = (vlib_cli_sub_rule_t *) a1;
+  vlib_cli_sub_rule_t *rhs = (vlib_cli_sub_rule_t *) a2;
+  int order = vec_cmp (lhs->name, rhs->name);
+
+  return order;
+}
+
+/* Sort callback: orders vlib_cli_command_t entries by path. */
+static int
+vlib_cli_cmp_command (void *a1, void *a2)
+{
+  vlib_cli_command_t *lhs = (vlib_cli_command_t *) a1;
+  vlib_cli_command_t *rhs = (vlib_cli_command_t *) a2;
+  int order = vec_cmp (lhs->path, rhs->path);
+
+  return order;
+}
+
+static clib_error_t *
+vlib_cli_dispatch_sub_commands (vlib_main_t * vm,
+				vlib_cli_main_t * cm,
+				unformat_input_t * input,
+				uword parent_command_index)
+{
+  vlib_cli_command_t *parent, *c;
+  clib_error_t *error = 0;
+  unformat_input_t sub_input;
+  u8 *string;
+  uword is_main_dispatch = cm == &vm->cli_main;
+  /* Parses one command from INPUT relative to the command at
+     PARENT_COMMAND_INDEX and runs it.  Handled here directly: "help" and
+     "choices"/"?" (only when CM is the main CLI), "comment" and
+     "uncomment".  Anything else is matched against the parent's sub-rules
+     and sub-commands.  Returns 0 on success, else an error object. */
+  parent = vec_elt_at_index (cm->commands, parent_command_index);
+  if (is_main_dispatch && unformat (input, "help"))
+    {
+      uword help_at_end_of_line, i;
+
+      help_at_end_of_line =
+	unformat_check_input (input) == UNFORMAT_END_OF_INPUT;
+      while (1)			/* follow the words after "help" down the command tree */
+	{
+	  c = parent;
+	  if (unformat_user
+	      (input, unformat_vlib_cli_sub_command, vm, c, &parent))
+	    ;
+
+	  else if (!(unformat_check_input (input) == UNFORMAT_END_OF_INPUT))
+	    goto unknown;
+
+	  else
+	    break;
+	}
+
+      /* help SUB-COMMAND => long format help.
+         "help" at end of line: show all commands. */
+      if (!help_at_end_of_line)
+	vlib_cli_output (vm, "%U", format_vlib_cli_command_help, c,
+			 /* is_long */ 1);
+
+      else if (vec_len (c->sub_commands) + vec_len (c->sub_rules) == 0)
+	vlib_cli_output (vm, "%v: no sub-commands", c->path);
+
+      else
+	{
+	  vlib_cli_sub_command_t *sc;
+	  vlib_cli_sub_rule_t *sr, *subs;
+
+	  subs = vec_dup (c->sub_rules);
+
+	  /* Append the named sub-commands to the copy of the rule list;
+	     rule_index ~0 marks an entry as a sub-command, not a rule. */
+	  vec_foreach (sc, c->sub_commands)
+	  {
+	    vec_add2 (subs, sr, 1);
+	    sr->name = sc->name;
+	    sr->command_index = sc->index;
+	    sr->rule_index = ~0;
+	  }
+
+	  vec_sort_with_function (subs, vlib_cli_cmp_rule);
+	  /* One line per entry: name (or <rule>) followed by the short help. */
+	  for (i = 0; i < vec_len (subs); i++)
+	    {
+	      vlib_cli_command_t *d;
+	      vlib_cli_parse_rule_t *r;
+
+	      d = vec_elt_at_index (cm->commands, subs[i].command_index);
+	      r =
+		subs[i].rule_index != ~0 ? vec_elt_at_index (cm->parse_rules,
+							     subs
+							     [i].rule_index) :
+		0;
+
+	      if (r)
+		vlib_cli_output
+		  (vm, "  %-30U %U",
+		   format_vlib_cli_parse_rule_name, r,
+		   format_vlib_cli_command_help, d, /* is_long */ 0);
+	      else
+		vlib_cli_output
+		  (vm, "  %-30v %U",
+		   subs[i].name,
+		   format_vlib_cli_command_help, d, /* is_long */ 0);
+	    }
+
+	  vec_free (subs);
+	}
+    }
+  /* "choices" / "?": list every command with a handler below the parent. */
+  else if (is_main_dispatch
+	   && (unformat (input, "choices") || unformat (input, "?")))
+    {
+      vlib_cli_command_t *sub, *subs;
+
+      subs = all_subs (cm, 0, parent_command_index);
+      vec_sort_with_function (subs, vlib_cli_cmp_command);
+      vec_foreach (sub, subs)
+	vlib_cli_output (vm, "  %-40U %U",
+			 format_vlib_cli_path, sub->path,
+			 format_vlib_cli_command_help, sub, /* is_long */ 0);
+      vec_free (subs);
+    }
+  /* "comment ...": the text is parsed and thrown away. */
+  else if (unformat (input, "comment %v", &string))
+    {
+      vec_free (string);
+    }
+  /* "uncomment { ... }": dispatch the braced text against the same parent. */
+  else if (unformat (input, "uncomment %U",
+		     unformat_vlib_cli_sub_input, &sub_input))
+    {
+      error =
+	vlib_cli_dispatch_sub_commands (vm, cm, &sub_input,
+					parent_command_index);
+      unformat_free (&sub_input);
+    }
+  /* Regular case: the next word selects sub-command C of the parent. */
+  else
+    if (unformat_user (input, unformat_vlib_cli_sub_command, vm, parent, &c))
+    {
+      unformat_input_t *si;
+      uword has_sub_commands =
+	vec_len (c->sub_commands) + vec_len (c->sub_rules) > 0;
+
+      si = input;		/* arguments come from INPUT unless a { ... } block follows */
+      if (unformat_user (input, unformat_vlib_cli_sub_input, &sub_input))
+	si = &sub_input;
+      /* Try to go one level deeper before falling back to C's own handler. */
+      if (has_sub_commands)
+	error = vlib_cli_dispatch_sub_commands (vm, cm, si, c - cm->commands);
+
+      if (has_sub_commands && !error)
+	/* Found valid sub-command. */ ;
+
+      else if (c->function)
+	{
+	  clib_error_t *c_error;
+
+	  /* Skip white space for benefit of called function. */
+	  unformat_skip_white_space (si);
+
+	  if (unformat (si, "?"))
+	    {
+	      vlib_cli_output (vm, "  %-40U %U", format_vlib_cli_path, c->path, format_vlib_cli_command_help, c,	/* is_long */
+			       0);
+	    }
+	  else
+	    {
+	      if (!c->is_mp_safe)	/* handlers not marked mp-safe run under the worker barrier */
+		vlib_worker_thread_barrier_sync (vm);
+
+	      c_error = c->function (vm, si, c);
+
+	      if (!c->is_mp_safe)
+		vlib_worker_thread_barrier_release (vm);
+
+	      if (c_error)
+		{
+		  error =
+		    clib_error_return (0, "%v: %v", c->path, c_error->what);
+		  clib_error_free (c_error);
+		  /* Free sub input. */
+		  if (si != input)
+		    unformat_free (si);
+
+		  return error;
+		}
+	    }
+
+	  /* Free any previous error. */
+	  clib_error_free (error);
+	}
+
+      else if (!error)
+	error = clib_error_return (0, "%v: no sub-commands", c->path);
+
+      /* Free sub input. */
+      if (si != input)
+	unformat_free (si);
+    }
+
+  else
+    goto unknown;
+
+  return error;
+
+unknown:
+  if (parent->path)
+    return clib_error_return (0, "%v: unknown input `%U'", parent->path,
+			      format_unformat_error, input);
+  else
+    return clib_error_return (0, "unknown input `%U'", format_unformat_error,
+			      input);
+}
+
+
+/*
+ * Error reporting hook called from vlib_cli_input.  This definition is
+ * declared weak and does nothing.
+ */
+void vlib_unix_error_report (vlib_main_t * vm, clib_error_t * error)
+  __attribute__ ((weak));
+
+void
+vlib_unix_error_report (vlib_main_t * vm, clib_error_t * error)
+{
+  /* Intentionally empty. */
+}
+
+/*
+ * Process CLI input.
+ *
+ * Runs the commands found in INPUT one after another until the input is
+ * exhausted or a command fails.  While running, the output of the current
+ * process is redirected to FUNCTION / FUNCTION_ARG; the previous output
+ * settings are restored before returning.  A failure is printed, passed
+ * to vlib_unix_error_report and freed.
+ */
+void
+vlib_cli_input (vlib_main_t * vm,
+		unformat_input_t * input,
+		vlib_cli_output_function_t * function, uword function_arg)
+{
+  vlib_process_t *cp = vlib_get_current_process (vm);
+  vlib_cli_main_t *cm = &vm->cli_main;
+  vlib_cli_output_function_t *prev_function = cp->output_function;
+  uword prev_function_arg = cp->output_function_arg;
+  clib_error_t *error;
+
+  cp->output_function = function;
+  cp->output_function_arg = function_arg;
+
+  for (;;)
+    {
+      vec_reset_length (cm->parse_rule_data);
+      error = vlib_cli_dispatch_sub_commands (vm, &vm->cli_main, input,
+					      /* parent */ 0);
+      if (error)
+	break;
+      if (unformat (input, "%U", unformat_eof))
+	break;
+    }
+
+  if (error)
+    {
+      vlib_cli_output (vm, "%v", error->what);
+      vlib_unix_error_report (vm, error);
+      clib_error_free (error);
+    }
+
+  cp->output_function = prev_function;
+  cp->output_function_arg = prev_function_arg;
+}
+
+/*
+ * Output to current CLI connection.
+ *
+ * Formats FMT and its arguments, makes sure non-empty text ends with a
+ * newline, and passes it to the output function of the current process.
+ * Without a current process or output function the text goes to stdout.
+ */
+void
+vlib_cli_output (vlib_main_t * vm, char *fmt, ...)
+{
+  vlib_process_t *cp = vlib_get_current_process (vm);
+  va_list ap;
+  u8 *text;
+  uword n;
+
+  va_start (ap, fmt);
+  text = va_format (0, fmt, &ap);
+  va_end (ap);
+
+  /* Terminate with \n if not present. */
+  n = vec_len (text);
+  if (n > 0 && text[n - 1] != '\n')
+    vec_add1 (text, '\n');
+
+  if (cp && cp->output_function)
+    cp->output_function (cp->output_function_arg, text, vec_len (text));
+  else
+    fformat (stdout, "%v", text);
+
+  vec_free (text);
+}
+
+/*
+ * "show memory [verbose]": prints, for each vlib main visited by
+ * foreach_vlib_main, the thread name followed by its per-cpu heap
+ * formatted with format_mheap.  Any argument other than "verbose" is
+ * rejected with an error.
+ */
+static clib_error_t *
+show_memory_usage (vlib_main_t * vm,
+		   unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  int verbose = 0;
+  u32 index = 0;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (!unformat (input, "verbose"))
+	return clib_error_return (0, "unknown input `%U'",
+				  format_unformat_error, input);
+      verbose = 1;
+    }
+
+  /* *INDENT-OFF* */
+  foreach_vlib_main (
+  ({
+      vlib_cli_output (vm, "Thread %d %v\n", index, vlib_worker_threads[index].name);
+      vlib_cli_output (vm, "%U\n", format_mheap, clib_per_cpu_mheaps[index], verbose);
+      index++;
+  }));
+  /* *INDENT-ON* */
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_memory_usage_command, static) = {
+  .function = show_memory_usage,
+  .path = "show memory",
+  .short_help = "Show current memory usage",
+};
+/* *INDENT-ON* */
+
+/*
+ * "show cpu": prints model name, microarchitecture, feature flags and
+ * the base frequency (clocks per second scaled to GHz), one labelled
+ * line each with the label left-justified in 25 columns.
+ */
+static clib_error_t *
+show_cpu (vlib_main_t * vm, unformat_input_t * input,
+	  vlib_cli_command_t * cmd)
+{
+  f64 base_ghz = ((f64) vm->clib_time.clocks_per_second) * 1e-9;
+
+  vlib_cli_output (vm, "%-25s %U", "Model name:", format_cpu_model_name);
+  vlib_cli_output (vm, "%-25s %U", "Microarchitecture:", format_cpu_uarch);
+  vlib_cli_output (vm, "%-25s %U", "Flags:", format_cpu_flags);
+  vlib_cli_output (vm, "%-25s %.2f GHz", "Base frequency:", base_ghz);
+
+  return 0;
+}
+
+/*?
+ * Displays various information about the CPU.
+ *
+ * @cliexpar
+ * @cliexstart{show cpu}
+ * Model name:               Intel(R) Xeon(R) CPU E5-2667 v4 @ 3.20GHz
+ * Microarchitecture:        Broadwell (Broadwell-EP/EX)
+ * Flags:                    sse3 ssse3 sse41 sse42 avx avx2 aes
+ * Base frequency:           3.20 GHz
+ * @cliexend
+?*/
+/* *INDENT-OFF* */
+/* Registers show_cpu as the handler of "show cpu". */
+VLIB_CLI_COMMAND (show_cpu_command, static) = {
+  .function = show_cpu,
+  .path = "show cpu",
+  .short_help = "Show cpu information",
+};
+
+/* *INDENT-ON* */
+/*
+ * "memory-trace <enable|disable>": parses the enable/disable argument and
+ * passes it to clib_mem_trace.  Returns an error when the argument cannot
+ * be parsed.
+ */
+static clib_error_t *
+enable_disable_memory_trace (vlib_main_t * vm,
+			     unformat_input_t * input,
+			     vlib_cli_command_t * cmd)
+{
+  int enable;
+
+  if (!unformat_user (input, unformat_vlib_enable_disable, &enable))
+    return clib_error_return (0, "expecting enable/on or disable/off");
+
+  clib_mem_trace (enable);
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (enable_disable_memory_trace_command, static) = {
+  .function = enable_disable_memory_trace,
+  .path = "memory-trace",
+  .short_help = "Enable/disable memory allocation trace",
+};
+/* *INDENT-ON* */
+
+
+/*
+ * "test heap-validate <on|off|now>".
+ *
+ *   on  - set MHEAP_FLAG_VALIDATE and clear MHEAP_FLAG_SMALL_OBJECT_CACHE
+ *         on the heap of every vlib main
+ *   off - the reverse of "on"
+ *   now - run mheap_validate on every heap once and report completion
+ *
+ * Any other input yields an "unknown input" error.
+ */
+static clib_error_t *
+test_heap_validate (vlib_main_t * vm, unformat_input_t * input,
+		    vlib_cli_command_t * cmd)
+{
+  void *heap;
+  mheap_t *mheap;
+
+  if (unformat (input, "on"))
+    {
+        /* *INDENT-OFF* */
+        foreach_vlib_main({
+          heap = clib_per_cpu_mheaps[this_vlib_main->thread_index];
+          mheap = mheap_header(heap);
+          mheap->flags |= MHEAP_FLAG_VALIDATE;
+          /* The small object cache delays detection of errors: turn it off. */
+          mheap->flags &= ~MHEAP_FLAG_SMALL_OBJECT_CACHE;
+        });
+        /* *INDENT-ON* */
+      return 0;
+    }
+
+  if (unformat (input, "off"))
+    {
+        /* *INDENT-OFF* */
+        foreach_vlib_main({
+          heap = clib_per_cpu_mheaps[this_vlib_main->thread_index];
+          mheap = mheap_header(heap);
+          mheap->flags &= ~MHEAP_FLAG_VALIDATE;
+          mheap->flags |= MHEAP_FLAG_SMALL_OBJECT_CACHE;
+        });
+        /* *INDENT-ON* */
+      return 0;
+    }
+
+  if (unformat (input, "now"))
+    {
+        /* *INDENT-OFF* */
+        foreach_vlib_main({
+          heap = clib_per_cpu_mheaps[this_vlib_main->thread_index];
+          mheap = mheap_header(heap);
+          mheap_validate(heap);
+        });
+        /* *INDENT-ON* */
+      vlib_cli_output (vm, "heap validation complete");
+      return 0;
+    }
+
+  return clib_error_return (0, "unknown input `%U'",
+			    format_unformat_error, input);
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_test_heap_validate,static) = {
+    .function = test_heap_validate,
+    .path = "test heap-validate",
+    .short_help = "<on/off/now> validate heap on future allocs/frees or right now",
+};
+/* *INDENT-ON* */
+
+/*
+ * "restart": replaces the running process image by executing vm->name
+ * with the original argument vector and an empty environment.
+ *
+ * On success execve does not return.  If it does return, the restart
+ * failed; that is reported as a unix error (including errno) instead of
+ * being silently treated as success.
+ */
+static clib_error_t *
+restart_cmd_fn (vlib_main_t * vm, unformat_input_t * input,
+		vlib_cli_command_t * cmd)
+{
+  char *newenviron[] = { NULL };
+
+  execve (vm->name, (char **) vm->argv, newenviron);
+
+  /* Only reached when execve failed. */
+  return clib_error_return_unix (0, "execve `%s'", vm->name);
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (restart_cmd,static) = {
+    .path = "restart",
+    .short_help = "restart process",
+    .function = restart_cmd_fn,
+};
+/* *INDENT-ON* */
+
+#ifdef TEST_CODE
+/*
+ * A trivial test harness to verify that the per-process output_function
+ * is working correctly.
+ */
+
+/*
+ * "test sleep": waits ten times for an event or a 1.0 timeout, printing a
+ * line tagged with a random id after each wait.
+ */
+static clib_error_t *
+sleep_ten_seconds (vlib_main_t * vm,
+		   unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  u16 my_id = rand ();
+  u16 iteration = 0;
+
+  vlib_cli_output (vm, "Starting 10 seconds sleep with id %u\n", my_id);
+
+  while (iteration < 10)
+    {
+      vlib_process_wait_for_event_or_clock (vm, 1.0);
+      vlib_cli_output (vm, "Iteration number %u, my id: %u\n", iteration,
+		       my_id);
+      iteration++;
+    }
+
+  vlib_cli_output (vm, "Done with sleep with id %u\n", my_id);
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (ping_command, static) = {
+  .path = "test sleep",
+  .short_help = "Sleep for 10 seconds",
+  .function = sleep_ten_seconds,
+};
+/* *INDENT-ON* */
+#endif /* ifdef TEST_CODE */
+
+/*
+ * Builds in *RESULT a vector copy of the C string INPUT in which leading
+ * and trailing blanks (space, tab, newline, carriage return) are dropped
+ * and every inner run of blanks becomes a single space.
+ *
+ * Returns the index in the result of the first character that follows
+ * the last space, or ~0 when the result contains no space.
+ */
+static uword
+vlib_cli_normalize_path (char *input, char **result)
+{
+  char *out = 0;
+  char *p;
+  uword last_word_start = ~0;
+
+  for (p = input; *p != 0; p++)
+    {
+      u8 c = *p;
+      uword len = vec_len (out);
+      int after_space = len > 0 && out[len - 1] == ' ';
+      int is_blank = (c == ' ' || c == '\t' || c == '\n' || c == '\r');
+
+      if (is_blank)
+	{
+	  /* Multiple white space -> single space; none at the start. */
+	  if (len > 0 && !after_space)
+	    {
+	      vec_add1 (out, ' ');
+	    }
+	}
+      else
+	{
+	  if (after_space)
+	    last_word_start = len;
+	  vec_add1 (out, c);
+	}
+    }
+
+  /* Remove any extra space at end. */
+  if (vec_len (out) > 0 && out[vec_len (out) - 1] == ' ')
+    _vec_len (out) -= 1;
+
+  *result = out;
+  return last_word_start;
+}
+
+/*
+ * Return the index of the last ' ' in the vector PATH, which is also
+ * the length of the parent's path.  Returns ~0 when PATH holds no space.
+ */
+always_inline uword
+parent_path_len (char *path)
+{
+  word pos = vec_len (path);
+
+  /* Scan backwards from the final character. */
+  while (--pos >= 0)
+    if (path[pos] == ' ')
+      return pos;
+
+  return ~0;
+}
+
+/*
+ * Link command CHILD_INDEX into the lookup tables of command PARENT_INDEX.
+ *
+ * The sub-name is the last space-separated element of the child's path
+ * (the whole path when it holds no space).
+ *
+ * - A sub-name starting with '%' names a parse rule: the '%' is
+ *   stripped, the rule is looked up in cm->parse_rule_index_by_name and
+ *   a new entry is appended to the parent's sub_rules.
+ * - Any other sub-name is appended to the parent's sub_commands, entered
+ *   in sub_command_index_by_name, and its characters are recorded in the
+ *   per-position bitmaps (sub_command_positions).
+ *
+ * Adding a sub-name the parent already knows is a no-op.  sub_name is a
+ * private copy made here: it is either handed over to the new
+ * sub-command / sub-rule entry or freed before returning.
+ */
+static void
+add_sub_command (vlib_cli_main_t * cm, uword parent_index, uword child_index)
+{
+  vlib_cli_command_t *p, *c;
+  vlib_cli_sub_command_t *sub_c;
+  u8 *sub_name;
+  word i, l;
+
+  p = vec_elt_at_index (cm->commands, parent_index);
+  c = vec_elt_at_index (cm->commands, child_index);
+
+  /* Copy the last path element into a fresh vector. */
+  l = parent_path_len (c->path);
+  if (l == ~0)
+    sub_name = vec_dup ((u8 *) c->path);
+  else
+    {
+      ASSERT (l + 1 < vec_len (c->path));
+      sub_name = 0;
+      vec_add (sub_name, c->path + l + 1, vec_len (c->path) - (l + 1));
+    }
+
+  if (sub_name[0] == '%')
+    {
+      uword *q;
+      vlib_cli_sub_rule_t *sr;
+
+      /* Remove %. */
+      vec_delete (sub_name, 1, 0);
+
+      if (!p->sub_rule_index_by_name)
+	p->sub_rule_index_by_name = hash_create_vec ( /* initial length */ 32,
+						     sizeof (sub_name[0]),
+						     sizeof (uword));
+      q = hash_get_mem (p->sub_rule_index_by_name, sub_name);
+      if (q)
+	{
+	  /* Already linked: our copy of the name is not needed. */
+	  sr = vec_elt_at_index (p->sub_rules, q[0]);
+	  ASSERT (sr->command_index == child_index);
+	  vec_free (sub_name);
+	  return;
+	}
+
+      q = hash_get_mem (cm->parse_rule_index_by_name, sub_name);
+      if (!q)
+	{
+	  clib_error ("reference to unknown rule `%%%v' in path `%v'",
+		      sub_name, c->path);
+	  vec_free (sub_name);
+	  return;
+	}
+
+      /* The new sub-rule entry takes ownership of sub_name. */
+      hash_set_mem (p->sub_rule_index_by_name, sub_name,
+		    vec_len (p->sub_rules));
+      vec_add2 (p->sub_rules, sr, 1);
+      sr->name = sub_name;
+      sr->rule_index = q[0];
+      sr->command_index = child_index;
+      return;
+    }
+
+  if (!p->sub_command_index_by_name)
+    p->sub_command_index_by_name = hash_create_vec ( /* initial length */ 32,
+						    sizeof (c->path[0]),
+						    sizeof (uword));
+
+  /* Check if sub-command has already been created. */
+  if (hash_get_mem (p->sub_command_index_by_name, sub_name))
+    {
+      vec_free (sub_name);
+      return;
+    }
+
+  /* The new sub-command entry takes ownership of sub_name. */
+  vec_add2 (p->sub_commands, sub_c, 1);
+  sub_c->index = child_index;
+  sub_c->name = sub_name;
+  hash_set_mem (p->sub_command_index_by_name, sub_c->name,
+		sub_c - p->sub_commands);
+
+  /* For every character position of the name, set this sub-command's
+     bit in the bitmap selected by (character - min_char). */
+  vec_validate (p->sub_command_positions, vec_len (sub_c->name) - 1);
+  for (i = 0; i < vec_len (sub_c->name); i++)
+    {
+      int n;
+      vlib_cli_parse_position_t *pos;
+
+      pos = vec_elt_at_index (p->sub_command_positions, i);
+
+      /* First name seen at this position defines min_char. */
+      if (!pos->bitmaps)
+	pos->min_char = sub_c->name[i];
+
+      n = sub_c->name[i] - pos->min_char;
+      if (n < 0)
+	{
+	  /* Character is below min_char: open -n empty slots at the
+	     front so existing bitmaps keep their character mapping. */
+	  pos->min_char = sub_c->name[i];
+	  vec_insert (pos->bitmaps, -n, 0);
+	  n = 0;
+	}
+
+      vec_validate (pos->bitmaps, n);
+      pos->bitmaps[n] =
+	clib_bitmap_ori (pos->bitmaps[n], sub_c - p->sub_commands);
+    }
+}
+
+/*
+ * Attach command CI to its parent command, creating the parent (and,
+ * recursively, the parent's own ancestors) when it does not exist yet.
+ *
+ * The parent's path is the child's path up to its last space.  A path
+ * without a space hangs directly off the root command (index 0), which
+ * must already exist.
+ */
+static void
+vlib_cli_make_parent (vlib_cli_main_t * cm, uword ci)
+{
+  vlib_cli_command_t *child, *new_parent;
+  uword *existing, parent_index, prefix_len;
+  char *parent_path = 0;
+
+  /* Root command (index 0) should have already been added. */
+  ASSERT (vec_len (cm->commands) > 0);
+
+  child = vec_elt_at_index (cm->commands, ci);
+  prefix_len = parent_path_len (child->path);
+
+  /* No space?  Parent is root command. */
+  if (prefix_len == ~0)
+    {
+      add_sub_command (cm, 0, ci);
+      return;
+    }
+
+  vec_add (parent_path, child->path, prefix_len);
+  existing = hash_get_mem (cm->command_index_by_path, parent_path);
+
+  if (existing)
+    {
+      /* Parent already known: just link the child under it. */
+      parent_index = existing[0];
+      vec_free (parent_path);
+      add_sub_command (cm, parent_index, ci);
+      return;
+    }
+
+  /* Parent does not exist; create an empty placeholder that owns
+     parent_path. */
+  vec_add2 (cm->commands, new_parent, 1);
+  new_parent->path = parent_path;
+  parent_index = new_parent - cm->commands;
+  hash_set_mem (cm->command_index_by_path, new_parent->path, parent_index);
+
+  add_sub_command (cm, parent_index, ci);
+
+  /* Create parent's parent. */
+  vlib_cli_make_parent (cm, parent_index);
+}
+
+/* Return 1 when C has neither help strings nor a callback function,
+   0 otherwise. */
+always_inline uword
+vlib_cli_command_is_empty (vlib_cli_command_t * c)
+{
+  return !(c->long_help || c->short_help || c->function);
+}
+
+/*
+ * Register CLI command C in VM's command tree.
+ *
+ * The command's path is normalized (white space collapsed) and looked
+ * up in cm->command_index_by_path:
+ *
+ * - If an empty placeholder created by vlib_cli_make_parent already has
+ *   that path, the caller's fields are copied into it while all of the
+ *   placeholder's internal tree state (path, sub-commands, sub-rules and
+ *   their hashes/bitmaps) is preserved.
+ * - If a non-empty command already has that path, a "duplicate command
+ *   name" error is returned.
+ * - Otherwise a copy of C is appended to cm->commands; the copy owns the
+ *   normalized path.
+ *
+ * Finally the command is linked under its parent.
+ *
+ * Returns 0 on success, or the error from vlib_cli_init / the duplicate
+ * check.
+ */
+clib_error_t *
+vlib_cli_register (vlib_main_t * vm, vlib_cli_command_t * c)
+{
+  vlib_cli_main_t *cm = &vm->cli_main;
+  clib_error_t *error = 0;
+  uword ci, *p;
+  char *normalized_path;
+
+  if ((error = vlib_call_init_function (vm, vlib_cli_init)))
+    return error;
+
+  (void) vlib_cli_normalize_path (c->path, &normalized_path);
+
+  if (!cm->command_index_by_path)
+    cm->command_index_by_path = hash_create_vec ( /* initial length */ 32,
+						 sizeof (c->path[0]),
+						 sizeof (uword));
+
+  /* See if command already exists with given path. */
+  p = hash_get_mem (cm->command_index_by_path, normalized_path);
+  if (p)
+    {
+      vlib_cli_command_t *d;
+
+      ci = p[0];
+      d = vec_elt_at_index (cm->commands, ci);
+
+      /* If the existing command was created via vlib_cli_make_parent,
+         replace it with the caller's data. */
+      if (vlib_cli_command_is_empty (d))
+	{
+	  vlib_cli_command_t save = d[0];
+
+	  ASSERT (!vlib_cli_command_is_empty (c));
+
+	  /* Copy callers fields. */
+	  d[0] = c[0];
+
+	  /* Restore internal fields.  The sub-rule hash must travel with
+	     the sub-rule vector, or the two go out of sync. */
+	  d->path = save.path;
+	  d->sub_commands = save.sub_commands;
+	  d->sub_command_index_by_name = save.sub_command_index_by_name;
+	  d->sub_command_positions = save.sub_command_positions;
+	  d->sub_rule_index_by_name = save.sub_rule_index_by_name;
+	  d->sub_rules = save.sub_rules;
+	}
+      else
+	error =
+	  clib_error_return (0, "duplicate command name with path %v",
+			     normalized_path);
+
+      /* The existing entry keeps its own path vector. */
+      vec_free (normalized_path);
+      if (error)
+	return error;
+    }
+  else
+    {
+      /* Command does not exist: create it. */
+
+      /* Add root command (index 0). */
+      if (vec_len (cm->commands) == 0)
+	{
+	  /* Create command with index 0; path is empty string. */
+	  vec_resize (cm->commands, 1);
+	}
+
+      ci = vec_len (cm->commands);
+      hash_set_mem (cm->command_index_by_path, normalized_path, ci);
+      vec_add1 (cm->commands, c[0]);
+
+      /* From here on c refers to the stored copy, not the caller's
+         registration. */
+      c = vec_elt_at_index (cm->commands, ci);
+      c->path = normalized_path;
+
+      /* Don't inherit from registration. */
+      c->sub_commands = 0;
+      c->sub_command_index_by_name = 0;
+      c->sub_command_positions = 0;
+    }
+
+  vlib_cli_make_parent (cm, ci);
+  return 0;
+}
+
+/*
+ * Register parse rule R_REG under its name.
+ *
+ * A copy of the rule is appended to cm->parse_rules; the copy's name is
+ * a freshly formatted vector, which also serves as the key in
+ * cm->parse_rule_index_by_name.
+ *
+ * Returns 0 on success, or a "duplicate parse rule name" error when a
+ * rule with the same name is already registered.
+ */
+clib_error_t *
+vlib_cli_register_parse_rule (vlib_main_t * vm, vlib_cli_parse_rule_t * r_reg)
+{
+  vlib_cli_main_t *cm = &vm->cli_main;
+  vlib_cli_parse_rule_t *rule;
+  u8 *name_vec;
+
+  if (!cm->parse_rule_index_by_name)
+    cm->parse_rule_index_by_name = hash_create_vec ( /* initial length */ 32,
+						    sizeof (rule->name[0]),
+						    sizeof (uword));
+
+  /* Make vector copy of name. */
+  name_vec = format (0, "%s", r_reg->name);
+
+  if (hash_get_mem (cm->parse_rule_index_by_name, name_vec) != 0)
+    {
+      vec_free (name_vec);
+      return clib_error_return (0, "duplicate parse rule name `%s'",
+				r_reg->name);
+    }
+
+  vec_add2 (cm->parse_rules, rule, 1);
+  rule[0] = r_reg[0];
+  rule->name = (char *) name_vec;
+  hash_set_mem (cm->parse_rule_index_by_name, rule->name,
+		rule - cm->parse_rules);
+
+  return 0;
+}
+
+#if 0
+/* $$$ turn back on again someday, maybe */
+/*
+ * Disabled (#if 0) helper: register every parse rule in [lo, hi).
+ *
+ * Steps through the range with clib_elf_section_data_next, rejects a
+ * rule whose name is NULL or empty, and passes each remaining rule to
+ * vlib_cli_register_parse_rule.  Stops at, and returns, the first error;
+ * returns 0 when every rule was registered.
+ */
+static clib_error_t *vlib_cli_register_parse_rules (vlib_main_t * vm,
+						    vlib_cli_parse_rule_t *
+						    lo,
+						    vlib_cli_parse_rule_t *
+						    hi)
+  __attribute__ ((unused))
+{
+  clib_error_t *error = 0;
+  vlib_cli_parse_rule_t *r;
+
+  for (r = lo; r < hi; r = clib_elf_section_data_next (r, 0))
+    {
+      if (!r->name || strlen (r->name) == 0)
+	{
+	  error = clib_error_return (0, "parse rule with no name");
+	  goto done;
+	}
+
+      error = vlib_cli_register_parse_rule (vm, r);
+      if (error)
+	goto done;
+    }
+
+done:
+  return error;
+}
+#endif
+
+/*
+ * Sort comparator for vectors of CLI path vectors (a1 and a2 each point
+ * to a u8 * path).
+ *
+ * A path that is a proper prefix of the other sorts first; in every
+ * other case the result is whatever vec_cmp returns for the two paths.
+ */
+static int
+cli_path_compare (void *a1, void *a2)
+{
+  u8 *lhs = *(u8 **) a1;
+  u8 *rhs = *(u8 **) a2;
+
+  /* lhs is a strict prefix of rhs. */
+  if (vec_len (lhs) < vec_len (rhs)
+      && memcmp ((char *) lhs, (char *) rhs, vec_len (lhs)) == 0)
+    return -1;
+
+  /* rhs is a strict prefix of lhs. */
+  if (vec_len (lhs) > vec_len (rhs)
+      && memcmp ((char *) lhs, (char *) rhs, vec_len (rhs)) == 0)
+    return 1;
+
+  return vec_cmp (lhs, rhs);
+}
+
+/*
+ * CLI handler for "show cli": print every registered command path with
+ * non-zero length, one per line, ordered by cli_path_compare.
+ *
+ * Only the temporary vector of pointers is freed; the path vectors
+ * themselves remain owned by the command table.  Always returns 0.
+ */
+static clib_error_t *
+show_cli_cmd_fn (vlib_main_t * vm, unformat_input_t * input,
+		 vlib_cli_command_t * cmd)
+{
+  vlib_cli_main_t *cm = &vm->cli_main;
+  u8 **sorted_paths = 0;
+  uword k;
+
+  /* Collect the non-empty paths (this skips the root command). */
+  for (k = 0; k < vec_len (cm->commands); k++)
+    {
+      vlib_cli_command_t *entry = cm->commands + k;
+
+      if (vec_len (entry->path) > 0)
+	vec_add1 (sorted_paths, (u8 *) entry->path);
+    }
+
+  vec_sort_with_function (sorted_paths, cli_path_compare);
+
+  for (k = 0; k < vec_len (sorted_paths); k++)
+    vlib_cli_output (vm, "%v", sorted_paths[k]);
+
+  vec_free (sorted_paths);
+  return 0;
+}
+
+/* *INDENT-OFF* */
+/* CLI registration: the path "show cli" dispatches to show_cli_cmd_fn. */
+VLIB_CLI_COMMAND (show_cli_command, static) = {
+  .path = "show cli",
+  .short_help = "Show cli commands",
+  .function = show_cli_cmd_fn,
+};
+/* *INDENT-ON* */
+
+/*
+ * Init function: walk the list of command registrations
+ * (cm->cli_command_registrations, chained through next_cli_command) and
+ * register each one.  Returns the first registration error, or 0 when
+ * every command was registered.
+ */
+static clib_error_t *
+vlib_cli_init (vlib_main_t * vm)
+{
+  vlib_cli_main_t *cm = &vm->cli_main;
+  vlib_cli_command_t *reg;
+  clib_error_t *err;
+
+  for (reg = cm->cli_command_registrations; reg; reg = reg->next_cli_command)
+    {
+      err = vlib_cli_register (vm, reg);
+      if (err)
+	return err;
+    }
+
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (vlib_cli_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/cli.h b/src/vlib/cli.h
new file mode 100644
index 00000000..e713808f
--- /dev/null
+++ b/src/vlib/cli.h
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * cli.h: command line interface
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_cli_h
+#define included_vlib_cli_h
+
+#include <vppinfra/format.h>
+
+struct vlib_cli_command_t;
+
+/* Per-character-position lookup data for the sub-commands of one
+   command: for each character value seen at this position, a bitmap of
+   the sub-command indices whose name has that character there. */
+typedef struct
+{
+  /* Smallest character value recorded at this position; bitmaps[0]
+     corresponds to it. */
+  u32 min_char;
+
+  /* Indexed by name[position] - min_char. */
+  uword **bitmaps;
+} vlib_cli_parse_position_t;
+
+/* One sub-command entry of a parent command. */
+typedef struct
+{
+  /* Sub-command name: the last element of the child's path (vector). */
+  u8 *name;
+
+  /* Index of the child command in the vector of all commands. */
+  u32 index;
+} vlib_cli_sub_command_t;
+
+/* One parse-rule entry of a parent command (a path element written
+   as %name). */
+typedef struct
+{
+  /* Rule name with the leading '%' removed (vector). */
+  u8 *name;
+
+  /* Index of the rule as found in parse_rule_index_by_name. */
+  u32 rule_index;
+
+  /* Index of the child command that introduced this rule. */
+  u32 command_index;
+} vlib_cli_sub_rule_t;
+
+/* A named parse rule that command paths can reference. */
+typedef struct
+{
+  /* Rule name; the key used when registering and looking up the rule. */
+  char *name;
+
+  /* Short/long help strings. */
+  char *short_help;
+  char *long_help;
+
+  /* Number of bytes in parsed data.  Zero for vector. */
+  uword data_size;
+
+  /* Unformat function for this rule. */
+  unformat_function_t *unformat_function;
+
+  /* Opaque for unformat function. */
+  uword unformat_function_arg[2];
+} vlib_cli_parse_rule_t;
+
+/* CLI command callback function. */
+typedef clib_error_t *(vlib_cli_command_function_t)
+  (struct vlib_main_t * vm,
+   unformat_input_t * input, struct vlib_cli_command_t * cmd);
+
+/* A CLI command: the caller-supplied registration fields (path, help,
+   function, ...) followed by the tree state that the CLI code maintains
+   for the command's children. */
+typedef struct vlib_cli_command_t
+{
+  /* Command path (e.g. "show something").
+     Spaces delimit elements of path. */
+  char *path;
+
+  /* Short/long help strings. */
+  char *short_help;
+  char *long_help;
+
+  /* Callback function. */
+  vlib_cli_command_function_t *function;
+
+  /* Opaque. */
+  uword function_arg;
+
+  /* Known MP-safe? */
+  uword is_mp_safe;
+
+  /* Sub commands for this command. */
+  vlib_cli_sub_command_t *sub_commands;
+
+  /* Hash table mapping name (e.g. last path element) to sub command index. */
+  uword *sub_command_index_by_name;
+
+  /* bitmap[p][c][i] says whether sub-command i has character
+     c in position p. */
+  vlib_cli_parse_position_t *sub_command_positions;
+
+  /* Hash table mapping name (e.g. last path element) to sub rule index. */
+  uword *sub_rule_index_by_name;
+
+  /* Vector of possible parse rules for this path. */
+  vlib_cli_sub_rule_t *sub_rules;
+
+  /* List of CLI commands, built by constructors: next entry in the
+     registration list headed by cli_command_registrations. */
+  struct vlib_cli_command_t *next_cli_command;
+
+} vlib_cli_command_t;
+
+typedef void (vlib_cli_output_function_t) (uword arg,
+					   u8 * buffer, uword buffer_bytes);
+/* Per-vlib_main CLI state: the command table, the parse rules and the
+   list of pending registrations. */
+typedef struct
+{
+  /* Vector of all known commands. */
+  vlib_cli_command_t *commands;
+
+  /* Hash table mapping normalized path to index into commands. */
+  uword *command_index_by_path;
+
+  /* Vector of all known parse rules. */
+  vlib_cli_parse_rule_t *parse_rules;
+
+  /* Hash table mapping parse rule name to index into parse_rule vector. */
+  uword *parse_rule_index_by_name;
+
+  /* Data parsed for rules. */
+  void **parse_rule_data;
+
+  /* Registration list added by constructors (see VLIB_CLI_COMMAND). */
+  vlib_cli_command_t *cli_command_registrations;
+} vlib_cli_main_t;
+
+/* Define and register a CLI command.
+
+   Usage: VLIB_CLI_COMMAND (name, static) = { .path = ..., ... };
+
+   The macro first declares the command variable x (with the optional
+   storage qualifiers passed as extra arguments), then defines a
+   constructor function that pushes &x onto the head of
+   cli_command_registrations in the vlib main's cli_main, and finally
+   starts the definition of x so the use site can follow with its
+   initializer. */
+#define VLIB_CLI_COMMAND(x,...)                                         \
+    __VA_ARGS__ vlib_cli_command_t x;                                   \
+static void __vlib_cli_command_registration_##x (void)                  \
+    __attribute__((__constructor__)) ;                                  \
+static void __vlib_cli_command_registration_##x (void)                  \
+{                                                                       \
+    vlib_main_t * vm = vlib_get_main();                                 \
+    vlib_cli_main_t *cm = &vm->cli_main;                                \
+    x.next_cli_command = cm->cli_command_registrations;                 \
+    cm->cli_command_registrations = &x;                                 \
+}                                                                       \
+__VA_ARGS__ vlib_cli_command_t x
+/* Declare a parse rule variable x.  Unlike VLIB_CLI_COMMAND this macro
+   only expands to the declaration; it adds no registration code. */
+#define VLIB_CLI_PARSE_RULE(x) \
+  vlib_cli_parse_rule_t x
+/* Output to current CLI connection. */
+void vlib_cli_output (struct vlib_main_t *vm, char *fmt, ...);
+
+/* Process CLI input. */
+void vlib_cli_input (struct vlib_main_t *vm,
+		     unformat_input_t * input,
+		     vlib_cli_output_function_t * function,
+		     uword function_arg);
+
+clib_error_t *vlib_cli_register (struct vlib_main_t *vm,
+				 vlib_cli_command_t * c);
+clib_error_t *vlib_cli_register_parse_rule (struct vlib_main_t *vm,
+					    vlib_cli_parse_rule_t * c);
+
+uword unformat_vlib_cli_sub_input (unformat_input_t * i, va_list * args);
+
+/* Return a vector of strings consisting of possible auto-completions
+ * for a given input string. */
+u8 **vlib_cli_get_possible_completions (u8 * input_str);
+
+#endif /* included_vlib_cli_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/cli_funcs.h b/src/vlib/cli_funcs.h
new file mode 100644
index 00000000..78aef73b
--- /dev/null
+++ b/src/vlib/cli_funcs.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * cli_funcs.h: VLIB CLI related functions/inlines
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_cli_funcs_h
+#define included_vlib_cli_funcs_h
+
+/* Return element INDEX of the CLI's parse_rule_data vector, i.e. the
+   data pointer stored for that parse rule slot. */
+always_inline void *
+vlib_cli_get_parse_rule_result (vlib_main_t * vm, uword index)
+{
+  void **rule_data = vm->cli_main.parse_rule_data;
+
+  return vec_elt (rule_data, index);
+}
+
+#endif /* included_vlib_cli_funcs_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/counter.c b/src/vlib/counter.c
new file mode 100644
index 00000000..62f4bd66
--- /dev/null
+++ b/src/vlib/counter.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * counter.c: simple and packet/byte counters
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+
+/* Zero every counter of every per-thread vector in CM. */
+void
+vlib_clear_simple_counters (vlib_simple_counter_main_t * cm)
+{
+  uword thread;
+
+  for (thread = 0; thread < vec_len (cm->counters); thread++)
+    {
+      counter_t *thread_counters = cm->counters[thread];
+      uword remaining = vec_len (thread_counters);
+
+      while (remaining > 0)
+	thread_counters[--remaining] = 0;
+    }
+}
+
+/* Zero the packet and byte counts of every counter of every per-thread
+   vector in CM. */
+void
+vlib_clear_combined_counters (vlib_combined_counter_main_t * cm)
+{
+  uword thread;
+
+  for (thread = 0; thread < vec_len (cm->counters); thread++)
+    {
+      vlib_counter_t *thread_counters = cm->counters[thread];
+      uword remaining = vec_len (thread_counters);
+
+      while (remaining > 0)
+	{
+	  remaining--;
+	  thread_counters[remaining].packets = 0;
+	  thread_counters[remaining].bytes = 0;
+	}
+    }
+}
+
+/* Make sure CM has one counter vector per vlib main and that each of
+   those vectors is long enough to hold counter INDEX.  The per-thread
+   vectors are validated with cache-line alignment. */
+void
+vlib_validate_simple_counter (vlib_simple_counter_main_t * cm, u32 index)
+{
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  int thread = 0;
+
+  vec_validate (cm->counters, tm->n_vlib_mains - 1);
+
+  while (thread < tm->n_vlib_mains)
+    {
+      vec_validate_aligned (cm->counters[thread], index,
+			    CLIB_CACHE_LINE_BYTES);
+      thread++;
+    }
+}
+
+/* Make sure CM has one counter vector per vlib main and that each of
+   those vectors is long enough to hold combined counter INDEX.  The
+   per-thread vectors are validated with cache-line alignment. */
+void
+vlib_validate_combined_counter (vlib_combined_counter_main_t * cm, u32 index)
+{
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  int thread = 0;
+
+  vec_validate (cm->counters, tm->n_vlib_mains - 1);
+
+  while (thread < tm->n_vlib_mains)
+    {
+      vec_validate_aligned (cm->counters[thread], index,
+			    CLIB_CACHE_LINE_BYTES);
+      thread++;
+    }
+}
+
+/* Number of combined counters in CM, taken as the length of the first
+   per-thread vector.  CM must already have its per-thread vectors. */
+u32
+vlib_combined_counter_n_counters (const vlib_combined_counter_main_t * cm)
+{
+  vlib_counter_t *first_thread;
+
+  ASSERT (cm->counters);
+  first_thread = cm->counters[0];
+
+  return vec_len (first_thread);
+}
+
+/* Number of simple counters in CM, taken as the length of the first
+   per-thread vector.  CM must already have its per-thread vectors. */
+u32
+vlib_simple_counter_n_counters (const vlib_simple_counter_main_t * cm)
+{
+  counter_t *first_thread;
+
+  ASSERT (cm->counters);
+  first_thread = cm->counters[0];
+
+  return vec_len (first_thread);
+}
+
+/* Stub: serializing a simple counter collection is not implemented;
+   only logs a warning.  Both arguments are ignored. */
+void
+serialize_vlib_simple_counter_main (serialize_main_t * m, va_list * va)
+{
+  clib_warning ("unimplemented");
+}
+
+/* Stub: unserializing a simple counter collection is not implemented;
+   only logs a warning.  Both arguments are ignored. */
+void
+unserialize_vlib_simple_counter_main (serialize_main_t * m, va_list * va)
+{
+  clib_warning ("unimplemented");
+}
+
+/* Stub: serializing a combined counter collection is not implemented;
+   only logs a warning.  Both arguments are ignored. */
+void
+serialize_vlib_combined_counter_main (serialize_main_t * m, va_list * va)
+{
+  clib_warning ("unimplemented");
+}
+
+/* Stub: unserializing a combined counter collection is not implemented;
+   only logs a warning.  Both arguments are ignored. */
+void
+unserialize_vlib_combined_counter_main (serialize_main_t * m, va_list * va)
+{
+  clib_warning ("unimplemented");
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/counter.h b/src/vlib/counter.h
new file mode 100644
index 00000000..60e2055d
--- /dev/null
+++ b/src/vlib/counter.h
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * counter.h: simple and packet/byte counters
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_counter_h
+#define included_vlib_counter_h
+
+/** \file
+
+    Optimized thread-safe counters.
+
+    Each vlib_[simple|combined]_counter_main_t consists of a per-thread
+    vector of per-object counters.
+
+    The idea is to drastically eliminate atomic operations.
+*/
+
+/** 64bit counters */
+typedef u64 counter_t;
+
+/** A collection of simple counters.
+
+    counters[thread][index] holds thread's private value for counter
+    index; a counter's total is the sum over all threads.
+*/
+
+typedef struct
+{
+  counter_t **counters;	 /**< Per-thread vectors of u64 non-atomic counters */
+  counter_t *value_at_last_serialize;	/**< Values as of last serialize. */
+  u32 last_incremental_serialize_index;	/**< Last counter index
+                                           serialized incrementally. */
+
+  char *name;			/**< The counter collection's name. */
+} vlib_simple_counter_main_t;
+
+/** The number of counters (not the number of per-thread counters) */
+u32 vlib_simple_counter_n_counters (const vlib_simple_counter_main_t * cm);
+
+/** Increment a simple counter
+    Adds to the calling thread's private copy only; no bounds check is
+    made on either index.
+
+    @param cm - (vlib_simple_counter_main_t *) simple counter main pointer
+    @param thread_index - (u32) the current cpu index
+    @param index - (u32) index of the counter to increment
+    @param increment - (u64) quantity to add to the counter
+*/
+always_inline void
+vlib_increment_simple_counter (vlib_simple_counter_main_t * cm,
+			       u32 thread_index, u32 index, u64 increment)
+{
+  cm->counters[thread_index][index] += increment;
+}
+
+/** Get the value of a simple counter
+    Sums the counter over the entire set of per-thread vectors.
+    Inaccurate unless worker threads which might increment the counter
+    are barrier-synchronized.
+
+    @param cm - (vlib_simple_counter_main_t *) simple counter main pointer
+    @param index - (u32) index of the counter to fetch
+    @returns - (u64) current counter value
+*/
+always_inline counter_t
+vlib_get_simple_counter (vlib_simple_counter_main_t * cm, u32 index)
+{
+  counter_t total = 0;
+  int thread;
+
+  ASSERT (index < vlib_simple_counter_n_counters (cm));
+
+  for (thread = 0; thread < vec_len (cm->counters); thread++)
+    total += cm->counters[thread][index];
+
+  return total;
+}
+
+/** Clear a simple counter
+    Sets the counter to zero in every per-thread vector.
+
+    @param cm - (vlib_simple_counter_main_t *) simple counter main pointer
+    @param index - (u32) index of the counter to clear
+*/
+always_inline void
+vlib_zero_simple_counter (vlib_simple_counter_main_t * cm, u32 index)
+{
+  int thread;
+
+  ASSERT (index < vlib_simple_counter_n_counters (cm));
+
+  for (thread = 0; thread < vec_len (cm->counters); thread++)
+    cm->counters[thread][index] = 0;
+}
+
+/** Combined counter: a packet count and a byte count kept together.
+ */
+typedef struct
+{
+  counter_t packets;			/**< packet counter */
+  counter_t bytes;			/**< byte counter  */
+} vlib_counter_t;
+
+/** Add two combined counters; the sum is stored in the first counter
+    @param [in,out] a - (vlib_counter_t *) dst counter
+    @param b - (vlib_counter_t *) src counter
+*/
+
+always_inline void
+vlib_counter_add (vlib_counter_t * a, vlib_counter_t * b)
+{
+  a->packets = a->packets + b->packets;
+  a->bytes = a->bytes + b->bytes;
+}
+
+/** Subtract combined counters; the difference is stored in the first
+    counter.  Asserts that neither field of a is smaller than the
+    matching field of b.
+    @param [in,out] a - (vlib_counter_t *) dst counter
+    @param b - (vlib_counter_t *) src counter
+*/
+always_inline void
+vlib_counter_sub (vlib_counter_t * a, vlib_counter_t * b)
+{
+  ASSERT (a->packets >= b->packets);
+  ASSERT (a->bytes >= b->bytes);
+
+  a->packets = a->packets - b->packets;
+  a->bytes = a->bytes - b->bytes;
+}
+
+/** Clear a combined counter: both packets and bytes become zero
+    @param a - (vlib_counter_t *) counter to clear
+*/
+always_inline void
+vlib_counter_zero (vlib_counter_t * a)
+{
+  a->bytes = 0;
+  a->packets = 0;
+}
+
+/** A collection of combined counters.
+
+    counters[thread][index] holds thread's private packet/byte pair for
+    counter index; a counter's total is the sum over all threads.
+*/
+typedef struct
+{
+  vlib_counter_t **counters;	/**< Per-thread vectors of u64 non-atomic counter pairs */
+  vlib_counter_t *value_at_last_serialize; /**< Counter values as of last serialize. */
+  u32 last_incremental_serialize_index;	/**< Last counter index serialized incrementally. */
+  char *name; /**< The counter collection's name. */
+} vlib_combined_counter_main_t;
+
+/** The number of counters (not the number of per-thread counters) */
+u32 vlib_combined_counter_n_counters (const vlib_combined_counter_main_t *
+				      cm);
+
+/** Clear a collection of simple counters
+    @param cm - (vlib_simple_counter_main_t *) collection to clear
+*/
+void vlib_clear_simple_counters (vlib_simple_counter_main_t * cm);
+
+/** Clear a collection of combined counters
+    @param cm - (vlib_combined_counter_main_t *) collection to clear
+*/
+void vlib_clear_combined_counters (vlib_combined_counter_main_t * cm);
+
+/** Increment a combined counter
+    Adds to the calling thread's private copy only; no bounds check is
+    made on either index.
+
+    @param cm - (vlib_combined_counter_main_t *) combined counter main pointer
+    @param thread_index - (u32) the current cpu index
+    @param index - (u32) index of the counter to increment
+    @param n_packets - (u64) number of packets to add to the counter
+    @param n_bytes - (u64) number of bytes to add to the counter
+*/
+
+always_inline void
+vlib_increment_combined_counter (vlib_combined_counter_main_t * cm,
+				 u32 thread_index,
+				 u32 index, u64 n_packets, u64 n_bytes)
+{
+  /* This thread's slot for the counter. */
+  vlib_counter_t *slot = &cm->counters[thread_index][index];
+
+  slot->packets += n_packets;
+  slot->bytes += n_bytes;
+}
+
+/** Pre-fetch (for store) a per-thread combined counter for the given
+    object index
+
+    @param cm - (vlib_combined_counter_main_t *) combined counter main pointer
+    @param thread_index - (u32) the current cpu index
+    @param index - (u32) index of the counter to prefetch
+*/
+always_inline void
+vlib_prefetch_combined_counter (const vlib_combined_counter_main_t * cm,
+				u32 thread_index, u32 index)
+{
+  /*
+   * The per-thread vector pointer itself is assumed to already be in
+   * cache; only the counter slot is prefetched.
+   */
+  vlib_counter_t *slot = &cm->counters[thread_index][index];
+
+  CLIB_PREFETCH (slot, CLIB_CACHE_LINE_BYTES, STORE);
+}
+
+
+/** Get the value of a combined counter, never called in the speed path
+    Sums the counter over the entire set of per-thread vectors.
+    Inaccurate unless worker threads which might increment the counter
+    are barrier-synchronized.
+
+    @param cm - (vlib_combined_counter_main_t *) combined counter main pointer
+    @param index - (u32) index of the combined counter to fetch
+    @param result [out] - (vlib_counter_t *) result stored here
+*/
+
+static inline void
+vlib_get_combined_counter (const vlib_combined_counter_main_t * cm,
+			   u32 index, vlib_counter_t * result)
+{
+  int thread;
+
+  result->packets = 0;
+  result->bytes = 0;
+
+  for (thread = 0; thread < vec_len (cm->counters); thread++)
+    {
+      vlib_counter_t *thread_counters = cm->counters[thread];
+      vlib_counter_t *slot = vec_elt_at_index (thread_counters, index);
+
+      result->packets += slot->packets;
+      result->bytes += slot->bytes;
+    }
+}
+
+/** Clear a combined counter
+    Zeroes the packet and byte counts in every per-thread vector.
+
+    @param cm - (vlib_combined_counter_main_t *) combined counter main pointer
+    @param index - (u32) index of the counter to clear
+*/
+always_inline void
+vlib_zero_combined_counter (vlib_combined_counter_main_t * cm, u32 index)
+{
+  int thread;
+
+  for (thread = 0; thread < vec_len (cm->counters); thread++)
+    {
+      vlib_counter_t *thread_counters = cm->counters[thread];
+      vlib_counter_t *slot = vec_elt_at_index (thread_counters, index);
+
+      slot->packets = 0;
+      slot->bytes = 0;
+    }
+}
+
+/** validate a simple counter
+    @param cm - (vlib_simple_counter_main_t *) pointer to the counter collection
+    @param index - (u32) index of the counter to validate
+*/
+
+void vlib_validate_simple_counter (vlib_simple_counter_main_t * cm,
+				   u32 index);
+/** validate a combined counter
+    @param cm - (vlib_combined_counter_main_t *) pointer to the counter
+    collection
+    @param index - (u32) index of the counter to validate
+*/
+
+void vlib_validate_combined_counter (vlib_combined_counter_main_t * cm,
+				     u32 index);
+
+/** Obtain the number of simple or combined counters allocated.
+    A macro which reduces to the length of the first per-thread counter
+    vector, the answer in either case, or 0 when no per-thread vectors
+    have been allocated yet.
+
+    The previous definition read a member named maxi, which neither
+    counter collection type in this header has.
+
+    Note: cm is evaluated twice; pass an expression without side effects.
+
+    @param cm - (vlib_simple_counter_main_t *) or
+    (vlib_combined_counter_main_t *) the counter collection to interrogate
+    @returns vec_len of the first per-thread vector, or 0
+*/
+#define vlib_counter_len(cm) \
+  ((cm)->counters ? vec_len ((cm)->counters[0]) : 0)
+
+serialize_function_t serialize_vlib_simple_counter_main,
+  unserialize_vlib_simple_counter_main;
+serialize_function_t serialize_vlib_combined_counter_main,
+  unserialize_vlib_combined_counter_main;
+
+#endif /* included_vlib_counter_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/defs.h b/src/vlib/defs.h
new file mode 100644
index 00000000..ad58bc04
--- /dev/null
+++ b/src/vlib/defs.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * defs.h: VLIB generic C definitions
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_defs_h
+#define included_vlib_defs_h
+
+/* Receive or transmit.  VLIB_RX and VLIB_TX take the implicit values
+   0 and 1; VLIB_N_RX_TX is the number of directions. */
+typedef enum
+{
+  VLIB_RX,
+  VLIB_TX,
+  VLIB_N_RX_TX = 2,		/* Used to size arrays. */
+} vlib_rx_or_tx_t;
+
+/* Loop header that steps v through VLIB_RX then VLIB_TX.  v must be a
+   modifiable lvalue; it is parenthesized so any lvalue expression is safe. */
+#define vlib_foreach_rx_tx(v) for ((v) = 0; (v) < VLIB_N_RX_TX; (v)++)
+
+/* Read/write.  Implicit values: VLIB_READ is 0, VLIB_WRITE is 1. */
+typedef enum
+{
+  VLIB_READ,
+  VLIB_WRITE,
+} vlib_read_or_write_t;
+
+/* Up/down.  Values are explicit so the enum can be used as a boolean:
+   down is 0, up is 1. */
+typedef enum
+{
+  VLIB_DOWN = 0,
+  VLIB_UP = 1,
+} vlib_up_or_down_t;
+
+/* Enable/disable.  Values are explicit so the enum can be used as a
+   boolean: disable is 0, enable is 1. */
+typedef enum
+{
+  VLIB_DISABLE = 0,
+  VLIB_ENABLE = 1,
+} vlib_enable_or_disable_t;
+
+#endif /* included_vlib_defs_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/dir.dox b/src/vlib/dir.dox
new file mode 100644
index 00000000..4806e7a9
--- /dev/null
+++ b/src/vlib/dir.dox
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Copyright (c) 2016 Comcast Cable Communications Management, LLC.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Doxygen directory documentation */
+/**
+@dir
+@brief VLIB application library source.
+*/
+/*? %%clicmd:group_label VLIB application library%% ?*/
+
diff --git a/src/vlib/elog_samples.c b/src/vlib/elog_samples.c
new file mode 100644
index 00000000..a8c800df
--- /dev/null
+++ b/src/vlib/elog_samples.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vppinfra/elog.h>
+
+/* Sample: log one event whose payload is the four u32 values data[0..3]. */
+static inline void
+elog_four_int_sample (u32 * data)
+{
+  ELOG_TYPE_DECLARE (e) =
+  {
+    .format = "four int: first %d second %d third %d fourth %d",
+    .format_args = "i4i4i4i4",
+  };
+  /* Layout of the event payload. */
+  struct
+  {
+    u32 data[4];
+  } *event;
+  int k;
+
+  event = ELOG_DATA (&vlib_global_main.elog_main, e);
+  for (k = 0; k < 4; k++)
+    event->data[k] = data[k];
+}
+
+/* Sample: same four-u32 payload as the plain sample, but logged against
+   the track declared here as "sample_track". */
+static inline void
+elog_four_int_track_sample (u32 * data)
+{
+  ELOG_TYPE_DECLARE (e) =
+  {
+    .format = "four_int_track: first %d second %d third %d fourth %d",
+    .format_args = "i4i4i4i4",
+  };
+  /* Layout of the event payload. */
+  struct
+  {
+    u32 data[4];
+  } *event;
+  int k;
+  ELOG_TRACK (sample_track);
+
+  event = ELOG_TRACK_DATA (&vlib_global_main.elog_main, e, sample_track);
+  for (k = 0; k < 4; k++)
+    event->data[k] = data[k];
+}
+
+/* Sample: log an event whose single byte selects one of the two
+   enum strings declared in the event type. */
+static inline void
+elog_enum_sample (u8 which)
+{
+  ELOG_TYPE_DECLARE (e) =
+  {
+    .format = "my enum: %s",
+    .format_args = "t1",
+    .n_enum_strings = 2,
+    .enum_strings =
+    {
+      "string 1",
+      "string 2",
+    },
+  };
+  /* Layout of the event payload. */
+  struct
+  {
+    u8 which;
+  } *event;
+
+  event = ELOG_DATA (&vlib_global_main.elog_main, e);
+  event->which = which;
+}
+
+/* Sample: log a single u32 through the elog () helper instead of
+   filling in an event payload by hand. */
+static inline void
+elog_one_datum_sample (u32 data)
+{
+  ELOG_TYPE_DECLARE (e) =
+  {
+    .format = "one datum: %d",
+    .format_args = "i4",
+  };
+
+  elog (&vlib_global_main.elog_main, &e, data);
+}
+
+/* CLI handler: run each sample logger ten times.  On round r the
+   four-int samples carry r, r+1, r+2, r+3.  The input and cmd arguments
+   are not used.  Always returns 0 (no error). */
+static clib_error_t *
+test_elog_command_fn (vlib_main_t * vm,
+		      unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  u32 samples[4];
+  int round, k;
+
+  for (round = 0; round < 10; round++)
+    {
+      for (k = 0; k < 4; k++)
+	samples[k] = round + k;
+
+      elog_four_int_sample (samples);
+      elog_four_int_track_sample (samples);
+      elog_enum_sample (0);
+      elog_enum_sample (1);
+      elog_one_datum_sample (round);
+    }
+
+  return 0;
+}
+
+/* CLI registration: "test elog sample" invokes test_elog_command_fn. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (test_elog_command, static) = {
+  .path = "test elog sample",
+  .short_help = "test elog sample",
+  .function = test_elog_command_fn,
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/error.c b/src/vlib/error.c
new file mode 100644
index 00000000..dec90bbe
--- /dev/null
+++ b/src/vlib/error.c
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * error.c: VLIB error handler
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vppinfra/heap.h>
+
+/* Tag a vector of buffers with one error and enqueue them to a next node.
+
+   Every buffer index in buffers[0 .. n_buffers-1] is copied into frames
+   obtained for next_index, and the error field of each buffer is set to
+   vlib_error_set (drop_error_node, drop_error_code).
+
+   NOTE(review): next_buffer_stride is never read in this body; buffers is
+   always walked one u32 at a time.  Confirm that callers only pass 1.
+
+   Returns n_buffers. */
+uword
+vlib_error_drop_buffers (vlib_main_t * vm,
+			 vlib_node_runtime_t * node,
+			 u32 * buffers,
+			 u32 next_buffer_stride,
+			 u32 n_buffers,
+			 u32 next_index,
+			 u32 drop_error_node, u32 drop_error_code)
+{
+  u32 n_left_this_frame, n_buffers_left, *args, n_args_left;
+  vlib_error_t drop_error;
+
+  drop_error = vlib_error_set (drop_error_node, drop_error_code);
+
+  n_buffers_left = n_buffers;
+  while (n_buffers_left > 0)
+    {
+      /* Sets args and n_args_left for the frame going to next_index. */
+      vlib_get_next_frame (vm, node, next_index, args, n_args_left);
+
+      /* Move as many buffers as both the input and the frame allow. */
+      n_left_this_frame = clib_min (n_buffers_left, n_args_left);
+      n_buffers_left -= n_left_this_frame;
+      n_args_left -= n_left_this_frame;
+
+      /* Four buffers per iteration. */
+      while (n_left_this_frame >= 4)
+	{
+	  u32 bi0, bi1, bi2, bi3;
+	  vlib_buffer_t *b0, *b1, *b2, *b3;
+
+	  args[0] = bi0 = buffers[0];
+	  args[1] = bi1 = buffers[1];
+	  args[2] = bi2 = buffers[2];
+	  args[3] = bi3 = buffers[3];
+
+	  b0 = vlib_get_buffer (vm, bi0);
+	  b1 = vlib_get_buffer (vm, bi1);
+	  b2 = vlib_get_buffer (vm, bi2);
+	  b3 = vlib_get_buffer (vm, bi3);
+
+	  b0->error = drop_error;
+	  b1->error = drop_error;
+	  b2->error = drop_error;
+	  b3->error = drop_error;
+
+	  buffers += 4;
+	  args += 4;
+	  n_left_this_frame -= 4;
+	}
+
+      /* Remaining buffers, one at a time. */
+      while (n_left_this_frame >= 1)
+	{
+	  u32 bi0;
+	  vlib_buffer_t *b0;
+
+	  args[0] = bi0 = buffers[0];
+
+	  b0 = vlib_get_buffer (vm, bi0);
+	  b0->error = drop_error;
+
+	  buffers += 1;
+	  args += 1;
+	  n_left_this_frame -= 1;
+	}
+
+      /* Hand the frame back with the count of slots left unused. */
+      vlib_put_next_frame (vm, node, next_index, n_args_left);
+    }
+
+  return n_buffers;
+}
+
+/* Node function: forward every buffer of the frame to next index 0,
+   tagged with error code 0 of this node ("misc. errors"). */
+static uword
+misc_drop_buffers (vlib_main_t * vm,
+		   vlib_node_runtime_t * node, vlib_frame_t * frame)
+{
+  u32 *buffer_indices = vlib_frame_args (frame);
+  u32 n_buffers = frame->n_vectors;
+  u32 next_index = 0;
+  u32 error_code = 0;
+
+  return vlib_error_drop_buffers (vm, node, buffer_indices,
+				  /* buffer stride */ 1,
+				  n_buffers, next_index,
+				  node->node_index, error_code);
+}
+
+/* Text for the single error code (0) used by misc_drop_buffers. */
+static char *misc_drop_buffers_error_strings[] = {
+  [0] = "misc. errors",
+};
+
+/* Registration of the "misc-drop-buffers" node: frames hold u32 elements,
+   there is one error code and one next node, "error-drop". */
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (misc_drop_buffers_node,static) = {
+  .function = misc_drop_buffers,
+  .name = "misc-drop-buffers",
+  .vector_size = sizeof (u32),
+  .n_errors = 1,
+  .n_next_nodes = 1,
+  .next_nodes = {
+      "error-drop",
+  },
+  .error_strings = misc_drop_buffers_error_strings,
+};
+/* *INDENT-ON* */
+
+/* Reserves given number of error codes for given node. */
+void
+vlib_register_errors (vlib_main_t * vm,
+		      u32 node_index, u32 n_errors, char *error_strings[])
+{
+  vlib_error_main_t *em = &vm->error_main;
+  vlib_node_t *n = vlib_get_node (vm, node_index);
+  uword l;
+
+  ASSERT (vlib_get_thread_index () == 0);
+
+  /* Free up any previous error strings. */
+  if (n->n_errors > 0)
+    heap_dealloc (em->error_strings_heap, n->error_heap_handle);
+
+  n->n_errors = n_errors;
+  n->error_strings = error_strings;
+
+  if (n_errors == 0)
+    return;
+
+  n->error_heap_index =
+    heap_alloc (em->error_strings_heap, n_errors, n->error_heap_handle);
+
+  l = vec_len (em->error_strings_heap);
+
+  clib_memcpy (vec_elt_at_index (em->error_strings_heap, n->error_heap_index),
+	       error_strings, n_errors * sizeof (error_strings[0]));
+
+  /* Allocate a counter/elog type for each error. */
+  vec_validate (em->counters, l - 1);
+  vec_validate (vm->error_elog_event_types, l - 1);
+
+  /* Zero counters for re-registrations of errors. */
+  if (n->error_heap_index + n_errors <= vec_len (em->counters_last_clear))
+    clib_memcpy (em->counters + n->error_heap_index,
+		 em->counters_last_clear + n->error_heap_index,
+		 n_errors * sizeof (em->counters[0]));
+  else
+    memset (em->counters + n->error_heap_index,
+	    0, n_errors * sizeof (em->counters[0]));
+
+  {
+    elog_event_type_t t;
+    uword i;
+
+    memset (&t, 0, sizeof (t));
+    for (i = 0; i < n_errors; i++)
+      {
+	t.format = (char *) format (0, "%v %s: %%d",
+				    n->name, error_strings[i]);
+	vm->error_elog_event_types[n->error_heap_index + i] = t;
+      }
+  }
+}
+
+/* CLI handler shared by "show errors" and "show node counters".
+
+   For every vlib main it prints one row per node error whose count since
+   the last clear is non-zero.  Input "verbose" adds the per-thread
+   heading, the counter index column and a final "Total:" section summed
+   over all threads; "verbose N" with N >= 2 also prints zero counts.
+   The cmd argument is not used.  Always returns 0 (no error). */
+static clib_error_t *
+show_errors (vlib_main_t * vm,
+	     unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  vlib_error_main_t *em = &vm->error_main;
+  vlib_node_t *n;
+  u32 code, i, ni;
+  u64 c;
+  int index = 0;
+  int verbose = 0;
+  u64 *sums = 0;
+
+  if (unformat (input, "verbose %d", &verbose))
+    ;
+  else if (unformat (input, "verbose"))
+    verbose = 1;
+
+  /* Per-counter totals across threads, indexed like em->counters. */
+  vec_validate (sums, vec_len (em->counters));
+
+  if (verbose)
+    vlib_cli_output (vm, "%=10s%=40s%=20s%=6s", "Count", "Node", "Reason",
+		     "Index");
+  else
+    vlib_cli_output (vm, "%=10s%=40s%=6s", "Count", "Node", "Reason");
+
+
+  /* *INDENT-OFF* */
+  foreach_vlib_main(({
+    em = &this_vlib_main->error_main;
+
+    if (verbose)
+      vlib_cli_output(vm, "Thread %u (%v):", index,
+                      vlib_worker_threads[index].name);
+
+    for (ni = 0; ni < vec_len (this_vlib_main->node_main.nodes); ni++)
+      {
+	n = vlib_get_node (this_vlib_main, ni);
+	for (code = 0; code < n->n_errors; code++)
+	  {
+	    i = n->error_heap_index + code;
+	    /* Count since the last clear, when a snapshot exists. */
+	    c = em->counters[i];
+	    if (i < vec_len (em->counters_last_clear))
+	      c -= em->counters_last_clear[i];
+	    sums[i] += c;
+
+	    if (c == 0 && verbose < 2)
+	      continue;
+
+            if (verbose)
+              vlib_cli_output (vm, "%10Ld%=40v%=20s%=6d", c, n->name,
+                               em->error_strings_heap[i], i);
+            else
+              /* c is a u64: use the same 64-bit conversion as the
+                 verbose row, not a plain %d. */
+              vlib_cli_output (vm, "%10Ld%=40v%s", c, n->name,
+                               em->error_strings_heap[i]);
+	  }
+      }
+    index++;
+  }));
+  /* *INDENT-ON* */
+
+  if (verbose)
+    vlib_cli_output (vm, "Total:");
+
+  /* Totals are accumulated in all modes but only printed when verbose. */
+  for (ni = 0; ni < vec_len (vm->node_main.nodes); ni++)
+    {
+      n = vlib_get_node (vm, ni);
+      for (code = 0; code < n->n_errors; code++)
+	{
+	  i = n->error_heap_index + code;
+	  if (sums[i])
+	    {
+	      if (verbose)
+		vlib_cli_output (vm, "%10Ld%=40v%=20s%=10d", sums[i], n->name,
+				 em->error_strings_heap[i], i);
+	    }
+	}
+    }
+
+  vec_free (sums);
+
+  return 0;
+}
+
+/* CLI registration: "show errors" invokes show_errors. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (vlib_cli_show_errors) = {
+  .path = "show errors",
+  .short_help = "Show error counts",
+  .function = show_errors,
+};
+/* *INDENT-ON* */
+
+/* CLI registration: "show node counters" is a second path to the same
+   show_errors handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cli_show_node_counters, static) = {
+  .path = "show node counters",
+  .short_help = "Show node counters",
+  .function = show_errors,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+clear_error_counters (vlib_main_t * vm,
+		      unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  vlib_error_main_t *em;
+  u32 i;
+
+  /* *INDENT-OFF* */
+  foreach_vlib_main(({
+    em = &this_vlib_main->error_main;
+    vec_validate (em->counters_last_clear, vec_len (em->counters) - 1);
+    for (i = 0; i < vec_len (em->counters); i++)
+      em->counters_last_clear[i] = em->counters[i];
+  }));
+  /* *INDENT-ON* */
+  return 0;
+}
+
+/* CLI registration: "clear errors" invokes clear_error_counters. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cli_clear_error_counters, static) = {
+  .path = "clear errors",
+  .short_help = "Clear error counters",
+  .function = clear_error_counters,
+};
+/* *INDENT-ON* */
+
+/* CLI registration: "clear node counters" is a second path to the same
+   clear_error_counters handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cli_clear_node_counters, static) = {
+  .path = "clear node counters",
+  .short_help = "Clear node counters",
+  .function = clear_error_counters,
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/error.h b/src/vlib/error.h
new file mode 100644
index 00000000..df2075c3
--- /dev/null
+++ b/src/vlib/error.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * error.h: drop/punt error packets
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_error_h
+#define included_vlib_error_h
+
+/* Combined 20 bit node index (high bits) & 12 bit code (low bits) as 32 bit number. */
+typedef u32 vlib_error_t;
+
+always_inline u32
+vlib_error_get_node (vlib_error_t e)
+{
+  return e >> 12;
+}
+
+always_inline u32
+vlib_error_get_code (vlib_error_t e)
+{
+  return e & 0xfff;
+}
+
+always_inline vlib_error_t
+vlib_error_set (u32 node_index, u32 code)
+{
+  ASSERT (node_index < (1 << 20));
+  ASSERT (code < (1 << 12));
+  return (node_index << 12) | code;
+}
+
+/* Return e with its code bits set to code.  The code bits of e are
+   asserted to be zero on entry and code is asserted to fit in 12 bits. */
+always_inline vlib_error_t
+vlib_error_set_code (vlib_error_t e, u32 code)
+{
+  ASSERT (vlib_error_get_code (e) == 0);
+  ASSERT (code < (1 << 12));
+  return e | code;
+}
+
+/* Error bookkeeping held in each vlib main as error_main.  The three
+   vectors are indexed by the same value: a node's error_heap_index
+   plus the error code. */
+typedef struct
+{
+  /* Error counters. */
+  u64 *counters;
+
+  /* Counter values as of last counter clear. */
+  u64 *counters_last_clear;
+
+  /* Error name strings in heap.  Heap index
+     indexes counter vector. */
+  char **error_strings_heap;
+} vlib_error_main_t;
+
+/* Per node error registration. */
+void vlib_register_errors (struct vlib_main_t *vm,
+			   u32 node_index,
+			   u32 n_errors, char *error_strings[]);
+
+#endif /* included_vlib_error_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/error_funcs.h b/src/vlib/error_funcs.h
new file mode 100644
index 00000000..ab281ba2
--- /dev/null
+++ b/src/vlib/error_funcs.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * error_funcs.h: VLIB error handling
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_error_funcs_h
+#define included_vlib_error_funcs_h
+
+#include <vlib/node_funcs.h>
+
+always_inline void
+vlib_error_elog_count (vlib_main_t * vm, uword counter, uword increment)
+{
+  if (VLIB_ELOG_MAIN_LOOP > 0 && increment > 0)
+    {
+      elog_main_t *em = &vm->elog_main;
+      elog (em, vec_elt_at_index (vm->error_elog_event_types, counter),
+	    increment);
+    }
+}
+
+always_inline void
+vlib_error_count (vlib_main_t * vm, uword node_index,
+		  uword counter, uword increment)
+{
+  vlib_node_t *n = vlib_get_node (vm, node_index);
+  vlib_error_main_t *em = &vm->error_main;
+
+  ASSERT (counter < n->n_errors);
+  counter += n->error_heap_index;
+
+  ASSERT (counter < vec_len (em->counters));
+  em->counters[counter] += increment;
+
+  vlib_error_elog_count (vm, counter, increment);
+}
+
+/* Drop all buffers in frame with given error code. */
+uword
+vlib_error_drop_buffers (vlib_main_t * vm,
+			 vlib_node_runtime_t * node,
+			 u32 * buffers,
+			 u32 next_buffer_stride,
+			 u32 n_buffers,
+			 u32 error_next_index,
+			 u32 error_node, u32 error_code);
+
+#endif /* included_vlib_error_funcs_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/format.c b/src/vlib/format.c
new file mode 100644
index 00000000..79a4d686
--- /dev/null
+++ b/src/vlib/format.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * format.c: generic network formatting/unformating
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+
+/* Format a vlib_rx_or_tx_t as "rx" or "tx"; any other value is
+   rendered as "INVALID".  Returns the extended vector. */
+u8 *
+format_vlib_rx_tx (u8 * s, va_list * args)
+{
+  vlib_rx_or_tx_t r = va_arg (*args, vlib_rx_or_tx_t);
+  char *t = "INVALID";
+
+  if (r == VLIB_RX)
+    t = "rx";
+  else if (r == VLIB_TX)
+    t = "tx";
+
+  vec_add (s, t, strlen (t));
+  return s;
+}
+
+/* Format a vlib_read_or_write_t as "read" or "write"; any other value
+   is rendered as "INVALID".  Returns the extended vector. */
+u8 *
+format_vlib_read_write (u8 * s, va_list * args)
+{
+  /* The argument is a read/write enum, not the rx/tx enum: the case
+     labels below are vlib_read_or_write_t constants. */
+  vlib_read_or_write_t r = va_arg (*args, vlib_read_or_write_t);
+  char *t;
+
+  switch (r)
+    {
+    case VLIB_READ:
+      t = "read";
+      break;
+    case VLIB_WRITE:
+      t = "write";
+      break;
+    default:
+      t = "INVALID";
+      break;
+    }
+
+  vec_add (s, t, strlen (t));
+  return s;
+}
+
+/* Formats buffer data as printable ascii or as hex.
+   Arguments: u8 *data, u32 n_data_bytes.  The bytes are appended
+   verbatim when every one lies in 0x20..0x7e; otherwise the whole
+   buffer is rendered with format_hex_bytes. */
+u8 *
+format_vlib_buffer_data (u8 * s, va_list * args)
+{
+  u8 *data = va_arg (*args, u8 *);
+  u32 n_data_bytes = va_arg (*args, u32);
+  u32 pos;
+
+  for (pos = 0; pos < n_data_bytes; pos++)
+    {
+      /* First non-printable byte settles it: hex for the whole buffer. */
+      if (data[pos] < 0x20 || data[pos] >= 0x7f)
+	return format (s, "%U", format_hex_bytes, data, n_data_bytes);
+    }
+
+  vec_add (s, data, n_data_bytes);
+  return s;
+}
+
+/* Enable/on => 1; disable/off => 0.
+   Argument: int *result.  Returns 1 and stores the value on a match;
+   returns 0 and leaves *result untouched otherwise. */
+uword
+unformat_vlib_enable_disable (unformat_input_t * input, va_list * args)
+{
+  int *result = va_arg (*args, int *);
+
+  if (unformat (input, "enable") || unformat (input, "on"))
+    {
+      *result = 1;
+      return 1;
+    }
+
+  if (unformat (input, "disable") || unformat (input, "off"))
+    {
+      *result = 0;
+      return 1;
+    }
+
+  return 0;
+}
+
+/* rx/tx => VLIB_RX/VLIB_TX.
+   Argument: int *result.  Returns 1 and stores the value on a match;
+   returns 0 and leaves *result untouched otherwise. */
+uword
+unformat_vlib_rx_tx (unformat_input_t * input, va_list * args)
+{
+  int *result = va_arg (*args, int *);
+
+  if (unformat (input, "rx"))
+    {
+      *result = VLIB_RX;
+      return 1;
+    }
+
+  if (unformat (input, "tx"))
+    {
+      *result = VLIB_TX;
+      return 1;
+    }
+
+  return 0;
+}
+
+/* Parse an int either %d or 0x%x.
+   Argument: int *result.  The 0x-prefixed hex form is tried first, then
+   decimal.  Returns 1 when either form matched, 0 otherwise. */
+uword
+unformat_vlib_number (unformat_input_t * input, va_list * args)
+{
+  int *result = va_arg (*args, int *);
+
+  if (unformat (input, "0x%x", result))
+    return 1;
+
+  if (unformat (input, "%d", result))
+    return 1;
+
+  return 0;
+}
+
+/* Parse a-zA-Z0-9_ token and hash to value.
+   Arguments: uword *hash, int *result.  The token is looked up as
+   typed and, if that misses, again with a-z folded to upper case.
+   Returns 1 and stores the hashed value on a hit; returns 0 when no
+   token could be parsed or neither lookup matched. */
+uword
+unformat_vlib_number_by_name (unformat_input_t * input, va_list * args)
+{
+  uword *hash = va_arg (*args, uword *);
+  int *result = va_arg (*args, int *);
+  uword *hit;
+  u8 *token;
+  int k;
+
+  if (!unformat_user (input, unformat_token, "a-zA-Z0-9_", &token))
+    return 0;
+
+  /* Null terminate. */
+  if (vec_len (token) > 0 && token[vec_len (token) - 1] != 0)
+    vec_add1 (token, 0);
+
+  /* Exact match first. */
+  hit = hash_get_mem (hash, token);
+  if (!hit)
+    {
+      /* No luck: fold lower case to upper case and retry. */
+      for (k = 0; k < vec_len (token); k++)
+	{
+	  if (token[k] >= 'a' && token[k] <= 'z')
+	    token[k] = 'A' + token[k] - 'a';
+	}
+      hit = hash_get_mem (hash, token);
+    }
+
+  vec_free (token);
+
+  if (!hit)
+    return 0;
+
+  *result = hit[0];
+  return 1;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/format_funcs.h b/src/vlib/format_funcs.h
new file mode 100644
index 00000000..f60b8940
--- /dev/null
+++ b/src/vlib/format_funcs.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * format_funcs.h: VLIB formatting/unformating
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_format_h
+#define included_vlib_format_h
+
+/* Format vlib_rx_or_tx_t/vlib_read_or_write_t enum as string. */
+u8 *format_vlib_rx_tx (u8 * s, va_list * args);
+u8 *format_vlib_read_write (u8 * s, va_list * args);
+
+/* Formats buffer data as printable ascii or as hex. */
+u8 *format_vlib_buffer_data (u8 * s, va_list * args);
+
+/* Enable/on => 1; disable/off => 0. */
+uword unformat_vlib_enable_disable (unformat_input_t * input, va_list * args);
+
+/* rx/tx => VLIB_RX/VLIB_TX. */
+uword unformat_vlib_rx_tx (unformat_input_t * input, va_list * args);
+
+/* Parse a-zA-Z0-9_ token and hash to value. */
+uword unformat_vlib_number_by_name (unformat_input_t * input, va_list * args);
+
+/* Parse an int either %d or 0x%x. */
+uword unformat_vlib_number (unformat_input_t * input, va_list * args);
+
+/* Flag to format_vlib_*_header functions to tell them not to recurse
+   into the next layer's header.  For example, tells format_vlib_ethernet_header
+   not to format ip header. */
+#define FORMAT_VLIB_HEADER_NO_RECURSION (~0)
+
+#endif /* included_vlib_format_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/global_funcs.h b/src/vlib/global_funcs.h
new file mode 100644
index 00000000..9dd01fbf
--- /dev/null
+++ b/src/vlib/global_funcs.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * global_funcs.h: global data structure access functions
+ */
+
+#ifndef included_vlib_global_funcs_h_
+#define included_vlib_global_funcs_h_
+
+always_inline vlib_main_t *
+vlib_get_main (void)
+{
+  /* Per-thread lookup: index vlib_mains[] by the calling thread's index. */
+  vlib_main_t *this_vm = vlib_mains[vlib_get_thread_index ()];
+  ASSERT (this_vm);
+  return this_vm;
+}
+
+always_inline vlib_thread_main_t *
+vlib_get_thread_main (void)
+{
+  return &vlib_thread_main;	/* address of vlib_thread_main */
+}
+
+#endif /* included_vlib_global_funcs_h_ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/i2c.c b/src/vlib/i2c.c
new file mode 100644
index 00000000..97f5bb21
--- /dev/null
+++ b/src/vlib/i2c.c
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/i2c.h>
+
+static inline void
+i2c_delay (i2c_bus_t * b, f64 timeout)
+{
+  /* b is unused here; the delay is delegated to vlib_time_wait (). */
+  vlib_time_wait (vlib_get_main (), timeout);
+}
+
+static void
+i2c_wait_for_scl (i2c_bus_t * b)
+{
+  f64 waited;
+  int scl_level, sda_level;
+
+  /* Sample the lines once per rise_fall_time until SCL reads non-zero. */
+  for (waited = 0; waited < b->hold_time; waited += b->rise_fall_time)
+    {
+      i2c_delay (b, b->rise_fall_time);
+      b->get_bits (b, &scl_level, &sda_level);
+      if (scl_level)
+	return;
+    }
+
+  /* SCL never read non-zero within hold_time: flag the bus as timed out. */
+  b->timeout = 1;
+}
+
+static void
+i2c_start (i2c_bus_t * b)
+{
+  b->timeout = 0;		/* forget any failure from an earlier transfer */
+  /* Drive SCL=1, SDA=1, then wait until SCL actually reads non-zero. */
+  b->put_bits (b, 1, 1);
+  i2c_wait_for_scl (b);
+  /* i2c_wait_for_scl sets b->timeout when SCL never came up: give up. */
+  if (vlib_i2c_bus_timed_out (b))
+    return;
+  /* Start sequence: lower SDA while SCL is still 1, then lower SCL. */
+  b->put_bits (b, 1, 0);
+  i2c_delay (b, b->hold_time);
+  b->put_bits (b, 0, 0);
+  i2c_delay (b, b->hold_time);
+}
+
+static void
+i2c_stop (i2c_bus_t * b)
+{
+  b->put_bits (b, 0, 0);	/* begin with SCL=0, SDA=0 */
+  i2c_delay (b, b->rise_fall_time);
+  /* Stop sequence: raise SCL first while SDA is still 0... */
+  b->put_bits (b, 1, 0);
+  i2c_delay (b, b->hold_time);
+  /* ...then raise SDA while SCL is 1, leaving both lines at 1. */
+  b->put_bits (b, 1, 1);
+  i2c_delay (b, b->hold_time);
+}
+
+static void
+i2c_write_bit (i2c_bus_t * b, int sda)
+{
+  b->put_bits (b, 0, sda);	/* present the data bit while SCL is 0 */
+  i2c_delay (b, b->rise_fall_time);
+  /* Raise SCL; i2c_wait_for_scl sets b->timeout if SCL never reads 1. */
+  b->put_bits (b, 1, sda);
+  i2c_wait_for_scl (b);
+  i2c_delay (b, b->hold_time);
+  /* Lower SCL again with SDA unchanged. */
+  b->put_bits (b, 0, sda);
+  i2c_delay (b, b->rise_fall_time);
+}
+
+static void
+i2c_read_bit (i2c_bus_t * b, int *sda)
+{
+  int scl;			/* sampled together with SDA, not used further */
+  /* Drive SCL=1 with SDA=1, then wait for SCL to read non-zero. */
+  b->put_bits (b, 1, 1);
+  i2c_wait_for_scl (b);
+  i2c_delay (b, b->hold_time);
+  /* Sample the lines; the SDA level is handed back through *sda. */
+  b->get_bits (b, &scl, sda);
+  /* Lower SCL again, still driving SDA as 1. */
+  b->put_bits (b, 0, 1);
+  i2c_delay (b, b->rise_fall_time);
+}
+
+static void
+i2c_write_byte (i2c_bus_t * b, u8 data)
+{
+  int ack_level;
+  u8 mask;
+  /* Clock the byte out most-significant bit first; stop on a timeout. */
+  for (mask = 0x80; mask != 0; mask >>= 1)
+    {
+      i2c_write_bit (b, (data & mask) != 0);
+      if (b->timeout)
+	return;
+    }
+  /* Drive SCL=0, SDA=1, then read back the bit that follows the byte. */
+  b->put_bits (b, 0, 1);
+  i2c_delay (b, b->rise_fall_time);
+  i2c_read_bit (b, &ack_level);
+  /* A non-zero level here is reported as a failure via the timeout flag. */
+  if (ack_level)
+    b->timeout = 1;
+}
+
+
+static void
+i2c_read_byte (i2c_bus_t * b, u8 * data, int ack)
+{
+  int i, sda;
+
+  *data = 0;
+  /* Drive SCL=0, SDA=1 before sampling the incoming bits. */
+  b->put_bits (b, 0, 1);
+  i2c_delay (b, b->rise_fall_time);
+  /* Assemble the byte most-significant bit first. */
+  for (i = 7; i >= 0; i--)
+    {
+      i2c_read_bit (b, &sda);
+      if (b->timeout)
+	return;			/* *data keeps the bits gathered so far */
+
+      *data |= (sda != 0) << i;
+    }
+  /* Answer with SDA=0 when ack is non-zero, SDA=1 when ack is zero. */
+  i2c_write_bit (b, ack == 0);
+}
+
+
+void
+vlib_i2c_init (i2c_bus_t * b)
+{
+  f64 period;
+
+  /* Fill in defaults only for the fields the caller left at zero. */
+  b->clock = b->clock ? b->clock : 400000;
+  period = 1.0 / b->clock;
+
+  /* Default hold time: 40% of one clock period. */
+  if (b->hold_time == 0)
+    b->hold_time = 0.4 * period;
+
+  /* Default rise/fall wait: 10% of one clock period. */
+  if (b->rise_fall_time == 0)
+    b->rise_fall_time = 0.1 * period;
+}
+
+void
+vlib_i2c_xfer (i2c_bus_t * bus, i2c_msg_t * msgs)
+{
+  i2c_msg_t *msg;
+  int i, is_read;
+
+  /* Run every message of the msgs vector: start, address byte with the
+     R/W bit, then msg->len data bytes.  The first failure (reported via
+     bus->timeout) abandons the remaining work; a stop is always sent. */
+  vec_foreach (msg, msgs)
+  {
+    /* Derive the direction once so the R/W bit and data phase agree. */
+    is_read = (msg->flags & I2C_MSG_FLAG_READ) != 0;
+    i2c_start (bus);
+    i2c_write_byte (bus, (msg->addr << 1) + is_read);
+    if (bus->timeout)
+      goto done;		/* bus stuck or address byte not acknowledged */
+
+    for (i = 0; i < msg->len; i++)
+      {
+	if (is_read)
+	  i2c_read_byte (bus, &msg->buffer[i], /* ack */ i + 1 != msg->len);
+	else
+	  i2c_write_byte (bus, msg->buffer[i]);
+	if (bus->timeout)
+	  goto done;
+      }
+  }
+
+done:
+  i2c_stop (bus);
+}
+
+void
+vlib_i2c_read_eeprom (i2c_bus_t * bus, u8 i2c_addr, u16 start_addr,
+		      u16 length, u8 * data)
+{
+  i2c_msg_t *msg = 0;
+  u8 start_address[1];
+  /* Build a two-message vector; msg[0] and msg[1] are filled in below. */
+  vec_validate (msg, 1);
+  /* msg[0] writes one byte.  NOTE(review): start_addr is cut to 8 bits. */
+  start_address[0] = start_addr;
+  msg[0].addr = i2c_addr;
+  msg[0].flags = I2C_MSG_FLAG_WRITE;
+  msg[0].buffer = (u8 *) & start_address;
+  msg[0].len = 1;
+  /* msg[1] reads `length' bytes from the same device address into data. */
+  msg[1].addr = i2c_addr;
+  msg[1].flags = I2C_MSG_FLAG_READ;
+  msg[1].buffer = data;
+  msg[1].len = length;
+  /* Nothing is returned: a failure is only visible through bus->timeout. */
+  vlib_i2c_xfer (bus, msg);
+
+  vec_free (msg);
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/i2c.h b/src/vlib/i2c.h
new file mode 100644
index 00000000..b79bdc75
--- /dev/null
+++ b/src/vlib/i2c.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_vlib_i2c_h
+#define included_vlib_i2c_h
+
+#include <vppinfra/types.h>
+
+
+#define I2C_MSG_FLAG_WRITE  0
+#define I2C_MSG_FLAG_READ   1
+
+typedef struct
+{
+  u8 addr;			/* device address; sent shifted left by one */
+  u8 flags;			/* I2C_MSG_FLAG_WRITE or I2C_MSG_FLAG_READ */
+  u16 len;			/* number of bytes to transfer via buffer */
+  u8 *buffer;			/* bytes to write, or destination for reads */
+} i2c_msg_t;
+
+typedef struct i2c_bus_t
+{
+  void (*put_bits) (struct i2c_bus_t * b, int scl, int sda);
+  void (*get_bits) (struct i2c_bus_t * b, int *scl, int *sda);
+  /* The callbacks above set and sample the SCL and SDA line levels. */
+  int timeout;			/* non-zero after a failed transfer step */
+  u32 clock;			/* vlib_i2c_init: 400000 if left at zero */
+  f64 hold_time;		/* vlib_i2c_init: 0.4 / clock if left at zero */
+  f64 rise_fall_time;		/* vlib_i2c_init: 0.1 / clock if left at zero */
+
+  /* Private data */
+  uword private_data;
+
+} i2c_bus_t;
+
+void vlib_i2c_init (i2c_bus_t * bus);
+void vlib_i2c_xfer (i2c_bus_t * bus, i2c_msg_t * msgs);
+void vlib_i2c_read_eeprom (i2c_bus_t * bus, u8 i2c_addr, u16 start_addr,
+			   u16 length, u8 * data);
+
+static inline int
+vlib_i2c_bus_timed_out (i2c_bus_t * bus)
+{
+  return bus->timeout;		/* non-zero once a transfer step has failed */
+}
+
+#endif /* included_vlib_i2c_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/init.c b/src/vlib/init.c
new file mode 100644
index 00000000..8d478451
--- /dev/null
+++ b/src/vlib/init.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * init.c: mechanism for functions to be called at init/exit.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+
+clib_error_t *
+vlib_call_init_exit_functions (vlib_main_t * vm,
+			       _vlib_init_function_list_elt_t * head,
+			       int call_once)
+{
+  clib_error_t *error = 0;
+  _vlib_init_function_list_elt_t *i;
+
+  /* Call each registered function in list order; stop at the first error.
+     call_once != 0: skip functions already in vm->init_functions_called
+     and record the rest.  call_once == 0: call every function. */
+  for (i = head; i; i = i->next_init_function)
+    {
+      if (call_once && hash_get (vm->init_functions_called, i->f))
+	continue;
+      if (call_once)
+	hash_set1 (vm->init_functions_called, i->f);
+      error = i->f (vm);
+      if (error)
+	return error;
+    }
+  return error;
+}
+
+clib_error_t *
+vlib_call_all_init_functions (vlib_main_t * vm)
+{
+  /* Call dummy functions to make sure purely static modules are
+     linked in. */
+#define _(f) vlib_##f##_reference ();
+  foreach_vlib_module_reference;
+#undef _
+  /* Then walk the init registration list with call_once set. */
+  return vlib_call_init_exit_functions
+    (vm, vm->init_function_registrations, 1 /* call_once */ );
+}
+
+clib_error_t *
+vlib_call_all_main_loop_enter_functions (vlib_main_t * vm)
+{
+  return vlib_call_init_exit_functions	/* walk the main-loop-enter list */
+    (vm, vm->main_loop_enter_function_registrations, 1 /* call_once */ );
+}
+
+clib_error_t *
+vlib_call_all_main_loop_exit_functions (vlib_main_t * vm)
+{
+  return vlib_call_init_exit_functions	/* walk the main-loop-exit list */
+    (vm, vm->main_loop_exit_function_registrations, 1 /* call_once */ );
+}
+
+clib_error_t *
+vlib_call_all_config_functions (vlib_main_t * vm,
+				unformat_input_t * input, int is_early)
+{
+  clib_error_t *error = 0;
+  vlib_config_function_runtime_t *c, **all;
+  uword *hash = 0, *p;
+  uword i;
+
+  hash = hash_create_string (0, sizeof (uword));
+  all = 0;
+  /* Index every registration by name and give each an empty input. */
+  for (c = vm->config_function_registrations; c; c = c->next_registration)
+    {
+      hash_set_mem (hash, c->name, vec_len (all));
+      vec_add1 (all, c);
+      unformat_init (&c->input, 0, 0);
+    }
+  /* Route each "<name> <value>" pair to the named registration's input. */
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      /* Zeroed: a failed unformat () may leave either one unassigned. */
+      u8 *s = 0, *v = 0;
+
+      if (!unformat (input, "%s %v", &s, &v) || !(p = hash_get_mem (hash, s)))
+	{
+	  error = clib_error_create ("unknown input `%s %v'", s, v);
+	  vec_free (v);
+	  vec_free (s);
+	  goto done;
+	}
+
+      c = all[p[0]];
+      if (vec_len (c->input.buffer) > 0)
+	vec_add1 (c->input.buffer, ' ');
+      vec_add (c->input.buffer, v, vec_len (v));
+      vec_free (v);
+      vec_free (s);
+    }
+
+  for (i = 0; i < vec_len (all); i++)
+    {
+      c = all[i];
+
+      /* Is this an early config? Are we doing early configs? */
+      if (is_early ^ c->is_early)
+	continue;
+
+      /* Already called? */
+      if (hash_get (vm->init_functions_called, c->function))
+	continue;
+      hash_set1 (vm->init_functions_called, c->function);
+
+      error = c->function (vm, &c->input);
+      if (error)
+	goto done;
+    }
+
+done:
+  for (i = 0; i < vec_len (all); i++)
+    {
+      c = all[i];
+      unformat_free (&c->input);
+    }
+  vec_free (all);
+  hash_free (hash);
+  return error;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/init.h b/src/vlib/init.h
new file mode 100644
index 00000000..12db3f90
--- /dev/null
+++ b/src/vlib/init.h
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * init.h: mechanism for functions to be called at init/exit.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_init_h
+#define included_vlib_init_h
+
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/hash.h>
+
+/* Init/exit functions: called at start/end of main routine.  Init
+   functions are typically used to register and setup packet
+   processing nodes.  */
+
+typedef clib_error_t *(vlib_init_function_t) (struct vlib_main_t * vm);
+
+typedef struct _vlib_init_function_list_elt
+{
+  struct _vlib_init_function_list_elt *next_init_function;
+  vlib_init_function_t *f;
+} _vlib_init_function_list_elt_t;
+
+/* Configuration functions: called with configuration input just before
+   main polling loop starts. */
+typedef clib_error_t *(vlib_config_function_t) (struct vlib_main_t * vm,
+						unformat_input_t * input);
+
+typedef struct vlib_config_function_runtime_t
+{
+  /* Function to call.  Completed calls are recorded by function pointer in
+     vm->init_functions_called; init.c and init.h never null this field. */
+  vlib_config_function_t *function;
+  /* Input for function. */
+  unformat_input_t input;
+
+  /* next config function registration */
+  struct vlib_config_function_runtime_t *next_registration;
+
+  /* To be invoked as soon as the clib heap is available */
+  u8 is_early;
+
+  /* Name used to distinguish input on command line. */
+  char name[32];
+} vlib_config_function_runtime_t;
+
+#define _VLIB_INIT_FUNCTION_SYMBOL(x, type)	\
+  _vlib_##type##_function_##x
+
+#define VLIB_INIT_FUNCTION_SYMBOL(x)		\
+  _VLIB_INIT_FUNCTION_SYMBOL(x, init)
+#define VLIB_MAIN_LOOP_ENTER_FUNCTION_SYMBOL(x)		\
+  _VLIB_INIT_FUNCTION_SYMBOL(x, main_loop_enter)
+#define VLIB_MAIN_LOOP_EXIT_FUNCTION_SYMBOL(x)	\
+  _VLIB_INIT_FUNCTION_SYMBOL(x, main_loop_exit)
+#define VLIB_CONFIG_FUNCTION_SYMBOL(x)		\
+  _VLIB_INIT_FUNCTION_SYMBOL(x, config)
+
+/* Declaration is global (i.e. not static) so that init functions can
+   be called from other modules to resolve init function dependencies. */
+
+#define VLIB_DECLARE_INIT_FUNCTION(x, tag)                      \
+vlib_init_function_t * _VLIB_INIT_FUNCTION_SYMBOL (x, tag) = x; \
+static void __vlib_add_##tag##_function_##x (void)              \
+    __attribute__((__constructor__)) ;                          \
+static void __vlib_add_##tag##_function_##x (void)              \
+{                                                               \
+ vlib_main_t * vm = vlib_get_main();                            \
+ static _vlib_init_function_list_elt_t _vlib_init_function;     \
+ _vlib_init_function.next_init_function                         \
+    = vm->tag##_function_registrations;                         \
+  vm->tag##_function_registrations = &_vlib_init_function;      \
+ _vlib_init_function.f = &x;                                    \
+}
+
+#define VLIB_INIT_FUNCTION(x) VLIB_DECLARE_INIT_FUNCTION(x,init)
+#define VLIB_WORKER_INIT_FUNCTION(x) VLIB_DECLARE_INIT_FUNCTION(x,worker_init)
+
+#define VLIB_MAIN_LOOP_ENTER_FUNCTION(x) \
+  VLIB_DECLARE_INIT_FUNCTION(x,main_loop_enter)
+#define VLIB_MAIN_LOOP_EXIT_FUNCTION(x) \
+VLIB_DECLARE_INIT_FUNCTION(x,main_loop_exit)
+
+#define VLIB_CONFIG_FUNCTION(x,n,...)                           \
+    __VA_ARGS__ vlib_config_function_runtime_t                  \
+    VLIB_CONFIG_FUNCTION_SYMBOL(x);                             \
+static void __vlib_add_config_function_##x (void)               \
+    __attribute__((__constructor__)) ;                          \
+static void __vlib_add_config_function_##x (void)               \
+{                                                               \
+    vlib_main_t * vm = vlib_get_main();                         \
+    VLIB_CONFIG_FUNCTION_SYMBOL(x).next_registration            \
+       = vm->config_function_registrations;                     \
+    vm->config_function_registrations                           \
+       = &VLIB_CONFIG_FUNCTION_SYMBOL(x);                       \
+}                                                               \
+  vlib_config_function_runtime_t                                \
+    VLIB_CONFIG_FUNCTION_SYMBOL (x)                             \
+  = {                                                           \
+    .name = n,                                                  \
+    .function = x,                                              \
+    .is_early = 0,						\
+  }
+
+#define VLIB_EARLY_CONFIG_FUNCTION(x,n,...)                     \
+    __VA_ARGS__ vlib_config_function_runtime_t                  \
+    VLIB_CONFIG_FUNCTION_SYMBOL(x);                             \
+static void __vlib_add_config_function_##x (void)               \
+    __attribute__((__constructor__)) ;                          \
+static void __vlib_add_config_function_##x (void)               \
+{                                                               \
+    vlib_main_t * vm = vlib_get_main();                         \
+    VLIB_CONFIG_FUNCTION_SYMBOL(x).next_registration            \
+       = vm->config_function_registrations;                     \
+    vm->config_function_registrations                           \
+       = &VLIB_CONFIG_FUNCTION_SYMBOL(x);                       \
+}                                                               \
+  vlib_config_function_runtime_t                                \
+    VLIB_CONFIG_FUNCTION_SYMBOL (x)                             \
+  = {                                                           \
+    .name = n,                                                  \
+    .function = x,                                              \
+    .is_early = 1,						\
+  }
+
+/* Call given init function: used for init function dependencies. */
+#define vlib_call_init_function(vm, x)					\
+  ({									\
+    extern vlib_init_function_t * VLIB_INIT_FUNCTION_SYMBOL (x);	\
+    vlib_init_function_t * _f = VLIB_INIT_FUNCTION_SYMBOL (x);		\
+    clib_error_t * _error = 0;						\
+    if (! hash_get (vm->init_functions_called, _f))			\
+      {									\
+	hash_set1 (vm->init_functions_called, _f);			\
+	_error = _f (vm);						\
+      }									\
+    _error;								\
+  })
+
+/* Don't call given init function: used to suppress parts of the netstack */
+#define vlib_mark_init_function_complete(vm, x)				\
+  ({									\
+    extern vlib_init_function_t * VLIB_INIT_FUNCTION_SYMBOL (x);	\
+    vlib_init_function_t * _f = VLIB_INIT_FUNCTION_SYMBOL (x);		\
+    hash_set1 (vm->init_functions_called, _f);				\
+  })
+
+#define vlib_call_post_graph_init_function(vm, x)			\
+  ({									\
+    extern vlib_init_function_t * VLIB_POST_GRAPH_INIT_FUNCTION_SYMBOL (x); \
+    vlib_init_function_t * _f = VLIB_POST_GRAPH_INIT_FUNCTION_SYMBOL (x); \
+    clib_error_t * _error = 0;						\
+    if (! hash_get (vm->init_functions_called, _f))			\
+      {									\
+	hash_set1 (vm->init_functions_called, _f);			\
+	_error = _f (vm);						\
+      }									\
+    _error;								\
+  })
+
+#define vlib_call_config_function(vm, x)			\
+  ({								\
+    vlib_config_function_runtime_t * _r;			\
+    clib_error_t * _error = 0;					\
+    extern vlib_config_function_runtime_t			\
+      VLIB_CONFIG_FUNCTION_SYMBOL (x);				\
+								\
+    _r = &VLIB_CONFIG_FUNCTION_SYMBOL (x);			\
+    if (! hash_get (vm->init_functions_called, _r->function))	\
+      {								\
+        hash_set1 (vm->init_functions_called, _r->function);	\
+	_error = _r->function (vm, &_r->input);			\
+      }								\
+    _error;							\
+  })
+
+/* External functions. */
+clib_error_t *vlib_call_all_init_functions (struct vlib_main_t *vm);
+clib_error_t *vlib_call_all_config_functions (struct vlib_main_t *vm,
+					      unformat_input_t * input,
+					      int is_early);
+clib_error_t *vlib_call_all_main_loop_enter_functions (struct vlib_main_t
+						       *vm);
+clib_error_t *vlib_call_all_main_loop_exit_functions (struct vlib_main_t *vm);
+clib_error_t *vlib_call_init_exit_functions (struct vlib_main_t *vm,
+					     _vlib_init_function_list_elt_t *
+					     head, int call_once);
+
+#define foreach_vlib_module_reference		\
+  _ (node_cli)					\
+  _ (trace_cli)
+
+/* Dummy functions to get static modules such as node_cli.c linked in. */
+#define _(x) void vlib_##x##_reference (void);
+foreach_vlib_module_reference
+#undef _
+#endif /* included_vlib_init_h */
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/lex.c b/src/vlib/lex.c
new file mode 100644
index 00000000..1cc8f167
--- /dev/null
+++ b/src/vlib/lex.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vlib/vlib.h>
+#include <vlib/lex.h>
+
+vlib_lex_main_t vlib_lex_main;
+
+#define LEX_DEBUG 0
+
+u8 *
+format_vlib_lex_token (u8 * s, va_list * args)
+{
+  vlib_lex_main_t *lex = va_arg (*args, vlib_lex_main_t *);
+  vlib_lex_token_t *tok = va_arg (*args, vlib_lex_token_t *);
+
+  /* Words print their own text; any other token prints its name. */
+  if (tok->token != VLIB_LEX_word)
+    return format (s, "%s", lex->lex_token_names[tok->token]);
+
+  return format (s, "%s", tok->value.as_pointer);
+}
+
+void
+vlib_lex_get_token (vlib_lex_main_t * lm, vlib_lex_token_t * rv)
+{
+  u8 c;
+  vlib_lex_table_t *t;
+  vlib_lex_table_entry_t *e;
+  uword tv;
+  /* Tokens on the pushback stack are returned first, top entry first. */
+  if (PREDICT_FALSE (lm->pushback_sp >= 0))
+    {
+      rv[0] = lm->pushback_vector[lm->pushback_sp--];
+      return;
+    }
+
+  rv->value.as_uword = ~0;
+  /* Feed one input character at a time through the current table. */
+  while (1)
+    {
+      if (PREDICT_FALSE (lm->current_index >= vec_len (lm->input_vector)))
+	{
+	  rv->token = VLIB_LEX_eof;
+	  return;
+	}
+      /* The character is masked to 7 bits; a table has 128 entries. */
+      t = vec_elt_at_index (lm->lex_tables, lm->current_table_index);
+      c = (lm->input_vector[lm->current_index++]) & 0x7f;
+      e = &t->entries[c];
+      lm->current_table_index = e->next_table_index;
+
+      switch (e->action)
+	{
+	case VLIB_LEX_IGNORE:
+	  continue;
+	/* Number actions: the entry's token field holds the number base. */
+	case VLIB_LEX_START_NUMBER:
+	  lm->current_token_value = 0;
+	  /* fallthru */
+
+	case VLIB_LEX_ADD_TO_NUMBER:
+	  lm->current_number_base = e->token;
+	  lm->current_token_value *= lm->current_number_base;
+	  tv = c - '0';		/* else retry as 'A'.., then 'a'.. (10 and up) */
+	  if (tv >= lm->current_number_base)
+	    {
+	      tv = 10 + c - 'A';
+	      if (tv >= lm->current_number_base)
+		tv = 10 + c - 'a';
+	    }
+	  lm->current_token_value += tv;
+	  continue;
+
+	case VLIB_LEX_ADD_TO_TOKEN:
+	  vec_add1 (lm->token_buffer, c);
+	  continue;
+
+	case VLIB_LEX_KEYWORD_CHECK:
+	  {
+	    uword *p;
+	    /* NUL-terminate the collected characters for the hash lookup. */
+	    vec_add1 (lm->token_buffer, 0);
+
+	    /* It's either a keyword or just a word. */
+	    p = hash_get_mem (lm->lex_keywords, lm->token_buffer);
+	    if (p)
+	      {
+		rv->token = p[0];
+		if (LEX_DEBUG > 0)
+		  clib_warning ("keyword '%s' token %s",
+				lm->token_buffer,
+				lm->lex_token_names[rv->token]);
+	      }
+	    else
+	      {
+		/* it's a WORD: the token gets its own copy of the text */
+		rv->token = VLIB_LEX_word;
+		rv->value.as_pointer = vec_dup (lm->token_buffer);
+		if (LEX_DEBUG > 0)
+		  clib_warning ("%s, value '%s'",
+				lm->lex_token_names[VLIB_LEX_word],
+				rv->value.as_pointer);
+	      }
+	    _vec_len (lm->token_buffer) = 0;
+
+	    /* Rescan the character which terminated the keyword/word. */
+	    lm->current_index--;
+	    return;
+	  }
+
+	case VLIB_LEX_RETURN_AND_RESCAN:
+	  ASSERT (lm->current_index);
+	  lm->current_index--;
+	  /* note flow-through */
+
+	case VLIB_LEX_RETURN:
+	  rv->token = e->token;
+	  rv->value.as_uword = lm->current_token_value;
+	  lm->current_token_value = ~0;
+	  if (LEX_DEBUG > 0)
+	    {
+	      clib_warning
+		("table %s char '%c'(0x%02x) next table %s return %s",
+		 t->name, c, c, lm->lex_tables[e->next_table_index].name,
+		 lm->lex_token_names[e->token]);
+	      if (rv->token == VLIB_LEX_number)
+		clib_warning ("  numeric value 0x%x (%d)", rv->value,
+			      rv->value);
+	    }
+	  return;
+	}			/* unlisted actions loop on to the next character */
+    }
+}
+
+u16
+vlib_lex_add_token (vlib_lex_main_t * lm, char *token_name)
+{
+  uword *existing = hash_get_mem (lm->lex_tokens_by_name, token_name);
+  u16 new_index;
+
+  /* Known name: hand back the index it was given earlier. */
+  if (existing)
+    return existing[0];
+
+  /* New name: its index is its position in lex_token_names.  The
+     token_name pointer itself is stored, not a copy of the string. */
+  new_index = vec_len (lm->lex_token_names);
+  hash_set_mem (lm->lex_tokens_by_name, token_name, new_index);
+  vec_add1 (lm->lex_token_names, token_name);
+  return new_index;
+}
+
+static u16
+add_keyword (vlib_lex_main_t * lm, char *keyword, char *token_name)
+{
+  uword *existing = hash_get_mem (lm->lex_keywords, keyword);
+  u16 token_index;
+
+  /* The keyword must not be registered yet. */
+  ASSERT (existing == 0);
+
+  /* Find or create the token for token_name, then map the keyword to it. */
+  token_index = vlib_lex_add_token (lm, token_name);
+  hash_set_mem (lm->lex_keywords, keyword, token_index);
+
+  return token_index;
+}
+
+u16
+vlib_lex_find_or_add_keyword (vlib_lex_main_t * lm, char *keyword,
+			      char *token_name)
+{				/* reuse the keyword's token when it is already known */
+  uword *hit = hash_get_mem (lm->lex_keywords, keyword);
+  return hit == 0 ? add_keyword (lm, keyword, token_name) : hit[0];
+}
+
+void
+vlib_lex_set_action_range (u32 table_index, u8 lo, u8 hi, u16 action,
+			   u16 token, u32 next_table_index)
+{
+  int i;
+  vlib_lex_main_t *lm = &vlib_lex_main;
+  vlib_lex_table_t *t = pool_elt_at_index (lm->lex_tables, table_index);
+  /* Fill entries lo..hi (inclusive); never write past the entries array. */
+  ASSERT (lo > hi || hi < ARRAY_LEN (t->entries));
+  for (i = lo; i <= hi && i < ARRAY_LEN (t->entries); i++)
+    {
+      t->entries[i].action = action;
+      t->entries[i].token = token;
+      t->entries[i].next_table_index = next_table_index;
+    }
+}
+
+u16
+vlib_lex_add_table (char *name)
+{
+  vlib_lex_main_t *lm = &vlib_lex_main;
+  vlib_lex_table_t *table;
+  uword *clash = hash_get_mem (lm->lex_tables_by_name, name);
+  u32 index;
+
+  /* Table names must be unique. */
+  ASSERT (clash == 0);
+
+  /* Take a table from the pool and register it under its name.  The
+     name pointer is stored as given, not copied. */
+  pool_get_aligned (lm->lex_tables, table, CLIB_CACHE_LINE_BYTES);
+  index = table - lm->lex_tables;
+  table->name = name;
+  hash_set_mem (lm->lex_tables_by_name, name, index);
+
+  /* Default actions: ignore characters 1..0x7F and stay in this table... */
+  vlib_lex_set_action_range (index, 1, 0x7F, VLIB_LEX_IGNORE, ~0, index);
+  /* ...and return an eof token on character 0. */
+  vlib_lex_set_action_range (index, 0, 0, VLIB_LEX_RETURN, VLIB_LEX_eof,
+			     index);
+  return index;
+}
+
+void
+vlib_lex_reset (vlib_lex_main_t * lm, u8 * input_vector)
+{
+  if (lm->pushback_vector)
+    _vec_len (lm->pushback_vector) = 0;	/* drop pushed-back tokens */
+  lm->pushback_sp = -1;
+  /* Lex the new input from index 0; current_table_index is not reset. */
+  lm->input_vector = input_vector;
+  lm->current_index = 0;
+}
+
+static clib_error_t *
+lex_onetime_init (vlib_main_t * vm)
+{
+  vlib_lex_main_t *lm = &vlib_lex_main;
+
+  lm->lex_tables_by_name = hash_create_string (0, sizeof (uword));
+  lm->lex_tokens_by_name = hash_create_string (0, sizeof (uword));
+  lm->lex_keywords = hash_create_string (0, sizeof (uword));
+  lm->pushback_sp = -1;
+  /* Add the global tokens; each index must equal its VLIB_LEX_ value. */
+#define _(f) { u16 tmp = vlib_lex_add_token (lm, #f); ASSERT (tmp == VLIB_LEX_##f); }
+  foreach_vlib_lex_global_token;
+#undef _
+  /* Make indices 0..127 of the token buffer valid, then mark it empty. */
+  vec_validate (lm->token_buffer, 127);
+  _vec_len (lm->token_buffer) = 0;
+
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (lex_onetime_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/lex.h b/src/vlib/lex.h
new file mode 100644
index 00000000..4ae58f46
--- /dev/null
+++ b/src/vlib/lex.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_vlib_lex_h
+#define included_vlib_lex_h
+
+#include <vppinfra/hash.h>
+#include <vppinfra/bitmap.h>
+#include <vppinfra/error.h>
+#include <vppinfra/pool.h>
+
+#define foreach_vlib_lex_global_token           \
+  _ (invalid)                                   \
+  _ (eof)                                       \
+  _ (word)                                      \
+  _ (number)                                    \
+  _ (lt)                                        \
+  _ (gt)                                        \
+  _ (dot)                                       \
+  _ (slash)                                     \
+  _ (qmark)                                     \
+  _ (equals)                                    \
+  _ (plus)                                      \
+  _ (minus)                                     \
+  _ (star)                                      \
+  _ (lpar)                                      \
+  _ (rpar)
+
+typedef enum
+{
+#define _(f) VLIB_LEX_##f,
+  foreach_vlib_lex_global_token
+#undef _
+} vlib_lex_global_token_t;
+
+typedef enum
+{
+  VLIB_LEX_IGNORE,		/* consume the character, keep scanning */
+  VLIB_LEX_ADD_TO_TOKEN,	/* append the character to token_buffer */
+  VLIB_LEX_RETURN,		/* return the entry's token */
+  VLIB_LEX_RETURN_AND_RESCAN,	/* back up one character, then return */
+  VLIB_LEX_KEYWORD_CHECK,	/* end of word: keyword lookup, then rescan */
+  VLIB_LEX_START_NUMBER,	/* zero the value, then add this digit */
+  VLIB_LEX_ADD_TO_NUMBER,	/* value = value * base + this digit */
+} vlib_lex_action_t;
+
+typedef struct
+{
+  u16 action;			/* a vlib_lex_action_t value */
+  u16 next_table_index;		/* table used for the following character */
+  u16 token;			/* returned token, or the number base */
+} vlib_lex_table_entry_t;
+
+typedef struct
+{
+  char *name;
+  vlib_lex_table_entry_t entries[128];
+} vlib_lex_table_t;
+
+typedef struct
+{
+  u32 token;
+
+  union
+  {
+    uword as_uword;
+    void *as_pointer;
+    char *as_string;
+  } value;
+} vlib_lex_token_t;
+
+typedef struct
+{
+  vlib_lex_table_t *lex_tables;
+  uword *lex_tables_by_name;
+
+  /* Vector of token strings. */
+  char **lex_token_names;
+
+  /* Hash mapping c string name to token index. */
+  uword *lex_tokens_by_name;
+
+  /* Hash mapping c string keyword name to token index. */
+  uword *lex_keywords;
+
+  vlib_lex_token_t *pushback_vector;
+
+  i32 pushback_sp;
+
+  u32 current_table_index;
+
+  uword current_token_value;
+
+  uword current_number_base;
+
+  /* Input string we are lex-ing. */
+  u8 *input_vector;
+
+  /* Current index into input vector. */
+  u32 current_index;
+
+  /* Re-used vector for forming token strings and hashing them. */
+  u8 *token_buffer;
+} vlib_lex_main_t;
+
+extern vlib_lex_main_t vlib_lex_main;	/* defined once, in lex.c */
+
+always_inline void
+vlib_lex_cleanup_token (vlib_lex_token_t * t)
+{
+  u8 *word;
+  if (t->token != VLIB_LEX_word)
+    return;			/* only word tokens carry a vector to free */
+  word = t->value.as_pointer;
+  vec_free (word);		/* t->value itself is left unchanged */
+}
+
+u16 vlib_lex_add_table (char *name);
+void vlib_lex_get_token (vlib_lex_main_t * lm, vlib_lex_token_t * result);
+u16 vlib_lex_add_token (vlib_lex_main_t * lm, char *token_name);
+void vlib_lex_set_action_range (u32 table_index, u8 lo, u8 hi, u16 action,
+				u16 token, u32 next_table_index);
+void vlib_lex_reset (vlib_lex_main_t * lm, u8 * input_vector);
+format_function_t format_vlib_lex_token;
+
+#endif /* included_vlib_lex_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/linux/pci.c b/src/vlib/linux/pci.c
new file mode 100644
index 00000000..790f168a
--- /dev/null
+++ b/src/vlib/linux/pci.c
@@ -0,0 +1,666 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * pci.c: Linux user space PCI bus management.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vppinfra/linux/sysfs.h>
+
+#include <vlib/vlib.h>
+#include <vlib/pci/pci.h>
+#include <vlib/unix/unix.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
+
+/* Linux-specific state kept for one PCI device.  The file descriptor fields
+   hold 0 while unset: linux_pci_device_free () only closes values > 0. */
+typedef struct
+{
+  /* /sys/bus/pci/devices/... directory name for this device. */
+  u8 *dev_dir_name;
+
+  /* Resource file descriptors, a vector indexed by resource number (filled
+     in by os_map_pci_resource_internal ()). */
+  int *resource_fds;
+
+  /* File descriptor for config space read/write. */
+  int config_fd;
+
+  /* File descriptor for /dev/uio%d */
+  int uio_fd;
+
+  /* Minor device for uio device. */
+  u32 uio_minor;
+
+  /* Index given by clib_file_add. */
+  u32 clib_file_index;
+
+} linux_pci_device_t;
+
+/* Pool of PCI devices. */
+typedef struct
+{
+  /* NOTE(review): no code in this file assigns this field -- confirm whether
+     it is still needed. */
+  vlib_main_t *vlib_main;
+  /* Pool of per-device Linux state; add_device () stores an element's index
+     in vlib_pci_device_t.os_handle. */
+  linux_pci_device_t *linux_pci_devices;
+} linux_pci_main_t;
+
+/* Global Linux PCI state; defined a few lines further down in this file. */
+extern linux_pci_main_t linux_pci_main;
+
+/* Call to allocate/initialize the pci subsystem.
+   This is not an init function so that users can explicitly enable
+   pci only when it's needed.
+   NOTE(review): this file contains no definition of pci_bus_init --
+   confirm one exists elsewhere. */
+clib_error_t *pci_bus_init (vlib_main_t * vm);
+
+clib_error_t *vlib_pci_bind_to_uio (vlib_pci_device_t * d,
+				    char *uio_driver_name);
+
+/* The single instance of the Linux PCI state. */
+linux_pci_main_t linux_pci_main;
+
+/*
+ * Rebind a PCI device to a userspace I/O driver through sysfs.
+ *
+ * d                device to rebind; its bus_address, vendor_id and
+ *                  device_id are read.
+ * uio_driver_name  name of the target driver below /sys/bus/pci/drivers.
+ *
+ * Returns 0 without touching the device when it is already bound to
+ * vfio-pci, uio_pci_generic or igb_uio.  Returns an error, again without
+ * touching the device, when /sys/class/net cannot be opened, the probe
+ * socket cannot be created, an interface's flags cannot be fetched, or a
+ * host interface whose ethtool bus info equals the device's PCI address is
+ * administratively up.  Otherwise the unbind / driver_override (or new_id) /
+ * bind sequence is written and 0 is returned; the outcome of the individual
+ * sysfs writes is not checked.
+ */
+clib_error_t *
+vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name)
+{
+  clib_error_t *error = 0;
+  u8 *s = 0, *driver_name = 0;
+  DIR *dir = 0;
+  struct dirent *e;
+  int fd = -1, clear_driver_override = 0;
+  u8 *dev_dir_name = format (0, "/sys/bus/pci/devices/%U",
+			     format_vlib_pci_addr, &d->bus_address);
+
+  s = format (s, "%v/driver%c", dev_dir_name, 0);
+  driver_name = clib_sysfs_link_to_name ((char *) s);
+  vec_reset_length (s);
+
+  /* Nothing to do when a userspace I/O driver already owns the device. */
+  if (driver_name &&
+      ((strcmp ("vfio-pci", (char *) driver_name) == 0) ||
+       (strcmp ("uio_pci_generic", (char *) driver_name) == 0) ||
+       (strcmp ("igb_uio", (char *) driver_name) == 0)))
+    goto done;
+
+  /* Walk through all linux interfaces and, if an interface belonging to
+     this device is found, check whether that interface is admin up. */
+  dir = opendir ("/sys/class/net");
+  s = format (s, "%U%c", format_vlib_pci_addr, &d->bus_address, 0);
+
+  if (!dir)
+    {
+      error = clib_error_return (0, "Skipping PCI device %U: failed to "
+				 "read /sys/class/net",
+				 format_vlib_pci_addr, &d->bus_address);
+      goto done;
+    }
+
+  fd = socket (PF_INET, SOCK_DGRAM, 0);
+  if (fd < 0)
+    {
+      error = clib_error_return_unix (0, "socket");
+      goto done;
+    }
+
+  while ((e = readdir (dir)))
+    {
+      struct ifreq ifr;
+      struct ethtool_drvinfo drvinfo;
+
+      if (e->d_name[0] == '.')	/* skip . and .. */
+	continue;
+
+      memset (&ifr, 0, sizeof ifr);
+      memset (&drvinfo, 0, sizeof drvinfo);
+      ifr.ifr_data = (char *) &drvinfo;
+      strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1);
+      drvinfo.cmd = ETHTOOL_GDRVINFO;
+      if (ioctl (fd, SIOCETHTOOL, &ifr) < 0)
+	{
+	  /* Some interfaces (eg "lo") don't support this ioctl */
+	  if ((errno != ENOTSUP) && (errno != ENODEV))
+	    clib_unix_warning ("ioctl fetch intf %s bus info error",
+			       e->d_name);
+	  continue;
+	}
+
+      /* s holds this device's PCI address; skip interfaces on other
+         devices. */
+      if (strcmp ((char *) s, drvinfo.bus_info))
+	continue;
+
+      memset (&ifr, 0, sizeof (ifr));
+      strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1);
+      if (ioctl (fd, SIOCGIFFLAGS, &ifr) < 0)
+	{
+	  error = clib_error_return_unix (0, "ioctl fetch intf %s flags",
+					  e->d_name);
+	  goto done;
+	}
+
+      if (ifr.ifr_flags & IFF_UP)
+	{
+	  error = clib_error_return (0, "Skipping PCI device %U as host "
+				     "interface %s is up",
+				     format_vlib_pci_addr, &d->bus_address,
+				     e->d_name);
+	  goto done;
+	}
+    }
+
+  /* The probe socket is no longer needed once the walk is over. */
+  close (fd);
+  fd = -1;
+  vec_reset_length (s);
+
+  s = format (s, "%v/driver/unbind%c", dev_dir_name, 0);
+  clib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address);
+  vec_reset_length (s);
+
+  /* Use driver_override when the device directory offers it, otherwise fall
+     back to registering the vendor/device id with the target driver. */
+  s = format (s, "%v/driver_override%c", dev_dir_name, 0);
+  if (access ((char *) s, F_OK) == 0)
+    {
+      clib_sysfs_write ((char *) s, "%s", uio_driver_name);
+      clear_driver_override = 1;
+    }
+  else
+    {
+      vec_reset_length (s);
+      s = format (s, "/sys/bus/pci/drivers/%s/new_id%c", uio_driver_name, 0);
+      clib_sysfs_write ((char *) s, "0x%04x 0x%04x", d->vendor_id,
+			d->device_id);
+    }
+  vec_reset_length (s);
+
+  s = format (s, "/sys/bus/pci/drivers/%s/bind%c", uio_driver_name, 0);
+  clib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address);
+  vec_reset_length (s);
+
+  if (clear_driver_override)
+    {
+      s = format (s, "%v/driver_override%c", dev_dir_name, 0);
+      clib_sysfs_write ((char *) s, "%c", 0);
+      vec_reset_length (s);
+    }
+
+done:
+  if (fd >= 0)
+    close (fd);
+  /* dir is still NULL when we leave before opendir () or when opendir ()
+     failed; closedir () must not be handed a NULL stream. */
+  if (dir)
+    closedir (dir);
+  vec_free (s);
+  vec_free (dev_dir_name);
+  vec_free (driver_name);
+  return error;
+}
+
+
+/*
+ * foreach_directory_file () callback used by add_device ().
+ *
+ * arg is the linux_pci_device_t being set up.  file_name must parse as
+ * "uio%d"; the number is stored in uio_minor.  Any other name aborts the
+ * process.  path_name is not used.  Always returns 0.
+ */
+static clib_error_t *
+scan_uio_dir (void *arg, u8 * path_name, u8 * file_name)
+{
+  linux_pci_device_t *pdev = arg;
+  unformat_input_t in;
+  uword matched;
+
+  unformat_init_string (&in, (char *) file_name, vec_len (file_name));
+  matched = unformat (&in, "uio%d", &pdev->uio_minor);
+  if (!matched)
+    abort ();
+  unformat_free (&in);
+
+  return 0;
+}
+
+/*
+ * Read callback registered by add_device () for a device's uio descriptor.
+ *
+ * Consumes four bytes from the descriptor (the value and the read result are
+ * both ignored), calls the interrupt handler of the device whose pci_devs
+ * index is stored in uf->private_data, if it has one, and then calls
+ * vlib_pci_intr_enable () on that device.  Always returns 0.
+ */
+static clib_error_t *
+linux_pci_uio_read_ready (clib_file_t * uf)
+{
+  u32 irq_count;
+  int __attribute__ ((unused)) n_read;
+  vlib_pci_device_t *dev;
+
+  n_read = read (uf->file_descriptor, &irq_count, 4);
+
+  dev = pool_elt_at_index (pci_main.pci_devs, uf->private_data);
+  if (dev->interrupt_handler)
+    dev->interrupt_handler (dev);
+
+  vlib_pci_intr_enable (dev);
+
+  /* no error */
+  return 0;
+}
+
+static clib_error_t *
+linux_pci_uio_error_ready (clib_file_t * uf)
+{
+  u32 error_index = (u32) uf->private_data;
+
+  return clib_error_return (0, "pci device %d: error", error_index);
+}
+
+/*
+ * Create the Linux-side state for dev and register its uio descriptor.
+ *
+ * A pool element of linux_pci_main.linux_pci_devices is initialised from
+ * *pdev (with its own copy of dev_dir_name) and its index is stored in
+ * dev->os_handle.  The device's "uio" sysfs directory is scanned to learn
+ * the uio minor number, /dev/uio<minor> is opened read/write, and the
+ * descriptor is registered through clib_file_add () with
+ * linux_pci_uio_read_ready / linux_pci_uio_error_ready as callbacks and the
+ * pci_devs index of dev as private data.
+ */
+static void
+add_device (vlib_pci_device_t * dev, linux_pci_device_t * pdev)
+{
+  vlib_pci_main_t *pm = &pci_main;
+  linux_pci_main_t *lpm = &linux_pci_main;
+  linux_pci_device_t *l;
+
+  pool_get (lpm->linux_pci_devices, l);
+  l[0] = pdev[0];
+
+  /* The struct copy shares pdev's dev_dir_name vector; take a private one. */
+  l->dev_dir_name = vec_dup (l->dev_dir_name);
+
+  dev->os_handle = l - lpm->linux_pci_devices;
+
+  {
+    /* dev_dir_name is a vector without a trailing NUL (it is printed with
+       %v and measured with vec_len () elsewhere in this file), so format it
+       with %v and terminate the result explicitly before it is used as a C
+       string. */
+    u8 *uio_dir = format (0, "%v/uio%c", l->dev_dir_name, 0);
+    foreach_directory_file ((char *) uio_dir, scan_uio_dir, l,	/* scan_dirs */
+			    1);
+    vec_free (uio_dir);
+  }
+
+  {
+    char *uio_name = (char *) format (0, "/dev/uio%d%c", l->uio_minor, 0);
+    l->uio_fd = open (uio_name, O_RDWR);
+    if (l->uio_fd < 0)
+      clib_unix_error ("open `%s'", uio_name);
+    vec_free (uio_name);
+  }
+
+  {
+    clib_file_t template = { 0 };
+
+    template.read_function = linux_pci_uio_read_ready;
+    template.file_descriptor = l->uio_fd;
+    template.error_function = linux_pci_uio_error_ready;
+    /* The callbacks use this to find dev again in pm->pci_devs. */
+    template.private_data = dev - pm->pci_devs;
+
+    l->clib_file_index = clib_file_add (&file_main, &template);
+  }
+}
+
+/* Release what a linux_pci_device_t owns: every descriptor with a value > 0
+   (resource, config and uio) is closed, then the resource_fds and
+   dev_dir_name vectors are freed.  The struct itself is not freed. */
+static void
+linux_pci_device_free (linux_pci_device_t * l)
+{
+  int n_fds = vec_len (l->resource_fds);
+  int idx = 0;
+
+  while (idx < n_fds)
+    {
+      int res_fd = l->resource_fds[idx++];
+      if (res_fd > 0)
+	close (res_fd);
+    }
+
+  if (l->config_fd > 0)
+    close (l->config_fd);
+
+  if (l->uio_fd > 0)
+    close (l->uio_fd);
+
+  vec_free (l->resource_fds);
+  vec_free (l->dev_dir_name);
+}
+
+/*
+ * Configuration space read/write.
+ *
+ * Transfers n_bytes between data and the device's config_fd at byte offset
+ * address, reading when read_or_write is VLIB_READ and writing otherwise.
+ * Returns 0 when exactly n_bytes were transferred, else a unix error whose
+ * text is "read" or "write".
+ */
+clib_error_t *
+vlib_pci_read_write_config (vlib_pci_device_t * dev,
+			    vlib_read_or_write_t read_or_write,
+			    uword address, void *data, u32 n_bytes)
+{
+  const int is_read = (read_or_write == VLIB_READ);
+  linux_pci_device_t *ldev;
+  int n_done;
+
+  ldev = pool_elt_at_index (linux_pci_main.linux_pci_devices, dev->os_handle);
+
+  n_done = is_read
+    ? pread (ldev->config_fd, data, n_bytes, address)
+    : pwrite (ldev->config_fd, data, n_bytes, address);
+
+  if (n_done == n_bytes)
+    return 0;
+
+  return clib_error_return_unix (0, "%s", is_read ? "read" : "write");
+}
+
+/*
+ * Map the sysfs file <dev_dir_name>/resource<resource> read/write.
+ *
+ * os_handle  index into linux_pci_main.linux_pci_devices.
+ * resource   number appended to "resource" to form the file name.
+ * addr       0 to let the kernel pick the address; otherwise the mapping is
+ *            requested at exactly this address (MAP_FIXED).
+ * result     receives the value returned by mmap ().
+ *
+ * The whole file, as sized by fstat (), is mapped MAP_SHARED.  On success
+ * the descriptor stays open and is remembered in resource_fds[resource], so
+ * linux_pci_device_free () closes it later.  On failure the descriptor is
+ * closed, nothing is recorded and an error is returned.
+ */
+static clib_error_t *
+os_map_pci_resource_internal (uword os_handle,
+			      u32 resource, u8 * addr, void **result)
+{
+  linux_pci_main_t *pm = &linux_pci_main;
+  linux_pci_device_t *p;
+  struct stat stat_buf;
+  u8 *file_name;
+  int fd;
+  clib_error_t *error;
+  int flags = MAP_SHARED;
+
+  error = 0;
+  p = pool_elt_at_index (pm->linux_pci_devices, os_handle);
+
+  file_name = format (0, "%v/resource%d%c", p->dev_dir_name, resource, 0);
+  fd = open ((char *) file_name, O_RDWR);
+  if (fd < 0)
+    {
+      error = clib_error_return_unix (0, "open `%s'", file_name);
+      goto done;
+    }
+
+  if (fstat (fd, &stat_buf) < 0)
+    {
+      error = clib_error_return_unix (0, "fstat `%s'", file_name);
+      goto done;
+    }
+
+  if (addr != 0)
+    flags |= MAP_FIXED;
+
+  *result = mmap (addr,
+		  /* size */ stat_buf.st_size,
+		  PROT_READ | PROT_WRITE, flags,
+		  /* file */ fd,
+		  /* offset */ 0);
+  if (*result == MAP_FAILED)
+    {
+      error = clib_error_return_unix (0, "mmap `%s'", file_name);
+      goto done;
+    }
+
+  /* Record the descriptor only once the mapping exists.  Recording it
+     earlier left a closed descriptor in resource_fds when mmap () failed,
+     which linux_pci_device_free () would then close a second time. */
+  vec_validate (p->resource_fds, resource);
+  p->resource_fds[resource] = fd;
+
+done:
+  if (error)
+    {
+      if (fd >= 0)
+	close (fd);
+    }
+  vec_free (file_name);
+  return error;
+}
+
+/* Map resource number `resource' of dev, with no address requested (addr is
+   passed as 0).  See os_map_pci_resource_internal () for result and the
+   returned error. */
+clib_error_t *
+vlib_pci_map_resource (vlib_pci_device_t * dev, u32 resource, void **result)
+{
+  u8 *no_addr = 0;
+
+  return os_map_pci_resource_internal (dev->os_handle, resource, no_addr,
+				       result);
+}
+
+/* Map resource number `resource' of dev at the caller-supplied address addr.
+   See os_map_pci_resource_internal () for result and the returned error. */
+clib_error_t *
+vlib_pci_map_resource_fixed (vlib_pci_device_t * dev,
+			     u32 resource, u8 * addr, void **result)
+{
+  uword handle = dev->os_handle;
+
+  return os_map_pci_resource_internal (handle, resource, addr, result);
+}
+
+/* Release the Linux-side state of dev: the element at dev->os_handle has its
+   descriptors and vectors freed by linux_pci_device_free () and is then
+   returned to the linux_pci_devices pool. */
+void
+vlib_pci_free_device (vlib_pci_device_t * dev)
+{
+  linux_pci_main_t *lpm = &linux_pci_main;
+  linux_pci_device_t *ldev =
+    pool_elt_at_index (lpm->linux_pci_devices, dev->os_handle);
+
+  linux_pci_device_free (ldev);
+  pool_put (lpm->linux_pci_devices, ldev);
+}
+
+/*
+ * Step from registration r to the following one.
+ *
+ * Counts the supported_devices entries of r that precede the entry with
+ * vendor_id 0 and passes their total byte size, together with r, to
+ * clib_elf_section_data_next ().
+ */
+pci_device_registration_t * __attribute__ ((unused))
+pci_device_next_registered (pci_device_registration_t * r)
+{
+  uword n_ids = 0;
+
+  /* Null vendor id marks end of initialized list. */
+  while (r->supported_devices[n_ids].vendor_id != 0)
+    n_ids++;
+
+  return clib_elf_section_data_next (r,
+				     n_ids *
+				     sizeof (r->supported_devices[0]));
+}
+
+/*
+ * Look dev up among the registered PCI drivers and bring it up on a match.
+ *
+ * Every registration's supported_devices list (ended by vendor_id 0) is
+ * searched for dev's vendor/device id pair.  On a match the device is bound
+ * to "uio_pci_generic"; when that fails the error is reported and the search
+ * continues with the next id.  Otherwise the Linux device state is created
+ * by add_device (), the registration's interrupt handler is installed, and
+ * the result of its init_function is returned.
+ *
+ * When no registration takes the device, pdev->config_fd is closed and 0 is
+ * returned.
+ */
+static clib_error_t *
+init_device_from_registered (vlib_main_t * vm,
+			     vlib_pci_device_t * dev,
+			     linux_pci_device_t * pdev)
+{
+  pci_device_registration_t *reg;
+  pci_device_id_t *id;
+  clib_error_t *bind_error;
+
+  for (reg = pci_main.pci_device_registrations; reg;
+       reg = reg->next_registration)
+    for (id = reg->supported_devices; id->vendor_id != 0; id++)
+      {
+	if (id->vendor_id != dev->vendor_id
+	    || id->device_id != dev->device_id)
+	  continue;
+
+	bind_error = vlib_pci_bind_to_uio (dev, "uio_pci_generic");
+	if (bind_error)
+	  {
+	    clib_error_report (bind_error);
+	    continue;
+	  }
+
+	add_device (dev, pdev);
+	dev->interrupt_handler = reg->interrupt_handler;
+	return reg->init_function (vm, dev);
+      }
+
+  /* No driver, close the PCI config-space FD */
+  close (pdev->config_fd);
+  return 0;
+}
+
+/* Initialise a freshly scanned device; currently nothing more than a call to
+   init_device_from_registered (), whose result is returned. */
+static clib_error_t *
+init_device (vlib_main_t * vm,
+	     vlib_pci_device_t * dev, linux_pci_device_t * pdev)
+{
+  clib_error_t *result;
+
+  result = init_device_from_registered (vm, dev, pdev);
+  return result;
+}
+
+/*
+ * foreach_directory_file () callback: probe one PCI device directory.
+ *
+ * arg           the vlib_main_t.
+ * dev_dir_name  vector holding "/sys/bus/pci/devices/<address>"; the process
+ *               aborts if it does not parse as such.
+ * ignored       unused.
+ *
+ * Opens the device's config file (read/write, else read-only), allocates a
+ * vlib_pci_device_t in pci_main.pci_devs, reads and byte-swaps the config
+ * header, records the device in pci_dev_index_by_pci_addr, collects the VPD
+ * fields and the numa_node / class / vendor / device sysfs attributes, hands
+ * the device to init_device () and finally records the bound driver name.
+ *
+ * Returns the error of the failing open/read, an "invalid PCI config" error
+ * when the header is all ones, or whatever init_device () returned.
+ */
+static clib_error_t *
+scan_device (void *arg, u8 * dev_dir_name, u8 * ignored)
+{
+  vlib_main_t *vm = arg;
+  vlib_pci_main_t *pm = &pci_main;
+  int fd;
+  u8 *f;
+  clib_error_t *error = 0;
+  vlib_pci_device_t *dev;
+  linux_pci_device_t pdev = { 0 };
+  u32 tmp;
+
+  f = format (0, "%v/config%c", dev_dir_name, 0);
+  fd = open ((char *) f, O_RDWR);
+
+  /* Try read-only access if write fails. */
+  if (fd < 0)
+    fd = open ((char *) f, O_RDONLY);
+
+  if (fd < 0)
+    {
+      error = clib_error_return_unix (0, "open `%s'", f);
+      goto done;
+    }
+
+  pool_get (pm->pci_devs, dev);
+
+  /* You can only read more than 64 bytes of config space as root; so we try
+     to read the full space but fall back to just the first 64 bytes.  Both
+     attempts read from offset 0: a short first read with read () would have
+     advanced the file position, so the fallback would no longer start at
+     the beginning of config space. */
+  if (pread (fd, &dev->config_data, sizeof (dev->config_data), 0) !=
+      sizeof (dev->config_data)
+      && pread (fd, &dev->config0,
+		sizeof (dev->config0), 0) != sizeof (dev->config0))
+    {
+      pool_put (pm->pci_devs, dev);
+      error = clib_error_return_unix (0, "read `%s'", f);
+      close (fd);
+      goto done;
+    }
+
+  {
+    /* Reference header of all-one bytes, filled in on first use. */
+    static pci_config_header_t all_ones;
+    if (all_ones.vendor_id == 0)
+      memset (&all_ones, ~0, sizeof (all_ones));
+
+    if (!memcmp (&dev->config0.header, &all_ones, sizeof (all_ones)))
+      {
+	pool_put (pm->pci_devs, dev);
+	error = clib_error_return (0, "invalid PCI config for `%s'", f);
+	close (fd);
+	goto done;
+      }
+  }
+
+  if (dev->config0.header.header_type == 0)
+    pci_config_type0_little_to_host (&dev->config0);
+  else
+    pci_config_type1_little_to_host (&dev->config1);
+
+  /* Parse bus, dev, function from directory name. */
+  {
+    unformat_input_t input;
+
+    unformat_init_string (&input, (char *) dev_dir_name,
+			  vec_len (dev_dir_name));
+
+    if (!unformat (&input, "/sys/bus/pci/devices/%U",
+		   unformat_vlib_pci_addr, &dev->bus_address))
+      abort ();
+
+    unformat_free (&input);
+
+  }
+
+
+  /* pdev only borrows dev_dir_name; add_device () makes its own copy. */
+  pdev.config_fd = fd;
+  pdev.dev_dir_name = dev_dir_name;
+
+  hash_set (pm->pci_dev_index_by_pci_addr, dev->bus_address.as_u32,
+	    dev - pm->pci_devs);
+
+  /* Collect VPD fields: each one is a 3-byte tag (id, 16-bit little-endian
+     length) followed by that many bytes.  Stop at the first unknown id or
+     short read. */
+  vec_reset_length (f);
+  f = format (f, "%v/vpd%c", dev_dir_name, 0);
+  fd = open ((char *) f, O_RDONLY);
+  if (fd >= 0)
+    {
+      while (1)
+	{
+	  u8 tag[3];
+	  u8 *data = 0;
+	  int len;
+
+	  if (read (fd, &tag, 3) != 3)
+	    break;
+
+	  if (tag[0] != 0x82 && tag[0] != 0x90 && tag[0] != 0x91)
+	    break;
+
+	  len = (tag[2] << 8) | tag[1];
+	  vec_validate (data, len);
+
+	  if (read (fd, data, len) != len)
+	    {
+	      vec_free (data);
+	      break;
+	    }
+	  if (tag[0] == 0x82)
+	    dev->product_name = data;
+	  else if (tag[0] == 0x90)
+	    dev->vpd_r = data;
+	  else if (tag[0] == 0x91)
+	    dev->vpd_w = data;
+
+	  data = 0;
+	}
+      close (fd);
+    }
+
+  dev->numa_node = -1;
+  vec_reset_length (f);
+  f = format (f, "%v/numa_node%c", dev_dir_name, 0);
+  clib_sysfs_read ((char *) f, "%u", &dev->numa_node);
+
+  /* tmp is cleared before each read below so that a failed sysfs read
+     yields 0 rather than an indeterminate or stale value. */
+  vec_reset_length (f);
+  f = format (f, "%v/class%c", dev_dir_name, 0);
+  tmp = 0;
+  clib_sysfs_read ((char *) f, "0x%x", &tmp);
+  dev->device_class = tmp >> 8;
+
+  vec_reset_length (f);
+  f = format (f, "%v/vendor%c", dev_dir_name, 0);
+  tmp = 0;
+  clib_sysfs_read ((char *) f, "0x%x", &tmp);
+  dev->vendor_id = tmp;
+
+  vec_reset_length (f);
+  f = format (f, "%v/device%c", dev_dir_name, 0);
+  tmp = 0;
+  clib_sysfs_read ((char *) f, "0x%x", &tmp);
+  dev->device_id = tmp;
+
+  error = init_device (vm, dev, &pdev);
+
+  /* Read the driver link after init_device (), which may have rebound the
+     device. */
+  vec_reset_length (f);
+  f = format (f, "%v/driver%c", dev_dir_name, 0);
+  dev->driver_name = clib_sysfs_link_to_name ((char *) f);
+
+done:
+  vec_free (f);
+  return error;
+}
+
+/*
+ * Init function of the Linux PCI layer.
+ *
+ * Stores vm in pci_main, runs unix_input_init first (returning its error if
+ * it fails), creates the pci-address-to-device-index hash and calls
+ * scan_device () for the entries of /sys/bus/pci/devices.  A scan error is
+ * reported and then also returned.
+ *
+ * NOTE(review): the original comment says "complain and continue", yet the
+ * error is still returned after clib_error_report () -- confirm that the
+ * report does not consume the error and that failing init is intended.
+ */
+clib_error_t *
+linux_pci_init (vlib_main_t * vm)
+{
+  clib_error_t *err;
+
+  pci_main.vlib_main = vm;
+
+  err = vlib_call_init_function (vm, unix_input_init);
+  if (err)
+    return err;
+
+  ASSERT (sizeof (vlib_pci_addr_t) == sizeof (u32));
+  pci_main.pci_dev_index_by_pci_addr = hash_create (0, sizeof (uword));
+
+  err = foreach_directory_file ("/sys/bus/pci/devices", scan_device, vm,
+				/* scan_dirs */ 0);
+
+  /* Complain and continue. might not be root, etc. */
+  if (err)
+    clib_error_report (err);
+
+  return err;
+}
+
+VLIB_INIT_FUNCTION (linux_pci_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/linux/physmem.c b/src/vlib/linux/physmem.c
new file mode 100644
index 00000000..6d3f7c55
--- /dev/null
+++ b/src/vlib/linux/physmem.c
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * physmem.c: Unix physical memory
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <sys/mman.h>
+#include <sys/fcntl.h>
+#include <sys/stat.h>
+
+#include <vppinfra/linux/syscall.h>
+#include <vppinfra/linux/sysfs.h>
+#include <vlib/vlib.h>
+#include <vlib/physmem.h>
+#include <vlib/unix/unix.h>
+
+/*
+ * Carve n_bytes out of the mheap of physmem region idx.
+ *
+ * The alignment is raised to at least CLIB_CACHE_LINE_BYTES.  Unless the
+ * region carries VLIB_PHYSMEM_F_FAKE, the first and last byte of the block
+ * must lie in the same page (1 << log2_page_size bytes); blocks that do not
+ * are kept allocated while the search goes on and are handed back to the
+ * heap once it ends.
+ *
+ * Returns a pointer to the block, or 0 when the region has no heap or the
+ * heap could not satisfy the request.
+ */
+static void *
+unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+			    uword n_bytes, uword alignment)
+{
+  vlib_physmem_region_t *region = vlib_physmem_get_region (vm, idx);
+  uword *rejected = 0;
+  uword first, last, k;
+
+  if (region->heap == 0)
+    return 0;
+
+  /* IO memory is always at least cache aligned. */
+  alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES);
+
+  for (;;)
+    {
+      mheap_get_aligned (region->heap, n_bytes, alignment,
+			 /* align offset */ 0, &first);
+
+      /* Stop when the heap failed; fake regions accept any block. */
+      if (first == ~0 || (region->flags & VLIB_PHYSMEM_F_FAKE))
+	break;
+
+      /* Make sure allocation does not span DMA physical chunk boundary. */
+      last = first + n_bytes - 1;
+      if ((first >> region->log2_page_size) ==
+	  (last >> region->log2_page_size))
+	break;
+
+      /* Spans a boundary: remember it so it can be freed after the loop. */
+      vec_add1 (rejected, first);
+    }
+
+  for (k = 0; k < vec_len (rejected); k++)
+    mheap_put (region->heap, rejected[k]);
+  vec_free (rejected);
+
+  if (first == ~0)
+    return 0;
+
+  return region->heap + first;
+}
+
+/* Give the block at x back to the mheap of region idx; the heap is addressed
+   by x's byte offset from the heap base. */
+static void
+unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x)
+{
+  vlib_physmem_region_t *region = vlib_physmem_get_region (vm, idx);
+  uword offset = x - region->heap;
+
+  mheap_put (region->heap, offset);
+}
+
+/*
+ * Allocate a new physmem region.
+ *
+ * name       region name; copied into pr->name.
+ * size       requested size in bytes.
+ * numa_node  node the memory should live on.
+ * flags      VLIB_PHYSMEM_F_* bits.  Without VLIB_PHYSMEM_F_FAKE the caller
+ *            must have effective uid 0 and the memory is requested as
+ *            preallocated huge pages forced onto numa_node; with it the node
+ *            is only a preference.  VLIB_PHYSMEM_F_INIT_MHEAP creates an
+ *            mheap inside the region, VLIB_PHYSMEM_F_HAVE_BUFFERS announces
+ *            the range through vlib_buffer_add_mem_range ().
+ * idx        receives the index of the new region on success.
+ *
+ * Returns 0 on success.  On failure ("not allowed", region index 256 or
+ * above, or an error from clib_mem_vm_ext_alloc ()) the pool slot is zeroed
+ * and released again and *idx is left untouched.
+ */
+static clib_error_t *
+unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size,
+			   u8 numa_node, u32 flags,
+			   vlib_physmem_region_index_t * idx)
+{
+  vlib_physmem_main_t *vpm = &vm->physmem_main;
+  vlib_physmem_region_t *pr;
+  clib_error_t *error = 0;
+  clib_mem_vm_alloc_t alloc = { 0 };
+
+
+  if (geteuid () != 0 && (flags & VLIB_PHYSMEM_F_FAKE) == 0)
+    return clib_error_return (0, "not allowed");
+
+  pool_get (vpm->regions, pr);
+
+  if ((pr - vpm->regions) >= 256)
+    {
+      error = clib_error_return (0, "maximum number of regions reached");
+      goto error;
+    }
+
+  alloc.name = name;
+  alloc.size = size;
+  alloc.numa_node = numa_node;
+  alloc.flags = CLIB_MEM_VM_F_SHARED;
+
+  if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
+    {
+      alloc.flags |= CLIB_MEM_VM_F_HUGETLB;
+      alloc.flags |= CLIB_MEM_VM_F_HUGETLB_PREALLOC;
+      alloc.flags |= CLIB_MEM_VM_F_NUMA_FORCE;
+    }
+  else
+    {
+      alloc.flags |= CLIB_MEM_VM_F_NUMA_PREFER;
+    }
+
+  error = clib_mem_vm_ext_alloc (&alloc);
+  if (error)
+    goto error;
+
+  pr->index = pr - vpm->regions;
+  pr->flags = flags;
+  pr->fd = alloc.fd;
+  pr->mem = alloc.addr;
+  pr->log2_page_size = alloc.log2_page_size;
+  pr->n_pages = alloc.n_pages;
+  pr->size = (u64) pr->n_pages << (u64) pr->log2_page_size;
+  /* Shift in 64 bits so that page sizes of 2GB and above do not overflow
+     int. */
+  pr->page_mask = ((u64) 1 << pr->log2_page_size) - 1;
+  pr->numa_node = numa_node;
+  pr->name = format (0, "%s", name);
+
+  if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
+    {
+      int i;
+      for (i = 0; i < pr->n_pages; i++)
+	{
+	  /* Widen before shifting: as an int, i << log2_page_size overflows
+	     once the page offset reaches 2GB. */
+	  void *ptr = pr->mem + ((uword) i << pr->log2_page_size);
+	  /* Stays -1 when move_pages () fails without reporting a node. */
+	  int node = -1;
+	  move_pages (0, 1, &ptr, 0, &node, 0);
+	  if (numa_node != node)
+	    {
+	      /* Print the caller's C string: pr->name is a vector without a
+	         trailing NUL and must not be passed to %s. */
+	      clib_warning ("physmem page for region \'%s\' allocated on the"
+			    " wrong numa node (requested %u actual %u)",
+			    name, pr->numa_node, node);
+	      break;
+	    }
+	}
+      pr->page_table = clib_mem_vm_get_paddr (pr->mem, pr->log2_page_size,
+					      pr->n_pages);
+    }
+
+  if (flags & VLIB_PHYSMEM_F_INIT_MHEAP)
+    {
+      pr->heap = mheap_alloc_with_flags (pr->mem, pr->size,
+					 /* Don't want mheap mmap/munmap with IO memory. */
+					 MHEAP_FLAG_DISABLE_VM |
+					 MHEAP_FLAG_THREAD_SAFE);
+    }
+
+  if (flags & VLIB_PHYSMEM_F_HAVE_BUFFERS)
+    {
+      vlib_buffer_add_mem_range (vm, pointer_to_uword (pr->mem), pr->size);
+    }
+
+  *idx = pr->index;
+
+  goto done;
+
+error:
+  memset (pr, 0, sizeof (*pr));
+  pool_put (vpm->regions, pr);
+
+done:
+  return error;
+}
+
+/*
+ * Release physmem region idx: close its descriptor (if > 0), unmap its
+ * memory, free its name and return the slot to the region pool.
+ *
+ * The slot is zeroed before it goes back to the pool, as the error path of
+ * unix_physmem_region_alloc () does.  unix_physmem_region_alloc () only sets
+ * heap and page_table for some flag combinations, so a reused slot would
+ * otherwise keep pointers into the memory unmapped here.
+ *
+ * NOTE(review): page_table is not released here -- confirm whether
+ * clib_mem_vm_get_paddr () returns storage the region owns.
+ */
+static void
+unix_physmem_region_free (vlib_main_t * vm, vlib_physmem_region_index_t idx)
+{
+  vlib_physmem_main_t *vpm = &vm->physmem_main;
+  vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
+
+  if (pr->fd > 0)
+    close (pr->fd);
+  munmap (pr->mem, pr->size);
+  vec_free (pr->name);
+  memset (pr, 0, sizeof (*pr));
+  pool_put (vpm->regions, pr);
+}
+
+/* Install the unix physmem callbacks (aligned alloc, free, region alloc,
+   region free) in vm.  Does nothing when os_physmem_alloc_aligned is already
+   set.  Always returns 0. */
+clib_error_t *
+unix_physmem_init (vlib_main_t * vm)
+{
+  /* Avoid multiple calls. */
+  if (!vm->os_physmem_alloc_aligned)
+    {
+      vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned;
+      vm->os_physmem_free = unix_physmem_free;
+      vm->os_physmem_region_alloc = unix_physmem_region_alloc;
+      vm->os_physmem_region_free = unix_physmem_region_free;
+    }
+
+  return 0;
+}
+
+/*
+ * CLI handler for "show physmem".
+ *
+ * Prints one line per region in vm->physmem_main.regions (index, name, page
+ * size in KB, page count, numa node, fd) followed by a verbose dump of the
+ * region's mheap, or "no heap" when it has none.  input and cmd are not
+ * used.  Always returns 0.
+ */
+static clib_error_t *
+show_physmem (vlib_main_t * vm,
+	      unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  vlib_physmem_main_t *vpm = &vm->physmem_main;
+  vlib_physmem_region_t *pr;
+
+  /* pr->name is a vector built with format (0, "%s", name) and carries no
+     trailing NUL, so it is printed with %v rather than %s. */
+  /* *INDENT-OFF* */
+  pool_foreach (pr, vpm->regions, (
+    {
+      vlib_cli_output (vm, "index %u name '%v' page-size %uKB num-pages %d "
+		       "numa-node %u fd %d\n",
+		       pr->index, pr->name, (1 << (pr->log2_page_size -10)),
+		       pr->n_pages, pr->numa_node, pr->fd);
+      if (pr->heap)
+	vlib_cli_output (vm, "  %U", format_mheap, pr->heap, /* verbose */ 1);
+      else
+	vlib_cli_output (vm, "  no heap\n");
+    }));
+  /* *INDENT-ON* */
+  return 0;
+}
+
+/* Registers the "show physmem" CLI command, handled by show_physmem (). */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_physmem_command, static) = {
+  .path = "show physmem",
+  .short_help = "Show physical memory allocation",
+  .function = show_physmem,
+};
+/* *INDENT-ON* */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/main.c b/src/vlib/main.c
new file mode 100644
index 00000000..7875f62a
--- /dev/null
+++ b/src/vlib/main.c
@@ -0,0 +1,1816 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * main.c: main vector processing loop
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <math.h>
+#include <vppinfra/format.h>
+#include <vlib/vlib.h>
+#include <vlib/threads.h>
+#include <vppinfra/tw_timer_1t_3w_1024sl_ov.h>
+
+#include <vlib/unix/unix.h>
+#include <vlib/unix/cj.h>
+
+CJ_GLOBAL_LOG_PROTOTYPE;
+
+/* Actually allocate a few extra slots of vector data to support
+   speculative vector enqueues which overflow vector data in next frame. */
+/* NOTE(review): the frame sizing code visible below uses
+   VLIB_FRAME_SIZE + VLIB_FRAME_SIZE_EXTRA rather than this macro;
+   confirm whether VLIB_FRAME_SIZE_ALLOC is still needed. */
+#define VLIB_FRAME_SIZE_ALLOC (VLIB_FRAME_SIZE + 4)
+
+/* NOTE(review): global counter with no reader or writer in the visible
+   part of this file -- presumably used elsewhere; confirm. */
+u32 wraps;
+
+/* Compute the allocation size in bytes of a frame whose scalar area is
+   N_SCALAR_BYTES long and whose vector elements are N_VECTOR_BYTES each.
+   Layout: frame header plus scalar arguments, then the vector data, then
+   a 32-bit magic word; the total is rounded up to a cache line. */
+always_inline u32
+vlib_frame_bytes (u32 n_scalar_bytes, u32 n_vector_bytes)
+{
+  /* Extra vector slots allocated past VLIB_FRAME_SIZE so that
+     speculative enqueues may overflow into them. */
+#define VLIB_FRAME_SIZE_EXTRA 4
+
+  /* Guard value stored in the first 32-bit word after the vector data;
+     used to make sure that vector data is never overrun. */
+#define VLIB_FRAME_MAGIC (0xabadc0ed)
+
+  /* vlib_frame_t plus scalar arguments. */
+  u32 header_bytes = vlib_frame_vector_byte_offset (n_scalar_bytes);
+
+  /* Vector arguments, including the speculative-enqueue slack. */
+  u32 vector_bytes =
+    (VLIB_FRAME_SIZE + VLIB_FRAME_SIZE_EXTRA) * n_vector_bytes;
+
+  /* Trailing magic number. */
+  u32 magic_bytes = sizeof (u32);
+
+  u32 total_bytes = header_bytes + vector_bytes + magic_bytes;
+
+  /* Pad to cache line. */
+  return round_pow2 (total_bytes, CLIB_CACHE_LINE_BYTES);
+}
+
+/* Return the address of the magic word of frame F, computed from NODE's
+   scalar and vector sizes: it sits right after the last vector slot
+   (VLIB_FRAME_SIZE + VLIB_FRAME_SIZE_EXTRA slots in total). */
+always_inline u32 *
+vlib_frame_find_magic (vlib_frame_t * f, vlib_node_t * node)
+{
+  u8 *cursor = (u8 *) f;
+
+  /* Skip the frame header and scalar arguments. */
+  cursor += vlib_frame_vector_byte_offset (node->scalar_size);
+
+  /* Skip every vector slot, including the extra ones. */
+  cursor += (VLIB_FRAME_SIZE + VLIB_FRAME_SIZE_EXTRA) * node->vector_size;
+
+  return (u32 *) cursor;
+}
+
+/* Look up the frame size class for the given scalar/vector byte sizes,
+   creating a new entry at the end of nm->frame_sizes when none exists.
+   The hash key packs the scalar size above bit 16 and the vector size
+   below it.  Returns a pointer into nm->frame_sizes. */
+static vlib_frame_size_t *
+get_frame_size_info (vlib_node_main_t * nm,
+		     u32 n_scalar_bytes, u32 n_vector_bytes)
+{
+  uword key = (n_scalar_bytes << 16) | n_vector_bytes;
+  uword *found = hash_get (nm->frame_size_hash, key);
+  uword slot;
+
+  if (found)
+    return vec_elt_at_index (nm->frame_sizes, found[0]);
+
+  /* First request for this size pair: append a size class and
+     remember its index under KEY. */
+  slot = vec_len (nm->frame_sizes);
+  vec_validate (nm->frame_sizes, slot);
+  hash_set (nm->frame_size_hash, key, slot);
+
+  return vec_elt_at_index (nm->frame_sizes, slot);
+}
+
+/* Allocate a frame sized for node TO_NODE_INDEX and return its index.
+   The frame is taken from the end of the matching size class's free
+   list when that list is non-empty, otherwise fresh aligned memory is
+   allocated.  The frame's magic word is written, its flags are set to
+   VLIB_FRAME_IS_ALLOCATED | FRAME_FLAGS, n_vectors is reset to 0, the
+   scalar/vector sizes are recorded and the size class's allocated-frame
+   counter is incremented. */
+static u32
+vlib_frame_alloc_to_node (vlib_main_t * vm, u32 to_node_index,
+			  u32 frame_flags)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_frame_size_t *fs;
+  vlib_node_t *to_node;
+  vlib_frame_t *f;
+  u32 fi, l, n, scalar_size, vector_size;
+
+  to_node = vlib_get_node (vm, to_node_index);
+
+  scalar_size = to_node->scalar_size;
+  vector_size = to_node->vector_size;
+
+  fs = get_frame_size_info (nm, scalar_size, vector_size);
+  n = vlib_frame_bytes (scalar_size, vector_size);
+  if ((l = vec_len (fs->free_frame_indices)) > 0)
+    {
+      /* Allocate from end of free list. */
+      fi = fs->free_frame_indices[l - 1];
+      f = vlib_get_frame_no_check (vm, fi);
+      _vec_len (fs->free_frame_indices) = l - 1;
+    }
+  else
+    {
+      f = clib_mem_alloc_aligned_no_fail (n, VLIB_FRAME_ALIGN);
+      fi = vlib_frame_index_no_check (vm, f);
+    }
+
+  /* Poison frame when debugging. */
+  /* Must happen before the magic and header fields are written below,
+     since it overwrites all N bytes of the frame. */
+  if (CLIB_DEBUG > 0)
+    memset (f, 0xfe, n);
+
+  /* Insert magic number. */
+  {
+    u32 *magic;
+
+    magic = vlib_frame_find_magic (f, to_node);
+    *magic = VLIB_FRAME_MAGIC;
+  }
+
+  f->flags = VLIB_FRAME_IS_ALLOCATED | frame_flags;
+  f->n_vectors = 0;
+  f->scalar_size = scalar_size;
+  f->vector_size = vector_size;
+
+  fs->n_alloc_frames += 1;
+
+  return fi;
+}
+
+/* Allocate a frame going from the node behind FROM_NODE_RUNTIME to the
+   node reached through its next slot TO_NEXT_INDEX.
+   Returns the index of the new frame (allocated with no extra flags). */
+static u32
+vlib_frame_alloc (vlib_main_t * vm, vlib_node_runtime_t * from_node_runtime,
+		  u32 to_next_index)
+{
+  vlib_node_t *src = vlib_get_node (vm, from_node_runtime->node_index);
+  u32 dst_node_index;
+
+  ASSERT (to_next_index < vec_len (src->next_nodes));
+  dst_node_index = src->next_nodes[to_next_index];
+
+  return vlib_frame_alloc_to_node (vm, dst_node_index, /* frame_flags */ 0);
+}
+
+/* Allocate a frame destined for node TO_NODE_INDEX and return a pointer
+   to it.  The frame is flagged VLIB_FRAME_FREE_AFTER_DISPATCH. */
+vlib_frame_t *
+vlib_get_frame_to_node (vlib_main_t * vm, u32 to_node_index)
+{
+  u32 frame_index;
+
+  frame_index = vlib_frame_alloc_to_node (vm, to_node_index,
+					  VLIB_FRAME_FREE_AFTER_DISPATCH);
+
+  return vlib_get_frame (vm, frame_index);
+}
+
+/* Queue frame F for dispatch to node TO_NODE_INDEX by appending an entry
+   to the pending-frame vector.  Frames holding no vectors are ignored.
+   The pending entry carries no next frame
+   (VLIB_PENDING_FRAME_NO_NEXT_FRAME). */
+void
+vlib_put_frame_to_node (vlib_main_t * vm, u32 to_node_index, vlib_frame_t * f)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_pending_frame_t *pending;
+  vlib_node_t *dst;
+
+  /* Nothing to dispatch. */
+  if (f->n_vectors == 0)
+    return;
+
+  dst = vlib_get_node (vm, to_node_index);
+
+  vec_add2 (nm->pending_frames, pending, 1);
+
+  f->flags |= VLIB_FRAME_PENDING;
+
+  pending->frame_index = vlib_frame_index (vm, f);
+  pending->node_runtime_index = dst->runtime_index;
+  pending->next_frame_index = VLIB_PENDING_FRAME_NO_NEXT_FRAME;
+}
+
+/* Free given frame.
+   R is the runtime of the node whose scalar/vector sizes select the
+   size class that F is returned to.  F must currently be flagged
+   VLIB_FRAME_IS_ALLOCATED; the flag is cleared, the frame index is
+   appended to the size class's free list and the allocated-frame
+   counter is decremented. */
+void
+vlib_frame_free (vlib_main_t * vm, vlib_node_runtime_t * r, vlib_frame_t * f)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_node_t *node;
+  vlib_frame_size_t *fs;
+  u32 frame_index;
+
+  /* Catch double frees: the frame must still be marked allocated. */
+  ASSERT (f->flags & VLIB_FRAME_IS_ALLOCATED);
+
+  node = vlib_get_node (vm, r->node_index);
+  fs = get_frame_size_info (nm, node->scalar_size, node->vector_size);
+
+  frame_index = vlib_frame_index (vm, f);
+
+  /* No next frames may point to freed frame. */
+  if (CLIB_DEBUG > 0)
+    {
+      vlib_next_frame_t *nf;
+      vec_foreach (nf, vm->node_main.next_frames)
+	ASSERT (nf->frame_index != frame_index);
+    }
+
+  f->flags &= ~VLIB_FRAME_IS_ALLOCATED;
+
+  vec_add1 (fs->free_frame_indices, frame_index);
+  ASSERT (fs->n_alloc_frames > 0);
+  fs->n_alloc_frames -= 1;
+}
+
+/* CLI handler for "show vlib frame-allocation".
+   Prints a header and then, for every frame size class that has at
+   least one allocated or free frame, the class's position in
+   nm->frame_sizes together with its allocated and free frame counts.
+   INPUT and CMD are not used.  Always returns 0. */
+static clib_error_t *
+show_frame_stats (vlib_main_t * vm,
+		  unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_frame_size_t *fs;
+
+  vlib_cli_output (vm, "%=6s%=12s%=12s", "Size", "# Alloc", "# Free");
+  vec_foreach (fs, nm->frame_sizes)
+  {
+    u32 n_alloc = fs->n_alloc_frames;
+    u32 n_free = vec_len (fs->free_frame_indices);
+
+    /* The pointer difference is wider than int on 64-bit targets;
+       cast it so the argument matches the plain %d conversion. */
+    if (n_alloc + n_free > 0)
+      vlib_cli_output (vm, "%=6d%=12d%=12d",
+		       (int) (fs - nm->frame_sizes), n_alloc, n_free);
+  }
+
+  return 0;
+}
+
+/* Registers the "show vlib frame-allocation" CLI path with
+   show_frame_stats as handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_frame_stats_cli, static) = {
+  .path = "show vlib frame-allocation",
+  .short_help = "Show node dispatch frame statistics",
+  .function = show_frame_stats,
+};
+/* *INDENT-ON* */
+
+/* Change ownership of enqueue rights to given next node.
+   NODE_RUNTIME/NEXT_INDEX identify the next-frame slot that wants to
+   enqueue.  If the target node already has an owner, the contents of
+   the owner's next-frame slot and of this slot are swapped, and any
+   pending frame entries that refer to the swapped-in frame are pointed
+   at this slot.  If there was no owner, this slot simply gets the
+   VLIB_FRAME_OWNER flag.  In both cases the target node records this
+   node and NEXT_INDEX as its new owner. */
+static void
+vlib_next_frame_change_ownership (vlib_main_t * vm,
+				  vlib_node_runtime_t * node_runtime,
+				  u32 next_index)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_next_frame_t *next_frame;
+  vlib_node_t *node, *next_node;
+
+  node = vec_elt (nm->nodes, node_runtime->node_index);
+
+  /* Only internal & input nodes are allowed to call other nodes. */
+  /* (The assertion below also admits process nodes.) */
+  ASSERT (node->type == VLIB_NODE_TYPE_INTERNAL
+	  || node->type == VLIB_NODE_TYPE_INPUT
+	  || node->type == VLIB_NODE_TYPE_PROCESS);
+
+  ASSERT (vec_len (node->next_nodes) == node_runtime->n_next_nodes);
+
+  next_frame =
+    vlib_node_runtime_get_next_frame (vm, node_runtime, next_index);
+  next_node = vec_elt (nm->nodes, node->next_nodes[next_index]);
+
+  if (next_node->owner_node_index != VLIB_INVALID_NODE_INDEX)
+    {
+      /* Get frame from previous owner. */
+      vlib_next_frame_t *owner_next_frame;
+      vlib_next_frame_t tmp;
+
+      owner_next_frame =
+	vlib_node_get_next_frame (vm,
+				  next_node->owner_node_index,
+				  next_node->owner_next_index);
+
+      /* Swap target next frame with owner's. */
+      tmp = owner_next_frame[0];
+      owner_next_frame[0] = next_frame[0];
+      next_frame[0] = tmp;
+
+      /*
+       * If next_frame is already pending, we have to track down
+       * all pending frames and fix their next_frame_index fields.
+       */
+      if (next_frame->flags & VLIB_FRAME_PENDING)
+	{
+	  vlib_pending_frame_t *p;
+	  if (next_frame->frame_index != ~0)
+	    {
+	      vec_foreach (p, nm->pending_frames)
+	      {
+		if (p->frame_index == next_frame->frame_index)
+		  {
+		    p->next_frame_index =
+		      next_frame - vm->node_main.next_frames;
+		  }
+	      }
+	    }
+	}
+    }
+  else
+    {
+      /* No previous owner. Take ownership. */
+      next_frame->flags |= VLIB_FRAME_OWNER;
+    }
+
+  /* Record new owner. */
+  next_node->owner_node_index = node->index;
+  next_node->owner_next_index = next_index;
+
+  /* Now we should be owner. */
+  ASSERT (next_frame->flags & VLIB_FRAME_OWNER);
+}
+
+/* Assert that frame F still carries its magic word.
+   The magic location is derived from the sizes of the node reached
+   through slot NEXT_INDEX of node N.  A missing magic word most likely
+   means the caller has overrun the frame arguments. */
+always_inline void
+validate_frame_magic (vlib_main_t * vm,
+		      vlib_frame_t * f, vlib_node_t * n, uword next_index)
+{
+  u32 target_node_index = n->next_nodes[next_index];
+  vlib_node_t *target_node = vlib_get_node (vm, target_node_index);
+  u32 *guard_word = vlib_frame_find_magic (f, target_node);
+
+  ASSERT (guard_word[0] == VLIB_FRAME_MAGIC);
+}
+
+/* Return the frame that NODE should enqueue into for next slot
+   NEXT_INDEX.
+   Takes over enqueue ownership of the slot if needed, allocates a frame
+   when the slot has none, and recycles a frame whose dispatch has
+   completed.  A replacement frame is allocated when the current one is
+   full, or when ALLOCATE_NEW_NEXT_FRAME is non-zero and the current
+   frame already holds vectors.  In debug builds the frame's magic word
+   is validated before returning. */
+vlib_frame_t *
+vlib_get_next_frame_internal (vlib_main_t * vm,
+			      vlib_node_runtime_t * node,
+			      u32 next_index, u32 allocate_new_next_frame)
+{
+  vlib_frame_t *f;
+  vlib_next_frame_t *nf;
+  u32 n_used;
+
+  nf = vlib_node_runtime_get_next_frame (vm, node, next_index);
+
+  /* Make sure this next frame owns right to enqueue to destination frame. */
+  if (PREDICT_FALSE (!(nf->flags & VLIB_FRAME_OWNER)))
+    vlib_next_frame_change_ownership (vm, node, next_index);
+
+  /* ??? Don't need valid flag: can use frame_index == ~0 */
+  if (PREDICT_FALSE (!(nf->flags & VLIB_FRAME_IS_ALLOCATED)))
+    {
+      nf->frame_index = vlib_frame_alloc (vm, node, next_index);
+      nf->flags |= VLIB_FRAME_IS_ALLOCATED;
+    }
+
+  f = vlib_get_frame (vm, nf->frame_index);
+
+  /* Has frame been removed from pending vector (e.g. finished dispatching)?
+     If so we can reuse frame. */
+  if ((nf->flags & VLIB_FRAME_PENDING) && !(f->flags & VLIB_FRAME_PENDING))
+    {
+      nf->flags &= ~VLIB_FRAME_PENDING;
+      f->n_vectors = 0;
+    }
+
+  /* Allocate new frame if current one is already full. */
+  n_used = f->n_vectors;
+  if (n_used >= VLIB_FRAME_SIZE || (allocate_new_next_frame && n_used > 0))
+    {
+      /* Old frame may need to be freed after dispatch, since we'll have
+         two redundant frames from node -> next node. */
+      if (!(nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH))
+	{
+	  /* f_old is looked up with the same index as F above. */
+	  vlib_frame_t *f_old = vlib_get_frame (vm, nf->frame_index);
+	  f_old->flags |= VLIB_FRAME_FREE_AFTER_DISPATCH;
+	}
+
+      /* Allocate new frame to replace full one. */
+      nf->frame_index = vlib_frame_alloc (vm, node, next_index);
+      f = vlib_get_frame (vm, nf->frame_index);
+      n_used = f->n_vectors;
+    }
+
+  /* Should have free vectors in frame now. */
+  ASSERT (n_used < VLIB_FRAME_SIZE);
+
+  if (CLIB_DEBUG > 0)
+    {
+      validate_frame_magic (vm, f,
+			    vlib_get_node (vm, node->node_index), next_index);
+    }
+
+  return f;
+}
+
+/* Debug helper for vlib_put_next_frame.
+   Checks that N_VECTORS_LEFT is within the frame size and that the
+   resulting vector count did not shrink, then, when the frame is
+   non-empty and the next node provides a validate_frame callback, runs
+   that callback on the frame.  A non-null message from the callback is
+   logged as a warning and trips an assertion. */
+static void
+vlib_put_next_frame_validate (vlib_main_t * vm,
+			      vlib_node_runtime_t * rt,
+			      u32 next_index, u32 n_vectors_left)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_next_frame_t *nf;
+  vlib_frame_t *f;
+  vlib_node_runtime_t *next_rt;
+  vlib_node_t *next_node;
+  u32 n_before, n_after;
+
+  nf = vlib_node_runtime_get_next_frame (vm, rt, next_index);
+  f = vlib_get_frame (vm, nf->frame_index);
+
+  ASSERT (n_vectors_left <= VLIB_FRAME_SIZE);
+  /* Vector count after this put vs. the count recorded in the frame. */
+  n_after = VLIB_FRAME_SIZE - n_vectors_left;
+  n_before = f->n_vectors;
+
+  ASSERT (n_after >= n_before);
+
+  /* The next node's runtime is looked up among the internal nodes. */
+  next_rt = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL],
+			      nf->node_runtime_index);
+  next_node = vlib_get_node (vm, next_rt->node_index);
+  if (n_after > 0 && next_node->validate_frame)
+    {
+      u8 *msg = next_node->validate_frame (vm, rt, f);
+      if (msg)
+	{
+	  clib_warning ("%v", msg);
+	  ASSERT (0);
+	}
+      vec_free (msg);
+    }
+}
+
+/* Finish enqueueing into the frame of next slot NEXT_INDEX of node
+   runtime R.  N_VECTORS_LEFT is the number of unused vector slots; the
+   frame's n_vectors is set to VLIB_FRAME_SIZE - N_VECTORS_LEFT.
+   When the frame holds vectors it is appended to the pending-frame
+   vector (unless already pending), R's cached next index is updated,
+   the trace flag is propagated to the next frame and the per-next
+   vector counter is advanced, spilling into the node's 64-bit counter
+   when the 32-bit counter wraps. */
+void
+vlib_put_next_frame (vlib_main_t * vm,
+		     vlib_node_runtime_t * r,
+		     u32 next_index, u32 n_vectors_left)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_next_frame_t *nf;
+  vlib_frame_t *f;
+  u32 n_vectors_in_frame;
+
+  /* Extra frame validation: debug builds only, and only while no
+     buffer callbacks are registered. */
+  if (vm->buffer_main->callbacks_registered == 0 && CLIB_DEBUG > 0)
+    vlib_put_next_frame_validate (vm, r, next_index, n_vectors_left);
+
+  nf = vlib_node_runtime_get_next_frame (vm, r, next_index);
+  f = vlib_get_frame (vm, nf->frame_index);
+
+  /* Make sure that magic number is still there.  Otherwise, caller
+     has overrun frame meta data. */
+  if (CLIB_DEBUG > 0)
+    {
+      vlib_node_t *node = vlib_get_node (vm, r->node_index);
+      validate_frame_magic (vm, f, node, next_index);
+    }
+
+  /* Convert # of vectors left -> number of vectors there. */
+  ASSERT (n_vectors_left <= VLIB_FRAME_SIZE);
+  n_vectors_in_frame = VLIB_FRAME_SIZE - n_vectors_left;
+
+  f->n_vectors = n_vectors_in_frame;
+
+  /* If vectors were added to frame, add to pending vector. */
+  if (PREDICT_TRUE (n_vectors_in_frame > 0))
+    {
+      vlib_pending_frame_t *p;
+      u32 v0, v1;
+
+      r->cached_next_index = next_index;
+
+      if (!(f->flags & VLIB_FRAME_PENDING))
+	{
+	  /* NODE is assigned but not otherwise used in this block. */
+	  __attribute__ ((unused)) vlib_node_t *node;
+	  vlib_node_t *next_node;
+	  vlib_node_runtime_t *next_runtime;
+
+	  node = vlib_get_node (vm, r->node_index);
+	  next_node = vlib_get_next_node (vm, r->node_index, next_index);
+	  next_runtime = vlib_node_get_runtime (vm, next_node->index);
+
+	  vec_add2 (nm->pending_frames, p, 1);
+
+	  p->frame_index = nf->frame_index;
+	  p->node_runtime_index = nf->node_runtime_index;
+	  p->next_frame_index = nf - nm->next_frames;
+	  nf->flags |= VLIB_FRAME_PENDING;
+	  f->flags |= VLIB_FRAME_PENDING;
+
+	  /*
+	   * If we're going to dispatch this frame on another thread,
+	   * force allocation of a new frame. Otherwise, we create
+	   * a dangling frame reference. Each thread has its own copy of
+	   * the next_frames vector.
+	   */
+	  /* Disabled: the leading "0 &&" makes this branch unreachable. */
+	  if (0 && r->thread_index != next_runtime->thread_index)
+	    {
+	      nf->frame_index = ~0;
+	      nf->flags &= ~(VLIB_FRAME_PENDING | VLIB_FRAME_IS_ALLOCATED);
+	    }
+	}
+
+      /* Copy trace flag from next_frame and from runtime. */
+      nf->flags |=
+	(nf->flags & VLIB_NODE_FLAG_TRACE) | (r->
+					      flags & VLIB_NODE_FLAG_TRACE);
+
+      /* 32-bit counter: on wrap-around (v1 < v0) the pre-wrap value is
+         folded into the node's per-next-node counter. */
+      v0 = nf->vectors_since_last_overflow;
+      v1 = v0 + n_vectors_in_frame;
+      nf->vectors_since_last_overflow = v1;
+      if (PREDICT_FALSE (v1 < v0))
+	{
+	  vlib_node_t *node = vlib_get_node (vm, r->node_index);
+	  vec_elt (node->n_vectors_by_next_node, next_index) += v0;
+	}
+    }
+}
+
+/* Fold the runtime counters of R (32 bit) into the totals of its node
+   (64 bit).  N_CALLS, N_VECTORS and N_CLOCKS are added on top of the
+   runtime's since-last-overflow counters, which are then cleared.
+   The max clock fields are copied over as they are. */
+never_inline void
+vlib_node_runtime_sync_stats (vlib_main_t * vm,
+			      vlib_node_runtime_t * r,
+			      uword n_calls, uword n_vectors, uword n_clocks)
+{
+  vlib_node_t *node = vlib_get_node (vm, r->node_index);
+
+  node->stats_total.calls += n_calls + r->calls_since_last_overflow;
+  r->calls_since_last_overflow = 0;
+
+  node->stats_total.vectors += n_vectors + r->vectors_since_last_overflow;
+  r->vectors_since_last_overflow = 0;
+
+  node->stats_total.clocks += n_clocks + r->clocks_since_last_overflow;
+  r->clocks_since_last_overflow = 0;
+
+  node->stats_total.max_clock = r->max_clock;
+  node->stats_total.max_clock_n = r->max_clock_n;
+}
+
+/* Sync the runtime counters of process P into its node totals and move
+   the process's suspend count into the node's total as well. */
+always_inline void __attribute__ ((unused))
+vlib_process_sync_stats (vlib_main_t * vm,
+			 vlib_process_t * p,
+			 uword n_calls, uword n_vectors, uword n_clocks)
+{
+  vlib_node_t *node = vlib_get_node (vm, p->node_runtime.node_index);
+
+  vlib_node_runtime_sync_stats (vm, &p->node_runtime,
+				n_calls, n_vectors, n_clocks);
+
+  node->stats_total.suspends += p->n_suspends;
+  p->n_suspends = 0;
+}
+
+/* Bring the totals of node N up to date from its runtime counters.
+   For process nodes this only happens when VM is vlib_global_main, and
+   the process's suspend count is folded in too.  Afterwards the
+   per-next-frame vector counters are added to N's per-next-node
+   counters and cleared. */
+void
+vlib_node_sync_stats (vlib_main_t * vm, vlib_node_t * n)
+{
+  vlib_node_runtime_t *rt;
+
+  if (n->type == VLIB_NODE_TYPE_PROCESS)
+    {
+      /* Nothing to do for PROCESS nodes except in main thread */
+      if (vm != &vlib_global_main)
+	return;
+
+      vlib_process_t *p = vlib_get_process_from_node (vm, n);
+      n->stats_total.suspends += p->n_suspends;
+      p->n_suspends = 0;
+      rt = &p->node_runtime;
+    }
+  else
+    rt =
+      vec_elt_at_index (vm->node_main.nodes_by_type[n->type],
+			n->runtime_index);
+
+  /* No new deltas: just flush what the runtime has accumulated. */
+  vlib_node_runtime_sync_stats (vm, rt, 0, 0, 0);
+
+  /* Sync up runtime next frame vector counters with main node structure. */
+  {
+    vlib_next_frame_t *nf;
+    uword i;
+    for (i = 0; i < rt->n_next_nodes; i++)
+      {
+	nf = vlib_node_runtime_get_next_frame (vm, rt, i);
+	vec_elt (n->n_vectors_by_next_node, i) +=
+	  nf->vectors_since_last_overflow;
+	nf->vectors_since_last_overflow = 0;
+      }
+  }
+}
+
+/* Add N_CALLS, N_VECTORS and N_CLOCKS to the 32-bit runtime counters of
+   NODE and track the largest clock count seen for a single update
+   together with the vector count of that update.
+   If any of the three counters wraps around, the previous counter
+   values are restored and the deltas are flushed to the node totals
+   through vlib_node_runtime_sync_stats instead.
+   Returns the value produced by
+   vlib_node_runtime_update_main_loop_vector_stats. */
+always_inline u32
+vlib_node_runtime_update_stats (vlib_main_t * vm,
+				vlib_node_runtime_t * node,
+				uword n_calls,
+				uword n_vectors, uword n_clocks)
+{
+  u32 ca0, ca1, v0, v1, cl0, cl1, r;
+
+  /* *0 = value before the update, *1 = value after the update. */
+  cl0 = cl1 = node->clocks_since_last_overflow;
+  ca0 = ca1 = node->calls_since_last_overflow;
+  v0 = v1 = node->vectors_since_last_overflow;
+
+  ca1 = ca0 + n_calls;
+  v1 = v0 + n_vectors;
+  cl1 = cl0 + n_clocks;
+
+  node->calls_since_last_overflow = ca1;
+  node->clocks_since_last_overflow = cl1;
+  node->vectors_since_last_overflow = v1;
+  /* max_clock_n must be updated before max_clock: both compare
+     against the old max_clock. */
+  node->max_clock_n = node->max_clock > n_clocks ?
+    node->max_clock_n : n_vectors;
+  node->max_clock = node->max_clock > n_clocks ? node->max_clock : n_clocks;
+
+  r = vlib_node_runtime_update_main_loop_vector_stats (vm, node, n_vectors);
+
+  /* An "after" value smaller than the "before" value means the 32-bit
+     counter wrapped. */
+  if (PREDICT_FALSE (ca1 < ca0 || v1 < v0 || cl1 < cl0))
+    {
+      node->calls_since_last_overflow = ca0;
+      node->clocks_since_last_overflow = cl0;
+      node->vectors_since_last_overflow = v0;
+      vlib_node_runtime_sync_stats (vm, node, n_calls, n_vectors, n_clocks);
+    }
+
+  return r;
+}
+
+/* Update the runtime counters of process P; forwards to
+   vlib_node_runtime_update_stats on the process's node runtime and
+   discards its return value. */
+always_inline void
+vlib_process_update_stats (vlib_main_t * vm,
+			   vlib_process_t * p,
+			   uword n_calls, uword n_vectors, uword n_clocks)
+{
+  vlib_node_runtime_update_stats (vm, &p->node_runtime,
+				  n_calls, n_vectors, n_clocks);
+}
+
+/* CLI handler for "event-logger clear": resets the event log buffer of
+   VM.  INPUT and CMD are not used.  Always returns 0. */
+static clib_error_t *
+vlib_cli_elog_clear (vlib_main_t * vm,
+		     unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  elog_reset_buffer (&vm->elog_main);
+  return 0;
+}
+
+/* Registers the "event-logger clear" CLI path with vlib_cli_elog_clear
+   as handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (elog_clear_cli, static) = {
+  .path = "event-logger clear",
+  .short_help = "Clear the event log",
+  .function = vlib_cli_elog_clear,
+};
+/* *INDENT-ON* */
+
+#ifdef CLIB_UNIX
+/* CLI handler for "event-logger save <filename>".
+   Parses a file name from INPUT, rejects names containing ".." or '/',
+   and writes the event log to /tmp/<filename> while the worker threads
+   are held at the barrier.
+   Returns 0 when the argument is missing or rejected (a message is
+   printed instead), otherwise whatever elog_write_file returns. */
+static clib_error_t *
+elog_save_buffer (vlib_main_t * vm,
+		  unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  elog_main_t *em = &vm->elog_main;
+  char *file, *chroot_file;
+  clib_error_t *error = 0;
+
+  if (!unformat (input, "%s", &file))
+    {
+      vlib_cli_output (vm, "expected file name, got `%U'",
+		       format_unformat_error, input);
+      return 0;
+    }
+
+  /* It's fairly hard to get "../oopsie" through unformat; just in case */
+  if (strstr (file, "..") || index (file, '/'))
+    {
+      vlib_cli_output (vm, "illegal characters in filename '%s'", file);
+      /* FILE is a vector owned by us; release it on this early exit
+         too, otherwise it leaks. */
+      vec_free (file);
+      return 0;
+    }
+
+  /* The trailing %c with 0 NUL-terminates the path vector. */
+  chroot_file = (char *) format (0, "/tmp/%s%c", file, 0);
+
+  vec_free (file);
+
+  vlib_cli_output (vm, "Saving %wd of %wd events to %s",
+		   elog_n_events_in_buffer (em),
+		   elog_buffer_capacity (em), chroot_file);
+
+  /* Keep the worker threads parked while the log is written out. */
+  vlib_worker_thread_barrier_sync (vm);
+  error = elog_write_file (em, chroot_file, 1 /* flush ring */ );
+  vlib_worker_thread_barrier_release (vm);
+  vec_free (chroot_file);
+  return error;
+}
+
+/* Write the event log of vlib_global_main to
+   /tmp/elog_post_mortem.<pid>, but only when the elog_post_mortem_dump
+   flag is set on the global main.  A write error is reported through
+   clib_error_report. */
+void
+elog_post_mortem_dump (void)
+{
+  vlib_main_t *vm = &vlib_global_main;
+  clib_error_t *err;
+  u8 *path;
+
+  if (!vm->elog_post_mortem_dump)
+    return;
+
+  /* The trailing %c with 0 NUL-terminates the path vector. */
+  path = format (0, "/tmp/elog_post_mortem.%d%c", getpid (), 0);
+
+  err = elog_write_file (&vm->elog_main, (char *) path, 1 /* flush ring */ );
+  if (err)
+    {
+      clib_error_report (err);
+    }
+
+  vec_free (path);
+}
+
+/* Registers the "event-logger save" CLI path with elog_save_buffer as
+   handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (elog_save_cli, static) = {
+  .path = "event-logger save",
+  .short_help = "event-logger save <filename> (saves log in /tmp/<filename>)",
+  .function = elog_save_buffer,
+};
+/* *INDENT-ON* */
+
+/* CLI handler for "event-logger stop".
+   Sets the disable limit to the current total event count; the logger
+   is reported as "running" only while n_total_events is below that
+   limit (see elog_show_buffer_internal).
+   INPUT and CMD are not used.  Always returns 0. */
+static clib_error_t *
+elog_stop (vlib_main_t * vm,
+	   unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  elog_main_t *em = &vm->elog_main;
+
+  em->n_total_events_disable_limit = em->n_total_events;
+
+  vlib_cli_output (vm, "Stopped the event logger...");
+  return 0;
+}
+
+/* Registers the "event-logger stop" CLI path with elog_stop as
+   handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (elog_stop_cli, static) = {
+  .path = "event-logger stop",
+  .short_help = "Stop the event-logger",
+  .function = elog_stop,
+};
+/* *INDENT-ON* */
+
+/* CLI handler for "event-logger restart".
+   Sets the disable limit to all ones, undoing the limit installed by
+   elog_stop.  INPUT and CMD are not used.  Always returns 0. */
+static clib_error_t *
+elog_restart (vlib_main_t * vm,
+	      unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  elog_main_t *em = &vm->elog_main;
+
+  em->n_total_events_disable_limit = ~0;
+
+  vlib_cli_output (vm, "Restarted the event logger...");
+  return 0;
+}
+
+/* Registers the "event-logger restart" CLI path with elog_restart as
+   handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (elog_restart_cli, static) = {
+  .path = "event-logger restart",
+  .short_help = "Restart the event-logger",
+  .function = elog_restart,
+};
+/* *INDENT-ON* */
+
+/* CLI handler for "event-logger resize <nnn>".
+   Resets the event log buffer first, then parses the new ring size
+   from INPUT, reallocates the ring and lifts the disable limit.
+   Returns an error when no size is given (the buffer has been reset by
+   then), 0 otherwise. */
+static clib_error_t *
+elog_resize (vlib_main_t * vm,
+	     unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  elog_main_t *em = &vm->elog_main;
+  u32 n_events;
+
+  /* Stop the parade */
+  elog_reset_buffer (&vm->elog_main);
+
+  if (!unformat (input, "%d", &n_events))
+    return clib_error_return (0, "Must specify how many events in the ring");
+
+  elog_alloc (em, n_events);
+  em->n_total_events_disable_limit = ~0;
+
+  vlib_cli_output (vm, "Resized ring and restarted the event logger...");
+  return 0;
+}
+
+/* Registers the "event-logger resize" CLI path with elog_resize as
+   handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (elog_resize_cli, static) = {
+  .path = "event-logger resize",
+  .short_help = "event-logger resize <nnn>",
+  .function = elog_resize,
+};
+/* *INDENT-ON* */
+
+#endif /* CLIB_UNIX */
+
+/* Print a summary line (events in buffer, ring size, running/stopped)
+   followed by at most N_EVENTS_TO_SHOW events, in the order returned by
+   elog_peek_events.  Event times are shifted by the offset between the
+   log clock and the VLIB clock.
+   N_EVENTS_TO_SHOW == 0 prints the summary line only. */
+static void
+elog_show_buffer_internal (vlib_main_t * vm, u32 n_events_to_show)
+{
+  elog_main_t *em = &vm->elog_main;
+  elog_event_t *e, *es;
+  f64 dt;
+
+  /* Show events in VLIB time since log clock starts after VLIB clock. */
+  dt = (em->init_time.cpu - vm->clib_time.init_cpu_time)
+    * vm->clib_time.seconds_per_clock;
+
+  es = elog_peek_events (em);
+  vlib_cli_output (vm, "%d of %d events in buffer, logger %s", vec_len (es),
+		   em->event_ring_size,
+		   em->n_total_events < em->n_total_events_disable_limit ?
+		   "running" : "stopped");
+  vec_foreach (e, es)
+  {
+    /* Test the budget before printing and decrementing, so that a
+       budget of 0 does not wrap around to ~0 and print everything. */
+    if (n_events_to_show == 0)
+      break;
+    vlib_cli_output (vm, "%18.9f: %U",
+		     e->time + dt, format_elog_event, em, e);
+    n_events_to_show--;
+  }
+  vec_free (es);
+
+}
+
+/* CLI handler for "show event-logger".
+   Accepts an optional event count or the keyword "all"; the default is
+   250 events.  Any other input yields a parse error.  On success the
+   events are printed and 0 is returned. */
+static clib_error_t *
+elog_show_buffer (vlib_main_t * vm,
+		  unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  u32 limit = 250;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "%d", &limit))
+	continue;
+
+      if (unformat (input, "all"))
+	{
+	  limit = ~0;
+	  continue;
+	}
+
+      return unformat_parse_error (input);
+    }
+
+  elog_show_buffer_internal (vm, limit);
+  return 0;
+}
+
+/* Registers the "show event-logger" CLI path with elog_show_buffer as
+   handler. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (elog_show_cli, static) = {
+  .path = "show event-logger",
+  .short_help = "Show event logger info",
+  .function = elog_show_buffer,
+};
+/* *INDENT-ON* */
+
+/* Print the event log of the current vlib main with the event budget
+   set to all ones.  The name suggests it is meant to be called from a
+   debugger -- NOTE(review): confirm. */
+void
+vlib_gdb_show_event_log (void)
+{
+  elog_show_buffer_internal (vlib_get_main (), (u32) ~ 0);
+}
+
+/* Log a node call/return event to the event log of vlib_global_main.
+   Nothing is logged unless VLIB_ELOG_MAIN_LOOP is non-zero and
+   N_VECTORS is non-zero.  IS_RETURN selects the "return" event type
+   table over the "call" one; NODE_INDEX indexes that table.  The event
+   is recorded on the worker thread's track when VM has a non-zero
+   thread index, otherwise on the default track.  N_VECTORS is the
+   logged datum.  TIME is not used by the body. */
+static inline void
+vlib_elog_main_loop_event (vlib_main_t * vm,
+			   u32 node_index,
+			   u64 time, u32 n_vectors, u32 is_return)
+{
+  vlib_main_t *evm = &vlib_global_main;
+  elog_main_t *em = &evm->elog_main;
+
+  if (VLIB_ELOG_MAIN_LOOP && n_vectors)
+    elog_track (em,
+		/* event type */
+		vec_elt_at_index (is_return
+				  ? evm->node_return_elog_event_types
+				  : evm->node_call_elog_event_types,
+				  node_index),
+		/* track */
+		(vm->thread_index ? &vlib_worker_threads[vm->thread_index].
+		 elog_track : &em->default_track),
+		/* data to log */ n_vectors);
+}
+
+/* Dump to stderr the list of nodes that buffer BI has visited.
+   Only functional when VLIB_BUFFER_TRACE_TRAJECTORY is non-zero;
+   otherwise a hint on how to enable it is printed.
+   The trajectory is read from the buffer's pre_data area: byte 0 holds
+   the number of visited nodes and bytes 1..n hold the node indices.
+   A count of 0 or more than 20 is treated as unreasonable and nothing
+   further is printed. */
+void
+vlib_dump_context_trace (vlib_main_t * vm, u32 bi)
+{
+  vlib_node_main_t *vnm = &vm->node_main;
+  vlib_buffer_t *b;
+  u8 i, n;
+
+  if (VLIB_BUFFER_TRACE_TRAJECTORY)
+    {
+      b = vlib_get_buffer (vm, bi);
+      n = b->pre_data[0];
+
+      fformat (stderr, "Context trace for bi %d b 0x%llx, visited %d\n",
+	       bi, b, n);
+
+      if (n == 0 || n > 20)
+	{
+	  fformat (stderr, "n is unreasonable\n");
+	  return;
+	}
+
+
+      for (i = 0; i < n; i++)
+	{
+	  u32 node_index;
+
+	  node_index = b->pre_data[i + 1];
+
+	  /* Valid indices are 0 .. vec_len - 1; an index equal to the
+	     vector length is already out of bounds. */
+	  if (node_index >= vec_len (vnm->nodes))
+	    {
+	      fformat (stderr, "Skip bogus node index %d\n", node_index);
+	      continue;
+	    }
+
+	  fformat (stderr, "%v (%d)\n", vnm->nodes[node_index]->name,
+		   node_index);
+	}
+    }
+  else
+    {
+      fformat (stderr,
+	       "in vlib/buffers.h, #define VLIB_BUFFER_TRACE_TRAJECTORY 1\n");
+    }
+}
+
+
+/* Run NODE's function once on FRAME and account for it.
+   TYPE is the node's type (asserted in debug builds) and
+   DISPATCH_STATE the state the caller is dispatching for;
+   LAST_TIME_STAMP is the cpu time at which the previous dispatch ended.
+   Returns LAST_TIME_STAMP unchanged when the node is skipped (a
+   non-internal node in a different state, or an input/pre-input node
+   whose main-loop countdown has not reached zero), otherwise the cpu
+   time sampled right after the node function returned.
+   After the call the statistics are updated and, for the
+   interrupt/polling cases tested below, the node may be switched
+   between interrupt and polling mode based on the vector rate returned
+   by vlib_node_runtime_update_stats. */
+static_always_inline u64
+dispatch_node (vlib_main_t * vm,
+	       vlib_node_runtime_t * node,
+	       vlib_node_type_t type,
+	       vlib_node_state_t dispatch_state,
+	       vlib_frame_t * frame, u64 last_time_stamp)
+{
+  uword n, v;
+  u64 t;
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_next_frame_t *nf;
+
+  if (CLIB_DEBUG > 0)
+    {
+      vlib_node_t *n = vlib_get_node (vm, node->node_index);
+      ASSERT (n->type == type);
+    }
+
+  /* Only non-internal nodes may be disabled. */
+  if (type != VLIB_NODE_TYPE_INTERNAL && node->state != dispatch_state)
+    {
+      ASSERT (type != VLIB_NODE_TYPE_INTERNAL);
+      return last_time_stamp;
+    }
+
+  if ((type == VLIB_NODE_TYPE_PRE_INPUT || type == VLIB_NODE_TYPE_INPUT)
+      && dispatch_state != VLIB_NODE_STATE_INTERRUPT)
+    {
+      u32 c = node->input_main_loops_per_call;
+      /* Only call node when count reaches zero. */
+      if (c)
+	{
+	  node->input_main_loops_per_call = c - 1;
+	  return last_time_stamp;
+	}
+    }
+
+  /* Speculatively prefetch next frames. */
+  if (node->n_next_nodes > 0)
+    {
+      nf = vec_elt_at_index (nm->next_frames, node->next_frame_index);
+      CLIB_PREFETCH (nf, 4 * sizeof (nf[0]), WRITE);
+    }
+
+  vm->cpu_time_last_node_dispatch = last_time_stamp;
+
+  /* Always taken: the thread check is commented out. */
+  if (1 /* || vm->thread_index == node->thread_index */ )
+    {
+      vlib_main_t *stat_vm;
+
+      stat_vm = /* vlib_mains ? vlib_mains[0] : */ vm;
+
+      vlib_elog_main_loop_event (vm, node->node_index,
+				 last_time_stamp,
+				 frame ? frame->n_vectors : 0,
+				 /* is_after */ 0);
+
+      /*
+       * Turn this on if you run into
+       * "bad monkey" contexts, and you want to know exactly
+       * which nodes they've visited... See ixge.c...
+       */
+      if (VLIB_BUFFER_TRACE_TRAJECTORY && frame)
+	{
+	  int i;
+	  int log_index;
+	  u32 *from;
+	  from = vlib_frame_vector_args (frame);
+	  for (i = 0; i < frame->n_vectors; i++)
+	    {
+	      /* pre_data[0] counts visited nodes; this node's index is
+	         stored in the next free pre_data byte. */
+	      vlib_buffer_t *b = vlib_get_buffer (vm, from[i]);
+	      ASSERT (b->pre_data[0] < 32);
+	      log_index = b->pre_data[0]++ + 1;
+	      b->pre_data[log_index] = node->node_index;
+	    }
+	  n = node->function (vm, node, frame);
+	}
+      else
+	n = node->function (vm, node, frame);
+
+      t = clib_cpu_time_now ();
+
+      vlib_elog_main_loop_event (vm, node->node_index, t, n,	/* is_after */
+				 1);
+
+      vm->main_loop_vectors_processed += n;
+      vm->main_loop_nodes_processed += n > 0;
+
+      v = vlib_node_runtime_update_stats (stat_vm, node,
+					  /* n_calls */ 1,
+					  /* n_vectors */ n,
+					  /* n_clocks */ t - last_time_stamp);
+
+      /* When in interrupt mode and vector rate crosses threshold switch to
+         polling mode. */
+      if ((dispatch_state == VLIB_NODE_STATE_INTERRUPT)
+	  || (dispatch_state == VLIB_NODE_STATE_POLLING
+	      && (node->flags
+		  & VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE)))
+	{
+#ifdef DISPATCH_NODE_ELOG_REQUIRED
+	  ELOG_TYPE_DECLARE (e) =
+	  {
+	    .function = (char *) __FUNCTION__,.format =
+	      "%s vector length %d, switching to %s",.format_args =
+	      "T4i4t4",.n_enum_strings = 2,.enum_strings =
+	    {
+	  "interrupt", "polling",},};
+	  struct
+	  {
+	    u32 node_name, vector_length, is_polling;
+	  } *ed;
+	  vlib_worker_thread_t *w = vlib_worker_threads + vm->thread_index;
+#endif
+
+	  /* Interrupt -> polling: vector rate reached the polling
+	     threshold and the node has not been switched yet. */
+	  if ((dispatch_state == VLIB_NODE_STATE_INTERRUPT
+	       && v >= nm->polling_threshold_vector_length) &&
+	      !(node->flags &
+		VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE))
+	    {
+	      vlib_node_t *n = vlib_get_node (vm, node->node_index);
+	      n->state = VLIB_NODE_STATE_POLLING;
+	      node->state = VLIB_NODE_STATE_POLLING;
+	      node->flags &=
+		~VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE;
+	      node->flags |=
+		VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE;
+	      nm->input_node_counts_by_state[VLIB_NODE_STATE_INTERRUPT] -= 1;
+	      nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] += 1;
+
+#ifdef DISPATCH_NODE_ELOG_REQUIRED
+	      ed = ELOG_TRACK_DATA (&vlib_global_main.elog_main, e,
+				    w->elog_track);
+	      ed->node_name = n->name_elog_string;
+	      ed->vector_length = v;
+	      ed->is_polling = 1;
+#endif
+	    }
+	  /* Polling -> interrupt: done in two steps.  The first
+	     low-rate dispatch only sets the SWITCH_FROM_POLLING flag;
+	     the next one performs the actual state change. */
+	  else if (dispatch_state == VLIB_NODE_STATE_POLLING
+		   && v <= nm->interrupt_threshold_vector_length)
+	    {
+	      vlib_node_t *n = vlib_get_node (vm, node->node_index);
+	      if (node->flags &
+		  VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE)
+		{
+		  /* Switch to interrupt mode after dispatch in polling one more time.
+		     This allows driver to re-enable interrupts. */
+		  n->state = VLIB_NODE_STATE_INTERRUPT;
+		  node->state = VLIB_NODE_STATE_INTERRUPT;
+		  node->flags &=
+		    ~VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE;
+		  nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] -=
+		    1;
+		  nm->input_node_counts_by_state[VLIB_NODE_STATE_INTERRUPT] +=
+		    1;
+
+		}
+	      else
+		{
+		  node->flags |=
+		    VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE;
+#ifdef DISPATCH_NODE_ELOG_REQUIRED
+		  ed = ELOG_TRACK_DATA (&vlib_global_main.elog_main, e,
+					w->elog_track);
+		  ed->node_name = n->name_elog_string;
+		  ed->vector_length = v;
+		  ed->is_polling = 0;
+#endif
+		}
+	    }
+	}
+    }
+
+  return t;
+}
+
+static u64
+dispatch_pending_node (vlib_main_t * vm, uword pending_frame_index,
+		       u64 last_time_stamp)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_frame_t *f;
+  vlib_next_frame_t *nf, nf_dummy;
+  vlib_node_runtime_t *n;
+  u32 restore_frame_index;
+  vlib_pending_frame_t *p;
+  /* Dispatch the frame queued at pending_frame_index to its internal node. */
+  /* See comment below about dangling references to nm->pending_frames */
+  p = nm->pending_frames + pending_frame_index;
+
+  n = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL],
+			p->node_runtime_index);
+
+  f = vlib_get_frame (vm, p->frame_index);
+  if (p->next_frame_index == VLIB_PENDING_FRAME_NO_NEXT_FRAME)
+    {
+      /* No next frame: so use dummy on stack. */
+      nf = &nf_dummy;
+      nf->flags = f->flags & VLIB_NODE_FLAG_TRACE;
+      nf->frame_index = ~p->frame_index;	/* cannot equal p->frame_index */
+    }
+  else
+    nf = vec_elt_at_index (nm->next_frames, p->next_frame_index);
+
+  ASSERT (f->flags & VLIB_FRAME_IS_ALLOCATED);
+
+  /* Force allocation of new frame while current frame is being
+     dispatched. */
+  restore_frame_index = ~0;
+  if (nf->frame_index == p->frame_index)
+    {
+      nf->frame_index = ~0;
+      nf->flags &= ~VLIB_FRAME_IS_ALLOCATED;
+      if (!(n->flags & VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH))
+	restore_frame_index = p->frame_index;
+    }
+
+  /* Frame must be pending. */
+  ASSERT (f->flags & VLIB_FRAME_PENDING);
+  ASSERT (f->n_vectors > 0);
+
+  /* Copy trace flag from next frame to node.
+     Trace flag indicates that at least one vector in the dispatched
+     frame is traced. */
+  n->flags &= ~VLIB_NODE_FLAG_TRACE;
+  n->flags |= (nf->flags & VLIB_FRAME_TRACE) ? VLIB_NODE_FLAG_TRACE : 0;
+  nf->flags &= ~VLIB_FRAME_TRACE;
+  /* Hand the frame to dispatch_node as an internal node in polling state. */
+  last_time_stamp = dispatch_node (vm, n,
+				   VLIB_NODE_TYPE_INTERNAL,
+				   VLIB_NODE_STATE_POLLING,
+				   f, last_time_stamp);
+
+  f->flags &= ~VLIB_FRAME_PENDING;
+
+  /* Frame is ready to be used again, so restore it. */
+  if (restore_frame_index != ~0)
+    {
+      /*
+       * We mustn't restore a frame that is flagged to be freed. This
+       * shouldn't happen since frames to be freed post dispatch are
+       * those used when the to-node frame becomes full i.e. they form a
+       * sort of queue of frames to a single node. If we get here then
+       * the to-node frame and the pending frame *were* the same, and so
+       * we removed the to-node frame.  Therefore this frame is no
+       * longer part of the queue for that node and hence it cannot be
+       * its overspill.
+       */
+      ASSERT (!(f->flags & VLIB_FRAME_FREE_AFTER_DISPATCH));
+
+      /*
+       * NB: dispatching node n can result in the creation and scheduling
+       * of new frames, and hence in the reallocation of nm->pending_frames.
+       * Recompute p, or no supper. This was broken for more than 10 years.
+       */
+      p = nm->pending_frames + pending_frame_index;
+
+      /*
+       * p->next_frame_index can change during node dispatch if node
+       * function decides to change graph hook up.
+       */
+      nf = vec_elt_at_index (nm->next_frames, p->next_frame_index);
+      nf->flags |= VLIB_FRAME_IS_ALLOCATED;
+
+      if (~0 == nf->frame_index)
+	{
+	  /* no new frame has been assigned to this node, use the saved one */
+	  nf->frame_index = restore_frame_index;
+	  f->n_vectors = 0;
+	}
+      else
+	{
+	  /* The node has gained a frame, implying packets from the current frame
+	     were re-queued to this same node. we don't need the saved one
+	     anymore */
+	  vlib_frame_free (vm, n, f);
+	}
+    }
+  else
+    {
+      if (f->flags & VLIB_FRAME_FREE_AFTER_DISPATCH)
+	{
+	  ASSERT (!(n->flags & VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH));
+	  vlib_frame_free (vm, n, f);
+	}
+    }
+
+  return last_time_stamp;
+}
+
+always_inline uword
+vlib_process_stack_is_valid (vlib_process_t * p)
+{				/* nonzero while the magic word at p->stack[0] is intact */
+  return p->stack[0] == VLIB_PROCESS_STACK_MAGIC;
+}
+
+typedef struct
+{
+  vlib_main_t *vm;		/* first argument for the node function */
+  vlib_process_t *process;	/* process whose node function is run */
+  vlib_frame_t *frame;		/* frame argument for the node function; may be 0 */
+} vlib_process_bootstrap_args_t;
+
+/* Called in process stack: runs the node function, then longjmps back. */
+static uword
+vlib_process_bootstrap (uword _a)
+{
+  vlib_process_bootstrap_args_t *a;
+  vlib_main_t *vm;
+  vlib_node_runtime_t *node;
+  vlib_frame_t *f;
+  vlib_process_t *p;
+  uword n;
+
+  a = uword_to_pointer (_a, vlib_process_bootstrap_args_t *);
+
+  vm = a->vm;
+  p = a->process;
+  f = a->frame;
+  node = &p->node_runtime;
+
+  n = node->function (vm, node, f);
+  /* Check that the magic word at p->stack[0] has not been overwritten. */
+  ASSERT (vlib_process_stack_is_valid (p));
+  /* Deliver n to whoever did clib_setjmp on p->return_longjmp. */
+  clib_longjmp (&p->return_longjmp, n);
+  /* NOTE(review): presumably unreachable if clib_longjmp never returns. */
+  return n;
+}
+
+/* Called in main stack: starts process p running on its own stack. */
+static_always_inline uword
+vlib_process_startup (vlib_main_t * vm, vlib_process_t * p, vlib_frame_t * f)
+{
+  vlib_process_bootstrap_args_t a;
+  uword r;
+  /* a lives on this stack; the bootstrap receives it by address. */
+  a.vm = vm;
+  a.process = p;
+  a.frame = f;
+  /* NOTE(review): clib_setjmp presumably yields its 2nd arg when called directly. */
+  r = clib_setjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_RETURN);
+  if (r == VLIB_PROCESS_RETURN_LONGJMP_RETURN)
+    r = clib_calljmp (vlib_process_bootstrap, pointer_to_uword (&a),
+		      (void *) p->stack + (1 << p->log2_n_stack_bytes));
+
+  return r;
+}
+
+static_always_inline uword
+vlib_process_resume (vlib_process_t * p)
+{
+  uword r;			/* value delivered through p->return_longjmp */
+  p->flags &= ~(VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK
+		| VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT
+		| VLIB_PROCESS_RESUME_PENDING);
+  r = clib_setjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_RETURN);
+  if (r == VLIB_PROCESS_RETURN_LONGJMP_RETURN)
+    clib_longjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_RESUME);
+  return r;			/* reached once something longjmps to return_longjmp */
+}
+
+static u64
+dispatch_process (vlib_main_t * vm,
+		  vlib_process_t * p, vlib_frame_t * f, u64 last_time_stamp)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_node_runtime_t *node_runtime = &p->node_runtime;
+  vlib_node_t *node = vlib_get_node (vm, node_runtime->node_index);
+  u64 t;
+  uword n_vectors, is_suspend;
+  /* Skip processes that are not polling or are already suspended. */
+  if (node->state != VLIB_NODE_STATE_POLLING
+      || (p->flags & (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK
+		      | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT)))
+    return last_time_stamp;
+
+  p->flags |= VLIB_PROCESS_IS_RUNNING;
+
+  t = last_time_stamp;
+  vlib_elog_main_loop_event (vm, node_runtime->node_index, t,
+			     f ? f->n_vectors : 0, /* is_after */ 0);
+
+  /* Save away current process for suspend. */
+  nm->current_process_index = node->runtime_index;
+  /* Run the process on its own stack until it returns or suspends. */
+  n_vectors = vlib_process_startup (vm, p, f);
+
+  nm->current_process_index = ~0;
+
+  ASSERT (n_vectors != VLIB_PROCESS_RETURN_LONGJMP_RETURN);
+  is_suspend = n_vectors == VLIB_PROCESS_RETURN_LONGJMP_SUSPEND;
+  if (is_suspend)
+    {
+      vlib_pending_frame_t *pf;
+      /* Remember the suspended process and its frame (if any). */
+      n_vectors = 0;
+      pool_get (nm->suspended_process_frames, pf);
+      pf->node_runtime_index = node->runtime_index;
+      pf->frame_index = f ? vlib_frame_index (vm, f) : ~0;
+      pf->next_frame_index = ~0;
+
+      p->n_suspends += 1;
+      p->suspended_process_frame_index = pf - nm->suspended_process_frames;
+      /* A clock wait is resumed by a timer started on the timing wheel. */
+      if (p->flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK)
+	{
+	  TWT (tw_timer_wheel) * tw =
+	    (TWT (tw_timer_wheel) *) nm->timing_wheel;
+	  p->stop_timer_handle =
+	    TW (tw_timer_start) (tw,
+				 vlib_timing_wheel_data_set_suspended_process
+				 (node->runtime_index) /* [sic] pool idex */ ,
+				 0 /* timer_id */ ,
+				 p->resume_clock_interval);
+	}
+    }
+  else
+    p->flags &= ~VLIB_PROCESS_IS_RUNNING;
+
+  t = clib_cpu_time_now ();
+
+  vlib_elog_main_loop_event (vm, node_runtime->node_index, t, is_suspend,
+			     /* is_after */ 1);
+
+  vlib_process_update_stats (vm, p,
+			     /* n_calls */ !is_suspend,
+			     /* n_vectors */ n_vectors,
+			     /* n_clocks */ t - last_time_stamp);
+
+  return t;
+}
+
+void
+vlib_start_process (vlib_main_t * vm, uword process_index)
+{
+  /* Dispatch the process at process_index with no frame and time stamp 0. */
+  vlib_process_t *proc = vec_elt (vm->node_main.processes, process_index);
+  dispatch_process (vm, proc, /* frame */ 0, /* cpu_time_now */ 0);
+}
+
+static u64
+dispatch_suspended_process (vlib_main_t * vm,
+			    uword process_index, u64 last_time_stamp)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_node_runtime_t *node_runtime;
+  vlib_node_t *node;
+  vlib_frame_t *f;
+  vlib_process_t *p;
+  vlib_pending_frame_t *pf;
+  u64 t, n_vectors, is_suspend;
+  /* Resume the suspended process at process_index and update its stats. */
+  t = last_time_stamp;
+
+  p = vec_elt (nm->processes, process_index);
+  if (PREDICT_FALSE (!(p->flags & VLIB_PROCESS_IS_RUNNING)))
+    return last_time_stamp;
+
+  ASSERT (p->flags & (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK
+		      | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT));
+
+  pf =
+    pool_elt_at_index (nm->suspended_process_frames,
+		       p->suspended_process_frame_index);
+
+  node_runtime = &p->node_runtime;
+  node = vlib_get_node (vm, node_runtime->node_index);
+  f = pf->frame_index != ~0 ? vlib_get_frame (vm, pf->frame_index) : 0;
+
+  vlib_elog_main_loop_event (vm, node_runtime->node_index, t,
+			     f ? f->n_vectors : 0, /* is_after */ 0);
+
+  /* Save away current process for suspend. */
+  nm->current_process_index = node->runtime_index;
+  /* Longjmp into the process via p->resume_longjmp (see vlib_process_resume). */
+  n_vectors = vlib_process_resume (p);
+  t = clib_cpu_time_now ();	/* NOTE(review): overwritten below before any use */
+
+  nm->current_process_index = ~0;
+
+  is_suspend = n_vectors == VLIB_PROCESS_RETURN_LONGJMP_SUSPEND;
+  if (is_suspend)
+    {
+      /* Suspend it again. */
+      n_vectors = 0;
+      p->n_suspends += 1;
+      if (p->flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK)
+	{
+	  p->stop_timer_handle =
+	    TW (tw_timer_start) ((TWT (tw_timer_wheel) *) nm->timing_wheel,
+				 vlib_timing_wheel_data_set_suspended_process
+				 (node->runtime_index) /* [sic] pool idex */ ,
+				 0 /* timer_id */ ,
+				 p->resume_clock_interval);
+	}
+    }
+  else
+    {
+      p->flags &= ~VLIB_PROCESS_IS_RUNNING;
+      p->suspended_process_frame_index = ~0;
+      pool_put (nm->suspended_process_frames, pf);
+    }
+  /* NOTE(review): dispatch_process logs is_suspend, not !is_suspend, here. */
+  t = clib_cpu_time_now ();
+  vlib_elog_main_loop_event (vm, node_runtime->node_index, t, !is_suspend,
+			     /* is_after */ 1);
+
+  vlib_process_update_stats (vm, p,
+			     /* n_calls */ !is_suspend,
+			     /* n_vectors */ n_vectors,
+			     /* n_clocks */ t - last_time_stamp);
+
+  return t;
+}
+
+static_always_inline void
+vlib_main_or_worker_loop (vlib_main_t * vm, int is_main)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  uword i;
+  u64 cpu_time_now;
+  vlib_frame_queue_main_t *fqm;
+  u32 *last_node_runtime_indices = 0;
+  /* Dispatch loop shared by the main thread (is_main) and workers; never
+     returns.  First, initialize the pending node vector. */
+  if (is_main)
+    {
+      vec_resize (nm->pending_frames, 32);
+      _vec_len (nm->pending_frames) = 0;
+    }
+
+  /* Mark time of main loop start. */
+  if (is_main)
+    {
+      cpu_time_now = vm->clib_time.last_cpu_time;
+      vm->cpu_time_main_loop_start = cpu_time_now;
+    }
+  else
+    cpu_time_now = clib_cpu_time_now ();
+
+  /* Pre-allocate interrupt runtime indices and lock. */
+  vec_alloc (nm->pending_interrupt_node_runtime_indices, 32);
+  vec_alloc (last_node_runtime_indices, 32);
+  if (!is_main)
+    clib_spinlock_init (&nm->pending_interrupt_lock);
+
+  /* Default the polling/interrupt switch-over vector-length thresholds. */
+  if (!nm->polling_threshold_vector_length)
+    nm->polling_threshold_vector_length = 10;
+  if (!nm->interrupt_threshold_vector_length)
+    nm->interrupt_threshold_vector_length = 5;
+
+  /* Start all processes. */
+  if (is_main)
+    {
+      uword i;
+      nm->current_process_index = ~0;
+      for (i = 0; i < vec_len (nm->processes); i++)
+	cpu_time_now = dispatch_process (vm, nm->processes[i], /* frame */ 0,
+					 cpu_time_now);
+    }
+
+  while (1)
+    {
+      vlib_node_runtime_t *n;
+
+      if (!is_main)
+	{
+	  vlib_worker_thread_barrier_check ();
+	  vec_foreach (fqm, tm->frame_queue_mains)
+	    vlib_frame_queue_dequeue (vm, fqm);
+	}
+
+      /* Process pre-input nodes. */
+      if (is_main)
+	vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_PRE_INPUT])
+	  cpu_time_now = dispatch_node (vm, n,
+					VLIB_NODE_TYPE_PRE_INPUT,
+					VLIB_NODE_STATE_POLLING,
+					/* frame */ 0,
+					cpu_time_now);
+
+      /* Next process input nodes. */
+      vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_INPUT])
+	cpu_time_now = dispatch_node (vm, n,
+				      VLIB_NODE_TYPE_INPUT,
+				      VLIB_NODE_STATE_POLLING,
+				      /* frame */ 0,
+				      cpu_time_now);
+
+      if (PREDICT_TRUE (is_main && vm->queue_signal_pending == 0))
+	vm->queue_signal_callback (vm);
+
+      /* Next handle interrupts. */
+      {
+	uword l = _vec_len (nm->pending_interrupt_node_runtime_indices);
+	if (l > 0)
+	  {
+	    u32 *tmp;
+	    if (!is_main)
+	      clib_spinlock_lock (&nm->pending_interrupt_lock);
+	    tmp = nm->pending_interrupt_node_runtime_indices;
+	    l = _vec_len (tmp);	/* re-read under the lock: may have grown */
+	    nm->pending_interrupt_node_runtime_indices =
+	      last_node_runtime_indices;
+	    last_node_runtime_indices = tmp;
+	    _vec_len (last_node_runtime_indices) = 0;
+	    if (!is_main)
+	      clib_spinlock_unlock (&nm->pending_interrupt_lock);
+	    for (i = 0; i < l; i++)
+	      {
+		n = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT],
+				      last_node_runtime_indices[i]);
+		cpu_time_now =
+		  dispatch_node (vm, n, VLIB_NODE_TYPE_INPUT,
+				 VLIB_NODE_STATE_INTERRUPT,
+				 /* frame */ 0,
+				 cpu_time_now);
+	      }
+	  }
+      }
+
+      if (is_main)
+	{
+	  /* Check if process nodes have expired from timing wheel. */
+	  ASSERT (nm->data_from_advancing_timing_wheel != 0);
+
+	  nm->data_from_advancing_timing_wheel =
+	    TW (tw_timer_expire_timers_vec)
+	    ((TWT (tw_timer_wheel) *) nm->timing_wheel, vlib_time_now (vm),
+	     nm->data_from_advancing_timing_wheel);
+
+	  ASSERT (nm->data_from_advancing_timing_wheel != 0);
+
+	  if (PREDICT_FALSE
+	      (_vec_len (nm->data_from_advancing_timing_wheel) > 0))
+	    {
+	      uword i;
+
+	    processes_timing_wheel_data:
+	      for (i = 0; i < _vec_len (nm->data_from_advancing_timing_wheel);
+		   i++)
+		{
+		  u32 d = nm->data_from_advancing_timing_wheel[i];
+		  u32 di = vlib_timing_wheel_data_get_index (d);
+		  /* d names either a timed event or a suspended process. */
+		  if (vlib_timing_wheel_data_is_timed_event (d))
+		    {
+		      vlib_signal_timed_event_data_t *te =
+			pool_elt_at_index (nm->signal_timed_event_data_pool,
+					   di);
+		      vlib_node_t *n =
+			vlib_get_node (vm, te->process_node_index);
+		      vlib_process_t *p =
+			vec_elt (nm->processes, n->runtime_index);
+		      void *data;
+		      data =
+			vlib_process_signal_event_helper (nm, n, p,
+							  te->event_type_index,
+							  te->n_data_elts,
+							  te->n_data_elt_bytes);
+		      if (te->n_data_bytes < sizeof (te->inline_event_data))
+			clib_memcpy (data, te->inline_event_data,
+				     te->n_data_bytes);
+		      else
+			{
+			  clib_memcpy (data, te->event_data_as_vector,
+				       te->n_data_bytes);
+			  vec_free (te->event_data_as_vector);
+			}
+		      pool_put (nm->signal_timed_event_data_pool, te);
+		    }
+		  else
+		    {
+		      cpu_time_now = clib_cpu_time_now ();
+		      cpu_time_now =
+			dispatch_suspended_process (vm, di, cpu_time_now);
+		    }
+		}
+	      _vec_len (nm->data_from_advancing_timing_wheel) = 0;
+	    }
+	}
+
+      /* Input nodes may have added work to the pending vector.
+         Process pending vector until there is nothing left.
+         All pending vectors will be processed from input -> output. */
+      for (i = 0; i < _vec_len (nm->pending_frames); i++)
+	cpu_time_now = dispatch_pending_node (vm, i, cpu_time_now);
+      /* Reset pending vector for next iteration. */
+      _vec_len (nm->pending_frames) = 0;
+
+      /* Pending internal nodes may resume processes. */
+      if (is_main && _vec_len (nm->data_from_advancing_timing_wheel) > 0)
+	goto processes_timing_wheel_data;
+
+      vlib_increment_main_loop_counter (vm);
+
+      /* Record time stamp in case there are no enabled nodes and above
+         calls do not update time stamp. */
+      cpu_time_now = clib_cpu_time_now ();
+    }
+}
+
+static void
+vlib_main_loop (vlib_main_t * vm)
+{				/* main-thread flavour of the shared dispatch loop */
+  vlib_main_or_worker_loop (vm, /* is_main */ 1);
+}
+
+void
+vlib_worker_loop (vlib_main_t * vm)
+{				/* worker-thread flavour of the shared dispatch loop */
+  vlib_main_or_worker_loop (vm, /* is_main */ 0);
+}
+
+vlib_main_t vlib_global_main;	/* definition of the global declared extern in main.h */
+
+static clib_error_t *
+vlib_main_configure (vlib_main_t * vm, unformat_input_t * input)
+{
+  /* Early-config handler registered below for the "vlib" section. */
+  int want_mem_trace = 0;
+  for (;;)
+    {
+      if (unformat_check_input (input) == UNFORMAT_END_OF_INPUT)
+	break;
+      if (unformat (input, "memory-trace"))
+	want_mem_trace = 1;
+      else if (unformat (input, "elog-events %d",
+			 &vm->elog_main.event_ring_size))
+	;			/* value is stored by unformat itself */
+      else if (unformat (input, "elog-post-mortem-dump"))
+	vm->elog_post_mortem_dump = 1;
+      else
+	return unformat_parse_error (input);
+    }
+
+  unformat_free (input);
+  /* Tracing is switched on only once the whole section has parsed. */
+  if (want_mem_trace)
+    clib_mem_trace (1);
+
+  return 0;
+}
+
+VLIB_EARLY_CONFIG_FUNCTION (vlib_main_configure, "vlib");
+
+static void
+dummy_queue_signal_callback (vlib_main_t * vm)
+{				/* no-op: default callback installed by vlib_main */
+}
+
+/* Main function: initialize vlib, then run the main loop until it exits. */
+int
+vlib_main (vlib_main_t * volatile vm, unformat_input_t * input)
+{
+  clib_error_t *volatile error;
+  vlib_node_main_t *nm = &vm->node_main;
+  /* Install a no-op queue-signal callback until a real one is set. */
+  vm->queue_signal_callback = dummy_queue_signal_callback;
+
+  clib_time_init (&vm->clib_time);
+
+  /* Turn on event log. */
+  if (!vm->elog_main.event_ring_size)
+    vm->elog_main.event_ring_size = 128 << 10;
+  elog_init (&vm->elog_main, vm->elog_main.event_ring_size);
+  elog_enable_disable (&vm->elog_main, 1);
+
+  /* Default name. */
+  if (!vm->name)
+    vm->name = "VLIB";
+  /* NOTE(review): reported again at done unless clib_error_report clears error. */
+  if ((error = unix_physmem_init (vm)))
+    {
+      clib_error_report (error);
+      goto done;
+    }
+
+  if ((error = vlib_buffer_main_init (vm)))
+    {
+      clib_error_report (error);
+      goto done;
+    }
+
+  if ((error = vlib_thread_init (vm)))
+    {
+      clib_error_report (error);
+      goto done;
+    }
+
+  /* Register static nodes so that init functions may use them. */
+  vlib_register_all_static_nodes (vm);
+
+  /* Set seed for random number generator.
+     Allow user to specify seed to make random sequence deterministic. */
+  if (!unformat (input, "seed %wd", &vm->random_seed))
+    vm->random_seed = clib_cpu_time_now ();
+  clib_random_buffer_init (&vm->random_buffer, vm->random_seed);
+
+  /* Initialize node graph. */
+  if ((error = vlib_node_main_init (vm)))
+    {
+      /* Arrange for graph hook up error to not be fatal when debugging. */
+      if (CLIB_DEBUG > 0)
+	clib_error_report (error);
+      else
+	goto done;
+    }
+
+  /* See unix/main.c; most likely already set up */
+  if (vm->init_functions_called == 0)
+    vm->init_functions_called = hash_create (0, /* value bytes */ 0);
+  if ((error = vlib_call_all_init_functions (vm)))
+    goto done;
+
+  /* Create default buffer free list. */
+  vlib_buffer_get_or_create_free_list (vm,
+				       VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES,
+				       "default");
+
+  nm->timing_wheel = clib_mem_alloc_aligned (sizeof (TWT (tw_timer_wheel)),
+					     CLIB_CACHE_LINE_BYTES);
+
+  vec_validate (nm->data_from_advancing_timing_wheel, 10);
+  _vec_len (nm->data_from_advancing_timing_wheel) = 0;
+
+  /* Create the process timing wheel */
+  TW (tw_timer_wheel_init) ((TWT (tw_timer_wheel) *) nm->timing_wheel,
+			    0 /* no callback */ ,
+			    10e-6 /* timer period 10us */ ,
+			    ~0 /* max expirations per call */ );
+  /* Jump target used by vlib_panic_with_error and main_loop_exit_now. */
+  switch (clib_setjmp (&vm->main_loop_exit, VLIB_MAIN_LOOP_EXIT_NONE))
+    {
+    case VLIB_MAIN_LOOP_EXIT_NONE:
+      vm->main_loop_exit_set = 1;
+      break;
+
+    case VLIB_MAIN_LOOP_EXIT_CLI:
+      goto done;
+
+    default:
+      error = vm->main_loop_error;
+      goto done;
+    }
+
+  if ((error = vlib_call_all_config_functions (vm, input, 0 /* is_early */ )))
+    goto done;
+
+  /* Call all main loop enter functions. */
+  {
+    clib_error_t *sub_error;
+    sub_error = vlib_call_all_main_loop_enter_functions (vm);
+    if (sub_error)
+      clib_error_report (sub_error);
+  }
+  /* Runs until something longjmps to main_loop_exit (see the switch above). */
+  vlib_main_loop (vm);
+
+done:
+  /* Call all exit functions. */
+  {
+    clib_error_t *sub_error;
+    sub_error = vlib_call_all_main_loop_exit_functions (vm);
+    if (sub_error)
+      clib_error_report (sub_error);
+  }
+
+  if (error)
+    clib_error_report (error);
+  /* NOTE: 0 is returned even when an error was reported above. */
+  return 0;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/main.h b/src/vlib/main.h
new file mode 100644
index 00000000..4288d6f0
--- /dev/null
+++ b/src/vlib/main.h
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * main.h: VLIB main data structure
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_main_h
+#define included_vlib_main_h
+
+#include <vppinfra/elog.h>
+#include <vppinfra/format.h>
+#include <vppinfra/longjmp.h>
+#include <vppinfra/pool.h>
+#include <vppinfra/random_buffer.h>
+#include <vppinfra/time.h>
+
+#include <pthread.h>
+
+
+/* By default turn off node/error event logging.
+   Override with -DVLIB_ELOG_MAIN_LOOP */
+#ifndef VLIB_ELOG_MAIN_LOOP
+#define VLIB_ELOG_MAIN_LOOP 0
+#endif
+
+typedef struct vlib_main_t
+{
+  /* Instruction level timing state. */
+  clib_time_t clib_time;
+
+  /* Time stamp of last node dispatch. */
+  u64 cpu_time_last_node_dispatch;
+
+  /* Time stamp when main loop was entered (time 0). */
+  u64 cpu_time_main_loop_start;
+
+  /* Incremented once for each main loop. */
+  u32 main_loop_count;
+
+  /* Counts of vectors and nodes processed this main loop. */
+  u32 main_loop_vectors_processed;
+  u32 main_loop_nodes_processed;
+
+  /* Circular buffer of input node vector counts.
+     Indexed by low bits of
+     (main_loop_count >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE). */
+  u32 vector_counts_per_main_loop[2];
+  u32 node_counts_per_main_loop[2];
+
+  /* Every so often we switch to the next counter. */
+#define VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE 7
+
+  /* Set to 1 by vlib_main once the main_loop_exit jump target is valid. */
+  u32 main_loop_exit_set;
+  /* Set e.g. by a signal handler; polled once per main loop iteration. */
+  volatile u32 main_loop_exit_now;
+  clib_longjmp_t main_loop_exit;	/* jump target to exit main loop with a code */
+#define VLIB_MAIN_LOOP_EXIT_NONE 0
+#define VLIB_MAIN_LOOP_EXIT_PANIC 1
+  /* Exit via CLI. */
+#define VLIB_MAIN_LOOP_EXIT_CLI 2
+
+  /* Error marker to use when exiting main loop. */
+  clib_error_t *main_loop_error;
+
+  /* Name for e.g. syslog. */
+  char *name;
+
+  /* Start and size of CLIB heap. */
+  void *heap_base;
+  uword heap_size;
+
+  vlib_buffer_main_t *buffer_main;
+
+  vlib_physmem_main_t physmem_main;
+
+  /* Allocate/free buffer memory for DMA transfers, descriptor rings, etc.
+     buffer memory is guaranteed to be cache-aligned. */
+
+  clib_error_t *(*os_physmem_region_alloc) (struct vlib_main_t * vm,
+					    char *name, u32 size,
+					    u8 numa_node, u32 flags,
+					    vlib_physmem_region_index_t *
+					    idx);
+
+  void (*os_physmem_region_free) (struct vlib_main_t * vm,
+				  vlib_physmem_region_index_t idx);
+
+  void *(*os_physmem_alloc_aligned) (struct vlib_main_t * vm,
+				     vlib_physmem_region_index_t idx,
+				     uword n_bytes, uword alignment);
+  void (*os_physmem_free) (struct vlib_main_t * vm,
+			   vlib_physmem_region_index_t idx, void *x);
+
+  /* Node graph main structure. */
+  vlib_node_main_t node_main;
+
+  /* Command line interface. */
+  vlib_cli_main_t cli_main;
+
+  /* Packet trace buffer. */
+  vlib_trace_main_t trace_main;
+
+  /* Error handling. */
+  vlib_error_main_t error_main;
+
+  /* Punt packets to underlying operating system for when fast switching
+     code does not know what to do. */
+  void (*os_punt_frame) (struct vlib_main_t * vm,
+			 struct vlib_node_runtime_t * node,
+			 vlib_frame_t * frame);
+
+  /* Multicast distribution.  Set to zero for MC disabled. */
+  mc_main_t *mc_main;
+
+  /* Stream index to use for distribution when MC is enabled. */
+  u32 mc_stream_index;
+
+  vlib_one_time_waiting_process_t *procs_waiting_for_mc_stream_join;
+
+  /* Event logger. */
+  elog_main_t elog_main;
+
+  /* Node call and return event types. */
+  elog_event_type_t *node_call_elog_event_types;
+  elog_event_type_t *node_return_elog_event_types;
+
+  elog_event_type_t *error_elog_event_types;
+
+  /* Seed for random number generator. */
+  uword random_seed;
+
+  /* Buffer of random data for various uses. */
+  clib_random_buffer_t random_buffer;
+
+  /* Hash table to record which init functions have been called. */
+  uword *init_functions_called;
+
+  /* to compare with node runtime */
+  u32 thread_index;
+
+  void **mbuf_alloc_list;
+
+  /* List of init functions to call, setup by constructors */
+  _vlib_init_function_list_elt_t *init_function_registrations;
+  _vlib_init_function_list_elt_t *worker_init_function_registrations;
+  _vlib_init_function_list_elt_t *main_loop_enter_function_registrations;
+  _vlib_init_function_list_elt_t *main_loop_exit_function_registrations;
+  _vlib_init_function_list_elt_t *api_init_function_registrations;
+  vlib_config_function_runtime_t *config_function_registrations;
+  mc_serialize_msg_t *mc_msg_registrations;	/* mc_main is a pointer... */
+
+  /* control-plane API queue signal pending, length indication */
+  volatile u32 queue_signal_pending;
+  volatile u32 api_queue_nonempty;
+  void (*queue_signal_callback) (struct vlib_main_t *);
+  u8 **argv;
+
+  /* debugging */
+  volatile int parked_at_barrier;
+
+  /* Attempt to do a post-mortem elog dump */
+  int elog_post_mortem_dump;
+
+  /*
+   * Need to call vlib_worker_thread_node_runtime_update before
+   * releasing worker thread barrier. Only valid in vlib_global_main.
+   */
+  int need_vlib_worker_thread_node_runtime_update;
+
+  /*
+   * Barrier epoch - Set to current time, each time barrier_sync or
+   * barrier_release is called with zero recursion.
+   */
+  f64 barrier_epoch;
+
+  /* Earliest barrier can be closed again */
+  f64 barrier_no_close_before;
+
+} vlib_main_t;
+
+/* Global main structure. */
+extern vlib_main_t vlib_global_main;
+
+void vlib_worker_loop (vlib_main_t * vm);
+
+always_inline f64
+vlib_time_now (vlib_main_t * vm)
+{				/* clib_time_now on this vm's clib_time state */
+  return clib_time_now (&vm->clib_time);
+}
+
+always_inline f64
+vlib_time_now_ticks (vlib_main_t * vm, u64 n)
+{				/* passes n straight to clib_time_now_internal */
+  return clib_time_now_internal (&vm->clib_time, n);
+}
+
+/* Spin on vlib_time_now until `wait` has elapsed since the first reading. */
+always_inline void
+vlib_time_wait (vlib_main_t * vm, f64 wait)
+{
+  f64 now = vlib_time_now (vm);
+  f64 deadline = now + wait;
+  for (; now < deadline; now = vlib_time_now (vm))
+    ;
+}
+
+/* Run body once and log the vlib_time_now delta through clib_warning. */
+#define vlib_time_code(vm,body)			\
+do {						\
+    f64 _t[2];					\
+    _t[0] = vlib_time_now (vm);			\
+    do { body; } while (0);			\
+    _t[1] = vlib_time_now (vm);			\
+    clib_warning ("%.7e", _t[1] - _t[0]);	\
+} while (0)
+/* Suspend in suspend_time steps until test is true or timeout_time is used up; yields test's last value. */
+#define vlib_wait_with_timeout(vm,suspend_time,timeout_time,test)	\
+({									\
+    uword __vlib_wait_with_timeout = 0;					\
+    f64 __vlib_wait_time = 0;						\
+    while (! (__vlib_wait_with_timeout = (test))			\
+	   && __vlib_wait_time < (timeout_time))			\
+      {									\
+	vlib_process_suspend (vm, suspend_time);			\
+	__vlib_wait_time += suspend_time;				\
+      }									\
+    __vlib_wait_with_timeout;						\
+})
+
+always_inline void
+vlib_panic_with_error (vlib_main_t * vm, clib_error_t * error)
+{
+  vm->main_loop_error = error;	/* read back by vlib_main after the jump */
+  clib_longjmp (&vm->main_loop_exit, VLIB_MAIN_LOOP_EXIT_PANIC);
+}
+
+#define vlib_panic_with_msg(vm,args...) \
+  vlib_panic_with_error (vm, clib_error_return (0, args))	/* args go to clib_error_return */
+
+always_inline void
+vlib_panic (vlib_main_t * vm)
+{				/* panic exit with no error object attached */
+  vlib_panic_with_error (vm, 0);
+}
+
+always_inline u32
+vlib_vector_input_stats_index (vlib_main_t * vm, word delta)
+{				/* stats slot for the current epoch, offset by delta */
+  u32 epoch = vm->main_loop_count >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE;
+  uword n_slots = ARRAY_LEN (vm->vector_counts_per_main_loop);
+  ASSERT (is_pow2 (n_slots));
+  return (epoch + delta) & (n_slots - 1);
+}
+
+/* Input rate estimate: the previous stats slot's vector total
+   shifted right by VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE, i.e. an
+   integer per-main-loop average. */
+always_inline u32
+vlib_last_vectors_per_main_loop (vlib_main_t * vm)
+{
+  u32 slot = vlib_vector_input_stats_index (vm, /* delta */ -1);
+  return vm->vector_counts_per_main_loop[slot] >>
+    VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE;
+}
+
+/* Same average as vlib_last_vectors_per_main_loop, without truncation. */
+always_inline f64
+vlib_last_vectors_per_main_loop_as_f64 (vlib_main_t * vm)
+{
+  u32 slot = vlib_vector_input_stats_index (vm, /* delta */ -1);
+  f64 total = (f64) vm->vector_counts_per_main_loop[slot];
+  return total / (f64) (1 << VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE);
+}
+
+/* Vectors divided by nodes for the previous stats slot; 0 if no nodes. */
+always_inline f64
+vlib_last_vector_length_per_node (vlib_main_t * vm)
+{
+  u32 slot = vlib_vector_input_stats_index (vm, /* delta */ -1);
+  u32 n_nodes = vm->node_counts_per_main_loop[slot];
+  f64 n_vectors = (f64) vm->vector_counts_per_main_loop[slot];
+  return n_nodes ? n_vectors / (f64) n_nodes : 0;
+}
+
+extern u32 wraps;
+
+always_inline void
+vlib_increment_main_loop_counter (vlib_main_t * vm)
+{
+  u32 i, c, n, v, is_wrap;
+  /* c is the loop count before the increment. */
+  c = vm->main_loop_count++;
+  /* True when the low VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE bits of c are 0. */
+  is_wrap = (c & pow2_mask (VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE)) == 0;
+
+  if (is_wrap)
+    wraps++;
+  /* On wrap the slot one ahead is selected and restarted from zero below. */
+  i = vlib_vector_input_stats_index (vm, /* delta */ is_wrap);
+
+  v = is_wrap ? 0 : vm->vector_counts_per_main_loop[i];
+  n = is_wrap ? 0 : vm->node_counts_per_main_loop[i];
+  /* Fold this loop's counters into the slot and reset them. */
+  v += vm->main_loop_vectors_processed;
+  n += vm->main_loop_nodes_processed;
+  vm->main_loop_vectors_processed = 0;
+  vm->main_loop_nodes_processed = 0;
+  vm->vector_counts_per_main_loop[i] = v;
+  vm->node_counts_per_main_loop[i] = n;
+  /* Honour an exit request by jumping to the main_loop_exit target. */
+  if (PREDICT_FALSE (vm->main_loop_exit_now))
+    clib_longjmp (&vm->main_loop_exit, VLIB_MAIN_LOOP_EXIT_CLI);
+}
+
+always_inline void vlib_set_queue_signal_callback
+  (vlib_main_t * vm, void (*fp) (vlib_main_t *))
+{				/* fp replaces the callback stored in vm */
+  vm->queue_signal_callback = fp;
+}
+
+/* Main routine. */
+int vlib_main (vlib_main_t * vm, unformat_input_t * input);
+
+/* Thread stacks, for os_get_thread_index */
+extern u8 **vlib_thread_stacks;
+
+/* Number of thread stacks that the application needs */
+u32 vlib_app_num_thread_stacks_needed (void) __attribute__ ((weak));
+
+extern void vlib_node_sync_stats (vlib_main_t * vm, vlib_node_t * n);
+
+#endif /* included_vlib_main_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/mc.c b/src/vlib/mc.c
new file mode 100644
index 00000000..8fde0913
--- /dev/null
+++ b/src/vlib/mc.c
@@ -0,0 +1,2609 @@
+/*
+ * mc.c: vlib reliable sequenced multicast distributed applications
+ *
+ * Copyright (c) 2010 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+
+/*
+ * 1 to enable msg id training wheels, which are useful for tracking
+ * down catchup and/or partitioned network problems
+ */
+#define MSG_ID_DEBUG 0
+
+static format_function_t format_mc_stream_state;
+
+/*
+ * Return the event-log string id for peer_id.
+ *
+ * Ids are cached in m->elog_id_by_peer_id (created on first use); on a
+ * cache miss the peer id is formatted with the transport's format_peer_id
+ * function, registered through elog_string, and remembered.
+ */
+static u32
+elog_id_for_peer_id (mc_main_t * m, u64 peer_id)
+{
+  mhash_t *cache = &m->elog_id_by_peer_id;
+  uword *hit, id;
+
+  /* Lazily initialize the cache. */
+  if (!cache->hash)
+    mhash_init (cache, sizeof (uword), sizeof (mc_peer_id_t));
+
+  hit = mhash_get (cache, &peer_id);
+  if (hit)
+    return hit[0];
+
+  id = elog_string (m->elog_main, "%U", m->transport.format_peer_id, peer_id);
+  mhash_set (cache, &peer_id, id, /* old_value */ 0);
+  return id;
+}
+
+/*
+ * Return the event-log string id for msg_name.
+ *
+ * Ids are cached in the string hash m->elog_id_by_msg_name (created on
+ * first use).  On a miss the name is registered through elog_string and
+ * stored under a private, NUL-terminated copy of the name.
+ */
+static u32
+elog_id_for_msg_name (mc_main_t * m, char *msg_name)
+{
+  uword *table = m->elog_id_by_msg_name;
+  uword *hit, id;
+  u8 *key;
+
+  if (!table)
+    {
+      table = hash_create_string (0, sizeof (uword));
+      m->elog_id_by_msg_name = table;
+    }
+
+  hit = hash_get_mem (table, msg_name);
+  if (hit)
+    return hit[0];
+
+  id = elog_string (m->elog_main, "%s", msg_name);
+
+  /* Key copy carries an explicit trailing NUL byte. */
+  key = format (0, "%s%c", msg_name, 0);
+  hash_set_mem (table, key, id);
+
+  /* Store the table pointer back after the insert (presumably
+     hash_set_mem may reassign it when the table grows). */
+  m->elog_id_by_msg_name = table;
+
+  return id;
+}
+
+/*
+ * Record a "tx-msg" event (stream id, local sequence number, attempt
+ * number) in m->elog_main.  Does nothing unless MC_EVENT_LOGGING > 0.
+ */
+static void
+elog_tx_msg (mc_main_t * m, u32 stream_id, u32 local_sequence,
+	     u32 retry_count)
+{
+  if (MC_EVENT_LOGGING > 0)
+    {
+      /* *INDENT-OFF* */
+      ELOG_TYPE_DECLARE (e) =
+        {
+          .format = "tx-msg: stream %d local seq %d attempt %d",
+          .format_args = "i4i4i4",
+        };
+      /* *INDENT-ON* */
+      /* Event payload: three u32 fields, in the order the format uses them. */
+      struct
+      {
+	u32 stream_id, local_sequence, retry_count;
+      } *ed;
+      ed = ELOG_DATA (m->elog_main, e);
+      ed->stream_id = stream_id;
+      ed->local_sequence = local_sequence;
+      ed->retry_count = retry_count;
+    }
+}
+
+/*
+ * seq_cmp
+ * correctly compare two unsigned sequence numbers.
+ * This function works so long as x and y are within 2**(n-1) of each
+ * other, where n = bits(x, y).
+ *
+ * Magic decoder ring:
+ * seq_cmp == 0 => x and y are equal
+ * seq_cmp < 0 => x is "in the past" with respect to y
+ * seq_cmp > 0 => x is "in the future" with respect to y
+ *
+ * The difference is computed in unsigned arithmetic, which wraps modulo
+ * 2**32, and only then reinterpreted as signed.  Subtracting two values
+ * already converted to i32 could overflow, which is undefined behavior.
+ */
+always_inline i32
+mc_seq_cmp (u32 x, u32 y)
+{
+  return (i32) (x - y);
+}
+
+/*
+ * Allocate one vlib buffer, set its current_length to n_bytes and return a
+ * pointer to the start of its data area.  The buffer index is stored in
+ * *bi_return.
+ *
+ * NOTE(review): allocation failure is only caught by ASSERT; if ASSERT
+ * compiles away in non-debug builds, bi would be used uninitialized when
+ * vlib_buffer_alloc returns 0 -- confirm.
+ */
+void *
+mc_get_vlib_buffer (vlib_main_t * vm, u32 n_bytes, u32 * bi_return)
+{
+  u32 n_alloc, bi;
+  vlib_buffer_t *b;
+
+  n_alloc = vlib_buffer_alloc (vm, &bi, 1);
+  ASSERT (n_alloc == 1);
+
+  b = vlib_get_buffer (vm, bi);
+  b->current_length = n_bytes;
+  *bi_return = bi;
+  return (void *) b->data;
+}
+
+/*
+ * Mark the peer at pool index `index' of stream s as gone.
+ *
+ * Calls the stream's peer_died callback when one is configured and
+ * notify_application is non-zero, then clears the peer's bit in
+ * s->all_peer_bitmap.  The peer's pool and hash entries are deliberately
+ * kept so its sequence number state survives.
+ */
+static void
+delete_peer_with_index (mc_main_t * mcm, mc_stream_t * s,
+			uword index, int notify_application)
+{
+  mc_stream_peer_t *p = pool_elt_at_index (s->peers, index);
+  ASSERT (p != 0);
+  if (s->config.peer_died && notify_application)
+    s->config.peer_died (mcm, s, p->id);
+
+  /* p - s->peers is the peer's pool index, i.e. its bit position. */
+  s->all_peer_bitmap = clib_bitmap_andnoti (s->all_peer_bitmap, p - s->peers);
+
+  if (MC_EVENT_LOGGING > 0)
+    {
+      /* *INDENT-OFF* */
+      ELOG_TYPE_DECLARE (e) =
+        {
+          .format = "delete peer %s from all_peer_bitmap",
+          .format_args = "T4",
+        };
+      /* *INDENT-ON* */
+      struct
+      {
+	u32 peer;
+      } *ed = 0;
+
+      ed = ELOG_DATA (mcm->elog_main, e);
+      ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64);
+    }
+  /* Do not delete the pool / hash table entries, or we lose sequence number state */
+}
+
+/*
+ * Look up the peer with the given id on stream s, creating it if needed.
+ *
+ * A newly created peer is zeroed, gets last_sequence_received = ~0 and is
+ * entered into s->peer_index_by_id.  Whether found or created, the peer's
+ * bit is (re)set in s->all_peer_bitmap before returning.
+ *
+ * created: optional out-parameter; set to 1 only when a new peer was
+ * created.  It is never written on the lookup path, so callers must
+ * initialize it themselves.
+ */
+static mc_stream_peer_t *
+get_or_create_peer_with_id (mc_main_t * mcm,
+			    mc_stream_t * s, mc_peer_id_t id, int *created)
+{
+  uword *q = mhash_get (&s->peer_index_by_id, &id);
+  mc_stream_peer_t *p;
+
+  /* Known peer: q[0] is its pool index. */
+  if (q)
+    {
+      p = pool_elt_at_index (s->peers, q[0]);
+      goto done;
+    }
+
+  pool_get (s->peers, p);
+  memset (p, 0, sizeof (p[0]));
+  p->id = id;
+  p->last_sequence_received = ~0;
+  mhash_set (&s->peer_index_by_id, &id, p - s->peers, /* old_value */ 0);
+  if (created)
+    *created = 1;
+
+done:
+  if (MC_EVENT_LOGGING > 0)
+    {
+      /* *INDENT-OFF* */
+      ELOG_TYPE_DECLARE (e) =
+        {
+          .format = "get_or_create %s peer %s stream %d seq %d",
+          .format_args = "t4T4i4i4",
+          .n_enum_strings = 2,
+          .enum_strings = {
+            "old", "new",
+          },
+        };
+      /* *INDENT-ON* */
+      struct
+      {
+	u32 is_new, peer, stream_index, rx_sequence;
+      } *ed = 0;
+
+      ed = ELOG_DATA (mcm->elog_main, e);
+      /* q is still the original lookup result: non-NULL means "old". */
+      ed->is_new = q ? 0 : 1;
+      ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64);
+      ed->stream_index = s->index;
+      ed->rx_sequence = p->last_sequence_received;
+    }
+  /* $$$$ Enable or reenable this peer */
+  s->all_peer_bitmap = clib_bitmap_ori (s->all_peer_bitmap, p - s->peers);
+  return p;
+}
+
+/*
+ * If the stream's retry pool holds fewer entries than the configured
+ * window size, signal every process waiting for the window to open and
+ * empty the waiter list.  Otherwise do nothing.
+ */
+static void
+maybe_send_window_open_event (vlib_main_t * vm, mc_stream_t * stream)
+{
+  vlib_one_time_waiting_process_t *waiter;
+
+  if (pool_elts (stream->retry_pool) < stream->config.window_size)
+    {
+      vec_foreach (waiter, stream->procs_waiting_for_open_window)
+	vlib_signal_one_time_waiting_process (vm, waiter);
+
+      /* Reset the length only; the vector itself is kept. */
+      if (stream->procs_waiting_for_open_window)
+	_vec_len (stream->procs_waiting_for_open_window) = 0;
+    }
+}
+
+/*
+ * Retire retry record r of stream s.
+ *
+ * The record's buffer index and local sequence number are appended to
+ * s->retired_fifo.  Before appending, once the fifo already holds at least
+ * 2 * window_size entries, the oldest retired entry is removed and its
+ * buffer freed.  r's unacked bitmap is emptied and its buffer index
+ * poisoned; r itself is not returned to the retry pool here.
+ */
+static void
+mc_retry_free (mc_main_t * mcm, mc_stream_t * s, mc_retry_t * r)
+{
+  mc_retry_t record, *retp;
+
+  /* Empty the bitmap but keep its vector attached to r. */
+  if (r->unacked_by_peer_bitmap)
+    _vec_len (r->unacked_by_peer_bitmap) = 0;
+
+  /* Bound the retired fifo: drop the oldest entry and free its buffer. */
+  if (clib_fifo_elts (s->retired_fifo) >= 2 * s->config.window_size)
+    {
+      clib_fifo_sub1 (s->retired_fifo, record);
+      vlib_buffer_free_one (mcm->vlib_main, record.buffer_index);
+    }
+
+  clib_fifo_add2 (s->retired_fifo, retp);
+
+  /* Only these two fields of the new fifo element are filled in. */
+  retp->buffer_index = r->buffer_index;
+  retp->local_sequence = r->local_sequence;
+
+  r->buffer_index = ~0;		/* poison buffer index in this retry */
+}
+
+/*
+ * Search stream s's retired fifo for the message with the given local
+ * sequence number and, if found, retransmit its buffer on the
+ * MC_TRANSPORT_USER_REQUEST_TO_RELAY channel.  Only the first match is
+ * resent.  A failed search is reported through the event log only (when
+ * MC_EVENT_LOGGING > 0).
+ */
+static void
+mc_resend_retired (mc_main_t * mcm, mc_stream_t * s, u32 local_sequence)
+{
+  mc_retry_t *retry;
+
+  if (MC_EVENT_LOGGING > 0)
+    {
+      /* *INDENT-OFF* */
+      ELOG_TYPE_DECLARE (e) =
+        {
+          .format = "resend-retired: search for local seq %d",
+          .format_args = "i4",
+        };
+      /* *INDENT-ON* */
+      struct
+      {
+	u32 local_sequence;
+      } *ed;
+      ed = ELOG_DATA (mcm->elog_main, e);
+      ed->local_sequence = local_sequence;
+    }
+
+  /* *INDENT-OFF* */
+  clib_fifo_foreach (retry, s->retired_fifo,
+  ({
+    if (retry->local_sequence == local_sequence)
+      {
+        /* -13 is passed as the attempt count for retired resends. */
+        elog_tx_msg (mcm, s->index, retry-> local_sequence, -13);
+        mcm->transport.tx_buffer (mcm->transport.opaque,
+                                  MC_TRANSPORT_USER_REQUEST_TO_RELAY,
+                                  retry->buffer_index);
+        /* Returns from mc_resend_retired itself, skipping the log below. */
+        return;
+      }
+  }));
+  /* *INDENT-ON* */
+
+  /* Reached only when no retired entry matched. */
+  if (MC_EVENT_LOGGING > 0)
+    {
+      /* *INDENT-OFF* */
+      ELOG_TYPE_DECLARE (e) =
+        {
+          .format = "resend-retired: FAILED search for local seq %d",
+          .format_args = "i4",
+        };
+      /* *INDENT-ON* */
+      struct
+      {
+	u32 local_sequence;
+      } *ed;
+      ed = ELOG_DATA (mcm->elog_main, e);
+      ed->local_sequence = local_sequence;
+    }
+}
+
+/*
+ * Give up on retry record r of the given stream.
+ *
+ * Every peer whose bit is still set in r->unacked_by_peer_bitmap (i.e. it
+ * never acked this message) is added to dead_peer_bitmap.  The record is
+ * then removed from stream->retry_index_by_local_sequence and retired via
+ * mc_retry_free.  The caller remains responsible for unlinking r from the
+ * retry pool.
+ *
+ * Returns the (possibly reallocated) dead_peer_bitmap.
+ */
+static uword *
+delete_retry_fifo_elt (mc_main_t * mcm,
+		       mc_stream_t * stream,
+		       mc_retry_t * r, uword * dead_peer_bitmap)
+{
+  mc_stream_peer_t *p;
+
+  /* *INDENT-OFF* */
+  pool_foreach (p, stream->peers, ({
+    uword pi = p - stream->peers;
+    /* A peer counts as alive when it has acked, i.e. its bit is clear. */
+    uword is_alive = 0 == clib_bitmap_get (r->unacked_by_peer_bitmap, pi);
+
+    if (! is_alive)
+      dead_peer_bitmap = clib_bitmap_ori (dead_peer_bitmap, pi);
+
+    if (MC_EVENT_LOGGING > 0)
+      {
+        ELOG_TYPE_DECLARE (e) = {
+          .format = "delete_retry_fifo_elt: peer %s is %s",
+          .format_args = "T4t4",
+          .n_enum_strings = 2,
+          /* Indexed by is_alive: 0 => "dead", 1 => "alive". */
+          .enum_strings = { "dead", "alive", },
+        };
+        struct { u32 peer, is_alive; } * ed;
+        ed = ELOG_DATA (mcm->elog_main, e);
+        ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64);
+        ed->is_alive = is_alive;
+      }
+  }));
+  /* *INDENT-ON* */
+
+  hash_unset (stream->retry_index_by_local_sequence, r->local_sequence);
+  mc_retry_free (mcm, stream, r);
+
+  return dead_peer_bitmap;
+}
+
+/* Retry record linked before r in s's retry list, or 0 if r is the head. */
+always_inline mc_retry_t *
+prev_retry (mc_stream_t * s, mc_retry_t * r)
+{
+  if (r->prev_index == ~0)
+    return 0;
+
+  return pool_elt_at_index (s->retry_pool, r->prev_index);
+}
+
+/* Retry record linked after r in s's retry list, or 0 if r is the tail. */
+always_inline mc_retry_t *
+next_retry (mc_stream_t * s, mc_retry_t * r)
+{
+  if (r->next_index == ~0)
+    return 0;
+
+  return pool_elt_at_index (s->retry_pool, r->next_index);
+}
+
+/*
+ * Unlink retry record r from stream s's doubly linked retry list (updating
+ * the head/tail indices when r sits at either end) and return its slot to
+ * s->retry_pool.
+ */
+always_inline void
+remove_retry_from_pool (mc_stream_t * s, mc_retry_t * r)
+{
+  mc_retry_t *before = prev_retry (s, r);
+  mc_retry_t *after = next_retry (s, r);
+
+  /* Forward link: the predecessor, or the list head, now skips r. */
+  if (before == 0)
+    s->retry_head_index = r->next_index;
+  else
+    before->next_index = r->next_index;
+
+  /* Backward link: the successor, or the list tail, now skips r. */
+  if (after == 0)
+    s->retry_tail_index = r->prev_index;
+  else
+    after->prev_index = r->prev_index;
+
+  pool_put_index (s->retry_pool, r - s->retry_pool);
+}
+
+/*
+ * Walk stream s's retry list and act on every entry whose retry interval
+ * has expired:
+ *   - entries that exceeded config.retry_limit are abandoned; peers that
+ *     never acked them are collected in a dead-peer bitmap;
+ *   - all other expired entries are retransmitted.
+ * Afterwards, processes waiting for an open window are signalled if the
+ * window has room, and every peer found dead is deleted (with application
+ * notification) and cleared from the unacked bitmaps of remaining retries.
+ */
+static void
+check_retry (mc_main_t * mcm, mc_stream_t * s)
+{
+  mc_retry_t *r;
+  vlib_main_t *vm = mcm->vlib_main;
+  f64 now = vlib_time_now (vm);
+  uword *dead_peer_bitmap = 0;
+  u32 ri, ri_next;
+
+  for (ri = s->retry_head_index; ri != ~0; ri = ri_next)
+    {
+      r = pool_elt_at_index (s->retry_pool, ri);
+      /* Fetch the successor first: r may be removed from the list below. */
+      ri_next = r->next_index;
+
+      /* Not yet due for a retry. */
+      if (now < r->sent_at + s->config.retry_interval)
+	continue;
+
+      r->n_retries += 1;
+      if (r->n_retries > s->config.retry_limit)
+	{
+	  /* Too many attempts: retire the entry and note unresponsive peers. */
+	  dead_peer_bitmap =
+	    delete_retry_fifo_elt (mcm, s, r, dead_peer_bitmap);
+	  remove_retry_from_pool (s, r);
+	}
+      else
+	{
+	  if (MC_EVENT_LOGGING > 0)
+	    {
+	      mc_stream_peer_t *p;
+
+              /* *INDENT-OFF* */
+	      ELOG_TYPE_DECLARE (t) =
+                {
+                  .format = "resend local seq %d attempt %d",
+                  .format_args = "i4i4",
+                };
+              /* *INDENT-ON* */
+
+              /* *INDENT-OFF* */
+	      /* One event per peer that has not acked this message yet. */
+	      pool_foreach (p, s->peers, ({
+		if (clib_bitmap_get (r->unacked_by_peer_bitmap, p - s->peers))
+		  {
+		    ELOG_TYPE_DECLARE (ev) = {
+		      .format = "resend: needed by peer %s local seq %d",
+		      .format_args = "T4i4",
+		    };
+		    struct { u32 peer, rx_sequence; } * ed;
+		    ed = ELOG_DATA (mcm->elog_main, ev);
+		    ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64);
+		    ed->rx_sequence = r->local_sequence;
+		  }
+	      }));
+              /* *INDENT-ON* */
+
+	      struct
+	      {
+		u32 sequence;
+		u32 trail;
+	      } *ed;
+	      ed = ELOG_DATA (mcm->elog_main, t);
+	      ed->sequence = r->local_sequence;
+	      ed->trail = r->n_retries;
+	    }
+
+	  /* Restart the retry timer for this entry and retransmit. */
+	  r->sent_at = vlib_time_now (vm);
+	  s->stats.n_retries += 1;
+
+	  elog_tx_msg (mcm, s->index, r->local_sequence, r->n_retries);
+
+	  mcm->transport.tx_buffer
+	    (mcm->transport.opaque,
+	     MC_TRANSPORT_USER_REQUEST_TO_RELAY, r->buffer_index);
+	}
+    }
+
+  /* Abandoned entries may have made room in the send window. */
+  maybe_send_window_open_event (mcm->vlib_main, s);
+
+  /* Delete any dead peers we've found. */
+  if (!clib_bitmap_is_zero (dead_peer_bitmap))
+    {
+      uword i;
+
+      /* *INDENT-OFF* */
+      clib_bitmap_foreach (i, dead_peer_bitmap, ({
+	delete_peer_with_index (mcm, s, i, /* notify_application */ 1);
+
+	/* Delete any references to just deleted peer in retry pool. */
+	pool_foreach (r, s->retry_pool, ({
+	  r->unacked_by_peer_bitmap =
+	    clib_bitmap_andnoti (r->unacked_by_peer_bitmap, i);
+	}));
+      }));
+/* *INDENT-ON* */
+      clib_bitmap_free (dead_peer_bitmap);
+    }
+}
+
+/*
+ * Fetch the mc_main_t pointer that is stored at the start of the node's
+ * runtime data.
+ */
+always_inline mc_main_t *
+mc_node_get_main (vlib_node_runtime_t * node)
+{
+  mc_main_t **slot = (void *) node->runtime_data;
+
+  return *slot;
+}
+
+/*
+ * Process node body: suspends for 1.0 per pass (presumably seconds), then
+ * runs check_retry on every stream whose state is not
+ * MC_STREAM_STATE_invalid.  Never returns in practice.
+ */
+static uword
+mc_retry_process (vlib_main_t * vm,
+		  vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+  mc_main_t *mcm = mc_node_get_main (node);
+  mc_stream_t *stream;
+
+  for (;;)
+    {
+      vlib_process_suspend (vm, 1.0);
+
+      vec_foreach (stream, mcm->stream_vector)
+      {
+	if (stream->state == MC_STREAM_STATE_invalid)
+	  continue;
+
+	check_retry (mcm, stream);
+      }
+    }
+
+  return 0;			/* not likely */
+}
+
+/*
+ * Build and transmit a join-or-leave request for stream_index.
+ *
+ * The message is zeroed, stamped with our ack peer id, byte swapped by
+ * mc_byte_swap_msg_join_or_leave_request, and sent on the
+ * MC_TRANSPORT_JOIN channel.  is_join is non-zero for a join request and
+ * zero for a leave.
+ */
+static void
+send_join_or_leave_request (mc_main_t * mcm, u32 stream_index, u32 is_join)
+{
+  vlib_main_t *vm = mcm->vlib_main;
+  mc_msg_join_or_leave_request_t *mp;
+  u32 bi;
+
+  mp = mc_get_vlib_buffer (vm, sizeof (mp[0]), &bi);
+  memset (mp, 0, sizeof (*mp));
+  mp->type = MC_MSG_TYPE_join_or_leave_request;
+  mp->peer_id = mcm->transport.our_ack_peer_id;
+  mp->stream_index = stream_index;
+  mp->is_join = is_join;
+
+  mc_byte_swap_msg_join_or_leave_request (mp);
+
+  /*
+   * These msgs are unnumbered, unordered so send on the from-relay
+   * channel.
+   */
+  mcm->transport.tx_buffer (mcm->transport.opaque, MC_TRANSPORT_JOIN, bi);
+}
+
+/*
+ * Process node body that ages pending stream joins.
+ *
+ * On each pass (the process suspends for .5 between passes), while
+ * mcm->joins_in_progress is non-zero, every stream in the join_in_progress
+ * state is examined:
+ *   - past its join_timeout, the stream is declared ready, we are added as
+ *     a peer of the stream, processes waiting for the join are signalled,
+ *     and joins_in_progress is decremented;
+ *   - otherwise the join request is sent again, and the timeout is pushed
+ *     out when our relay state is MC_RELAY_STATE_SLAVE.
+ */
+static uword
+mc_join_ager_process (vlib_main_t * vm,
+		      vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+  mc_main_t *mcm = mc_node_get_main (node);
+
+  while (1)
+    {
+      if (mcm->joins_in_progress)
+	{
+	  mc_stream_t *s;
+	  vlib_one_time_waiting_process_t *p;
+	  f64 now = vlib_time_now (vm);
+
+	  vec_foreach (s, mcm->stream_vector)
+	  {
+	    if (s->state != MC_STREAM_STATE_join_in_progress)
+	      continue;
+
+	    if (now > s->join_timeout)
+	      {
+		/* Nobody answered in time: treat the join as complete. */
+		s->state = MC_STREAM_STATE_ready;
+
+		if (MC_EVENT_LOGGING > 0)
+		  {
+                    /* *INDENT-OFF* */
+		    ELOG_TYPE_DECLARE (e) =
+                      {
+                        .format = "stream %d join timeout",
+                      };
+                    /* *INDENT-ON* */
+		    ELOG (mcm->elog_main, e, s->index);
+		  }
+		/* Make sure that this app instance exists as a stream peer,
+		   or we may answer a catchup request with a NULL
+		   all_peer_bitmap... */
+		(void) get_or_create_peer_with_id
+		  (mcm, s, mcm->transport.our_ack_peer_id, /* created */ 0);
+
+		/* Wake everyone waiting for this join, then empty the list. */
+		vec_foreach (p, s->procs_waiting_for_join_done)
+		  vlib_signal_one_time_waiting_process (vm, p);
+		if (s->procs_waiting_for_join_done)
+		  _vec_len (s->procs_waiting_for_join_done) = 0;
+
+		mcm->joins_in_progress--;
+		ASSERT (mcm->joins_in_progress >= 0);
+	      }
+	    else
+	      {
+		/* Resend join request which may have been lost. */
+		send_join_or_leave_request (mcm, s->index, 1 /* is_join */ );
+
+		/* We're *not* alone, retry for as long as it takes */
+		if (mcm->relay_state == MC_RELAY_STATE_SLAVE)
+		  s->join_timeout = vlib_time_now (vm) + 2.0;
+
+
+		if (MC_EVENT_LOGGING > 0)
+		  {
+                    /* *INDENT-OFF* */
+		    ELOG_TYPE_DECLARE (e) =
+                      {
+                        .format = "stream %d resend join request",
+                      };
+                    /* *INDENT-ON* */
+		    ELOG (mcm->elog_main, e, s->index);
+		  }
+	      }
+	  }
+	}
+
+      vlib_process_suspend (vm, .5);
+    }
+
+  return 0;			/* not likely */
+}
+
+/*
+ * Serializer for the register-stream-name message: takes a single
+ * char * argument from va (the stream name) and writes it as a C string.
+ */
+static void
+serialize_mc_register_stream_name (serialize_main_t * m, va_list * va)
+{
+  char *stream_name;
+
+  stream_name = va_arg (*va, char *);
+  serialize_cstring (m, stream_name);
+}
+
+/*
+ * Copy stream name v into the fixed-size event-log field buf.
+ *
+ * v is treated as a NUL-terminated C string, so both vector-backed names
+ * and plain string literals are handled.  At most n_buf_bytes - 1
+ * characters are copied; the rest of buf is zero-filled so the field is
+ * always terminated and carries no stale bytes.  A NULL v yields an empty
+ * string; a non-positive n_buf_bytes leaves buf untouched.
+ */
+static void
+elog_stream_name (char *buf, int n_buf_bytes, char *v)
+{
+  int i = 0;
+
+  if (n_buf_bytes <= 0)
+    return;
+
+  if (v)
+    for (; i < n_buf_bytes - 1 && v[i] != 0; i++)
+      buf[i] = v[i];
+
+  /* Terminate and clear everything after the copied characters. */
+  for (; i < n_buf_bytes; i++)
+    buf[i] = 0;
+}
+
+/*
+ * Unserializer for the register-stream-name message.
+ *
+ * Reads the stream name.  If a stream with that name already exists the
+ * name is freed and nothing else happens.  Otherwise a new stream is
+ * appended to mcm->stream_vector in the name_known state, taking ownership
+ * of the name (it also becomes the key in mcm->stream_index_by_name), and
+ * every process waiting for this name is signalled.
+ */
+static void
+unserialize_mc_register_stream_name (serialize_main_t * m, va_list * va)
+{
+  mc_main_t *mcm = va_arg (*va, mc_main_t *);
+  char *name;
+  mc_stream_t *s;
+  uword *p;
+
+  unserialize_cstring (m, &name);
+
+  if ((p = hash_get_mem (mcm->stream_index_by_name, name)))
+    {
+      if (MC_EVENT_LOGGING > 0)
+	{
+          /* *INDENT-OFF* */
+	  ELOG_TYPE_DECLARE (e) =
+            {
+              .format = "stream index %d already named %s",
+              .format_args = "i4s16",
+            };
+          /* *INDENT-ON* */
+	  struct
+	  {
+	    u32 stream_index;
+	    char name[16];
+	  } *ed;
+	  ed = ELOG_DATA (mcm->elog_main, e);
+	  ed->stream_index = p[0];
+	  elog_stream_name (ed->name, sizeof (ed->name), name);
+	}
+
+      /* Duplicate registration: the freshly read name is not needed. */
+      vec_free (name);
+      return;
+    }
+
+  vec_add2 (mcm->stream_vector, s, 1);
+  mc_stream_init (s);
+  s->state = MC_STREAM_STATE_name_known;
+  s->index = s - mcm->stream_vector;
+  s->config.name = name;
+
+  if (MC_EVENT_LOGGING > 0)
+    {
+      /* *INDENT-OFF* */
+      ELOG_TYPE_DECLARE (e) =
+        {
+          .format = "stream index %d named %s",
+          .format_args = "i4s16",
+        };
+      /* *INDENT-ON* */
+      struct
+      {
+	u32 stream_index;
+	char name[16];
+      } *ed;
+      ed = ELOG_DATA (mcm->elog_main, e);
+      ed->stream_index = s->index;
+      elog_stream_name (ed->name, sizeof (ed->name), name);
+    }
+
+  hash_set_mem (mcm->stream_index_by_name, name, s->index);
+
+  /* The waiter table is keyed by string, so look it up with the _mem
+     variant like every other access to it. */
+  p = hash_get_mem (mcm->procs_waiting_for_stream_name_by_name, name);
+  if (p)
+    {
+      vlib_one_time_waiting_process_t *wp, **w;
+      w = pool_elt_at_index (mcm->procs_waiting_for_stream_name_pool, p[0]);
+      vec_foreach (wp, w[0])
+	vlib_signal_one_time_waiting_process (mcm->vlib_main, wp);
+
+      /* Release the waiter vector before recycling its pool slot; the join
+         helper resets a reused slot to 0, which would otherwise leak it. */
+      vec_free (w[0]);
+      pool_put (mcm->procs_waiting_for_stream_name_pool, w);
+      hash_unset_mem (mcm->procs_waiting_for_stream_name_by_name, name);
+    }
+}
+
+/*
+ * Message descriptor tying the "mc_register_stream_name" message to its
+ * serialize / unserialize functions.
+ */
+/* *INDENT-OFF* */
+MC_SERIALIZE_MSG (mc_register_stream_name_msg, static) =
+{
+  .name = "mc_register_stream_name",
+  .serialize = serialize_mc_register_stream_name,
+  .unserialize = unserialize_mc_register_stream_name,
+};
+/* *INDENT-ON* */
+
+/*
+ * rx_buffer callback used for the mc-internal stream: hands the received
+ * buffer to mc_unserialize.  peer_id is accepted to match the callback
+ * signature but is not used.
+ */
+void
+mc_rx_buffer_unserialize (mc_main_t * mcm,
+			  mc_stream_t * stream,
+			  mc_peer_id_t peer_id, u32 buffer_index)
+{
+  /* Plain call: a void function must not return an expression. */
+  mc_unserialize (mcm, stream, buffer_index);
+}
+
+/*
+ * catchup_snapshot callback for the mc-internal stream: appends the
+ * serialized mc_main state to data_vector and returns the resulting
+ * vector.  last_global_sequence_processed is not used here.
+ */
+static u8 *
+mc_internal_catchup_snapshot (mc_main_t * mcm,
+			      u8 * data_vector,
+			      u32 last_global_sequence_processed)
+{
+  serialize_main_t m;
+
+  /* Append serialized data to data vector. */
+  serialize_open_vector (&m, data_vector);
+  /* Start writing after whatever data_vector already holds. */
+  m.stream.current_buffer_index = vec_len (data_vector);
+
+  serialize (&m, serialize_mc_main, mcm);
+  return serialize_close_vector (&m);
+}
+
+/*
+ * catchup callback for the mc-internal stream: unserializes mc_main state
+ * from the n_data_bytes bytes at data into mcm.
+ */
+static void
+mc_internal_catchup (mc_main_t * mcm, u8 * data, u32 n_data_bytes)
+{
+  serialize_main_t sm;
+
+  unserialize_open_data (&sm, data, n_data_bytes);
+  unserialize (&sm, unserialize_mc_main, mcm);
+}
+
+/*
+ * Overridden from the application layer, not actually used here.
+ * Declared weak so another definition can replace this empty default.
+ */
+void mc_stream_join_process_hold (void) __attribute__ ((weak));
+void
+mc_stream_join_process_hold (void)
+{
+}
+
+/*
+ * Join (or create) a stream described by config and wait until the join
+ * has completed.  Returns the stream index.
+ *
+ * For application streams (is_internal == 0):
+ *   - if a stream of that name is already ready, its index is returned
+ *     right away;
+ *   - the mc-internal stream is joined first if that has not happened yet;
+ *   - if the name is still unknown, a register-stream-name message is sent
+ *     on the internal stream and the calling process waits until the
+ *     stream has been named.
+ * For the internal stream (is_internal != 0) a new stream is appended to
+ * mcm->stream_vector.
+ *
+ * The stream then takes a copy of *config (keeping any name it already
+ * had), receives defaults for window_size / retry_interval / retry_limit,
+ * enters the join_in_progress state, sends a join request, and the calling
+ * process waits on procs_waiting_for_join_done.
+ *
+ * The waits suspend the current process, so this is called from a vlib
+ * process context.
+ */
+static u32
+mc_stream_join_helper (mc_main_t * mcm,
+		       mc_stream_config_t * config, u32 is_internal)
+{
+  mc_stream_t *s;
+  vlib_main_t *vm = mcm->vlib_main;
+
+  s = 0;
+  if (!is_internal)
+    {
+      uword *p;
+
+      /* Already have a stream with given name? */
+      if ((s = mc_stream_by_name (mcm, config->name)))
+	{
+	  /* Already joined and ready? */
+	  if (s->state == MC_STREAM_STATE_ready)
+	    return s->index;
+	}
+
+      /* First join MC internal stream. */
+      if (!mcm->stream_vector
+	  || (mcm->stream_vector[MC_STREAM_INDEX_INTERNAL].state
+	      == MC_STREAM_STATE_invalid))
+	{
+	  static mc_stream_config_t c = {
+	    .name = "mc-internal",
+	    .rx_buffer = mc_rx_buffer_unserialize,
+	    .catchup = mc_internal_catchup,
+	    .catchup_snapshot = mc_internal_catchup_snapshot,
+	  };
+
+	  c.save_snapshot = config->save_snapshot;
+
+	  mc_stream_join_helper (mcm, &c, /* is_internal */ 1);
+	}
+
+      /* If stream is still unknown register this name and wait for
+         sequenced message to name stream.  This way all peers agree
+         on stream name to index mappings. */
+      s = mc_stream_by_name (mcm, config->name);
+      if (!s)
+	{
+	  vlib_one_time_waiting_process_t *wp, **w;
+	  /* The copy is used as a key in a string hash, so it needs its
+	     own NUL terminator (as in elog_id_for_msg_name). */
+	  u8 *name_copy = format (0, "%s%c", config->name, 0);
+
+	  mc_serialize_stream (mcm,
+			       MC_STREAM_INDEX_INTERNAL,
+			       &mc_register_stream_name_msg, config->name);
+
+	  /* Wait for this stream to be named. */
+	  p =
+	    hash_get_mem (mcm->procs_waiting_for_stream_name_by_name,
+			  name_copy);
+	  if (p)
+	    w =
+	      pool_elt_at_index (mcm->procs_waiting_for_stream_name_pool,
+				 p[0]);
+	  else
+	    {
+	      /* First waiter for this name: allocate a waiter list slot. */
+	      pool_get (mcm->procs_waiting_for_stream_name_pool, w);
+	      if (!mcm->procs_waiting_for_stream_name_by_name)
+		mcm->procs_waiting_for_stream_name_by_name =
+		  hash_create_string ( /* elts */ 0,
+				      /* value size */ sizeof (uword));
+	      hash_set_mem (mcm->procs_waiting_for_stream_name_by_name,
+			    name_copy,
+			    w - mcm->procs_waiting_for_stream_name_pool);
+	      w[0] = 0;
+	    }
+
+	  vec_add2 (w[0], wp, 1);
+	  vlib_current_process_wait_for_one_time_event (vm, wp);
+	  /* The name handler has already removed the hash entry using
+	     this key by the time we are resumed. */
+	  vec_free (name_copy);
+	}
+
+      /* Name should be known now. */
+      s = mc_stream_by_name (mcm, config->name);
+      ASSERT (s != 0);
+      ASSERT (s->state == MC_STREAM_STATE_name_known);
+    }
+
+  if (!s)
+    {
+      vec_add2 (mcm->stream_vector, s, 1);
+      mc_stream_init (s);
+      s->index = s - mcm->stream_vector;
+    }
+
+  {
+    /* Save name since we could have already used it as hash key. */
+    char *name_save = s->config.name;
+
+    s->config = config[0];
+
+    if (name_save)
+      s->config.name = name_save;
+  }
+
+  /* Fill in defaults for unset configuration values. */
+  if (s->config.window_size == 0)
+    s->config.window_size = 8;
+
+  if (s->config.retry_interval == 0.0)
+    s->config.retry_interval = 1.0;
+
+  /* Sanity. */
+  ASSERT (s->config.retry_interval < 30);
+
+  if (s->config.retry_limit == 0)
+    s->config.retry_limit = 7;
+
+  s->state = MC_STREAM_STATE_join_in_progress;
+  if (!s->peer_index_by_id.hash)
+    mhash_init (&s->peer_index_by_id, sizeof (uword), sizeof (mc_peer_id_t));
+
+  /* If we don't hear from someone in 5 seconds, we're alone */
+  s->join_timeout = vlib_time_now (vm) + 5.0;
+  mcm->joins_in_progress++;
+
+  if (MC_EVENT_LOGGING > 0)
+    {
+      /* *INDENT-OFF* */
+      ELOG_TYPE_DECLARE (e) =
+      {
+        .format = "stream index %d join request %s",
+        .format_args = "i4s16",
+      };
+      /* *INDENT-ON* */
+      struct
+      {
+	u32 stream_index;
+	char name[16];
+      } *ed;
+      ed = ELOG_DATA (mcm->elog_main, e);
+      ed->stream_index = s->index;
+      elog_stream_name (ed->name, sizeof (ed->name), s->config.name);
+    }
+
+  send_join_or_leave_request (mcm, s->index, 1 /* join */ );
+
+  /* Suspend until the join completes or times out. */
+  vlib_current_process_wait_for_one_time_event_vector
+    (vm, &s->procs_waiting_for_join_done);
+
+  if (MC_EVENT_LOGGING)
+    {
+      ELOG_TYPE (e, "join complete stream %d");
+      ELOG (mcm->elog_main, e, s->index);
+    }
+
+  return s->index;
+}
+
+/*
+ * Public entry point for joining an application stream: forwards to
+ * mc_stream_join_helper with is_internal = 0 and returns the resulting
+ * stream index.
+ */
+u32
+mc_stream_join (mc_main_t * mcm, mc_stream_config_t * config)
+{
+  u32 stream_index;
+
+  stream_index = mc_stream_join_helper (mcm, config, /* is_internal */ 0);
+  return stream_index;
+}
+
+/*
+ * Leave the stream with the given index: send a leave request, release
+ * the stream's resources with mc_stream_free and put the stream back into
+ * the name_known state.  Unknown stream indices are ignored.
+ */
+void
+mc_stream_leave (mc_main_t * mcm, u32 stream_index)
+{
+  mc_stream_t *s = mc_stream_by_index (mcm, stream_index);
+
+  if (!s)
+    return;
+
+  if (MC_EVENT_LOGGING)
+    {
+      /* *INDENT-OFF* */
+      ELOG_TYPE_DECLARE (t) =
+        {
+          .format = "leave-stream: %d",.format_args = "i4",
+        };
+      /* *INDENT-ON* */
+      struct
+      {
+	u32 index;
+      } *ed;
+      ed = ELOG_DATA (mcm->elog_main, t);
+      ed->index = stream_index;
+    }
+
+  send_join_or_leave_request (mcm, stream_index, 0 /* is_join */ );
+  mc_stream_free (s);
+  /* The stream slot itself stays in the vector; only its state is reset. */
+  s->state = MC_STREAM_STATE_name_known;
+}
+
+/*
+ * Handle a join-or-leave request received from a peer.
+ *
+ * The request is byte swapped in place.  Requests for unknown streams, or
+ * for streams that are not in the ready state, are ignored.
+ *   - join:  ignored while any of our own joins is in progress; otherwise
+ *            the peer is created/enabled and a join reply carrying our ack
+ *            and catchup peer ids is sent on the MC_TRANSPORT_JOIN channel.
+ *   - leave: the stream's peer_died callback, if any, is invoked.
+ * buffer_index is not used by this function.
+ */
+void
+mc_msg_join_or_leave_request_handler (mc_main_t * mcm,
+				      mc_msg_join_or_leave_request_t * req,
+				      u32 buffer_index)
+{
+  mc_stream_t *s;
+  mc_msg_join_reply_t *rep;
+  u32 bi;
+
+  mc_byte_swap_msg_join_or_leave_request (req);
+
+  s = mc_stream_by_index (mcm, req->stream_index);
+  if (!s || s->state != MC_STREAM_STATE_ready)
+    return;
+
+  /* If the peer is joining, create it */
+  if (req->is_join)
+    {
+      mc_stream_t *this_s;
+
+      /* We're not in a position to catch up a peer until all
+         stream joins are complete. */
+      if (0)
+	{
+	  /* XXX This is hard to test, so this stricter check is disabled. */
+	  vec_foreach (this_s, mcm->stream_vector)
+	  {
+	    if (this_s->state != MC_STREAM_STATE_ready
+		&& this_s->state != MC_STREAM_STATE_name_known)
+	      return;
+	  }
+	}
+      else if (mcm->joins_in_progress > 0)
+	return;
+
+      (void) get_or_create_peer_with_id (mcm, s, req->peer_id,
+					 /* created */ 0);
+
+      rep = mc_get_vlib_buffer (mcm->vlib_main, sizeof (rep[0]), &bi);
+      memset (rep, 0, sizeof (rep[0]));
+      rep->type = MC_MSG_TYPE_join_reply;
+      rep->stream_index = req->stream_index;
+
+      mc_byte_swap_msg_join_reply (rep);
+      /* These two are already in network byte order... */
+      rep->peer_id = mcm->transport.our_ack_peer_id;
+      rep->catchup_peer_id = mcm->transport.our_catchup_peer_id;
+
+      mcm->transport.tx_buffer (mcm->transport.opaque, MC_TRANSPORT_JOIN, bi);
+    }
+  else
+    {
+      if (s->config.peer_died)
+	s->config.peer_died (mcm, s, req->peer_id);
+    }
+}
+
+/*
+ * Handle a join reply.
+ *
+ * The reply is byte swapped in place.  It is acted on only when the named
+ * stream exists and is in the join_in_progress state: the stream moves to
+ * the catchup state, joins_in_progress is decremented, and a catchup is
+ * requested from the peer named in the reply.  buffer_index is not used.
+ */
+void
+mc_msg_join_reply_handler (mc_main_t * mcm,
+			   mc_msg_join_reply_t * mp, u32 buffer_index)
+{
+  mc_stream_t *s;
+
+  mc_byte_swap_msg_join_reply (mp);
+
+  s = mc_stream_by_index (mcm, mp->stream_index);
+
+  if (!s || s->state != MC_STREAM_STATE_join_in_progress)
+    return;
+
+  /* Switch to catchup state; next join reply
+     for this stream will be ignored. */
+  s->state = MC_STREAM_STATE_catchup;
+
+  mcm->joins_in_progress--;
+  mcm->transport.catchup_request_fun (mcm->transport.opaque,
+				      mp->stream_index, mp->catchup_peer_id);
+}
+
+/*
+ * Block the calling process until the named stream can be sent on.
+ *
+ * Polls (suspending .1 between attempts) until a stream of that name
+ * exists.  If the stream is then in the catchup or ready state the call
+ * returns at once; otherwise the process waits on the stream's
+ * procs_waiting_for_join_done list.
+ */
+void
+mc_wait_for_stream_ready (mc_main_t * m, char *stream_name)
+{
+  mc_stream_t *s;
+
+  /* Wait for the stream to show up at all. */
+  while ((s = mc_stream_by_name (m, stream_name)) == 0)
+    vlib_process_suspend (m->vlib_main, .1);
+
+  /* It's OK to send a message in catchup and ready states;
+     in any other state we are waiting for a join to finish. */
+  if (s->state != MC_STREAM_STATE_catchup
+      && s->state != MC_STREAM_STATE_ready)
+    vlib_current_process_wait_for_one_time_event_vector
+      (m->vlib_main, &s->procs_waiting_for_join_done);
+}
+
+/*
+ * Send the user message held in buffer_index on the given stream.
+ *
+ * The calling process is suspended until the stream is ready and the send
+ * window (config.window_size outstanding retries) has room.  A retry
+ * record is appended to the stream's retry list, a user-request header is
+ * prepended to the buffer, and the buffer is transmitted on the
+ * MC_TRANSPORT_USER_REQUEST_TO_RELAY channel.
+ *
+ * Returns the amount of window remaining after this send, or 0 when the
+ * stream index is unknown.
+ *
+ * NOTE(review): on the unknown-stream path the buffer is neither sent nor
+ * freed here -- confirm who owns it in that case.
+ */
+u32
+mc_stream_send (mc_main_t * mcm, u32 stream_index, u32 buffer_index)
+{
+  mc_stream_t *s = mc_stream_by_index (mcm, stream_index);
+  vlib_main_t *vm = mcm->vlib_main;
+  mc_retry_t *r;
+  mc_msg_user_request_t *mp;
+  vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index);
+  u32 ri;
+
+  if (!s)
+    return 0;
+
+  if (s->state != MC_STREAM_STATE_ready)
+    vlib_current_process_wait_for_one_time_event_vector
+      (vm, &s->procs_waiting_for_join_done);
+
+  /* Block while the window is full; re-check after every wakeup. */
+  while (pool_elts (s->retry_pool) >= s->config.window_size)
+    {
+      vlib_current_process_wait_for_one_time_event_vector
+	(vm, &s->procs_waiting_for_open_window);
+    }
+
+  pool_get (s->retry_pool, r);
+  ri = r - s->retry_pool;
+
+  /* Append the new record at the tail of the retry list. */
+  r->prev_index = s->retry_tail_index;
+  r->next_index = ~0;
+  s->retry_tail_index = ri;
+
+  if (r->prev_index == ~0)
+    s->retry_head_index = ri;
+  else
+    {
+      mc_retry_t *p = pool_elt_at_index (s->retry_pool, r->prev_index);
+      p->next_index = ri;
+    }
+
+  /* Make room in front of the payload for the message header. */
+  vlib_buffer_advance (b, -sizeof (mp[0]));
+  mp = vlib_buffer_get_current (b);
+
+  mp->peer_id = mcm->transport.our_ack_peer_id;
+  /* mp->transport.global_sequence set by relay agent. */
+  mp->global_sequence = 0xdeadbeef;
+  mp->stream_index = s->index;
+  mp->local_sequence = s->our_local_sequence++;
+  mp->n_data_bytes =
+    vlib_buffer_index_length_in_chain (vm, buffer_index) - sizeof (mp[0]);
+
+  r->buffer_index = buffer_index;
+  r->local_sequence = mp->local_sequence;
+  r->sent_at = vlib_time_now (vm);
+  r->n_retries = 0;
+
+  /* Retry will be freed when all currently known peers have acked. */
+  /* NOTE(review): if all_peer_bitmap could be empty here, vec_len () - 1
+     would wrap around -- confirm that at least one peer always exists. */
+  vec_validate (r->unacked_by_peer_bitmap, vec_len (s->all_peer_bitmap) - 1);
+  vec_copy (r->unacked_by_peer_bitmap, s->all_peer_bitmap);
+
+  hash_set (s->retry_index_by_local_sequence, r->local_sequence,
+	    r - s->retry_pool);
+
+  elog_tx_msg (mcm, s->index, mp->local_sequence, r->n_retries);
+
+  /* Swap after all host-order uses of the header fields above. */
+  mc_byte_swap_msg_user_request (mp);
+
+  mcm->transport.tx_buffer (mcm->transport.opaque,
+			    MC_TRANSPORT_USER_REQUEST_TO_RELAY, buffer_index);
+
+  s->user_requests_sent++;
+
+  /* return amount of window remaining */
+  return s->config.window_size - pool_elts (s->retry_pool);
+}
+
+/*
+ * Handle a user request (application message) received on a stream.
+ *
+ * The header is byte swapped in place.  Messages for streams we have not
+ * joined (unknown, or not in the ready state) are freed and dropped.
+ * Otherwise the message's local sequence number is compared with the next
+ * one expected from the sending peer:
+ *   == 0  in order: the peer's sequence advances and the payload is
+ *         delivered to the stream's rx_buffer callback;
+ *   <  0  from the past: the buffer is freed;
+ *   >  0  from the future: counted only.
+ * In all three cases an ack carrying the comparison result is sent back
+ * to the sending peer.
+ *
+ * NOTE(review): for messages from the future the buffer is neither freed
+ * nor passed on in this function -- confirm ownership.
+ */
+void
+mc_msg_user_request_handler (mc_main_t * mcm, mc_msg_user_request_t * mp,
+			     u32 buffer_index)
+{
+  vlib_main_t *vm = mcm->vlib_main;
+  mc_stream_t *s;
+  mc_stream_peer_t *peer;
+  i32 seq_cmp_result;
+  static int once = 0;
+
+  mc_byte_swap_msg_user_request (mp);
+
+  s = mc_stream_by_index (mcm, mp->stream_index);
+
+  /* Not signed up for this stream? Turf-o-matic */
+  if (!s || s->state != MC_STREAM_STATE_ready)
+    {
+      vlib_buffer_free_one (vm, buffer_index);
+      return;
+    }
+
+  /* Find peer, including ourselves. */
+  peer = get_or_create_peer_with_id (mcm, s, mp->peer_id,
+				     /* created */ 0);
+
+  /* Compare against the next sequence number expected from this peer. */
+  seq_cmp_result = mc_seq_cmp (mp->local_sequence,
+			       peer->last_sequence_received + 1);
+
+  if (MC_EVENT_LOGGING > 0)
+    {
+      /* *INDENT-OFF* */
+      ELOG_TYPE_DECLARE (e) =
+        {
+          .format = "rx-msg: peer %s stream %d rx seq %d seq_cmp %d",
+          .format_args = "T4i4i4i4",
+        };
+      /* *INDENT-ON* */
+      struct
+      {
+	u32 peer, stream_index, rx_sequence;
+	i32 seq_cmp_result;
+      } *ed;
+      ed = ELOG_DATA (mcm->elog_main, e);
+      ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64);
+      ed->stream_index = mp->stream_index;
+      ed->rx_sequence = mp->local_sequence;
+      ed->seq_cmp_result = seq_cmp_result;
+    }
+
+  /* Disabled debug hook (condition starts with 0): would drop one message
+     on stream 1 to simulate loss. */
+  if (0 && mp->stream_index == 1 && once == 0)
+    {
+      once = 1;
+      ELOG_TYPE (e, "FAKE lost msg on stream 1");
+      ELOG (mcm->elog_main, e, 0);
+      return;
+    }
+
+  /* Advance the peer's sequence only for the in-order message. */
+  peer->last_sequence_received += seq_cmp_result == 0;
+  s->user_requests_received++;
+
+  if (seq_cmp_result > 0)
+    peer->stats.n_msgs_from_future += 1;
+
+  /* Send ack even if msg from future */
+  if (1)
+    {
+      mc_msg_user_ack_t *rp;
+      u32 bi;
+
+      /* NOTE(review): rp is not zeroed; only the four fields below are
+         set -- confirm mc_msg_user_ack_t has no other fields. */
+      rp = mc_get_vlib_buffer (vm, sizeof (rp[0]), &bi);
+      rp->peer_id = mcm->transport.our_ack_peer_id;
+      rp->stream_index = s->index;
+      rp->local_sequence = mp->local_sequence;
+      rp->seq_cmp_result = seq_cmp_result;
+
+      if (MC_EVENT_LOGGING > 0)
+	{
+          /* *INDENT-OFF* */
+	  ELOG_TYPE_DECLARE (e) =
+            {
+              .format = "tx-ack: stream %d local seq %d",
+              .format_args = "i4i4",
+            };
+          /* *INDENT-ON* */
+	  struct
+	  {
+	    u32 stream_index;
+	    u32 local_sequence;
+	  } *ed;
+	  ed = ELOG_DATA (mcm->elog_main, e);
+	  ed->stream_index = rp->stream_index;
+	  ed->local_sequence = rp->local_sequence;
+	}
+
+      mc_byte_swap_msg_user_ack (rp);
+
+      mcm->transport.tx_ack (mcm->transport.opaque, mp->peer_id, bi);
+      /* Msg from past? If so, free the buffer... */
+      if (seq_cmp_result < 0)
+	{
+	  vlib_buffer_free_one (vm, buffer_index);
+	  peer->stats.n_msgs_from_past += 1;
+	}
+    }
+
+  if (seq_cmp_result == 0)
+    {
+      vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index);
+      /* The early return above means s->state is ready at this point, so
+         only the first case can be taken on entry to the switch. */
+      switch (s->state)
+	{
+	case MC_STREAM_STATE_ready:
+	  /* Skip the header so the callback sees only the payload. */
+	  vlib_buffer_advance (b, sizeof (mp[0]));
+	  s->config.rx_buffer (mcm, s, mp->peer_id, buffer_index);
+
+	  /* Stream vector can change address via rx callback for mc-internal
+	     stream. */
+	  s = mc_stream_by_index (mcm, mp->stream_index);
+	  ASSERT (s != 0);
+	  s->last_global_sequence_processed = mp->global_sequence;
+	  break;
+
+	case MC_STREAM_STATE_catchup:
+	  clib_fifo_add1 (s->catchup_fifo, buffer_index);
+	  break;
+
+	default:
+	  clib_warning ("stream in unknown state %U",
+			format_mc_stream_state, s->state);
+	  break;
+	}
+    }
+}
+
+/*
+ * Handle an ack message (mp) for one of our user requests.
+ *
+ * The message is byte-swapped in place.  If the acking peer reports the
+ * message as "from the future" (seq_cmp_result > 0), the earlier message
+ * it is missing is resent from the retired set when it is no longer in
+ * the retry hash.  Otherwise the peer's bit is cleared in the matching
+ * retry entry, and once no peer remains in the entry's unacked bitmap
+ * the entry is retired and the window-open check is run.
+ *
+ * buffer_index is not used by this function.
+ */
+void
+mc_msg_user_ack_handler (mc_main_t * mcm, mc_msg_user_ack_t * mp,
+			 u32 buffer_index)
+{
+  vlib_main_t *vm = mcm->vlib_main;
+  uword *p;
+  mc_stream_t *s;
+  mc_stream_peer_t *peer;
+  mc_retry_t *r;
+  int peer_created = 0;
+
+  mc_byte_swap_msg_user_ack (mp);
+
+  /* May be NULL; checked below, after the event has been logged. */
+  s = mc_stream_by_index (mcm, mp->stream_index);
+
+  if (MC_EVENT_LOGGING > 0)
+    {
+      /* *INDENT-OFF* */
+      ELOG_TYPE_DECLARE (t) =
+        {
+          .format = "rx-ack: local seq %d peer %s seq_cmp_result %d",
+          .format_args = "i4T4i4",
+        };
+      /* *INDENT-ON* */
+
+      struct
+      {
+	u32 local_sequence;
+	u32 peer;
+	i32 seq_cmp_result;
+      } *ed;
+      ed = ELOG_DATA (mcm->elog_main, t);
+      ed->local_sequence = mp->local_sequence;
+      ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64);
+      ed->seq_cmp_result = mp->seq_cmp_result;
+    }
+
+  /* Unknown stream? */
+  if (!s)
+    return;
+
+  /* Find the peer which just ack'ed. */
+  peer = get_or_create_peer_with_id (mcm, s, mp->peer_id,
+				     /* created */ &peer_created);
+
+  /*
+   * Peer reports message from the future. If it's not in the retry
+   * fifo, look for a retired message.
+   */
+  if (mp->seq_cmp_result > 0)
+    {
+      /* local_sequence - seq_cmp_result is the sequence the peer lacks. */
+      p = hash_get (s->retry_index_by_local_sequence, mp->local_sequence -
+		    mp->seq_cmp_result);
+      if (p == 0)
+	mc_resend_retired (mcm, s, mp->local_sequence - mp->seq_cmp_result);
+
+      /* Normal retry should fix it... */
+      return;
+    }
+
+  /*
+   * Pointer to the indicated retry fifo entry.
+   * Worth hashing because we could use a window size of 100 or 1000.
+   */
+  p = hash_get (s->retry_index_by_local_sequence, mp->local_sequence);
+
+  /*
+   * Is this a duplicate ACK, received after we've retired the
+   * fifo entry. This can happen when learning about new
+   * peers.
+   */
+  if (p == 0)
+    {
+      if (MC_EVENT_LOGGING > 0)
+	{
+          /* *INDENT-OFF* */
+	  ELOG_TYPE_DECLARE (t) =
+            {
+              .format = "ack: for seq %d from peer %s no fifo elt",
+              .format_args = "i4T4",
+            };
+          /* *INDENT-ON* */
+
+	  struct
+	  {
+	    u32 seq;
+	    u32 peer;
+	  } *ed;
+	  ed = ELOG_DATA (mcm->elog_main, t);
+	  ed->seq = mp->local_sequence;
+	  ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64);
+	}
+
+      return;
+    }
+
+  r = pool_elt_at_index (s->retry_pool, p[0]);
+
+  /* Make sure that this new peer ACKs our msgs from now on */
+  if (peer_created)
+    {
+      mc_retry_t *later_retry = next_retry (s, r);
+
+      /* Set the new peer's bit in every retry entry that follows r. */
+      while (later_retry)
+	{
+	  later_retry->unacked_by_peer_bitmap =
+	    clib_bitmap_ori (later_retry->unacked_by_peer_bitmap,
+			     peer - s->peers);
+	  later_retry = next_retry (s, later_retry);
+	}
+    }
+
+  ASSERT (mp->local_sequence == r->local_sequence);
+
+  /* If we weren't expecting to hear from this peer */
+  if (!peer_created &&
+      !clib_bitmap_get (r->unacked_by_peer_bitmap, peer - s->peers))
+    {
+      if (MC_EVENT_LOGGING > 0)
+	{
+          /* *INDENT-OFF* */
+	  ELOG_TYPE_DECLARE (t) =
+            {
+              .format = "dup-ack: for seq %d from peer %s",
+              .format_args = "i4T4",
+            };
+          /* *INDENT-ON* */
+	  struct
+	  {
+	    u32 seq;
+	    u32 peer;
+	  } *ed;
+	  ed = ELOG_DATA (mcm->elog_main, t);
+	  ed->seq = r->local_sequence;
+	  ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64);
+	}
+      /* Other peers still owe an ack: nothing more to do.  With an empty
+         bitmap, fall through so that the entry gets retired below. */
+      if (!clib_bitmap_is_zero (r->unacked_by_peer_bitmap))
+	return;
+    }
+
+  if (MC_EVENT_LOGGING > 0)
+    {
+      /* *INDENT-OFF* */
+      ELOG_TYPE_DECLARE (t) =
+        {
+          .format = "ack: for seq %d from peer %s",
+          .format_args = "i4T4",
+        };
+      /* *INDENT-ON* */
+      struct
+      {
+	u32 seq;
+	u32 peer;
+      } *ed;
+      ed = ELOG_DATA (mcm->elog_main, t);
+      ed->seq = mp->local_sequence;
+      ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64);
+    }
+
+  /* This peer has now acked the entry. */
+  r->unacked_by_peer_bitmap =
+    clib_bitmap_andnoti (r->unacked_by_peer_bitmap, peer - s->peers);
+
+  /* Not all clients have ack'ed */
+  if (!clib_bitmap_is_zero (r->unacked_by_peer_bitmap))
+    {
+      return;
+    }
+  if (MC_EVENT_LOGGING > 0)
+    {
+      /* *INDENT-OFF* */
+      ELOG_TYPE_DECLARE (t) =
+        {
+          .format = "ack: retire fifo elt loc seq %d after %d acks",
+          .format_args = "i4i4",
+        };
+      /* *INDENT-ON* */
+      struct
+      {
+	u32 seq;
+	u32 npeers;
+      } *ed;
+      ed = ELOG_DATA (mcm->elog_main, t);
+      ed->seq = r->local_sequence;
+      ed->npeers = pool_elts (s->peers);
+    }
+
+  /* Everyone acked: drop the entry from the hash and from the retry pool. */
+  hash_unset (s->retry_index_by_local_sequence, mp->local_sequence);
+  mc_retry_free (mcm, s, r);
+  remove_retry_from_pool (s, r);
+  maybe_send_window_open_event (vm, s);
+}
+
+#define EVENT_MC_SEND_CATCHUP_DATA 0
+
+/*
+ * Process node body: waits for EVENT_MC_SEND_CATCHUP_DATA events and, for
+ * each event datum (an index into mcm->catchup_process_args), hands the
+ * stored snapshot to the transport's catchup_send_fun and returns the
+ * argument slot to the pool.  Never returns in practice.
+ */
+static uword
+mc_catchup_process (vlib_main_t * vm,
+                    vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+  mc_main_t *mcm = mc_node_get_main (node);
+  uword *pending = 0;
+
+  for (;;)
+    {
+      uword k;
+
+      /* Reuse the event vector from the previous round. */
+      if (pending)
+        _vec_len (pending) = 0;
+
+      vlib_process_wait_for_event_with_type (vm, &pending,
+                                             EVENT_MC_SEND_CATCHUP_DATA);
+
+      for (k = 0; k < vec_len (pending); k++)
+        {
+          mc_catchup_process_arg_t *arg =
+            pool_elt_at_index (mcm->catchup_process_args, pending[k]);
+
+          mcm->transport.catchup_send_fun (mcm->transport.opaque,
+                                           arg->catchup_opaque,
+                                           arg->catchup_snapshot);
+
+          /* Send function will free snapshot data vector. */
+          pool_put (mcm->catchup_process_args, arg);
+        }
+    }
+
+  return 0;                     /* not likely */
+}
+
+/*
+ * Serialize the peer state of a stream: the peer count, then each peer's
+ * id and last_sequence_received, then the stream's all_peer_bitmap.
+ * The single va argument is the mc_stream_t * to serialize.
+ */
+static void
+serialize_mc_stream (serialize_main_t * m, va_list * va)
+{
+  mc_stream_t *stream = va_arg (*va, mc_stream_t *);
+  mc_stream_peer_t *peer;
+
+  serialize_integer (m, pool_elts (stream->peers), sizeof (u32));
+
+  /* *INDENT-OFF* */
+  pool_foreach (peer, stream->peers, ({
+    u8 * dst = serialize_get (m, sizeof (peer->id));
+    clib_memcpy (dst, peer->id.as_u8, sizeof (peer->id));
+    serialize_integer (m, peer->last_sequence_received,
+                       sizeof (peer->last_sequence_received));
+  }));
+  /* *INDENT-ON* */
+
+  serialize_bitmap (m, stream->all_peer_bitmap);
+}
+
+/*
+ * Inverse of serialize_mc_stream: reads the peer count, allocates one pool
+ * entry per serialized peer (id + last_sequence_received), indexes each
+ * peer by id in s->peer_index_by_id, and finally reads all_peer_bitmap.
+ * The single va argument is the mc_stream_t * to fill in.
+ */
+void
+unserialize_mc_stream (serialize_main_t * m, va_list * va)
+{
+  mc_stream_t *stream = va_arg (*va, mc_stream_t *);
+  mc_stream_peer_t *peer;
+  u32 n_peers;
+  u32 k = 0;
+
+  unserialize_integer (m, &n_peers, sizeof (u32));
+  mhash_init (&stream->peer_index_by_id, sizeof (uword),
+              sizeof (mc_peer_id_t));
+
+  while (k < n_peers)
+    {
+      u8 *src;
+
+      pool_get (stream->peers, peer);
+
+      src = unserialize_get (m, sizeof (peer->id));
+      clib_memcpy (peer->id.as_u8, src, sizeof (peer->id));
+
+      unserialize_integer (m, &peer->last_sequence_received,
+                           sizeof (peer->last_sequence_received));
+
+      mhash_set (&stream->peer_index_by_id, &peer->id, peer - stream->peers,
+                 /* old_value */ 0);
+      k++;
+    }
+
+  stream->all_peer_bitmap = unserialize_bitmap (m);
+
+  /* This is really bad. */
+  if (stream->all_peer_bitmap == 0)
+    clib_warning ("BUG: stream %s all_peer_bitmap NULL", stream->config.name);
+}
+
+/*
+ * Handle a catchup request for a stream we have in the ready state.
+ *
+ * Builds a catchup reply in a freshly allocated vector: the reply header,
+ * followed by the serialized stream peer state, followed by whatever the
+ * stream's catchup_snapshot callback appends.  The finished vector is
+ * queued (via a catchup_process_args pool entry) to the catchup process,
+ * which performs the actual send.
+ *
+ * Requests for unknown streams, or streams not in the ready state, are
+ * ignored.
+ */
+void
+mc_msg_catchup_request_handler (mc_main_t * mcm,
+                                mc_msg_catchup_request_t * req,
+                                u32 catchup_opaque)
+{
+  vlib_main_t *vm = mcm->vlib_main;
+  mc_stream_t *s;
+  mc_catchup_process_arg_t *args;
+
+  mc_byte_swap_msg_catchup_request (req);
+
+  s = mc_stream_by_index (mcm, req->stream_index);
+  if (!s || s->state != MC_STREAM_STATE_ready)
+    return;
+
+  if (MC_EVENT_LOGGING > 0)
+    {
+      /* *INDENT-OFF* */
+      ELOG_TYPE_DECLARE (t) =
+        {
+          .format = "catchup-request: from %s stream %d",
+          .format_args = "T4i4",
+        };
+      /* *INDENT-ON* */
+      struct
+      {
+        u32 peer, stream;
+      } *ed;
+      ed = ELOG_DATA (mcm->elog_main, t);
+      ed->peer = elog_id_for_peer_id (mcm, req->peer_id.as_u64);
+      ed->stream = req->stream_index;
+    }
+
+  /*
+   * The application has to snapshot its data structures right
+   * here, right now. If we process any messages after
+   * noting the last global sequence we've processed, the client
+   * won't be able to accurately reconstruct our data structures.
+   *
+   * Once the data structures are e.g. vec_dup()'ed, we
+   * send the resulting messages from a separate process, to
+   * make sure that we don't cause a bunch of message retransmissions
+   */
+  pool_get (mcm->catchup_process_args, args);
+
+  args->stream_index = s - mcm->stream_vector;
+  args->catchup_opaque = catchup_opaque;
+  args->catchup_snapshot = 0;
+
+  /* Construct catchup reply and snapshot state for stream to send as
+     catchup reply payload. */
+  {
+    mc_msg_catchup_reply_t *rep;
+    serialize_main_t m;
+
+    vec_resize (args->catchup_snapshot, sizeof (rep[0]));
+
+    rep = (void *) args->catchup_snapshot;
+
+    rep->peer_id = req->peer_id;
+    rep->stream_index = req->stream_index;
+    rep->last_global_sequence_included = s->last_global_sequence_processed;
+
+    /* Setup for serialize to append to catchup snapshot. */
+    serialize_open_vector (&m, args->catchup_snapshot);
+    m.stream.current_buffer_index = vec_len (m.stream.buffer);
+
+    serialize (&m, serialize_mc_stream, s);
+
+    args->catchup_snapshot = serialize_close_vector (&m);
+
+    /*
+     * Actually copy internal state.
+     *
+     * Appending to the snapshot vector above may have moved it, so "rep"
+     * must not be dereferenced here.  Pass the sequence number straight
+     * from the stream; nothing since the header was filled in has
+     * modified it, so it is the value recorded in the reply header.
+     */
+    args->catchup_snapshot = s->config.catchup_snapshot
+      (mcm, args->catchup_snapshot, s->last_global_sequence_processed);
+
+    /* Re-derive the header pointer from the (possibly moved) vector. */
+    rep = (void *) args->catchup_snapshot;
+    rep->n_data_bytes = vec_len (args->catchup_snapshot) - sizeof (rep[0]);
+
+    mc_byte_swap_msg_catchup_reply (rep);
+  }
+
+  /* now go send it... */
+  vlib_process_signal_event (vm, mcm->catchup_process,
+                             EVENT_MC_SEND_CATCHUP_DATA,
+                             args - mcm->catchup_process_args);
+}
+
+#define EVENT_MC_UNSERIALIZE_BUFFER 0
+#define EVENT_MC_UNSERIALIZE_CATCHUP 1
+
+/*
+ * Defer processing of a catchup reply: the reply pointer is passed, as an
+ * event datum, to the unserialize process with event type
+ * EVENT_MC_UNSERIALIZE_CATCHUP.  catchup_opaque is not used here.
+ */
+void
+mc_msg_catchup_reply_handler (mc_main_t * mcm, mc_msg_catchup_reply_t * mp,
+                              u32 catchup_opaque)
+{
+  vlib_main_t *vm = mcm->vlib_main;
+  uword reply_as_datum = pointer_to_uword (mp);
+
+  vlib_process_signal_event (vm, mcm->unserialize_process,
+                             EVENT_MC_UNSERIALIZE_CATCHUP, reply_as_datum);
+}
+
+/*
+ * Apply a catchup reply (mp) to the stream it names.
+ *
+ * Steps, in order:
+ *   - byte-swap the reply; ignore it if the stream is unknown or is
+ *     already in the ready state;
+ *   - optionally hand the raw snapshot to the save_snapshot callback;
+ *   - unserialize the stream peer state, then pass the remaining bytes
+ *     to the stream's catchup callback;
+ *   - replay buffers queued in catchup_fifo whose global sequence is
+ *     newer than the snapshot, and free the others;
+ *   - mark the stream ready and wake processes waiting for the join.
+ */
+static void
+perform_catchup (mc_main_t * mcm, mc_msg_catchup_reply_t * mp)
+{
+  mc_stream_t *s;
+  i32 seq_cmp_result;
+
+  mc_byte_swap_msg_catchup_reply (mp);
+
+  s = mc_stream_by_index (mcm, mp->stream_index);
+
+  /* Never heard of this stream or already caught up. */
+  if (!s || s->state == MC_STREAM_STATE_ready)
+    return;
+
+  {
+    serialize_main_t m;
+    mc_stream_peer_t *p;
+    u32 n_stream_bytes;
+
+    /* For offline sim replay: save the entire catchup snapshot... */
+    if (s->config.save_snapshot)
+      s->config.save_snapshot (mcm, /* is_catchup */ 1, mp->data,
+                               mp->n_data_bytes);
+
+    unserialize_open_data (&m, mp->data, mp->n_data_bytes);
+    unserialize (&m, unserialize_mc_stream, s);
+
+    /* Make sure we start numbering our messages as expected */
+    /* *INDENT-OFF* */
+    pool_foreach (p, s->peers, ({
+      if (p->id.as_u64 == mcm->transport.our_ack_peer_id.as_u64)
+        s->our_local_sequence = p->last_sequence_received + 1;
+    }));
+    /* *INDENT-ON* */
+
+    n_stream_bytes = m.stream.current_buffer_index;
+
+    /* No need to unserialize close; nothing to free. */
+
+    /* After serialized stream is user's catchup data. */
+    s->config.catchup (mcm, mp->data + n_stream_bytes,
+                       mp->n_data_bytes - n_stream_bytes);
+  }
+
+  /* Vector could have been moved by catchup.
+     This can only happen for mc-internal stream. */
+  s = mc_stream_by_index (mcm, mp->stream_index);
+
+  s->last_global_sequence_processed = mp->last_global_sequence_included;
+
+  while (clib_fifo_elts (s->catchup_fifo))
+    {
+      mc_msg_user_request_t *gp;
+      u32 bi;
+      vlib_buffer_t *b;
+
+      clib_fifo_sub1 (s->catchup_fifo, bi);
+
+      b = vlib_get_buffer (mcm->vlib_main, bi);
+      gp = vlib_buffer_get_current (b);
+
+      /* Make sure we're replaying "new" news */
+      seq_cmp_result = mc_seq_cmp (gp->global_sequence,
+                                   mp->last_global_sequence_included);
+
+      if (seq_cmp_result > 0)
+        {
+          vlib_buffer_advance (b, sizeof (gp[0]));
+          s->config.rx_buffer (mcm, s, gp->peer_id, bi);
+
+          /* The rx callback can move the stream vector (the user request
+             handler re-looks-up its stream for the same reason), so
+             refresh the pointer before touching the stream again. */
+          s = mc_stream_by_index (mcm, mp->stream_index);
+          ASSERT (s != 0);
+          s->last_global_sequence_processed = gp->global_sequence;
+
+          if (MC_EVENT_LOGGING)
+            {
+              /* *INDENT-OFF* */
+              ELOG_TYPE_DECLARE (t) =
+                {
+                  .format = "catchup replay local sequence 0x%x",
+                  .format_args = "i4",
+                };
+              /* *INDENT-ON* */
+              struct
+              {
+                u32 local_sequence;
+              } *ed;
+              ed = ELOG_DATA (mcm->elog_main, t);
+              ed->local_sequence = gp->local_sequence;
+            }
+        }
+      else
+        {
+          if (MC_EVENT_LOGGING)
+            {
+              /* *INDENT-OFF* */
+              ELOG_TYPE_DECLARE (t) =
+                {
+                  .format = "catchup discard local sequence 0x%x",
+                  .format_args = "i4",
+                };
+              /* *INDENT-ON* */
+              struct
+              {
+                u32 local_sequence;
+              } *ed;
+              ed = ELOG_DATA (mcm->elog_main, t);
+              ed->local_sequence = gp->local_sequence;
+            }
+
+          /* Already covered by the snapshot: drop it. */
+          vlib_buffer_free_one (mcm->vlib_main, bi);
+        }
+    }
+
+  s->state = MC_STREAM_STATE_ready;
+
+  /* Now that we are caught up wake up joining process. */
+  {
+    vlib_one_time_waiting_process_t *wp;
+    vec_foreach (wp, s->procs_waiting_for_join_done)
+      vlib_signal_one_time_waiting_process (mcm->vlib_main, wp);
+    if (s->procs_waiting_for_join_done)
+      _vec_len (s->procs_waiting_for_join_done) = 0;
+  }
+}
+
+/*
+ * Mastership negotiation loop, entered while relay_state is NEGOTIATE or
+ * MASTER (see mc_mastership_process).
+ *
+ * At most once per second a master-assert message is transmitted.  The
+ * loop then waits up to one second for an event:
+ *   - timeout (~0): a node that was not already master when the function
+ *     was entered becomes master after a few consecutive timeouts;
+ *   - MC_RELAY_STATE_SLAVE: the node becomes slave and returns;
+ *   - any other event is ignored.
+ * If the node is configured so that it may not be master, it becomes
+ * slave immediately.
+ */
+static void
+this_node_maybe_master (mc_main_t * mcm)
+{
+  vlib_main_t *vm = mcm->vlib_main;
+  mc_msg_master_assert_t *mp;
+  uword event_type;
+  int timeouts = 0;
+  int is_master = mcm->relay_state == MC_RELAY_STATE_MASTER;
+  clib_error_t *error;
+  f64 now, time_last_master_assert = -1;
+  u32 bi;
+
+  while (1)
+    {
+      if (!mcm->we_can_be_relay_master)
+        {
+          mcm->relay_state = MC_RELAY_STATE_SLAVE;
+          if (MC_EVENT_LOGGING)
+            {
+              ELOG_TYPE (e, "become slave (config)");
+              ELOG (mcm->elog_main, e, 0);
+            }
+          return;
+        }
+
+      now = vlib_time_now (vm);
+      if (now >= time_last_master_assert + 1)
+        {
+          time_last_master_assert = now;
+          mp = mc_get_vlib_buffer (mcm->vlib_main, sizeof (mp[0]), &bi);
+
+          mp->peer_id = mcm->transport.our_ack_peer_id;
+          mp->global_sequence = mcm->relay_global_sequence;
+
+          /*
+           * these messages clog the event log, set MC_EVENT_LOGGING higher
+           * if you want them
+           */
+          if (MC_EVENT_LOGGING > 1)
+            {
+              /* *INDENT-OFF* */
+              ELOG_TYPE_DECLARE (e) =
+                {
+                  .format = "tx-massert: peer %s global seq %u",
+                  .format_args = "T4i4",
+                };
+              /* *INDENT-ON* */
+              struct
+              {
+                u32 peer, global_sequence;
+              } *ed;
+              ed = ELOG_DATA (mcm->elog_main, e);
+              ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64);
+              ed->global_sequence = mp->global_sequence;
+            }
+
+          mc_byte_swap_msg_master_assert (mp);
+
+          error =
+            mcm->transport.tx_buffer (mcm->transport.opaque,
+                                      MC_TRANSPORT_MASTERSHIP, bi);
+          if (error)
+            clib_error_report (error);
+        }
+
+      vlib_process_wait_for_event_or_clock (vm, 1.0);
+      event_type = vlib_process_get_events (vm, /* no event data */ 0);
+
+      switch (event_type)
+        {
+        case ~0:
+          if (!is_master && timeouts++ > 2)
+            {
+              mcm->relay_state = MC_RELAY_STATE_MASTER;
+              mcm->relay_master_peer_id =
+                mcm->transport.our_ack_peer_id.as_u64;
+              if (MC_EVENT_LOGGING)
+                {
+                  ELOG_TYPE (e, "become master (was maybe_master)");
+                  ELOG (mcm->elog_main, e, 0);
+                }
+              return;
+            }
+          break;
+
+        case MC_RELAY_STATE_SLAVE:
+          {
+            /* Sample the previous state before overwriting it; testing
+               after the assignment could never log the transition. */
+            int was_slave = mcm->relay_state == MC_RELAY_STATE_SLAVE;
+
+            mcm->relay_state = MC_RELAY_STATE_SLAVE;
+            if (MC_EVENT_LOGGING && !was_slave)
+              {
+                ELOG_TYPE (e, "become slave (was maybe_master)");
+                ELOG (mcm->elog_main, e, 0);
+              }
+            return;
+          }
+
+        default:
+          /* Other events only wake the loop up. */
+          break;
+        }
+    }
+}
+
+/*
+ * Slave loop: waits up to one second at a time for an event.  A
+ * MC_RELAY_STATE_SLAVE event resets the timeout counter; after a few
+ * consecutive timeouts the node goes back to NEGOTIATE, forgets the
+ * relay master peer id and returns.  Other events are ignored.
+ */
+static void
+this_node_slave (mc_main_t * mcm)
+{
+  vlib_main_t *vm = mcm->vlib_main;
+  int n_timeouts = 0;
+
+  if (MC_EVENT_LOGGING)
+    {
+      ELOG_TYPE (e, "become slave");
+      ELOG (mcm->elog_main, e, 0);
+    }
+
+  for (;;)
+    {
+      uword what;
+
+      vlib_process_wait_for_event_or_clock (vm, 1.0);
+      what = vlib_process_get_events (vm, /* no event data */ 0);
+
+      if (what == ~0)
+        {
+          /* Clock expired without any event. */
+          if (n_timeouts++ > 2)
+            {
+              mcm->relay_state = MC_RELAY_STATE_NEGOTIATE;
+              mcm->relay_master_peer_id = ~0ULL;
+              if (MC_EVENT_LOGGING)
+                {
+                  ELOG_TYPE (e, "timeouts; negoitate mastership");
+                  ELOG (mcm->elog_main, e, 0);
+                }
+              return;
+            }
+        }
+      else if (what == MC_RELAY_STATE_SLAVE)
+        {
+          mcm->relay_state = MC_RELAY_STATE_SLAVE;
+          n_timeouts = 0;
+        }
+    }
+}
+
+/*
+ * Process node body for mastership handling: dispatches forever to the
+ * routine matching the current relay state.  NEGOTIATE and MASTER share
+ * this_node_maybe_master; SLAVE uses this_node_slave.  Any other state
+ * value is not handled and the loop simply re-evaluates it.
+ */
+static uword
+mc_mastership_process (vlib_main_t * vm,
+                       vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+  mc_main_t *mcm = mc_node_get_main (node);
+
+  for (;;)
+    {
+      if (mcm->relay_state == MC_RELAY_STATE_NEGOTIATE
+          || mcm->relay_state == MC_RELAY_STATE_MASTER)
+        this_node_maybe_master (mcm);
+      else if (mcm->relay_state == MC_RELAY_STATE_SLAVE)
+        this_node_slave (mcm);
+    }
+
+  return 0;                     /* not likely */
+}
+
+/*
+ * Record whether this node may become relay master.  When the setting
+ * actually changes, the mastership process is signalled with a
+ * MC_RELAY_STATE_NEGOTIATE event; otherwise nothing happens.
+ */
+void
+mc_enable_disable_mastership (mc_main_t * mcm, int we_can_be_master)
+{
+  /* Nothing to do when the setting is unchanged. */
+  if (we_can_be_master == mcm->we_can_be_relay_master)
+    return;
+
+  mcm->we_can_be_relay_master = we_can_be_master;
+  vlib_process_signal_event (mcm->vlib_main, mcm->mastership_process,
+                             MC_RELAY_STATE_NEGOTIATE, 0);
+}
+
+/*
+ * Handle a master-assert message from another node.
+ *
+ * After byte-swapping the message:
+ *   - if the sender's peer id compares lower than ours and its global
+ *     sequence is not behind ours, the mastership process is signalled
+ *     with MC_RELAY_STATE_SLAVE;
+ *   - if the sender's global sequence is ahead of ours, ours is updated;
+ *   - the sender is recorded (created on first sight) in
+ *     mastership_peers with the time of this assert.
+ * buffer_index is not used by this function.
+ */
+void
+mc_msg_master_assert_handler (mc_main_t * mcm, mc_msg_master_assert_t * mp,
+                              u32 buffer_index)
+{
+  mc_peer_id_t sender_id, local_id;
+  mc_mastership_peer_t *entry;
+  uword *slot;
+  i32 cmp;
+  u8 became_slave = 0;
+  u8 sequence_updated = 0;
+
+  mc_byte_swap_msg_master_assert (mp);
+
+  sender_id = mp->peer_id;
+  local_id = mcm->transport.our_ack_peer_id;
+
+  /* compare the incoming global sequence with ours */
+  cmp = mc_seq_cmp (mp->global_sequence, mcm->relay_global_sequence);
+
+  /* If the sender has a lower peer id and the sender's sequence >=
+     our global sequence, we become a slave.  Otherwise we are master. */
+  if (cmp >= 0 && mc_peer_id_compare (sender_id, local_id) < 0)
+    {
+      vlib_process_signal_event (mcm->vlib_main, mcm->mastership_process,
+                                 MC_RELAY_STATE_SLAVE, 0);
+      became_slave = 1;
+    }
+
+  /* Update our global sequence. */
+  if (cmp > 0)
+    {
+      mcm->relay_global_sequence = mp->global_sequence;
+      sequence_updated = 1;
+    }
+
+  /* Find or create the bookkeeping entry for the sender. */
+  slot = mhash_get (&mcm->mastership_peer_index_by_id, &sender_id);
+  if (slot)
+    {
+      entry = vec_elt_at_index (mcm->mastership_peers, slot[0]);
+    }
+  else
+    {
+      vec_add2 (mcm->mastership_peers, entry, 1);
+      entry->peer_id = sender_id;
+      mhash_set (&mcm->mastership_peer_index_by_id, &entry->peer_id,
+                 entry - mcm->mastership_peers,
+                 /* old_value */ 0);
+    }
+  entry->time_last_master_assert_received = vlib_time_now (mcm->vlib_main);
+
+  /*
+   * these messages clog the event log, set MC_EVENT_LOGGING higher
+   * if you want them.
+   */
+  if (MC_EVENT_LOGGING > 1)
+    {
+      /* *INDENT-OFF* */
+      ELOG_TYPE_DECLARE (e) =
+        {
+          .format = "rx-massert: peer %s global seq %u upd %d slave %d",
+          .format_args = "T4i4i1i1",
+        };
+      /* *INDENT-ON* */
+
+      struct
+      {
+        u32 peer;
+        u32 global_sequence;
+        u8 update_sequence;
+        u8 slave;
+      } *ed;
+      ed = ELOG_DATA (mcm->elog_main, e);
+      ed->peer = elog_id_for_peer_id (mcm, sender_id.as_u64);
+      ed->global_sequence = mp->global_sequence;
+      ed->update_sequence = sequence_updated;
+      ed->slave = became_slave;
+    }
+}
+
+/*
+ * Build the global message tables: walks the registration list hanging
+ * off the vlib main, gives each message the next free global index,
+ * records name -> index in global_msg_index_by_name and appends the
+ * message to global_msgs.
+ */
+static void
+mc_serialize_init (mc_main_t * mcm)
+{
+  vlib_main_t *vm = vlib_get_main ();
+  mc_serialize_msg_t *reg;
+
+  mcm->global_msg_index_by_name
+    = hash_create_string ( /* elts */ 0, sizeof (uword));
+
+  for (reg = vm->mc_msg_registrations; reg; reg = reg->next_registration)
+    {
+      reg->global_index = vec_len (mcm->global_msgs);
+      hash_set_mem (mcm->global_msg_index_by_name, reg->name,
+                    reg->global_index);
+      vec_add1 (mcm->global_msgs, reg);
+    }
+}
+
+/*
+ * Serialize one message (msg, with its arguments in va) onto the TX
+ * serialize buffer of stream stream_index.
+ *
+ * The stream-local message index is written first; when the stream has
+ * no index for the message yet (or MSG_ID_DEBUG is set) the message name
+ * follows so that the receiver can bind it.  The buffer is closed and
+ * sent right away unless multiple_messages_per_vlib_buffer is set, the
+ * message already has a stream index, and another message of the largest
+ * size seen so far would still fit in a transport packet.
+ *
+ * Returns the error from va_serialize, or 0.  On error a closed buffer is
+ * freed rather than sent.
+ *
+ * NOTE(review): the stream returned by mc_stream_by_index is used without
+ * a NULL check; callers presumably always pass a valid index -- confirm.
+ */
+clib_error_t *
+mc_serialize_va (mc_main_t * mc,
+                 u32 stream_index,
+                 u32 multiple_messages_per_vlib_buffer,
+                 mc_serialize_msg_t * msg, va_list * va)
+{
+  serialize_main_t *m = &mc->serialize_mains[VLIB_TX];
+  vlib_serialize_buffer_main_t *sbm = &mc->serialize_buffer_mains[VLIB_TX];
+  mc_stream_t *stream;
+  clib_error_t *err;
+  u32 bytes_before, bytes_after, bytes_total, bytes_this_msg;
+  u32 stream_msg_index, global_msg_index;
+  u32 bi;
+  int flush_now;
+
+  /* First use of this buffer main: set up TX defaults. */
+  if (!sbm->vlib_main)
+    {
+      sbm->tx.max_n_data_bytes_per_chain = 4096;
+      sbm->tx.free_list_index = VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX;
+    }
+
+  if (sbm->first_buffer == 0)
+    serialize_open_vlib_buffer (m, mc->vlib_main, sbm);
+
+  bytes_before = serialize_vlib_buffer_n_bytes (m);
+
+  stream = mc_stream_by_index (mc, stream_index);
+  global_msg_index = msg->global_index;
+  ASSERT (msg == vec_elt (mc->global_msgs, global_msg_index));
+
+  /* ~0 means "no stream-local index bound yet". */
+  stream_msg_index = ~0;
+  if (global_msg_index < vec_len (stream->stream_msg_index_by_global_index))
+    stream_msg_index =
+      stream->stream_msg_index_by_global_index[global_msg_index];
+
+  serialize_likely_small_unsigned_integer (m, stream_msg_index);
+
+  /* For first time message is sent, use name to identify message. */
+  if (stream_msg_index == ~0 || MSG_ID_DEBUG)
+    serialize_cstring (m, msg->name);
+
+  if (MSG_ID_DEBUG && MC_EVENT_LOGGING > 0)
+    {
+      /* *INDENT-OFF* */
+      ELOG_TYPE_DECLARE (e) =
+        {
+          .format = "serialize-msg: %s index %d",
+          .format_args = "T4i4",
+        };
+      /* *INDENT-ON* */
+      struct
+      {
+        u32 c[2];
+      } *ed;
+      ed = ELOG_DATA (mc->elog_main, e);
+      ed->c[0] = elog_id_for_msg_name (mc, msg->name);
+      ed->c[1] = stream_msg_index;
+    }
+
+  err = va_serialize (m, va);
+
+  bytes_after = serialize_vlib_buffer_n_bytes (m);
+  bytes_this_msg = bytes_after - bytes_before;
+  bytes_total = bytes_after + sizeof (mc_msg_user_request_t);
+
+  /* For max message size ignore first message where string name is sent. */
+  if (stream_msg_index != ~0)
+    msg->max_n_bytes_serialized =
+      clib_max (msg->max_n_bytes_serialized, bytes_this_msg);
+
+  flush_now = (!multiple_messages_per_vlib_buffer
+               || stream_msg_index == ~0
+               || bytes_total + msg->max_n_bytes_serialized >
+               mc->transport.max_packet_size);
+
+  if (flush_now)
+    {
+      bi = serialize_close_vlib_buffer (m);
+      sbm->first_buffer = 0;
+      if (!err)
+        mc_stream_send (mc, stream_index, bi);
+      else if (bi != ~0)
+        vlib_buffer_free_one (mc->vlib_main, bi);
+    }
+
+  return err;
+}
+
+/*
+ * Variadic front end for mc_serialize_va.
+ *
+ * A stream_index of ~0 selects the stream index stored in the vlib main;
+ * if an mc main is attached but that index is still ~0, the calling
+ * process first waits on procs_waiting_for_mc_stream_join.
+ * Returns whatever mc_serialize_va returns.
+ */
+clib_error_t *
+mc_serialize_internal (mc_main_t * mc,
+                       u32 stream_index,
+                       u32 multiple_messages_per_vlib_buffer,
+                       mc_serialize_msg_t * msg, ...)
+{
+  vlib_main_t *vm = mc->vlib_main;
+  clib_error_t *result;
+  va_list ap;
+
+  if (stream_index == ~0)
+    {
+      int must_wait = vm->mc_main && vm->mc_stream_index == ~0;
+
+      if (must_wait)
+        vlib_current_process_wait_for_one_time_event_vector
+          (vm, &vm->procs_waiting_for_mc_stream_join);
+
+      stream_index = vm->mc_stream_index;
+    }
+
+  va_start (ap, msg);
+  result = mc_serialize_va (mc, stream_index,
+                            multiple_messages_per_vlib_buffer, msg, &ap);
+  va_end (ap);
+
+  return result;
+}
+
+/*
+ * Unserialize one message for stream s from serialize stream m.
+ *
+ * The message starts with its stream-local index (si).  When si is ~0
+ * (or MSG_ID_DEBUG is set) the message name follows; the name is looked
+ * up in the global name hash and, if the stream has no local index for
+ * that message yet, one is created.  For a known message the registered
+ * unserialize function is then invoked.
+ *
+ * Returns non-zero when the message maps to a known global message,
+ * zero otherwise (in which case no message handler is called).
+ *
+ * NOTE(review): si comes from the serialized data and is only validated
+ * by whatever checking vec_elt_at_index performs -- confirm.
+ */
+uword
+mc_unserialize_message (mc_main_t * mcm,
+			mc_stream_t * s, serialize_main_t * m)
+{
+  mc_serialize_stream_msg_t *sm;
+  u32 gi, si;
+
+  si = unserialize_likely_small_unsigned_integer (m);
+
+  /* Fast path: sender used an already-bound stream-local index. */
+  if (!(si == ~0 || MSG_ID_DEBUG))
+    {
+      sm = vec_elt_at_index (s->stream_msgs, si);
+      gi = sm->global_index;
+    }
+  else
+    {
+      char *name;
+
+      /* name is a freshly allocated vector; freed on every path below. */
+      unserialize_cstring (m, &name);
+
+      if (MSG_ID_DEBUG && MC_EVENT_LOGGING > 0)
+	{
+          /* *INDENT-OFF* */
+	  ELOG_TYPE_DECLARE (e) =
+            {
+              .format = "unserialize-msg: %s rx index %d",
+              .format_args = "T4i4",
+            };
+          /* *INDENT-ON* */
+	  struct
+	  {
+	    u32 c[2];
+	  } *ed;
+	  ed = ELOG_DATA (mcm->elog_main, e);
+	  ed->c[0] = elog_id_for_msg_name (mcm, name);
+	  ed->c[1] = si;
+	}
+
+      {
+	uword *p = hash_get_mem (mcm->global_msg_index_by_name, name);
+	gi = p ? p[0] : ~0;
+      }
+
+      /* Unknown message? */
+      if (gi == ~0)
+	{
+	  vec_free (name);
+	  goto done;
+	}
+
+      /* Grow the global -> stream-local map, filling new slots with ~0. */
+      vec_validate_init_empty (s->stream_msg_index_by_global_index, gi, ~0);
+      si = s->stream_msg_index_by_global_index[gi];
+
+      /* Stream local index unknown?  Create it. */
+      if (si == ~0)
+	{
+	  vec_add2 (s->stream_msgs, sm, 1);
+
+	  si = sm - s->stream_msgs;
+	  sm->global_index = gi;
+	  s->stream_msg_index_by_global_index[gi] = si;
+
+	  if (MC_EVENT_LOGGING > 0)
+	    {
+              /* *INDENT-OFF* */
+	      ELOG_TYPE_DECLARE (e) =
+                {
+                  .format = "msg-bind: stream %d %s to index %d",
+                  .format_args = "i4T4i4",
+                };
+              /* *INDENT-ON* */
+	      struct
+	      {
+		u32 c[3];
+	      } *ed;
+	      ed = ELOG_DATA (mcm->elog_main, e);
+	      ed->c[0] = s->index;
+	      ed->c[1] = elog_id_for_msg_name (mcm, name);
+	      ed->c[2] = si;
+	    }
+	}
+      else
+	{
+	  /* Already bound: only log if the binding looks inconsistent. */
+	  sm = vec_elt_at_index (s->stream_msgs, si);
+	  if (gi != sm->global_index && MC_EVENT_LOGGING > 0)
+	    {
+              /* *INDENT-OFF* */
+	      ELOG_TYPE_DECLARE (e) =
+                {
+                  .format = "msg-id-ERROR: %s index %d expected %d",
+                  .format_args = "T4i4i4",
+                };
+              /* *INDENT-ON* */
+	      struct
+	      {
+		u32 c[3];
+	      } *ed;
+	      ed = ELOG_DATA (mcm->elog_main, e);
+	      ed->c[0] = elog_id_for_msg_name (mcm, name);
+	      ed->c[1] = si;
+	      ed->c[2] = ~0;
+	      if (sm->global_index <
+		  vec_len (s->stream_msg_index_by_global_index))
+		ed->c[2] =
+		  s->stream_msg_index_by_global_index[sm->global_index];
+	    }
+	}
+
+      vec_free (name);
+    }
+
+  /* Dispatch to the message's registered unserialize function. */
+  if (gi != ~0)
+    {
+      mc_serialize_msg_t *msg;
+      msg = vec_elt (mcm->global_msgs, gi);
+      unserialize (m, msg->unserialize, mcm);
+    }
+
+done:
+  return gi != ~0;
+}
+
+/*
+ * Unserialize every message contained in a received buffer.
+ *
+ * stream_and_buffer_index names a pool entry holding the stream index
+ * and the buffer index; the entry is released as soon as both have been
+ * read.  If the stream has a save_snapshot callback, the buffer contents
+ * are first copied into a function-static scratch vector and passed to
+ * it.  Must be called from process context (asserted).
+ *
+ * NOTE(review): for a zero-length chain, "n_bytes - 1" wraps around
+ * before vec_validate -- presumably buffers are never empty; confirm.
+ */
+void
+mc_unserialize_internal (mc_main_t * mcm, u32 stream_and_buffer_index)
+{
+  vlib_main_t *vm = mcm->vlib_main;
+  serialize_main_t *m = &mcm->serialize_mains[VLIB_RX];
+  vlib_serialize_buffer_main_t *sbm = &mcm->serialize_buffer_mains[VLIB_RX];
+  mc_stream_and_buffer_t *entry;
+  mc_stream_t *stream;
+  u32 bi;
+
+  entry = pool_elt_at_index (mcm->mc_unserialize_stream_and_buffers,
+                             stream_and_buffer_index);
+  bi = entry->buffer_index;
+  stream = vec_elt_at_index (mcm->stream_vector, entry->stream_index);
+  pool_put (mcm->mc_unserialize_stream_and_buffers, entry);
+
+  if (stream->config.save_snapshot)
+    {
+      static u8 *contents;
+      u32 n_bytes = vlib_buffer_index_length_in_chain (vm, bi);
+
+      vec_reset_length (contents);
+      vec_validate (contents, n_bytes - 1);
+      vlib_buffer_contents (vm, bi, contents);
+      stream->config.save_snapshot (mcm, /* is_catchup */ 0, contents,
+                                    n_bytes);
+    }
+
+  ASSERT (vlib_in_process_context (vm));
+
+  unserialize_open_vlib_buffer (m, vm, sbm);
+
+  clib_fifo_add1 (sbm->rx.buffer_fifo, bi);
+
+  /* Keep decoding messages until the buffer is drained. */
+  while (unserialize_vlib_buffer_n_bytes (m) > 0)
+    mc_unserialize_message (mcm, stream, m);
+
+  /* Frees buffer. */
+  unserialize_close_vlib_buffer (m);
+}
+
+/*
+ * Queue a received buffer for unserialization: stores (stream index,
+ * buffer index) in a pool entry and signals the unserialize process
+ * with that entry's index as EVENT_MC_UNSERIALIZE_BUFFER data.
+ */
+void
+mc_unserialize (mc_main_t * mcm, mc_stream_t * s, u32 buffer_index)
+{
+  mc_stream_and_buffer_t *entry;
+
+  pool_get (mcm->mc_unserialize_stream_and_buffers, entry);
+  entry->stream_index = s->index;
+  entry->buffer_index = buffer_index;
+
+  vlib_process_signal_event (mcm->vlib_main, mcm->unserialize_process,
+                             EVENT_MC_UNSERIALIZE_BUFFER,
+                             entry - mcm->mc_unserialize_stream_and_buffers);
+}
+
+/*
+ * Process node body for unserialization.  Waits for events and handles:
+ *   - EVENT_MC_UNSERIALIZE_BUFFER: each datum is passed to
+ *     mc_unserialize_internal;
+ *   - EVENT_MC_UNSERIALIZE_CATCHUP: each datum is a pointer to a catchup
+ *     reply vector, which is applied with perform_catchup and then freed.
+ * Other event types are ignored.  Never returns in practice.
+ */
+static uword
+mc_unserialize_process (vlib_main_t * vm,
+                        vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+  mc_main_t *mcm = mc_node_get_main (node);
+  uword *pending = 0;
+
+  for (;;)
+    {
+      uword kind;
+      uword k;
+
+      /* Reuse the event vector from the previous round. */
+      if (pending)
+        _vec_len (pending) = 0;
+
+      vlib_process_wait_for_event (vm);
+      kind = vlib_process_get_events (vm, &pending);
+
+      if (kind == EVENT_MC_UNSERIALIZE_BUFFER)
+        {
+          for (k = 0; k < vec_len (pending); k++)
+            mc_unserialize_internal (mcm, pending[k]);
+        }
+      else if (kind == EVENT_MC_UNSERIALIZE_CATCHUP)
+        {
+          for (k = 0; k < vec_len (pending); k++)
+            {
+              u8 *reply = uword_to_pointer (pending[k], u8 *);
+
+              perform_catchup (mcm, (void *) reply);
+              vec_free (reply);
+            }
+        }
+    }
+
+  return 0;                     /* not likely */
+}
+
+/*
+ * Serialize the stream/message naming state of an mc main: the number of
+ * streams, then for each stream its name, the number of stream-local
+ * messages and the global name of each of them, in stream-local index
+ * order.  The single va argument is the mc_main_t *.
+ */
+void
+serialize_mc_main (serialize_main_t * m, va_list * va)
+{
+  mc_main_t *mcm = va_arg (*va, mc_main_t *);
+  mc_stream_t *stream;
+  mc_serialize_stream_msg_t *bound;
+
+  serialize_integer (m, vec_len (mcm->stream_vector), sizeof (u32));
+
+  vec_foreach (stream, mcm->stream_vector)
+  {
+    /* Stream name. */
+    serialize_cstring (m, stream->config.name);
+
+    /* Serialize global names for all sent messages. */
+    serialize_integer (m, vec_len (stream->stream_msgs), sizeof (u32));
+    vec_foreach (bound, stream->stream_msgs)
+    {
+      mc_serialize_msg_t *global =
+        vec_elt (mcm->global_msgs, bound->global_index);
+
+      serialize_cstring (m, global->name);
+    }
+  }
+}
+
+/*
+ * Inverse of serialize_mc_main.  The single va argument is the
+ * mc_main_t * to update.
+ *
+ * For every serialized stream: a stream slot is created at the same
+ * index (state name_known) unless the index is the internal stream or a
+ * stream of that name already exists; the stream's message bindings are
+ * then discarded and rebuilt from the serialized message names, so that
+ * stream-local indices match the sender's.  Names with no registered
+ * global message get global index ~0.
+ */
+void
+unserialize_mc_main (serialize_main_t * m, va_list * va)
+{
+  mc_main_t *mcm = va_arg (*va, mc_main_t *);
+  u32 i, n_streams, n_stream_msgs;
+  char *name;
+  mc_stream_t *s;
+  mc_serialize_stream_msg_t *sm;
+
+  unserialize_integer (m, &n_streams, sizeof (u32));
+  for (i = 0; i < n_streams; i++)
+    {
+      unserialize_cstring (m, &name);
+      if (i != MC_STREAM_INDEX_INTERNAL && !mc_stream_by_name (mcm, name))
+        {
+          vec_validate (mcm->stream_vector, i);
+          s = vec_elt_at_index (mcm->stream_vector, i);
+          mc_stream_init (s);
+          s->index = s - mcm->stream_vector;
+          /* The stream takes ownership of the name vector. */
+          s->config.name = name;
+          s->state = MC_STREAM_STATE_name_known;
+          hash_set_mem (mcm->stream_index_by_name, s->config.name, s->index);
+        }
+      else
+        vec_free (name);
+
+      s = vec_elt_at_index (mcm->stream_vector, i);
+
+      /* Drop any previous bindings; they are rebuilt below. */
+      vec_free (s->stream_msgs);
+      vec_free (s->stream_msg_index_by_global_index);
+
+      unserialize_integer (m, &n_stream_msgs, sizeof (u32));
+      vec_resize (s->stream_msgs, n_stream_msgs);
+      vec_foreach (sm, s->stream_msgs)
+      {
+        uword *p;
+        u32 si, gi;
+
+        unserialize_cstring (m, &name);
+        /* The table is string-keyed (hash_create_string); use the _mem
+           accessor like the other users of this hash do. */
+        p = hash_get_mem (mcm->global_msg_index_by_name, name);
+        gi = p ? p[0] : ~0;
+        si = sm - s->stream_msgs;
+
+        if (MC_EVENT_LOGGING > 0)
+          {
+            /* *INDENT-OFF* */
+            ELOG_TYPE_DECLARE (e) =
+              {
+                .format = "catchup-bind: %s to %d global index %d stream %d",
+                .format_args = "T4i4i4i4",
+              };
+            /* *INDENT-ON* */
+
+            struct
+            {
+              u32 c[4];
+            } *ed;
+            ed = ELOG_DATA (mcm->elog_main, e);
+            ed->c[0] = elog_id_for_msg_name (mcm, name);
+            ed->c[1] = si;
+            ed->c[2] = gi;
+            ed->c[3] = s->index;
+          }
+
+        vec_free (name);
+
+        sm->global_index = gi;
+        if (gi != ~0)
+          {
+            vec_validate_init_empty (s->stream_msg_index_by_global_index,
+                                     gi, ~0);
+            s->stream_msg_index_by_global_index[gi] = si;
+          }
+      }
+    }
+}
+
+/*
+ * Initialize an mc main and register its process nodes.
+ *
+ * Sets the initial relay state (NEGOTIATE, no known master), creates the
+ * stream-name hash, registers five process nodes whose names are
+ * suffixed with tag (mastership, join-ager, retry, catchup, unserialize)
+ * and whose runtime data is the mcm pointer, initializes the peer-id
+ * hashes and finally builds the global message tables.
+ */
+void
+mc_main_init (mc_main_t * mcm, char *tag)
+{
+  vlib_main_t *vm = vlib_get_main ();
+
+  mcm->vlib_main = vm;
+  mcm->elog_main = &vm->elog_main;
+
+  mcm->relay_master_peer_id = ~0ULL;
+  mcm->relay_state = MC_RELAY_STATE_NEGOTIATE;
+
+  mcm->stream_index_by_name
+    = hash_create_string ( /* elts */ 0, /* value size */ sizeof (uword));
+
+  {
+    vlib_node_registration_t r;
+
+    memset (&r, 0, sizeof (r));
+
+    r.type = VLIB_NODE_TYPE_PROCESS;
+
+    /* Point runtime data to main instance: the bytes copied are the
+       pointer value itself, hence the size of the pointer object. */
+    r.runtime_data = &mcm;
+    r.runtime_data_bytes = sizeof (mcm);
+
+    /* The same registration is reused; only name and function change. */
+    r.name = (char *) format (0, "mc-mastership-%s", tag);
+    r.function = mc_mastership_process;
+    mcm->mastership_process = vlib_register_node (vm, &r);
+
+    r.name = (char *) format (0, "mc-join-ager-%s", tag);
+    r.function = mc_join_ager_process;
+    mcm->join_ager_process = vlib_register_node (vm, &r);
+
+    r.name = (char *) format (0, "mc-retry-%s", tag);
+    r.function = mc_retry_process;
+    mcm->retry_process = vlib_register_node (vm, &r);
+
+    r.name = (char *) format (0, "mc-catchup-%s", tag);
+    r.function = mc_catchup_process;
+    mcm->catchup_process = vlib_register_node (vm, &r);
+
+    r.name = (char *) format (0, "mc-unserialize-%s", tag);
+    r.function = mc_unserialize_process;
+    mcm->unserialize_process = vlib_register_node (vm, &r);
+  }
+
+  if (MC_EVENT_LOGGING > 0)
+    mhash_init (&mcm->elog_id_by_peer_id, sizeof (uword),
+                sizeof (mc_peer_id_t));
+
+  mhash_init (&mcm->mastership_peer_index_by_id, sizeof (uword),
+              sizeof (mc_peer_id_t));
+  mc_serialize_init (mcm);
+}
+
+static u8 *
+format_mc_relay_state (u8 * s, va_list * args)
+{
+  /* format() helper: append the symbolic name of an mc_relay_state_t. */
+  mc_relay_state_t relay_state = va_arg (*args, mc_relay_state_t);
+
+  switch (relay_state)
+    {
+    case MC_RELAY_STATE_NEGOTIATE:
+      return format (s, "%s", "negotiate");
+
+    case MC_RELAY_STATE_MASTER:
+      return format (s, "%s", "master");
+
+    case MC_RELAY_STATE_SLAVE:
+      return format (s, "%s", "slave");
+
+    default:
+      /* Not a known enumerator: show the raw value instead. */
+      return format (s, "unknown 0x%x", relay_state);
+    }
+}
+
+static u8 *
+format_mc_stream_state (u8 * s, va_list * args)
+{
+  mc_stream_state_t state = va_arg (*args, mc_stream_state_t);
+  char *t = 0;			/* state name; assigned by one of the generated cases */
+  switch (state)
+    {
+#define _(f) case MC_STREAM_STATE_##f: t = #f; break;
+      foreach_mc_stream_state	/* expands to one case per listed stream state */
+#undef _
+    default:
+      return format (s, "unknown 0x%x", state);	/* value outside the list */
+    }
+  /* t is the stringified list entry (e.g. "ready") for the matched state. */
+  return format (s, "%s", t);
+}
+
+static int
+mc_peer_comp (void *a1, void *a2)
+{
+  /* Sort comparator: orders two mc_stream_peer_t entries by peer id. */
+  const mc_stream_peer_t *lhs = a1, *rhs = a2;
+
+  return mc_peer_id_compare (lhs->id, rhs->id);
+}
+
+u8 *
+format_mc_main (u8 * s, va_list * args)
+{
+  mc_main_t *mcm = va_arg (*args, mc_main_t *);
+  mc_stream_t *t;
+  mc_stream_peer_t *p, *ps;
+  uword indent = format_get_indent (s);
+  /* Output: one summary line, the mastership peers, then one section per stream. */
+  s = format (s, "MC state %U, %d streams joined, global sequence 0x%x",
+	      format_mc_relay_state, mcm->relay_state,
+	      vec_len (mcm->stream_vector), mcm->relay_global_sequence);
+
+  {
+    mc_mastership_peer_t *mp;
+    f64 now = vlib_time_now (mcm->vlib_main);
+    s = format (s, "\n%UMost recent mastership peers:",
+		format_white_space, indent + 2);
+    vec_foreach (mp, mcm->mastership_peers)
+    {				/* peer id is passed as u64, the same form used for the per-stream peer rows */
+      s = format (s, "\n%U%-30U%.4e",
+		  format_white_space, indent + 4,
+		  mcm->transport.format_peer_id, mp->peer_id.as_u64,
+		  now - mp->time_last_master_assert_received);
+    }
+  }
+
+  vec_foreach (t, mcm->stream_vector)
+  {
+    s = format (s, "\n%Ustream `%s' index %d",
+		format_white_space, indent + 2, t->config.name, t->index);
+
+    s = format (s, "\n%Ustate %U",
+		format_white_space, indent + 4,
+		format_mc_stream_state, t->state);
+
+    s =
+      format (s,
+	      "\n%Uretries: interval %.0f sec, limit %d, pool elts %d, %Ld sent",
+	      format_white_space, indent + 4, t->config.retry_interval,
+	      t->config.retry_limit, pool_elts (t->retry_pool),
+	      t->stats.n_retries - t->stats_last_clear.n_retries);
+
+    s = format (s, "\n%U%Ld/%Ld user requests sent/received",
+		format_white_space, indent + 4,
+		t->user_requests_sent, t->user_requests_received);
+
+    s = format (s, "\n%U%d peers, local/global sequence 0x%x/0x%x",
+		format_white_space, indent + 4,
+		pool_elts (t->peers),
+		t->our_local_sequence, t->last_global_sequence_processed);
+    /* Copy the peers flagged in all_peer_bitmap so they can be sorted by id. */
+    ps = 0;
+    /* *INDENT-OFF* */
+    pool_foreach (p, t->peers,
+    ({
+      if (clib_bitmap_get (t->all_peer_bitmap, p - t->peers))
+        vec_add1 (ps, p[0]);
+    }));
+    /* *INDENT-ON* */
+    vec_sort_with_function (ps, mc_peer_comp);
+    s = format (s, "\n%U%=30s%10s%16s%16s",
+		format_white_space, indent + 6,
+		"Peer", "Last seq", "Retries", "Future");
+
+    vec_foreach (p, ps)
+    {				/* counters are shown relative to the last mc_clear_stream_stats baseline */
+      s = format (s, "\n%U%-30U0x%08x%16Ld%16Ld%s",
+		  format_white_space, indent + 6,
+		  mcm->transport.format_peer_id, p->id.as_u64,
+		  p->last_sequence_received,
+		  p->stats.n_msgs_from_past -
+		  p->stats_last_clear.n_msgs_from_past,
+		  p->stats.n_msgs_from_future -
+		  p->stats_last_clear.n_msgs_from_future,
+		  (mcm->transport.our_ack_peer_id.as_u64 ==
+		   p->id.as_u64 ? " (self)" : ""));
+    }
+    vec_free (ps);		/* temporary sorted copy only */
+  }
+
+  return s;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/mc.h b/src/vlib/mc.h
new file mode 100644
index 00000000..dc95b0e9
--- /dev/null
+++ b/src/vlib/mc.h
@@ -0,0 +1,687 @@
+/*
+ * mc.h: vlib reliable sequenced multicast distributed applications
+ *
+ * Copyright (c) 2010 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_vlib_mc_h
+#define included_vlib_mc_h
+
+#include <vppinfra/elog.h>
+#include <vppinfra/fifo.h>
+#include <vppinfra/mhash.h>
+#include <vlib/node.h>
+
+#ifndef MC_EVENT_LOGGING
+#define MC_EVENT_LOGGING 1
+#endif
+
+always_inline uword
+mc_need_byte_swap (void)
+{
+  return CLIB_ARCH_IS_LITTLE_ENDIAN;	/* the mc_byte_swap_msg_* helpers swap only when this is non-zero */
+}
+
+/*
+ * Used to uniquely identify hosts.
+ * For IP4 this would be ip4_address plus tcp/udp port.
+ */
+typedef union
+{
+  u8 as_u8[8];			/* raw bytes; mc_peer_id_compare orders ids by these */
+  u64 as_u64;			/* same 8 bytes viewed as one integer */
+} mc_peer_id_t;
+
+always_inline mc_peer_id_t
+mc_byte_swap_peer_id (mc_peer_id_t i)
+{
+  /* Identity: the peer id is already in network byte order, so nothing is swapped. */
+  return i;
+}
+
+always_inline int
+mc_peer_id_compare (mc_peer_id_t a, mc_peer_id_t b)
+{
+  return memcmp (a.as_u8, b.as_u8, sizeof (a.as_u8));	/* byte-wise order; <0, 0 or >0 as memcmp */
+}
+
+/* Assert mastership.  Lowest peer_id among all peers wins mastership.
+   Only sent/received over mastership channel (MC_TRANSPORT_MASTERSHIP).
+   So, we don't need a message opcode. */
+typedef CLIB_PACKED (struct
+		     {
+		     /* Peer id asserting mastership. */
+		     mc_peer_id_t peer_id;
+		     /* Global sequence number asserted. */
+		     u32 global_sequence;}) mc_msg_master_assert_t;
+
+always_inline void
+mc_byte_swap_msg_master_assert (mc_msg_master_assert_t * r)
+{
+  /* Convert the message fields in place; no-op when no swap is needed. */
+  if (!mc_need_byte_swap ())
+    return;
+  r->peer_id = mc_byte_swap_peer_id (r->peer_id);
+  r->global_sequence = clib_byte_swap_u32 (r->global_sequence);
+}
+
+#define foreach_mc_msg_type			\
+  _ (master_assert)				\
+  _ (join_or_leave_request)			\
+  _ (join_reply)				\
+  _ (user_request)				\
+  _ (user_ack)					\
+  _ (catchup_request)				\
+  _ (catchup_reply)
+/* One MC_MSG_TYPE_<name> enumerator per foreach_mc_msg_type entry, in list order. */
+typedef enum
+{
+#define _(f) MC_MSG_TYPE_##f,
+  foreach_mc_msg_type
+#undef _
+} mc_relay_msg_type_t;
+
+/* Request to join or leave a given stream (see is_join).  Multicast over MC_TRANSPORT_JOIN. */
+typedef CLIB_PACKED (struct
+		     {
+mc_peer_id_t peer_id; mc_relay_msg_type_t type:32;
+		     /* type is MC_MSG_TYPE_join_or_leave_request, held in a 32-bit field */
+		     /* Stream to join or leave. */
+		     u32 stream_index;
+		     /* join = 1, leave = 0 */
+		     u8 is_join;}) mc_msg_join_or_leave_request_t;
+
+always_inline void
+mc_byte_swap_msg_join_or_leave_request (mc_msg_join_or_leave_request_t * r)
+{
+  /* Convert the multi-byte fields in place; is_join is a single byte. */
+  if (!mc_need_byte_swap ())
+    return;
+  r->peer_id = mc_byte_swap_peer_id (r->peer_id);
+  r->type = clib_byte_swap_u32 (r->type);
+  r->stream_index = clib_byte_swap_u32 (r->stream_index);
+}
+
+/* Join reply.  Multicast over MC_TRANSPORT_JOIN. */
+typedef CLIB_PACKED (struct
+		     {
+mc_peer_id_t peer_id; mc_relay_msg_type_t type:32;
+		     /* type is MC_MSG_TYPE_join_reply; stream_index names the stream joined */
+		     u32 stream_index;
+		     /* Peer ID to contact to catch up with this stream. */
+		     mc_peer_id_t catchup_peer_id;}) mc_msg_join_reply_t;
+
+always_inline void
+mc_byte_swap_msg_join_reply (mc_msg_join_reply_t * r)
+{
+  /* Convert every field of the join reply in place when a swap is needed. */
+  if (!mc_need_byte_swap ())
+    return;
+  r->peer_id = mc_byte_swap_peer_id (r->peer_id);
+  r->type = clib_byte_swap_u32 (r->type);
+  r->stream_index = clib_byte_swap_u32 (r->stream_index);
+  r->catchup_peer_id = mc_byte_swap_peer_id (r->catchup_peer_id);
+}
+
+/* Generic (application) request.  Multicast over MC_TRANSPORT_USER_REQUEST_TO_RELAY and then
+   relayed by relay master after filling in global sequence number. */
+typedef CLIB_PACKED (struct
+		     {
+		     mc_peer_id_t peer_id; u32 stream_index;
+		     /* Global sequence number as filled in by relay master. */
+		     u32 global_sequence;
+		     /* Local sequence number as filled in by peer sending message. */
+		     u32 local_sequence;
+		     /* Size of request data, i.e. of data[] below, in bytes. */
+		     u32 n_data_bytes;
+		     /* Opaque request data; the header size is what mc_max_message_size_in_bytes subtracts. */
+		     u8 data[0];}) mc_msg_user_request_t;
+
+always_inline void
+mc_byte_swap_msg_user_request (mc_msg_user_request_t * r)
+{
+  /* Convert the header fields in place; the data[] payload is left untouched. */
+  if (!mc_need_byte_swap ())
+    return;
+  r->peer_id = mc_byte_swap_peer_id (r->peer_id);
+  r->stream_index = clib_byte_swap_u32 (r->stream_index);
+  r->global_sequence = clib_byte_swap_u32 (r->global_sequence);
+  r->local_sequence = clib_byte_swap_u32 (r->local_sequence);
+  r->n_data_bytes = clib_byte_swap_u32 (r->n_data_bytes);
+}
+
+/* Acknowledgement of a user request.  Sent unicast over ACK channel. */
+typedef CLIB_PACKED (struct
+		     {
+		     mc_peer_id_t peer_id;
+		     u32 global_sequence; u32 stream_index;
+		     u32 local_sequence;	/* NOTE(review): seq_cmp_result below is presumably a sequence comparison outcome -- confirm */
+		     i32 seq_cmp_result;}) mc_msg_user_ack_t;
+
+always_inline void
+mc_byte_swap_msg_user_ack (mc_msg_user_ack_t * r)
+{
+  /* Convert every field of the ack in place when a swap is needed. */
+  if (!mc_need_byte_swap ())
+    return;
+  r->peer_id = mc_byte_swap_peer_id (r->peer_id);
+  r->stream_index = clib_byte_swap_u32 (r->stream_index);
+  r->global_sequence = clib_byte_swap_u32 (r->global_sequence);
+  r->local_sequence = clib_byte_swap_u32 (r->local_sequence);
+  r->seq_cmp_result = clib_byte_swap_i32 (r->seq_cmp_result);
+}
+
+/* Catchup request for one stream.  Sent/received unicast over catchup channel (e.g. using TCP). */
+typedef CLIB_PACKED (struct
+		     {
+		     mc_peer_id_t peer_id;
+		     u32 stream_index;}) mc_msg_catchup_request_t;
+
+always_inline void
+mc_byte_swap_msg_catchup_request (mc_msg_catchup_request_t * r)
+{
+  /* Convert both fields of the catchup request in place when needed. */
+  if (!mc_need_byte_swap ())
+    return;
+  r->peer_id = mc_byte_swap_peer_id (r->peer_id);
+  r->stream_index = clib_byte_swap_u32 (r->stream_index);
+}
+
+/* Catchup reply carrying snapshot data.  Sent/received unicast over catchup channel. */
+typedef CLIB_PACKED (struct
+		     {
+		     mc_peer_id_t peer_id; u32 stream_index;
+		     /* Last global sequence number included in catchup data. */
+		     u32 last_global_sequence_included;
+		     /* Size of catchup data, i.e. of data[] below, in bytes. */
+		     u32 n_data_bytes;
+		     /* Catchup data. */
+		     u8 data[0];}) mc_msg_catchup_reply_t;
+
+always_inline void
+mc_byte_swap_msg_catchup_reply (mc_msg_catchup_reply_t * r)
+{
+  /* Convert the header fields in place; the data[] payload is left untouched. */
+  if (!mc_need_byte_swap ())
+    return;
+  r->peer_id = mc_byte_swap_peer_id (r->peer_id);
+  r->stream_index = clib_byte_swap_u32 (r->stream_index);
+  r->last_global_sequence_included =
+    clib_byte_swap_u32 (r->last_global_sequence_included);
+  r->n_data_bytes = clib_byte_swap_u32 (r->n_data_bytes);
+}
+
+typedef struct _mc_serialize_msg
+{
+  /* Name for this message type. */
+  char *name;
+
+  /* Functions to serialize/unserialize data. */
+  serialize_function_t *serialize;
+  serialize_function_t *unserialize;
+
+  /* Maximum message size in bytes when serialized.
+     If zero then this will be set to the largest sent message. */
+  u32 max_n_bytes_serialized;
+
+  /* Opaque to use for first argument to serialize/unserialize function. */
+  u32 opaque;
+
+  /* Index in global message vector. */
+  u32 global_index;
+
+  /* Next entry in the registration list built by MC_SERIALIZE_MSG constructors. */
+  struct _mc_serialize_msg *next_registration;
+} mc_serialize_msg_t;
+
+typedef struct
+{
+  /* Index into global message vector; stream unserialize stores ~0 when the name is not found. */
+  u32 global_index;
+} mc_serialize_stream_msg_t;
+
+#define MC_SERIALIZE_MSG(x,...) /* declare x and emit a constructor that registers it */ \
+    __VA_ARGS__ mc_serialize_msg_t x;                           \
+static void __mc_serialize_msg_registration_##x (void)          \
+    __attribute__((__constructor__)) ;                          \
+static void __mc_serialize_msg_registration_##x (void)          \
+{                                                               \
+    vlib_main_t * vm = vlib_get_main();                         \
+    x.next_registration = vm->mc_msg_registrations; /* push x on the list head */ \
+    vm->mc_msg_registrations = &x;                              \
+}                                                               \
+__VA_ARGS__ mc_serialize_msg_t x	/* left open: the use site supplies the initializer and ';' */
+
+typedef enum
+{
+  MC_TRANSPORT_MASTERSHIP,	/* carries master assert messages */
+  MC_TRANSPORT_JOIN,		/* join/leave requests and join replies */
+  MC_TRANSPORT_USER_REQUEST_TO_RELAY,	/* user requests on their way to the relay master */
+  MC_TRANSPORT_USER_REQUEST_FROM_RELAY,	/* presumably the relayed copies sent by the master -- confirm */
+  MC_N_TRANSPORT_TYPE,		/* number of transport types; not a transport itself */
+} mc_transport_type_t;
+
+typedef struct			/* transport callbacks and parameters; every callback receives `opaque' */
+{
+  clib_error_t *(*tx_buffer) (void *opaque, mc_transport_type_t type,
+			      u32 buffer_index);
+  /* presumably sends the buffer to one peer over the ack channel -- TODO confirm */
+  clib_error_t *(*tx_ack) (void *opaque, mc_peer_id_t peer_id,
+			   u32 buffer_index);
+
+  /* Returns catchup opaque. */
+    uword (*catchup_request_fun) (void *opaque, u32 stream_index,
+				  mc_peer_id_t catchup_peer_id);
+  /* Takes a catchup opaque plus the data vector to send. */
+  void (*catchup_send_fun) (void *opaque, uword catchup_opaque,
+			    u8 * data_vector);
+
+  /* Opaque passed to callbacks. */
+  void *opaque;
+
+  mc_peer_id_t our_ack_peer_id;	/* format_mc_main marks the peer with this id as " (self)" */
+  mc_peer_id_t our_catchup_peer_id;
+
+  /* Max packet size (MTU) for this transport.
+     For IP this is interface MTU less IP + UDP header size. */
+  u32 max_packet_size;
+
+  format_function_t *format_peer_id;	/* format() callback used to print a peer id */
+} mc_transport_t;
+
+typedef struct
+{
+  /* Counts of messages received from this peer that were from the past
+     or from the future (i.e. with seq_cmp != 0). */
+  u64 n_msgs_from_past;
+  u64 n_msgs_from_future;
+} mc_stream_peer_stats_t;
+
+typedef struct
+{
+  /* ID of this peer. */
+  mc_peer_id_t id;
+
+  /* The last sequence we received from this peer. */
+  u32 last_sequence_received;
+  /* Running counters, and the baseline copied by mc_clear_stream_stats. */
+  mc_stream_peer_stats_t stats, stats_last_clear;
+} mc_stream_peer_t;
+
+typedef struct
+{
+  u32 buffer_index;
+
+  /* Cached copy of local sequence number from buffer. */
+  u32 local_sequence;
+
+  /* Number of times this buffer has been sent (retried). */
+  u32 n_retries;
+
+  /* Previous/next retries in doubly-linked list. */
+  u32 prev_index, next_index;
+
+  /* Per-peer bitmap. NOTE(review): old comment said "have acked" but the name says un-acked -- confirm */
+  uword *unacked_by_peer_bitmap;
+
+  /* Message send or resend time */
+  f64 sent_at;
+} mc_retry_t;
+
+typedef struct
+{
+  /* Number of retries sent for this stream (reported relative to stats_last_clear). */
+  u64 n_retries;
+} mc_stream_stats_t;
+
+struct mc_main_t;
+struct mc_stream_t;
+
+typedef struct
+{
+  /* Stream name; streams are looked up by it via mc_stream_by_name. */
+  char *name;
+
+  /* Number of outstanding messages (presumably the un-acked send window -- confirm). */
+  u32 window_size;
+
+  /* Retry interval, in seconds */
+  f64 retry_interval;
+
+  /* Retry limit (presumably the maximum number of resends -- confirm) */
+  u32 retry_limit;
+
+  /* User rx buffer callback */
+  void (*rx_buffer) (struct mc_main_t * mc_main,
+		     struct mc_stream_t * stream,
+		     mc_peer_id_t peer_id, u32 buffer_index);
+
+  /* User callback to create a snapshot; takes and returns a snapshot vector */
+  u8 *(*catchup_snapshot) (struct mc_main_t * mc_main,
+			   u8 * snapshot_vector,
+			   u32 last_global_sequence_included);
+
+  /* User callback to replay a snapshot */
+  void (*catchup) (struct mc_main_t * mc_main,
+		   u8 * snapshot_data, u32 n_snapshot_data_bytes);
+
+  /* Callback to save a snapshot for offline replay */
+  void (*save_snapshot) (struct mc_main_t * mc_main,
+			 u32 is_catchup,
+			 u8 * snapshot_data, u32 n_snapshot_data_bytes);
+
+  /* Called when a peer dies */
+  void (*peer_died) (struct mc_main_t * mc_main,
+		     struct mc_stream_t * stream, mc_peer_id_t peer_id);
+} mc_stream_config_t;
+
+#define foreach_mc_stream_state			\
+  _ (invalid)					\
+  _ (name_known)				\
+  _ (join_in_progress)				\
+  _ (catchup)					\
+  _ (ready)
+/* One MC_STREAM_STATE_<name> per list entry; "invalid" is first, so it has value 0. */
+typedef enum
+{
+#define _(f) MC_STREAM_STATE_##f,
+  foreach_mc_stream_state
+#undef _
+} mc_stream_state_t;
+
+typedef struct mc_stream_t
+{
+  mc_stream_config_t config;
+
+  mc_stream_state_t state;
+
+  /* Index of this stream in mc_main_t.stream_vector. */
+  u32 index;
+
+  /* Stream index 0 is always for MC internal use. */
+#define MC_STREAM_INDEX_INTERNAL 0
+
+  mc_retry_t *retry_pool;
+
+  /* Head and tail index of retry pool (both ~0 after mc_stream_init). */
+  u32 retry_head_index, retry_tail_index;
+
+  /*
+   * Country club for recently retired messages
+   * If the set of peers is expanding and a new peer
+   * misses a message, we can easily retire the FIFO
+   * element before we even know about the new peer
+   */
+  mc_retry_t *retired_fifo;
+
+  /* Hash mapping local sequence to retry pool index. */
+  uword *retry_index_by_local_sequence;
+
+  /* catch-up fifo of VLIB buffer indices.
+     start recording when catching up. */
+  u32 *catchup_fifo;
+
+  mc_stream_stats_t stats, stats_last_clear;
+
+  /* Peer pool. */
+  mc_stream_peer_t *peers;
+
+  /* Bitmap with ones for all peers in peer pool. */
+  uword *all_peer_bitmap;
+
+  /* Map of 64 bit peer id to index; presumably into the peer pool above -- confirm. */
+  mhash_t peer_index_by_id;
+
+  /* Timeout, in case we're alone in the world */
+  f64 join_timeout;
+
+  vlib_one_time_waiting_process_t *procs_waiting_for_join_done;
+
+  vlib_one_time_waiting_process_t *procs_waiting_for_open_window;
+
+  /* Next sequence number to use */
+  u32 our_local_sequence;
+
+  /*
+   * Last global sequence we processed.
+   * When supplying catchup data, we need to tell
+   * the client precisely where to start replaying
+   */
+  u32 last_global_sequence_processed;
+
+  /* Vector of unique messages we've sent on this stream. */
+  mc_serialize_stream_msg_t *stream_msgs;
+
+  /* Vector mapping global message index to per-stream message index (~0 if unmapped). */
+  u32 *stream_msg_index_by_global_index;
+
+  /* Hashed by message name. */
+  uword *stream_msg_index_by_name;
+
+  u64 user_requests_sent;
+  u64 user_requests_received;
+} mc_stream_t;
+
+always_inline void
+mc_stream_free (mc_stream_t * s)
+{				/* NOTE(review): retired_fifo, all_peer_bitmap, stream_msgs and the stream_msg_index_* members are not released here -- confirm intended */
+  pool_free (s->retry_pool);
+  hash_free (s->retry_index_by_local_sequence);
+  clib_fifo_free (s->catchup_fifo);
+  pool_free (s->peers);
+  mhash_free (&s->peer_index_by_id);
+  vec_free (s->procs_waiting_for_join_done);
+  vec_free (s->procs_waiting_for_open_window);	/* s itself is not freed */
+}
+
+always_inline void
+mc_stream_init (mc_stream_t * s)
+{
+  memset (s, 0, sizeof (*s));	/* every field starts out zero */
+  s->retry_tail_index = s->retry_head_index = ~0;	/* except the retry list ends */
+}
+
+typedef struct
+{
+  u32 stream_index;
+  u32 catchup_opaque;		/* NOTE(review): the transport callbacks use uword for the catchup opaque; u32 here may truncate -- confirm */
+  u8 *catchup_snapshot;
+} mc_catchup_process_arg_t;
+
+typedef enum
+{
+  MC_RELAY_STATE_NEGOTIATE,	/* value 0; the state mc_main_init starts in */
+  MC_RELAY_STATE_MASTER,	/* printed as "master" */
+  MC_RELAY_STATE_SLAVE,		/* printed as "slave" */
+} mc_relay_state_t;
+
+typedef struct
+{
+  mc_peer_id_t peer_id;
+  /* format_mc_main prints vlib_time_now() minus this value. */
+  f64 time_last_master_assert_received;
+} mc_mastership_peer_t;
+
+typedef struct
+{
+  u32 stream_index;		/* element type of mc_main_t.mc_unserialize_stream_and_buffers */
+  u32 buffer_index;
+} mc_stream_and_buffer_t;
+
+typedef struct mc_main_t
+{
+  mc_relay_state_t relay_state;
+
+  /* Mastership */
+  u32 we_can_be_relay_master;
+
+  u64 relay_master_peer_id;	/* set to ~0ULL by mc_main_init */
+
+  mc_mastership_peer_t *mastership_peers;
+
+  /* Map of 64 bit peer id to index; presumably into mastership_peers -- confirm. */
+  mhash_t mastership_peer_index_by_id;
+
+  /* The transport we're using. */
+  mc_transport_t transport;
+
+  /* Last-used global sequence number. */
+  u32 relay_global_sequence;
+
+  /* Vector of streams. */
+  mc_stream_t *stream_vector;
+
+  /* Hash table mapping stream name to index in stream_vector. */
+  uword *stream_index_by_name;
+
+  uword *procs_waiting_for_stream_name_by_name;
+
+  vlib_one_time_waiting_process_t **procs_waiting_for_stream_name_pool;
+
+  int joins_in_progress;
+
+  mc_catchup_process_arg_t *catchup_process_args;
+
+  /* Node indices returned by vlib_register_node for the mastership,
+     join ager, retry, catchup and unserialize processes. */
+  u32 mastership_process;
+  u32 join_ager_process;
+  u32 retry_process;
+  u32 catchup_process;
+  u32 unserialize_process;
+
+  /* Global vector of messages. */
+  mc_serialize_msg_t **global_msgs;
+
+  /* Hash table mapping message name to index. */
+  uword *global_msg_index_by_name;
+
+  /* Shared serialize/unserialize main. */
+  serialize_main_t serialize_mains[VLIB_N_RX_TX];
+
+  vlib_serialize_buffer_main_t serialize_buffer_mains[VLIB_N_RX_TX];
+
+  /* Convenience variables */
+  struct vlib_main_t *vlib_main;
+  elog_main_t *elog_main;
+
+  /* Maps 64 bit peer id to elog string table offset for this formatted peer id. */
+  mhash_t elog_id_by_peer_id;	/* only initialized when MC_EVENT_LOGGING > 0 */
+
+  uword *elog_id_by_msg_name;
+
+  /* For mc_unserialize. */
+  mc_stream_and_buffer_t *mc_unserialize_stream_and_buffers;
+} mc_main_t;
+
+always_inline mc_stream_t *
+mc_stream_by_name (mc_main_t * m, char *name)
+{
+  uword *index = hash_get (m->stream_index_by_name, name);	/* 0 if name unknown */
+  return index == 0 ? 0 : vec_elt_at_index (m->stream_vector, *index);
+}
+
+always_inline mc_stream_t *
+mc_stream_by_index (mc_main_t * m, u32 i)
+{
+  return i >= vec_len (m->stream_vector) ? 0 : &m->stream_vector[i];	/* 0 when out of range */
+}
+
+always_inline void
+mc_clear_stream_stats (mc_main_t * m)
+{				/* "clear" = record a baseline; the running counters are not zeroed */
+  mc_stream_t *s;
+  mc_stream_peer_t *p;
+  vec_foreach (s, m->stream_vector)
+  {
+    s->stats_last_clear = s->stats;	/* per-stream baseline */
+      /* *INDENT-OFF* */
+      pool_foreach (p, s->peers, ({
+	p->stats_last_clear = p->stats;	/* per-peer baseline */
+      }));
+      /* *INDENT-ON* */
+  }
+}
+
+/* Declare all message handlers. */
+#define _(f) void mc_msg_##f##_handler (mc_main_t * mcm, mc_msg_##f##_t * msg, u32 buffer_index);
+foreach_mc_msg_type
+#undef _
+  u32 mc_stream_join (mc_main_t * mcm, mc_stream_config_t *);
+
+void mc_stream_leave (mc_main_t * mcm, u32 stream_index);
+
+void mc_wait_for_stream_ready (mc_main_t * m, char *stream_name);
+
+u32 mc_stream_send (mc_main_t * mcm, u32 stream_index, u32 buffer_index);
+
+void mc_main_init (mc_main_t * mcm, char *tag);
+
+void mc_enable_disable_mastership (mc_main_t * mcm, int we_can_be_master);
+
+void *mc_get_vlib_buffer (struct vlib_main_t *vm, u32 n_bytes,
+			  u32 * bi_return);
+
+format_function_t format_mc_main;
+
+clib_error_t *mc_serialize_internal (mc_main_t * mc,
+				     u32 stream_index,
+				     u32 multiple_messages_per_vlib_buffer,
+				     mc_serialize_msg_t * msg, ...);
+
+clib_error_t *mc_serialize_va (mc_main_t * mc,
+			       u32 stream_index,
+			       u32 multiple_messages_per_vlib_buffer,
+			       mc_serialize_msg_t * msg, va_list * va);
+
+#define mc_serialize_stream(mc,si,msg,args...)			\
+  mc_serialize_internal((mc),(si),(0),(msg),(msg)->serialize,args)	/* explicit stream index; multiple_messages_per_vlib_buffer = 0 */
+
+#define mc_serialize(mc,msg,args...)				\
+  mc_serialize_internal((mc),(~0),(0),(msg),(msg)->serialize,args)	/* stream index ~0 (meaning not visible here -- confirm) */
+
+#define mc_serialize2(mc,add,msg,args...)				\
+  mc_serialize_internal((mc),(~0),(add),(msg),(msg)->serialize,args)	/* as mc_serialize, but `add' is passed as multiple_messages_per_vlib_buffer */
+
+void mc_unserialize (mc_main_t * mcm, mc_stream_t * s, u32 buffer_index);
+uword mc_unserialize_message (mc_main_t * mcm, mc_stream_t * s,
+			      serialize_main_t * m);
+
+serialize_function_t serialize_mc_main, unserialize_mc_main;
+
+always_inline uword
+mc_max_message_size_in_bytes (mc_main_t * mcm)
+{
+  return mcm->transport.max_packet_size - sizeof (mc_msg_user_request_t);	/* packet size less the user-request header; unsigned, so it wraps if the packet size is smaller */
+}
+
+always_inline word
+mc_serialize_n_bytes_left (mc_main_t * mcm, serialize_main_t * m)
+{
+  uword capacity = mc_max_message_size_in_bytes (mcm);	/* per-message limit */
+  return capacity - serialize_vlib_buffer_n_bytes (m);	/* returned as a signed word */
+}
+
+void unserialize_mc_stream (serialize_main_t * m, va_list * va);
+void mc_stream_join_process_hold (void);
+
+#endif /* included_vlib_mc_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/node.c b/src/vlib/node.c
new file mode 100644
index 00000000..e6739dc7
--- /dev/null
+++ b/src/vlib/node.c
@@ -0,0 +1,679 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * node.c: VLIB processing nodes
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/threads.h>
+
+/* Look up a node by name; returns 0 when no node has that name. */
+vlib_node_t *
+vlib_get_node_by_name (vlib_main_t * vm, u8 * name)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  uword *p;
+  u8 *key = name;
+  if (!clib_mem_is_heap_object (key))	/* name is not a heap object: build a temporary key from it */
+    key = format (0, "%s", key);
+  p = hash_get (nm->node_by_name, key);
+  if (key != name)
+    vec_free (key);		/* only the temporary key is freed, never the caller's name */
+  return p ? vec_elt (nm->nodes, p[0]) : 0;
+}
+
+static void
+node_set_elog_name (vlib_main_t * vm, uword node_index)
+{
+  vlib_node_t *n = vlib_get_node (vm, node_index);
+  elog_event_type_t *t;
+  /* Rebuild the "<name>-call" and "<name>-return" elog formats from the node's current name. */
+  t = vec_elt_at_index (vm->node_call_elog_event_types, node_index);
+  vec_free (t->format);		/* drop the previous format string, if any */
+  t->format = (char *) format (0, "%v-call: %%d%c", n->name, 0);	/* trailing %c with 0 NUL-terminates */
+
+  t = vec_elt_at_index (vm->node_return_elog_event_types, node_index);
+  vec_free (t->format);
+  t->format = (char *) format (0, "%v-return: %%d%c", n->name, 0);
+
+  n->name_elog_string = elog_string (&vm->elog_main, "%v%c", n->name, 0);
+}
+
+static void
+vlib_worker_thread_node_rename (u32 node_index)
+{
+  int i;
+  vlib_main_t *vm;
+  vlib_node_t *n;
+  /* Copy a node's name fields from vlib_mains[0] into every other vlib_main. */
+  if (vec_len (vlib_mains) == 1)
+    return;			/* a single vlib_main: nothing to propagate */
+
+  vm = vlib_mains[0];
+  n = vlib_get_node (vm, node_index);
+
+  ASSERT (vlib_get_thread_index () == 0);	/* must be called on thread 0 */
+  ASSERT (*vlib_worker_threads->wait_at_barrier == 1);	/* and with the barrier flag set */
+
+  for (i = 1; i < vec_len (vlib_mains); i++)
+    {
+      vlib_main_t *vm_worker = vlib_mains[i];
+      vlib_node_t *n_worker = vlib_get_node (vm_worker, node_index);
+
+      n_worker->name = n->name;	/* pointer is shared, not duplicated */
+      n_worker->name_elog_string = n->name_elog_string;
+    }
+}
+
+void
+vlib_node_rename (vlib_main_t * vm, u32 node_index, char *fmt, ...)
+{
+  va_list va;
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_node_t *n = vlib_get_node (vm, node_index);
+  /* Rename a node to the formatted string and keep node_by_name and elog names in step. */
+  va_start (va, fmt);
+  hash_unset (nm->node_by_name, n->name);	/* remove the old key before its vector is freed */
+  vec_free (n->name);
+  n->name = va_format (0, fmt, &va);
+  va_end (va);
+  hash_set (nm->node_by_name, n->name, n->index);
+
+  node_set_elog_name (vm, node_index);
+
+  /* Propagate the change to all worker threads (that helper asserts thread 0 and the barrier flag) */
+  vlib_worker_thread_node_rename (node_index);
+}
+
+static void
+vlib_node_runtime_update (vlib_main_t * vm, u32 node_index, u32 next_index)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_node_runtime_t *r, *s;
+  vlib_node_t *node, *next_node;
+  vlib_next_frame_t *nf;
+  vlib_pending_frame_t *pf;
+  i32 i, j, n_insert;
+  /* Grow the node's run of next frames to match next_nodes, then bind slot next_index. */
+  ASSERT (vlib_get_thread_index () == 0);
+
+  vlib_worker_thread_barrier_sync (vm);	/* released at the end of this function */
+
+  node = vec_elt (nm->nodes, node_index);
+  r = vlib_node_get_runtime (vm, node_index);
+
+  n_insert = vec_len (node->next_nodes) - r->n_next_nodes;	/* next slots the runtime does not have yet */
+  if (n_insert > 0)
+    {
+      i = r->next_frame_index + r->n_next_nodes;	/* insertion point: just past this node's frames */
+      vec_insert (nm->next_frames, n_insert, i);
+
+      /* Initialize newly inserted next frames. */
+      for (j = 0; j < n_insert; j++)
+	vlib_next_frame_init (nm->next_frames + i + j);
+
+      /* Relocate other nodes' next frames at or above the insertion point. */
+      for (j = 0; j < vec_len (nm->nodes); j++)
+	{
+	  s = vlib_node_get_runtime (vm, j);
+	  if (j != node_index && s->next_frame_index >= i)
+	    s->next_frame_index += n_insert;
+	}
+
+      /* Pending frames may need to be relocated also. */
+      vec_foreach (pf, nm->pending_frames)
+      {
+	if (pf->next_frame_index != VLIB_PENDING_FRAME_NO_NEXT_FRAME
+	    && pf->next_frame_index >= i)
+	  pf->next_frame_index += n_insert;
+      }
+      /* *INDENT-OFF* */
+      pool_foreach (pf, nm->suspended_process_frames, ({
+	  if (pf->next_frame_index != ~0 && pf->next_frame_index >= i)
+	    pf->next_frame_index += n_insert;
+      }));
+      /* *INDENT-ON* */
+
+      r->n_next_nodes = vec_len (node->next_nodes);
+    }
+
+  /* Point the next frame for slot next_index at the target node's runtime index. */
+  next_node = vlib_get_node (vm, node->next_nodes[next_index]);
+  nf = nm->next_frames + r->next_frame_index + next_index;
+  nf->node_runtime_index = next_node->runtime_index;
+
+  vlib_worker_thread_node_runtime_update ();
+
+  vlib_worker_thread_barrier_release (vm);
+}
+
+uword
+vlib_node_get_next (vlib_main_t * vm, uword node_index, uword next_node_index)
+{
+  /*
+   * Look up which next slot of node_index leads to next_node_index.
+   * Returns the slot, or ~0 when no such next has been added.
+   */
+  vlib_node_main_t *node_main = &vm->node_main;
+  vlib_node_t *from_node = vec_elt (node_main->nodes, node_index);
+  uword *slot_entry;
+
+  /* Runtime has to be initialized. */
+  ASSERT (node_main->flags & VLIB_NODE_MAIN_RUNTIME_STARTED);
+
+  slot_entry = hash_get (from_node->next_slot_by_node, next_node_index);
+  if (slot_entry == 0)
+    return (~0);
+  return slot_entry[0];
+}
+
+/* Add next node to given node in given slot (~0 = first free slot); returns the slot used. */
+uword
+vlib_node_add_next_with_slot (vlib_main_t * vm,
+			      uword node_index,
+			      uword next_node_index, uword slot)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_node_t *node, *next;
+  uword *p;
+
+  node = vec_elt (nm->nodes, node_index);
+  next = vec_elt (nm->nodes, next_node_index);
+
+  /* Runtime has to be initialized. */
+  ASSERT (nm->flags & VLIB_NODE_MAIN_RUNTIME_STARTED);
+
+  if ((p = hash_get (node->next_slot_by_node, next_node_index)))
+    {
+      /* Next already exists: slot must match. */
+      if (slot != ~0)
+	ASSERT (slot == p[0]);
+      return p[0];		/* existing slot is returned unchanged */
+    }
+
+  if (slot == ~0)
+    slot = vec_len (node->next_nodes);	/* append after the current last slot */
+
+  vec_validate_init_empty (node->next_nodes, slot, ~0);	/* any skipped slots are filled with ~0 */
+  vec_validate (node->n_vectors_by_next_node, slot);
+
+  node->next_nodes[slot] = next_node_index;
+  hash_set (node->next_slot_by_node, next_node_index, slot);
+
+  vlib_node_runtime_update (vm, node_index, slot);
+
+  next->prev_node_bitmap = clib_bitmap_ori (next->prev_node_bitmap,
+					    node_index);
+
+  /* Siblings all get same node structure: add the same next, in the same slot, to each one. */
+  {
+    uword sib_node_index, sib_slot;
+    vlib_node_t *sib_node;
+    /* *INDENT-OFF* */
+    clib_bitmap_foreach (sib_node_index, node->sibling_bitmap, ({
+      sib_node = vec_elt (nm->nodes, sib_node_index);
+      if (sib_node != node)
+	{
+	  sib_slot = vlib_node_add_next_with_slot (vm, sib_node_index, next_node_index, slot);
+	  ASSERT (sib_slot == slot);
+	}
+    }));
+    /* *INDENT-ON* */
+  }
+
+  return slot;
+}
+
+/* Add next node, looked up by name, to given node in given slot. */
+uword
+vlib_node_add_named_next_with_slot (vlib_main_t * vm,
+				    uword node, char *name, uword slot)
+{
+  vlib_node_main_t *node_main = &vm->node_main;
+  vlib_node_t *from = vlib_get_node (vm, node);
+  vlib_node_t *target = vlib_get_node_by_name (vm, (u8 *) name);
+
+  /* Name resolves to an existing node: connect it right away. */
+  if (target)
+    return vlib_node_add_next_with_slot (vm, node, target->index, slot);
+
+  /* An unknown name is reported as ~0 once the runtime has started. */
+  if (node_main->flags & VLIB_NODE_MAIN_RUNTIME_STARTED)
+    return ~0;
+
+  /* Before that, only record the name; slot ~0 picks the first slot
+     past both the recorded names and the existing next nodes. */
+  if (slot == ~0)
+    slot = clib_max (vec_len (from->next_node_names),
+		     vec_len (from->next_nodes));
+
+  vec_validate (from->next_node_names, slot);
+  from->next_node_names[slot] = name;
+  return slot;
+}
+
+static void
+node_elog_init (vlib_main_t * vm, uword ni)
+{
+  elog_event_type_t blank_type;
+
+  /* Each node gets a pair of elog event types, both starting out zeroed:
+     one for calls of the node function, one for returns from it. */
+  memset (&blank_type, 0, sizeof (blank_type));
+
+  vec_validate (vm->node_call_elog_event_types, ni);
+  vec_validate (vm->node_return_elog_event_types, ni);
+  vm->node_call_elog_event_types[ni] = blank_type;
+  vm->node_return_elog_event_types[ni] = blank_type;
+
+  /* Remaining per-node elog setup is done by node_set_elog_name. */
+  node_set_elog_name (vm, ni);
+}
+
+#ifdef CLIB_UNIX		/* process stacks get an mprotect'ed first page */
+#define STACK_ALIGN (clib_mem_get_page_size())	/* so align stacks to a page */
+#else
+#define STACK_ALIGN CLIB_CACHE_LINE_BYTES	/* no guard page: cache line will do */
+#endif
+
+/* Build node and runtime state from registration r; sets r->index. */
+static void
+register_node (vlib_main_t * vm, vlib_node_registration_t * r)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_node_t *n;
+  u32 page_size = clib_mem_get_page_size ();
+  int i;
+
+  if (CLIB_DEBUG > 0)
+    {
+      /* Default (0) type should match INTERNAL. */
+      vlib_node_t zero = { 0 };
+      ASSERT (VLIB_NODE_TYPE_INTERNAL == zero.type);
+    }
+
+  ASSERT (r->function != 0);
+
+  n = clib_mem_alloc_no_fail (sizeof (n[0]));
+  memset (n, 0, sizeof (n[0]));
+  n->index = vec_len (nm->nodes);	/* new node goes at the end of nm->nodes */
+
+  vec_add1 (nm->nodes, n);
+
+  /* Name is always a vector so it can be formatted with %v. */
+  if (clib_mem_is_heap_object (vec_header (r->name, 0)))
+    n->name = vec_dup ((u8 *) r->name);	/* treated as a vector already */
+  else
+    n->name = format (0, "%s", r->name);	/* C string: build a vector */
+
+  if (!nm->node_by_name)
+    nm->node_by_name = hash_create_vec ( /* size */ 32,
+					sizeof (n->name[0]), sizeof (uword));
+
+  /* Node names must be unique. */
+  {
+    vlib_node_t *o = vlib_get_node_by_name (vm, n->name);
+    if (o)
+      clib_error ("more than one node named `%v'", n->name);
+  }
+
+  hash_set (nm->node_by_name, n->name, n->index);
+
+  r->index = n->index;		/* save index in registration */
+  n->function = r->function;
+
+  /* Sibling name is resolved into sibling_bitmap by vlib_node_main_init. */
+  n->sibling_of = r->sibling_of;
+  if (r->sibling_of && r->n_next_nodes > 0)
+    clib_error ("sibling node should not have any next nodes `%v'", n->name);
+
+  if (r->type == VLIB_NODE_TYPE_INTERNAL)
+    ASSERT (r->vector_size > 0);
+
+#define _(f) n->f = r->f	/* copy field f from registration to node */
+  _(type);
+  _(flags);
+  _(state);
+  _(scalar_size);
+  _(vector_size);
+  _(format_buffer);
+  _(unformat_buffer);
+  _(format_trace);
+  _(validate_frame);
+
+  /* Register error counters. */
+  vlib_register_errors (vm, n->index, r->n_errors, r->error_strings);
+  node_elog_init (vm, n->index);
+
+  _(runtime_data_bytes);
+#undef _
+  if (r->runtime_data_bytes > 0)
+    {
+      vec_resize (n->runtime_data, r->runtime_data_bytes);
+      if (r->runtime_data)
+	clib_memcpy (n->runtime_data, r->runtime_data, r->runtime_data_bytes);
+    }
+
+  vec_resize (n->next_node_names, r->n_next_nodes);
+  for (i = 0; i < r->n_next_nodes; i++)
+    n->next_node_names[i] = r->next_nodes[i];
+
+  vec_validate_init_empty (n->next_nodes, r->n_next_nodes - 1, ~0);
+  vec_validate (n->n_vectors_by_next_node, r->n_next_nodes - 1);
+
+  n->owner_node_index = n->owner_next_index = ~0;
+
+  /* Initialize node runtime. */
+  {
+    vlib_node_runtime_t *rt;
+    u32 i;
+
+    if (n->type == VLIB_NODE_TYPE_PROCESS)
+      {
+	vlib_process_t *p;
+	uword log2_n_stack_bytes;
+
+	log2_n_stack_bytes = clib_max (r->process_log2_n_stack_bytes, 15);	/* >= 32 KB */
+
+#ifdef CLIB_UNIX
+	/*
+	 * Bump the stack size if running over a kernel with a large page size,
+	 * and the stack isn't any too big to begin with. Otherwise, we'll
+	 * trip over the stack guard page for sure.
+	 */
+	if ((page_size > (4 << 10)) && log2_n_stack_bytes < 19)
+	  {
+	    if ((1 << log2_n_stack_bytes) <= page_size)
+	      log2_n_stack_bytes = min_log2 (page_size) + 1;
+	    else
+	      log2_n_stack_bytes++;
+	  }
+#endif
+
+	p = clib_mem_alloc_aligned_at_offset
+	  (sizeof (p[0]) + (1 << log2_n_stack_bytes),
+	   STACK_ALIGN, STRUCT_OFFSET_OF (vlib_process_t, stack),
+	   0 /* no, don't call os_out_of_memory */ );
+	if (p == 0)
+	  clib_panic ("failed to allocate process stack (%d bytes)",
+		      1 << log2_n_stack_bytes);
+
+	memset (p, 0, sizeof (p[0]));
+	p->log2_n_stack_bytes = log2_n_stack_bytes;
+
+	/* Process node's runtime index is really index into process
+	   pointer vector. */
+	n->runtime_index = vec_len (nm->processes);
+
+	vec_add1 (nm->processes, p);
+
+	/* Paint first stack word with magic number so we can at least
+	   detect process stack overruns. */
+	p->stack[0] = VLIB_PROCESS_STACK_MAGIC;
+
+	/* Node runtime is stored inside of process. */
+	rt = &p->node_runtime;
+
+#ifdef CLIB_UNIX
+	/*
+	 * Disallow writes to the bottom page of the stack, to catch stack
+	 * overflows.  Must come after the magic word has been written.
+	 */
+	if (mprotect (p->stack, page_size, PROT_READ) < 0)
+	  clib_unix_warning ("process stack");
+#endif
+      }
+    else
+      {
+	vec_add2_aligned (nm->nodes_by_type[n->type], rt, 1,
+			  /* align */ CLIB_CACHE_LINE_BYTES);
+	n->runtime_index = rt - nm->nodes_by_type[n->type];
+      }
+
+    if (n->type == VLIB_NODE_TYPE_INPUT)
+      nm->input_node_counts_by_state[n->state] += 1;
+
+    rt->function = n->function;
+    rt->flags = n->flags;
+    rt->state = n->state;
+    rt->node_index = n->index;
+
+    rt->n_next_nodes = r->n_next_nodes;
+    rt->next_frame_index = vec_len (nm->next_frames);
+
+    vec_resize (nm->next_frames, rt->n_next_nodes);
+    for (i = 0; i < rt->n_next_nodes; i++)
+      vlib_next_frame_init (nm->next_frames + rt->next_frame_index + i);
+
+    vec_resize (rt->errors, r->n_errors);
+    for (i = 0; i < vec_len (rt->errors); i++)
+      rt->errors[i] = vlib_error_set (n->index, i);
+
+    STATIC_ASSERT_SIZEOF (vlib_node_runtime_t, 128);
+    ASSERT (vec_len (n->runtime_data) <= VLIB_NODE_RUNTIME_DATA_SIZE);
+
+    if (vec_len (n->runtime_data) > 0)
+      clib_memcpy (rt->runtime_data, n->runtime_data,
+		   vec_len (n->runtime_data));
+
+    vec_free (n->runtime_data);	/* template was copied into rt->runtime_data */
+  }
+}
+
+/* Register new packet processing node; returns the node index assigned. */
+u32
+vlib_register_node (vlib_main_t * vm, vlib_node_registration_t * r)
+{
+  register_node (vm, r);
+  return r->index;		/* filled in by register_node */
+}
+
+static uword
+null_node_fn (vlib_main_t * vm,
+	      vlib_node_runtime_t * node, vlib_frame_t * frame)
+{
+  /* Latch the count first: the frame itself is released below. */
+  u16 n_dropped = frame->n_vectors;
+  void *buffers = vlib_frame_args (frame);
+  vlib_node_increment_counter (vm, node->node_index, 0, n_dropped);
+  vlib_buffer_free (vm, buffers, n_dropped);
+  vlib_frame_free (vm, node, frame);
+  return n_dropped;
+}
+
+void
+vlib_register_all_static_nodes (vlib_main_t * vm)
+{
+  static char *blackhole_error_strings[] = {
+    "blackholed packets",
+  };
+
+  /* Registration for the built-in "null-node" (see null_node_fn). */
+  static vlib_node_registration_t blackhole_reg = {
+    .name = "null-node",
+    .function = null_node_fn,
+    .vector_size = sizeof (u32),
+    .error_strings = blackhole_error_strings,
+    .n_errors = 1,
+  };
+
+  vlib_node_registration_t *reg;
+
+  /* Registering the null node first claims node index 0, so that
+     index can never be handed to a real node. */
+  register_node (vm, &blackhole_reg);
+
+  /* Then register every node on the node_registrations list (built by
+     the VLIB_REGISTER_NODE constructors). */
+  for (reg = vm->node_main.node_registrations; reg != 0;
+       reg = reg->next_registration)
+    register_node (vm, reg);
+}
+
+clib_error_t *
+vlib_node_main_init (vlib_main_t * vm)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  clib_error_t *error = 0;
+  vlib_node_t *n;
+  uword ni;
+
+  nm->frame_size_hash = hash_create (0, sizeof (uword));
+  nm->flags |= VLIB_NODE_MAIN_RUNTIME_STARTED;
+
+  /* Generate sibling relationships */
+  {
+    vlib_node_t *n, *sib;
+    uword si;
+
+    for (ni = 0; ni < vec_len (nm->nodes); ni++)
+      {
+	n = vec_elt (nm->nodes, ni);
+
+	if (!n->sibling_of)
+	  continue;
+
+	sib = vlib_get_node_by_name (vm, (u8 *) n->sibling_of);
+	if (!sib)
+	  {
+	    error = clib_error_create ("sibling `%s' not found for node `%v'",
+				       n->sibling_of, n->name);
+	    goto done;
+	  }
+
+        /* *INDENT-OFF* */
+	clib_bitmap_foreach (si, sib->sibling_bitmap, ({
+	      vlib_node_t * m = vec_elt (nm->nodes, si);
+
+	      /* Connect all of sibling's siblings to us. */
+	      m->sibling_bitmap = clib_bitmap_ori (m->sibling_bitmap, n->index);
+
+	      /* Connect us to all of sibling's siblings. */
+	      n->sibling_bitmap = clib_bitmap_ori (n->sibling_bitmap, si);
+	    }));
+        /* *INDENT-ON* */
+
+	/* Connect sibling to us. */
+	sib->sibling_bitmap = clib_bitmap_ori (sib->sibling_bitmap, n->index);
+
+	/* Connect us to sibling. */
+	n->sibling_bitmap = clib_bitmap_ori (n->sibling_bitmap, sib->index);
+      }
+  }
+
+  /* Resolve next names into next indices. */
+  for (ni = 0; ni < vec_len (nm->nodes); ni++)
+    {
+      uword i;
+
+      n = vec_elt (nm->nodes, ni);
+
+      for (i = 0; i < vec_len (n->next_node_names); i++)
+	{
+	  char *a = n->next_node_names[i];
+
+	  if (!a)
+	    continue;
+
+	  if (~0 == vlib_node_add_named_next_with_slot (vm, n->index, a, i))
+	    {
+	      error = clib_error_create
+		("node `%v' refers to unknown node `%s'", n->name, a);
+	      goto done;
+	    }
+	}
+
+      vec_free (n->next_node_names);
+    }
+
+  /* Set previous node pointers. */
+  for (ni = 0; ni < vec_len (nm->nodes); ni++)
+    {
+      vlib_node_t *n_next;
+      uword i;
+
+      n = vec_elt (nm->nodes, ni);
+
+      for (i = 0; i < vec_len (n->next_nodes); i++)
+	{
+	  if (n->next_nodes[i] >= vec_len (nm->nodes))
+	    continue;
+
+	  n_next = vec_elt (nm->nodes, n->next_nodes[i]);
+	  n_next->prev_node_bitmap =
+	    clib_bitmap_ori (n_next->prev_node_bitmap, n->index);
+	}
+    }
+
+  {
+    vlib_next_frame_t *nf;
+    vlib_node_runtime_t *r;
+    vlib_node_t *next;
+    uword i;
+
+    vec_foreach (r, nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL])
+    {
+      if (r->n_next_nodes == 0)
+	continue;
+
+      n = vlib_get_node (vm, r->node_index);
+      nf = vec_elt_at_index (nm->next_frames, r->next_frame_index);
+
+      for (i = 0; i < vec_len (n->next_nodes); i++)
+	{
+	  next = vlib_get_node (vm, n->next_nodes[i]);
+
+	  /* Validate node runtime indices are correctly initialized. */
+	  ASSERT (nf[i].node_runtime_index == next->runtime_index);
+
+	  nf[i].flags = 0;
+	  if (next->flags & VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH)
+	    nf[i].flags |= VLIB_FRAME_NO_FREE_AFTER_DISPATCH;
+	}
+    }
+  }
+
+done:
+  return error;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/node.h b/src/vlib/node.h
new file mode 100644
index 00000000..2acd61ce
--- /dev/null
+++ b/src/vlib/node.h
@@ -0,0 +1,736 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * node.h: VLIB processing nodes
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_node_h
+#define included_vlib_node_h
+
+#include <vppinfra/cpu.h>
+#include <vppinfra/longjmp.h>
+#include <vppinfra/lock.h>
+#include <vlib/trace.h>		/* for vlib_trace_filter_t */
+
+/* Forward declaration. */
+struct vlib_node_runtime_t;
+struct vlib_frame_t;
+
+/* Signature of a node's processing function.  Internal nodes (including
+   output nodes) move data from node to node (or out of the graph). */
+typedef uword (vlib_node_function_t) (struct vlib_main_t * vm,
+				      struct vlib_node_runtime_t * node,
+				      struct vlib_frame_t * frame);
+
+typedef enum
+{
+  /* An internal node on the call graph (could be output); default (0). */
+  VLIB_NODE_TYPE_INTERNAL,
+
+  /* Nodes which input data into the processing graph.
+     Input nodes are called for each iteration of main loop. */
+  VLIB_NODE_TYPE_INPUT,
+
+  /* Nodes to be called before all input nodes.
+     Used, for example, to clean out driver TX rings before
+     processing input. */
+  VLIB_NODE_TYPE_PRE_INPUT,
+
+  /* "Process" nodes which can be suspended and later resumed. */
+  VLIB_NODE_TYPE_PROCESS,
+
+  VLIB_N_NODE_TYPE,		/* number of types; sizes nodes_by_type[] */
+} vlib_node_type_t;
+
+typedef struct _vlib_node_registration
+{
+  /* Vector processing function for this node (must be non-zero). */
+  vlib_node_function_t *function;
+
+  /* Node name; must be unique among all registered nodes. */
+  char *name;
+
+  /* Name of sibling (if applicable); a sibling lists no next nodes itself. */
+  char *sibling_of;
+
+  /* Node index filled in by registration. */
+  u32 index;
+
+  /* Type of this node. */
+  vlib_node_type_t type;
+
+  /* Error strings indexed by error code for this node. */
+  char **error_strings;
+
+  /* Buffer format/unformat for this node. */
+  format_function_t *format_buffer;
+  unformat_function_t *unformat_buffer;
+
+  /* Trace format/unformat for this node. */
+  format_function_t *format_trace;
+  unformat_function_t *unformat_trace;
+
+  /* Function to validate incoming frames. */
+  u8 *(*validate_frame) (struct vlib_main_t * vm,
+			 struct vlib_node_runtime_t *,
+			 struct vlib_frame_t * f);
+
+  /* Per-node runtime data template, copied at registration (may be 0). */
+  void *runtime_data;
+
+  /* Process stack size, as log2 of bytes (values below 15 are raised to 15). */
+  u16 process_log2_n_stack_bytes;
+
+  /* Number of bytes of per-node run time data. */
+  u8 runtime_data_bytes;
+
+  /* State for input nodes. */
+  u8 state;
+
+  /* Node flags. */
+  u16 flags;
+
+  /* Size of scalar and vector arguments in bytes. */
+  u16 scalar_size, vector_size;
+
+  /* Number of error codes used by this node. */
+  u16 n_errors;
+
+  /* Number of next node names that follow. */
+  u16 n_next_nodes;
+
+  /* Link in the list of registrations built by VLIB_REGISTER_NODE. */
+  struct _vlib_node_registration *next_registration;
+
+  /* Names of next nodes which this node feeds into (0 entries are skipped). */
+  char *next_nodes[];
+
+} vlib_node_registration_t;
+
+#define VLIB_REGISTER_NODE(x,...)  /* declare x; link it at load time */ \
+    __VA_ARGS__ vlib_node_registration_t x;                             \
+static void __vlib_add_node_registration_##x (void)                     \
+    __attribute__((__constructor__)) ;                                  \
+static void __vlib_add_node_registration_##x (void)                     \
+{                                                                       \
+    vlib_main_t * vm = vlib_get_main();                                 \
+    x.next_registration = vm->node_main.node_registrations; /* push */  \
+    vm->node_main.node_registrations = &x;                              \
+}                                                                       \
+__VA_ARGS__ vlib_node_registration_t x
+
+#if CLIB_DEBUG > 0		/* debug builds: multiarch macros expand to nothing */
+#define VLIB_NODE_FUNCTION_CLONE_TEMPLATE(arch, fn, tgt)
+#define VLIB_NODE_FUNCTION_MULTIARCH_CLONE(fn)
+#define VLIB_NODE_FUNCTION_MULTIARCH(node, fn)
+#else				/* one clone of fn per march variant, picked at load time */
+#define VLIB_NODE_FUNCTION_CLONE_TEMPLATE(arch, fn, tgt)		\
+  uword									\
+  __attribute__ ((flatten))						\
+  __attribute__ ((target (tgt)))					\
+  CLIB_CPU_OPTIMIZED							\
+  fn ## _ ## arch ( struct vlib_main_t * vm,				\
+                   struct vlib_node_runtime_t * node,			\
+                   struct vlib_frame_t * frame)				\
+  { return fn (vm, node, frame); }
+
+#define VLIB_NODE_FUNCTION_MULTIARCH_CLONE(fn)				\
+  foreach_march_variant(VLIB_NODE_FUNCTION_CLONE_TEMPLATE, fn)
+
+#define VLIB_NODE_FUNCTION_MULTIARCH(node, fn)				\
+  VLIB_NODE_FUNCTION_MULTIARCH_CLONE(fn)				\
+  CLIB_MULTIARCH_SELECT_FN(fn, static inline)				\
+  static void __attribute__((__constructor__))				\
+  __vlib_node_function_multiarch_select_##node (void)			\
+  { node.function = fn ## _multiarch_select(); }
+#endif
+
+always_inline vlib_node_registration_t *
+vlib_node_next_registered (vlib_node_registration_t * c)
+{
+  /* The trailing next_nodes[] name pointers vary in number per
+     registration, so their total size is passed as the extra bytes. */
+  uword n_name_bytes = c->n_next_nodes * sizeof (c->next_nodes[0]);
+  return clib_elf_section_data_next (c, n_name_bytes);
+}
+
+typedef struct
+{
+  /* Total calls, clock ticks, vector elements and suspends for this node. */
+  u64 calls, vectors, clocks, suspends;
+  u64 max_clock;		/* presumably as vlib_node_runtime_t.max_clock -- confirm */
+  u64 max_clock_n;		/* presumably as vlib_node_runtime_t.max_clock_n -- confirm */
+} vlib_node_stats_t;
+
+#define foreach_vlib_node_state					\
+  /* Input node is called each iteration of main loop.		\
+     This is the default (zero). */				\
+  _ (POLLING)							\
+  /* Input node is called when device signals an interrupt. */	\
+  _ (INTERRUPT)							\
+  /* Input node is never called. */				\
+  _ (DISABLED)
+
+typedef enum
+{
+#define _(f) VLIB_NODE_STATE_##f,	/* one enumerator per state listed above */
+  foreach_vlib_node_state
+#undef _
+    VLIB_N_NODE_STATE,		/* number of states; sizes input_node_counts_by_state[] */
+} vlib_node_state_t;
+
+typedef struct vlib_node_t
+{
+  /* Vector processing function for this node. */
+  vlib_node_function_t *function;
+
+  /* Node name (a vector, so it can be formatted with %v). */
+  u8 *name;
+
+  /* Node name index in elog string table. */
+  u32 name_elog_string;
+
+  /* Total statistics for this node. */
+  vlib_node_stats_t stats_total;
+
+  /* Saved values as of last clear (or zero if never cleared).
+     Current values are always stats_total - stats_last_clear. */
+  vlib_node_stats_t stats_last_clear;
+
+  /* Type of this node. */
+  vlib_node_type_t type;
+
+  /* Node index. */
+  u32 index;
+
+  /* Index of node runtime (process nodes: index into processes vector). */
+  u32 runtime_index;
+
+  /* Runtime data template; copied into the runtime, then freed, at registration. */
+  void *runtime_data;
+
+  /* Node flags. */
+  u16 flags;
+
+  /* Processing function keeps frame.  Tells node dispatching code not
+     to free frame after dispatch is done.  */
+#define VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH (1 << 0)
+
+  /* Node counts as output/drop/punt node for stats purposes. */
+#define VLIB_NODE_FLAG_IS_OUTPUT (1 << 1)
+#define VLIB_NODE_FLAG_IS_DROP (1 << 2)
+#define VLIB_NODE_FLAG_IS_PUNT (1 << 3)
+#define VLIB_NODE_FLAG_IS_HANDOFF (1 << 4)
+
+  /* Set if current node runtime has traced vectors. */
+#define VLIB_NODE_FLAG_TRACE (1 << 5)
+
+#define VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE (1 << 6)
+#define VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE (1 << 7)
+
+  /* State for input nodes. */
+  u8 state;
+
+  /* Number of bytes of run time data. */
+  u8 runtime_data_bytes;
+
+  /* Number of error codes used by this node. */
+  u16 n_errors;
+
+  /* Size of scalar and vector arguments in bytes. */
+  u16 scalar_size, vector_size;
+
+  /* Handle/index in error heap for this node. */
+  u32 error_heap_handle;
+  u32 error_heap_index;
+
+  /* Error strings indexed by error code for this node. */
+  char **error_strings;
+
+  /* Vector of next node names.
+     Only used before next_nodes array is initialized. */
+  char **next_node_names;
+
+  /* Next node indices for this node (~0 marks a slot not yet filled). */
+  u32 *next_nodes;
+
+  /* Name of node that we are sibling of. */
+  char *sibling_of;
+
+  /* Bitmap of all of this node's siblings. */
+  uword *sibling_bitmap;
+
+  /* Total number of vectors sent to each next node. */
+  u64 *n_vectors_by_next_node;
+
+  /* Hash table mapping next node index into slot in
+     next_nodes vector.  Quickly determines whether this node
+     is connected to given next node and, if so, with which slot. */
+  uword *next_slot_by_node;
+
+  /* Bitmap of node indices which feed this node. */
+  uword *prev_node_bitmap;
+
+  /* Node/next-index which owns enqueue rights to this node (~0: none). */
+  u32 owner_node_index, owner_next_index;
+
+  /* Buffer format/unformat for this node. */
+  format_function_t *format_buffer;
+  unformat_function_t *unformat_buffer;
+
+  /* Trace buffer format for this node. */
+  format_function_t *format_trace;
+
+  /* Function to validate incoming frames. */
+  u8 *(*validate_frame) (struct vlib_main_t * vm,
+			 struct vlib_node_runtime_t *,
+			 struct vlib_frame_t * f);
+  /* for pretty-printing, not typically valid */
+  u8 *state_string;
+} vlib_node_t;
+
+#define VLIB_INVALID_NODE_INDEX ((u32) ~0)	/* all-ones u32: not a valid node index */
+
+/* Max number of vector elements to process at once per node. */
+#define VLIB_FRAME_SIZE 256
+#define VLIB_FRAME_ALIGN CLIB_CACHE_LINE_BYTES	/* frame alignment: one cache line */
+
+/* Calling frame (think stack frame) for a node: header plus arguments. */
+typedef struct vlib_frame_t
+{
+  /* Frame flags. */
+  u16 flags;
+
+  /* Number of scalar bytes in arguments. */
+  u8 scalar_size;
+
+  /* Number of bytes per vector argument. */
+  u8 vector_size;
+
+  /* Number of vector elements currently in frame. */
+  u16 n_vectors;
+
+  /* Scalar and vector arguments to next node (variable length). */
+  u8 arguments[0];
+} vlib_frame_t;
+
+typedef struct
+{
+  /* Frame index (~0 after vlib_next_frame_init). */
+  u32 frame_index;
+
+  /* Node runtime for this next (~0 after vlib_next_frame_init). */
+  u32 node_runtime_index;
+
+  /* Next frame flags: VLIB_FRAME_* below, which reuse VLIB_NODE_FLAG_* bits. */
+  u32 flags;
+
+  /* Reflects node frame-used flag for this next. */
+#define VLIB_FRAME_NO_FREE_AFTER_DISPATCH \
+  VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH
+
+  /* This next frame owns enqueue to node
+     corresponding to node_runtime_index. */
+#define VLIB_FRAME_OWNER (1 << 15)
+
+  /* Set when frame has been allocated for this next. */
+#define VLIB_FRAME_IS_ALLOCATED	VLIB_NODE_FLAG_IS_OUTPUT
+
+  /* Set when frame has been added to pending vector. */
+#define VLIB_FRAME_PENDING VLIB_NODE_FLAG_IS_DROP
+
+  /* Set when frame is to be freed after dispatch. */
+#define VLIB_FRAME_FREE_AFTER_DISPATCH VLIB_NODE_FLAG_IS_PUNT
+
+  /* Set when frame has traced packets. */
+#define VLIB_FRAME_TRACE VLIB_NODE_FLAG_TRACE
+
+  /* Number of vectors enqueued to this next since last overflow. */
+  u32 vectors_since_last_overflow;
+} vlib_next_frame_t;
+
+always_inline void
+vlib_next_frame_init (vlib_next_frame_t * nf)
+{
+  /* Everything zero, except the two indices which start out invalid. */
+  memset (nf, 0, sizeof (*nf));
+  nf->node_runtime_index = nf->frame_index = ~0;
+}
+
+/* A frame pending dispatch by main loop. */
+typedef struct
+{
+  /* Node runtime for this frame. */
+  u32 node_runtime_index;
+
+  /* Frame index (in the heap). */
+  u32 frame_index;
+
+  /* Start of next frames for this node. */
+  u32 next_frame_index;
+
+  /* Special value for next_frame_index when there is no next frame. */
+#define VLIB_PENDING_FRAME_NO_NEXT_FRAME ((u32) ~0)
+} vlib_pending_frame_t;
+
+typedef struct vlib_node_runtime_t
+{
+  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);	/**< cacheline mark */
+
+  vlib_node_function_t *function;	/**< Node function to call. */
+
+  vlib_error_t *errors;			/**< Vector of errors for this node. */
+
+#if __SIZEOF_POINTER__ == 4
+  u8 pad[8];
+#endif
+
+  u32 clocks_since_last_overflow;	/**< Number of clock cycles. */
+
+  u32 max_clock;			/**< Maximum clock cycle for an
+					  invocation. */
+
+  u32 max_clock_n;			/**< Number of vectors in the recorded
+					  max_clock. */
+
+  u32 calls_since_last_overflow;	/**< Number of calls. */
+
+  u32 vectors_since_last_overflow;	/**< Number of vector elements
+					  processed by this node. */
+
+  u32 next_frame_index;			/**< Start of next frames for this
+					  node. */
+
+  u32 node_index;			/**< Node index. */
+
+  u32 input_main_loops_per_call;	/**< For input nodes: decremented
+					  on each main loop iteration until
+					  it reaches zero and function is
+					  called.  Allows some input nodes to
+					  be called more than others. */
+
+  u32 main_loop_count_last_dispatch;	/**< Saved main loop counter of last
+					  dispatch of this node. */
+
+  u32 main_loop_vector_stats[2];
+
+  u16 flags;				/**< Copy of main node flags. */
+
+  u16 state;				/**< Input node state. */
+
+  u16 n_next_nodes;			/**< Number of next nodes. */
+
+  u16 cached_next_index;		/**< Next frame index that vector
+					  arguments were last enqueued to
+					  last time this node ran. Set to
+					  zero before first run of this
+					  node. */
+
+  u16 thread_index;			/**< Thread this node runs on. */
+
+  u8 runtime_data[0];			/**< Function dependent
+					  node-runtime data. This data is
+					  thread local, and it is not
+					  cloned from main thread. It needs
+					  to be initialized for each thread
+					  before it is used unless
+					  runtime_data template exists in
+					  vlib_node_t. */
+}
+vlib_node_runtime_t;
+
+#define VLIB_NODE_RUNTIME_DATA_SIZE	(sizeof (vlib_node_runtime_t) - STRUCT_OFFSET_OF (vlib_node_runtime_t, runtime_data))
+
+typedef struct
+{
+  /* Number of allocated frames for this scalar/vector size. */
+  u32 n_alloc_frames;
+
+  /* Vector of free frame indices for this scalar/vector size. */
+  u32 *free_frame_indices;
+} vlib_frame_size_t;		/* element of vlib_node_main_t.frame_sizes */
+
+typedef struct
+{
+  /* User's opaque value for event type. */
+  uword opaque;
+} vlib_process_event_type_t;	/* element of vlib_process_t.event_type_pool */
+
+typedef struct
+{
+  /* Node runtime for this process. */
+  vlib_node_runtime_t node_runtime;
+
+  /* Where to longjmp when process is done. */
+  clib_longjmp_t return_longjmp;
+
+#define VLIB_PROCESS_RETURN_LONGJMP_RETURN ((uword) ~0 - 0)
+#define VLIB_PROCESS_RETURN_LONGJMP_SUSPEND ((uword) ~0 - 1)
+
+  /* Where to longjmp to resume node after suspend. */
+  clib_longjmp_t resume_longjmp;
+#define VLIB_PROCESS_RESUME_LONGJMP_SUSPEND 0
+#define VLIB_PROCESS_RESUME_LONGJMP_RESUME  1
+
+  u16 flags;			/* VLIB_PROCESS_* bits defined below */
+#define VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK (1 << 0)
+#define VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT (1 << 1)
+  /* Set to indicate that this process has been added to resume vector. */
+#define VLIB_PROCESS_RESUME_PENDING (1 << 2)
+
+  /* Process function is currently running. */
+#define VLIB_PROCESS_IS_RUNNING (1 << 3)
+
+  /* Size of process stack, as log2 of bytes. */
+  u16 log2_n_stack_bytes;
+
+  u32 suspended_process_frame_index;
+
+  /* Number of times this process was suspended. */
+  u32 n_suspends;
+
+  /* Vectors of pending event data indexed by event type index. */
+  void **pending_event_data_by_type_index;
+
+  /* Bitmap of event type-indices with non-empty vectors. */
+  uword *non_empty_event_type_bitmap;
+
+  /* Bitmap of event type-indices which are one time events. */
+  uword *one_time_event_type_bitmap;
+
+  /* Type is opaque pointer -- typically a pointer to an event handler
+     function.  Hash table to map opaque to a type index. */
+  uword *event_type_index_by_type_opaque;
+
+  /* Pool of currently valid event types. */
+  vlib_process_event_type_t *event_type_pool;
+
+  /*
+   * When suspending, saves the clock time (10us ticks) at which the
+   * process is to be resumed.
+   */
+  u64 resume_clock_interval;
+
+  /* Handle from timer code, to cancel an unexpired timer */
+  u32 stop_timer_handle;
+
+  /* Default output function and its argument for any CLI outputs
+     within the process. */
+  vlib_cli_output_function_t *output_function;
+  uword output_function_arg;
+
+#ifdef CLIB_UNIX
+  /* Pad to a multiple of the page size so we can mprotect process stacks */
+#define PAGE_SIZE_MULTIPLE 0x1000
+#define ALIGN_ON_MULTIPLE_PAGE_BOUNDARY_FOR_MPROTECT  __attribute__ ((aligned (PAGE_SIZE_MULTIPLE)))
+#else
+#define ALIGN_ON_MULTIPLE_PAGE_BOUNDARY_FOR_MPROTECT
+#endif
+
+  /* Process stack.  Starts here and extends 2^log2_n_stack_bytes
+     bytes; stack[0] is painted with VLIB_PROCESS_STACK_MAGIC. */
+
+#define VLIB_PROCESS_STACK_MAGIC (0xdead7ead)
+  u32 stack[0] ALIGN_ON_MULTIPLE_PAGE_BOUNDARY_FOR_MPROTECT;
+} vlib_process_t __attribute__ ((aligned (CLIB_CACHE_LINE_BYTES)));
+
+#ifdef CLIB_UNIX
+  /* Compile-time check: sizeof (vlib_process_t) must equal PAGE_SIZE_MULTIPLE (array size is -1 if not). */
+typedef char
+  assert_process_stack_must_be_aligned_exactly_to_page_size_multiple[(sizeof
+								      (vlib_process_t)
+								      -
+								      PAGE_SIZE_MULTIPLE)
+								     ==
+								     0 ? 0 :
+								     -1];
+#endif
+
+typedef struct
+{
+  u32 node_index;		/* presumably node index of the waiting process -- confirm */
+
+  u32 one_time_event;		/* presumably the one-time event it waits for -- confirm */
+} vlib_one_time_waiting_process_t;
+
+typedef struct
+{
+  u16 n_data_elts;		/* element count (see n_data_bytes) */
+
+  u16 n_data_elt_bytes;		/* bytes per element (see n_data_bytes) */
+
+  /* n_data_elts * n_data_elt_bytes */
+  u32 n_data_bytes;
+
+  /* Process node & event type to be used to signal event. */
+  u32 process_node_index;
+
+  u32 event_type_index;
+
+  union				/* 48 bytes, so header + union add up to 64 */
+  {
+    u8 inline_event_data[64 - 3 * sizeof (u32) - 2 * sizeof (u16)];
+
+    /* Vector of event data used only when data does not fit inline. */
+    u8 *event_data_as_vector;
+  };
+}
+vlib_signal_timed_event_data_t;
+
+always_inline uword
+vlib_timing_wheel_data_is_timed_event (u32 d)
+{
+  return (d & 1) != 0;		/* low bit set: timed event */
+}
+
+always_inline u32
+vlib_timing_wheel_data_set_suspended_process (u32 i)
+{
+  return i << 1;		/* index in upper bits, low bit clear */
+}
+
+always_inline u32
+vlib_timing_wheel_data_set_timed_event (u32 i)
+{
+  return (i << 1) | 1;		/* index in upper bits, low bit set */
+}
+
+always_inline uword
+vlib_timing_wheel_data_get_index (u32 d)
+{
+  return d >> 1;		/* drop the type bit, keep the index */
+}
+
+typedef struct
+{
+  /* All nodes, indexed by node index. */
+  vlib_node_t **nodes;
+
+  /* Node index hashed by node name. */
+  uword *node_by_name;
+
+  u32 flags;
+#define VLIB_NODE_MAIN_RUNTIME_STARTED (1 << 0)
+
+  /* Node runtimes segregated by type for cache locality.
+     Does not apply to nodes of type VLIB_NODE_TYPE_PROCESS (see processes). */
+  vlib_node_runtime_t *nodes_by_type[VLIB_N_NODE_TYPE];
+
+  /* Node runtime indices for input nodes with pending interrupts. */
+  u32 *pending_interrupt_node_runtime_indices;
+  clib_spinlock_t pending_interrupt_lock;
+
+  /* Input nodes are switched from/to interrupt to/from polling mode
+     when average vector length goes above/below polling/interrupt
+     thresholds. */
+  u32 polling_threshold_vector_length;
+  u32 interrupt_threshold_vector_length;
+
+  /* Vector of next frames. */
+  vlib_next_frame_t *next_frames;
+
+  /* Vector of internal node's frames waiting to be called. */
+  vlib_pending_frame_t *pending_frames;
+
+  /* Timing wheel for scheduling time-based node dispatch. */
+  void *timing_wheel;
+
+  vlib_signal_timed_event_data_t *signal_timed_event_data_pool;
+
+  /* Opaque data vector added via timing_wheel_advance. */
+  u32 *data_from_advancing_timing_wheel;
+
+  /* CPU time of next process to be ready on timing wheel. */
+  f64 time_next_process_ready;
+
+  /* Vector of process nodes.
+     One for each node of type VLIB_NODE_TYPE_PROCESS. */
+  vlib_process_t **processes;
+
+  /* Current running process or ~0 if no process running. */
+  u32 current_process_index;
+
+  /* Pool of pending process frames. */
+  vlib_pending_frame_t *suspended_process_frames;
+
+  /* Vector of event data vectors pending recycle. */
+  void **recycled_event_data_vectors;
+
+  /* Current counts of input nodes in each state. */
+  u32 input_node_counts_by_state[VLIB_N_NODE_STATE];
+
+  /* Hash of (scalar_size,vector_size) to frame_sizes index. */
+  uword *frame_size_hash;
+
+  /* Per-size frame allocation information. */
+  vlib_frame_size_t *frame_sizes;
+
+  /* Time of last node runtime stats clear. */
+  f64 time_last_runtime_stats_clear;
+
+  /* Node registrations added by VLIB_REGISTER_NODE constructors. */
+  vlib_node_registration_t *node_registrations;
+} vlib_node_main_t;
+
+
+#define FRAME_QUEUE_MAX_NELTS 32	/* sizes the per-element arrays below */
+typedef struct
+{
+  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+  u64 head;
+  u64 head_hint;
+  u64 tail;
+  u32 n_in_use;
+  u32 nelts;
+  u32 written;
+  u32 threshold;
+  i32 n_vectors[FRAME_QUEUE_MAX_NELTS];	/* presumably one entry per queue element -- confirm */
+} frame_queue_trace_t;
+
+typedef struct
+{
+  u64 count[FRAME_QUEUE_MAX_NELTS];	/* presumably indexed by element count -- confirm */
+} frame_queue_nelt_counter_t;
+
+#endif /* included_vlib_node_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/node_cli.c b/src/vlib/node_cli.c
new file mode 100644
index 00000000..62ab2e64
--- /dev/null
+++ b/src/vlib/node_cli.c
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * node_cli.c: node CLI
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/threads.h>
+
+/* Comparator for vec_sort_with_function over a vector of vlib_node_t
+   pointers: each argument points at one vector element, and nodes are
+   ordered by comparing their name vectors with vec_cmp. */
+static int
+node_cmp (void *a1, void *a2)
+{
+  vlib_node_t *lhs = *(vlib_node_t **) a1;
+  vlib_node_t *rhs = *(vlib_node_t **) a2;
+
+  return vec_cmp (lhs->name, rhs->name);
+}
+
+/* CLI handler for "show vlib graph".
+
+   Always prints the column headings first.  If the input names a node, only
+   that node's graph row is printed; otherwise every node of vm is printed,
+   sorted by name.  Always returns 0 (no error). */
+static clib_error_t *
+show_node_graph (vlib_main_t * vm,
+		 unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  vlib_node_main_t *node_main = &vm->node_main;
+  vlib_node_t **sorted;
+  u32 requested_index;
+  uword k;
+
+  /* A null node argument makes the formatter emit the headings. */
+  vlib_cli_output (vm, "%U\n", format_vlib_node_graph, node_main, 0);
+
+  if (unformat (input, "%U", unformat_vlib_node, vm, &requested_index))
+    {
+      vlib_cli_output (vm, "%U\n", format_vlib_node_graph, node_main,
+		       vlib_get_node (vm, requested_index));
+      return 0;
+    }
+
+  /* Sort a private copy so the live node vector keeps its order. */
+  sorted = vec_dup (node_main->nodes);
+  vec_sort_with_function (sorted, node_cmp);
+
+  for (k = 0; k < vec_len (sorted); k++)
+    vlib_cli_output (vm, "%U\n\n", format_vlib_node_graph, node_main,
+		     sorted[k]);
+
+  vec_free (sorted);
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+/* Registers the "show vlib graph" CLI path with show_node_graph as its
+   handler. */
+VLIB_CLI_COMMAND (show_node_graph_command, static) = {
+  .path = "show vlib graph",
+  .short_help = "Show packet processing node graph",
+  .function = show_node_graph,
+};
+/* *INDENT-ON* */
+
+/* Format one row of the runtime statistics table.
+
+   va_list arguments, in order:
+     vlib_main_t *vm - main used to find the process for process nodes
+     vlib_node_t *n  - node to format; null emits the column headings
+     int max         - non-zero selects the max-clock column layout
+
+   Call, vector, suspend and clock counts are the difference between
+   stats_total and stats_last_clear; the max-clock columns are read from
+   stats_total as-is.  Returns the extended format vector. */
+static u8 *
+format_vlib_node_stats (u8 * s, va_list * va)
+{
+  vlib_main_t *vm = va_arg (*va, vlib_main_t *);
+  vlib_node_t *n = va_arg (*va, vlib_node_t *);
+  int max = va_arg (*va, int);
+  f64 v;
+  char *state;
+  u8 *misc_info = 0;
+  u64 c, p, l, d;
+  f64 x;
+  f64 maxc, maxcn;
+  u32 maxn;
+  uword indent;
+
+  if (!n)
+    {
+      if (max)
+	return format (s,
+		       "%=30s%=17s%=16s%=16s%=16s%=16s",
+		       "Name", "Max Node Clocks", "Vectors at Max",
+		       "Max Clocks", "Avg Clocks", "Avg Vectors/Call");
+      else
+	return format (s,
+		       "%=30s%=12s%=16s%=16s%=16s%=16s%=16s",
+		       "Name", "State", "Calls", "Vectors", "Suspends",
+		       "Clocks", "Vectors/Call");
+    }
+
+  /* Remember the starting column so a second line can be aligned. */
+  indent = format_get_indent (s);
+
+  l = n->stats_total.clocks - n->stats_last_clear.clocks;
+  c = n->stats_total.calls - n->stats_last_clear.calls;
+  p = n->stats_total.vectors - n->stats_last_clear.vectors;
+  d = n->stats_total.suspends - n->stats_last_clear.suspends;
+  maxc = (f64) n->stats_total.max_clock;
+  maxn = n->stats_total.max_clock_n;
+  if (n->stats_total.max_clock_n)
+    maxcn = (f64) n->stats_total.max_clock / (f64) maxn;
+  else
+    maxcn = 0.0;
+
+  /* Clocks per packet, per call or per suspend. */
+  x = 0;
+  if (p > 0)
+    x = (f64) l / (f64) p;
+  else if (c > 0)
+    x = (f64) l / (f64) c;
+  else if (d > 0)
+    x = (f64) l / (f64) d;
+
+  /* Average vectors per call. */
+  if (c > 0)
+    v = (double) p / (double) c;
+  else
+    v = 0;
+
+  state = "active";
+  if (n->type == VLIB_NODE_TYPE_PROCESS)
+    {
+      /* Named proc so it does not shadow the vector count p above. */
+      vlib_process_t *proc = vlib_get_process_from_node (vm, n);
+
+      /* Show processes with events pending.  This helps spot bugs where events are not
+         being handled. */
+      if (!clib_bitmap_is_zero (proc->non_empty_event_type_bitmap))
+	misc_info = format (misc_info, "events pending, ");
+
+      switch (proc->flags & (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK
+			     | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT))
+	{
+	default:
+	  /* Neither wait flag set: "done" unless still marked running. */
+	  if (!(proc->flags & VLIB_PROCESS_IS_RUNNING))
+	    state = "done";
+	  break;
+
+	case VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK:
+	  state = "time wait";
+	  break;
+
+	case VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT:
+	  state = "event wait";
+	  break;
+
+	case (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK):
+	  state =
+	    "any wait";
+	  break;
+	}
+    }
+  else if (n->type != VLIB_NODE_TYPE_INTERNAL)
+    {
+      state = "polling";
+      if (n->state == VLIB_NODE_STATE_DISABLED)
+	state = "disabled";
+      else if (n->state == VLIB_NODE_STATE_INTERRUPT)
+	state = "interrupt wait";
+    }
+
+  if (max)
+    s = format (s, "%-30v%=17.2e%=16d%=16.2e%=16.2e%=16.2e",
+		n->name, maxc, maxn, maxcn, x, v);
+  else
+    s = format (s, "%-30v%=12s%16Ld%16Ld%16Ld%16.2e%16.2f", n->name, state,
+		c, p, d, x, v);
+
+  /* Extra notes go on their own line, indented under the row. */
+  if (misc_info)
+    {
+      s = format (s, "\n%U%v", format_white_space, indent + 4, misc_info);
+      vec_free (misc_info);
+    }
+
+  return s;
+}
+
+/* CLI handler for "show runtime".
+
+   If the input names a node, that node's stats are synced on vm and printed
+   as a heading row plus one row.  Otherwise, for every non-null entry of
+   vlib_mains, node stats are synced and snapshotted under the worker thread
+   barrier, then a summary line and a per-node table are printed for each.
+
+   Optional keywords in the all-nodes case:
+     brief / b   - only list nodes with calls or suspends since last clear
+                   (this is already the default)
+     verbose / v - list every node
+     max / m     - use the max-clock table layout
+
+   Always returns 0 (no error). */
+static clib_error_t *
+show_node_runtime (vlib_main_t * vm,
+		   unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_node_t *n;
+  f64 time_now;
+  u32 node_index;
+  vlib_node_t ***node_dups = 0;
+  f64 *vectors_per_main_loop = 0;
+  f64 *last_vector_length_per_node = 0;
+
+  time_now = vlib_time_now (vm);
+
+  if (unformat (input, "%U", unformat_vlib_node, vm, &node_index))
+    {
+      /* Single node: only vm's copy of the node is synced and shown, and no
+         barrier is taken on this path. */
+      n = vlib_get_node (vm, node_index);
+      vlib_node_sync_stats (vm, n);
+      vlib_cli_output (vm, "%U\n", format_vlib_node_stats, vm, 0, 0);
+      vlib_cli_output (vm, "%U\n", format_vlib_node_stats, vm, n, 0);
+    }
+  else
+    {
+      vlib_node_t **nodes;
+      uword i, j;
+      f64 dt;
+      u64 n_input, n_output, n_drop, n_punt;
+      u64 n_internal_vectors, n_internal_calls;
+      u64 n_clocks, l, v, c, d;
+      int brief = 1;
+      int max = 0;
+      vlib_main_t **stat_vms = 0, *stat_vm;
+
+      /* Suppress nodes with zero calls since last clear */
+      if (unformat (input, "brief") || unformat (input, "b"))
+	brief = 1;
+      if (unformat (input, "verbose") || unformat (input, "v"))
+	brief = 0;
+      if (unformat (input, "max") || unformat (input, "m"))
+	max = 1;
+
+      /* Collect the non-null mains; stat_vms is indexed by j below. */
+      for (i = 0; i < vec_len (vlib_mains); i++)
+	{
+	  stat_vm = vlib_mains[i];
+	  if (stat_vm)
+	    vec_add1 (stat_vms, stat_vm);
+	}
+
+      /*
+       * Barrier sync across stats scraping.
+       * Otherwise, the counts will be grossly inaccurate.
+       */
+      vlib_worker_thread_barrier_sync (vm);
+
+      for (j = 0; j < vec_len (stat_vms); j++)
+	{
+	  stat_vm = stat_vms[j];
+	  nm = &stat_vm->node_main;
+
+	  for (i = 0; i < vec_len (nm->nodes); i++)
+	    {
+	      n = nm->nodes[i];
+	      vlib_node_sync_stats (stat_vm, n);
+	    }
+
+	  /* Copies the vector of node pointers only; the node structures
+	     themselves are still shared with the owning main. */
+	  nodes = vec_dup (nm->nodes);
+
+	  vec_add1 (node_dups, nodes);
+	  vec_add1 (vectors_per_main_loop,
+		    vlib_last_vectors_per_main_loop_as_f64 (stat_vm));
+	  vec_add1 (last_vector_length_per_node,
+		    vlib_last_vector_length_per_node (stat_vm));
+	}
+      vlib_worker_thread_barrier_release (vm);
+
+
+      /* Formatting and output happen after the barrier is released. */
+      for (j = 0; j < vec_len (stat_vms); j++)
+	{
+	  stat_vm = stat_vms[j];
+	  nodes = node_dups[j];
+
+	  vec_sort_with_function (nodes, node_cmp);
+
+	  n_input = n_output = n_drop = n_punt = n_clocks = 0;
+	  n_internal_vectors = n_internal_calls = 0;
+	  for (i = 0; i < vec_len (nodes); i++)
+	    {
+	      n = nodes[i];
+
+	      l = n->stats_total.clocks - n->stats_last_clear.clocks;
+	      n_clocks += l;
+
+	      v = n->stats_total.vectors - n->stats_last_clear.vectors;
+	      c = n->stats_total.calls - n->stats_last_clear.calls;
+
+	      switch (n->type)
+		{
+		default:
+		  continue;
+
+		case VLIB_NODE_TYPE_INTERNAL:
+		  n_output += (n->flags & VLIB_NODE_FLAG_IS_OUTPUT) ? v : 0;
+		  n_drop += (n->flags & VLIB_NODE_FLAG_IS_DROP) ? v : 0;
+		  n_punt += (n->flags & VLIB_NODE_FLAG_IS_PUNT) ? v : 0;
+		  /* Output nodes are left out of the vectors/node average. */
+		  if (!(n->flags & VLIB_NODE_FLAG_IS_OUTPUT))
+		    {
+		      n_internal_vectors += v;
+		      n_internal_calls += c;
+		    }
+		  /* Handoff nodes count towards this thread's input. */
+		  if (n->flags & VLIB_NODE_FLAG_IS_HANDOFF)
+		    n_input += v;
+		  break;
+
+		case VLIB_NODE_TYPE_INPUT:
+		  n_input += v;
+		  break;
+		}
+	    }
+
+	  if (vec_len (vlib_mains) > 1)
+	    {
+	      /* NOTE(review): j indexes stat_vms, which omits null
+	         vlib_mains entries; this assumes there are none so that j
+	         also indexes vlib_worker_threads correctly - confirm. */
+	      vlib_worker_thread_t *w = vlib_worker_threads + j;
+	      if (j > 0)
+		vlib_cli_output (vm, "---------------");
+
+	      if (w->lcore_id > -1)
+		vlib_cli_output (vm, "Thread %d %s (lcore %u)", j, w->name,
+				 w->lcore_id);
+	      else
+		vlib_cli_output (vm, "Thread %d %s", j, w->name);
+	    }
+
+	  /* NOTE(review): nm still points at the node_main of the last
+	     entry in stat_vms (set in the collection loop above), not at
+	     stat_vm's, so every thread uses that one clear time - confirm
+	     this is intended. */
+	  dt = time_now - nm->time_last_runtime_stats_clear;
+	  vlib_cli_output
+	    (vm,
+	     "Time %.1f, average vectors/node %.2f, last %d main loops %.2f per node %.2f"
+	     "\n  vector rates in %.4e, out %.4e, drop %.4e, punt %.4e",
+	     dt,
+	     (n_internal_calls > 0
+	      ? (f64) n_internal_vectors / (f64) n_internal_calls
+	      : 0),
+	     1 << VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE,
+	     vectors_per_main_loop[j],
+	     last_vector_length_per_node[j],
+	     (f64) n_input / dt,
+	     (f64) n_output / dt, (f64) n_drop / dt, (f64) n_punt / dt);
+
+	  /* Heading row, then one row per node that passes the brief filter. */
+	  vlib_cli_output (vm, "%U", format_vlib_node_stats, stat_vm, 0, max);
+	  for (i = 0; i < vec_len (nodes); i++)
+	    {
+	      c =
+		nodes[i]->stats_total.calls -
+		nodes[i]->stats_last_clear.calls;
+	      d =
+		nodes[i]->stats_total.suspends -
+		nodes[i]->stats_last_clear.suspends;
+	      if (c || d || !brief)
+		{
+		  vlib_cli_output (vm, "%U", format_vlib_node_stats, stat_vm,
+				   nodes[i], max);
+		}
+	    }
+	  /* Frees this thread's duplicated pointer vector (node_dups[j]). */
+	  vec_free (nodes);
+	}
+      vec_free (stat_vms);
+      vec_free (node_dups);
+      vec_free (vectors_per_main_loop);
+      vec_free (last_vector_length_per_node);
+    }
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+/* Registers the "show runtime" CLI path with show_node_runtime as its
+   handler.  The handler takes the worker thread barrier itself around stats
+   collection.
+   NOTE(review): is_mp_safe presumably tells the CLI layer not to take the
+   barrier on the handler's behalf - confirm against the CLI dispatcher. */
+VLIB_CLI_COMMAND (show_node_runtime_command, static) = {
+  .path = "show runtime",
+  .short_help = "Show packet processing runtime",
+  .function = show_node_runtime,
+  .is_mp_safe = 1,
+};
+/* *INDENT-ON* */
+
+/* CLI handler for "clear runtime".
+
+   Under the worker thread barrier, for every non-null entry of vlib_mains:
+   syncs each node's stats, records the totals as the new "last clear"
+   baseline, zeroes the node runtime's max_clock, and stamps the node main
+   with the current time of vm.  Always returns 0 (no error). */
+static clib_error_t *
+clear_node_runtime (vlib_main_t * vm,
+		    unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  vlib_main_t **mains = 0;
+  int mi, ni;
+
+  /* Gather the mains that actually exist. */
+  for (mi = 0; mi < vec_len (vlib_mains); mi++)
+    if (vlib_mains[mi])
+      vec_add1 (mains, vlib_mains[mi]);
+
+  vlib_worker_thread_barrier_sync (vm);
+
+  for (mi = 0; mi < vec_len (mains); mi++)
+    {
+      vlib_main_t *this_vm = mains[mi];
+      vlib_node_main_t *this_nm = &this_vm->node_main;
+
+      for (ni = 0; ni < vec_len (this_nm->nodes); ni++)
+	{
+	  vlib_node_t *node = this_nm->nodes[ni];
+
+	  /* Bring totals up to date, then make them the new baseline. */
+	  vlib_node_sync_stats (this_vm, node);
+	  node->stats_last_clear = node->stats_total;
+
+	  vlib_node_get_runtime (this_vm, node->index)->max_clock = 0;
+	}
+      /* Note: input/output rates computed using vlib_global_main */
+      this_nm->time_last_runtime_stats_clear = vlib_time_now (vm);
+    }
+
+  vlib_worker_thread_barrier_release (vm);
+
+  vec_free (mains);
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+/* Registers the "clear runtime" CLI path with clear_node_runtime as its
+   handler. */
+VLIB_CLI_COMMAND (clear_node_runtime_command, static) = {
+  .path = "clear runtime",
+  .short_help = "Clear packet processing runtime statistics",
+  .function = clear_node_runtime,
+};
+/* *INDENT-ON* */
+
+/* Dummy function to get us linked in.  It has an empty body; it exists only
+   as an externally visible symbol that other code can reference. */
+void
+vlib_node_cli_reference (void)
+{
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/node_format.c b/src/vlib/node_format.c
new file mode 100644
index 00000000..e9dde40f
--- /dev/null
+++ b/src/vlib/node_format.c
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * node_format.c: node formatting
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+
+/* Format a node's place in the graph as a three-column table row group:
+   node name, next nodes (with arc slot), previous nodes.
+
+   va_list arguments, in order:
+     vlib_node_main_t *nm - node main used to resolve node indices to names
+     vlib_node_t *n       - node to format; null emits the column headings
+
+   A node with several next/previous nodes produces several lines; lines
+   after the first are indented to line up under the Next column.  Returns
+   the extended format vector. */
+u8 *
+format_vlib_node_graph (u8 * s, va_list * va)
+{
+  vlib_node_main_t *nm = va_arg (*va, vlib_node_main_t *);
+  vlib_node_t *n = va_arg (*va, vlib_node_t *);
+  int i, j;
+  uword indent;
+  /* One output line: a next-node entry and/or a previous-node entry. */
+  typedef struct
+  {
+    u32 next_node;
+    u32 next_slot;
+    u32 prev_node;
+  } tmp_t;
+  tmp_t *tmps = 0;
+  /* ~0 marks a column with nothing to print on that line. */
+  tmp_t empty = {.next_node = ~0,.prev_node = ~0 };
+
+  if (!n)
+    return format (s, "%=26s%=26s%=26s", "Name", "Next", "Previous");
+
+  s = format (s, "%-26v", n->name);
+
+  /* Column where the Next column starts; used for continuation lines. */
+  indent = format_get_indent (s);
+
+  /* Fill the Next column: one line per valid next arc, skipping invalid
+     slots.  j counts lines used so far. */
+  for (i = j = 0; i < vec_len (n->next_nodes); i++)
+    {
+      if (n->next_nodes[i] == VLIB_INVALID_NODE_INDEX)
+	continue;
+      vec_validate_init_empty (tmps, j, empty);
+      tmps[j].next_node = n->next_nodes[i];
+      tmps[j].next_slot = i;
+      j++;
+    }
+
+  /* Fill the Previous column starting again from line 0, growing tmps with
+     empty lines if there are more previous nodes than next nodes. */
+  j = 0;
+  /* *INDENT-OFF* */
+  clib_bitmap_foreach (i, n->prev_node_bitmap, ({
+	vec_validate_init_empty (tmps, j, empty);
+	tmps[j].prev_node = i;
+	j++;
+      }));
+  /* *INDENT-ON* */
+
+  for (i = 0; i < vec_len (tmps); i++)
+    {
+      if (i > 0)
+	s = format (s, "\n%U", format_white_space, indent);
+
+      if (tmps[i].next_node != ~0)
+	{
+	  vlib_node_t *x;
+	  u8 *t = 0;
+
+	  /* Build "name [slot]" first so it can be centred as one field. */
+	  x = vec_elt (nm->nodes, tmps[i].next_node);
+	  t = format (t, "%v [%d]", x->name, tmps[i].next_slot);
+	  s = format (s, "%=26v", t);
+	  vec_free (t);
+	}
+      else
+	/* Pad the Next column so the Previous column stays aligned. */
+	s = format (s, "%26s", "");
+
+      if (tmps[i].prev_node != ~0)
+	{
+	  vlib_node_t *x;
+	  x = vec_elt (nm->nodes, tmps[i].prev_node);
+	  s = format (s, "%=26v", x->name);
+	}
+    }
+
+  vec_free (tmps);
+
+  return s;
+}
+
+/* Format "<node name> -> <next node name>".
+
+   va_list arguments, in order: vlib_main_t *vm, vlib_node_t *n, and the u32
+   arc index into n->next_nodes that selects the next node. */
+u8 *
+format_vlib_node_and_next (u8 * s, va_list * va)
+{
+  vlib_main_t *vm = va_arg (*va, vlib_main_t *);
+  vlib_node_t *n = va_arg (*va, vlib_node_t *);
+  u32 next_index = va_arg (*va, u32);
+  u32 target_node_index = vec_elt (n->next_nodes, next_index);
+  vlib_node_t *target = vlib_get_node (vm, target_node_index);
+
+  return format (s, "%v -> %v", n->name, target->name);
+}
+
+/* Format the name of the node with the given index.
+
+   va_list arguments, in order: vlib_main_t *vm, u32 node index. */
+u8 *
+format_vlib_node_name (u8 * s, va_list * va)
+{
+  vlib_main_t *vm = va_arg (*va, vlib_main_t *);
+  u32 node_index = va_arg (*va, u32);
+
+  return format (s, "%v", vlib_get_node (vm, node_index)->name);
+}
+
+/* Format the name of the node reached from a node over a given arc.
+
+   va_list arguments, in order: vlib_main_t *vm, u32 node index, u32 next
+   (arc) index. */
+u8 *
+format_vlib_next_node_name (u8 * s, va_list * va)
+{
+  vlib_main_t *vm = va_arg (*va, vlib_main_t *);
+  u32 node_index = va_arg (*va, u32);
+  u32 next_index = va_arg (*va, u32);
+
+  return format (s, "%v",
+		 vlib_get_next_node (vm, node_index, next_index)->name);
+}
+
+/* Parse a node name from the input and store the matching node index.
+
+   va_list arguments, in order: vlib_main_t *vm, u32 *result.  The lookup
+   goes through vm's node_by_name hash; the return value is whatever
+   unformat_user reports for the match. */
+uword
+unformat_vlib_node (unformat_input_t * input, va_list * args)
+{
+  vlib_main_t *vm = va_arg (*args, vlib_main_t *);
+  u32 *result = va_arg (*args, u32 *);
+  vlib_node_main_t *nm = &vm->node_main;
+
+  return unformat_user (input, unformat_hash_vec_string,
+			nm->node_by_name, result);
+}
+
+/* Format a time value as a 12-wide field with four decimals.
+
+   va_list arguments, in order: vlib_main_t *vm (consumed but unused), f64
+   time value. */
+u8 *
+format_vlib_time (u8 * s, va_list * va)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+  f64 seconds = va_arg (*va, f64);
+
+  s = format (s, "%12.4f", seconds);
+  return s;
+}
+
+/* Format a CPU clock reading as a time relative to vm's clock start.
+
+   va_list arguments, in order: vlib_main_t *vm, u64 CPU time.  The reading
+   is offset by vm->clib_time.init_cpu_time, scaled by seconds_per_clock, and
+   handed to format_vlib_time. */
+u8 *
+format_vlib_cpu_time (u8 * s, va_list * va)
+{
+  vlib_main_t *vm = va_arg (*va, vlib_main_t *);
+  u64 cpu_time = va_arg (*va, u64);
+  f64 elapsed = (cpu_time - vm->clib_time.init_cpu_time)
+    * vm->clib_time.seconds_per_clock;
+
+  return format (s, "%U", format_vlib_time, vm, elapsed);
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h
new file mode 100644
index 00000000..0734476c
--- /dev/null
+++ b/src/vlib/node_funcs.h
@@ -0,0 +1,1175 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * node_funcs.h: processing nodes global functions/inlines
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/** \file
+    vlib node functions
+*/
+
+
+#ifndef included_vlib_node_funcs_h
+#define included_vlib_node_funcs_h
+
+#include <vppinfra/fifo.h>
+#include <vppinfra/tw_timer_1t_3w_1024sl_ov.h>
+
+/** \brief Look up a vlib node by its index.
+ @warning ASSERTs (via vec_elt) if @c i is out of range.
+ @param vm vlib_main_t pointer, varies by thread
+ @param i node index
+ @return pointer to the vlib_node_t stored at index @c i
+*/
+
+always_inline vlib_node_t *
+vlib_get_node (vlib_main_t * vm, u32 i)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+
+  return vec_elt (nm->nodes, i);
+}
+
+/** \brief Follow a graph arc from a node and return the node it leads to.
+ @param vm vlib_main_t pointer, varies by thread
+ @param node_index index of the node the arc starts from
+ @param next_index graph arc (next) index; ASSERTed to be in range
+ @return pointer to the vlib_node_t at the far end of the arc
+*/
+
+always_inline vlib_node_t *
+vlib_get_next_node (vlib_main_t * vm, u32 node_index, u32 next_index)
+{
+  vlib_node_t *from = vlib_get_node (vm, node_index);
+
+  ASSERT (next_index < vec_len (from->next_nodes));
+  return vlib_get_node (vm, from->next_nodes[next_index]);
+}
+
+/** \brief Find the runtime structure that belongs to a node.
+
+ Process nodes keep their runtime inside their vlib_process_t; every other
+ node type keeps it in the per-type runtime vector.
+
+ @param vm vlib_main_t pointer, varies by thread
+ @param node_index index of node
+ @return pointer to the node's vlib_node_runtime_t
+*/
+
+always_inline vlib_node_runtime_t *
+vlib_node_get_runtime (vlib_main_t * vm, u32 node_index)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_node_t *node = vec_elt (nm->nodes, node_index);
+
+  if (node->type == VLIB_NODE_TYPE_PROCESS)
+    {
+      vlib_process_t *proc = vec_elt (nm->processes, node->runtime_index);
+      return &proc->node_runtime;
+    }
+
+  return vec_elt_at_index (nm->nodes_by_type[node->type],
+			   node->runtime_index);
+}
+
+/** \brief Return the private data area of a node's runtime.
+ @param vm vlib_main_t pointer, varies by thread
+ @param node_index index of the node
+ @return the runtime_data member of the node's vlib_node_runtime_t
+*/
+
+always_inline void *
+vlib_node_get_runtime_data (vlib_main_t * vm, u32 node_index)
+{
+  return vlib_node_get_runtime (vm, node_index)->runtime_data;
+}
+
+/** \brief Set node runtime private data.
+
+ The bytes are stored twice: as a fresh vector on the node
+ (@c n->runtime_data, replacing any previous one) and copied into the
+ node runtime's @c runtime_data area.
+
+ @param vm vlib_main_t pointer, varies by thread
+ @param node_index index of the node
+ @param runtime_data arbitrary runtime private data
+ @param n_runtime_data_bytes size of runtime private data
+*/
+
+always_inline void
+vlib_node_set_runtime_data (vlib_main_t * vm, u32 node_index,
+			    void *runtime_data, u32 n_runtime_data_bytes)
+{
+  vlib_node_t *n = vlib_get_node (vm, node_index);
+  vlib_node_runtime_t *r = vlib_node_get_runtime (vm, node_index);
+
+  /* Replace the node's saved copy of the data. */
+  n->runtime_data_bytes = n_runtime_data_bytes;
+  vec_free (n->runtime_data);
+  vec_add (n->runtime_data, runtime_data, n_runtime_data_bytes);
+
+  /* The data must fit in the space from runtime_data to the end of the
+     runtime structure. */
+  ASSERT (vec_len (n->runtime_data) <= sizeof (vlib_node_runtime_t) -
+	  STRUCT_OFFSET_OF (vlib_node_runtime_t, runtime_data));
+
+  if (vec_len (n->runtime_data) > 0)
+    clib_memcpy (r->runtime_data, n->runtime_data, vec_len (n->runtime_data));
+}
+
+/** \brief Change a node's dispatch state.
+
+ The new state is written to both the node and its runtime.  For process
+ nodes the resume-pending and suspended-waiting flags are cleared as well;
+ for input nodes the per-state input node counts are adjusted.
+
+ @param vm vlib_main_t pointer, varies by thread
+ @param node_index index of the node
+ @param new_state new state for node, see vlib_node_state_t
+*/
+always_inline void
+vlib_node_set_state (vlib_main_t * vm, u32 node_index,
+		     vlib_node_state_t new_state)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_node_t *node = vec_elt (nm->nodes, node_index);
+  vlib_node_runtime_t *rt;
+
+  if (node->type != VLIB_NODE_TYPE_PROCESS)
+    rt = vec_elt_at_index (nm->nodes_by_type[node->type],
+			   node->runtime_index);
+  else
+    {
+      vlib_process_t *proc = vec_elt (nm->processes, node->runtime_index);
+
+      rt = &proc->node_runtime;
+
+      /* When disabling make sure flags are cleared. */
+      proc->flags &= ~(VLIB_PROCESS_RESUME_PENDING
+		       | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK
+		       | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT);
+    }
+
+  ASSERT (new_state < VLIB_N_NODE_STATE);
+
+  /* Move this input node from its old state's count to the new one's. */
+  if (node->type == VLIB_NODE_TYPE_INPUT)
+    {
+      ASSERT (nm->input_node_counts_by_state[node->state] > 0);
+      nm->input_node_counts_by_state[node->state] -= 1;
+      nm->input_node_counts_by_state[new_state] += 1;
+    }
+
+  node->state = new_state;
+  rt->state = new_state;
+}
+
+/** \brief Read a node's dispatch state.
+ @param vm vlib_main_t pointer, varies by thread
+ @param node_index index of the node
+ @return the state stored on the node, see vlib_node_state_t
+*/
+always_inline vlib_node_state_t
+vlib_node_get_state (vlib_main_t * vm, u32 node_index)
+{
+  return vlib_get_node (vm, node_index)->state;
+}
+
+/* Queue an input node's runtime index on vm's list of pending interrupts.
+   The node must be of type VLIB_NODE_TYPE_INPUT (ASSERTed).  The append is
+   bracketed by pending_interrupt_lock, which is only taken if the lock has
+   been initialised. */
+always_inline void
+vlib_node_set_interrupt_pending (vlib_main_t * vm, u32 node_index)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_node_t *n = vec_elt (nm->nodes, node_index);
+  ASSERT (n->type == VLIB_NODE_TYPE_INPUT);
+  clib_spinlock_lock_if_init (&nm->pending_interrupt_lock);
+  vec_add1 (nm->pending_interrupt_node_runtime_indices, n->runtime_index);
+  clib_spinlock_unlock_if_init (&nm->pending_interrupt_lock);
+}
+
+/* Return the vlib_process_t behind a process node.  The node must be of
+   type VLIB_NODE_TYPE_PROCESS (ASSERTed); its runtime_index selects the
+   entry in vm's process vector. */
+always_inline vlib_process_t *
+vlib_get_process_from_node (vlib_main_t * vm, vlib_node_t * node)
+{
+  ASSERT (node->type == VLIB_NODE_TYPE_PROCESS);
+  return vec_elt (vm->node_main.processes, node->runtime_index);
+}
+
+/* Turn a frame handle into a frame pointer without validating it: the
+   handle counts VLIB_FRAME_ALIGN-sized units from vm->heap_base. */
+always_inline vlib_frame_t *
+vlib_get_frame_no_check (vlib_main_t * vm, uword frame_index)
+{
+  return vm->heap_base + (frame_index * VLIB_FRAME_ALIGN);
+}
+
+/* Turn a frame pointer into its handle: the byte distance from
+   vm->heap_base divided by VLIB_FRAME_ALIGN.  ASSERTs that the pointer is
+   VLIB_FRAME_ALIGN-aligned and that the handle fits in 32 bits; does not
+   check that the frame is allocated. */
+always_inline u32
+vlib_frame_index_no_check (vlib_main_t * vm, vlib_frame_t * f)
+{
+  uword byte_offset;
+
+  ASSERT (((uword) f & (VLIB_FRAME_ALIGN - 1)) == 0);
+
+  byte_offset = ((u8 *) f - (u8 *) vm->heap_base);
+  ASSERT ((byte_offset / VLIB_FRAME_ALIGN) <= 0xFFFFFFFFULL);
+
+  return byte_offset / VLIB_FRAME_ALIGN;
+}
+
+/* Turn a frame handle into a frame pointer and ASSERT that the frame is
+   flagged as allocated. */
+always_inline vlib_frame_t *
+vlib_get_frame (vlib_main_t * vm, uword frame_index)
+{
+  vlib_frame_t *frame = vlib_get_frame_no_check (vm, frame_index);
+
+  ASSERT (frame->flags & VLIB_FRAME_IS_ALLOCATED);
+  return frame;
+}
+
+/* Turn a frame pointer into its handle and ASSERT that the handle maps
+   back to the same, allocated frame. */
+always_inline u32
+vlib_frame_index (vlib_main_t * vm, vlib_frame_t * f)
+{
+  uword handle = vlib_frame_index_no_check (vm, f);
+
+  ASSERT (vlib_get_frame (vm, handle) == f);
+  return handle;
+}
+
+/* Vector arguments start on a multiple of this many bytes (16). */
+#define VLIB_FRAME_VECTOR_ALIGN (1 << 4)
+
+/* Byte offset from the start of a frame to its vector arguments: the frame
+   header plus scalar_size bytes of scalar data, rounded up to
+   VLIB_FRAME_VECTOR_ALIGN. */
+always_inline u32
+vlib_frame_vector_byte_offset (u32 scalar_size)
+{
+  uword unaligned = sizeof (vlib_frame_t) + scalar_size;
+
+  return round_pow2 (unaligned, VLIB_FRAME_VECTOR_ALIGN);
+}
+
+/** \brief Locate the vector data inside a frame.
+ @param f vlib_frame_t pointer
+ @return pointer to the first vector element, which sits
+         vlib_frame_vector_byte_offset (f->scalar_size) bytes into the frame
+*/
+always_inline void *
+vlib_frame_vector_args (vlib_frame_t * f)
+{
+  u32 offset = vlib_frame_vector_byte_offset (f->scalar_size);
+
+  return (void *) f + offset;
+}
+
+/** \brief Locate the scalar data inside a frame.
+
+ @warning This is almost certainly not the function you wish to call.
+ See @ref vlib_frame_vector_args instead.
+
+ The scalar data occupies the @c f->scalar_size bytes that end where the
+ vector data begins.
+
+ @param f vlib_frame_t pointer
+
+ @return pointer to the node's scalar data
+
+ @sa vlib_frame_vector_args
+*/
+always_inline void *
+vlib_frame_args (vlib_frame_t * f)
+{
+  void *vector_start = vlib_frame_vector_args (f);
+
+  return vector_start - f->scalar_size;
+}
+
+/* Return the next-frame slot for arc next_index of node runtime n.  Slots
+   for a runtime are laid out consecutively in vm's next_frames vector
+   starting at n->next_frame_index.  next_index is ASSERTed to be in range;
+   in debug builds the slot is also checked against the runtime index of
+   the node the arc leads to. */
+always_inline vlib_next_frame_t *
+vlib_node_runtime_get_next_frame (vlib_main_t * vm,
+				  vlib_node_runtime_t * n, u32 next_index)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_next_frame_t *slot;
+
+  ASSERT (next_index < n->n_next_nodes);
+  slot = vec_elt_at_index (nm->next_frames,
+			   n->next_frame_index + next_index);
+
+  if (CLIB_DEBUG > 0)
+    {
+      vlib_node_t *this_node = vec_elt (nm->nodes, n->node_index);
+      vlib_node_t *next_node =
+	vec_elt (nm->nodes, this_node->next_nodes[next_index]);
+
+      ASSERT (slot->node_runtime_index == next_node->runtime_index);
+    }
+
+  return slot;
+}
+
+/** \brief Get pointer to frame by (@c node_index, @c next_index).
+
+ @warning This is not a function that you should call directly.
+ See @ref vlib_get_next_frame instead.
+
+ The node's runtime is found with vlib_node_get_runtime, so process nodes
+ (whose runtime lives in their vlib_process_t rather than in the per-type
+ runtime vector) are handled the same way as every other node type.
+
+ @param vm vlib_main_t pointer, varies by thread
+ @param node_index index of the node
+ @param next_index graph arc index
+
+ @return pointer to the requested vlib_next_frame_t
+
+ @sa vlib_get_next_frame
+*/
+
+always_inline vlib_next_frame_t *
+vlib_node_get_next_frame (vlib_main_t * vm, u32 node_index, u32 next_index)
+{
+  vlib_node_runtime_t *r = vlib_node_get_runtime (vm, node_index);
+
+  return vlib_node_runtime_get_next_frame (vm, r, next_index);
+}
+
+/* Out-of-line worker behind the next-frame macros below; alloc_new_frame is
+   passed through from the macro (0 or 1). */
+vlib_frame_t *vlib_get_next_frame_internal (vlib_main_t * vm,
+					    vlib_node_runtime_t * node,
+					    u32 next_index,
+					    u32 alloc_new_frame);
+
+/* Fetch the frame for (node, next_index) and set two caller variables:
+   vectors points at the first unused vector slot (n_vectors elements past
+   the start of the frame's vector data) and n_vectors_left is the number
+   of slots remaining out of VLIB_FRAME_SIZE.  The element size is taken
+   from the type of vectors. */
+#define vlib_get_next_frame_macro(vm,node,next_index,vectors,n_vectors_left,alloc_new_frame) \
+do {									\
+  vlib_frame_t * _f							\
+    = vlib_get_next_frame_internal ((vm), (node), (next_index),		\
+				    (alloc_new_frame));			\
+  u32 _n = _f->n_vectors;						\
+  (vectors) = vlib_frame_vector_args (_f) + _n * sizeof ((vectors)[0]); \
+  (n_vectors_left) = VLIB_FRAME_SIZE - _n;				\
+} while (0)
+
+
+/** \brief Get pointer to next frame vector data by
+    (@c vlib_node_runtime_t, @c next_index).
+ Standard single/dual loop boilerplate element.
+ @attention This is a MACRO, with SIDE EFFECTS.
+
+ @param vm vlib_main_t pointer, varies by thread
+ @param node current node vlib_node_runtime_t pointer
+ @param next_index requested graph arc index
+
+ @return @c vectors -- pointer to next available vector slot
+ @return @c n_vectors_left -- number of vector slots available
+*/
+#define vlib_get_next_frame(vm,node,next_index,vectors,n_vectors_left)	\
+  vlib_get_next_frame_macro (vm, node, next_index,			\
+			     vectors, n_vectors_left,			\
+			     /* alloc new frame */ 0)
+
+/* Same as vlib_get_next_frame, but passes alloc_new_frame = 1 to
+   vlib_get_next_frame_internal. */
+#define vlib_get_new_next_frame(vm,node,next_index,vectors,n_vectors_left) \
+  vlib_get_next_frame_macro (vm, node, next_index,			\
+			     vectors, n_vectors_left,			\
+			     /* alloc new frame */ 1)
+
+/** \brief Release pointer to next frame vector data.
+ Standard single/dual loop boilerplate element.
+ @param vm vlib_main_t pointer, varies by thread
+ @param r current node vlib_node_runtime_t pointer
+ @param next_index graph arc index
+ @param n_packets_left number of slots still available in vector
+*/
+void
+vlib_put_next_frame (vlib_main_t * vm,
+		     vlib_node_runtime_t * r,
+		     u32 next_index, u32 n_packets_left);
+
+/* Combination get plus put.  Returns vector argument just added.
+   The macro is a statement expression: it gets the next frame (setting v
+   to the next free slot), ASSERTs that at least one slot is free, puts the
+   frame back with one fewer slot left, and evaluates to v.  The caller
+   then stores the element through the returned pointer. */
+#define vlib_set_next_frame(vm,node,next_index,v)			\
+({									\
+  uword _n_left;							\
+  vlib_get_next_frame ((vm), (node), (next_index), (v), _n_left);	\
+  ASSERT (_n_left > 0);							\
+  vlib_put_next_frame ((vm), (node), (next_index), _n_left - 1);	\
+  (v);									\
+})
+
+/* Append a single buffer index to the frame for arc next_index of node:
+   claim one u32 slot with vlib_set_next_frame and store buffer_index in
+   it. */
+always_inline void
+vlib_set_next_frame_buffer (vlib_main_t * vm,
+			    vlib_node_runtime_t * node,
+			    u32 next_index, u32 buffer_index)
+{
+  u32 *slot;
+
+  slot = vlib_set_next_frame (vm, node, next_index, slot);
+  slot[0] = buffer_index;
+}
+
+/* Frame hand-off to a node addressed by index rather than by graph arc;
+   both functions are defined elsewhere.
+   NOTE(review): presumably the first obtains a frame destined for
+   to_node_index and the second enqueues a filled frame to it - confirm
+   against the definitions. */
+vlib_frame_t *vlib_get_frame_to_node (vlib_main_t * vm, u32 to_node_index);
+void vlib_put_frame_to_node (vlib_main_t * vm, u32 to_node_index,
+			     vlib_frame_t * f);
+
+/* Non-zero when vm is currently running a process, i.e. when
+   current_process_index is not the ~0 "none" marker. */
+always_inline uword
+vlib_in_process_context (vlib_main_t * vm)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+
+  return nm->current_process_index != ~0;
+}
+
+/* Return the process vm is currently running, or 0 when vm is not in
+   process context. */
+always_inline vlib_process_t *
+vlib_get_current_process (vlib_main_t * vm)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+
+  if (!vlib_in_process_context (vm))
+    return 0;
+
+  return vec_elt (nm->processes, nm->current_process_index);
+}
+
+/* Return the node index of the process vm is currently running.
+   When vm is not in process context there is no current process
+   (vlib_get_current_process returns 0); in that case ~0 is returned
+   instead of dereferencing a null pointer. */
+always_inline uword
+vlib_current_process (vlib_main_t * vm)
+{
+  vlib_process_t *p = vlib_get_current_process (vm);
+
+  return p ? p->node_runtime.node_index : ~0;
+}
+
+/** Tell whether a suspend interval is too short to bother suspending for.
+    @param dt - remaining poll time in seconds
+    @returns 1 if dt is below 10 microseconds (dt < 10e-6), 0 otherwise
+*/
+always_inline uword
+vlib_process_suspend_time_is_zero (f64 dt)
+{
+  return (dt < 10e-6) ? 1 : 0;
+}
+
+/** Suspend a vlib cooperative multi-tasking thread for a period of time
+    @param vm - vlib_main_t *
+    @param dt - suspend interval in seconds
+    @returns VLIB_PROCESS_RESUME_LONGJMP_RESUME immediately when dt is
+    below the 10us threshold; otherwise the value observed from
+    clib_setjmp when the process is resumed.  Routinely ignored.
+*/
+
+always_inline uword
+vlib_process_suspend (vlib_main_t * vm, f64 dt)
+{
+  uword r;
+  vlib_node_main_t *nm = &vm->node_main;
+  /* Acts on whichever process is current; callers must be in process
+     context since current_process_index is used unchecked. */
+  vlib_process_t *p = vec_elt (nm->processes, nm->current_process_index);
+
+  /* Intervals under 10us are not worth a suspend: report "resumed". */
+  if (vlib_process_suspend_time_is_zero (dt))
+    return VLIB_PROCESS_RESUME_LONGJMP_RESUME;
+
+  p->flags |= VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK;
+  /* Record the resume point.  r compares equal to ..._SUSPEND on the
+     pass that sets the jump buffer up; any other value means we got
+     here through a later jump to resume_longjmp. */
+  r = clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND);
+  if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND)
+    {
+      /* expiration time in 10us ticks */
+      p->resume_clock_interval = dt * 1e5;
+      /* Leave the process via return_longjmp. */
+      clib_longjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_SUSPEND);
+    }
+
+  return r;
+}
+
+/* Return event type index t to p->event_type_pool.  For one-time event
+   types the matching bit in one_time_event_type_bitmap is cleared too.
+   t must be an allocated pool index (checked by ASSERT). */
+always_inline void
+vlib_process_free_event_type (vlib_process_t * p, uword t,
+			      uword is_one_time_event)
+{
+  ASSERT (!pool_is_free_index (p->event_type_pool, t));
+  pool_put_index (p->event_type_pool, t);
+
+  if (!is_one_time_event)
+    return;
+
+  p->one_time_event_type_bitmap =
+    clib_bitmap_andnoti (p->one_time_event_type_bitmap, t);
+}
+
+/* Free event type t only when its bit is set in
+   one_time_event_type_bitmap; other types are left alone. */
+always_inline void
+vlib_process_maybe_free_event_type (vlib_process_t * p, uword t)
+{
+  uword is_one_time;
+
+  ASSERT (!pool_is_free_index (p->event_type_pool, t));
+
+  is_one_time = clib_bitmap_get (p->one_time_event_type_bitmap, t);
+  if (is_one_time)
+    vlib_process_free_event_type (p, t, /* is_one_time_event */ 1);
+}
+
+/* Detach and return the pending event data vector of the first event
+   type that has data queued for the current process.
+   On success the type's opaque value is stored through
+   return_event_type_opaque and the per-type slot is reset to 0, so the
+   returned vector now belongs to the caller.
+   Returns 0, leaving *return_event_type_opaque untouched, when no
+   event type has data pending. */
+always_inline void *
+vlib_process_get_event_data (vlib_main_t * vm,
+			     uword * return_event_type_opaque)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_process_t *proc = vec_elt (nm->processes, nm->current_process_index);
+  vlib_process_event_type_t *event_type;
+  void *pending;
+  uword type_index;
+
+  /* First type with events ready; ~0 when there is none. */
+  type_index = clib_bitmap_first_set (proc->non_empty_event_type_bitmap);
+  if (type_index == ~0)
+    return 0;
+
+  proc->non_empty_event_type_bitmap =
+    clib_bitmap_andnoti (proc->non_empty_event_type_bitmap, type_index);
+
+  /* Take the vector out of the per-type table. */
+  ASSERT (_vec_len (proc->pending_event_data_by_type_index[type_index]) > 0);
+  pending = proc->pending_event_data_by_type_index[type_index];
+  proc->pending_event_data_by_type_index[type_index] = 0;
+
+  /* Report the user's opaque value before the type is possibly freed. */
+  event_type = pool_elt_at_index (proc->event_type_pool, type_index);
+  *return_event_type_opaque = event_type->opaque;
+
+  vlib_process_maybe_free_event_type (proc, type_index);
+
+  return pending;
+}
+
+/* Hand an event data vector back for later reuse by appending it to
+   node_main.recycled_event_data_vectors.  Reusing vectors avoids
+   repeated allocation of event vectors where speed matters. */
+always_inline void
+vlib_process_put_event_data (vlib_main_t * vm, void *event_data)
+{
+  vec_add1 (vm->node_main.recycled_event_data_vectors, event_data);
+}
+
+/** Fetch the first event type with pending data for the current process,
+    together with its per-event data
+
+    @param vm - vlib_main_t pointer
+    @param data_vector - pointer to a (uword *) vector; when non-NULL the
+    pending per-event data is appended to it
+    @returns the opaque value of the event type found, or ~0 when no
+    event is pending (e.g. the caller woke up on a timeout).
+*/
+
+always_inline uword
+vlib_process_get_events (vlib_main_t * vm, uword ** data_vector)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_process_t *proc = vec_elt (nm->processes, nm->current_process_index);
+  vlib_process_event_type_t *event_type;
+  uword opaque, type_index, n_pending;
+
+  /* First type with events ready; the invalid index ~0 is handed
+     straight back when nothing is pending. */
+  type_index = clib_bitmap_first_set (proc->non_empty_event_type_bitmap);
+  if (type_index == ~0)
+    return type_index;
+
+  proc->non_empty_event_type_bitmap =
+    clib_bitmap_andnoti (proc->non_empty_event_type_bitmap, type_index);
+
+  /* Copy the pending data out (if wanted), then empty the pending
+     vector in place so its storage stays with the process. */
+  n_pending = _vec_len (proc->pending_event_data_by_type_index[type_index]);
+  if (data_vector)
+    vec_add (*data_vector,
+	     proc->pending_event_data_by_type_index[type_index], n_pending);
+  _vec_len (proc->pending_event_data_by_type_index[type_index]) = 0;
+
+  /* Pick up the user's opaque value before the type is possibly freed. */
+  event_type = pool_elt_at_index (proc->event_type_pool, type_index);
+  opaque = event_type->opaque;
+
+  vlib_process_maybe_free_event_type (proc, type_index);
+
+  return opaque;
+}
+
+/* Drain the pending data of event type index t from process p.
+   Clears t in non_empty_event_type_bitmap, appends the pending elements
+   to *data_vector when data_vector is non-NULL, resets the pending
+   vector length to 0 and frees t if it is a one-time type.
+   Returns the number of elements that were pending. */
+always_inline uword
+vlib_process_get_events_helper (vlib_process_t * p, uword t,
+				uword ** data_vector)
+{
+  uword n_pending;
+
+  p->non_empty_event_type_bitmap =
+    clib_bitmap_andnoti (p->non_empty_event_type_bitmap, t);
+
+  n_pending = _vec_len (p->pending_event_data_by_type_index[t]);
+  if (data_vector)
+    vec_add (*data_vector, p->pending_event_data_by_type_index[t],
+	     n_pending);
+  _vec_len (p->pending_event_data_by_type_index[t]) = 0;
+
+  vlib_process_maybe_free_event_type (p, t);
+
+  return n_pending;
+}
+
+/* Like vlib_process_get_events, but only looks at the event type
+   registered under with_type_opaque for the current process.
+   Returns the number of events found; 0 when the type is unknown or
+   has nothing pending. */
+always_inline uword
+vlib_process_get_events_with_type (vlib_main_t * vm, uword ** data_vector,
+				   uword with_type_opaque)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_process_t *proc = vec_elt (nm->processes, nm->current_process_index);
+  uword *entry, type_index;
+
+  entry = hash_get (proc->event_type_index_by_type_opaque, with_type_opaque);
+
+  /* No entry yet: this can happen when no event has been signaled
+     with the given opaque type so far. */
+  if (!entry)
+    return 0;
+
+  type_index = entry[0];
+  if (!clib_bitmap_get (proc->non_empty_event_type_bitmap, type_index))
+    return 0;
+
+  return vlib_process_get_events_helper (proc, type_index, data_vector);
+}
+
+/* Suspend the current process until some event is pending for it.
+   If non_empty_event_type_bitmap is already non-zero the function
+   returns at once without suspending.
+   Returns the process' non_empty_event_type_bitmap; nothing is
+   dequeued here, callers fetch the data with the get_events calls. */
+always_inline uword *
+vlib_process_wait_for_event (vlib_main_t * vm)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_process_t *p;
+  uword r;
+
+  p = vec_elt (nm->processes, nm->current_process_index);
+  if (clib_bitmap_is_zero (p->non_empty_event_type_bitmap))
+    {
+      p->flags |= VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT;
+      /* Record the resume point, then leave through return_longjmp on
+         the pass where r equals ..._SUSPEND. */
+      r =
+	clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND);
+      if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND)
+	clib_longjmp (&p->return_longjmp,
+		      VLIB_PROCESS_RETURN_LONGJMP_SUSPEND);
+    }
+
+  return p->non_empty_event_type_bitmap;
+}
+
+/* Suspend the current process until event type index with_type_index
+   has data pending, then drain it via vlib_process_get_events_helper.
+   with_type_index is an event type pool index (as returned by
+   vlib_process_create_one_time_event), not an opaque value; it must be
+   allocated (checked by ASSERT).
+   data_vector may be 0 when the caller does not care about the data.
+   Returns the number of events that were pending. */
+always_inline uword
+vlib_process_wait_for_one_time_event (vlib_main_t * vm,
+				      uword ** data_vector,
+				      uword with_type_index)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_process_t *p;
+  uword r;
+
+  p = vec_elt (nm->processes, nm->current_process_index);
+  ASSERT (!pool_is_free_index (p->event_type_pool, with_type_index));
+  /* Loop: a resume caused by some other event type suspends again. */
+  while (!clib_bitmap_get (p->non_empty_event_type_bitmap, with_type_index))
+    {
+      p->flags |= VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT;
+      r =
+	clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND);
+      if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND)
+	clib_longjmp (&p->return_longjmp,
+		      VLIB_PROCESS_RETURN_LONGJMP_SUSPEND);
+    }
+
+  return vlib_process_get_events_helper (p, with_type_index, data_vector);
+}
+
+/* Suspend the current process until an event registered under
+   with_type_opaque has data pending, then drain it via
+   vlib_process_get_events_helper.
+   The opaque type need not exist yet: the process keeps suspending
+   until the type has been created by a signal and has data queued.
+   data_vector may be 0 when the caller does not care about the data.
+   Returns the number of events that were pending. */
+always_inline uword
+vlib_process_wait_for_event_with_type (vlib_main_t * vm,
+				       uword ** data_vector,
+				       uword with_type_opaque)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_process_t *p;
+  uword r, *h;
+
+  p = vec_elt (nm->processes, nm->current_process_index);
+  h = hash_get (p->event_type_index_by_type_opaque, with_type_opaque);
+  while (!h || !clib_bitmap_get (p->non_empty_event_type_bitmap, h[0]))
+    {
+      p->flags |= VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT;
+      r =
+	clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND);
+      if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND)
+	clib_longjmp (&p->return_longjmp,
+		      VLIB_PROCESS_RETURN_LONGJMP_SUSPEND);
+
+      /* Look the type up again after every resume.  This picks up a
+         type that was unknown before, and it also refreshes h: signals
+         delivered while we were suspended may have added entries to
+         the hash (hash_set), which can move its storage and leave a
+         previously fetched value pointer dangling. */
+      h = hash_get (p->event_type_index_by_type_opaque, with_type_opaque);
+    }
+
+  return vlib_process_get_events_helper (p, h[0], data_vector);
+}
+
+/** Suspend a cooperative multi-tasking thread
+    Waits for an event, or for the indicated number of seconds to elapse
+    @param vm - vlib_main_t pointer
+    @param dt - timeout, in seconds.
+    @returns the remaining time interval: dt itself when no suspend took
+    place (dt under 10us, or events already pending), otherwise the
+    wakeup time minus the current time, which is <= 0 after a timeout
+*/
+
+always_inline f64
+vlib_process_wait_for_event_or_clock (vlib_main_t * vm, f64 dt)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_process_t *p;
+  f64 wakeup_time;
+  uword r;
+
+  p = vec_elt (nm->processes, nm->current_process_index);
+
+  /* Nothing to wait for: interval too short or events already queued. */
+  if (vlib_process_suspend_time_is_zero (dt)
+      || !clib_bitmap_is_zero (p->non_empty_event_type_bitmap))
+    return dt;
+
+  wakeup_time = vlib_time_now (vm) + dt;
+
+  /* Suspend waiting for both clock and event to occur. */
+  p->flags |= (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT
+	       | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK);
+
+  r = clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND);
+  if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND)
+    {
+      /* Same scaling as vlib_process_suspend: seconds -> 10us ticks. */
+      p->resume_clock_interval = dt * 1e5;
+      clib_longjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_SUSPEND);
+    }
+
+  /* Return amount of time still left to sleep.
+     If <= 0 then we've been waken up by the clock (and not an event). */
+  return wakeup_time - vlib_time_now (vm);
+}
+
+/* Allocate an event type from p->event_type_pool and record the
+   caller's opaque value in it.  Only the opaque field is initialized
+   here.  Returns the new pool element. */
+always_inline vlib_process_event_type_t *
+vlib_process_new_event_type (vlib_process_t * p, uword with_type_opaque)
+{
+  vlib_process_event_type_t *new_type;
+
+  pool_get (p->event_type_pool, new_type);
+  new_type->opaque = with_type_opaque;
+
+  return new_type;
+}
+
+/* Create a one-time event type for the process behind node_index.
+   The new type carries with_type_opaque and is flagged in
+   one_time_event_type_bitmap.  Returns its event type pool index. */
+always_inline uword
+vlib_process_create_one_time_event (vlib_main_t * vm, uword node_index,
+				    uword with_type_opaque)
+{
+  vlib_node_t *n = vlib_get_node (vm, node_index);
+  vlib_process_t *p = vec_elt (vm->node_main.processes, n->runtime_index);
+  vlib_process_event_type_t *new_type;
+  uword type_index;
+
+  /* Allocate first, then compute the index: the index is taken
+     relative to the pool base as it is after the allocation. */
+  new_type = vlib_process_new_event_type (p, with_type_opaque);
+  type_index = new_type - p->event_type_pool;
+
+  p->one_time_event_type_bitmap =
+    clib_bitmap_ori (p->one_time_event_type_bitmap, type_index);
+
+  return type_index;
+}
+
+/* Free one-time event type index t of the process behind node_index.
+   t must be flagged in one_time_event_type_bitmap (checked by ASSERT). */
+always_inline void
+vlib_process_delete_one_time_event (vlib_main_t * vm, uword node_index,
+				    uword t)
+{
+  vlib_node_t *n = vlib_get_node (vm, node_index);
+  vlib_process_t *p = vec_elt (vm->node_main.processes, n->runtime_index);
+
+  ASSERT (clib_bitmap_get (p->one_time_event_type_bitmap, t));
+
+  vlib_process_free_event_type (p, t, /* is_one_time_event */ 1);
+}
+
+/* Common back end of the signal_event calls.
+   Makes room for n_data_elts elements of n_data_elt_bytes bytes each in
+   the pending data vector of event type index t of process p, marks the
+   type non-empty and, when appropriate, schedules the process to be
+   resumed.
+   nm - node main owning the recycled vectors and the timing wheel
+   n  - node of the process (its runtime_index identifies the process)
+   t  - event type pool index; must be allocated (checked by ASSERT)
+   Returns a pointer to the newly added, not yet written element space;
+   the caller stores the event data there. */
+always_inline void *
+vlib_process_signal_event_helper (vlib_node_main_t * nm,
+				  vlib_node_t * n,
+				  vlib_process_t * p,
+				  uword t,
+				  uword n_data_elts, uword n_data_elt_bytes)
+{
+  uword p_flags, add_to_pending, delete_from_wheel;
+  void *data_to_be_written_by_caller;
+
+  ASSERT (!pool_is_free_index (p->event_type_pool, t));
+
+  vec_validate (p->pending_event_data_by_type_index, t);
+
+  /* Resize data vector and return caller's data to be written. */
+  {
+    void *data_vec = p->pending_event_data_by_type_index[t];
+    uword l;
+
+    /* No vector for this type yet: reuse a recycled one if available. */
+    if (!data_vec && vec_len (nm->recycled_event_data_vectors))
+      {
+	data_vec = vec_pop (nm->recycled_event_data_vectors);
+	_vec_len (data_vec) = 0;
+      }
+
+    l = vec_len (data_vec);
+
+    data_vec = _vec_resize (data_vec,
+			    /* length_increment */ n_data_elts,
+			    /* total size after increment */
+			    (l + n_data_elts) * n_data_elt_bytes,
+			    /* header_bytes */ 0, /* data_align */ 0);
+
+    p->pending_event_data_by_type_index[t] = data_vec;
+    /* Byte offset of the first new element (arithmetic on void *). */
+    data_to_be_written_by_caller = data_vec + l * n_data_elt_bytes;
+  }
+
+  p->non_empty_event_type_bitmap =
+    clib_bitmap_ori (p->non_empty_event_type_bitmap, t);
+
+  p_flags = p->flags;
+
+  /* Event was already signalled? */
+  add_to_pending = (p_flags & VLIB_PROCESS_RESUME_PENDING) == 0;
+
+  /* Process will resume when suspend time elapses? */
+  delete_from_wheel = 0;
+  if (p_flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK)
+    {
+      /* Waiting for both event and clock? */
+      if (p_flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT)
+	delete_from_wheel = 1;
+      else
+	/* Waiting only for clock.  Event will be queue and may be
+	   handled when timer expires. */
+	add_to_pending = 0;
+    }
+
+  /* Never add current process to pending vector since current process is
+     already running. */
+  add_to_pending &= nm->current_process_index != n->runtime_index;
+
+  if (add_to_pending)
+    {
+      /* Queue the process for resume and, if it was also waiting on
+         the clock, stop the timer recorded in stop_timer_handle. */
+      u32 x = vlib_timing_wheel_data_set_suspended_process (n->runtime_index);
+      p->flags = p_flags | VLIB_PROCESS_RESUME_PENDING;
+      vec_add1 (nm->data_from_advancing_timing_wheel, x);
+      if (delete_from_wheel)
+	TW (tw_timer_stop) ((TWT (tw_timer_wheel) *) nm->timing_wheel,
+			    p->stop_timer_handle);
+    }
+
+  return data_to_be_written_by_caller;
+}
+
+/* Signal an event of opaque type type_opaque to the process behind
+   node_index, reserving room for n_data_elts elements of
+   n_data_elt_bytes bytes each.  The event type is created and entered
+   into event_type_index_by_type_opaque on first use.
+   Must be called on the main thread (checked by ASSERT).
+   Returns the space the caller has to fill with the event data. */
+always_inline void *
+vlib_process_signal_event_data (vlib_main_t * vm,
+				uword node_index,
+				uword type_opaque,
+				uword n_data_elts, uword n_data_elt_bytes)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_node_t *n = vlib_get_node (vm, node_index);
+  vlib_process_t *p = vec_elt (nm->processes, n->runtime_index);
+  uword *entry, type_index;
+
+  /* Must be in main thread */
+  ASSERT (vlib_get_thread_index () == 0);
+
+  entry = hash_get (p->event_type_index_by_type_opaque, type_opaque);
+  if (entry)
+    type_index = entry[0];
+  else
+    {
+      /* First signal with this opaque value: register a new type. */
+      vlib_process_event_type_t *new_type =
+	vlib_process_new_event_type (p, type_opaque);
+      type_index = new_type - p->event_type_pool;
+      hash_set (p->event_type_index_by_type_opaque, type_opaque,
+		type_index);
+    }
+
+  return vlib_process_signal_event_helper (nm, n, p, type_index,
+					   n_data_elts, n_data_elt_bytes);
+}
+
+/* Signal an event of opaque type type_opaque to the process behind
+   node_index after a delay of dt seconds.
+   When dt is below the 10us threshold the event is signalled right
+   away through vlib_process_signal_event_helper.  Otherwise a
+   vlib_signal_timed_event_data_t is allocated, a timer is started on
+   the timing wheel and the event data is kept in that record until
+   then.
+   Returns the space the caller has to fill with the event data: the
+   pending vector slot (immediate case), the record's inline buffer, or
+   a freshly sized byte vector when the data does not fit inline.
+   NOTE(review): unlike vlib_process_signal_event_data there is no
+   main-thread ASSERT here -- confirm whether that is intended. */
+always_inline void *
+vlib_process_signal_event_at_time (vlib_main_t * vm,
+				   f64 dt,
+				   uword node_index,
+				   uword type_opaque,
+				   uword n_data_elts, uword n_data_elt_bytes)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_node_t *n = vlib_get_node (vm, node_index);
+  vlib_process_t *p = vec_elt (nm->processes, n->runtime_index);
+  uword *h, t;
+
+  /* Map the opaque type to an event type index, creating it on first use. */
+  h = hash_get (p->event_type_index_by_type_opaque, type_opaque);
+  if (!h)
+    {
+      vlib_process_event_type_t *et =
+	vlib_process_new_event_type (p, type_opaque);
+      t = et - p->event_type_pool;
+      hash_set (p->event_type_index_by_type_opaque, type_opaque, t);
+    }
+  else
+    t = h[0];
+
+  if (vlib_process_suspend_time_is_zero (dt))
+    return vlib_process_signal_event_helper (nm, n, p, t, n_data_elts,
+					     n_data_elt_bytes);
+  else
+    {
+      vlib_signal_timed_event_data_t *te;
+
+      pool_get_aligned (nm->signal_timed_event_data_pool, te, sizeof (te[0]));
+
+      te->n_data_elts = n_data_elts;
+      te->n_data_elt_bytes = n_data_elt_bytes;
+      te->n_data_bytes = n_data_elts * n_data_elt_bytes;
+
+      /* Assert that structure fields are big enough. */
+      ASSERT (te->n_data_elts == n_data_elts);
+      ASSERT (te->n_data_elt_bytes == n_data_elt_bytes);
+      ASSERT (te->n_data_bytes == n_data_elts * n_data_elt_bytes);
+
+      te->process_node_index = n->runtime_index;
+      te->event_type_index = t;
+
+      /* NOTE(review): the last argument is (now + dt) scaled to 10us
+         ticks, i.e. an absolute time; confirm that tw_timer_start
+         expects that rather than an interval. */
+      p->stop_timer_handle =
+	TW (tw_timer_start) ((TWT (tw_timer_wheel) *) nm->timing_wheel,
+			     vlib_timing_wheel_data_set_timed_event
+			     (te - nm->signal_timed_event_data_pool),
+			     0 /* timer_id */ ,
+			     (vlib_time_now (vm) + dt) * 1e5);
+
+      /* Inline data big enough to hold event? */
+      /* NOTE(review): strict '<' sends a payload of exactly
+         sizeof (inline_event_data) bytes down the vector path. */
+      if (te->n_data_bytes < sizeof (te->inline_event_data))
+	return te->inline_event_data;
+      else
+	{
+	  te->event_data_as_vector = 0;
+	  vec_resize (te->event_data_as_vector, te->n_data_bytes);
+	  return te->event_data_as_vector;
+	}
+    }
+}
+
+/* Signal a one-time event to the process behind node_index.
+   type_index is an event type pool index (see
+   vlib_process_create_one_time_event), not an opaque value.
+   Returns the space the caller has to fill with the event data. */
+always_inline void *
+vlib_process_signal_one_time_event_data (vlib_main_t * vm,
+					 uword node_index,
+					 uword type_index,
+					 uword n_data_elts,
+					 uword n_data_elt_bytes)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_node_t *n = vlib_get_node (vm, node_index);
+  vlib_process_t *proc = vec_elt (nm->processes, n->runtime_index);
+  void *event_space;
+
+  event_space = vlib_process_signal_event_helper (nm, n, proc, type_index,
+						  n_data_elts,
+						  n_data_elt_bytes);
+  return event_space;
+}
+
+/* Signal one event of opaque type type_opaque carrying a single uword
+   of data to the process behind node_index. */
+always_inline void
+vlib_process_signal_event (vlib_main_t * vm,
+			   uword node_index, uword type_opaque, uword data)
+{
+  uword *slot;
+
+  slot = vlib_process_signal_event_data (vm, node_index, type_opaque,
+					 1 /* elts */ , sizeof (uword));
+  *slot = data;
+}
+
+/* Signal one event of opaque type type_opaque whose data is a single
+   pointer value to the process behind node_index. */
+always_inline void
+vlib_process_signal_event_pointer (vlib_main_t * vm,
+				   uword node_index,
+				   uword type_opaque, void *data)
+{
+  void **slot;
+
+  slot = vlib_process_signal_event_data (vm, node_index, type_opaque,
+					 1 /* elts */ , sizeof (void *));
+  *slot = data;
+}
+
+/**
+ * Signal event to process from any thread.
+ *
+ * On thread 0 the event is signalled directly; on any other thread the
+ * arguments are packed up and handed to the main thread through
+ * vlib_rpc_call_main_thread.
+ *
+ * When in doubt, use this.
+ */
+always_inline void
+vlib_process_signal_event_mt (vlib_main_t * vm,
+			      uword node_index, uword type_opaque, uword data)
+{
+  if (vlib_get_thread_index () == 0)
+    vlib_process_signal_event (vm, node_index, type_opaque, data);
+  else
+    {
+      vlib_process_signal_event_mt_args_t rpc_args = {
+	.node_index = node_index,
+	.type_opaque = type_opaque,
+	.data = data,
+      };
+
+      vlib_rpc_call_main_thread (vlib_process_signal_event_mt_helper,
+				 (u8 *) & rpc_args, sizeof (rpc_args));
+    }
+}
+
+/* Signal a one-time event (event type pool index type_index) carrying a
+   single uword of data to the process behind node_index. */
+always_inline void
+vlib_process_signal_one_time_event (vlib_main_t * vm,
+				    uword node_index,
+				    uword type_index, uword data)
+{
+  uword *slot;
+
+  slot = vlib_process_signal_one_time_event_data (vm, node_index, type_index,
+						  1 /* elts */ ,
+						  sizeof (uword));
+  *slot = data;
+}
+
+/* Wake the process recorded in *p by signalling its one-time event with
+   data ~0, then fill the record with all-ones bytes so it no longer
+   names a process or event. */
+always_inline void
+vlib_signal_one_time_waiting_process (vlib_main_t * vm,
+				      vlib_one_time_waiting_process_t * p)
+{
+  vlib_process_signal_one_time_event (vm, p->node_index, p->one_time_event,
+				      /* data */ ~0);
+
+  memset (p, ~0, sizeof (*p));
+}
+
+/* Wake every process recorded in the vector *wps, then free the vector. */
+always_inline void
+vlib_signal_one_time_waiting_process_vector (vlib_main_t * vm,
+					     vlib_one_time_waiting_process_t
+					     ** wps)
+{
+  vlib_one_time_waiting_process_t *waiter;
+
+  vec_foreach (waiter, *wps)
+  {
+    vlib_signal_one_time_waiting_process (vm, waiter);
+  }
+
+  vec_free (*wps);
+}
+
+/* Record the current process in *p, create a one-time event (opaque ~0)
+   for it and suspend until that event is signalled.  The event data is
+   discarded. */
+always_inline void
+vlib_current_process_wait_for_one_time_event (vlib_main_t * vm,
+					      vlib_one_time_waiting_process_t
+					      * p)
+{
+  p->node_index = vlib_current_process (vm);
+  p->one_time_event =
+    vlib_process_create_one_time_event (vm, p->node_index,
+					/* type opaque */ ~0);
+
+  vlib_process_wait_for_one_time_event (vm,
+					/* don't care about data */ 0,
+					p->one_time_event);
+}
+
+/* Append a new waiter record to the vector *wps and suspend the current
+   process on it until it is signalled. */
+always_inline void
+vlib_current_process_wait_for_one_time_event_vector (vlib_main_t * vm,
+						     vlib_one_time_waiting_process_t
+						     ** wps)
+{
+  vlib_one_time_waiting_process_t *waiter;
+
+  vec_add2 (*wps, waiter, 1);
+
+  vlib_current_process_wait_for_one_time_event (vm, waiter);
+}
+
+/* Account n_vectors to the node's per-interval vector statistics and
+   return the counter of the neighbouring slot (see below).
+   A stats interval spans 2^VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE main
+   loops; node->main_loop_vector_stats is indexed by the interval number
+   modulo its (power of two) length.
+   Also records vm->main_loop_count as the node's last dispatch. */
+always_inline u32
+vlib_node_runtime_update_main_loop_vector_stats (vlib_main_t * vm,
+						 vlib_node_runtime_t * node,
+						 uword n_vectors)
+{
+  u32 i, d, vi0, vi1;
+  u32 i0, i1;
+
+  ASSERT (is_pow2 (ARRAY_LEN (node->main_loop_vector_stats)));
+  /* Slot of the current interval. */
+  i = ((vm->main_loop_count >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE)
+       & (ARRAY_LEN (node->main_loop_vector_stats) - 1));
+  /* i0 is the current slot, i1 the slot with the low index bit flipped. */
+  i0 = i ^ 0;
+  i1 = i ^ 1;
+  /* Number of whole intervals since this node was last dispatched. */
+  d = ((vm->main_loop_count >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE)
+       -
+       (node->main_loop_count_last_dispatch >>
+	VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE));
+  vi0 = node->main_loop_vector_stats[i0];
+  vi1 = node->main_loop_vector_stats[i1];
+  /* Restart the current slot when a new interval began; drop the other
+     slot when more than one interval went by. */
+  vi0 = d == 0 ? vi0 : 0;
+  vi1 = d <= 1 ? vi1 : 0;
+  vi0 += n_vectors;
+  node->main_loop_vector_stats[i0] = vi0;
+  node->main_loop_vector_stats[i1] = vi1;
+  node->main_loop_count_last_dispatch = vm->main_loop_count;
+  /* Return previous counter. */
+  return node->main_loop_vector_stats[i1];
+}
+
+/* Previous-interval vector count of node node_index divided by the
+   number of main loops per stats interval, as a float.  Updates the
+   node's stats with n_vectors = 0 as a side effect. */
+always_inline f64
+vlib_node_vectors_per_main_loop_as_float (vlib_main_t * vm, u32 node_index)
+{
+  vlib_node_runtime_t *rt = vlib_node_get_runtime (vm, node_index);
+  u32 count;
+
+  count =
+    vlib_node_runtime_update_main_loop_vector_stats (vm, rt,
+						     /* n_vectors */ 0);
+
+  return (f64) count / (1 << VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE);
+}
+
+/* Integer variant of the above: the previous-interval vector count
+   shifted right by VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE.  Updates the
+   node's stats with n_vectors = 0 as a side effect. */
+always_inline u32
+vlib_node_vectors_per_main_loop_as_integer (vlib_main_t * vm, u32 node_index)
+{
+  vlib_node_runtime_t *rt = vlib_node_get_runtime (vm, node_index);
+  u32 count;
+
+  count =
+    vlib_node_runtime_update_main_loop_vector_stats (vm, rt,
+						     /* n_vectors */ 0);
+
+  return count >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE;
+}
+
+void
+vlib_frame_free (vlib_main_t * vm, vlib_node_runtime_t * r, vlib_frame_t * f);
+
+/* Return the edge index if present, ~0 otherwise */
+uword vlib_node_get_next (vlib_main_t * vm, uword node, uword next_node);
+
+/* Add next node to given node in given slot. */
+uword
+vlib_node_add_next_with_slot (vlib_main_t * vm,
+			      uword node, uword next_node, uword slot);
+
+/* Same as vlib_node_add_next_with_slot, with slot ~0 so that the next
+   node is added to the end of node's next vector. */
+always_inline uword
+vlib_node_add_next (vlib_main_t * vm, uword node, uword next_node)
+{
+  uword slot = ~0;
+
+  return vlib_node_add_next_with_slot (vm, node, next_node, slot);
+}
+
+/* Add next node to given node in given slot. */
+uword
+vlib_node_add_named_next_with_slot (vlib_main_t * vm,
+				    uword node, char *next_name, uword slot);
+
+/* Same as vlib_node_add_named_next_with_slot, with slot ~0 so that the
+   named node is added to the end of node's next vector. */
+always_inline uword
+vlib_node_add_named_next (vlib_main_t * vm, uword node, char *name)
+{
+  uword slot = ~0;
+
+  return vlib_node_add_named_next_with_slot (vm, node, name, slot);
+}
+
+/* Query node given name. */
+vlib_node_t *vlib_get_node_by_name (vlib_main_t * vm, u8 * name);
+
+/* Rename a node. */
+void vlib_node_rename (vlib_main_t * vm, u32 node_index, char *fmt, ...);
+
+/* Register new packet processing node.  Nodes can be registered
+   dynamically via this call or statically via the VLIB_REGISTER_NODE
+   macro. */
+u32 vlib_register_node (vlib_main_t * vm, vlib_node_registration_t * r);
+
+/* Register all static nodes registered via VLIB_REGISTER_NODE. */
+void vlib_register_all_static_nodes (vlib_main_t * vm);
+
+/* Start a process. */
+void vlib_start_process (vlib_main_t * vm, uword process_index);
+
+/* Sync up runtime and main node stats. */
+void vlib_node_sync_stats (vlib_main_t * vm, vlib_node_t * n);
+
+/* Node graph initialization function. */
+clib_error_t *vlib_node_main_init (vlib_main_t * vm);
+
+format_function_t format_vlib_node_graph;
+format_function_t format_vlib_node_name;
+format_function_t format_vlib_next_node_name;
+format_function_t format_vlib_node_and_next;
+format_function_t format_vlib_cpu_time;
+format_function_t format_vlib_time;
+/* Parse node name -> node index. */
+unformat_function_t unformat_vlib_node;
+
+/* Add increment to counter counter_index of node node_index.  The
+   node's counters live in vm->error_main.counters starting at the
+   node's error_heap_index. */
+always_inline void
+vlib_node_increment_counter (vlib_main_t * vm, u32 node_index,
+			     u32 counter_index, u64 increment)
+{
+  vlib_error_main_t *em = &vm->error_main;
+  vlib_node_t *n = vlib_get_node (vm, node_index);
+  u32 base = n->error_heap_index;
+  u32 slot = base + counter_index;
+
+  em->counters[slot] += increment;
+}
+
+#endif /* included_vlib_node_funcs_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/parse.c b/src/vlib/parse.c
new file mode 100644
index 00000000..1c4500ce
--- /dev/null
+++ b/src/vlib/parse.c
@@ -0,0 +1,1007 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vlib/parse.h>
+
+#define PARSE_DEBUG 0
+
+/* Indices into the parse type pool (pm->parse_types) for the built-in
+   types; item->type is compared against them below (e.g.
+   word_type_index, eof_type_index).  They are not assigned in this part
+   of the file. */
+u16 word_type_index, number_type_index, eof_type_index, rule_eof_type_index,
+  plus_type_index, minus_type_index, star_type_index, slash_type_index,
+  lpar_type_index, rpar_type_index;
+
+/* format() helper: print the number of entries in pm->parse_value and
+   then one line per entry, using the entry type's format_value function
+   when it has one and "(nofun)" otherwise.
+   va_list argument: vlib_parse_main_t *. */
+u8 *
+format_vlib_parse_value (u8 * s, va_list * args)
+{
+  vlib_parse_main_t *pm = va_arg (*args, vlib_parse_main_t *);
+  vlib_parse_value_t *value;
+
+  s = format (s, "%d items:\n", vec_len (pm->parse_value));
+
+  vec_foreach (value, pm->parse_value)
+  {
+    u16 type_index = value->type;
+    vlib_parse_type_t *type =
+      pool_elt_at_index (pm->parse_types, type_index);
+
+    if (!type->format_value)
+      s = format (s, "[%d]: (nofun)\n", value - pm->parse_value);
+    else
+      s = format (s, "[%d]: %U\n", value - pm->parse_value,
+		  type->format_value, value);
+  }
+
+  return s;
+}
+
+/* format() helper: print a vlib_parse_match_t by name; values not
+   listed in foreach_parse_match_type are printed as "unknown 0x..".
+   va_list argument: vlib_parse_match_t. */
+static u8 *
+format_vlib_parse_match (u8 * s, va_list * args)
+{
+  vlib_parse_match_t m = va_arg (*args, vlib_parse_match_t);
+  char *name = 0;
+
+  switch (m)
+    {
+#define _(a) case VLIB_PARSE_##a: name = #a; break;
+      foreach_parse_match_type
+#undef _
+    default:
+      break;
+    }
+
+  if (!name)
+    return format (s, "unknown 0x%x", m);
+
+  return format (s, "%s", name);
+}
+
+/* format() helper: print a parse item.  Word items print their literal
+   text (item->value.as_pointer); every other item prints its type name
+   in angle brackets.
+   va_list arguments: vlib_parse_main_t *, vlib_parse_item_t *. */
+static u8 *
+format_vlib_parse_item (u8 * s, va_list * args)
+{
+  vlib_parse_main_t *pm = va_arg (*args, vlib_parse_main_t *);
+  vlib_parse_item_t *item = va_arg (*args, vlib_parse_item_t *);
+  vlib_parse_type_t *type = pool_elt_at_index (pm->parse_types, item->type);
+
+  if (item->type != word_type_index)
+    return format (s, "<%s>", type->name);
+
+  return format (s, "%s", item->value.as_pointer);
+}
+
+/* format() helper: print one parse graph node.
+   Output, in order: a "<type name>" heading for every parse type whose
+   rule_index is this node, a "<root>" heading if the node is
+   pm->root_index, then the node's index, its item, and its peer and
+   deeper links ("nil" for ~0).
+   va_list arguments: vlib_parse_main_t *, vlib_parse_graph_t *. */
+static u8 *
+format_vlib_parse_graph (u8 * s, va_list * args)
+{
+  vlib_parse_main_t *pm = va_arg (*args, vlib_parse_main_t *);
+  vlib_parse_graph_t *node = va_arg (*args, vlib_parse_graph_t *);
+  vlib_parse_item_t *item;
+  vlib_parse_type_t *type;
+
+  /* Linear scan over all types for each node printed. */
+  /* $$$ hash table */
+  /* *INDENT-OFF* */
+  pool_foreach (type, pm->parse_types,
+                ({
+                  if (type->rule_index == node - pm->parse_graph)
+                    s = format (s, "\n<%s>\n", type->name);
+                }));
+/* *INDENT-ON* */
+
+  if (pm->root_index == (node - pm->parse_graph))
+    s = format (s, "\n<root>\n");
+
+  item = pool_elt_at_index (pm->parse_items, node->item);
+
+  s = format (s, "[%d] %U ", node - pm->parse_graph,
+	      format_vlib_parse_item, pm, item);
+
+  /* ~0 marks "no link" for both peer and deeper. */
+  if (node->peer == (u32) ~ 0)
+    s = format (s, "peer nil  ");
+  else
+    s = format (s, "peer %4u ", node->peer);
+
+  if (node->deeper == (u32) ~ 0)
+    s = format (s, "deeper nil  ");
+  else
+    s = format (s, "deeper %4u ", node->deeper);
+
+  return s;
+}
+
+/* Debug aid: print every node of the global parse graph
+   (vlib_parse_main) to stdout, one format_vlib_parse_graph record per
+   node. */
+void
+dump_parse_graph (void)
+{
+  vlib_parse_main_t *pm = &vlib_parse_main;
+  vlib_parse_graph_t *graph_node;
+
+  /* *INDENT-OFF* */
+  pool_foreach (graph_node, pm->parse_graph, ({
+    fformat(stdout, "%U\n", format_vlib_parse_graph, pm, graph_node);
+  }));
+/* *INDENT-ON* */
+}
+
+/* Run the type-specific cleanup function on a parse value, if the
+   value's type provides one. */
+always_inline void
+parse_cleanup_value (vlib_parse_main_t * pm, vlib_parse_value_t * pv)
+{
+  vlib_parse_type_t *type = pool_elt_at_index (pm->parse_types, pv->type);
+
+  if (!type->value_cleanup_function)
+    return;
+
+  type->value_cleanup_function (pv);
+}
+
+/* Prepare the parser for a new input: reset the lexer on input, clean
+   up all tokens and parse values left over from the previous parse,
+   empty both vectors (keeping their storage) and rewind the token
+   cursor. */
+static void
+parse_reset (vlib_parse_main_t * pm, u8 * input)
+{
+  vlib_lex_token_t *token;
+  vlib_parse_value_t *value;
+
+  vlib_lex_reset (pm->lex_main, input);
+
+  vec_foreach (token, pm->tokens)
+  {
+    vlib_lex_cleanup_token (token);
+  }
+
+  vec_foreach (value, pm->parse_value)
+  {
+    parse_cleanup_value (pm, value);
+  }
+
+  _vec_len (pm->parse_value) = 0;
+  _vec_len (pm->tokens) = 0;
+  pm->current_token_index = 0;
+}
+
+/* Print help for a '?' in the input.
+   The text typed so far (the lexer input up to the last '?', with
+   trailing blanks and tabs removed) is printed once per peer of the
+   parse graph node index, followed by what may come next: the literal
+   word for word items, "<type name>" for everything else.  An eof item
+   is skipped while pm->match_items is empty.
+   pm    - parser state; pm->lex_main->input_vector holds the input
+   index - first parse graph node of the peer list to describe */
+static void
+parse_help (vlib_parse_main_t * pm, u32 index)
+{
+  vlib_parse_graph_t *node;
+  vlib_parse_item_t *item;
+  vlib_parse_type_t *type;
+  vlib_main_t *vm = pm->vlib_main;
+  u8 *help_input;
+  int i;
+
+  help_input = vec_dup (pm->lex_main->input_vector);
+
+  /* Cut the copy at the last '?'.  The 0 stored there also terminates
+     the string for the "%s" conversions below. */
+  for (i = vec_len (help_input) - 1; i >= 0; i--)
+    if (help_input[i] == '?')
+      {
+	help_input[i] = 0;
+	_vec_len (help_input) = i;
+	break;
+      }
+
+  /* Strip all trailing blanks and tabs, not just the last one.  The
+     loop stops on the last character to keep (or at -1), so i + 1 is
+     the new length and the byte at that position is 0. */
+  for (i = vec_len (help_input) - 1; i >= 0; i--)
+    {
+      if (help_input[i] != ' ' && help_input[i] != '\t')
+	break;
+      help_input[i] = 0;
+    }
+  _vec_len (help_input) = i + 1;
+
+  /* One help line per peer at this level of the parse graph. */
+  while (index != (u32) ~ 0)
+    {
+      node = pool_elt_at_index (pm->parse_graph, index);
+      item = pool_elt_at_index (pm->parse_items, node->item);
+      type = pool_elt_at_index (pm->parse_types, item->type);
+
+      if (item->type == eof_type_index && vec_len (pm->match_items) == 0)
+	/* do nothing */ ;
+      else if (item->type == word_type_index)
+	vlib_cli_output (vm, "%s %s\n", help_input, item->value.as_pointer);
+      else
+	vlib_cli_output (vm, "%s <%s>\n", help_input, type->name);
+      index = node->peer;
+    }
+  vec_free (help_input);
+}
+
+/* Recursive matcher: try to match the token at pm->current_token_index
+   against the parse graph node index and all of its peers, descending
+   through node->deeper after each successful match.
+
+   Per peer, the type's match_function decides:
+   - MATCH_VALUE: the value is appended to pm->parse_value, then as FULL
+   - MATCH_FULL: the token is consumed and matching continues one level
+     deeper; if that fails, token index, match items and the value (if
+     any) are rolled back and the next peer is tried
+   - MATCH_PARTIAL: remembered; if no peer matches outright and exactly
+     one partial match exists it is taken, more than one is AMBIGUOUS
+   - MATCH_DONE: the item's action function (if any) is invoked with the
+     collected values, which are then cleaned up
+   - MATCH_AMBIGUOUS / EVAL_FAIL / RULE: handed back to the caller
+
+   A '?' token prints help for this level and ends the parse with
+   MATCH_DONE.  Returns MATCH_FAIL when the tokens are exhausted or
+   nothing at this level matches. */
+static vlib_parse_match_t
+parse_eval_internal (vlib_parse_main_t * pm, u32 index)
+{
+  vlib_parse_graph_t *node;
+  vlib_parse_item_t *item;
+  vlib_parse_type_t *type;
+  vlib_parse_value_t value, *pv;
+  vlib_parse_match_t rv;
+  u32 *partial_matches = 0;
+  vlib_lex_token_t *t;
+  u32 save_token_index = (u32) ~ 0, save_match_items = 0;
+  int had_value = 0;
+
+  if (pm->current_token_index >= vec_len (pm->tokens))
+    return VLIB_PARSE_MATCH_FAIL;
+
+  /* current token */
+  t = vec_elt_at_index (pm->tokens, pm->current_token_index);
+
+  /* Help ? */
+  if (PREDICT_FALSE (t->token == VLIB_LEX_qmark))
+    {
+      parse_help (pm, index);
+      _vec_len (pm->match_items) = 0;
+      return VLIB_PARSE_MATCH_DONE;
+    }
+
+  /* Across all peers at this level of the parse graph */
+  while (index != (u32) ~ 0)
+    {
+      node = pool_elt_at_index (pm->parse_graph, index);
+      item = pool_elt_at_index (pm->parse_items, node->item);
+      type = pool_elt_at_index (pm->parse_types, item->type);
+
+      /*
+       * Save the token index. We may have to back up several
+       * trie plies. Type-specific match functions can consume
+       * multiple tokens, and they may not be optimally careful
+       */
+      save_token_index = pm->current_token_index;
+      save_match_items = vec_len (pm->match_items);
+      vec_add1 (pm->match_items, node->item);
+
+      if (PARSE_DEBUG > 1)
+	clib_warning ("Try to match token %U against node %d",
+		      format_vlib_lex_token, pm->lex_main, t, index);
+
+      /* Call the type-specific match function */
+      rv = type->match_function (pm, type, t, &value);
+
+      if (PARSE_DEBUG > 1)
+	clib_warning ("returned %U", format_vlib_parse_match, rv);
+
+      switch (rv)
+	{
+	case VLIB_PARSE_MATCH_VALUE:
+	  /*
+	   * Matched, and returned a value to append to the
+	   * set of args passed to the action function
+	   */
+	  value.type = item->type;
+	  vec_add1 (pm->parse_value, value);
+	  had_value = 1;
+	  /* fallthrough */
+
+	case VLIB_PARSE_MATCH_FULL:
+	unambiguous_partial_match:
+	  /* Consume the matched token */
+	  pm->current_token_index++;
+
+	  /* continue matching along this path */
+	  rv = parse_eval_internal (pm, node->deeper);
+
+	  /* this is not the right path */
+	  if (rv == VLIB_PARSE_MATCH_FAIL)
+	    {
+	      if (had_value)
+		{
+		  /* Delete the value */
+		  value = pm->parse_value[vec_len (pm->parse_value) - 1];
+		  parse_cleanup_value (pm, &value);
+		  _vec_len (pm->parse_value) -= 1;
+		  /* The value is gone.  Forget it, so that a later peer
+		     which matches without a value does not pop an entry
+		     it never pushed. */
+		  had_value = 0;
+		}
+	      /* Continue with the next sibling */
+	      pm->current_token_index = save_token_index;
+	      _vec_len (pm->match_items) = save_match_items;
+	      index = node->peer;
+	      break;
+	    }
+	  /* Done at this level: drop any partial matches recorded for
+	     earlier peers (no-op when there are none). */
+	  vec_free (partial_matches);
+	  return rv;
+
+	case VLIB_PARSE_MATCH_PARTIAL:
+	  /* Partial (substring) match, remember it but keep going */
+	  vec_add1 (partial_matches, node - pm->parse_graph);
+	  index = node->peer;
+	  break;
+
+	case VLIB_PARSE_MATCH_FAIL:
+	  /* Continue with the next sibling */
+	  index = node->peer;
+	  _vec_len (pm->match_items) = save_match_items;
+	  break;
+
+	case VLIB_PARSE_MATCH_DONE:
+	  /* Parse complete, invoke the action function */
+	  if (PARSE_DEBUG > 0)
+	    clib_warning ("parse_value: %U", format_vlib_parse_value, pm);
+
+	  {
+	    vlib_parse_eval_function_t *f = item->value.as_pointer;
+	    if (f)
+	      rv = f (pm, item, pm->parse_value);
+	  }
+
+	  vec_foreach (pv, pm->parse_value) parse_cleanup_value (pm, pv);
+	  _vec_len (pm->parse_value) = 0;
+	  _vec_len (pm->match_items) = 0;
+	  vec_free (partial_matches);
+	  return rv;
+
+	case VLIB_PARSE_MATCH_AMBIGUOUS:
+	case VLIB_PARSE_MATCH_EVAL_FAIL:
+	case VLIB_PARSE_MATCH_RULE:
+	  _vec_len (pm->match_items) = save_match_items;
+	  vec_free (partial_matches);
+	  return rv;
+	}
+    }
+
+  /*
+   * Out of siblings. If we have exactly one partial match
+   * we win
+   */
+  if (vec_len (partial_matches) == 1)
+    {
+      /* Re-enter the MATCH_FULL handling above for the one candidate.
+         save_token_index / save_match_items still hold the values of
+         the last peer examined. */
+      index = partial_matches[0];
+      node = pool_elt_at_index (pm->parse_graph, index);
+      vec_free (partial_matches);
+      goto unambiguous_partial_match;
+    }
+
+  /* Ordinary loser */
+  rv = VLIB_PARSE_MATCH_FAIL;
+
+  /* Ambiguous loser */
+  if (vec_len (partial_matches) > 1)
+    {
+      vec_free (partial_matches);
+      rv = VLIB_PARSE_MATCH_AMBIGUOUS;
+    }
+
+  _vec_len (pm->match_items) = save_match_items;
+  return rv;
+}
+
+/* Type-match callback for typed rules: run the type's own rule graph. */
+vlib_parse_match_t
+rule_match (vlib_parse_main_t * pm, vlib_parse_type_t * type,
+	    vlib_lex_token_t * t, vlib_parse_value_t * valuep)
+{
+  /* Nesting depth, used only to label the debug traces below */
+  static int recursion_level;
+  vlib_parse_match_t sub_rv;
+  int rule_matched;
+
+  if (PARSE_DEBUG > 1)
+    clib_warning ("[%d]: try to match type %s graph index %d",
+		  recursion_level, type->name, type->rule_index);
+
+  recursion_level++;
+  sub_rv = parse_eval_internal (pm, type->rule_index);
+  recursion_level--;
+
+  rule_matched = (sub_rv == VLIB_PARSE_MATCH_RULE);
+
+  if (PARSE_DEBUG > 1 && rule_matched)
+    clib_warning ("[%d]: type %s matched", recursion_level, type->name);
+  else if (PARSE_DEBUG > 1)
+    clib_warning ("[%d]: type %s returns %U", recursion_level, type->name,
+		  format_vlib_parse_match, sub_rv);
+
+  /* MATCH_RULE stops the recursive unwind here and becomes MATCH_FULL;
+     every other result is handed back to the caller unchanged. */
+  return rule_matched ? VLIB_PARSE_MATCH_FULL : sub_rv;
+}
+
+/* Lex all of INPUT into pm->tokens, then evaluate from the root node. */
+static int
+parse_eval (vlib_parse_main_t * pm, u8 * input)
+{
+  vlib_lex_token_t *tok;
+
+  parse_reset (pm, input);
+
+  /* Append tokens until the lexer hands back EOF (EOF is stored too) */
+  for (;;)
+    {
+      vec_add2 (pm->tokens, tok, 1);
+      vlib_lex_get_token (pm->lex_main, tok);
+      if (tok->token == VLIB_LEX_eof)
+	break;
+    }
+  return parse_eval_internal (pm, pm->root_index);
+}
+
+/* Temporary vlib stub: evaluate INPUT using the global vlib_parse_main */
+vlib_parse_match_t
+vlib_parse_eval (u8 * input)
+{
+  return parse_eval (&vlib_parse_main, input);
+}
+
+/* Return the pool index of the type named t->name, adding it if new. */
+u16
+parse_type_find_or_create (vlib_parse_main_t * pm, vlib_parse_type_t * t)
+{
+  vlib_parse_type_t *new_type;
+  u8 *hash_key;
+  uword *hit = hash_get_mem (pm->parse_type_by_name_hash, t->name);
+
+  if (hit)
+    return hit[0];
+
+  /* Copy the registration; a new type has no rule graph yet */
+  pool_get (pm->parse_types, new_type);
+  *new_type = *t;
+  new_type->rule_index = (u32) ~ 0;
+  hash_key = format (0, "%s%c", new_type->name, 0);	/* private key copy */
+  hash_set_mem (pm->parse_type_by_name_hash, hash_key,
+		new_type - pm->parse_types);
+  return new_type - pm->parse_types;
+}
+
+/* Look NAME up in the type hash; (u16) ~0 means no such type. */
+u16
+parse_type_find_by_name (vlib_parse_main_t * pm, char *name)
+{
+  uword *hit = hash_get_mem (pm->parse_type_by_name_hash, name);
+
+  if (hit == 0)
+    return (u16) ~ 0;
+
+  return hit[0];
+}
+
+/* Return the index of the item exactly equal to *ITEM, adding it if new. */
+u32
+parse_item_find_or_create (vlib_parse_main_t * pm, vlib_parse_item_t * item)
+{
+  vlib_parse_item_t *new_item;
+  u32 new_index;
+  uword *hit = mhash_get (&pm->parse_item_hash, item);
+
+  if (hit)
+    return hit[0];
+
+  pool_get (pm->parse_items, new_item);
+  *new_item = *item;
+  new_index = new_item - pm->parse_items;
+  mhash_set (&pm->parse_item_hash, new_item, new_index, 0);
+  return new_index;
+}
+/* Cache the builtin token type indices and start with an empty root graph. */
+static void
+parse_type_and_graph_init (vlib_parse_main_t * pm)
+{
+  u32 eof_index;
+  vlib_parse_type_t type;
+  vlib_parse_item_t item;
+
+  memset (&type, 0, sizeof (type));
+
+#define foreach_token_type                      \
+  _ (eof)                                       \
+  _ (rule_eof)                                  \
+  _ (word)                                      \
+  _ (number)                                    \
+  _ (plus)                                      \
+  _ (minus)                                     \
+  _ (star)                                      \
+  _ (slash)                                     \
+  _ (lpar)                                      \
+  _ (rpar)
+
+#define _(a) a##_type_index = parse_type_find_by_name (pm, #a);
+  foreach_token_type		/* sets <name>_type_index for each name above */
+#undef _
+    memset (&item, 0, sizeof (item));
+  item.type = eof_type_index;
+
+  eof_index = parse_item_find_or_create (pm, &item);
+  pm->root_index = (u32) ~ 0;	/* no top-level rule graph yet */
+
+#if 0				/* disabled: EOF node as the initial root */
+  pool_get (pm->parse_graph, g);
+  memset (g, 0xff, sizeof (*g));
+  g->item = eof_index;
+  pm->root_index = 0;
+#endif
+}
+
+
+
+/* Lex the rule text pr->initializer into pm->tokens (EOF included). */
+static void
+tokenize (vlib_parse_main_t * pm, parse_registration_t * pr)
+{
+  vlib_lex_token_t *tok;
+  pm->register_input =
+    format (pm->register_input, "%s%c", pr->initializer, 0);
+  parse_reset (pm, pm->register_input);
+  for (;;)
+    {
+      vec_add2 (pm->tokens, tok, 1);
+      vlib_lex_get_token (pm->lex_main, tok);
+      if (tok->token == VLIB_LEX_eof)
+	break;
+    }
+  _vec_len (pm->register_input) = 0;	/* next format call starts over */
+}
+
+/* Nonzero when the token vector starts "<" word ">" "=", i.e. the
+   registration has the form <mytype> = blah blah blah */
+static int
+is_typed_rule (vlib_parse_main_t * pm)
+{
+  vlib_lex_token_t *tok = vec_elt_at_index (pm->tokens, 0);
+
+  if (vec_len (pm->tokens) < 4)
+    return 0;
+
+  return (tok[0].token == VLIB_LEX_lt && tok[1].token == VLIB_LEX_word
+	  && tok[2].token == VLIB_LEX_gt && tok[3].token == VLIB_LEX_equals);
+}
+
+/* Return 1 if the registration token at T matches graph node ITEM, with
+   *TOKEN_INCREMENT set to the tokens consumed (1 or 3); else return 0. */
+static int
+token_matches_graph_node (vlib_parse_main_t * pm, vlib_lex_token_t * t,
+			  vlib_parse_graph_t * node, vlib_parse_item_t * item,
+			  vlib_parse_type_t * type, u32 * token_increment)
+{
+  /* EOFs don't match */
+  if (t->token == VLIB_LEX_eof)
+    return 0;
+
+  /* New chain element is a word */
+  if (t->token == VLIB_LEX_word)
+    {
+      /* but the item in hand is not a word */
+      if (item->type != word_type_index)
+	return 0;
+
+      /* Or it's not this particular word */
+      if (strcmp (t->value.as_pointer, item->value.as_pointer))
+	return 0;
+      *token_increment = 1;
+      return 1;
+    }
+  /* New chain element is a type-name: < TYPE-NAME > */
+  if (t->token == VLIB_LEX_lt)
+    {
+      u16 token_type_index;
+
+      /* Expect a word, then '>' (the format is clib_warning's 1st arg) */
+      if (t[1].token != VLIB_LEX_word || t[2].token != VLIB_LEX_gt)
+	{
+	  clib_warning ("broken type name in '%s'", pm->register_input);
+	  return 0;
+	}
+
+      token_type_index = parse_type_find_by_name (pm, t[1].value.as_pointer);
+      if (token_type_index == (u16) ~ 0)
+	{
+	  clib_warning ("unknown type '%s'", t[1].value.as_pointer);
+	  return 0;
+	}
+
+      /* It's a known type but does not match. */
+      if (item->type != token_type_index)
+	return 0;
+
+      *token_increment = 3;
+      return 1;
+    }
+  clib_warning ("BUG: t->token = %d", t->token);
+  return 0;
+}
+/* Turn the tokens from T onward into a ->deeper chain of new graph nodes. */
+u32
+generate_subgraph_from_tokens (vlib_parse_main_t * pm,
+			       vlib_lex_token_t * t,
+			       u32 * new_subgraph_depth,
+			       parse_registration_t * pr, int not_a_rule)
+{
+  vlib_parse_graph_t *g, *last_g;
+  vlib_parse_item_t new_item;
+  u32 rv = (u32) ~ 0, new_item_index, last_index = (u32) ~ 0;
+  u16 token_type_index;
+  u32 depth = 0;
+
+  while (t < pm->tokens + vec_len (pm->tokens))
+    {
+      memset (&new_item, 0, sizeof (new_item));
+
+      if (t->token == VLIB_LEX_word)
+	{
+	  new_item.type = word_type_index;
+	  new_item.value.as_pointer = vec_dup ((u8 *) t->value.as_pointer);
+	  new_item_index = parse_item_find_or_create (pm, &new_item);
+	  t++;
+	}
+      else if (t->token == VLIB_LEX_lt)
+	{
+	  if (t[1].token != VLIB_LEX_word || t[2].token != VLIB_LEX_gt)
+	    {
+	      clib_warning ("broken type name in '%s'", pm->register_input);
+	      goto screwed;
+	    }
+	  token_type_index = parse_type_find_by_name (pm,
+						      t[1].value.as_pointer);
+	  if (token_type_index == (u16) ~ 0)
+	    {
+	      clib_warning ("unknown type 2 '%s'", t[1].value.as_pointer);
+	      goto screwed;
+	    }
+
+	  new_item.type = token_type_index;
+	  new_item.value.as_pointer = 0;
+	  new_item_index = parse_item_find_or_create (pm, &new_item);
+	  t += 3;		/* skip < <type-name> and > */
+	}
+      else if (t->token == VLIB_LEX_eof)
+	{
+	screwed:		/* error paths also emit an EOF-type item */
+	  new_item.type = not_a_rule ? eof_type_index : rule_eof_type_index;
+	  new_item.value.as_pointer = pr->eof_match;
+	  new_item_index = parse_item_find_or_create (pm, &new_item);
+	  t++;
+	}
+      else
+	{
+	  clib_warning ("unexpected token %U index %d in '%s'",
+			format_vlib_lex_token, pm->lex_main, t,
+			t - pm->tokens, pm->register_input);
+	  goto screwed;
+	}
+
+      pool_get (pm->parse_graph, g);
+      memset (g, 0xff, sizeof (*g));	/* peer and deeper start as ~0 */
+      g->item = new_item_index;
+      depth++;
+
+      if (rv == (u32) ~ 0)	/* first node made: it is the root */
+	{
+	  rv = g - pm->parse_graph;
+	  last_index = rv;
+	}
+      else			/* hang the new node below the previous one */
+	{
+	  last_g = pool_elt_at_index (pm->parse_graph, last_index);
+	  last_index = last_g->deeper = g - pm->parse_graph;
+	}
+    }
+  *new_subgraph_depth = depth;	/* number of nodes created */
+  return rv;			/* root index, or ~0 if no node was made */
+}
+
+/* Longest chain of nodes reachable from INDEX: one more than the deepest
+   ->deeper subtree among INDEX and its peers; 0 for the ~0 null index. */
+static u32
+measure_depth (vlib_parse_main_t * pm, u32 index)
+{
+  vlib_parse_graph_t *cur;
+  vlib_parse_item_t *first_item;
+  u32 deepest = 0;
+  u32 peer_index;
+
+  if (index == (u32) ~ 0)
+    return 0;
+
+  /* A leading EOF item counts as a single node */
+  cur = pool_elt_at_index (pm->parse_graph, index);
+  first_item = pool_elt_at_index (pm->parse_items, cur->item);
+  if (first_item->type == eof_type_index)
+    return 1;
+
+  for (peer_index = index; peer_index != (u32) ~ 0; peer_index = cur->peer)
+    {
+      u32 below;
+      cur = pool_elt_at_index (pm->parse_graph, peer_index);
+      below = measure_depth (pm, cur->deeper);
+      deepest = below > deepest ? below : deepest;
+    }
+  return deepest + 1;
+}
+/* Link a freshly built subgraph into an existing peer chain, by depth. */
+static void
+add_subgraph_to_graph (vlib_parse_main_t * pm,
+		       u32 last_matching_index,
+		       u32 graph_root_index,
+		       u32 new_subgraph_index, u32 new_subgraph_depth)
+{
+  vlib_parse_graph_t *parent_node;
+  int new_subgraph_longest = 1;
+  u32 current_peer_index;
+  u32 current_depth;
+  vlib_parse_graph_t *current_peer = 0;
+  vlib_parse_graph_t *new_subgraph_node =
+    pool_elt_at_index (pm->parse_graph, new_subgraph_index);
+
+  /*
+   * Case 1: top-level peer. Splice into the top-level
+   * peer chain according to rule depth
+   */
+  if (last_matching_index == (u32) ~ 0)
+    {
+      u32 index = graph_root_index;
+      while (1)
+	{
+	  current_peer = pool_elt_at_index (pm->parse_graph, index);
+	  current_depth = measure_depth (pm, index);
+	  if (current_depth < new_subgraph_depth
+	      || current_peer->peer == (u32) ~ 0)
+	    break;
+	  index = current_peer->peer;
+	}
+      new_subgraph_node->peer = current_peer->peer;
+      current_peer->peer = new_subgraph_index;	/* goes after current_peer */
+      return;
+    }
+
+  parent_node = pool_elt_at_index (pm->parse_graph, last_matching_index);
+  current_peer_index = parent_node->deeper;	/* Case 2: parent's children */
+
+  while (current_peer_index != (u32) ~ 0)
+    {
+      current_peer = pool_elt_at_index (pm->parse_graph, current_peer_index);
+      current_depth = measure_depth (pm, current_peer_index);
+      if (current_depth < new_subgraph_depth)
+	break;
+      new_subgraph_longest = 0;	/* a child at least as deep exists */
+      current_peer_index = current_peer->peer;
+    }
+
+  ASSERT (current_peer);	/* i.e. the parent had at least one child */
+
+  if (new_subgraph_longest)
+    {
+      new_subgraph_node->peer = parent_node->deeper;	/* new first child */
+      parent_node->deeper = new_subgraph_index;
+    }
+  else
+    {
+      new_subgraph_node->peer = current_peer->peer;	/* after current_peer */
+      current_peer->peer = new_subgraph_index;
+    }
+}
+/* Add one registration's rule to the main graph or to its type's graph. */
+static clib_error_t *
+parse_register_one (vlib_parse_main_t * pm, parse_registration_t * pr)
+{
+  u32 graph_root_index;
+  u16 subgraph_type_index = (u16) ~ 0;
+  vlib_parse_type_t *subgraph_type = 0;
+  vlib_lex_token_t *t;
+  vlib_parse_graph_t *node;
+  u32 node_index, last_index, token_increment, new_subgraph_index;
+  u32 new_subgraph_depth, last_matching_index;
+  vlib_parse_item_t *item;
+  vlib_parse_type_t *type;
+
+  int use_main_graph = 1;
+
+  tokenize (pm, pr);
+
+  /* A typed rule? */
+  if (is_typed_rule (pm))
+    {
+      /* Get the type and its current subgraph root, if any */
+      t = vec_elt_at_index (pm->tokens, 1);
+      subgraph_type_index = parse_type_find_by_name (pm, t->value.as_pointer);
+      if (subgraph_type_index == (u16) ~ 0)
+	return clib_error_return (0, "undeclared type '%s'",
+				  t->value.as_pointer);
+      subgraph_type =
+	pool_elt_at_index (pm->parse_types, subgraph_type_index);
+      graph_root_index = subgraph_type->rule_index;
+      /* Skip "mytype> =": t now points at the first token after '=' */
+      t += 3;
+      use_main_graph = 0;
+    }
+  else
+    {
+      /* top-level graph */
+      graph_root_index = pm->root_index;
+      t = vec_elt_at_index (pm->tokens, 0);
+    }
+
+  last_matching_index = (u32) ~ 0;	/* no existing node matched yet */
+  last_index = node_index = graph_root_index;
+
+  /* Find the first token which isn't already being parsed */
+  while (t < pm->tokens + vec_len (pm->tokens) && node_index != (u32) ~ 0)
+    {
+      node = pool_elt_at_index (pm->parse_graph, node_index);
+      item = pool_elt_at_index (pm->parse_items, node->item);
+      type = pool_elt_at_index (pm->parse_types, item->type);
+      last_index = node_index;	/* NOTE(review): last_index is never read */
+
+      if (token_matches_graph_node
+	  (pm, t, node, item, type, &token_increment))
+	{
+	  t += token_increment;
+	  last_matching_index = node_index;
+	  node_index = node->deeper;	/* matched: descend */
+	}
+      else
+	node_index = node->peer;	/* no match: try the next sibling */
+    }
+
+  new_subgraph_index =
+    generate_subgraph_from_tokens (pm, t, &new_subgraph_depth, pr,
+				   use_main_graph);
+
+  /* trivial cases: first graph node or first type rule */
+  if (graph_root_index == (u32) ~ 0)
+    {
+      if (use_main_graph)
+	pm->root_index = new_subgraph_index;
+      else
+	subgraph_type->rule_index = new_subgraph_index;
+      return 0;
+    }
+
+  add_subgraph_to_graph (pm, last_matching_index, graph_root_index,
+			 new_subgraph_index, new_subgraph_depth);
+  return 0;
+}
+
+/* Queue each registration in [lo, hi) on pm->parse_registrations. */
+static clib_error_t *
+parse_register (vlib_main_t * vm, parse_registration_t * lo,
+		parse_registration_t * hi, vlib_parse_main_t * pm)
+{
+  parse_registration_t *reg = lo;
+
+  for (; reg < hi; reg = vlib_elf_section_data_next (reg, 0))
+    vec_add1 (pm->parse_registrations, reg);
+
+  return 0;
+}
+/* Enter one parse type into the type pool/hash; always returns 0. */
+static clib_error_t *
+parse_register_one_type (vlib_parse_main_t * pm, vlib_parse_type_t * rp)
+{
+  (void) parse_type_find_or_create (pm, (vlib_parse_type_t *) rp);
+  return 0;
+}
+
+/* Register every parse type found in the section range [lo, hi). */
+static clib_error_t *
+parse_type_register (vlib_main_t * vm, vlib_parse_type_t * lo,
+		     vlib_parse_type_t * hi, vlib_parse_main_t * pm)
+{
+  vlib_parse_type_t *type_reg;
+
+  for (type_reg = lo; type_reg < hi;
+       type_reg = vlib_elf_section_data_next (type_reg, 0))
+    {
+      clib_error_t *err = parse_register_one_type (pm, type_reg);
+
+      /* Give up at the first failing registration */
+      if (err)
+	return err;
+    }
+  return 0;
+}
+/* Weak default: just calls vlib_lex_add_table ("ignore_everything"). */
+clib_error_t *vlib_stdlex_init (vlib_main_t * vm) __attribute__ ((weak));
+clib_error_t *
+vlib_stdlex_init (vlib_main_t * vm)
+{
+  (void) vlib_lex_add_table ("ignore_everything");
+  return 0;
+}
+
+/*
+ * Length of a registration's rule, computed once and cached in
+ * r->rule_length: the number of tokens (EOF included), not counting
+ * any '<' or '>' token, less two more for the "<foo> = " prefix of a
+ * typed rule.
+ */
+static int
+compute_rule_length (parse_registration_t * r)
+{
+  vlib_parse_main_t *pm = &vlib_parse_main;
+  vlib_lex_token_t *tok;
+  int n_tokens;
+
+  /* Already measured */
+  if (r->rule_length)
+    return r->rule_length;
+
+  tokenize (pm, r);
+  n_tokens = vec_len (pm->tokens);
+
+  /* Account for "<foo> = " in "<foo> = bar" etc. */
+  if (is_typed_rule (pm))
+    n_tokens -= 2;
+
+  /* Angle-bracket tokens do not count */
+  vec_foreach (tok, pm->tokens)
+  {
+    if (tok->token == VLIB_LEX_lt || tok->token == VLIB_LEX_gt)
+      n_tokens -= 1;
+  }
+
+  ASSERT (n_tokens > 0);
+  r->rule_length = n_tokens;
+  return n_tokens;
+}
+
+static int			/* sort comparator: longest rule first */
+rule_length_compare (parse_registration_t ** r1, parse_registration_t ** r2)
+{
+  /* parse_registrations holds pointers; the sort passes element addresses */
+  compute_rule_length (*r1);
+  compute_rule_length (*r2);
+  return (*r2)->rule_length - (*r1)->rule_length;
+}
+
+/* Init: set up parser state, then register all ELF-section types/rules. */
+static clib_error_t *
+parse_init (vlib_main_t * vm)
+{
+  vlib_parse_main_t *pm = &vlib_parse_main;
+  vlib_lex_main_t *lm = &vlib_lex_main;
+  vlib_elf_section_bounds_t *b, *bounds;
+  clib_error_t *error = 0;
+  parse_registration_t *rule;
+  int i;
+
+  if ((error = vlib_call_init_function (vm, lex_onetime_init)))
+    return error;
+
+  if ((error = vlib_stdlex_init (vm)))
+    return error;
+
+  if ((error = vlib_call_init_function (vm, parse_builtin_init)))
+    return error;
+
+  pm->vlib_main = vm;
+  pm->lex_main = lm;
+
+  mhash_init (&pm->parse_item_hash, sizeof (u32), sizeof (vlib_parse_item_t));
+  pm->parse_type_by_name_hash = hash_create_string (0, sizeof (u32));
+
+  vec_validate (pm->parse_value, 16);
+  vec_validate (pm->tokens, 16);
+  vec_validate (pm->register_input, 32);
+  vec_validate (pm->match_items, 16);
+  /* Keep the storage allocated above, but start every vector empty */
+  _vec_len (pm->parse_value) = 0;
+  _vec_len (pm->tokens) = 0;
+  _vec_len (pm->register_input) = 0;
+  _vec_len (pm->match_items) = 0;
+
+  bounds = vlib_get_elf_section_bounds (vm, "parse_type_registrations");
+  vec_foreach (b, bounds)
+  {
+    error = parse_type_register (vm, b->lo, b->hi, pm);
+    if (error)
+      break;
+  }
+  vec_free (bounds);
+  /* NOTE(review): an error from the loop above is overwritten below */
+  parse_type_and_graph_init (pm);
+
+  bounds = vlib_get_elf_section_bounds (vm, "parse_registrations");
+  vec_foreach (b, bounds)
+  {
+    error = parse_register (vm, b->lo, b->hi, pm);
+    if (error)
+      break;
+  }
+  vec_free (bounds);
+
+  vec_sort_with_function (pm->parse_registrations, rule_length_compare);
+
+  for (i = 0; i < vec_len (pm->parse_registrations); i++)
+    {
+      rule = pm->parse_registrations[i];
+      parse_register_one (pm, rule);	/* NOTE(review): result is ignored */
+    }
+
+  return error;
+}
+
+VLIB_INIT_FUNCTION (parse_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/parse.h b/src/vlib/parse.h
new file mode 100644
index 00000000..036e7447
--- /dev/null
+++ b/src/vlib/parse.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_vlib_parse_h
+#define included_vlib_parse_h
+
+#include <vlib/vlib.h>
+#include <vlib/lex.h>
+#include <vppinfra/mhash.h>
+
+typedef struct
+{
+  /* Matched value; as_u8 spans 32 - sizeof (u16) = 30 bytes. */
+  union
+  {
+    u8 as_u8[32 - 1 * sizeof (u16)];
+    void *as_pointer;
+    uword as_uword;
+    word as_word;
+    u64 as_u64;
+  } value;
+
+  /* 16 bit type placed last, so the value union starts the packed struct. */
+  u16 type;
+} __attribute ((packed))
+  vlib_parse_value_t;
+
+/* Instance of a type; the whole struct is the parse_item_hash key. */
+     typedef struct
+     {
+       u32
+	 type;			/* index into parse_types */
+
+       u32
+	 origin;
+
+       u32
+	 help_index;
+       /* Word text for word items; the eof_match pointer for EOF items. */
+       union
+       {
+	 void *
+	   as_pointer;
+	 uword
+	   as_uword;
+       } value;
+     } vlib_parse_item_t;
+
+     typedef struct
+     {
+       /* Index of item for this node. */
+       u32
+	 item;
+
+       /* Graph index of peer (sibling) node (linked list of peers). */
+       u32
+	 peer;
+
+       /* Graph index of deeper (child) node (linked list of children). */
+       u32
+	 deeper;
+     } vlib_parse_graph_t;
+
+#define foreach_parse_match_type                \
+  _(MATCH_DONE)					\
+  _(MATCH_RULE)					\
+  _(MATCH_FAIL)					\
+  _(MATCH_FULL)					\
+  _(MATCH_VALUE)				\
+  _(MATCH_PARTIAL)				\
+  _(MATCH_AMBIGUOUS)				\
+  _(MATCH_EVAL_FAIL)
+
+     typedef enum
+     {
+#define _(a) VLIB_PARSE_##a,
+       foreach_parse_match_type
+#undef _
+     } vlib_parse_match_t;
+
+     struct vlib_parse_type;
+     struct vlib_parse_main;
+
+     typedef
+     vlib_parse_match_t (vlib_parse_match_function_t)
+  (struct vlib_parse_main *,
+   struct vlib_parse_type *, vlib_lex_token_t *, vlib_parse_value_t *);
+     typedef void (vlib_parse_value_cleanup_function_t) (vlib_parse_value_t
+							 *);
+
+     typedef struct vlib_parse_type
+     {
+       /* Type name. */
+       char *
+	 name;
+       /* Called to match one token against an item of this type. */
+       vlib_parse_match_function_t *
+	 match_function;
+       /* NOTE(review): presumably releases a matched value -- confirm. */
+       vlib_parse_value_cleanup_function_t *
+	 value_cleanup_function;
+       /* Formatter for values of this type, may be 0. */
+       format_function_t *
+	 format_value;
+       /* Root graph index of this type's rule; ~0 when it has none. */
+       u32
+	 rule_index;
+     } vlib_parse_type_t;
+
+     typedef struct
+     {
+       char *
+	 initializer;		/* rule text, tokenized at registration */
+       void *
+	 eof_match;		/* stored as the value of the rule's EOF item */
+       int
+	 rule_length;		/* cached by compute_rule_length; 0 = unset */
+     } parse_registration_t;
+
+     typedef struct vlib_parse_main
+     {
+       /* (type, origin, help, value) tuples */
+       vlib_parse_item_t *
+	 parse_items;
+       mhash_t
+	 parse_item_hash;
+
+       /* (item, peer, deeper) tuples */
+       vlib_parse_graph_t *
+	 parse_graph;
+       u32
+	 root_index;
+
+       u8 *
+	 register_input;
+
+       /* parser types */
+       vlib_parse_type_t *
+	 parse_types;
+       uword *
+	 parse_type_by_name_hash;
+
+       /* Vector of MATCH_VALUEs */
+       vlib_parse_value_t *
+	 parse_value;
+       u32 *
+	 match_items;
+
+       /* Parse registrations */
+       parse_registration_t **
+	 parse_registrations;
+
+       /* Token vector */
+       vlib_lex_token_t *
+	 tokens;
+       u32
+	 current_token_index;
+
+       vlib_lex_main_t *
+	 lex_main;
+       vlib_main_t *
+	 vlib_main;
+     } vlib_parse_main_t;
+
+     vlib_parse_main_t		/* NOTE(review): object defined in a header; */
+       vlib_parse_main;		/* presumably should be extern -- confirm.   */
+
+     typedef
+     vlib_parse_match_t (vlib_parse_eval_function_t)
+  (vlib_parse_main_t *, vlib_parse_item_t *, vlib_parse_value_t *);
+
+vlib_parse_match_t
+vlib_parse_eval (u8 * input);
+
+     format_function_t format_vlib_parse_value;
+
+/* FIXME need these to be global? */
+     vlib_parse_match_function_t rule_match, eof_match, word_match,
+       number_match;
+
+#define _PARSE_REGISTRATION_DATA(x) \
+VLIB_ELF_SECTION_DATA(x##_registration,parse_registration_t,parse_registrations)
+
+#define PARSE_INIT(x, s, e)                     \
+static _PARSE_REGISTRATION_DATA(x) = {          \
+    .initializer = s,                           \
+    .eof_match = e,                             \
+};
+
+#define _PARSE_TYPE_REGISTRATION_DATA(x) \
+VLIB_ELF_SECTION_DATA(x##_type_registration,vlib_parse_type_t, \
+parse_type_registrations)
+
+#define PARSE_TYPE_INIT(n, m, c, f)             \
+static _PARSE_TYPE_REGISTRATION_DATA(n) = {     \
+    .name = #n,                                 \
+    .match_function = m,			\
+    .value_cleanup_function = c,		\
+    .format_value = f,				\
+};
+
+#endif /* included_vlib_parse_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/parse_builtin.c b/src/vlib/parse_builtin.c
new file mode 100644
index 00000000..0ce716b5
--- /dev/null
+++ b/src/vlib/parse_builtin.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <vlib/parse.h>
+
+/* Value pointer of the item most recently pushed on pm->match_items. */
+always_inline void *
+parse_last_match_value (vlib_parse_main_t * pm)
+{
+  uword last_slot = vec_len (pm->match_items) - 1;
+  u32 item_index = vec_elt (pm->match_items, last_slot);
+
+  return pool_elt_at_index (pm->parse_items, item_index)->value.as_pointer;
+}
+
+vlib_parse_match_t
+eof_match (vlib_parse_main_t * pm, vlib_parse_type_t * type,
+	   vlib_lex_token_t * t, vlib_parse_value_t * valuep)
+{
+  int at_eof = (t->token == VLIB_LEX_eof);	/* end of input reached? */
+  return at_eof ? VLIB_PARSE_MATCH_DONE : VLIB_PARSE_MATCH_FAIL;
+}
+
+PARSE_TYPE_INIT (eof, eof_match, 0 /* cleanup value */ ,
+		 0 /* format value */ );
+/* Match function of the rule_eof type that ends a typed rule's chain. */
+vlib_parse_match_t
+rule_eof_match (vlib_parse_main_t * pm, vlib_parse_type_t * type,
+		vlib_lex_token_t * t, vlib_parse_value_t * valuep)
+{
+  vlib_parse_match_function_t *fp = parse_last_match_value (pm);
+  pm->current_token_index--;	/* NOTE(review): offsets a caller's ++? */
+  return fp ? fp (pm, type, t, valuep) : VLIB_PARSE_MATCH_RULE;
+}
+
+PARSE_TYPE_INIT (rule_eof, rule_eof_match, 0, 0);
+
+/* Compare a word token with the current item's word: FULL when equal,
+   PARTIAL when the token is a proper prefix of it, FAIL otherwise. */
+vlib_parse_match_t
+word_match (vlib_parse_main_t * pm, vlib_parse_type_t * type,
+	    vlib_lex_token_t * t, vlib_parse_value_t * valuep)
+{
+  u8 *typed, *expected;
+
+  if (t->token != VLIB_LEX_word)
+    return VLIB_PARSE_MATCH_FAIL;
+
+  typed = t->value.as_pointer;
+  expected = parse_last_match_value (pm);
+
+  /* Walk both strings in step until the typed word runs out */
+  for (; *typed; typed++, expected++)
+    if (*typed != *expected)
+      return VLIB_PARSE_MATCH_FAIL;
+
+  return *expected ? VLIB_PARSE_MATCH_PARTIAL : VLIB_PARSE_MATCH_FULL;
+}
+
+PARSE_TYPE_INIT (word, word_match, 0 /* clnup value */ ,
+		 0 /* format value */ );
+
+vlib_parse_match_t
+number_match (vlib_parse_main_t * pm, vlib_parse_type_t * type,
+	      vlib_lex_token_t * t, vlib_parse_value_t * valuep)
+{
+  /* Anything but a number token is no match */
+  if (t->token != VLIB_LEX_number)
+    return VLIB_PARSE_MATCH_FAIL;
+  /* Hand the token's numeric value back through VALUEP */
+  valuep->value.as_uword = t->value.as_uword;
+  return VLIB_PARSE_MATCH_VALUE;
+}
+
+/* Format a number value as decimal followed by hex in parentheses. */
+static u8 *
+format_value_number (u8 * s, va_list * args)
+{
+  vlib_parse_value_t *pv = va_arg (*args, vlib_parse_value_t *);
+  uword number = pv->value.as_uword;
+
+  /* Pick the length modifier that matches the width of uword */
+  if (BITS (uword) != 64)
+    return format (s, "%ld(0x%lx)", number, number);
+  return format (s, "%lld(0x%llx)", number, number);
+}
+
+PARSE_TYPE_INIT (number, number_match, 0 /* cln value */ ,
+		 format_value_number /* fmt value */ );
+
+
+#define foreach_vanilla_lex_match_function      \
+    _(plus)                                     \
+    _(minus)                                    \
+    _(star)                                     \
+    _(slash)                                    \
+    _(lpar)                                     \
+    _(rpar)
+
+#define LEX_MATCH_DEBUG 0
+/* Per token NAME: NAME_match (FULL on VLIB_LEX_NAME, else FAIL) + type. */
+#define _(name)                                                 \
+vlib_parse_match_t name##_match (vlib_parse_main_t *pm,         \
+                                 vlib_parse_type_t *type,       \
+                                 vlib_lex_token_t *t,           \
+                                 vlib_parse_value_t *valuep)    \
+{                                                               \
+  if (LEX_MATCH_DEBUG > 0)                                      \
+    clib_warning ("against %U returns %s",                      \
+                  format_vlib_lex_token, pm->lex_main, t,       \
+                  (t->token == VLIB_LEX_##name)                 \
+                  ? "VLIB_PARSE_MATCH_FULL" :                   \
+                  "VLIB_PARSE_MATCH_FAIL");                     \
+  if (t->token == VLIB_LEX_##name)                              \
+    return VLIB_PARSE_MATCH_FULL;                               \
+  return VLIB_PARSE_MATCH_FAIL;                                 \
+}                                                               \
+                                                                \
+PARSE_TYPE_INIT (name, name##_match, 0 /* cln value */,         \
+                 0 /* fmt val */);
+
+foreach_vanilla_lex_match_function
+#undef _
+/* Empty init function; it exists so that this file gets linked in. */
+static clib_error_t *
+parse_builtin_init (vlib_main_t * vm)
+{
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (parse_builtin_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/pci/pci.c b/src/vlib/pci/pci.c
new file mode 100644
index 00000000..7100064d
--- /dev/null
+++ b/src/vlib/pci/pci.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * pci.c: Linux user space PCI bus management.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/pci/pci.h>
+#include <vlib/unix/unix.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
+
+vlib_pci_main_t pci_main;
+
+/* Find the device registered under ADDR; returns 0 when there is none. */
+vlib_pci_device_t *
+vlib_get_pci_device (vlib_pci_addr_t * addr)
+{
+  vlib_pci_main_t *pm = &pci_main;
+  uword *slot = hash_get (pm->pci_dev_index_by_pci_addr, addr->as_u32);
+
+  if (!slot)
+    return 0;
+
+  return vec_elt_at_index (pm->pci_devs, slot[0]);
+}
+
+/* CLI handler for "show pci [all]": one table row per PCI device. */
+static clib_error_t *
+show_pci_fn (vlib_main_t * vm,
+	     unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  vlib_pci_main_t *pm = &pci_main;
+  vlib_pci_device_t *d;
+  int show_all = 0;
+  u8 *s = 0;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "all"))
+	show_all = 1;
+      else
+	return clib_error_return (0, "unknown input `%U'",
+				  format_unformat_error, input);
+    }
+
+  vlib_cli_output (vm, "%-13s%-5s%-12s%-13s%-16s%-32s%s",
+		   "Address", "Sock", "VID:PID", "Link Speed", "Driver",
+		   "Product Name", "Vital Product Data");
+
+  /* *INDENT-OFF* */
+  pool_foreach (d, pm->pci_devs, ({
+    /* NB: format_vlib_pci_vpd fetches its id with va_arg (.., u8 *) */
+    if (d->device_class != PCI_CLASS_NETWORK_ETHERNET && !show_all)
+      continue;
+
+    vec_reset_length (s);
+    if (d->numa_node >= 0)
+      s = format (s, "  %d", d->numa_node);
+
+    vlib_cli_output (vm, "%-13U%-5v%04x:%04x   %-13U%-16s%-32v%U",
+		     format_vlib_pci_addr, &d->bus_address, s,
+		     d->vendor_id, d->device_id,
+		     format_vlib_pci_link_speed, d,
+		     d->driver_name ? (char *) d->driver_name : "",
+		     d->product_name,
+		     format_vlib_pci_vpd, d->vpd_r, (u8 *) 0);
+  }));
+/* *INDENT-ON* */
+
+  vec_free (s);
+  return 0;
+}
+
+/* Parse "domain:bus:slot.function" (all hex) into a vlib_pci_addr_t;
+   returns 1 on success, 0 (address untouched) when the input differs. */
+uword
+unformat_vlib_pci_addr (unformat_input_t * input, va_list * args)
+{
+  vlib_pci_addr_t *pci_addr = va_arg (*args, vlib_pci_addr_t *);
+  u32 domain, bus, slot, function;
+
+  if (unformat (input, "%x:%x:%x.%x", &domain, &bus, &slot, &function) == 0)
+    return 0;
+  pci_addr->domain = domain;
+  pci_addr->bus = bus;
+  pci_addr->slot = slot;
+  pci_addr->function = function;
+  return 1;
+}
+/* Format a PCI address as domain:bus:slot.function, all fields in hex. */
+u8 *
+format_vlib_pci_addr (u8 * s, va_list * va)
+{
+  vlib_pci_addr_t *addr = va_arg (*va, vlib_pci_addr_t *);
+  return format (s, "%04x:%02x:%02x.%x", addr->domain, addr->bus,
+		 addr->slot, addr->function);
+}
+/* Format a PCI address as bus/slot/function in hex, without the domain. */
+u8 *
+format_vlib_pci_handle (u8 * s, va_list * va)
+{
+  vlib_pci_addr_t *addr = va_arg (*va, vlib_pci_addr_t *);
+  return format (s, "%x/%x/%x", addr->bus, addr->slot, addr->function);
+}
+
+u8 *				/* "<speed> GT/s x<width>", or "unknown" */
+format_vlib_pci_link_speed (u8 * s, va_list * va)
+{
+  vlib_pci_device_t *d = va_arg (*va, vlib_pci_device_t *);
+  pcie_config_regs_t *pcie =
+    pci_config_find_capability (&d->config0, PCI_CAP_ID_PCIE);
+  int lanes;
+  if (pcie == 0)
+    return format (s, "unknown");
+  lanes = (pcie->link_status >> 4) & 0x3f;	/* link width field */
+  switch (pcie->link_status & 0xf)	/* low nibble: speed code */
+    {
+    case 1:
+      return format (s, "2.5 GT/s x%u", lanes);
+    case 2:
+      return format (s, "5.0 GT/s x%u", lanes);
+    case 3:
+      return format (s, "8.0 GT/s x%u", lanes);
+    }
+  return format (s, "unknown");
+}
+
+/* Format VPD records from the byte vector DATA; ID selects one keyword. */
+u8 *
+format_vlib_pci_vpd (u8 * s, va_list * args)
+{
+  u8 *data = va_arg (*args, u8 *);
+  u8 *id = va_arg (*args, u8 *);
+  uword indent = format_get_indent (s);
+  char *string_types[] = { "PN", "EC", "SN", "MN", 0 };
+  uword p = 0;
+  int first_line = 1;
+
+  if (vec_len (data) < 3)
+    return s;
+
+  while (p + 3 < vec_len (data))
+    {
+      /* Each record: 2-byte keyword, 1-byte length, then the payload */
+      if (data[p] == 0 && data[p + 1] == 0)
+	return s;
+      /* Stop if the 3-byte header plus payload would run past the end */
+      if (p + 3 + data[p + 2] > vec_len (data))
+	return s;
+
+      if (id == 0)		/* no keyword filter: dump every record */
+	{
+	  int is_string = 0;
+	  char **c = string_types;
+
+	  while (c[0])
+	    {
+	      if (*(u16 *) & data[p] == *(u16 *) c[0])
+		is_string = 1;
+	      c++;
+	    }
+
+	  if (data[p + 2])
+	    {
+	      if (!first_line)
+		s = format (s, "\n%U", format_white_space, indent);
+	      else
+		{
+		  first_line = 0;
+		  s = format (s, " ");
+		}
+
+	      s = format (s, "%c%c: ", data[p], data[p + 1]);
+	      if (is_string)
+		vec_add (s, data + p + 3, data[p + 2]);
+	      else
+		{
+		  int i;
+		  const int max_bytes = 8;
+		  s = format (s, "0x");
+		  for (i = 0; i < clib_min (data[p + 2], max_bytes); i++)
+		    s = format (s, " %02x", data[p + 3 + i]);
+
+		  if (data[p + 2] > max_bytes)
+		    s = format (s, " ...");
+		}
+	    }
+	}
+      else if (*(u16 *) & data[p] == *(u16 *) id)
+	{
+	  vec_add (s, data + p + 3, data[p + 2]);
+	  return s;
+	}
+
+      p += 3 + data[p + 2];
+    }
+
+  return s;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_pci_command, static) = {
+  .path = "show pci",
+  .short_help = "show pci [all]",
+  .function = show_pci_fn,
+};
+/* *INDENT-ON* */
+/* Init function for the PCI bus layer; there is nothing to set up here. */
+clib_error_t *
+pci_bus_init (vlib_main_t * vm)
+{
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (pci_bus_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/pci/pci.h b/src/vlib/pci/pci.h
new file mode 100644
index 00000000..21410809
--- /dev/null
+++ b/src/vlib/pci/pci.h
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * pci.h: PCI definitions.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_pci_h
+#define included_vlib_pci_h
+
+#include <vlib/vlib.h>
+#include <vlib/pci/pci_config.h>
+
+/* *INDENT-OFF* */
+typedef CLIB_PACKED (union {
+  struct { u16 domain; u8 bus; u8 slot:5; u8 function:3; };
+  u32 as_u32;		/* the same four fields viewed as one word */
+}) vlib_pci_addr_t;
+/* *INDENT-ON* */
+
+typedef struct vlib_pci_device
+{
+  /* Operating system handle for this device. */
+  uword os_handle;
+
+  vlib_pci_addr_t bus_address;
+
+  /* Configuration space (256 bytes), also viewable as a type 0/1 header. */
+  union
+  {
+    pci_config_type0_regs_t config0;
+    pci_config_type1_regs_t config1;
+    u8 config_data[256];
+  };
+
+  /* Interrupt handler; receives this device as its only argument. */
+  void (*interrupt_handler) (struct vlib_pci_device * dev);
+
+  /* Driver name */
+  u8 *driver_name;
+
+  /* Numa Node */
+  int numa_node;
+
+  /* Device data */
+  u16 device_class;
+  u16 vendor_id;
+  u16 device_id;
+
+  /* Vital Product Data */
+  u8 *product_name;
+  u8 *vpd_r;
+  u8 *vpd_w;
+
+  /* Private data */
+  uword private_data;
+
+} vlib_pci_device_t;
+
+typedef struct
+{
+  u16 vendor_id, device_id;	/* one vendor/device id pair */
+} pci_device_id_t;
+
+typedef struct _pci_device_registration
+{
+  /* Driver init function. */
+  clib_error_t *(*init_function) (vlib_main_t * vm, vlib_pci_device_t * dev);
+
+  /* Interrupt handler */
+  void (*interrupt_handler) (vlib_pci_device_t * dev);
+
+  /* Next entry in the registration list built by PCI_REGISTER_DEVICE. */
+  struct _pci_device_registration *next_registration;
+
+  /* Vendor/device ids supported by this driver (flexible array member). */
+  pci_device_id_t supported_devices[];
+} pci_device_registration_t;
+
+/* PCI main state; pci_devs is the pool of PCI devices. */
+typedef struct
+{
+  vlib_main_t *vlib_main;
+  vlib_pci_device_t *pci_devs;
+  pci_device_registration_t *pci_device_registrations;	/* list head, see PCI_REGISTER_DEVICE */
+  uword *pci_dev_index_by_pci_addr;	/* presumably PCI address -> pci_devs index; confirm */
+} vlib_pci_main_t;
+
+extern vlib_pci_main_t pci_main;
+/* Declare registration x and push it onto pci_main's list from a constructor. */
+#define PCI_REGISTER_DEVICE(x,...)                              \
+    __VA_ARGS__ pci_device_registration_t x;                    \
+static void __vlib_add_pci_device_registration_##x (void)       \
+    __attribute__((__constructor__)) ;                          \
+static void __vlib_add_pci_device_registration_##x (void)       \
+{                                                               \
+    vlib_pci_main_t * pm = &pci_main;                           \
+    x.next_registration = pm->pci_device_registrations;         \
+    pm->pci_device_registrations = &x;                          \
+}                                                               \
+__VA_ARGS__ pci_device_registration_t x
+
+clib_error_t *vlib_pci_bind_to_uio (vlib_pci_device_t * d,
+				    char *uio_driver_name);
+
+/* Configuration space read/write. */
+clib_error_t *vlib_pci_read_write_config (vlib_pci_device_t * dev,
+					  vlib_read_or_write_t read_or_write,
+					  uword address,
+					  void *data, u32 n_bytes);
+/* Generate vlib_pci_read_config_{u32,u16,u8}: read sizeof (t) bytes into *data. */
+#define _(t)								\
+static inline clib_error_t *						\
+vlib_pci_read_config_##t (vlib_pci_device_t * dev,			\
+			  uword address, t * data)			\
+{									\
+  return vlib_pci_read_write_config (dev, VLIB_READ,address, data,	\
+				     sizeof (data[0]));			\
+}
+
+_(u32);
+_(u16);
+_(u8);
+
+#undef _
+/* Generate vlib_pci_write_config_{u32,u16,u8}: write sizeof (t) bytes from *data. */
+#define _(t)								\
+static inline clib_error_t *						\
+vlib_pci_write_config_##t (vlib_pci_device_t * dev, uword address,	\
+			   t * data)					\
+{									\
+  return vlib_pci_read_write_config (dev, VLIB_WRITE,			\
+				   address, data, sizeof (data[0]));	\
+}
+
+_(u32);
+_(u16);
+_(u8);
+
+#undef _
+
+/* Clear INTx Disable in the command register (config space offset 4). */
+static inline clib_error_t *
+vlib_pci_intr_enable (vlib_pci_device_t * dev)
+{
+  clib_error_t *error;
+  u16 cmd;
+
+  error = vlib_pci_read_config_u16 (dev, 4, &cmd);
+  if (!error)
+    {
+      cmd &= ~PCI_COMMAND_INTX_DISABLE;
+      error = vlib_pci_write_config_u16 (dev, 4, &cmd);
+    }
+  return error;
+}
+
+/* Set INTx Disable in the command register (config space offset 4). */
+static inline clib_error_t *
+vlib_pci_intr_disable (vlib_pci_device_t * dev)
+{
+  clib_error_t *error;
+  u16 cmd;
+
+  error = vlib_pci_read_config_u16 (dev, 4, &cmd);
+  if (!error)
+    {
+      cmd |= PCI_COMMAND_INTX_DISABLE;
+      error = vlib_pci_write_config_u16 (dev, 4, &cmd);
+    }
+  return error;
+}
+
+/* Set Bus Master Enable in the command register unless it is already set. */
+static inline clib_error_t *
+vlib_pci_bus_master_enable (vlib_pci_device_t * dev)
+{
+  clib_error_t *error;
+  u16 cmd;
+
+  error = vlib_pci_read_config_u16 (dev, 4, &cmd);
+  if (error)
+    return error;
+
+  /* Skip the config write when the bit is already on. */
+  if (!(cmd & PCI_COMMAND_BUS_MASTER))
+    {
+      cmd |= PCI_COMMAND_BUS_MASTER;
+      error = vlib_pci_write_config_u16 (dev, 4, &cmd);
+    }
+  return error;
+}
+
+clib_error_t *vlib_pci_map_resource (vlib_pci_device_t * dev, u32 resource,
+				     void **result);
+
+clib_error_t *vlib_pci_map_resource_fixed (vlib_pci_device_t * dev,
+					   u32 resource, u8 * addr,
+					   void **result);
+
+vlib_pci_device_t *vlib_get_pci_device (vlib_pci_addr_t * addr);
+/* Frees the device. */
+void vlib_pci_free_device (vlib_pci_device_t * dev);
+
+unformat_function_t unformat_vlib_pci_addr;
+format_function_t format_vlib_pci_addr;
+format_function_t format_vlib_pci_handle;
+format_function_t format_vlib_pci_link_speed;
+format_function_t format_vlib_pci_vpd;
+
+#endif /* included_vlib_pci_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/pci/pci_config.h b/src/vlib/pci/pci_config.h
new file mode 100644
index 00000000..92e56af6
--- /dev/null
+++ b/src/vlib/pci/pci_config.h
@@ -0,0 +1,731 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * pci_config.h: PCI configuration space definitions.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_pci_config_h
+#define included_vlib_pci_config_h
+
+#include <vppinfra/byte_order.h>
+#include <vppinfra/error.h>
+
+typedef enum
+{
+  PCI_CLASS_NOT_DEFINED = 0x0000,	/* values: base class << 8 | sub-class */
+  PCI_CLASS_NOT_DEFINED_VGA = 0x0001,
+
+  PCI_CLASS_STORAGE_SCSI = 0x0100,
+  PCI_CLASS_STORAGE_IDE = 0x0101,
+  PCI_CLASS_STORAGE_FLOPPY = 0x0102,
+  PCI_CLASS_STORAGE_IPI = 0x0103,
+  PCI_CLASS_STORAGE_RAID = 0x0104,
+  PCI_CLASS_STORAGE_OTHER = 0x0180,
+  PCI_CLASS_STORAGE = 0x0100,	/* un-suffixed names repeat the group's xx00 value */
+
+  PCI_CLASS_NETWORK_ETHERNET = 0x0200,
+  PCI_CLASS_NETWORK_TOKEN_RING = 0x0201,
+  PCI_CLASS_NETWORK_FDDI = 0x0202,
+  PCI_CLASS_NETWORK_ATM = 0x0203,
+  PCI_CLASS_NETWORK_OTHER = 0x0280,
+  PCI_CLASS_NETWORK = 0x0200,
+
+  PCI_CLASS_DISPLAY_VGA = 0x0300,
+  PCI_CLASS_DISPLAY_XGA = 0x0301,
+  PCI_CLASS_DISPLAY_3D = 0x0302,
+  PCI_CLASS_DISPLAY_OTHER = 0x0380,
+  PCI_CLASS_DISPLAY = 0x0300,
+
+  PCI_CLASS_MULTIMEDIA_VIDEO = 0x0400,
+  PCI_CLASS_MULTIMEDIA_AUDIO = 0x0401,
+  PCI_CLASS_MULTIMEDIA_PHONE = 0x0402,
+  PCI_CLASS_MULTIMEDIA_OTHER = 0x0480,
+  PCI_CLASS_MULTIMEDIA = 0x0400,
+
+  PCI_CLASS_MEMORY_RAM = 0x0500,
+  PCI_CLASS_MEMORY_FLASH = 0x0501,
+  PCI_CLASS_MEMORY_OTHER = 0x0580,
+  PCI_CLASS_MEMORY = 0x0500,
+
+  PCI_CLASS_BRIDGE_HOST = 0x0600,
+  PCI_CLASS_BRIDGE_ISA = 0x0601,
+  PCI_CLASS_BRIDGE_EISA = 0x0602,
+  PCI_CLASS_BRIDGE_MC = 0x0603,
+  PCI_CLASS_BRIDGE_PCI = 0x0604,
+  PCI_CLASS_BRIDGE_PCMCIA = 0x0605,
+  PCI_CLASS_BRIDGE_NUBUS = 0x0606,
+  PCI_CLASS_BRIDGE_CARDBUS = 0x0607,
+  PCI_CLASS_BRIDGE_RACEWAY = 0x0608,
+  PCI_CLASS_BRIDGE_OTHER = 0x0680,
+  PCI_CLASS_BRIDGE = 0x0600,
+
+  PCI_CLASS_COMMUNICATION_SERIAL = 0x0700,
+  PCI_CLASS_COMMUNICATION_PARALLEL = 0x0701,
+  PCI_CLASS_COMMUNICATION_MULTISERIAL = 0x0702,
+  PCI_CLASS_COMMUNICATION_MODEM = 0x0703,
+  PCI_CLASS_COMMUNICATION_OTHER = 0x0780,
+  PCI_CLASS_COMMUNICATION = 0x0700,
+
+  PCI_CLASS_SYSTEM_PIC = 0x0800,
+  PCI_CLASS_SYSTEM_DMA = 0x0801,
+  PCI_CLASS_SYSTEM_TIMER = 0x0802,
+  PCI_CLASS_SYSTEM_RTC = 0x0803,
+  PCI_CLASS_SYSTEM_PCI_HOTPLUG = 0x0804,
+  PCI_CLASS_SYSTEM_OTHER = 0x0880,
+  PCI_CLASS_SYSTEM = 0x0800,
+
+  PCI_CLASS_INPUT_KEYBOARD = 0x0900,
+  PCI_CLASS_INPUT_PEN = 0x0901,
+  PCI_CLASS_INPUT_MOUSE = 0x0902,
+  PCI_CLASS_INPUT_SCANNER = 0x0903,
+  PCI_CLASS_INPUT_GAMEPORT = 0x0904,
+  PCI_CLASS_INPUT_OTHER = 0x0980,
+  PCI_CLASS_INPUT = 0x0900,
+
+  PCI_CLASS_DOCKING_GENERIC = 0x0a00,
+  PCI_CLASS_DOCKING_OTHER = 0x0a80,
+  PCI_CLASS_DOCKING = 0x0a00,
+
+  PCI_CLASS_PROCESSOR_386 = 0x0b00,
+  PCI_CLASS_PROCESSOR_486 = 0x0b01,
+  PCI_CLASS_PROCESSOR_PENTIUM = 0x0b02,
+  PCI_CLASS_PROCESSOR_ALPHA = 0x0b10,
+  PCI_CLASS_PROCESSOR_POWERPC = 0x0b20,
+  PCI_CLASS_PROCESSOR_MIPS = 0x0b30,
+  PCI_CLASS_PROCESSOR_CO = 0x0b40,
+  PCI_CLASS_PROCESSOR = 0x0b00,
+
+  PCI_CLASS_SERIAL_FIREWIRE = 0x0c00,
+  PCI_CLASS_SERIAL_ACCESS = 0x0c01,
+  PCI_CLASS_SERIAL_SSA = 0x0c02,
+  PCI_CLASS_SERIAL_USB = 0x0c03,
+  PCI_CLASS_SERIAL_FIBER = 0x0c04,
+  PCI_CLASS_SERIAL_SMBUS = 0x0c05,
+  PCI_CLASS_SERIAL = 0x0c00,
+
+  PCI_CLASS_INTELLIGENT_I2O = 0x0e00,
+  PCI_CLASS_INTELLIGENT = 0x0e00,
+
+  PCI_CLASS_SATELLITE_TV = 0x0f00,
+  PCI_CLASS_SATELLITE_AUDIO = 0x0f01,
+  PCI_CLASS_SATELLITE_VOICE = 0x0f03,
+  PCI_CLASS_SATELLITE_DATA = 0x0f04,
+  PCI_CLASS_SATELLITE = 0x0f00,
+
+  PCI_CLASS_CRYPT_NETWORK = 0x1000,
+  PCI_CLASS_CRYPT_ENTERTAINMENT = 0x1001,
+  PCI_CLASS_CRYPT_OTHER = 0x1080,
+  PCI_CLASS_CRYPT = 0x1000,
+
+  PCI_CLASS_SP_DPIO = 0x1100,
+  PCI_CLASS_SP_OTHER = 0x1180,
+  PCI_CLASS_SP = 0x1100,
+} pci_device_class_t;
+
+static inline pci_device_class_t
+pci_device_class_base (pci_device_class_t c)
+{
+  return c & ~0xff;		/* clear the low (sub-class) byte */
+}
+
+/*
+ * Under PCI, each device has 256 bytes of configuration address space,
+ * of which the first 64 bytes are standardized. The first 16 bytes are:
+ */
+typedef struct
+{
+  u16 vendor_id;
+  u16 device_id;
+
+  u16 command;
+#define PCI_COMMAND_IO		(1 << 0)	/* Enable response in I/O space */
+#define PCI_COMMAND_MEMORY	(1 << 1)	/* Enable response in Memory space */
+#define PCI_COMMAND_BUS_MASTER	(1 << 2)	/* Enable bus mastering */
+#define PCI_COMMAND_SPECIAL	(1 << 3)	/* Enable response to special cycles */
+#define PCI_COMMAND_WRITE_INVALIDATE (1 << 4)	/* Use memory write and invalidate */
+#define PCI_COMMAND_VGA_PALETTE_SNOOP (1 << 5)
+#define PCI_COMMAND_PARITY	(1 << 6)
+#define PCI_COMMAND_WAIT 	(1 << 7)	/* Enable address/data stepping */
+#define PCI_COMMAND_SERR	(1 << 8)	/* Enable SERR */
+#define PCI_COMMAND_BACK_TO_BACK_WRITE (1 << 9)
+#define PCI_COMMAND_INTX_DISABLE (1 << 10)	/* INTx Emulation Disable */
+
+  u16 status;
+#define PCI_STATUS_INTX_PENDING (1 << 3)
+#define PCI_STATUS_CAPABILITY_LIST (1 << 4)
+#define PCI_STATUS_66MHZ	(1 << 5)	/* Support 66 Mhz PCI 2.1 bus */
+#define PCI_STATUS_UDF		(1 << 6)	/* Support User Definable Features (obsolete) */
+#define PCI_STATUS_BACK_TO_BACK_WRITE (1 << 7)	/* Accept fast-back to back */
+#define PCI_STATUS_PARITY_ERROR	(1 << 8)	/* Detected parity error */
+#define PCI_STATUS_DEVSEL_GET(x) (((x) >> 9) & 3)	/* DEVSEL timing; arg parenthesized */
+#define PCI_STATUS_DEVSEL_FAST (0 << 9)
+#define PCI_STATUS_DEVSEL_MEDIUM (1 << 9)
+#define PCI_STATUS_DEVSEL_SLOW (2 << 9)
+#define PCI_STATUS_SIG_TARGET_ABORT (1 << 11)	/* Set on target abort */
+#define PCI_STATUS_REC_TARGET_ABORT (1 << 12)	/* Master ack of " */
+#define PCI_STATUS_REC_MASTER_ABORT (1 << 13)	/* Set on master abort */
+#define PCI_STATUS_SIG_SYSTEM_ERROR (1 << 14)	/* Set when we drive SERR */
+#define PCI_STATUS_DETECTED_PARITY_ERROR (1 << 15)
+
+  u8 revision_id;
+  u8 programming_interface_class;	/* Reg. Level Programming Interface */
+
+  pci_device_class_t device_class:16;	/* 16-bit class code, see pci_device_class_t */
+
+  u8 cache_size;
+  u8 latency_timer;
+
+  u8 header_type;
+#define PCI_HEADER_TYPE_NORMAL	0
+#define PCI_HEADER_TYPE_BRIDGE 1
+#define PCI_HEADER_TYPE_CARDBUS 2
+
+  u8 bist;
+#define PCI_BIST_CODE_MASK	0x0f	/* Return result */
+#define PCI_BIST_START		0x40	/* 1 to start BIST, 2 secs or less */
+#define PCI_BIST_CAPABLE	0x80	/* 1 if BIST capable */
+} pci_config_header_t;
+
+/* Byte swap the 16-bit fields of the config header (little endian to host,
+ * per the function name).  Swaps happen only when CLIB_ARCH_IS_BIG_ENDIAN
+ * is set; otherwise r is left untouched. */
+always_inline void
+pci_config_header_little_to_host (pci_config_header_t * r)
+{
+  if (!CLIB_ARCH_IS_BIG_ENDIAN)
+    return;
+  r->vendor_id = clib_byte_swap_u16 (r->vendor_id);
+  r->device_id = clib_byte_swap_u16 (r->device_id);
+  r->command = clib_byte_swap_u16 (r->command);
+  r->status = clib_byte_swap_u16 (r->status);
+  r->device_class = clib_byte_swap_u16 (r->device_class);
+}
+
+/* Header type 0 (normal devices) */
+typedef struct
+{
+  pci_config_header_t header;
+
+  /* Base addresses specify locations in memory or I/O space.
+   * Decoded size can be determined by writing a value of
+   * 0xffffffff to the register, and reading it back. Only
+   * 1 bits are decoded. */
+  u32 base_address[6];
+
+  /* 0x28: the CardBus CIS pointer is 32 bits wide (ids follow at 0x2c). */
+  u32 cardbus_cis;
+
+  u16 subsystem_vendor_id;
+  u16 subsystem_id;
+
+  u32 rom_address;
+#define PCI_ROM_ADDRESS		0x30	/* Bits 31..11 are address, 10..1 reserved */
+#define PCI_ROM_ADDRESS_ENABLE	0x01
+#define PCI_ROM_ADDRESS_MASK	(~0x7ffUL)
+
+  u8 first_capability_offset;
+    CLIB_PAD_FROM_TO (0x35, 0x3c);
+
+  u8 interrupt_line;
+  u8 interrupt_pin;
+  u8 min_grant;
+  u8 max_latency;
+
+  u8 capability_data[0];
+} pci_config_type0_regs_t;
+
+/* Byte swap the multi-byte type 0 fields; no-op unless CLIB_ARCH_IS_BIG_ENDIAN. */
+always_inline void
+pci_config_type0_little_to_host (pci_config_type0_regs_t * r)
+{
+  int i;
+  if (!CLIB_ARCH_IS_BIG_ENDIAN)
+    return;
+  pci_config_header_little_to_host (&r->header);
+#define _(f,t) r->f = clib_byte_swap_##t (r->f)
+  for (i = 0; i < ARRAY_LEN (r->base_address); i++)
+    _(base_address[i], u32);
+  _(cardbus_cis, u32);
+  _(subsystem_vendor_id, u16);
+  _(subsystem_id, u16);
+  _(rom_address, u32);
+#undef _
+}
+
+/* Header type 1 (PCI-to-PCI bridges) */
+typedef struct
+{
+  pci_config_header_t header;
+
+  u32 base_address[2];
+
+  /* Primary/secondary bus number. */
+  u8 primary_bus;
+  u8 secondary_bus;
+
+  /* Highest bus number behind the bridge */
+  u8 subordinate_bus;
+
+  u8 secondary_bus_latency_timer;
+
+  /* I/O range behind bridge. */
+  u8 io_base, io_limit;
+
+  /* Secondary status register, only bit 14 used */
+  u16 secondary_status;
+
+  /* Memory range behind bridge in units of 64k bytes. */
+  u16 memory_base, memory_limit;
+#define PCI_MEMORY_RANGE_TYPE_MASK 0x0fUL
+#define PCI_MEMORY_RANGE_MASK	(~0x0fUL)
+
+  u16 prefetchable_memory_base, prefetchable_memory_limit;
+#define PCI_PREF_RANGE_TYPE_MASK 0x0fUL
+#define PCI_PREF_RANGE_TYPE_32	0x00
+#define PCI_PREF_RANGE_TYPE_64	0x01
+#define PCI_PREF_RANGE_MASK	(~0x0fUL)
+
+  u32 prefetchable_memory_base_upper_32bits;
+  u32 prefetchable_memory_limit_upper_32bits;
+  u16 io_base_upper_16bits;
+  u16 io_limit_upper_16bits;
+
+  /* Same as for type 0. */
+  u8 capability_list_offset;
+    CLIB_PAD_FROM_TO (0x35, 0x38);	/* pad up to rom_address at 0x38 explicitly */
+
+  u32 rom_address;
+    CLIB_PAD_FROM_TO (0x3c, 0x3e);
+
+  u16 bridge_control;
+#define PCI_BRIDGE_CTL_PARITY	0x01	/* Enable parity detection on secondary interface */
+#define PCI_BRIDGE_CTL_SERR	0x02	/* The same for SERR forwarding */
+#define PCI_BRIDGE_CTL_NO_ISA	0x04	/* Disable bridging of ISA ports */
+#define PCI_BRIDGE_CTL_VGA	0x08	/* Forward VGA addresses */
+#define PCI_BRIDGE_CTL_MASTER_ABORT 0x20	/* Report master aborts */
+#define PCI_BRIDGE_CTL_BUS_RESET 0x40	/* Secondary bus reset */
+#define PCI_BRIDGE_CTL_FAST_BACK 0x80	/* Fast Back2Back enabled on secondary interface */
+
+  u8 capability_data[0];
+} pci_config_type1_regs_t;
+/* Byte swap the multi-byte type 1 fields; no-op unless CLIB_ARCH_IS_BIG_ENDIAN. */
+always_inline void
+pci_config_type1_little_to_host (pci_config_type1_regs_t * r)
+{
+  int i;
+  if (!CLIB_ARCH_IS_BIG_ENDIAN)
+    return;
+  pci_config_header_little_to_host (&r->header);
+#define _(f,t) r->f = clib_byte_swap_##t (r->f)
+  for (i = 0; i < ARRAY_LEN (r->base_address); i++)
+    _(base_address[i], u32);
+  _(secondary_status, u16);
+  _(memory_base, u16);
+  _(memory_limit, u16);
+  _(prefetchable_memory_base, u16);
+  _(prefetchable_memory_limit, u16);
+  _(prefetchable_memory_base_upper_32bits, u32);
+  _(prefetchable_memory_limit_upper_32bits, u32);
+  _(io_base_upper_16bits, u16);
+  _(io_limit_upper_16bits, u16);
+  _(rom_address, u32);
+  _(bridge_control, u16);
+#undef _
+}
+
+/* Capability ids, as stored in the type byte of pci_capability_regs_t. */
+typedef enum pci_capability_type
+{
+  /* Power Management */
+  PCI_CAP_ID_PM = 1,
+
+  /* Accelerated Graphics Port */
+  PCI_CAP_ID_AGP = 2,
+
+  /* Vital Product Data */
+  PCI_CAP_ID_VPD = 3,
+
+  /* Slot Identification */
+  PCI_CAP_ID_SLOTID = 4,
+
+  /* Message Signalled Interrupts */
+  PCI_CAP_ID_MSI = 5,
+
+  /* CompactPCI HotSwap */
+  PCI_CAP_ID_CHSWP = 6,
+
+  /* PCI-X */
+  PCI_CAP_ID_PCIX = 7,
+
+  /* Hypertransport. */
+  PCI_CAP_ID_HYPERTRANSPORT = 8,
+
+  /* PCI Standard Hot-Plug Controller */
+  PCI_CAP_ID_SHPC = 0xc,
+
+  /* PCI Express */
+  PCI_CAP_ID_PCIE = 0x10,
+
+  /* MSI-X */
+  PCI_CAP_ID_MSIX = 0x11,
+} pci_capability_type_t;
+
+/* Common header for capabilities: 1-byte id, then config offset of the next entry. */
+/* *INDENT-OFF* */
+typedef CLIB_PACKED (struct
+		     {
+		     enum pci_capability_type type:8;
+		     u8 next_offset;}) pci_capability_regs_t;
+/* *INDENT-ON* */
+
+/* Walk the capability list of t and return the first entry whose id is
+ * cap_type, else 0.  Ends after 48 entries, on id 0xff or offset < 0x40. */
+always_inline void *
+pci_config_find_capability (pci_config_type0_regs_t * t, int cap_type)
+{
+  u32 hops, offset;
+
+  if (!(t->header.status & PCI_STATUS_CAPABILITY_LIST))
+    return 0;
+
+  offset = t->first_capability_offset;
+  for (hops = 0; hops < 48 && offset >= 0x40; hops++)
+    {
+      pci_capability_regs_t *cap = (void *) t + (offset & ~3);
+      if ((u8) cap->type == 0xff)
+	return 0;
+      if (cap->type == cap_type)
+	return cap;
+      offset = cap->next_offset;
+    }
+  return 0;
+}
+
+/* Power Management capability registers (PCI_CAP_ID_PM) */
+/* *INDENT-OFF* */
+typedef CLIB_PACKED (struct
+		     {
+		     pci_capability_regs_t header; u16 capabilities;
+#define PCI_PM_CAP_VER_MASK	0x0007	/* Version */
+#define PCI_PM_CAP_PME_CLOCK	0x0008	/* PME clock required */
+#define PCI_PM_CAP_RESERVED  0x0010	/* Reserved field */
+#define PCI_PM_CAP_DSI		0x0020	/* Device specific initialization */
+#define PCI_PM_CAP_AUX_POWER	0x01C0	/* Auxiliary power support mask */
+#define PCI_PM_CAP_D1		0x0200	/* D1 power state support */
+#define PCI_PM_CAP_D2		0x0400	/* D2 power state support */
+#define PCI_PM_CAP_PME		0x0800	/* PME pin supported */
+#define PCI_PM_CAP_PME_MASK  0xF800	/* PME Mask of all supported states */
+#define PCI_PM_CAP_PME_D0   0x0800	/* PME# from D0 */
+#define PCI_PM_CAP_PME_D1   0x1000	/* PME# from D1 */
+#define PCI_PM_CAP_PME_D2   0x2000	/* PME# from D2 */
+#define PCI_PM_CAP_PME_D3   0x4000	/* PME# from D3 (hot) */
+#define PCI_PM_CAP_PME_D3cold 0x8000	/* PME# from D3 (cold) */
+		     u16 control;
+#define PCI_PM_CTRL_STATE_MASK	0x0003	/* Current power state (D0 to D3) */
+#define PCI_PM_CTRL_PME_ENABLE	0x0100	/* PME pin enable */
+#define PCI_PM_CTRL_DATA_SEL_MASK	0x1e00	/* Data select (??) */
+#define PCI_PM_CTRL_DATA_SCALE_MASK	0x6000	/* Data scale (??) */
+#define PCI_PM_CTRL_PME_STATUS	0x8000	/* PME pin status */
+		     u8 extensions;
+#define PCI_PM_PPB_B2_B3	0x40	/* Stop clock when in D3hot (??) */
+#define PCI_PM_BPCC_ENABLE	0x80	/* Bus power/clock control enable (??) */
+		     u8 data;}) pci_power_management_regs_t;
+/* *INDENT-ON* */
+
+/* AGP capability registers (PCI_CAP_ID_AGP) */
+/* *INDENT-OFF* */
+typedef CLIB_PACKED (struct
+		     {
+		     pci_capability_regs_t header; u8 version;
+		     u8 rest_of_capability_flags; u32 status; u32 command;
+		     /* Bit definitions shared by the command and status words. */
+#define PCI_AGP_RQ_MASK	0xff000000	/* Maximum number of requests - 1 */
+#define PCI_AGP_SBA	0x0200	/* Sideband addressing supported */
+#define PCI_AGP_64BIT	0x0020	/* 64-bit addressing supported */
+#define PCI_AGP_ALLOW_TRANSACTIONS 0x0100	/* Allow processing of AGP transactions */
+#define PCI_AGP_FW	0x0010	/* FW transfers supported/forced */
+#define PCI_AGP_RATE4	0x0004	/* 4x transfer rate supported */
+#define PCI_AGP_RATE2	0x0002	/* 2x transfer rate supported */
+#define PCI_AGP_RATE1	0x0001	/* 1x transfer rate supported */
+		     }) pci_agp_regs_t;
+/* *INDENT-ON* */
+
+/* Vital Product Data capability registers (PCI_CAP_ID_VPD) */
+/* *INDENT-OFF* */
+typedef CLIB_PACKED (struct
+		     {
+		     pci_capability_regs_t header; u16 address;
+#define PCI_VPD_ADDR_MASK	0x7fff	/* Address mask */
+#define PCI_VPD_ADDR_F		0x8000	/* Write 0, 1 indicates completion */
+		     u32 data;}) pci_vpd_regs_t;
+/* *INDENT-ON* */
+
+/* Slot Identification capability registers (PCI_CAP_ID_SLOTID) */
+/* *INDENT-OFF* */
+typedef CLIB_PACKED (struct
+		     {
+		     pci_capability_regs_t header; u8 esr;
+#define PCI_SID_ESR_NSLOTS	0x1f	/* Number of expansion slots available */
+#define PCI_SID_ESR_FIC	0x20	/* First In Chassis Flag */
+		     u8 chassis;}) pci_sid_regs_t;
+/* *INDENT-ON* */
+
+/* Message Signalled Interrupts registers (32-bit message address form) */
+/* *INDENT-OFF* */
+typedef CLIB_PACKED (struct
+		     {
+		     pci_capability_regs_t header; u16 flags;
+#define PCI_MSI_FLAGS_ENABLE	(1 << 0)	/* MSI feature enabled */
+#define PCI_MSI_FLAGS_GET_MAX_QUEUE_SIZE(x) (((x) >> 1) & 0x7)
+#define PCI_MSI_FLAGS_MAX_QUEUE_SIZE(x)     (((x) & 0x7) << 1)
+#define PCI_MSI_FLAGS_GET_QUEUE_SIZE(x) (((x) >> 4) & 0x7)
+#define PCI_MSI_FLAGS_QUEUE_SIZE(x)     (((x) & 0x7) << 4)
+#define PCI_MSI_FLAGS_64BIT	(1 << 7)	/* 64-bit addresses allowed */
+#define PCI_MSI_FLAGS_MASKBIT	(1 << 8)	/* Per-vector masking capable */
+		     u32 address; u32 data; u32 mask_bits;}) pci_msi32_regs_t;
+/* *INDENT-ON* */
+
+/* *INDENT-OFF* */
+typedef CLIB_PACKED (struct
+		     {
+		     pci_capability_regs_t header; u16 flags;	/* PCI_MSI_FLAGS_* bits */
+		     u32 address[2];	/* two words; which is low/high is not shown here */
+		     u32 data; u32 mask_bits;}) pci_msi64_regs_t;
+/* *INDENT-ON* */
+
+/* CompactPCI Hotswap capability register (PCI_CAP_ID_CHSWP) */
+/* *INDENT-OFF* */
+typedef CLIB_PACKED (struct
+		     {
+		     pci_capability_regs_t header; u16 control_status;
+#define PCI_CHSWP_DHA		0x01	/* Device Hiding Arm */
+#define PCI_CHSWP_EIM		0x02	/* ENUM# Signal Mask */
+#define PCI_CHSWP_PIE		0x04	/* Pending Insert or Extract */
+#define PCI_CHSWP_LOO		0x08	/* LED On / Off */
+#define PCI_CHSWP_PI		0x30	/* Programming Interface */
+#define PCI_CHSWP_EXT		0x40	/* ENUM# status - extraction */
+#define PCI_CHSWP_INS		0x80	/* ENUM# status - insertion */
+		     }) pci_chswp_regs_t;
+/* *INDENT-ON* */
+
+/* PCI-X capability registers (PCI_CAP_ID_PCIX) */
+/* *INDENT-OFF* */
+typedef CLIB_PACKED (struct
+		     {
+		     pci_capability_regs_t header; u16 command;
+#define PCIX_CMD_DPERR_E	0x0001	/* Data Parity Error Recovery Enable */
+#define PCIX_CMD_ERO		0x0002	/* Enable Relaxed Ordering */
+#define PCIX_CMD_MAX_READ	0x000c	/* Max Memory Read Byte Count */
+#define PCIX_CMD_MAX_SPLIT	0x0070	/* Max Outstanding Split Transactions */
+#define PCIX_CMD_VERSION(x) 	(((x) >> 12) & 3)	/* Version */
+		     u32 status;
+#define PCIX_STATUS_DEVFN	0x000000ff	/* A copy of devfn */
+#define PCIX_STATUS_BUS	0x0000ff00	/* A copy of bus nr */
+#define PCIX_STATUS_64BIT	0x00010000	/* 64-bit device */
+#define PCIX_STATUS_133MHZ	0x00020000	/* 133 MHz capable */
+#define PCIX_STATUS_SPL_DISC	0x00040000	/* Split Completion Discarded */
+#define PCIX_STATUS_UNX_SPL	0x00080000	/* Unexpected Split Completion */
+#define PCIX_STATUS_COMPLEX	0x00100000	/* Device Complexity */
+#define PCIX_STATUS_MAX_READ	0x00600000	/* Designed Max Memory Read Count */
+#define PCIX_STATUS_MAX_SPLIT	0x03800000	/* Designed Max Outstanding Split Transactions */
+#define PCIX_STATUS_MAX_CUM	0x1c000000	/* Designed Max Cumulative Read Size */
+#define PCIX_STATUS_SPL_ERR	0x20000000	/* Rcvd Split Completion Error Msg */
+#define PCIX_STATUS_266MHZ	0x40000000	/* 266 MHz capable */
+#define PCIX_STATUS_533MHZ	0x80000000	/* 533 MHz capable */
+		     }) pcix_config_regs_t;
+/* *INDENT-ON* */
+
+static inline int
+pcie_size_to_code (int bytes)
+{
+  ASSERT (is_pow2 (bytes));
+  ASSERT (bytes <= 4096);	/* NOTE(review): no lower bound; < 128 would go negative */
+  return min_log2 (bytes) - 7;	/* 128 -> 0 if min_log2 (128) is 7 -- confirm */
+}
+
+static inline int
+pcie_code_to_size (int code)
+{
+  int size = 1 << (code + 7);	/* code 0 -> 128, each step doubles */
+  ASSERT (size <= 4096);
+  return size;
+}
+
+/* PCI Express capability registers (PCI_CAP_ID_PCIE) */
+/* *INDENT-OFF* */
+typedef CLIB_PACKED (struct
+		     {
+		     pci_capability_regs_t header; u16 pcie_capabilities;
+#define PCIE_CAP_VERSION(x)	(((x) >> 0) & 0xf)
+#define PCIE_CAP_DEVICE_TYPE(x)	(((x) >> 4) & 0xf)
+#define PCIE_DEVICE_TYPE_ENDPOINT 0
+#define PCIE_DEVICE_TYPE_LEGACY_ENDPOINT 1
+#define PCIE_DEVICE_TYPE_ROOT_PORT 4
+		     /* Upstream/downstream port of PCI Express switch. */
+#define PCIE_DEVICE_TYPE_SWITCH_UPSTREAM 5
+#define PCIE_DEVICE_TYPE_SWITCH_DOWNSTREAM 6
+#define PCIE_DEVICE_TYPE_PCIE_TO_PCI_BRIDGE 7
+#define PCIE_DEVICE_TYPE_PCI_TO_PCIE_BRIDGE 8
+		     /* Root complex integrated endpoint. */
+#define PCIE_DEVICE_TYPE_ROOT_COMPLEX_ENDPOINT 9
+#define PCIE_DEVICE_TYPE_ROOT_COMPLEX_EVENT_COLLECTOR 10
+#define PCIE_CAP_SLOW_IMPLEMENTED (1 << 8)	/* NOTE(review): "SLOW" looks like a typo for SLOT -- confirm */
+#define PCIE_CAP_MSI_IRQ(x) (((x) >> 9) & 0x1f)
+		     u32 dev_capabilities;
+#define PCIE_DEVCAP_MAX_PAYLOAD(x) (128 << (((x) >> 0) & 0x7))
+#define PCIE_DEVCAP_PHANTOM_BITS(x) (((x) >> 3) & 0x3)
+#define PCIE_DEVCAP_EXTENTED_TAG (1 << 5)	/* NOTE(review): spelling "EXTENTED" is part of the name */
+#define PCIE_DEVCAP_L0S	0x1c0	/* L0s Acceptable Latency */
+#define PCIE_DEVCAP_L1	0xe00	/* L1 Acceptable Latency */
+#define PCIE_DEVCAP_ATN_BUT	0x1000	/* Attention Button Present */
+#define PCIE_DEVCAP_ATN_IND	0x2000	/* Attention Indicator Present */
+#define PCIE_DEVCAP_PWR_IND	0x4000	/* Power Indicator Present */
+#define PCIE_DEVCAP_PWR_VAL	0x3fc0000	/* Slot Power Limit Value */
+#define PCIE_DEVCAP_PWR_SCL	0xc000000	/* Slot Power Limit Scale */
+		     u16 dev_control;
+#define PCIE_CTRL_CERE	0x0001	/* Correctable Error Reporting En. */
+#define PCIE_CTRL_NFERE	0x0002	/* Non-Fatal Error Reporting Enable */
+#define PCIE_CTRL_FERE	0x0004	/* Fatal Error Reporting Enable */
+#define PCIE_CTRL_URRE	0x0008	/* Unsupported Request Reporting En. */
+#define PCIE_CTRL_RELAX_EN 0x0010	/* Enable relaxed ordering */
+#define PCIE_CTRL_MAX_PAYLOAD(n) (((n) & 7) << 5)
+#define PCIE_CTRL_EXT_TAG	0x0100	/* Extended Tag Field Enable */
+#define PCIE_CTRL_PHANTOM	0x0200	/* Phantom Functions Enable */
+#define PCIE_CTRL_AUX_PME	0x0400	/* Auxiliary Power PM Enable */
+#define PCIE_CTRL_NOSNOOP_EN	0x0800	/* Enable No Snoop */
+#define PCIE_CTRL_MAX_READ_REQUEST(n) (((n) & 7) << 12)
+		     u16 dev_status;
+#define PCIE_DEVSTA_AUXPD	0x10	/* AUX Power Detected */
+#define PCIE_DEVSTA_TRPND	0x20	/* Transactions Pending */
+		     u32 link_capabilities; u16 link_control; u16 link_status;
+		     u32 slot_capabilities;
+		     u16 slot_control; u16 slot_status; u16 root_control;
+#define PCIE_RTCTL_SECEE	0x01	/* System Error on Correctable Error */
+#define PCIE_RTCTL_SENFEE	0x02	/* System Error on Non-Fatal Error */
+#define PCIE_RTCTL_SEFEE	0x04	/* System Error on Fatal Error */
+#define PCIE_RTCTL_PMEIE	0x08	/* PME Interrupt Enable */
+#define PCIE_RTCTL_CRSSVE	0x10	/* CRS Software Visibility Enable */
+		     u16 root_capabilities;
+		     u32 root_status;
+		     u32 dev_capabilities2;
+		     u16 dev_control2;
+		     u16 dev_status2;
+		     u32 link_capabilities2;
+		     u16 link_control2;
+		     u16 link_status2;
+		     u32 slot_capabilities2; u16 slot_control2;
+		     u16 slot_status2;}) pcie_config_regs_t;
+/* *INDENT-ON* */
+
+/* PCI express extended capability ids (type field of pcie_capability_regs_t). */
+typedef enum pcie_capability_type
+{
+  PCIE_CAP_ADVANCED_ERROR = 1,
+  PCIE_CAP_VC = 2,
+  PCIE_CAP_DSN = 3,
+  PCIE_CAP_PWR = 4,
+} pcie_capability_type_t;
+
+/* Common header for PCI express extended capabilities. */
+/* *INDENT-OFF* */
+typedef CLIB_PACKED (struct
+		     {
+		     enum pcie_capability_type type:16;
+		     u16 version:4; u16 next_capability:12;}) pcie_capability_regs_t;
+/* *INDENT-ON* */
+/* Advanced error reporting registers (extended capability PCIE_CAP_ADVANCED_ERROR) */
+/* *INDENT-OFF* */
+typedef CLIB_PACKED (struct
+		     {
+		     pcie_capability_regs_t header; u32 uncorrectable_status;
+#define PCIE_ERROR_UNC_LINK_TRAINING 		(1 << 0)
+#define PCIE_ERROR_UNC_DATA_LINK_PROTOCOL 	(1 << 4)
+#define PCIE_ERROR_UNC_SURPRISE_DOWN		(1 << 5)
+#define PCIE_ERROR_UNC_POISONED_TLP		(1 << 12)
+#define PCIE_ERROR_UNC_FLOW_CONTROL		(1 << 13)
+#define PCIE_ERROR_UNC_COMPLETION_TIMEOUT	(1 << 14)
+#define PCIE_ERROR_UNC_COMPLETER_ABORT		(1 << 15)
+#define PCIE_ERROR_UNC_UNEXPECTED_COMPLETION	(1 << 16)
+#define PCIE_ERROR_UNC_RX_OVERFLOW		(1 << 17)
+#define PCIE_ERROR_UNC_MALFORMED_TLP		(1 << 18)
+#define PCIE_ERROR_UNC_CRC_ERROR		(1 << 19)
+#define PCIE_ERROR_UNC_UNSUPPORTED_REQUEST	(1 << 20)
+		     u32 uncorrectable_mask;
+		     u32 uncorrectable_severity; u32 correctable_status;
+#define PCIE_ERROR_COR_RX_ERROR		(1 << 0)
+#define PCIE_ERROR_COR_BAD_TLP		(1 << 6)
+#define PCIE_ERROR_COR_BAD_DLLP		(1 << 7)
+#define PCIE_ERROR_COR_REPLAY_ROLLOVER	(1 << 8)
+#define PCIE_ERROR_COR_REPLAY_TIMER	(1 << 12)
+#define PCIE_ERROR_COR_ADVISORY		(1 << 13)
+		     u32 correctable_mask;
+		     u32 control;
+		     u32 log[4];
+		     u32 root_command;
+		     u32 root_status; u16 correctable_error_source;
+		     u16 error_source;}) pcie_advanced_error_regs_t;
+/* *INDENT-ON* */
+
+/* Virtual Channel: PCI_VC_* values look like register byte offsets -- confirm */
+#define PCI_VC_PORT_REG1	4
+#define PCI_VC_PORT_REG2	8
+#define PCI_VC_PORT_CTRL	12
+#define PCI_VC_PORT_STATUS	14
+#define PCI_VC_RES_CAP		16
+#define PCI_VC_RES_CTRL		20
+#define PCI_VC_RES_STATUS	26
+
+/* Power Budgeting: register positions (presumably byte offsets) and field extractors */
+#define PCI_PWR_DSR		4	/* Data Select Register */
+#define PCI_PWR_DATA		8	/* Data Register */
+#define PCI_PWR_DATA_BASE(x)	((x) & 0xff)	/* Base Power */
+#define PCI_PWR_DATA_SCALE(x)	(((x) >> 8) & 3)	/* Data Scale */
+#define PCI_PWR_DATA_PM_SUB(x)	(((x) >> 10) & 7)	/* PM Sub State */
+#define PCI_PWR_DATA_PM_STATE(x) (((x) >> 13) & 3)	/* PM State */
+#define PCI_PWR_DATA_TYPE(x)	(((x) >> 15) & 7)	/* Type */
+#define PCI_PWR_DATA_RAIL(x)	(((x) >> 18) & 7)	/* Power Rail */
+#define PCI_PWR_CAP		12	/* Capability */
+#define PCI_PWR_CAP_BUDGET(x)	((x) & 1)	/* Included in system budget */
+
+#endif /* included_vlib_pci_config_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/physmem.h b/src/vlib/physmem.h
new file mode 100644
index 00000000..a7fed124
--- /dev/null
+++ b/src/vlib/physmem.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * physmem.h: virtual <-> physical memory mapping for VLIB buffers
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_physmem_h
+#define included_vlib_physmem_h
+
+typedef u8 vlib_physmem_region_index_t;
+
+typedef struct			/* one physmem region */
+{
+  vlib_physmem_region_index_t index;
+  void *mem;			/* base virtual address of the region */
+  uword size;			/* length in bytes; valid offsets are < size */
+  int fd;
+  u8 log2_page_size;		/* offset >> log2_page_size is the page index */
+  u16 n_pages;
+  u32 page_mask;		/* offset & page_mask is added to the page entry */
+
+  void *heap;
+  u32 flags;			/* presumably the VLIB_PHYSMEM_F_* bits below */
+#define VLIB_PHYSMEM_F_INIT_MHEAP (1<<0)
+#define VLIB_PHYSMEM_F_HAVE_BUFFERS (1<<1)
+#define VLIB_PHYSMEM_F_FAKE (1<<2)
+
+  u8 numa_node;
+  u64 *page_table;		/* per-page entry used for physical translation */
+  u8 *name;
+} vlib_physmem_region_t;
+
+
+
+typedef struct
+{
+  vlib_physmem_region_t *regions;	/* pool, looked up by region index */
+} vlib_physmem_main_t;
+
+#endif /* included_vlib_physmem_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/physmem_funcs.h b/src/vlib/physmem_funcs.h
new file mode 100644
index 00000000..dbb8d9de
--- /dev/null
+++ b/src/vlib/physmem_funcs.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * physmem_funcs.h: inline helpers for VLIB physmem regions (virtual <-> physical)
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_physmem_funcs_h
+#define included_vlib_physmem_funcs_h
+
+always_inline vlib_physmem_region_t *
+vlib_physmem_get_region (vlib_main_t * vm, u8 index)
+{				/* region at INDEX in the physmem_main regions pool */
+  vlib_physmem_main_t *vpm = &vm->physmem_main;
+  return pool_elt_at_index (vpm->regions, index);
+}
+
+always_inline u64
+vlib_physmem_offset_to_physical (vlib_main_t * vm,
+				 vlib_physmem_region_index_t idx, uword o)
+{				/* map byte offset O in region IDX to a physical address */
+  vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
+  uword page_index = o >> pr->log2_page_size;	/* page containing O */
+  ASSERT (o < pr->size);
+  ASSERT (pr->page_table[page_index] != 0);	/* page needs a non-zero entry */
+  return (vec_elt (pr->page_table, page_index) + (o & pr->page_mask));
+}
+
+always_inline int
+vlib_physmem_is_virtual (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+			 uword p)
+{				/* is P inside region IDX's [mem, mem + size)? */
+  vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
+  uword base = pointer_to_uword (pr->mem);
+  return base <= p && p < base + pr->size;
+}
+
+/* Return the byte offset of virtual address P from the start of region
+   IDX.  P must lie inside the region (checked by ASSERT). */
+always_inline uword
+vlib_physmem_offset_of (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+			void *p)
+{
+  vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
+  uword a = pointer_to_uword (p);
+
+  ASSERT (vlib_physmem_is_virtual (vm, idx, a));
+
+  /* The offset is returned as a full uword, so no narrowing check is
+     needed here (the former "fits in 32 bits" assertion compared the
+     value with itself). */
+  return a - pointer_to_uword (pr->mem);
+}
+
+always_inline void *
+vlib_physmem_at_offset (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+			uword offset)
+{				/* virtual address of byte OFFSET in region IDX */
+  vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
+  ASSERT (offset < pr->size);
+  return uword_to_pointer (pointer_to_uword (pr->mem) + offset, void *);
+}
+
+always_inline void *
+vlib_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+			    clib_error_t ** error,
+			    uword n_bytes, uword alignment)
+{
+  void *r = vm->os_physmem_alloc_aligned (vm, idx, n_bytes, alignment);
+  if (!r)
+    *error =
+      clib_error_return (0, "failed to allocate %wd bytes of I/O memory",
+			 n_bytes);
+  else
+    *error = 0;
+  return r;
+}
+
+/* Convenience wrapper: cache-line-aligned vlib_physmem_alloc_aligned (). */
+always_inline void *
+vlib_physmem_alloc (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+		    clib_error_t ** error, uword n_bytes)
+{
+  const uword alignment = CLIB_CACHE_LINE_BYTES;
+  return vlib_physmem_alloc_aligned (vm, idx, error, n_bytes, alignment);
+}
+
+always_inline void
+vlib_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx,
+		   void *mem)
+{				/* pass MEM to the os_physmem_free hook; no value returned */
+  vm->os_physmem_free (vm, idx, mem);
+}
+
+always_inline u64
+vlib_physmem_virtual_to_physical (vlib_main_t * vm,
+				  vlib_physmem_region_index_t idx, void *mem)
+{				/* physical address for virtual address MEM in region IDX */
+  vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx);
+  /* Go through uword: arithmetic on void * is a GNU extension. */
+  uword o = pointer_to_uword (mem) - pointer_to_uword (pr->mem);
+  return vlib_physmem_offset_to_physical (vm, idx, o);
+}
+
+
+always_inline clib_error_t *
+vlib_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size,
+			   u8 numa_node, u32 flags,
+			   vlib_physmem_region_index_t * idx)
+{				/* forwards unchanged to the os_physmem_region_alloc hook */
+  return vm->os_physmem_region_alloc (vm, name, size, numa_node, flags, idx);
+}
+
+/* Invoke the os_physmem_region_free hook for region IDX. */
+always_inline void
+vlib_physmem_region_free (vlib_main_t * vm, vlib_physmem_region_index_t idx)
+{
+  vm->os_physmem_region_free (vm, idx);
+}
+
+#endif /* included_vlib_physmem_funcs_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
new file mode 100644
index 00000000..be8daa64
--- /dev/null
+++ b/src/vlib/threads.c
@@ -0,0 +1,1820 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#define _GNU_SOURCE
+
+#include <signal.h>
+#include <math.h>
+#include <vppinfra/format.h>
+#include <vlib/vlib.h>
+
+#include <vlib/threads.h>
+#include <vlib/unix/cj.h>
+
+DECLARE_CJ_GLOBAL_LOG;
+
+#define FRAME_QUEUE_NELTS 32
+
+u32
+vl (void *p)
+{				/* vec_len () of P, as a real function */
+  return vec_len (p);
+}
+
+vlib_worker_thread_t *vlib_worker_threads;
+vlib_thread_main_t vlib_thread_main;
+
+/*
+ * Barrier tracing can be enabled on a normal build to collect information
+ * on barrier use, including timings and call stacks.  Deliberately not
+ * keyed off CLIB_DEBUG, because that can add significant overhead which
+ * impacts observed timings.
+ */
+
+#ifdef BARRIER_TRACING
+ /*
+  * Output of barrier tracing can be to syslog or elog as suits
+  */
+#ifdef BARRIER_TRACING_ELOG
+/* Map MSG_NAME to its elog string id, registering the name with
+   elog_string () on first use and caching the id in a static hash. */
+static u32
+elog_id_for_msg_name (const char *msg_name)
+{
+  static uword *id_by_name;
+  uword *hit, id;
+  u8 *key;
+
+  if (id_by_name == 0)
+    id_by_name = hash_create_string (0, sizeof (uword));
+
+  hit = hash_get_mem (id_by_name, msg_name);
+  if (hit != 0)
+    return hit[0];
+  id = elog_string (&vlib_global_main.elog_main, "%s", msg_name);
+  /* Key the cache with a private NUL-terminated copy of the name. */
+  key = format (0, "%s%c", msg_name, 0);
+  hash_set_mem (id_by_name, key, id);
+  return id;
+}
+
+  /*
+   * elog Barrier trace functions, which are nulled out if BARRIER_TRACING isn't
+   * defined
+   */
+
+/* Log one elog event for a completed barrier sync; the f64 timings are
+   scaled by 1e6 (the format string labels them "us"). */
+static inline void
+barrier_trace_sync (f64 t_entry, f64 t_open, f64 t_closed)
+{
+    /* *INDENT-OFF* */
+    ELOG_TYPE_DECLARE (e) =
+      {
+        .format = "barrier <%d#%s(O:%dus:%dus)(%dus)",
+        .format_args = "i4T4i4i4i4",
+      };
+    /* *INDENT-ON* */
+  struct
+  {
+    u32 count, caller, t_entry, t_open, t_closed;
+  } *ev = ELOG_DATA (&vlib_global_main.elog_main, e);
+  ev->count = (int) vlib_worker_threads[0].barrier_sync_count;
+  ev->caller = elog_id_for_msg_name (vlib_worker_threads[0].barrier_caller);
+  ev->t_entry = (int) (1000000.0 * t_entry);
+  ev->t_open = (int) (1000000.0 * t_open);
+  ev->t_closed = (int) (1000000.0 * t_closed);
+}
+
+/* Log one elog event for a nested barrier sync: depth
+   (recursion_level - 1), entry time scaled by 1e6, and the caller id. */
+static inline void
+barrier_trace_sync_rec (f64 t_entry)
+{
+    /* *INDENT-OFF* */
+    ELOG_TYPE_DECLARE (e) =
+      {
+        .format = "barrier    <%d(%dus)%s",
+        .format_args = "i4i4T4",
+      };
+    /* *INDENT-ON* */
+  struct
+  {
+    u32 depth, t_entry, caller;
+  } *ev = ELOG_DATA (&vlib_global_main.elog_main, e);
+  ev->depth = (int) vlib_worker_threads[0].recursion_level - 1;
+  ev->t_entry = (int) (1000000.0 * t_entry);
+  ev->caller = elog_id_for_msg_name (vlib_worker_threads[0].barrier_caller);
+}
+
+/* Log one elog event for a nested barrier release: entry time scaled
+   by 1e6 and the current recursion_level. */
+static inline void
+barrier_trace_release_rec (f64 t_entry)
+{
+    /* *INDENT-OFF* */
+    ELOG_TYPE_DECLARE (e) =
+      {
+        .format = "barrier      (%dus)%d>",
+        .format_args = "i4i4",
+      };
+    /* *INDENT-ON* */
+  struct
+  {
+    u32 t_entry, depth;
+  } *ev = ELOG_DATA (&vlib_global_main.elog_main, e);
+  ev->t_entry = (int) (1000000.0 * t_entry);
+  ev->depth = (int) vlib_worker_threads[0].recursion_level;
+}
+
+/* Log one elog event for the outermost barrier release (timings scaled
+   by 1e6, plus the sync count), then clear barrier_context. */
+static inline void
+barrier_trace_release (f64 t_entry, f64 t_closed_total, f64 t_update_main)
+{
+    /* *INDENT-OFF* */
+    ELOG_TYPE_DECLARE (e) =
+      {
+        .format = "barrier   (%dus){%d}(C:%dus)#%d>",
+        .format_args = "i4i4i4i4",
+      };
+    /* *INDENT-ON* */
+  struct
+  {
+    u32 t_entry, t_update_main, t_closed_total, count;
+  } *ev = ELOG_DATA (&vlib_global_main.elog_main, e);
+  ev->t_entry = (int) (1000000.0 * t_entry);
+  ev->t_update_main = (int) (1000000.0 * t_update_main);
+  ev->t_closed_total = (int) (1000000.0 * t_closed_total);
+  ev->count = (int) vlib_worker_threads[0].barrier_sync_count;
+
+  /* Reset context for next trace */
+  vlib_worker_threads[0].barrier_context = NULL;
+}
+#else
+char barrier_trace[65536];
+char *btp = barrier_trace;
+
+  /*
+   * syslog Barrier trace functions, which are nulled out if BARRIER_TRACING
+   * isn't defined
+   */
+
+
+/* Append the opening record of a barrier sync to the barrier_trace text
+   buffer: sync count, caller, optional context, then three timings.
+   NOTE(review): sprintf () is unbounded; nothing here checks btp against
+   the end of barrier_trace. */
+static inline void
+barrier_trace_sync (f64 t_entry, f64 t_open, f64 t_closed)
+{
+  vlib_worker_thread_t *w0 = &vlib_worker_threads[0];
+
+  btp += sprintf (btp, "<%u#%s", (unsigned int) w0->barrier_sync_count,
+		  w0->barrier_caller);
+  if (w0->barrier_context)
+    btp += sprintf (btp, "[%s]", w0->barrier_context);
+
+  btp += sprintf (btp, "(O:%dus:%dus)(%dus):",
+		  (int) (1000000.0 * t_entry),
+		  (int) (1000000.0 * t_open), (int) (1000000.0 * t_closed));
+}
+
+/* Append a nested-sync record: depth, entry time and caller. */
+static inline void
+barrier_trace_sync_rec (f64 t_entry)
+{
+  vlib_worker_thread_t *w0 = &vlib_worker_threads[0];
+  btp += sprintf (btp, "<%u(%dus)%s:", (int) w0->recursion_level - 1,
+		  (int) (1000000.0 * t_entry), w0->barrier_caller);
+}
+
+static inline void
+barrier_trace_release_rec (f64 t_entry)
+{				/* append a nested-release record to the trace buffer */
+  btp += sprintf (btp, ":(%dus)%u>", (int) (1000000.0 * t_entry),
+		  (int) vlib_worker_threads[0].recursion_level);
+}
+
+/* Append the closing record of a barrier sync (entry time, optional
+   update-main time, total closed time, sync count), print the
+   accumulated line and rewind the buffer for the next trace. */
+static inline void
+barrier_trace_release (f64 t_entry, f64 t_closed_total, f64 t_update_main)
+{
+  btp += sprintf (btp, ":(%dus)", (int) (1000000.0 * t_entry));
+  if (t_update_main > 0)
+    btp += sprintf (btp, "{%dus}", (int) (1000000.0 * t_update_main));
+
+  btp += sprintf (btp, "(C:%dus)#%u>",
+		  (int) (1000000.0 * t_closed_total),
+		  (int) vlib_worker_threads[0].barrier_sync_count);
+
+  /* Emit through fformat () on stderr, then reset the trace state. */
+  fformat (stderr, "BTRC %s\n", barrier_trace);
+  btp = barrier_trace;
+  vlib_worker_threads[0].barrier_context = NULL;
+}
+#endif
+#else
+
+  /* No-op versions of the four trace hooks, built when BARRIER_TRACING is not defined */
+static inline void
+barrier_trace_sync (f64 t_entry, f64 t_open, f64 t_closed)
+{
+}
+
+static inline void
+barrier_trace_sync_rec (f64 t_entry)
+{
+}
+
+static inline void
+barrier_trace_release_rec (f64 t_entry)
+{
+}
+
+static inline void
+barrier_trace_release (f64 t_entry, f64 t_closed_total, f64 t_update_main)
+{
+}
+#endif
+
+/* Number of threads: vec_len (vlib_thread_stacks), but never less than 1. */
+uword
+os_get_nthreads (void)
+{
+  u32 n_stacks = vec_len (vlib_thread_stacks);
+
+  if (n_stacks != 0)
+    return n_stacks;
+
+  return 1;
+}
+
+/* Name the calling thread; a pthread_setname_np () failure is only warned. */
+void
+vlib_set_thread_name (char *name)
+{
+  int pthread_setname_np (pthread_t __target_thread, const char *__name);
+  pthread_t self = pthread_self ();
+  int rv;
+
+  if (!self)
+    return;
+  rv = pthread_setname_np (self, name);
+  if (rv)
+    clib_warning ("pthread_setname_np returned %d", rv);
+}
+
+/* Sort comparator: A0's no_data_structure_clone minus A1's. */
+static int
+sort_registrations_by_no_clone (void *a0, void *a1)
+{
+  vlib_thread_registration_t *lhs = *(vlib_thread_registration_t **) a0;
+  vlib_thread_registration_t *rhs = *(vlib_thread_registration_t **) a1;
+  return (i32) lhs->no_data_structure_clone
+    - (i32) rhs->no_data_structure_clone;
+}
+
+/* Parse the first line (at most 255 chars) of FILENAME with
+   unformat_bitmap_list and return the bitmap.  Returns 0 if the file
+   cannot be opened or read; a parse failure is only warned about. */
+static uword *
+clib_sysfs_list_to_bitmap (char *filename)
+{
+  unformat_input_t in;
+  uword *bitmap = 0;
+  u8 *line = 0;
+  FILE *fp = fopen (filename, "r");
+
+  if (fp == NULL)
+    return bitmap;
+
+  vec_validate (line, 256 - 1);
+  if (fgets ((char *) line, 256, fp))
+    {
+      unformat_init_string (&in, (char *) line, strlen ((char *) line));
+      if (unformat (&in, "%U", unformat_bitmap_list, &bitmap) != 1)
+	clib_warning ("unformat_bitmap_list failed");
+      unformat_free (&in);
+    }
+  vec_free (line);
+  fclose (fp);
+  return bitmap;
+}
+
+
+/* Called early in the init sequence: pins the main thread, fills in
+   vlib_worker_threads[0] and gives each thread registration a coremask. */
+clib_error_t *
+vlib_thread_init (vlib_main_t * vm)
+{
+  vlib_thread_main_t *tm = &vlib_thread_main;
+  vlib_worker_thread_t *w;
+  vlib_thread_registration_t *tr;
+  u32 n_vlib_mains = 1;
+  u32 first_index = 1;
+  u32 i;
+  uword *avail_cpu;
+
+  /* get bitmaps of active cpu cores and sockets */
+  tm->cpu_core_bitmap =
+    clib_sysfs_list_to_bitmap ("/sys/devices/system/cpu/online");
+  tm->cpu_socket_bitmap =
+    clib_sysfs_list_to_bitmap ("/sys/devices/system/node/online");
+
+  avail_cpu = clib_bitmap_dup (tm->cpu_core_bitmap);
+
+  /* skip cores (NOTE(review): error returns below never free avail_cpu) */
+  for (i = 0; i < tm->skip_cores; i++)
+    {
+      uword c = clib_bitmap_first_set (avail_cpu);
+      if (c == ~0)
+	return clib_error_return (0, "no available cpus to skip");
+
+      avail_cpu = clib_bitmap_set (avail_cpu, c, 0);
+    }
+
+  /* grab cpu for main thread */
+  if (!tm->main_lcore)
+    {
+      tm->main_lcore = clib_bitmap_first_set (avail_cpu);
+      if (tm->main_lcore == (u8) ~ 0)
+	return clib_error_return (0, "no available cpus to be used for the"
+				  " main thread");
+    }
+  else
+    {
+      if (clib_bitmap_get (avail_cpu, tm->main_lcore) == 0)
+	return clib_error_return (0, "cpu %u is not available to be used"
+				  " for the main thread", tm->main_lcore);
+    }
+  avail_cpu = clib_bitmap_set (avail_cpu, tm->main_lcore, 0);
+
+  /* assume that there is socket 0 only if there is no data from sysfs */
+  if (!tm->cpu_socket_bitmap)
+    tm->cpu_socket_bitmap = clib_bitmap_set (0, 0, 1);
+
+  /* pin main thread to main_lcore  */
+  if (tm->cb.vlib_thread_set_lcore_cb)
+    {
+      tm->cb.vlib_thread_set_lcore_cb (0, tm->main_lcore);
+    }
+  else
+    {
+      cpu_set_t cpuset;
+      CPU_ZERO (&cpuset);
+      CPU_SET (tm->main_lcore, &cpuset);
+      pthread_setaffinity_np (pthread_self (), sizeof (cpu_set_t), &cpuset);
+    }
+
+  /* as many threads as stacks... */
+  vec_validate_aligned (vlib_worker_threads, vec_len (vlib_thread_stacks) - 1,
+			CLIB_CACHE_LINE_BYTES);
+
+  /* Preallocate thread 0 */
+  _vec_len (vlib_worker_threads) = 1;
+  w = vlib_worker_threads;
+  w->thread_mheap = clib_mem_get_heap ();
+  w->thread_stack = vlib_thread_stacks[0];
+  w->lcore_id = tm->main_lcore;
+  w->lwp = syscall (SYS_gettid);
+  w->thread_id = pthread_self ();
+  tm->n_vlib_mains = 1;
+
+  if (tm->sched_policy != ~0)
+    {
+      struct sched_param sched_param;
+      if (!sched_getparam (w->lwp, &sched_param))
+	{
+	  if (tm->sched_priority != ~0)
+	    sched_param.sched_priority = tm->sched_priority;
+	  sched_setscheduler (w->lwp, tm->sched_policy, &sched_param);
+	}
+    }
+
+  /* assign threads to cores and set n_vlib_mains */
+  tr = tm->next;
+
+  while (tr)
+    {
+      vec_add1 (tm->registrations, tr);
+      tr = tr->next;
+    }
+
+  vec_sort_with_function (tm->registrations, sort_registrations_by_no_clone);
+
+  for (i = 0; i < vec_len (tm->registrations); i++)
+    {
+      int j;
+      tr = tm->registrations[i];
+      tr->first_index = first_index;
+      first_index += tr->count;
+      n_vlib_mains += (tr->no_data_structure_clone == 0) ? tr->count : 0;
+
+      /* construct coremask */
+      if (tr->use_pthreads || !tr->count)
+	continue;
+
+      if (tr->coremask)
+	{
+	  uword c;
+          /* *INDENT-OFF* */
+          clib_bitmap_foreach (c, tr->coremask, ({
+            if (clib_bitmap_get(avail_cpu, c) == 0)
+              return clib_error_return (0, "cpu %u is not available to be used"
+                                        " for the '%s' thread", (u32) c, tr->name);
+
+            avail_cpu = clib_bitmap_set(avail_cpu, c, 0);
+          }));
+/* *INDENT-ON* */
+
+	}
+      else
+	{
+	  for (j = 0; j < tr->count; j++)
+	    {
+	      uword c = clib_bitmap_first_set (avail_cpu);
+	      if (c == ~0)
+		return clib_error_return (0,
+					  "no available cpus to be used for"
+					  " the '%s' thread", tr->name);
+
+	      avail_cpu = clib_bitmap_set (avail_cpu, c, 0);
+	      tr->coremask = clib_bitmap_set (tr->coremask, c, 1);
+	    }
+	}
+    }
+
+  clib_bitmap_free (avail_cpu);
+
+  tm->n_vlib_mains = n_vlib_mains;
+
+  vec_validate_aligned (vlib_worker_threads, first_index - 1,
+			CLIB_CACHE_LINE_BYTES);
+
+  return 0;
+}
+
+/* Allocate a zeroed frame queue with NELTS ring slots.  NELTS must be a
+   positive power of 2 (abort otherwise); misalignment is only reported. */
+vlib_frame_queue_t *
+vlib_frame_queue_alloc (int nelts)
+{
+  vlib_frame_queue_t *fq;
+
+  /* Check first: nelts - 1 is used as a vector index right below. */
+  if (nelts <= 0 || (nelts & (nelts - 1)))
+    {
+      fformat (stderr, "FATAL: nelts MUST be a power of 2\n");
+      abort ();
+    }
+
+  fq = clib_mem_alloc_aligned (sizeof (*fq), CLIB_CACHE_LINE_BYTES);
+  memset (fq, 0, sizeof (*fq));
+  fq->nelts = nelts;
+  fq->vector_threshold = 128;	// packets
+  vec_validate_aligned (fq->elts, nelts - 1, CLIB_CACHE_LINE_BYTES);
+
+  if (((uword) & fq->tail) & (CLIB_CACHE_LINE_BYTES - 1))
+    fformat (stderr, "WARNING: fq->tail unaligned\n");
+  if (((uword) & fq->head) & (CLIB_CACHE_LINE_BYTES - 1))
+    fformat (stderr, "WARNING: fq->head unaligned\n");
+  if (((uword) fq->elts) & (CLIB_CACHE_LINE_BYTES - 1))
+    fformat (stderr, "WARNING: fq->elts unaligned\n");
+  if (sizeof (fq->elts[0]) % CLIB_CACHE_LINE_BYTES)
+    fformat (stderr, "WARNING: fq->elts[0] size %d\n",
+	     (int) sizeof (fq->elts[0]));
+
+  return (fq);
+}
+
+void vl_msg_api_handler_no_free (void *) __attribute__ ((weak));
+void
+vl_msg_api_handler_no_free (void *v)
+{				/* weak, empty default definition */
+}
+
+/* Disabled with #if 0 and never compiled; kept only as reference material... */
+#if 0
+static inline int
+vlib_frame_queue_dequeue_internal (int thread_id,
+				   vlib_main_t * vm, vlib_node_main_t * nm)
+{
+  vlib_frame_queue_t *fq = vlib_frame_queues[thread_id];
+  vlib_frame_queue_elt_t *elt;
+  vlib_frame_t *f;
+  vlib_pending_frame_t *p;
+  vlib_node_runtime_t *r;
+  u32 node_runtime_index;
+  int msg_type;
+  u64 before;
+  int processed = 0;
+
+  ASSERT (vm == vlib_mains[thread_id]);
+
+  while (1)
+    {
+      if (fq->head == fq->tail)
+	return processed;
+
+      elt = fq->elts + ((fq->head + 1) & (fq->nelts - 1));
+
+      if (!elt->valid)
+	return processed;
+
+      before = clib_cpu_time_now ();
+
+      f = elt->frame;
+      node_runtime_index = elt->node_runtime_index;
+      msg_type = elt->msg_type;
+
+      switch (msg_type)
+	{
+	case VLIB_FRAME_QUEUE_ELT_FREE_BUFFERS:
+	  vlib_buffer_free (vm, vlib_frame_vector_args (f), f->n_vectors);
+	  /* note fallthrough... */
+	case VLIB_FRAME_QUEUE_ELT_FREE_FRAME:
+	  r = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL],
+				node_runtime_index);
+	  vlib_frame_free (vm, r, f);
+	  break;
+	case VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME:
+	  vec_add2 (vm->node_main.pending_frames, p, 1);
+	  f->flags |= (VLIB_FRAME_PENDING | VLIB_FRAME_FREE_AFTER_DISPATCH);
+	  p->node_runtime_index = elt->node_runtime_index;
+	  p->frame_index = vlib_frame_index (vm, f);
+	  p->next_frame_index = VLIB_PENDING_FRAME_NO_NEXT_FRAME;
+	  fq->dequeue_vectors += (u64) f->n_vectors;
+	  break;
+	case VLIB_FRAME_QUEUE_ELT_API_MSG:
+	  vl_msg_api_handler_no_free (f);
+	  break;
+	default:
+	  clib_warning ("bogus frame queue message, type %d", msg_type);
+	  break;
+	}
+      elt->valid = 0;
+      fq->dequeues++;
+      fq->dequeue_ticks += clib_cpu_time_now () - before;
+      CLIB_MEMORY_BARRIER ();
+      fq->head++;
+      processed++;
+    }
+  ASSERT (0);
+  return processed;
+}
+
+int
+vlib_frame_queue_dequeue (int thread_id,
+			  vlib_main_t * vm, vlib_node_main_t * nm)
+{
+  return vlib_frame_queue_dequeue_internal (thread_id, vm, nm);
+}
+
+int
+vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index,
+			  u32 frame_queue_index, vlib_frame_t * frame,
+			  vlib_frame_queue_msg_type_t type)
+{
+  vlib_frame_queue_t *fq = vlib_frame_queues[frame_queue_index];
+  vlib_frame_queue_elt_t *elt;
+  u32 save_count;
+  u64 new_tail;
+  u64 before = clib_cpu_time_now ();
+
+  ASSERT (fq);
+
+  new_tail = __sync_add_and_fetch (&fq->tail, 1);
+
+  /* Wait until a ring slot is available */
+  while (new_tail >= fq->head + fq->nelts)
+    {
+      f64 b4 = vlib_time_now_ticks (vm, before);
+      vlib_worker_thread_barrier_check (vm, b4);
+      /* Bad idea. Dequeue -> enqueue -> dequeue -> trouble */
+      // vlib_frame_queue_dequeue (vm->thread_index, vm, nm);
+    }
+
+  elt = fq->elts + (new_tail & (fq->nelts - 1));
+
+  /* this would be very bad... */
+  while (elt->valid)
+    {
+    }
+
+  /* Once we enqueue the frame, frame->n_vectors is owned elsewhere... */
+  save_count = frame->n_vectors;
+
+  elt->frame = frame;
+  elt->node_runtime_index = node_runtime_index;
+  elt->msg_type = type;
+  CLIB_MEMORY_BARRIER ();
+  elt->valid = 1;
+
+  return save_count;
+}
+#endif /* 0 */
+
+/* To be called by vlib worker threads upon startup: switch to the thread's
+   heap, name the thread, then take part in the initial barrier sync. */
+void
+vlib_worker_thread_init (vlib_worker_thread_t * w)
+{
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+
+  /*
+   * Note: disabling signals in worker threads as follows
+   * prevents the api post-mortem dump scheme from working
+   * {
+   *    sigset_t s;
+   *    sigfillset (&s);
+   *    pthread_sigmask (SIG_SETMASK, &s, 0);
+   *  }
+   */
+
+  clib_mem_set_heap (w->thread_mheap);
+
+  if (vec_len (tm->thread_prefix) && w->registration->short_name)
+    {
+      w->name = format (0, "%v_%s_%d%c", tm->thread_prefix,
+			w->registration->short_name, w->instance_id, '\0');
+      vlib_set_thread_name ((char *) w->name);
+    }
+
+  if (w->registration->use_pthreads)
+    return;
+
+  /* Initial barrier sync, for both worker and i/o threads */
+  clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, 1);
+
+  while (*vlib_worker_threads->wait_at_barrier)
+    ;
+
+  clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, -1);
+}
+
+void *
+vlib_worker_thread_bootstrap_fn (void *arg)
+{				/* thread entry: ARG is the vlib_worker_thread_t to run */
+  void *rv;
+  vlib_worker_thread_t *w = arg;
+  /* Record this thread's kernel tid (gettid) and its pthread id. */
+  w->lwp = syscall (SYS_gettid);
+  w->thread_id = pthread_self ();
+  /* The thread index is W's position in vlib_worker_threads. */
+  __os_thread_index = w - vlib_worker_threads;
+  /* Call thread_function (arg) through clib_calljmp with the end of W's stack. */
+  rv = (void *) clib_calljmp
+    ((uword (*)(uword)) w->thread_function,
+     (uword) arg, w->thread_stack + VLIB_THREAD_STACK_SIZE);
+  /* NOTREACHED, we hope */
+  return rv;
+}
+
+/* Start worker W on LCORE_ID running FP, via vlib_launch_thread_cb when it
+   is set and W does not use pthreads, else via pthread_create () + pinning. */
+static clib_error_t *
+vlib_launch_thread_int (void *fp, vlib_worker_thread_t * w, unsigned lcore_id)
+{
+  vlib_thread_main_t *tm = &vlib_thread_main;
+  void *(*fp_arg) (void *) = fp;
+  pthread_t worker;
+  cpu_set_t cpuset;
+
+  w->lcore_id = lcore_id;
+  if (tm->cb.vlib_launch_thread_cb && !w->registration->use_pthreads)
+    return tm->cb.vlib_launch_thread_cb (fp, (void *) w, lcore_id);
+
+  CPU_ZERO (&cpuset);
+  CPU_SET (lcore_id, &cpuset);
+
+  if (pthread_create (&worker, NULL /* attr */ , fp_arg, (void *) w))
+    return clib_error_return_unix (0, "pthread_create");
+
+  if (pthread_setaffinity_np (worker, sizeof (cpu_set_t), &cpuset))
+    return clib_error_return_unix (0, "pthread_setaffinity_np");
+
+  return 0;
+}
+
+static clib_error_t *
+start_workers (vlib_main_t * vm)
+{
+  int i, j;
+  vlib_worker_thread_t *w;
+  vlib_main_t *vm_clone;
+  void *oldheap;
+  vlib_thread_main_t *tm = &vlib_thread_main;
+  vlib_thread_registration_t *tr;
+  vlib_node_runtime_t *rt;
+  u32 n_vlib_mains = tm->n_vlib_mains;
+  u32 worker_thread_index;
+  u8 *main_heap = clib_mem_get_per_cpu_heap ();
+  mheap_t *main_heap_header = mheap_header (main_heap);
+
+  vec_reset_length (vlib_worker_threads);
+
+  /* Set up the main thread */
+  vec_add2_aligned (vlib_worker_threads, w, 1, CLIB_CACHE_LINE_BYTES);
+  w->elog_track.name = "main thread";
+  elog_track_register (&vm->elog_main, &w->elog_track);
+
+  if (vec_len (tm->thread_prefix))
+    {
+      w->name = format (0, "%v_main%c", tm->thread_prefix, '\0');
+      vlib_set_thread_name ((char *) w->name);
+    }
+
+  /*
+   * Truth of the matter: we always use at least two
+   * threads. So, make the main heap thread-safe
+   * and make the event log thread-safe.
+   */
+  main_heap_header->flags |= MHEAP_FLAG_THREAD_SAFE;
+  vm->elog_main.lock =
+    clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES);
+  vm->elog_main.lock[0] = 0;
+
+  if (n_vlib_mains > 1)
+    {
+      /* Replace hand-crafted length-1 vector with a real vector */
+      vlib_mains = 0;
+
+      vec_validate_aligned (vlib_mains, tm->n_vlib_mains - 1,
+			    CLIB_CACHE_LINE_BYTES);
+      _vec_len (vlib_mains) = 0;
+      vec_add1_aligned (vlib_mains, vm, CLIB_CACHE_LINE_BYTES);
+
+      vlib_worker_threads->wait_at_barrier =
+	clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES);
+      vlib_worker_threads->workers_at_barrier =
+	clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES);
+
+      vlib_worker_threads->node_reforks_required =
+	clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES);
+
+      /* Ask for an initial barrier sync */
+      *vlib_worker_threads->workers_at_barrier = 0;
+      *vlib_worker_threads->wait_at_barrier = 1;
+
+      /* Without update or refork */
+      *vlib_worker_threads->node_reforks_required = 0;
+      vm->need_vlib_worker_thread_node_runtime_update = 0;
+
+      /* init timing */
+      vm->barrier_epoch = 0;
+      vm->barrier_no_close_before = 0;
+
+      worker_thread_index = 1;
+
+      for (i = 0; i < vec_len (tm->registrations); i++)
+	{
+	  vlib_node_main_t *nm, *nm_clone;
+	  vlib_buffer_main_t *bm_clone;
+	  vlib_buffer_free_list_t *fl_clone, *fl_orig;
+	  vlib_buffer_free_list_t *orig_freelist_pool;
+	  int k;
+
+	  tr = tm->registrations[i];
+
+	  if (tr->count == 0)
+	    continue;
+
+	  for (k = 0; k < tr->count; k++)
+	    {
+	      vlib_node_t *n;
+
+	      vec_add2 (vlib_worker_threads, w, 1);
+	      if (tr->mheap_size)
+		w->thread_mheap =
+		  mheap_alloc (0 /* use VM */ , tr->mheap_size);
+	      else
+		w->thread_mheap = main_heap;
+
+	      w->thread_stack =
+		vlib_thread_stack_init (w - vlib_worker_threads);
+	      w->thread_function = tr->function;
+	      w->thread_function_arg = w;
+	      w->instance_id = k;
+	      w->registration = tr;
+
+	      w->elog_track.name =
+		(char *) format (0, "%s %d", tr->name, k + 1);
+	      vec_add1 (w->elog_track.name, 0);
+	      elog_track_register (&vm->elog_main, &w->elog_track);
+
+	      if (tr->no_data_structure_clone)
+		continue;
+
+	      /* Fork vlib_global_main et al. Look for bugs here */
+	      oldheap = clib_mem_set_heap (w->thread_mheap);
+
+	      vm_clone = clib_mem_alloc (sizeof (*vm_clone));
+	      clib_memcpy (vm_clone, vlib_mains[0], sizeof (*vm_clone));
+
+	      vm_clone->thread_index = worker_thread_index;
+	      vm_clone->heap_base = w->thread_mheap;
+	      vm_clone->mbuf_alloc_list = 0;
+	      vm_clone->init_functions_called =
+		hash_create (0, /* value bytes */ 0);
+	      memset (&vm_clone->random_buffer, 0,
+		      sizeof (vm_clone->random_buffer));
+
+	      nm = &vlib_mains[0]->node_main;
+	      nm_clone = &vm_clone->node_main;
+	      /* fork next frames array, preserving node runtime indices */
+	      nm_clone->next_frames = vec_dup (nm->next_frames);
+	      for (j = 0; j < vec_len (nm_clone->next_frames); j++)
+		{
+		  vlib_next_frame_t *nf = &nm_clone->next_frames[j];
+		  u32 save_node_runtime_index;
+		  u32 save_flags;
+
+		  save_node_runtime_index = nf->node_runtime_index;
+		  save_flags = nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH;
+		  vlib_next_frame_init (nf);
+		  nf->node_runtime_index = save_node_runtime_index;
+		  nf->flags = save_flags;
+		}
+
+	      /* fork the frame dispatch queue */
+	      nm_clone->pending_frames = 0;
+	      vec_validate (nm_clone->pending_frames, 10);	/* $$$$$?????? */
+	      _vec_len (nm_clone->pending_frames) = 0;
+
+	      /* fork nodes */
+	      nm_clone->nodes = 0;
+
+	      /* Allocate all nodes in single block for speed */
+	      n = clib_mem_alloc_no_fail (vec_len (nm->nodes) * sizeof (*n));
+
+	      for (j = 0; j < vec_len (nm->nodes); j++)
+		{
+		  clib_memcpy (n, nm->nodes[j], sizeof (*n));
+		  /* none of the copied nodes have enqueue rights given out */
+		  n->owner_node_index = VLIB_INVALID_NODE_INDEX;
+		  memset (&n->stats_total, 0, sizeof (n->stats_total));
+		  memset (&n->stats_last_clear, 0,
+			  sizeof (n->stats_last_clear));
+		  vec_add1 (nm_clone->nodes, n);
+		  n++;
+		}
+	      nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] =
+		vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]);
+	      vec_foreach (rt,
+			   nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL])
+	      {
+		vlib_node_t *n = vlib_get_node (vm, rt->node_index);
+		rt->thread_index = vm_clone->thread_index;
+		/* copy initial runtime_data from node */
+		if (n->runtime_data && n->runtime_data_bytes > 0)
+		  clib_memcpy (rt->runtime_data, n->runtime_data,
+			       clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
+					 n->runtime_data_bytes));
+	      }
+
+	      nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] =
+		vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]);
+	      vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT])
+	      {
+		vlib_node_t *n = vlib_get_node (vm, rt->node_index);
+		rt->thread_index = vm_clone->thread_index;
+		/* copy initial runtime_data from node */
+		if (n->runtime_data && n->runtime_data_bytes > 0)
+		  clib_memcpy (rt->runtime_data, n->runtime_data,
+			       clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
+					 n->runtime_data_bytes));
+	      }
+
+	      nm_clone->processes = vec_dup (nm->processes);
+
+	      /* zap the (per worker) frame freelists, etc */
+	      nm_clone->frame_sizes = 0;
+	      nm_clone->frame_size_hash = hash_create (0, sizeof (uword));
+
+	      /* Packet trace buffers are guaranteed to be empty, nothing to do here */
+
+	      clib_mem_set_heap (oldheap);
+	      vec_add1_aligned (vlib_mains, vm_clone, CLIB_CACHE_LINE_BYTES);
+
+	      vm_clone->error_main.counters =
+		vec_dup (vlib_mains[0]->error_main.counters);
+	      vm_clone->error_main.counters_last_clear =
+		vec_dup (vlib_mains[0]->error_main.counters_last_clear);
+
+	      /* Fork the vlib_buffer_main_t free lists, etc. */
+	      bm_clone = vec_dup (vm_clone->buffer_main);
+	      vm_clone->buffer_main = bm_clone;
+
+	      orig_freelist_pool = bm_clone->buffer_free_list_pool;
+	      bm_clone->buffer_free_list_pool = 0;
+
+            /* *INDENT-OFF* */
+            pool_foreach (fl_orig, orig_freelist_pool,
+                          ({
+                            pool_get_aligned (bm_clone->buffer_free_list_pool,
+                                              fl_clone, CLIB_CACHE_LINE_BYTES);
+                            ASSERT (fl_orig - orig_freelist_pool
+                                    == fl_clone - bm_clone->buffer_free_list_pool);
+
+                            fl_clone[0] = fl_orig[0];
+                            fl_clone->buffers = 0;
+                            fl_clone->n_alloc = 0;
+                          }));
+/* *INDENT-ON* */
+
+	      worker_thread_index++;
+	    }
+	}
+    }
+  else
+    {
+      /* only have non-data-structure copy threads to create... */
+      for (i = 0; i < vec_len (tm->registrations); i++)
+	{
+	  tr = tm->registrations[i];
+
+	  for (j = 0; j < tr->count; j++)
+	    {
+	      vec_add2 (vlib_worker_threads, w, 1);
+	      if (tr->mheap_size)
+		w->thread_mheap =
+		  mheap_alloc (0 /* use VM */ , tr->mheap_size);
+	      else
+		w->thread_mheap = main_heap;
+	      w->thread_stack =
+		vlib_thread_stack_init (w - vlib_worker_threads);
+	      w->thread_function = tr->function;
+	      w->thread_function_arg = w;
+	      w->instance_id = j;
+	      w->elog_track.name =
+		(char *) format (0, "%s %d", tr->name, j + 1);
+	      w->registration = tr;
+	      vec_add1 (w->elog_track.name, 0);
+	      elog_track_register (&vm->elog_main, &w->elog_track);
+	    }
+	}
+    }
+
+  worker_thread_index = 1;
+
+  for (i = 0; i < vec_len (tm->registrations); i++)
+    {
+      clib_error_t *err;
+      int j;
+
+      tr = tm->registrations[i];
+
+      if (tr->use_pthreads || tm->use_pthreads)
+	{
+	  for (j = 0; j < tr->count; j++)
+	    {
+	      w = vlib_worker_threads + worker_thread_index++;
+	      err = vlib_launch_thread_int (vlib_worker_thread_bootstrap_fn,
+					    w, 0);
+	      if (err)
+		clib_error_report (err);
+	    }
+	}
+      else
+	{
+	  uword c;
+          /* *INDENT-OFF* */
+          clib_bitmap_foreach (c, tr->coremask, ({
+            w = vlib_worker_threads + worker_thread_index++;
+	    err = vlib_launch_thread_int (vlib_worker_thread_bootstrap_fn,
+					  w, c);
+	    if (err)
+	      clib_error_report (err);
+          }));
+          /* *INDENT-ON* */
+	}
+    }
+  vlib_worker_thread_barrier_sync (vm);
+  vlib_worker_thread_barrier_release (vm);
+  return 0;
+}
+
+/*
+ * Register start_workers through VLIB_MAIN_LOOP_ENTER_FUNCTION; presumably
+ * this makes it run as the main loop is entered -- confirm against the macro.
+ */
+VLIB_MAIN_LOOP_ENTER_FUNCTION (start_workers);
+
+
+/*
+ * Main-thread half of a node runtime update.
+ *
+ * Must run on thread 0 while the barrier is held (both are asserted).
+ * Flushes the statistics held by the main thread's nodes and by every
+ * clone's node runtimes; the structural rebuild of each clone is left to
+ * the owning thread.
+ */
+static inline void
+worker_thread_node_runtime_update_internal (void)
+{
+  /* Local prototype for the stats flush routine used on the clones */
+  never_inline void
+    vlib_node_runtime_sync_stats (vlib_main_t * vm,
+				  vlib_node_runtime_t * r,
+				  uword n_calls,
+				  uword n_vectors, uword n_clocks);
+  vlib_main_t *main_vm;
+  vlib_node_main_t *main_nm;
+  int thread, node;
+
+  ASSERT (vlib_get_thread_index () == 0);
+
+  main_vm = vlib_mains[0];
+  main_nm = &main_vm->node_main;
+
+  ASSERT (*vlib_worker_threads->wait_at_barrier == 1);
+
+  /*
+   * Scrape the main thread's stats first so that no node runtime with
+   * pending counts is lost ...
+   */
+  for (node = 0; node < vec_len (main_nm->nodes); node++)
+    vlib_node_sync_stats (main_vm, main_nm->nodes[node]);
+
+  /* ... then do the same for every clone (index 0 is the main thread) */
+  for (thread = 1; thread < vec_len (vlib_mains); thread++)
+    {
+      vlib_main_t *worker_vm = vlib_mains[thread];
+      vlib_node_main_t *worker_nm = &worker_vm->node_main;
+
+      for (node = 0; node < vec_len (worker_nm->nodes); node++)
+	{
+	  vlib_node_runtime_t *runtime =
+	    vlib_node_get_runtime (worker_vm, worker_nm->nodes[node]->index);
+
+	  /* zero deltas: only fold in what the runtime already holds */
+	  vlib_node_runtime_sync_stats (worker_vm, runtime, 0, 0, 0);
+	}
+    }
+
+  /* Per-worker clone rebuilds are now done on each thread */
+}
+
+
+/*
+ * Rebuild the clone's runtime vector for one node type from the main
+ * thread's vector, then carry over the state and runtime_data of every
+ * runtime that was present in the clone's previous vector.
+ *
+ * vm        - main thread vlib_main_t, source of the new runtimes
+ * vm_clone  - vlib_main_t being re-forked
+ * node_type - index into nodes_by_type[] (a VLIB_NODE_TYPE_* value)
+ */
+static void
+worker_thread_reclone_runtimes (vlib_main_t * vm, vlib_main_t * vm_clone,
+				u32 node_type)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_node_main_t *nm_clone = &vm_clone->node_main;
+  vlib_node_runtime_t *rt, *old_rt;
+  int j;
+
+  old_rt = nm_clone->nodes_by_type[node_type];
+  nm_clone->nodes_by_type[node_type] =
+    vec_dup (nm->nodes_by_type[node_type]);
+
+  vec_foreach (rt, nm_clone->nodes_by_type[node_type])
+  {
+    vlib_node_t *n = vlib_get_node (vm, rt->node_index);
+    rt->thread_index = vm_clone->thread_index;
+    /* copy runtime_data, will be overwritten later for existing rt */
+    if (n->runtime_data && n->runtime_data_bytes > 0)
+      clib_memcpy (rt->runtime_data, n->runtime_data,
+		   clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
+			     n->runtime_data_bytes));
+  }
+
+  /* restore per-thread state for the runtimes that already existed */
+  for (j = 0; j < vec_len (old_rt); j++)
+    {
+      rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index);
+      rt->state = old_rt[j].state;
+      clib_memcpy (rt->runtime_data, old_rt[j].runtime_data,
+		   VLIB_NODE_RUNTIME_DATA_SIZE);
+    }
+
+  vec_free (old_rt);
+}
+
+/*
+ * Re-fork the calling thread's vlib_main_t clone from the main thread's
+ * (vlib_mains[0]) node graph.
+ *
+ * The clone is obtained with vlib_get_main (), so this operates on the
+ * calling thread's own vlib_main_t.  It refreshes the error main (keeping
+ * and, if needed, growing the clone's own counter vectors), the next-frame
+ * vector, the node vector, the internal and input runtime vectors and the
+ * process vector.  Stats and state of nodes that already existed in the
+ * clone are preserved; nodes that are new start with zeroed stats.
+ */
+void
+vlib_worker_thread_node_refork (void)
+{
+  vlib_main_t *vm, *vm_clone;
+  vlib_node_main_t *nm, *nm_clone;
+  vlib_node_t **old_nodes_clone;
+  vlib_node_t *new_n_clone;
+  int j;
+
+  vm = vlib_mains[0];
+  nm = &vm->node_main;
+  vm_clone = vlib_get_main ();
+  nm_clone = &vm_clone->node_main;
+
+  /*
+   * Re-clone error heap: copy the whole error main, then put back this
+   * thread's own counter vectors, extended to the main thread's length.
+   */
+  u64 *old_counters = vm_clone->error_main.counters;
+  u64 *old_counters_all_clear = vm_clone->error_main.counters_last_clear;
+
+  clib_memcpy (&vm_clone->error_main, &vm->error_main,
+	       sizeof (vm->error_main));
+  j = vec_len (vm->error_main.counters) - 1;
+  vec_validate_aligned (old_counters, j, CLIB_CACHE_LINE_BYTES);
+  vec_validate_aligned (old_counters_all_clear, j, CLIB_CACHE_LINE_BYTES);
+  vm_clone->error_main.counters = old_counters;
+  vm_clone->error_main.counters_last_clear = old_counters_all_clear;
+
+  /* re-fork next frames array, preserving node runtime indices */
+  vec_free (nm_clone->next_frames);
+  nm_clone->next_frames = vec_dup (nm->next_frames);
+
+  for (j = 0; j < vec_len (nm_clone->next_frames); j++)
+    {
+      vlib_next_frame_t *nf = &nm_clone->next_frames[j];
+      u32 save_node_runtime_index;
+      u32 save_flags;
+
+      save_node_runtime_index = nf->node_runtime_index;
+      save_flags = nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH;
+      vlib_next_frame_init (nf);
+      nf->node_runtime_index = save_node_runtime_index;
+      nf->flags = save_flags;
+    }
+
+  old_nodes_clone = nm_clone->nodes;
+  nm_clone->nodes = 0;
+
+  /* re-fork nodes */
+
+  /* Allocate all nodes in single block for speed */
+  new_n_clone =
+    clib_mem_alloc_no_fail (vec_len (nm->nodes) * sizeof (*new_n_clone));
+  for (j = 0; j < vec_len (nm->nodes); j++)
+    {
+      vlib_node_t *new_n = nm->nodes[j];
+
+      clib_memcpy (new_n_clone, new_n, sizeof (*new_n));
+      /* none of the copied nodes have enqueue rights given out */
+      new_n_clone->owner_node_index = VLIB_INVALID_NODE_INDEX;
+
+      if (j >= vec_len (old_nodes_clone))
+	{
+	  /* new node, set to zero */
+	  memset (&new_n_clone->stats_total, 0,
+		  sizeof (new_n_clone->stats_total));
+	  memset (&new_n_clone->stats_last_clear, 0,
+		  sizeof (new_n_clone->stats_last_clear));
+	}
+      else
+	{
+	  /*
+	   * Only index the old vector once j is known to be in range;
+	   * nodes added since the last fork have no old clone.
+	   */
+	  vlib_node_t *old_n_clone = old_nodes_clone[j];
+
+	  /* Copy stats if the old data is valid */
+	  clib_memcpy (&new_n_clone->stats_total,
+		       &old_n_clone->stats_total,
+		       sizeof (new_n_clone->stats_total));
+	  clib_memcpy (&new_n_clone->stats_last_clear,
+		       &old_n_clone->stats_last_clear,
+		       sizeof (new_n_clone->stats_last_clear));
+
+	  /* keep previous node state */
+	  new_n_clone->state = old_n_clone->state;
+	}
+      vec_add1 (nm_clone->nodes, new_n_clone);
+      new_n_clone++;
+    }
+
+  /*
+   * Free the old node clones: they were allocated as one block, so the
+   * first element's address is the block to release.
+   */
+  if (vec_len (old_nodes_clone) > 0)
+    clib_mem_free (old_nodes_clone[0]);
+
+  vec_free (old_nodes_clone);
+
+  /* re-clone internal nodes */
+  worker_thread_reclone_runtimes (vm, vm_clone, VLIB_NODE_TYPE_INTERNAL);
+
+  /* re-clone input nodes */
+  worker_thread_reclone_runtimes (vm, vm_clone, VLIB_NODE_TYPE_INPUT);
+
+  /* drop the previous copy before duplicating, as done for next_frames */
+  vec_free (nm_clone->processes);
+  nm_clone->processes = vec_dup (nm->processes);
+}
+
+/*
+ * Request a node runtime update.  Nothing is rebuilt here: the flag set on
+ * vlib_global_main is tested by vlib_worker_thread_barrier_release (), which
+ * performs the update before it lets the workers go.
+ */
+void
+vlib_worker_thread_node_runtime_update (void)
+{
+  vlib_main_t *global_vm = &vlib_global_main;
+
+  global_vm->need_vlib_worker_thread_node_runtime_update = 1;
+}
+
+/*
+ * unformat helper: match one of the scheduler policy keywords listed in
+ * foreach_sched_policy and store the corresponding SCHED_POLICY_* value
+ * through the u32 pointer taken from args.
+ *
+ * Returns 1 when a keyword matched, 0 otherwise (the output is untouched).
+ */
+u32
+unformat_sched_policy (unformat_input_t * input, va_list * args)
+{
+  u32 *policy = va_arg (*args, u32 *);
+
+  /* one early-returning test per policy, in table order */
+#define _(v,f,s)				\
+  if (unformat (input, s))			\
+    {						\
+      *policy = SCHED_POLICY_##f;		\
+      return 1;					\
+    }
+  foreach_sched_policy
+#undef _
+    return 0;
+}
+
+/*
+ * Parse the "cpu" configuration section into vlib_thread_main.
+ *
+ * Builds a name -> registration hash from the tm->next registration list,
+ * then consumes options until one fails to match.  Afterwards it validates
+ * the scheduler priority, defaults the thread prefix to "vpp" and totals
+ * the thread / pthread / stack counts over all registrations.
+ *
+ * Returns 0 on success, or a clib_error_t describing the offending option.
+ */
+static clib_error_t *
+cpu_config (vlib_main_t * vm, unformat_input_t * input)
+{
+  vlib_thread_registration_t *tr;
+  uword *p;
+  vlib_thread_main_t *tm = &vlib_thread_main;
+  u8 *name;
+  u64 coremask;
+  uword *bitmap;
+  u32 count;
+
+  tm->thread_registrations_by_name = hash_create_string (0, sizeof (uword));
+
+  tm->n_thread_stacks = 1;	/* account for main thread */
+  /* ~0 marks "not configured" for both scheduler settings */
+  tm->sched_policy = ~0;
+  tm->sched_priority = ~0;
+
+  tr = tm->next;
+
+  /* index every registration by name for the lookups below */
+  while (tr)
+    {
+      hash_set_mem (tm->thread_registrations_by_name, tr->name, (uword) tr);
+      tr = tr->next;
+    }
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "use-pthreads"))
+	tm->use_pthreads = 1;
+      else if (unformat (input, "thread-prefix %v", &tm->thread_prefix))
+	;
+      else if (unformat (input, "main-core %u", &tm->main_lcore))
+	;
+      else if (unformat (input, "skip-cores %u", &tm->skip_cores))
+	;
+      /* coremask-<thread type> <hex mask>: cores given as a bit mask */
+      else if (unformat (input, "coremask-%s %llx", &name, &coremask))
+	{
+	  p = hash_get_mem (tm->thread_registrations_by_name, name);
+	  if (p == 0)
+	    return clib_error_return (0, "no such thread type '%s'", name);
+
+	  tr = (vlib_thread_registration_t *) p[0];
+
+	  if (tr->use_pthreads)
+	    return clib_error_return (0,
+				      "coremask cannot be set for '%s' threads",
+				      name);
+
+	  tr->coremask = clib_bitmap_set_multiple
+	    (tr->coremask, 0, coremask, BITS (coremask));
+	  /* one thread per selected core */
+	  tr->count = clib_bitmap_count_set_bits (tr->coremask);
+	}
+      /* corelist-<thread type> <list>: cores parsed by unformat_bitmap_list */
+      else if (unformat (input, "corelist-%s %U", &name, unformat_bitmap_list,
+			 &bitmap))
+	{
+	  p = hash_get_mem (tm->thread_registrations_by_name, name);
+	  if (p == 0)
+	    return clib_error_return (0, "no such thread type '%s'", name);
+
+	  tr = (vlib_thread_registration_t *) p[0];
+
+	  if (tr->use_pthreads)
+	    return clib_error_return (0,
+				      "corelist cannot be set for '%s' threads",
+				      name);
+
+	  tr->coremask = bitmap;
+	  tr->count = clib_bitmap_count_set_bits (tr->coremask);
+	}
+      else
+	if (unformat
+	    (input, "scheduler-policy %U", unformat_sched_policy,
+	     &tm->sched_policy))
+	;
+      else if (unformat (input, "scheduler-priority %u", &tm->sched_priority))
+	;
+      /*
+       * <thread type> <count>: generic "name number" pattern, tried after
+       * all the specific keywords above.
+       */
+      else if (unformat (input, "%s %u", &name, &count))
+	{
+	  p = hash_get_mem (tm->thread_registrations_by_name, name);
+	  if (p == 0)
+	    return clib_error_return (0, "no such thread type 3 '%s'", name);
+
+	  tr = (vlib_thread_registration_t *) p[0];
+	  if (tr->fixed_count)
+	    return clib_error_return
+	      (0, "number of %s threads not configurable", tr->name);
+	  tr->count = count;
+	}
+      else
+	break;
+    }
+
+  /* a priority is only accepted together with the FIFO or RR policy */
+  if (tm->sched_priority != ~0)
+    {
+      if (tm->sched_policy == SCHED_FIFO || tm->sched_policy == SCHED_RR)
+	{
+	  /* clamp the requested priority into the policy's valid range */
+	  u32 prio_max = sched_get_priority_max (tm->sched_policy);
+	  u32 prio_min = sched_get_priority_min (tm->sched_policy);
+	  if (tm->sched_priority > prio_max)
+	    tm->sched_priority = prio_max;
+	  if (tm->sched_priority < prio_min)
+	    tm->sched_priority = prio_min;
+	}
+      else
+	{
+	  return clib_error_return
+	    (0,
+	     "scheduling priority (%d) is not allowed for `normal` scheduling policy",
+	     tm->sched_priority);
+	}
+    }
+  tr = tm->next;
+
+  if (!tm->thread_prefix)
+    tm->thread_prefix = format (0, "vpp");
+
+  /* total up stacks, pthreads and non-pthread threads over all types */
+  while (tr)
+    {
+      tm->n_thread_stacks += tr->count;
+      tm->n_pthreads += tr->count * tr->use_pthreads;
+      tm->n_threads += tr->count * (tr->use_pthreads == 0);
+      tr = tr->next;
+    }
+
+  return 0;
+}
+
+/* Register cpu_config as the early config handler for the "cpu" section */
+VLIB_EARLY_CONFIG_FUNCTION (cpu_config, "cpu");
+
+/*
+ * On targets other than those listed in the condition below, define
+ * __sync_fetch_and_add_8 and __sync_add_and_fetch_8 as functions that
+ * report their own name on stderr and abort.  Presumably these exist only
+ * so that references to the 8-byte builtins still link -- confirm.
+ */
+#if !defined (__x86_64__) && !defined (__i386__) && !defined (__aarch64__) && !defined (__powerpc64__) && !defined(__arm__)
+void
+__sync_fetch_and_add_8 (void)
+{
+  fformat (stderr, "%s called\n", __FUNCTION__);
+  abort ();
+}
+
+void
+__sync_add_and_fetch_8 (void)
+{
+  fformat (stderr, "%s called\n", __FUNCTION__);
+  abort ();
+}
+#endif
+
+/*
+ * Default, do-nothing fork fixup.  Declared weak, so a non-weak definition
+ * of vnet_main_fixup elsewhere in the link replaces this one.
+ */
+void vnet_main_fixup (vlib_fork_fixup_t which) __attribute__ ((weak));
+void
+vnet_main_fixup (vlib_fork_fixup_t which)
+{
+}
+
+/*
+ * Run a fork fixup with the worker barrier held.
+ *
+ * Does nothing while vlib_mains is still unset.  Otherwise it must be
+ * called on thread 0; the only fixup handled is
+ * VLIB_WORKER_THREAD_FORK_FIXUP_NEW_SW_IF_INDEX, which is passed on to
+ * vnet_main_fixup ().  Any other value trips an ASSERT.
+ */
+void
+vlib_worker_thread_fork_fixup (vlib_fork_fixup_t which)
+{
+  vlib_main_t *vm = vlib_get_main ();
+
+  if (!vlib_mains)
+    return;
+
+  ASSERT (vlib_get_thread_index () == 0);
+
+  vlib_worker_thread_barrier_sync (vm);
+
+  if (which == VLIB_WORKER_THREAD_FORK_FIXUP_NEW_SW_IF_INDEX)
+    {
+      vnet_main_fixup (VLIB_WORKER_THREAD_FORK_FIXUP_NEW_SW_IF_INDEX);
+    }
+  else
+    {
+      /* unknown fixup request */
+      ASSERT (0);
+    }
+
+  vlib_worker_thread_barrier_release (vm);
+}
+
+  /*
+   * Enforce a minimum open time to minimize packet loss due to Rx overflow,
+   * based on a test-based heuristic that the barrier should be open for at
+   * least 3 times as long as it was closed (with an upper bound of 1ms,
+   * because by that point it is probably too late to make a difference).
+   */
+
+/* Upper bound on the enforced open time, in seconds */
+#ifndef BARRIER_MINIMUM_OPEN_LIMIT
+#define BARRIER_MINIMUM_OPEN_LIMIT 0.001
+#endif
+
+/* Enforced open time as a multiple of the preceding closed time */
+#ifndef BARRIER_MINIMUM_OPEN_FACTOR
+#define BARRIER_MINIMUM_OPEN_FACTOR 3
+#endif
+
+/*
+ * Close the worker barrier.  Must be called on thread 0.
+ *
+ * Returns immediately when there are no worker vlib_mains.  Calls nest:
+ * only the outermost call (recursion_level going 0 -> 1) does the work.
+ * That call first spins until vm->barrier_no_close_before has passed, then
+ * raises wait_at_barrier and spins until workers_at_barrier equals the
+ * number of worker vlib_mains.  If that takes longer than
+ * BARRIER_SYNC_TIMEOUT seconds it reports a deadlock and calls os_panic ().
+ */
+void
+vlib_worker_thread_barrier_sync_int (vlib_main_t * vm)
+{
+  f64 deadline;
+  f64 now;
+  f64 t_entry;
+  f64 t_open;
+  f64 t_closed;
+  u32 count;
+
+  if (vec_len (vlib_mains) < 2)
+    return;
+
+  ASSERT (vlib_get_thread_index () == 0);
+
+  /* every vlib_main except the main thread's own */
+  count = vec_len (vlib_mains) - 1;
+
+  /* Record entry relative to last close */
+  now = vlib_time_now (vm);
+  t_entry = now - vm->barrier_epoch;
+
+  /* Tolerate recursive calls */
+  if (++vlib_worker_threads[0].recursion_level > 1)
+    {
+      barrier_trace_sync_rec (t_entry);
+      return;
+    }
+
+  vlib_worker_threads[0].barrier_sync_count++;
+
+  /* Enforce minimum barrier open time to minimize packet loss */
+  ASSERT (vm->barrier_no_close_before <= (now + BARRIER_MINIMUM_OPEN_LIMIT));
+  /* busy-wait; 'now' ends up holding the time the wait finished */
+  while ((now = vlib_time_now (vm)) < vm->barrier_no_close_before)
+    ;
+
+  /* Record time of closure */
+  t_open = now - vm->barrier_epoch;
+  vm->barrier_epoch = now;
+
+  deadline = now + BARRIER_SYNC_TIMEOUT;
+
+  /* raise the barrier flag, then wait for all workers to report in */
+  *vlib_worker_threads->wait_at_barrier = 1;
+  while (*vlib_worker_threads->workers_at_barrier != count)
+    {
+      if ((now = vlib_time_now (vm)) > deadline)
+	{
+	  fformat (stderr, "%s: worker thread deadlock\n", __FUNCTION__);
+	  os_panic ();
+	}
+    }
+
+  /* time from raising the flag until the last worker arrived */
+  t_closed = now - vm->barrier_epoch;
+
+  barrier_trace_sync (t_entry, t_open, t_closed);
+
+}
+
+/*
+ * Open the worker barrier.  Must be called on thread 0.
+ *
+ * Returns immediately when there are no worker vlib_mains.  Only the call
+ * that brings recursion_level back to 0 does the work.  If a node runtime
+ * update was requested it is started here: stats are synced on the main
+ * thread and node_reforks_required is raised by the number of workers.
+ * The barrier flag is then cleared, and the function spins until
+ * workers_at_barrier reaches 0 and, when a refork was requested, until
+ * node_reforks_required reaches 0.  Each wait panics after
+ * BARRIER_SYNC_TIMEOUT seconds.  Finally it computes the earliest time the
+ * barrier may be closed again (barrier_no_close_before).
+ */
+void
+vlib_worker_thread_barrier_release (vlib_main_t * vm)
+{
+  f64 deadline;
+  f64 now;
+  f64 minimum_open;
+  f64 t_entry;
+  f64 t_closed_total;
+  f64 t_update_main = 0.0;
+  int refork_needed = 0;
+
+  if (vec_len (vlib_mains) < 2)
+    return;
+
+  ASSERT (vlib_get_thread_index () == 0);
+
+
+  now = vlib_time_now (vm);
+  t_entry = now - vm->barrier_epoch;
+
+  /* nested release: just unwind one level */
+  if (--vlib_worker_threads[0].recursion_level > 0)
+    {
+      barrier_trace_release_rec (t_entry);
+      return;
+    }
+
+  /* Update (all) node runtimes before releasing the barrier, if needed */
+  if (vm->need_vlib_worker_thread_node_runtime_update)
+    {
+      /* Do stats elements on main thread */
+      worker_thread_node_runtime_update_internal ();
+      vm->need_vlib_worker_thread_node_runtime_update = 0;
+
+      /* Do per thread rebuilds in parallel */
+      refork_needed = 1;
+      clib_smp_atomic_add (vlib_worker_threads->node_reforks_required,
+			   (vec_len (vlib_mains) - 1));
+      now = vlib_time_now (vm);
+      t_update_main = now - vm->barrier_epoch;
+    }
+
+  deadline = now + BARRIER_SYNC_TIMEOUT;
+
+  /* drop the barrier flag, then wait for every worker to leave */
+  *vlib_worker_threads->wait_at_barrier = 0;
+
+  while (*vlib_worker_threads->workers_at_barrier > 0)
+    {
+      if ((now = vlib_time_now (vm)) > deadline)
+	{
+	  fformat (stderr, "%s: worker thread deadlock\n", __FUNCTION__);
+	  os_panic ();
+	}
+    }
+
+  /* Wait for reforks before continuing */
+  if (refork_needed)
+    {
+      now = vlib_time_now (vm);
+
+      deadline = now + BARRIER_SYNC_TIMEOUT;
+
+      while (*vlib_worker_threads->node_reforks_required > 0)
+	{
+	  if ((now = vlib_time_now (vm)) > deadline)
+	    {
+	      fformat (stderr, "%s: worker thread refork deadlock\n",
+		       __FUNCTION__);
+	      os_panic ();
+	    }
+	}
+    }
+
+  /* barrier_epoch still holds the time of closure set by the sync call */
+  t_closed_total = now - vm->barrier_epoch;
+
+  /* stay open for FACTOR x the closed time, capped at the LIMIT */
+  minimum_open = t_closed_total * BARRIER_MINIMUM_OPEN_FACTOR;
+
+  if (minimum_open > BARRIER_MINIMUM_OPEN_LIMIT)
+    {
+      minimum_open = BARRIER_MINIMUM_OPEN_LIMIT;
+    }
+
+  vm->barrier_no_close_before = now + minimum_open;
+
+  /* Record barrier epoch (used to enforce minimum open time) */
+  vm->barrier_epoch = now;
+
+  barrier_trace_release (t_entry, t_closed_total, t_update_main);
+
+}
+
+/*
+ * Check the frame queue to see if any frames are available.
+ * If so, pull the packets off the frames and put them to
+ * the handoff node.
+ *
+ * vm  - vlib_main_t of the calling thread (asserted to match thread_index)
+ * fqm - frame queue main whose per-thread queue is drained into
+ *       fqm->node_index
+ *
+ * Returns the number of queue elements consumed.  Draining stops when the
+ * queue is empty, when the next element is not yet marked valid, or once
+ * at least fq->vector_threshold buffer indices have been handed on.
+ */
+int
+vlib_frame_queue_dequeue (vlib_main_t * vm, vlib_frame_queue_main_t * fqm)
+{
+  u32 thread_id = vm->thread_index;
+  vlib_frame_queue_t *fq = fqm->vlib_frame_queues[thread_id];
+  vlib_frame_queue_elt_t *elt;
+  u32 *from, *to;
+  vlib_frame_t *f;
+  int msg_type;
+  int processed = 0;
+  u32 n_left_to_node;
+  u32 vectors = 0;
+
+  ASSERT (fq);
+  ASSERT (vm == vlib_mains[thread_id]);
+
+  /* no destination node configured: nothing to do */
+  if (PREDICT_FALSE (fqm->node_index == ~0))
+    return 0;
+  /*
+   * Gather trace data for frame queues
+   */
+  if (PREDICT_FALSE (fq->trace))
+    {
+      frame_queue_trace_t *fqt;
+      frame_queue_nelt_counter_t *fqh;
+      u32 elix;
+
+      fqt = &fqm->frame_queue_traces[thread_id];
+
+      fqt->nelts = fq->nelts;
+      fqt->head = fq->head;
+      fqt->head_hint = fq->head_hint;
+      fqt->tail = fq->tail;
+      fqt->threshold = fq->vector_threshold;
+      fqt->n_in_use = fqt->tail - fqt->head;
+      if (fqt->n_in_use >= fqt->nelts)
+	{
+	  // if beyond max then use max
+	  fqt->n_in_use = fqt->nelts - 1;
+	}
+
+      /* Record the number of elements in use in the histogram */
+      fqh = &fqm->frame_queue_histogram[thread_id];
+      fqh->count[fqt->n_in_use]++;
+
+      /* Record a snapshot of the elements in use */
+      for (elix = 0; elix < fqt->nelts; elix++)
+	{
+	  elt = fq->elts + ((fq->head + 1 + elix) & (fq->nelts - 1));
+	  /* the "1 ||" makes this unconditional: every slot is recorded */
+	  if (1 || elt->valid)
+	    {
+	      fqt->n_vectors[elix] = elt->n_vectors;
+	    }
+	}
+      fqt->written = 1;
+    }
+
+  while (1)
+    {
+      /* queue empty */
+      if (fq->head == fq->tail)
+	{
+	  fq->head_hint = fq->head;
+	  return processed;
+	}
+
+      /*
+       * Next slot to consume; the mask assumes nelts is a power of two
+       * -- TODO confirm against the queue allocator.
+       */
+      elt = fq->elts + ((fq->head + 1) & (fq->nelts - 1));
+
+      /* slot not yet marked valid: stop here */
+      if (!elt->valid)
+	{
+	  fq->head_hint = fq->head;
+	  return processed;
+	}
+
+      from = elt->buffer_index;
+      msg_type = elt->msg_type;
+
+      ASSERT (msg_type == VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME);
+      ASSERT (elt->n_vectors <= VLIB_FRAME_SIZE);
+
+      f = vlib_get_frame_to_node (vm, fqm->node_index);
+
+      to = vlib_frame_vector_args (f);
+
+      n_left_to_node = elt->n_vectors;
+
+      /* copy buffer indices four at a time, then the remainder */
+      while (n_left_to_node >= 4)
+	{
+	  to[0] = from[0];
+	  to[1] = from[1];
+	  to[2] = from[2];
+	  to[3] = from[3];
+	  to += 4;
+	  from += 4;
+	  n_left_to_node -= 4;
+	}
+
+      while (n_left_to_node > 0)
+	{
+	  to[0] = from[0];
+	  to++;
+	  from++;
+	  n_left_to_node--;
+	}
+
+      vectors += elt->n_vectors;
+      f->n_vectors = elt->n_vectors;
+      vlib_put_frame_to_node (vm, fqm->node_index, f);
+
+      /* reset the slot before head is advanced past it */
+      elt->valid = 0;
+      elt->n_vectors = 0;
+      elt->msg_type = 0xfefefefe;
+      CLIB_MEMORY_BARRIER ();
+      fq->head++;
+      processed++;
+
+      /*
+       * Limit the number of packets pushed into the graph
+       */
+      if (vectors >= fq->vector_threshold)
+	{
+	  fq->head_hint = fq->head;
+	  return processed;
+	}
+    }
+  /* not reached: every exit from the loop above is a return */
+  ASSERT (0);
+  return processed;
+}
+
+/*
+ * Thread function of the "workers" thread type.
+ *
+ * arg is the vlib_worker_thread_t of the thread being started.  Performs
+ * the per-thread setup (worker init, clock init, switch to the thread's
+ * heap), runs the worker init functions once, reports any error they
+ * return, and then enters vlib_worker_loop ().
+ */
+void
+vlib_worker_thread_fn (void *arg)
+{
+  vlib_worker_thread_t *worker = (vlib_worker_thread_t *) arg;
+  vlib_thread_main_t *thread_main = vlib_get_thread_main ();
+  vlib_main_t *vm = vlib_get_main ();
+  clib_error_t *init_error;
+
+  ASSERT (vm->thread_index == vlib_get_thread_index ());
+
+  vlib_worker_thread_init (worker);
+  clib_time_init (&vm->clib_time);
+  clib_mem_set_heap (worker->thread_mheap);
+
+  /*
+   * With external thread management, keep servicing the barrier until
+   * worker_thread_release is set (i.e. the dpdk init sequence is complete)
+   */
+  while (thread_main->extern_thread_mgmt
+	 && thread_main->worker_thread_release == 0)
+    {
+      vlib_worker_thread_barrier_check ();
+    }
+
+  init_error =
+    vlib_call_init_exit_functions (vm,
+				   vm->worker_init_function_registrations,
+				   1 /* call_once */ );
+  if (init_error)
+    clib_error_report (init_error);
+
+  vlib_worker_loop (vm);
+}
+
+/*
+ * Thread registration for the "workers" thread type (short name "wk"),
+ * whose threads run vlib_worker_thread_fn.
+ */
+/* *INDENT-OFF* */
+VLIB_REGISTER_THREAD (worker_thread_reg, static) = {
+  .name = "workers",
+  .short_name = "wk",
+  .function = vlib_worker_thread_fn,
+};
+/* *INDENT-ON* */
+
+/*
+ * Create a new frame queue main that delivers to node_index, with one
+ * frame queue per vlib_main (tm->n_vlib_mains of them).
+ *
+ * frame_queue_nelts - elements per queue; 0 selects FRAME_QUEUE_NELTS
+ *
+ * Returns the index of the new entry in tm->frame_queue_mains.
+ */
+u32
+vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts)
+{
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  vlib_frame_queue_main_t *fqm;
+  int thread;
+
+  if (!frame_queue_nelts)
+    frame_queue_nelts = FRAME_QUEUE_NELTS;
+
+  vec_add2 (tm->frame_queue_mains, fqm, 1);
+  fqm->node_index = node_index;
+
+  /* size the vector up front, then reset its length and append below */
+  vec_validate (fqm->vlib_frame_queues, tm->n_vlib_mains - 1);
+  _vec_len (fqm->vlib_frame_queues) = 0;
+
+  thread = 0;
+  while (thread < tm->n_vlib_mains)
+    {
+      vlib_frame_queue_t *queue = vlib_frame_queue_alloc (frame_queue_nelts);
+
+      vec_add1 (fqm->vlib_frame_queues, queue);
+      thread++;
+    }
+
+  return (fqm - tm->frame_queue_mains);
+}
+
+/*
+ * Install an external thread launch callback.
+ *
+ * Only the first registration is accepted: it stores
+ * cb->vlib_launch_thread_cb and switches on extern_thread_mgmt.
+ *
+ * Returns 0 on success, -1 if external thread management is already on.
+ */
+int
+vlib_thread_cb_register (struct vlib_main_t *vm, vlib_thread_callbacks_t * cb)
+{
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  int rv = -1;
+
+  if (!tm->extern_thread_mgmt)
+    {
+      tm->cb.vlib_launch_thread_cb = cb->vlib_launch_thread_cb;
+      tm->extern_thread_mgmt = 1;
+      rv = 0;
+    }
+
+  return rv;
+}
+
+/*
+ * Signal a process event using the fields packed in args (node index,
+ * event type and event data).  Asserted to run on thread 0.
+ */
+void
+vlib_process_signal_event_mt_helper (vlib_process_signal_event_mt_args_t *
+				     args)
+{
+  vlib_main_t *vm;
+
+  ASSERT (vlib_get_thread_index () == 0);
+
+  vm = vlib_get_main ();
+  vlib_process_signal_event (vm, args->node_index, args->type_opaque,
+			     args->data);
+}
+
+/*
+ * Hook used by vlib_rpc_call_main_thread (); it is left NULL in this file
+ * and is invoked as void (*) (void *, u8 *, u32) when set.
+ */
+void *rpc_call_main_thread_cb_fn;
+
+/*
+ * Forward (callback, args, arg_size) to the installed RPC hook.  When no
+ * hook is installed the call is dropped and a warning is logged.
+ */
+void
+vlib_rpc_call_main_thread (void *callback, u8 * args, u32 arg_size)
+{
+  void (*rpc_hook) (void *, u8 *, u32);
+
+  if (!rpc_call_main_thread_cb_fn)
+    {
+      clib_warning ("BUG: rpc_call_main_thread_cb_fn NULL!");
+      return;
+    }
+
+  rpc_hook = rpc_call_main_thread_cb_fn;
+  rpc_hook (callback, args, arg_size);
+}
+
+/*
+ * Init function for this file.  It does no work and always returns 0
+ * (no error); it is registered through VLIB_INIT_FUNCTION below.
+ */
+clib_error_t *
+threads_init (vlib_main_t * vm)
+{
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (threads_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/threads.h b/src/vlib/threads.h
new file mode 100644
index 00000000..8931584b
--- /dev/null
+++ b/src/vlib/threads.h
@@ -0,0 +1,537 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef included_vlib_threads_h
+#define included_vlib_threads_h
+
+#include <vlib/main.h>
+#include <linux/sched.h>
+
+/*
+ * To enable detailed tracing of barrier usage, including call stacks and
+ * timings, define BARRIER_TRACING here or in relevant TAGS.  If also used
+ * with CLIB_DEBUG, timing will _not_ be representative of normal code
+ * execution.
+ *
+ */
+
+// #define BARRIER_TRACING 1
+
+/*
+ * Two options for barrier tracing output: syslog & elog.
+ */
+
+// #define BARRIER_TRACING_ELOG 1
+
+/* Vector of vlib_main_t pointers; index 0 is the main thread's */
+extern vlib_main_t **vlib_mains;
+
+void vlib_set_thread_name (char *name);
+
+/* arg is actually a vlib_worker_thread_t * */
+typedef void (vlib_thread_function_t) (void *arg);
+
+/* Describes one type of thread (e.g. "workers") and how many to start */
+typedef struct vlib_thread_registration_
+{
+  /* constructor generated list of thread registrations */
+  struct vlib_thread_registration_ *next;
+
+  /* config parameters */
+  char *name;			/* key used by the "cpu" config section */
+  char *short_name;
+  vlib_thread_function_t *function;	/* entry point of each thread */
+  uword mheap_size;		/* non-zero: give each thread its own mheap */
+  int fixed_count;		/* non-zero: count may not be configured */
+  u32 count;			/* number of threads of this type */
+  int no_data_structure_clone;	/* non-zero: thread gets no vlib_main clone */
+  u32 frame_queue_nelts;
+
+  /* All threads of this type run on pthreads */
+  int use_pthreads;
+  u32 first_index;
+  uword *coremask;		/* bitmap of cores, one thread per set bit */
+} vlib_thread_registration_t;
+
+/*
+ * Frames have their cpu / vlib_main_t index in the low-order N bits
+ * Make VLIB_MAX_CPUS a power-of-two, please...
+ */
+
+#ifndef VLIB_MAX_CPUS
+#define VLIB_MAX_CPUS 256
+#endif
+
+#if VLIB_MAX_CPUS > CLIB_MAX_MHEAPS
+#error Please increase number of per-cpu mheaps
+#endif
+
+#define VLIB_CPU_MASK (VLIB_MAX_CPUS - 1)	/* 0xff with the default VLIB_MAX_CPUS */
+#define VLIB_OFFSET_MASK (~VLIB_CPU_MASK)
+
+/* Per-thread stack size: 1 << 21 bytes (2 MiB) */
+#define VLIB_LOG2_THREAD_STACK_SIZE (21)
+#define VLIB_THREAD_STACK_SIZE (1<<VLIB_LOG2_THREAD_STACK_SIZE)
+
+/* Message types carried in vlib_frame_queue_elt_t.msg_type */
+typedef enum
+{
+  VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME,
+} vlib_frame_queue_msg_type_t;
+
+/* One slot of a frame queue: a batch of up to VLIB_FRAME_SIZE buffer indices */
+typedef struct
+{
+  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+  /* non-zero while the slot holds data the dequeue side may consume */
+  volatile u32 valid;
+  u32 msg_type;
+  /* number of entries of buffer_index in use */
+  u32 n_vectors;
+  u32 last_n_vectors;
+
+  /* 256 * 4 = 1024 bytes, even mult of cache line size */
+  u32 buffer_index[VLIB_FRAME_SIZE];
+}
+vlib_frame_queue_elt_t;
+
+/* Per-thread bookkeeping; element 0 of vlib_worker_threads is the main thread */
+typedef struct
+{
+  /* First cache line */
+  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+  /* set to 1 by barrier sync and back to 0 by barrier release */
+  volatile u32 *wait_at_barrier;
+  /* polled by the main thread during barrier sync and release */
+  volatile u32 *workers_at_barrier;
+
+  /* Second Cache Line */
+    CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
+  void *thread_mheap;
+  u8 *thread_stack;
+  void (*thread_function) (void *);
+  void *thread_function_arg;
+  /* nesting depth of barrier sync calls (used on entry 0) */
+  i64 recursion_level;
+  elog_track_t elog_track;
+  u32 instance_id;
+  vlib_thread_registration_t *registration;
+  u8 *name;
+  /* number of outermost barrier sync calls (counted on entry 0) */
+  u64 barrier_sync_count;
+#ifdef BARRIER_TRACING
+  const char *barrier_caller;
+  const char *barrier_context;
+#endif
+  /* raised by barrier release when a refork is needed; it waits for 0 */
+  volatile u32 *node_reforks_required;
+
+  long lwp;
+  int lcore_id;
+  pthread_t thread_id;
+} vlib_worker_thread_t;
+
+extern vlib_worker_thread_t *vlib_worker_threads;
+
+/*
+ * Ring of vlib_frame_queue_elt_t drained by vlib_frame_queue_dequeue ().
+ * Fields are grouped on separate cache lines by the side that uses them.
+ */
+typedef struct
+{
+  /* enqueue side */
+  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
+  volatile u64 tail;
+  u64 enqueues;
+  u64 enqueue_ticks;
+  u64 enqueue_vectors;
+  u32 enqueue_full_events;
+
+  /* dequeue side */
+    CLIB_CACHE_LINE_ALIGN_MARK (cacheline1);
+  /* advanced by the dequeue side after each consumed element */
+  volatile u64 head;
+  u64 dequeues;
+  u64 dequeue_ticks;
+  u64 dequeue_vectors;
+  /* non-zero: dequeue records trace and histogram data */
+  u64 trace;
+  /* dequeue stops once this many buffer indices were handed on */
+  u64 vector_threshold;
+
+  /* dequeue hint to enqueue side */
+    CLIB_CACHE_LINE_ALIGN_MARK (cacheline2);
+  volatile u64 head_hint;
+
+  /* read-only, constant, shared */
+    CLIB_CACHE_LINE_ALIGN_MARK (cacheline3);
+  vlib_frame_queue_elt_t *elts;
+  /* number of elements in elts; slots are indexed with (nelts - 1) as mask */
+  u32 nelts;
+}
+vlib_frame_queue_t;
+
+/* A set of frame queues, one per vlib_main, all feeding the same node */
+typedef struct
+{
+  /* node that dequeued frames are put to; ~0 disables dequeue */
+  u32 node_index;
+  /* indexed by thread index */
+  vlib_frame_queue_t **vlib_frame_queues;
+
+  /* for frame queue tracing */
+  frame_queue_trace_t *frame_queue_traces;
+  frame_queue_nelt_counter_t *frame_queue_histogram;
+} vlib_frame_queue_main_t;
+
+/* Arguments consumed by vlib_process_signal_event_mt_helper () */
+typedef struct
+{
+  uword node_index;
+  uword type_opaque;
+  uword data;
+} vlib_process_signal_event_mt_args_t;
+
+/* Called early, in thread 0's context */
+clib_error_t *vlib_thread_init (vlib_main_t * vm);
+
+int vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index,
+			      u32 frame_queue_index, vlib_frame_t * frame,
+			      vlib_frame_queue_msg_type_t type);
+
+/* Drain the calling thread's queue in fqm; returns elements consumed */
+int
+vlib_frame_queue_dequeue (vlib_main_t * vm, vlib_frame_queue_main_t * fqm);
+
+/* Request a node runtime update at the next barrier release */
+void vlib_worker_thread_node_runtime_update (void);
+
+void vlib_create_worker_threads (vlib_main_t * vm, int n,
+				 void (*thread_function) (void *));
+
+void vlib_worker_thread_init (vlib_worker_thread_t * w);
+/* Returns the index of the newly created frame queue main */
+u32 vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts);
+
+/* Check for a barrier sync request every 30ms */
+#define BARRIER_SYNC_DELAY (0.030000)
+
+/* Seconds the main thread waits on the workers before declaring deadlock */
+#if CLIB_DEBUG > 0
+/* long barrier timeout, for gdb... */
+#define BARRIER_SYNC_TIMEOUT (600.1)
+#else
+#define BARRIER_SYNC_TIMEOUT (1.0)
+#endif
+
+#ifdef BARRIER_TRACING
+#define vlib_worker_thread_barrier_sync(X) {vlib_worker_threads[0].barrier_caller=__FUNCTION__;vlib_worker_thread_barrier_sync_int(X);}
+#else
+#define vlib_worker_thread_barrier_sync(X) vlib_worker_thread_barrier_sync_int(X)
+#endif
+
+
+/* Barrier close / open; both must be called on thread 0 and may nest */
+void vlib_worker_thread_barrier_sync_int (vlib_main_t * vm);
+void vlib_worker_thread_barrier_release (vlib_main_t * vm);
+/* Rebuild the calling thread's vlib_main clone from vlib_mains[0] */
+void vlib_worker_thread_node_refork (void);
+
+/* Index of the calling thread, read from __os_thread_index (0 = main) */
+static_always_inline uword
+vlib_get_thread_index (void)
+{
+  return __os_thread_index;
+}
+
+/*
+ * Debug aid: in CLIB_DEBUG builds, print a warning on stderr when called
+ * from any thread other than thread 0.  Does nothing otherwise.
+ */
+always_inline void
+vlib_smp_unsafe_warning (void)
+{
+  if (CLIB_DEBUG > 0 && vlib_get_thread_index ())
+    fformat (stderr, "%s: SMP unsafe warning...\n", __FUNCTION__);
+}
+
+typedef enum			/* selector passed to vlib_worker_thread_fork_fixup () */
+{
+  VLIB_WORKER_THREAD_FORK_FIXUP_ILLEGAL = 0,	/* value 0 is reserved as invalid */
+  VLIB_WORKER_THREAD_FORK_FIXUP_NEW_SW_IF_INDEX,
+} vlib_fork_fixup_t;
+
+void vlib_worker_thread_fork_fixup (vlib_fork_fixup_t which);
+
+#define foreach_vlib_main(body) /* run body once per vlib_main */ \
+do {                                                    \
+  vlib_main_t ** __vlib_mains = 0, *this_vlib_main;     \
+  int ii;                                               \
+  /* Pass 1: snapshot non-NULL mains; NULL slots are skipped undereferenced */ \
+  for (ii = 0; ii < vec_len (vlib_mains); ii++)         \
+    {                                                   \
+      this_vlib_main = vlib_mains[ii];                  \
+      ASSERT (ii == 0 || this_vlib_main == 0 ||         \
+	      this_vlib_main->parked_at_barrier == 1);  \
+      if (this_vlib_main)                               \
+        vec_add1 (__vlib_mains, this_vlib_main);        \
+    }                                                   \
+  /* Pass 2: run body against each snapshotted main */  \
+  for (ii = 0; ii < vec_len (__vlib_mains); ii++)       \
+    {                                                   \
+      this_vlib_main = __vlib_mains[ii];                \
+      /* body uses this_vlib_main... */                 \
+      (body);                                           \
+    }                                                   \
+  vec_free (__vlib_mains);                              \
+} while (0);
+
+#define foreach_sched_policy /* _(system value, enum suffix, display name) */ \
+  _(SCHED_OTHER, OTHER, "other") \
+  _(SCHED_BATCH, BATCH, "batch") \
+  _(SCHED_IDLE, IDLE, "idle")   \
+  _(SCHED_FIFO, FIFO, "fifo")   \
+  _(SCHED_RR, RR, "rr")
+
+typedef enum
+{
+#define _(v,f,s) SCHED_POLICY_##f = v,	/* each enumerator takes the system constant's value */
+  foreach_sched_policy
+#undef _
+    SCHED_POLICY_N,		/* one past the last listed value (SCHED_RR), not an entry count */
+} sched_policy_t;
+
+typedef struct			/* hook table; the type accepted by vlib_thread_cb_register () */
+{
+  clib_error_t *(*vlib_launch_thread_cb) (void *fp, vlib_worker_thread_t * w,
+					  unsigned lcore_id);
+  clib_error_t *(*vlib_thread_set_lcore_cb) (u32 thread, u16 lcore);
+} vlib_thread_callbacks_t;
+
+typedef struct
+{
+  /* Linked list of registrations, pushed by VLIB_REGISTER_THREAD constructors */
+  vlib_thread_registration_t *next;
+
+  /* Vector of registrations, w/ non-data-structure clones at the top */
+  vlib_thread_registration_t **registrations;
+
+  uword *thread_registrations_by_name;
+
+  vlib_worker_thread_t *worker_threads;
+
+  /*
+   * Launch all threads as pthreads,
+   * not eal_rte_launch (strict affinity) threads
+   */
+  int use_pthreads;
+
+  /* Number of vlib_main / vnet_main clones */
+  u32 n_vlib_mains;
+
+  /* Number of thread stacks to create */
+  u32 n_thread_stacks;
+
+  /* Number of pthreads */
+  u32 n_pthreads;
+
+  /* Number of threads */
+  u32 n_threads;
+
+  /* Number of cores to skip, must match the core mask */
+  u32 skip_cores;
+
+  /* Thread prefix name */
+  u8 *thread_prefix;
+
+  /* main thread lcore */
+  u8 main_lcore;
+
+  /* Bitmap of available CPU cores */
+  uword *cpu_core_bitmap;
+
+  /* Bitmap of available CPU sockets (NUMA nodes) */
+  uword *cpu_socket_bitmap;
+
+  /* Worker handoff queues, a vector indexed by frame_queue_index */
+  vlib_frame_queue_main_t *frame_queue_mains;
+
+  /* worker thread initialization barrier */
+  volatile u32 worker_thread_release;
+
+  /* scheduling policy */
+  u32 sched_policy;
+
+  /* scheduling policy priority */
+  u32 sched_priority;
+
+  /* callbacks; the same type is accepted by vlib_thread_cb_register () */
+  vlib_thread_callbacks_t cb;
+  int extern_thread_mgmt;	/* NOTE(review): presumably set when threads are managed externally -- confirm */
+} vlib_thread_main_t;
+
+extern vlib_thread_main_t vlib_thread_main;
+
+#include <vlib/global_funcs.h>
+
+#define VLIB_REGISTER_THREAD(x,...) /* declares x; ends unterminated so an initializer can follow */ \
+  __VA_ARGS__ vlib_thread_registration_t x;             \
+static void __vlib_add_thread_registration_##x (void)   \
+  __attribute__((__constructor__)) ;                    \
+static void __vlib_add_thread_registration_##x (void)   \
+{ /* constructor: push x onto the list at tm->next */   \
+  vlib_thread_main_t * tm = &vlib_thread_main;          \
+  x.next = tm->next;                                    \
+  tm->next = &x;                                        \
+}                                                       \
+__VA_ARGS__ vlib_thread_registration_t x
+
+always_inline u32
+vlib_num_workers (void)		/* number of vlib_main clones, minus one */
+{
+  return vlib_thread_main.n_vlib_mains - 1;
+}
+
+always_inline u32
+vlib_get_worker_thread_index (u32 worker_index)	/* worker index -> thread index (offset by one) */
+{
+  return worker_index + 1;
+}
+
+always_inline u32
+vlib_get_worker_index (u32 thread_index)	/* thread index -> worker index; wraps to ~0 for thread index 0 */
+{
+  return thread_index - 1;
+}
+
+always_inline u32
+vlib_get_current_worker_index (void)	/* caller's thread index - 1; wraps to ~0 on thread index 0 */
+{
+  return vlib_get_thread_index () - 1;
+}
+
+static inline void
+vlib_worker_thread_barrier_check (void)	/* park here while wait_at_barrier is set */
+{
+  if (PREDICT_FALSE (*vlib_worker_threads->wait_at_barrier))
+    {
+      vlib_main_t *vm;
+      clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, 1);	/* announce arrival */
+      if (CLIB_DEBUG > 0)
+	{
+	  vm = vlib_get_main ();
+	  vm->parked_at_barrier = 1;	/* the flag foreach_vlib_main ASSERTs on */
+	}
+      while (*vlib_worker_threads->wait_at_barrier)
+	;			/* spin until the flag is cleared */
+      if (CLIB_DEBUG > 0)
+	vm->parked_at_barrier = 0;	/* vm was assigned under the same CLIB_DEBUG test above */
+      clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, -1);
+
+      if (PREDICT_FALSE (*vlib_worker_threads->node_reforks_required))
+	{
+	  vlib_worker_thread_node_refork ();
+	  clib_smp_atomic_add (vlib_worker_threads->node_reforks_required,
+			       -1);
+	  while (*vlib_worker_threads->node_reforks_required)
+	    ;			/* spin until the counter reads zero */
+	}
+    }
+}
+
+/* Return the vlib_main_t of worker worker_index, i.e. vlib_mains[index + 1]. */
+always_inline vlib_main_t *
+vlib_get_worker_vlib_main (u32 worker_index)
+{
+  ASSERT (worker_index < vlib_num_workers ());
+  vlib_main_t *worker_vm =
+    vlib_mains[vlib_get_worker_thread_index (worker_index)];
+  ASSERT (worker_vm);
+  return worker_vm;
+}
+
+static inline void
+vlib_put_frame_queue_elt (vlib_frame_queue_elt_t * hf)	/* publish a filled element */
+{
+  CLIB_MEMORY_BARRIER ();	/* issued before the flag write; presumably orders the element's contents ahead of it */
+  hf->valid = 1;
+}
+
+static inline vlib_frame_queue_elt_t *
+vlib_get_frame_queue_elt (u32 frame_queue_index, u32 index)	/* claim the next ring element of queue [index] */
+{
+  vlib_frame_queue_t *fq;
+  vlib_frame_queue_elt_t *elt;
+  vlib_thread_main_t *tm = &vlib_thread_main;
+  vlib_frame_queue_main_t *fqm =
+    vec_elt_at_index (tm->frame_queue_mains, frame_queue_index);
+  u64 new_tail;
+
+  fq = fqm->vlib_frame_queues[index];
+  ASSERT (fq);
+
+  new_tail = __sync_add_and_fetch (&fq->tail, 1);	/* atomically reserve a tail position */
+
+  /* Wait until a ring slot is available */
+  while (new_tail >= fq->head_hint + fq->nelts)
+    vlib_worker_thread_barrier_check ();	/* keep honouring barrier requests while waiting */
+
+  elt = fq->elts + (new_tail & (fq->nelts - 1));	/* mask indexing: assumes nelts is a power of two */
+
+  /* this would be very bad... */
+  while (elt->valid)
+    ;				/* spin while the slot is still marked valid */
+
+  elt->msg_type = VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME;
+  elt->last_n_vectors = elt->n_vectors = 0;
+
+  return elt;			/* valid is not set here; see vlib_put_frame_queue_elt () */
+}
+
+static inline vlib_frame_queue_t *	/* returns the queue if congested, else NULL */
+is_vlib_frame_queue_congested (u32 frame_queue_index,
+			       u32 index,
+			       u32 queue_hi_thresh,
+			       vlib_frame_queue_t **
+			       handoff_queue_by_worker_index)
+{
+  vlib_frame_queue_t *fq;
+  vlib_thread_main_t *tm = &vlib_thread_main;
+  vlib_frame_queue_main_t *fqm =
+    vec_elt_at_index (tm->frame_queue_mains, frame_queue_index);
+
+  fq = handoff_queue_by_worker_index[index];
+  if (fq != (vlib_frame_queue_t *) (~0))	/* ~0 marks "no verdict cached"; anything else is returned as is */
+    return fq;
+
+  fq = fqm->vlib_frame_queues[index];
+  ASSERT (fq);
+
+  if (PREDICT_FALSE (fq->tail >= (fq->head_hint + queue_hi_thresh)))
+    {
+      /* a valid entry in the array will indicate the queue has reached
+       * the specified threshold and is congested
+       */
+      handoff_queue_by_worker_index[index] = fq;
+      fq->enqueue_full_events++;
+      return fq;
+    }
+
+  return NULL;			/* below threshold; nothing is cached in this case */
+}
+
+/*
+ * Return the element cached for vlib_worker_index in the caller's array;
+ * an empty slot is first filled via vlib_get_frame_queue_elt ().
+ */
+static inline vlib_frame_queue_elt_t *
+vlib_get_worker_handoff_queue_elt (u32 frame_queue_index,
+				   u32 vlib_worker_index,
+				   vlib_frame_queue_elt_t **
+				   handoff_queue_elt_by_worker_index)
+{
+  vlib_frame_queue_elt_t **slot =
+    &handoff_queue_elt_by_worker_index[vlib_worker_index];
+
+  if (*slot == 0)
+    *slot = vlib_get_frame_queue_elt (frame_queue_index, vlib_worker_index);
+  return *slot;
+}
+
+u8 *vlib_thread_stack_init (uword thread_index);
+int vlib_thread_cb_register (struct vlib_main_t *vm,
+			     vlib_thread_callbacks_t * cb);
+extern void *rpc_call_main_thread_cb_fn;
+
+void
+vlib_process_signal_event_mt_helper (vlib_process_signal_event_mt_args_t *
+				     args);
+void vlib_rpc_call_main_thread (void *function, u8 * args, u32 size);
+
+#endif /* included_vlib_threads_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/threads_cli.c b/src/vlib/threads_cli.c
new file mode 100644
index 00000000..02bdea5c
--- /dev/null
+++ b/src/vlib/threads_cli.c
@@ -0,0 +1,584 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#define _GNU_SOURCE
+
+#include <vppinfra/format.h>
+#include <vppinfra/linux/sysfs.h>
+#include <vlib/vlib.h>
+
+#include <vlib/threads.h>
+#include <vlib/unix/unix.h>
+
+static u8 *			/* %U helper: appends "<policy> (<priority>)" to s */
+format_sched_policy_and_priority (u8 * s, va_list * args)
+{
+  long i = va_arg (*args, long);	/* id passed to sched_getscheduler () / sched_getparam () */
+  struct sched_param sched_param;
+  u8 *t = (u8 *) "unknown";	/* kept when the policy is unlisted or the lookup fails */
+
+  switch (sched_getscheduler (i))
+    {
+#define _(v,f,str) case SCHED_POLICY_##f: t = (u8 *) str; break;
+      foreach_sched_policy
+#undef _
+    }
+  if (sched_getparam (i, &sched_param) == 0)
+    return format (s, "%s (%d)", t, sched_param.sched_priority);
+  else
+    return format (s, "%s (n/a)", t);	/* priority could not be read */
+}
+
+static clib_error_t *		/* CLI "show threads": header plus one row per vlib_worker_threads entry */
+show_threads_fn (vlib_main_t * vm,
+		 unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  vlib_worker_thread_t *w;
+  int i;
+
+  vlib_cli_output (vm, "%-7s%-20s%-12s%-8s%-25s%-7s%-7s%-7s%-10s",
+		   "ID", "Name", "Type", "LWP", "Sched Policy (Priority)",
+		   "lcore", "Core", "Socket", "State");
+
+#if !defined(__powerpc64__)	/* rows are compiled out on powerpc64; only the header prints there */
+  for (i = 0; i < vec_len (vlib_worker_threads); i++)
+    {
+      w = vlib_worker_threads + i;
+      u8 *line = NULL;
+
+      line = format (line, "%-7d%-20s%-12s%-8d",
+		     i,
+		     w->name ? w->name : (u8 *) "",
+		     w->registration ? w->registration->name : "", w->lwp);
+
+      line = format (line, "%-25U", format_sched_policy_and_priority, w->lwp);
+
+      int lcore = -1;		/* -1: none found, -2: more than one CPU in the affinity set */
+      cpu_set_t cpuset;
+      CPU_ZERO (&cpuset);
+      int ret = -1;
+
+      ret =
+	pthread_getaffinity_np (w->thread_id, sizeof (cpu_set_t), &cpuset);
+      if (!ret)
+	{
+	  int c;
+	  for (c = 0; c < CPU_SETSIZE; c++)
+	    if (CPU_ISSET (c, &cpuset))
+	      {
+		if (lcore > -1)
+		  {
+		    lcore = -2;	/* second CPU seen: report "M" below */
+		    break;
+		  }
+		lcore = c;
+	      }
+	}
+      else
+	{
+	  lcore = w->lcore_id;	/* affinity query failed: fall back to the recorded lcore */
+	}
+
+      if (lcore > -1)
+	{
+	  const char *sys_cpu_path = "/sys/devices/system/cpu/cpu";
+	  int socket_id = -1;
+	  int core_id = -1;
+	  u8 *p = 0;
+
+	  p = format (p, "%s%u/topology/core_id%c", sys_cpu_path, lcore, 0);
+	  clib_sysfs_read ((char *) p, "%d", &core_id);
+
+	  vec_reset_length (p);	/* reuse the path vector for the second file */
+	  p =
+	    format (p,
+		    "%s%u/topology/physical_package_id%c",
+		    sys_cpu_path, lcore, 0);
+	  clib_sysfs_read ((char *) p, "%d", &socket_id);
+	  vec_free (p);
+
+	  line = format (line, "%-7u%-7u%-7u", lcore, core_id, socket_id);
+	}
+      else
+	{
+	  line =
+	    format (line, "%-7s%-7s%-7s", (lcore == -2) ? "M" : "n/a", "n/a",
+		    "n/a");
+	}
+
+      vlib_cli_output (vm, "%v", line);	/* no value is emitted for the "State" column */
+      vec_free (line);
+    }
+#endif
+
+  return 0;
+}
+
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_threads_command, static) = {	/* binds the path below to show_threads_fn */
+  .path = "show threads",
+  .short_help = "Show threads",
+  .function = show_threads_fn,
+};
+/* *INDENT-ON* */
+
+/*
+ * CLI handler: parse "on|off" and "index N", then reset and arm/disarm tracing
+ */
+static clib_error_t *
+trace_frame_queue (vlib_main_t * vm, unformat_input_t * input,
+		   vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  clib_error_t *error = NULL;
+  frame_queue_trace_t *fqt;
+  frame_queue_nelt_counter_t *fqh;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  vlib_frame_queue_main_t *fqm;
+  u32 num_fq;
+  u32 fqix;
+  u32 enable = 2;		/* 2 = neither "on" nor "off" seen yet */
+  u32 index = ~(u32) 0;		/* ~0 = no index given; rejected by the range check below */
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "on"))
+	enable = 1;
+      else if (unformat (line_input, "off"))
+	enable = 0;
+      else if (unformat (line_input, "index %u", &index))
+	;
+      else
+	{
+	  error = clib_error_return (0, "parse error: '%U'",
+				     format_unformat_error, line_input);
+	  goto done;
+	}
+    }
+
+  if (enable > 1)
+    {
+      error = clib_error_return (0, "expecting on or off");
+      goto done;
+    }
+
+  if (vec_len (tm->frame_queue_mains) == 0)	/* also keeps the "- 1" below from wrapping */
+    {
+      error = clib_error_return (0, "no worker handoffs exist");
+      goto done;
+    }
+
+  if (index > vec_len (tm->frame_queue_mains) - 1)
+    {
+      error = clib_error_return (0,
+				 "expecting valid worker handoff queue index");
+      goto done;
+    }
+
+  fqm = vec_elt_at_index (tm->frame_queue_mains, index);
+
+  num_fq = vec_len (fqm->vlib_frame_queues);
+  if (num_fq == 0)
+    {
+      vlib_cli_output (vm, "No frame queues exist\n");	/* reported as output, not as an error */
+      goto done;
+    }
+
+  /* Allocate storage for trace if necessary */
+  vec_validate_aligned (fqm->frame_queue_traces, num_fq - 1,
+			CLIB_CACHE_LINE_BYTES);
+  vec_validate_aligned (fqm->frame_queue_histogram, num_fq - 1,
+			CLIB_CACHE_LINE_BYTES);
+
+  for (fqix = 0; fqix < num_fq; fqix++)
+    {
+      fqt = &fqm->frame_queue_traces[fqix];
+      fqh = &fqm->frame_queue_histogram[fqix];
+
+      memset (fqt->n_vectors, 0xff, sizeof (fqt->n_vectors));	/* every byte set to 0xff */
+      fqt->written = 0;		/* the show command prints "no trace data" while this is 0 */
+      memset (fqh, 0, sizeof (*fqh));
+      fqm->vlib_frame_queues[fqix]->trace = enable;	/* old data is cleared for both "on" and "off" */
+    }
+
+done:
+  unformat_free (line_input);
+
+  return error;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_trace_frame_queue,static) = {	/* the handler rejects a missing index, so the help shows it */
+    .path = "trace frame-queue",
+    .short_help = "trace frame-queue (on|off) index <n>",
+    .function = trace_frame_queue,
+    .is_mp_safe = 1,
+};
+/* *INDENT-ON* */
+
+
+/*
+ * Add two adjacent counters and express the sum as a percent of total.
+ * The division rounds up, so any non-zero sum reports at least 1%
+ * (e.g. 0.000001 => 1%).  A zero total reports 0.
+ */
+static u32
+compute_percent (u64 * two_counters, u64 total)
+{
+  u64 pair_sum;
+
+  /* Nothing counted yet: avoid dividing by zero. */
+  if (total == 0)
+    return 0;
+
+  pair_sum = two_counters[0] + two_counters[1];
+  return (pair_sum * 100 + (total - 1)) / total;
+}
+
+/*
+ * Print the gathered trace data (histogram == 0) or the histogram of one fqm.
+ */
+static clib_error_t *
+show_frame_queue_internal (vlib_main_t * vm,
+			   vlib_frame_queue_main_t * fqm, u32 histogram)
+{
+  clib_error_t *error = NULL;	/* never assigned below, so this always returns NULL */
+  frame_queue_trace_t *fqt;
+  frame_queue_nelt_counter_t *fqh;
+  u32 num_fq;
+  u32 fqix;
+
+  num_fq = vec_len (fqm->frame_queue_traces);
+  if (num_fq == 0)
+    {
+      vlib_cli_output (vm, "No trace data for frame queues\n");
+      return error;
+    }
+
+  if (histogram)
+    {
+      vlib_cli_output (vm, "0-1   2-3   4-5   6-7   8-9   10-11 12-13 14-15 "
+		       "16-17 18-19 20-21 22-23 24-25 26-27 28-29 30-31\n");
+    }
+
+  for (fqix = 0; fqix < num_fq; fqix++)
+    {
+      fqt = &(fqm->frame_queue_traces[fqix]);
+
+      vlib_cli_output (vm, "Thread %d %v\n", fqix,
+		       vlib_worker_threads[fqix].name);	/* trace slot index doubles as thread index */
+
+      if (fqt->written == 0)
+	{
+	  vlib_cli_output (vm, "  no trace data\n");
+	  continue;
+	}
+
+      if (histogram)
+	{
+	  fqh = &(fqm->frame_queue_histogram[fqix]);
+	  u32 nelt;
+	  u64 total = 0;
+
+	  for (nelt = 0; nelt < FRAME_QUEUE_MAX_NELTS; nelt++)
+	    {
+	      total += fqh->count[nelt];
+	    }
+
+	  /*
+	   * Print in pairs to condense the output.
+	   * Allow entries with 0 counts to be clearly identified, by rounding up.
+	   * Any non-zero value will be displayed as at least one percent. This
+	   * also means the sum of percentages can be > 100, but that is fine. The
+	   * histogram is counted from the last time "trace frame on" was issued.
+	   */
+	  vlib_cli_output (vm,
+			   "%3d%%  %3d%%  %3d%%  %3d%%  %3d%%  %3d%%  %3d%%  %3d%%  "
+			   "%3d%%  %3d%%  %3d%%  %3d%%  %3d%%  %3d%%  %3d%%  %3d%%\n",
+			   compute_percent (&fqh->count[0], total),
+			   compute_percent (&fqh->count[2], total),
+			   compute_percent (&fqh->count[4], total),
+			   compute_percent (&fqh->count[6], total),
+			   compute_percent (&fqh->count[8], total),
+			   compute_percent (&fqh->count[10], total),
+			   compute_percent (&fqh->count[12], total),
+			   compute_percent (&fqh->count[14], total),
+			   compute_percent (&fqh->count[16], total),
+			   compute_percent (&fqh->count[18], total),
+			   compute_percent (&fqh->count[20], total),
+			   compute_percent (&fqh->count[22], total),
+			   compute_percent (&fqh->count[24], total),
+			   compute_percent (&fqh->count[26], total),
+			   compute_percent (&fqh->count[28], total),
+			   compute_percent (&fqh->count[30], total));	/* reads count[0..31] in pairs */
+	}
+      else
+	{
+	  vlib_cli_output (vm,
+			   "  vector-threshold %d  ring size %d  in use %d\n",
+			   fqt->threshold, fqt->nelts, fqt->n_in_use);
+	  vlib_cli_output (vm, "  head %12d  head_hint %12d  tail %12d\n",
+			   fqt->head, fqt->head_hint, fqt->tail);
+	  vlib_cli_output (vm,
+			   "  %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d\n",
+			   fqt->n_vectors[0], fqt->n_vectors[1],
+			   fqt->n_vectors[2], fqt->n_vectors[3],
+			   fqt->n_vectors[4], fqt->n_vectors[5],
+			   fqt->n_vectors[6], fqt->n_vectors[7],
+			   fqt->n_vectors[8], fqt->n_vectors[9],
+			   fqt->n_vectors[10], fqt->n_vectors[11],
+			   fqt->n_vectors[12], fqt->n_vectors[13],
+			   fqt->n_vectors[14], fqt->n_vectors[15]);	/* first 16 slots, printed whatever nelts is */
+
+	  if (fqt->nelts > 16)	/* second row only for rings larger than 16 */
+	    {
+	      vlib_cli_output (vm,
+			       "  %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d\n",
+			       fqt->n_vectors[16], fqt->n_vectors[17],
+			       fqt->n_vectors[18], fqt->n_vectors[19],
+			       fqt->n_vectors[20], fqt->n_vectors[21],
+			       fqt->n_vectors[22], fqt->n_vectors[23],
+			       fqt->n_vectors[24], fqt->n_vectors[25],
+			       fqt->n_vectors[26], fqt->n_vectors[27],
+			       fqt->n_vectors[28], fqt->n_vectors[29],
+			       fqt->n_vectors[30], fqt->n_vectors[31]);
+	    }
+	}
+
+    }
+  return error;
+}
+
+/* CLI handler: print the trace (non-histogram) view of every handoff queue. */
+static clib_error_t *
+show_frame_queue_trace (vlib_main_t * vm, unformat_input_t * input,
+			vlib_cli_command_t * cmd)
+{
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  vlib_frame_queue_main_t *fqm;
+
+  vec_foreach (fqm, tm->frame_queue_mains)
+  {
+    clib_error_t *err;
+    vlib_cli_output (vm, "Worker handoff queue index %u (next node '%U'):",
+		     fqm - tm->frame_queue_mains,
+		     format_vlib_node_name, vm, fqm->node_index);
+    if ((err = show_frame_queue_internal (vm, fqm, 0)) != 0)
+      return err;
+  }
+  return 0;
+}
+
+/* CLI handler: print the histogram view of every handoff queue. */
+static clib_error_t *
+show_frame_queue_histogram (vlib_main_t * vm, unformat_input_t * input,
+			    vlib_cli_command_t * cmd)
+{
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  vlib_frame_queue_main_t *fqm;
+
+  vec_foreach (fqm, tm->frame_queue_mains)
+  {
+    clib_error_t *err;
+    vlib_cli_output (vm, "Worker handoff queue index %u (next node '%U'):",
+		     fqm - tm->frame_queue_mains,
+		     format_vlib_node_name, vm, fqm->node_index);
+    if ((err = show_frame_queue_internal (vm, fqm, 1)) != 0)
+      return err;
+  }
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_show_frame_queue_trace,static) = {	/* NOTE(review): .path lacks the "trace" word shown in .short_help */
+    .path = "show frame-queue",
+    .short_help = "show frame-queue trace",
+    .function = show_frame_queue_trace,
+};
+/* *INDENT-ON* */
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_show_frame_queue_histogram,static) = {	/* binds the path below to show_frame_queue_histogram */
+    .path = "show frame-queue histogram",
+    .short_help = "show frame-queue histogram",
+    .function = show_frame_queue_histogram,
+};
+/* *INDENT-ON* */
+
+
+/*
+ * CLI handler: set nelts (4, 8, 16 or 32) on every queue of one handoff index
+ */
+static clib_error_t *
+test_frame_queue_nelts (vlib_main_t * vm, unformat_input_t * input,
+			vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  vlib_frame_queue_main_t *fqm;
+  clib_error_t *error = NULL;
+  u32 num_fq;
+  u32 fqix;
+  u32 nelts = 0;		/* 0 = not given; rejected by the value check below */
+  u32 index = ~(u32) 0;		/* ~0 = not given; rejected by the range check below */
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "nelts %u", &nelts))
+	;
+      else if (unformat (line_input, "index %u", &index))
+	;
+      else
+	{
+	  error = clib_error_return (0, "parse error: '%U'",
+				     format_unformat_error, line_input);
+	  goto done;
+	}
+    }
+
+  if (index >= vec_len (tm->frame_queue_mains))	/* ">=" cannot wrap on an empty vector, unlike "> len - 1" */
+    {
+      error = clib_error_return (0,
+				 "expecting valid worker handoff queue index");
+      goto done;
+    }
+
+  fqm = vec_elt_at_index (tm->frame_queue_mains, index);
+
+  if ((nelts != 4) && (nelts != 8) && (nelts != 16) && (nelts != 32))
+    {
+      error = clib_error_return (0, "expecting 4,8,16,32");
+      goto done;
+    }
+
+  num_fq = vec_len (fqm->vlib_frame_queues);
+  if (num_fq == 0)
+    {
+      vlib_cli_output (vm, "No frame queues exist\n");	/* reported as output, not as an error */
+      goto done;
+    }
+
+  for (fqix = 0; fqix < num_fq; fqix++)
+    {
+      fqm->vlib_frame_queues[fqix]->nelts = nelts;
+    }
+
+done:
+  unformat_free (line_input);
+
+  return error;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_test_frame_queue_nelts,static) = {	/* the handler rejects a missing index, so the help shows it */
+    .path = "test frame-queue nelts",
+    .short_help = "test frame-queue nelts (4,8,16,32) index <n>",
+    .function = test_frame_queue_nelts,
+};
+/* *INDENT-ON* */
+
+
+/*
+ * CLI handler: set vector_threshold on every queue of one handoff index
+ */
+static clib_error_t *
+test_frame_queue_threshold (vlib_main_t * vm, unformat_input_t * input,
+			    vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  vlib_frame_queue_main_t *fqm;
+  clib_error_t *error = NULL;
+  u32 num_fq;
+  u32 fqix;
+  u32 threshold = ~(u32) 0;	/* ~0 = not given */
+  u32 index = ~(u32) 0;		/* ~0 = not given; rejected by the range check below */
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "threshold %u", &threshold))
+	;
+      else if (unformat (line_input, "index %u", &index))
+	;
+      else
+	{
+	  error = clib_error_return (0, "parse error: '%U'",
+				     format_unformat_error, line_input);
+	  goto done;
+	}
+    }
+
+  if (index >= vec_len (tm->frame_queue_mains))	/* ">=" cannot wrap on an empty vector, unlike "> len - 1" */
+    {
+      error = clib_error_return (0,
+				 "expecting valid worker handoff queue index");
+      goto done;
+    }
+
+  fqm = vec_elt_at_index (tm->frame_queue_mains, index);
+
+
+  if (threshold == ~(u32) 0)
+    {
+      vlib_cli_output (vm, "expecting threshold value\n");	/* reported as output, not as an error */
+      goto done;
+    }
+
+  if (threshold == 0)
+    threshold = ~0;		/* 0 on the command line is stored as ~0 ("no limit" per the help text) */
+
+  num_fq = vec_len (fqm->vlib_frame_queues);
+  if (num_fq == 0)
+    {
+      vlib_cli_output (vm, "No frame queues exist\n");
+      goto done;
+    }
+
+  for (fqix = 0; fqix < num_fq; fqix++)
+    {
+      fqm->vlib_frame_queues[fqix]->vector_threshold = threshold;
+    }
+
+done:
+  unformat_free (line_input);
+
+  return error;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_test_frame_queue_threshold,static) = {	/* the handler rejects a missing index, so the help shows it */
+    .path = "test frame-queue threshold",
+    .short_help = "test frame-queue threshold N (0=no limit) index <n>",
+    .function = test_frame_queue_threshold,
+};
+/* *INDENT-ON* */
+
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/trace.c b/src/vlib/trace.c
new file mode 100644
index 00000000..6d487ae1
--- /dev/null
+++ b/src/vlib/trace.c
@@ -0,0 +1,552 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * trace.c: VLIB trace buffer.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/threads.h>
+
+/* Helper function for nodes which only trace buffer data: copy data bytes of each traced buffer. */
+void
+vlib_trace_frame_buffers_only (vlib_main_t * vm,
+			       vlib_node_runtime_t * node,
+			       u32 * buffers,
+			       uword n_buffers,
+			       uword next_buffer_stride,	/* not referenced in this body */
+			       uword n_buffer_data_bytes_in_trace)
+{
+  u32 n_left, *from;
+
+  n_left = n_buffers;
+  from = buffers;
+
+  while (n_left >= 4)		/* two buffers per pass; four must remain because from[2..3] are prefetched */
+    {
+      u32 bi0, bi1;
+      vlib_buffer_t *b0, *b1;
+      u8 *t0, *t1;
+
+      /* Prefetch next iteration. */
+      vlib_prefetch_buffer_with_index (vm, from[2], LOAD);
+      vlib_prefetch_buffer_with_index (vm, from[3], LOAD);
+
+      bi0 = from[0];
+      bi1 = from[1];
+
+      b0 = vlib_get_buffer (vm, bi0);
+      b1 = vlib_get_buffer (vm, bi1);
+
+      if (b0->flags & VLIB_BUFFER_IS_TRACED)
+	{
+	  t0 = vlib_add_trace (vm, node, b0, n_buffer_data_bytes_in_trace);
+	  clib_memcpy (t0, b0->data + b0->current_data,
+		       n_buffer_data_bytes_in_trace);	/* copy starts at the buffer's current_data offset */
+	}
+      if (b1->flags & VLIB_BUFFER_IS_TRACED)
+	{
+	  t1 = vlib_add_trace (vm, node, b1, n_buffer_data_bytes_in_trace);
+	  clib_memcpy (t1, b1->data + b1->current_data,
+		       n_buffer_data_bytes_in_trace);
+	}
+      from += 2;
+      n_left -= 2;
+    }
+
+  while (n_left >= 1)		/* remaining buffers, one at a time, no prefetch */
+    {
+      u32 bi0;
+      vlib_buffer_t *b0;
+      u8 *t0;
+
+      bi0 = from[0];
+
+      b0 = vlib_get_buffer (vm, bi0);
+
+      if (b0->flags & VLIB_BUFFER_IS_TRACED)
+	{
+	  t0 = vlib_add_trace (vm, node, b0, n_buffer_data_bytes_in_trace);
+	  clib_memcpy (t0, b0->data + b0->current_data,
+		       n_buffer_data_bytes_in_trace);
+	}
+      from += 1;
+      n_left -= 1;
+    }
+}
+
+/* Free up all trace buffer memory, for every vlib_main visited by foreach_vlib_main. */
+always_inline void
+clear_trace_buffer (void)
+{
+  int i;
+  vlib_trace_main_t *tm;
+
+  /* *INDENT-OFF* */
+  foreach_vlib_main (
+  ({
+    void *mainheap;
+
+    tm = &this_vlib_main->trace_main;
+    mainheap = clib_mem_set_heap (this_vlib_main->heap_base);	/* free on that main's own heap */
+
+    tm->trace_active_hint = 0;
+
+    for (i = 0; i < vec_len (tm->trace_buffer_pool); i++)
+      if (! pool_is_free_index (tm->trace_buffer_pool, i))
+        vec_free (tm->trace_buffer_pool[i]);	/* only slots currently in use are freed here */
+    pool_free (tm->trace_buffer_pool);
+    clib_mem_set_heap (mainheap);	/* restore the heap that was active on entry */
+  }));
+  /* *INDENT-ON* */
+}
+
+static u8 *			/* format helper: render one trace (a vector of headers) into s */
+format_vlib_trace (u8 * s, va_list * va)
+{
+  vlib_main_t *vm = va_arg (*va, vlib_main_t *);
+  vlib_trace_header_t *h = va_arg (*va, vlib_trace_header_t *);
+  vlib_trace_header_t *e = vec_end (h);
+  vlib_node_t *node, *prev_node;
+  clib_time_t *ct = &vm->clib_time;
+  f64 t;
+
+  prev_node = 0;
+  while (h < e)
+    {
+      node = vlib_get_node (vm, h->node_index);
+
+      if (node != prev_node)	/* print the time/node heading only when the node changes */
+	{
+	  t =
+	    (h->time - vm->cpu_time_main_loop_start) * ct->seconds_per_clock;
+	  s =
+	    format (s, "\n%U: %v", format_time_interval, "h:m:s:u", t,
+		    node->name);
+	}
+      prev_node = node;
+
+      if (node->format_trace)
+	s = format (s, "\n  %U", node->format_trace, vm, node, h->data);
+      else
+	s = format (s, "\n  %U", node->format_buffer, h->data);	/* used when the node has no format_trace */
+
+      h = vlib_trace_header_next (h);
+    }
+
+  return s;
+}
+
+/* Root of all trace cli commands; no .function is set on this entry. */
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (trace_cli_command,static) = {
+  .path = "trace",
+  .short_help = "Packet tracer commands",
+};
+/* *INDENT-ON* */
+
+static int
+trace_cmp (void *a1, void *a2)	/* sort comparator: ascending first-header time */
+{
+  vlib_trace_header_t *lhs = *(vlib_trace_header_t **) a1;
+  vlib_trace_header_t *rhs = *(vlib_trace_header_t **) a2;
+  i64 delta = lhs->time - rhs->time;	/* difference taken as signed 64-bit */
+  return (delta > 0) - (delta < 0);
+}
+
+/*
+ * Return 1 if this packet passes the trace filter, or 0 otherwise.
+ *
+ * A filter_flag of 0 accepts everything.  Otherwise the headers from h up
+ * to vec_end (h) are searched for filter_node_index: under
+ * FILTER_FLAG_INCLUDE a hit accepts and a miss rejects; under any other
+ * non-zero flag (FILTER_FLAG_EXCLUDE) a hit rejects and a miss accepts.
+ *
+ * tm: trace main supplying filter_flag and filter_node_index
+ * h:  first header of the trace vector to test
+ * Returns 1 to keep the trace, 0 to drop it.
+ */
+u32
+filter_accept (vlib_trace_main_t * tm, vlib_trace_header_t * h)
+{
+  vlib_trace_header_t *end = vec_end (h);
+  u32 verdict_on_hit;
+
+  /* No filter configured: every trace passes. */
+  if (tm->filter_flag == 0)
+    return 1;
+
+  /* Include-filters accept on a hit, everything else rejects on a hit. */
+  verdict_on_hit = (tm->filter_flag == FILTER_FLAG_INCLUDE) ? 1 : 0;
+
+  for (; h < end; h = vlib_trace_header_next (h))
+    {
+      if (h->node_index == tm->filter_node_index)
+	return verdict_on_hit;
+    }
+
+  /* Node never seen: the answer is the opposite of the hit verdict. */
+  return !verdict_on_hit;
+}
+
+/*
+ * Remove traces that fail the filter, or exceed filter_count, from vm's pool
+ */
+void
+trace_apply_filter (vlib_main_t * vm)
+{
+  vlib_trace_main_t *tm = &vm->trace_main;
+  vlib_trace_header_t **h;
+  vlib_trace_header_t ***traces_to_remove = 0;
+  u32 index;
+  u32 trace_index;
+  u32 n_accepted;
+
+  u32 accept;
+
+  if (tm->filter_flag == FILTER_FLAG_NONE)	/* no filter: the pool is left untouched */
+    return;
+
+  /*
+   * Ideally we would retain the first N traces that pass the filter instead
+   * of any N traces.
+   */
+  n_accepted = 0;
+  /* *INDENT-OFF* */
+  pool_foreach (h, tm->trace_buffer_pool,
+   ({
+      accept = filter_accept(tm, h[0]);
+
+      if ((n_accepted == tm->filter_count) || !accept)
+          vec_add1 (traces_to_remove, h);	/* removal is deferred until the walk is over */
+      else
+          n_accepted++;
+  }));
+  /* *INDENT-ON* */
+
+  /* remove all traces that we don't want to keep */
+  for (index = 0; index < vec_len (traces_to_remove); index++)
+    {
+      trace_index = traces_to_remove[index] - tm->trace_buffer_pool;	/* pointer back to pool index */
+      _vec_len (tm->trace_buffer_pool[trace_index]) = 0;	/* length zeroed; the vector itself is not freed */
+      pool_put_index (tm->trace_buffer_pool, trace_index);
+    }
+
+  vec_free (traces_to_remove);
+}
+
+static clib_error_t *		/* CLI handler: print up to max traces for each vlib_main */
+cli_show_trace_buffer (vlib_main_t * vm,
+		       unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  vlib_trace_main_t *tm;
+  vlib_trace_header_t **h, **traces;
+  u32 i, index = 0;
+  char *fmt;
+  u8 *s = 0;
+  u32 max;
+
+  /*
+   * By default display only this many traces. To display more, explicitly
+   * specify a max. This prevents unexpectedly huge outputs.
+   */
+  max = 50;
+  while (unformat_check_input (input) != (uword) UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "max %d", &max))
+	;
+      else
+	return clib_error_create ("expected 'max COUNT', got `%U'",
+				  format_unformat_error, input);
+    }
+
+
+  /* Get active traces from pool. */
+
+  /* *INDENT-OFF* */
+  foreach_vlib_main (
+  ({
+    void *mainheap;
+
+    fmt = "------------------- Start of thread %d %s -------------------\n";
+    s = format (s, fmt, index, vlib_worker_threads[index].name);
+
+    tm = &this_vlib_main->trace_main;
+
+    mainheap = clib_mem_set_heap (this_vlib_main->heap_base);	/* work on that main's own heap */
+
+    trace_apply_filter(this_vlib_main);
+
+    traces = 0;
+    pool_foreach (h, tm->trace_buffer_pool,
+    ({
+      vec_add1 (traces, h[0]);
+    }));
+
+    if (vec_len (traces) == 0)
+      {
+        clib_mem_set_heap (mainheap);	/* back to the original heap before growing s */
+        s = format (s, "No packets in trace buffer\n");
+        goto done;
+      }
+
+    /* Sort them by increasing time. */
+    vec_sort_with_function (traces, trace_cmp);
+
+    for (i = 0; i < vec_len (traces); i++)
+      {
+        if (i == max)
+          {
+            vlib_cli_output (vm, "Limiting display to %d packets."
+                                 " To display more specify max.", max);	/* NOTE(review): this_vlib_main's heap is still selected here */
+            goto done;
+          }
+
+        clib_mem_set_heap (mainheap);	/* s is always grown on the original heap */
+
+        s = format (s, "Packet %d\n%U\n\n", i + 1,
+                         format_vlib_trace, vm, traces[i]);
+
+        mainheap = clib_mem_set_heap (this_vlib_main->heap_base);
+      }
+
+  done:
+    vec_free (traces);
+    clib_mem_set_heap (mainheap);
+
+    index++;			/* counts the mains visited so far; used for the thread heading */
+  }));
+  /* *INDENT-ON* */
+
+  vlib_cli_output (vm, "%v", s);	/* everything accumulated in s is emitted in one call */
+  vec_free (s);
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (show_trace_cli,static) = {
+  .path = "show trace",
+  .short_help = "Show trace buffer [max COUNT]",
+  .function = cli_show_trace_buffer,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+cli_add_trace_buffer (vlib_main_t * vm,
+		      unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  unformat_input_t _line_input, *line_input = &_line_input;
+  u32 node_index = ~0, add = 0;	/* ~0: no NODE COUNT pair parsed yet */
+  u8 verbose = 0;
+  clib_error_t *error = 0;
+
+  /* "trace add NODE COUNT [verbose]": raise NODE's trace limit by COUNT. */
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != (uword) UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "%U %d",
+		    unformat_vlib_node, vm, &node_index, &add))
+	;
+      else if (unformat (line_input, "verbose"))
+	verbose = 1;
+      else
+	{
+	  error = clib_error_create ("expected NODE COUNT, got `%U'",
+				     format_unformat_error, line_input);
+	  goto done;
+	}
+    }
+
+  if (node_index == (u32) ~ 0)	/* e.g. "verbose" alone: no node given */
+    {
+      error = clib_error_create ("expected NODE COUNT");
+      goto done;
+    }
+  /* *INDENT-OFF* */
+  foreach_vlib_main ((
+    {
+      vlib_trace_main_t *tm = &this_vlib_main->trace_main;
+      void *oldheap = clib_mem_set_heap (this_vlib_main->heap_base);
+      tm->trace_active_hint = 1;
+      tm->verbose = verbose;
+      vec_validate (tm->nodes, node_index);
+      tm->nodes[node_index].limit += add; clib_mem_set_heap (oldheap);
+    }));
+  /* *INDENT-ON* */
+
+done:
+  unformat_free (line_input);
+  return error;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (add_trace_cli,static) = {
+  .path = "trace add",
+  .short_help = "Trace given number of packets",
+  .function = cli_add_trace_buffer,
+};
+/* *INDENT-ON* */
+
+
+/*
+ * Configure a filter for packet traces.
+ *
+ * This supplements the packet trace feature so that only packets matching
+ * the filter are included in the trace. Currently the only filter is to
+ * keep packets that include a certain node in the trace or exclude a certain
+ * node in the trace.
+ *
+ * The count of traced packets in the "trace add" command is still used to
+ * create a certain number of traces. The "trace filter" command specifies
+ * how many of those packets should be retained in the trace.
+ *
+ * For example, 1Mpps of traffic is arriving and one of those packets is being
+ * dropped. To capture the trace for only that dropped packet, you can do:
+ *     trace filter include error-drop 1
+ *     trace add dpdk-input 1000000
+ *     <wait one second>
+ *     show trace
+ *
+ * Note that the filter could be implemented by capturing all traces and just
+ * reducing traces displayed by the "show trace" function. But that would
+ * require a lot of memory for storing the traces, making that infeasible.
+ *
+ * To remove traces from the trace pool that do not include a certain node
+ * requires that the trace be "complete" before applying the filter. To
+ * accomplish this, the trace pool is filtered upon each iteration of the
+ * main vlib loop. Doing so keeps the number of allocated traces down to a
+ * reasonably low number. This requires that tracing for a buffer is not
+ * performed after the vlib main loop iteration completes. i.e. you can't
+ * save away a buffer temporarily then inject it back into the graph and
+ * expect that the trace_index is still valid (such as a traffic manager might
+ * do). A new trace buffer should be allocated for those types of packets.
+ *
+ * The filter can be extended to support multiple nodes and other match
+ * criteria (e.g. input sw_if_index, mac address) but for now just checks if
+ * a specified node is in the trace or not in the trace.
+ */
+static clib_error_t *
+cli_filter_trace (vlib_main_t * vm,
+		  unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  vlib_trace_main_t *tm;
+  void *saved_heap;
+  u32 node, count, flag;
+
+  /* Exactly one of these forms is accepted: include NODE COUNT,
+   * exclude NODE COUNT, or none. */
+  if (unformat (input, "include %U %d",
+		unformat_vlib_node, vm, &node, &count))
+    flag = FILTER_FLAG_INCLUDE;
+  else if (unformat (input, "exclude %U %d",
+		     unformat_vlib_node, vm, &node, &count))
+    flag = FILTER_FLAG_EXCLUDE;
+  else if (unformat (input, "none"))
+    {
+      /* No filter: node and count are stored as zero. */
+      flag = FILTER_FLAG_NONE;
+      node = 0;
+      count = 0;
+    }
+  else
+    {
+      return
+	clib_error_create
+	("expected 'include NODE COUNT' or 'exclude NODE COUNT' or 'none', got `%U'",
+	 format_unformat_error, input);
+    }
+
+  /* Install the same filter settings on every vlib main. */
+  /* *INDENT-OFF* */
+  foreach_vlib_main (
+    ({
+    tm = &this_vlib_main->trace_main;
+    tm->filter_node_index = node;
+    tm->filter_flag = flag;
+    tm->filter_count = count;
+
+    /*
+     * Freeing the per-node limit vector (on this main's own heap) stops
+     * any tracing already in progress, so a changed or removed filter
+     * cannot lead to runaway trace allocations.
+     */
+    saved_heap = clib_mem_set_heap (this_vlib_main->heap_base);
+    vec_free (tm->nodes);
+    clib_mem_set_heap (saved_heap);
+  }));
+  /* *INDENT-ON* */
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (filter_trace_cli,static) = {
+  .path = "trace filter",
+  .short_help = "filter trace output - include NODE COUNT | exclude NODE COUNT | none",
+  .function = cli_filter_trace,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+cli_clear_trace_buffer (vlib_main_t * vm,
+			unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  clear_trace_buffer ();
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (clear_trace_cli,static) = {
+  .path = "clear trace",
+  .short_help = "Clear trace buffer and free memory",
+  .function = cli_clear_trace_buffer,
+};
+/* *INDENT-ON* */
+
+/* Dummy function to get us linked in. */
+void
+vlib_trace_cli_reference (void)
+{
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/trace.h b/src/vlib/trace.h
new file mode 100644
index 00000000..fc0fc5c8
--- /dev/null
+++ b/src/vlib/trace.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * trace.h: VLIB trace buffer.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_trace_h
+#define included_vlib_trace_h
+
+#include <vppinfra/pool.h>
+
+typedef struct
+{
+  /* CPU time stamp taken from the vlib main when the trace was added. */
+  u64 time;
+
+  /* Node which generated this trace. */
+  u32 node_index;
+
+  /* Size of data[], counted in units of sizeof (vlib_trace_header_t). */
+  u32 n_data;
+
+  /* Trace data follows; the next header starts right after it. */
+  u8 data[0];
+} vlib_trace_header_t;
+
+typedef struct
+{
+  /* Traces already counted against limit; limit - count remain. */
+  u32 count;
+
+  /* Max. number of traces to be added to buffer ("trace add" raises it). */
+  u32 limit;
+} vlib_trace_node_t;
+
+typedef struct
+{
+  /* Pool of trace buffers; each element is a vector of trace headers. */
+  vlib_trace_header_t **trace_buffer_pool;
+
+  u32 last_main_loop_count;	/* main_loop_count when the filter last ran */
+  u32 filter_node_index;	/* node the filter looks for in a trace */
+  u32 filter_flag;		/* one of the FILTER_FLAG_* values below */
+#define FILTER_FLAG_NONE    0
+#define FILTER_FLAG_INCLUDE 1
+#define FILTER_FLAG_EXCLUDE 2
+  u32 filter_count;		/* max. traces retained by the filter */
+
+  /* set on trace add, cleared on clear trace */
+  u32 trace_active_hint;
+
+  /* Per node trace counts, indexed by node index. */
+  vlib_trace_node_t *nodes;
+
+  /* verbosity, set by "trace add ... verbose" */
+  int verbose;
+} vlib_trace_main_t;
+
+#endif /* included_vlib_trace_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/trace_funcs.h b/src/vlib/trace_funcs.h
new file mode 100644
index 00000000..5280eae9
--- /dev/null
+++ b/src/vlib/trace_funcs.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * trace_funcs.h: VLIB trace buffer.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_trace_funcs_h
+#define included_vlib_trace_funcs_h
+
+always_inline void
+vlib_validate_trace (vlib_trace_main_t * tm, vlib_buffer_t * b)
+{
+  /*
+   * The flag check looks right but fired constantly, so it is disabled with
+   * "1 ||"; only the check that b->trace_index is a live pool slot remains.
+   */
+  ASSERT (1 || b->flags & VLIB_BUFFER_IS_TRACED);
+  ASSERT (!pool_is_free_index (tm->trace_buffer_pool, b->trace_index));
+}
+
+always_inline void *
+vlib_add_trace (vlib_main_t * vm,
+		vlib_node_runtime_t * r, vlib_buffer_t * b, u32 n_data_bytes)
+{
+  vlib_trace_main_t *tm = &vm->trace_main;
+  vlib_trace_header_t *h;
+  u32 n_data_words;
+  /* Append a record for node r to b's trace; returns its data area. */
+  vlib_validate_trace (tm, b);
+  /* A "word" here is sizeof (h[0]) bytes; the payload is rounded up to it. */
+  n_data_bytes = round_pow2 (n_data_bytes, sizeof (h[0]));
+  n_data_words = n_data_bytes / sizeof (h[0]);
+  vec_add2_aligned (tm->trace_buffer_pool[b->trace_index], h,
+		    1 + n_data_words, sizeof (h[0]));
+  /* h is used below as the header of the record just added. */
+  h->time = vm->cpu_time_last_node_dispatch;
+  h->n_data = n_data_words;
+  h->node_index = r->node_index;
+
+  return h->data;
+}
+
+always_inline vlib_trace_header_t *
+vlib_trace_header_next (vlib_trace_header_t * h)
+{
+  return h + 1 + h->n_data;	/* step over the header and its payload */
+}
+
+always_inline void
+vlib_free_trace (vlib_main_t * vm, vlib_buffer_t * b)
+{
+  vlib_trace_main_t *tm = &vm->trace_main;
+  vlib_validate_trace (tm, b);
+  _vec_len (tm->trace_buffer_pool[b->trace_index]) = 0;	/* length reset only */
+  pool_put_index (tm->trace_buffer_pool, b->trace_index);	/* slot is free again */
+}
+
+always_inline void
+vlib_trace_next_frame (vlib_main_t * vm,
+		       vlib_node_runtime_t * r, u32 next_index)
+{
+  vlib_next_frame_t *nf;
+  nf = vlib_node_runtime_get_next_frame (vm, r, next_index);
+  nf->flags |= VLIB_FRAME_TRACE;	/* set the trace flag on that next frame */
+}
+
+void trace_apply_filter (vlib_main_t * vm);
+
+/* Mark b (and, if follow_chain, the rest of its chain) as traced. */
+always_inline void
+vlib_trace_buffer (vlib_main_t * vm,
+		   vlib_node_runtime_t * r,
+		   u32 next_index, vlib_buffer_t * b, int follow_chain)
+{
+  vlib_trace_main_t *tm = &vm->trace_main;
+  vlib_trace_header_t **h;
+
+  /*
+   * Apply filter to existing traces to keep number of allocated traces low.
+   * Done only on the first call after main_loop_count has changed.
+   */
+  if (tm->last_main_loop_count != vm->main_loop_count)
+    {
+      tm->last_main_loop_count = vm->main_loop_count;
+      trace_apply_filter (vm);
+    }
+  /* Set VLIB_FRAME_TRACE on the frame headed for next_index. */
+  vlib_trace_next_frame (vm, r, next_index);
+  /* One trace slot is allocated; every buffer marked below shares it. */
+  pool_get (tm->trace_buffer_pool, h);
+
+  do
+    {
+      b->flags |= VLIB_BUFFER_IS_TRACED;
+      b->trace_index = h - tm->trace_buffer_pool;
+    }
+  while (follow_chain && (b = vlib_get_next_buffer (vm, b)));
+}
+
+always_inline void
+vlib_buffer_copy_trace_flag (vlib_main_t * vm, vlib_buffer_t * b,
+			     u32 bi_target)
+{
+  vlib_buffer_t *b_target = vlib_get_buffer (vm, bi_target);
+  b_target->flags |= b->flags & VLIB_BUFFER_IS_TRACED;	/* sets, never clears */
+  b_target->trace_index = b->trace_index;	/* copied even if b is untraced */
+}
+
+always_inline u32
+vlib_get_trace_count (vlib_main_t * vm, vlib_node_runtime_t * rt)
+{
+  vlib_trace_main_t *tm = &vm->trace_main;
+  int remaining = 0;
+
+  /* A node beyond the end of tm->nodes has no limit: nothing to trace. */
+  if (rt->node_index < vec_len (tm->nodes))
+    {
+      vlib_trace_node_t *tn = &tm->nodes[rt->node_index];
+      remaining = tn->limit - tn->count;
+      ASSERT (remaining >= 0);
+    }
+  return remaining;
+}
+
+always_inline void
+vlib_set_trace_count (vlib_main_t * vm, vlib_node_runtime_t * rt, u32 count)
+{
+  vlib_trace_main_t *tm = &vm->trace_main;
+  vlib_trace_node_t *tn = vec_elt_at_index (tm->nodes, rt->node_index);
+  /* count is the number of traces still allowed; store limit - count. */
+  ASSERT (count <= tn->limit);
+  tn->count = tn->limit - count;
+}
+
+/* Helper function for nodes which only trace buffer data. */
+void
+vlib_trace_frame_buffers_only (vlib_main_t * vm,
+			       vlib_node_runtime_t * node,
+			       u32 * buffers,
+			       uword n_buffers,
+			       uword next_buffer_stride,
+			       uword n_buffer_data_bytes_in_trace);
+
+#endif /* included_vlib_trace_funcs_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/unix/cj.c b/src/vlib/unix/cj.c
new file mode 100644
index 00000000..7c1e9475
--- /dev/null
+++ b/src/vlib/unix/cj.c
@@ -0,0 +1,272 @@
+/*
+ *------------------------------------------------------------------
+ * cj.c
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+/**
+ * @file
+ * Circular journal diagnostic mechanism.
+ *
+ * The @c cj thread-safe circular log buffer scheme is occasionally useful
+ * when chasing bugs. Calls to it should not be checked in.
+ */
+/*? %%clicmd:group_label Circular Journal %% ?*/
+/*? %%syscfg:group_label Circular Journal %% ?*/
+
+#include <stdio.h>
+#include <vlib/vlib.h>
+
+#include <vlib/unix/cj.h>
+
+cj_main_t cj_main;
+
+void
+cj_log (u32 type, void *data0, void *data1)
+{
+  u64 new_tail;
+  cj_main_t *cjm = &cj_main;
+  cj_record_t *r;
+  /* Append one (type, data0, data1) record; a no-op while disabled. */
+  if (cjm->enable == 0)
+    return;
+  /* Atomically advance tail; the new value selects this call's slot. */
+  new_tail = __sync_add_and_fetch (&cjm->tail, 1);
+  /* cj_config makes num_records a power of two, so the AND wraps around. */
+  r = (cj_record_t *) & (cjm->records[new_tail & (cjm->num_records - 1)]);
+  r->time = vlib_time_now (cjm->vlib_main);
+  r->thread_index = vlib_get_thread_index ();
+  r->type = type;
+  r->data[0] = pointer_to_uword (data0);
+  r->data[1] = pointer_to_uword (data1);
+}
+
+void
+cj_stop (void)
+{
+  cj_main_t *cjm = &cj_main;
+
+  cjm->enable = 0;
+}
+
+
+clib_error_t *
+cj_init (vlib_main_t * vm)
+{
+  cj_main_t *cjm = &cj_main;
+
+  cjm->vlib_main = vm;
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (cj_init);
+
+static clib_error_t *
+cj_config (vlib_main_t * vm, unformat_input_t * input)
+{
+  cj_main_t *cjm = &cj_main;
+  int matched = 0;
+  int enable = 0;
+  /* Parse the "cj" section: "records N" sizes the ring, "on" enables it. */
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "records %d", &cjm->num_records))
+	matched = 1;
+      else if (unformat (input, "on"))
+	enable = 1;
+      else
+	return clib_error_return (0, "cj_config: unknown input '%U'",
+				  format_unformat_error, input);
+    }
+
+  /* A zero count cannot size the ring (num_records - 1 is index and mask). */
+  if (matched == 0 || cjm->num_records == 0)
+    return 0;
+
+  cjm->num_records = max_pow2 (cjm->num_records);
+  vec_validate (cjm->records, cjm->num_records - 1);
+  memset (cjm->records, 0xff, cjm->num_records * sizeof (cj_record_t));
+  cjm->tail = ~0;		/* the first cj_log increments this to 0 */
+  cjm->enable = enable;
+  return 0;
+}
+
+/*?
+ * Configure the circular journal diagnostic mechanism. This is only useful
+ * if you, the developer, have written code to make use of the circular
+ * journal.
+ *
+ * @cfgcmd{records, &lt;number&gt;}
+ * Configure the number of records to allocate for the circular journal.
+ *
+ * @cfgcmd{on}
+ * Enable the collection of records in the circular journal at the
+ * earliest opportunity.
+?*/
+VLIB_CONFIG_FUNCTION (cj_config, "cj");
+
+void
+cj_enable_disable (int is_enable)
+{
+  cj_main_t *cjm = &cj_main;
+  /* Without a record buffer there is nothing to switch on or off. */
+  if (cjm->num_records == 0)
+    vlib_cli_output (cjm->vlib_main, "CJ not configured...");
+  else
+    cjm->enable = is_enable;
+}
+
+static inline void
+cj_dump_one_record (cj_record_t * r)
+{
+  fprintf (stderr, "[%d]: %10.6f T%02d %llx %llx\n",
+	   r->thread_index, r->time, r->type,
+	   (long long unsigned int) r->data[0],
+	   (long long unsigned int) r->data[1]);
+}
+
+/* Dump journal records to stderr, oldest first.  When filterN_enable is
+ * set, only records whose data[N] equals filterN are printed. */
+static void
+cj_dump_internal (u8 filter0_enable, u64 filter0,
+		  u8 filter1_enable, u64 filter1)
+{
+  cj_main_t *cjm = &cj_main;
+  cj_record_t *r;
+  u64 tail = cjm->tail;		/* read the volatile counter only once */
+  u32 i, pass, begin, end, newest, oldest, wrapped;
+
+  if (cjm->num_records == 0)
+    {
+      fprintf (stderr, "CJ not configured...\n");
+      return;
+    }
+
+  if (tail == (u64) ~ 0)
+    {
+      fprintf (stderr, "No data collected...\n");
+      return;
+    }
+
+  /*
+   * tail is only ever incremented by cj_log and is not wrapped there, so
+   * it must be masked before it is used as a slot number.  The slot after
+   * the newest record holds the oldest one once the journal has wrapped;
+   * until then it still carries the 0xff fill written by cj_config.
+   */
+  newest = tail & (cjm->num_records - 1);
+  oldest = (newest + 1) & (cjm->num_records - 1);
+  wrapped = oldest != 0 && cjm->records[oldest].thread_index != (u32) ~ 0;
+
+  /* Pass 0 (wrapped only): oldest..end of buffer.  Pass 1: 0..newest.
+   * If oldest is slot 0, pass 1 alone covers the buffer in order, so
+   * wrapped stays clear and nothing is printed twice. */
+  for (pass = wrapped ? 0 : 1; pass < 2; pass++)
+    {
+      begin = (pass == 0) ? oldest : 0;
+      end = (pass == 0) ? cjm->num_records : newest + 1;
+      for (i = begin; i < end; i++)
+	{
+	  r = &(cjm->records[i]);
+	  if (filter0_enable && (r->data[0] != filter0))
+	    continue;
+	  if (filter1_enable && (r->data[1] != filter1))
+	    continue;
+	  cj_dump_one_record (r);
+	}
+    }
+}
+
+void
+cj_dump (void)
+{
+  cj_dump_internal (0, 0, 0, 0);
+}
+
+void
+cj_dump_filter_data0 (u64 filter0)
+{
+  cj_dump_internal (1 /* enable f0 */ , filter0, 0, 0);
+}
+
+void
+cj_dump_filter_data1 (u64 filter1)
+{
+  cj_dump_internal (0, 0, 1 /* enable f1 */ , filter1);
+}
+
+void
+cj_dump_filter_data12 (u64 filter0, u64 filter1)
+{
+  cj_dump_internal (1, filter0, 1, filter1);
+}
+
+static clib_error_t *
+cj_command_fn (vlib_main_t * vm,
+	       unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  int new_state = -1;		/* -1: leave the enable state alone */
+  int want_dump = 0;
+
+  /* Consume every keyword first; the last enable/disable seen wins. */
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "enable") || unformat (input, "on"))
+	new_state = 1;
+      else if (unformat (input, "disable") || unformat (input, "off"))
+	new_state = 0;
+      else if (unformat (input, "dump"))
+	want_dump = 1;
+      else
+	return clib_error_return (0, "unknown input `%U'",
+				  format_unformat_error, input);
+    }
+
+  /* Any state change is applied before the dump is produced. */
+  if (new_state != -1)
+    cj_enable_disable (new_state);
+  if (want_dump)
+    cj_dump ();
+  return 0;
+}
+
+/*?
+ * Enable, disable the collection of diagnostic data into a
+ * circular journal or dump the circular journal diagnostic data.
+ * This is only useful if you, the developer, have written code to make
+ * use of the circular journal.
+ *
+ * When dumping the data it is formatted and sent to @c stderr of the
+ * VPP process; when running VPP in <code>unix interactive</code> mode
+ * this is typically the same place as the Debug CLI.
+?*/
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cj_command,static) = {
+  .path = "cj",
+  .short_help = "cj <enable | disable | dump>",
+  .function = cj_command_fn,
+};
+/* *INDENT-ON* */
+
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/unix/cj.h b/src/vlib/unix/cj.h
new file mode 100644
index 00000000..d0a1d46e
--- /dev/null
+++ b/src/vlib/unix/cj.h
@@ -0,0 +1,79 @@
+/*
+ *------------------------------------------------------------------
+ * cj.h
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *------------------------------------------------------------------
+ */
+
+#ifndef __included_cj_h__
+#define __included_cj_h__
+
+typedef struct
+{
+  f64 time;			/* vlib_time_now () when the record was logged */
+  u32 thread_index;		/* logging thread; ~0 in a never-written slot */
+  u32 type;			/* record type supplied by the cj_log caller */
+  u64 data[2];			/* data0 and data1 passed to cj_log */
+} cj_record_t;
+
+typedef struct
+{
+  volatile u64 tail;		/* bumped once per logged record; ~0 = empty */
+  cj_record_t *records;		/* ring of num_records slots */
+  u32 num_records;		/* power of two once configured; 0 = not set up */
+  volatile u32 enable;		/* cj_log returns immediately while this is 0 */
+
+  vlib_main_t *vlib_main;	/* stored by cj_init */
+} cj_main_t;
+
+void cj_log (u32 type, void *data0, void *data1);
+
+/*
+ * Supply in application main, so we can log from any library...
+ * Declare a weak reference in the library, off you go.
+ */
+
+#define DECLARE_CJ_GLOBAL_LOG                                   \
+void cj_global_log (unsigned type, void * data0, void * data1)  \
+  __attribute__ ((weak));                                       \
+                                                                \
+unsigned __cj_type;                                             \
+void * __cj_data0;                                              \
+void * __cj_data1;                                              \
+                                                                \
+void                                                            \
+cj_global_log (unsigned type, void * data0, void * data1)       \
+{                                                               \
+  __cj_type = type;                                             \
+  __cj_data0 = data0;                                           \
+  __cj_data1 = data1;                                           \
+}
+
+#define CJ_GLOBAL_LOG_PROTOTYPE
+void
+cj_global_log (unsigned type, void *data0, void *data1)
+__attribute__ ((weak));
+
+void cj_stop (void);
+
+#endif /* __included_cj_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c
new file mode 100644
index 00000000..be3c813a
--- /dev/null
+++ b/src/vlib/unix/cli.c
@@ -0,0 +1,3468 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * cli.c: Unix stdin/socket CLI.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+/**
+ * @file
+ * @brief Unix stdin/socket command line interface.
+ * Provides a command line interface so humans can interact with VPP.
+ * This is predominantly a debugging and testing mechanism.
+ */
+/*? %%clicmd:group_label Command line session %% ?*/
+/*? %%syscfg:group_label Command line session %% ?*/
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vppinfra/timer.h>
+
+#include <ctype.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <termios.h>
+#include <signal.h>
+#include <unistd.h>
+#include <arpa/telnet.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+/** ANSI escape character (0x1b). */
+#define ESC "\x1b"
+
+/** ANSI Control Sequence Introducer. */
+#define CSI ESC "["
+
+/** ANSI clear screen, then move the cursor to row 1, column 1. */
+#define ANSI_CLEAR      CSI "2J" CSI "1;1H"
+/** ANSI reset color settings. */
+#define ANSI_RESET      CSI "0m"
+/** ANSI Start bold text. */
+#define ANSI_BOLD       CSI "1m"
+/** ANSI Start dim (faint) text. */
+#define ANSI_DIM        CSI "2m"
+/** ANSI Start dark red text. */
+#define ANSI_DRED       ANSI_DIM CSI "31m"
+/** ANSI Start bright red text. */
+#define ANSI_BRED       ANSI_BOLD CSI "31m"
+/** ANSI clear line cursor is on. */
+#define ANSI_CLEARLINE  CSI "2K"
+/** ANSI scroll screen down one line. */
+#define ANSI_SCROLLDN   CSI "1T"
+/** ANSI save cursor position. */
+#define ANSI_SAVECURSOR CSI "s"
+/** ANSI restore cursor position if previously saved. */
+#define ANSI_RESTCURSOR CSI "u"
+
+/** Maximum depth into a byte stream from which to compile a Telnet
+ * protocol message. This is a safety measure. */
+#define UNIX_CLI_MAX_DEPTH_TELNET 24
+
+/** Minimum terminal width we will accept */
+#define UNIX_CLI_MIN_TERMINAL_WIDTH 1
+/** Maximum terminal width we will accept */
+#define UNIX_CLI_MAX_TERMINAL_WIDTH 512
+/** Minimum terminal height we will accept */
+#define UNIX_CLI_MIN_TERMINAL_HEIGHT 1
+/** Maximum terminal height we will accept */
+#define UNIX_CLI_MAX_TERMINAL_HEIGHT 512
+
+
+/** A CLI banner line. */
+typedef struct
+{
+  u8 *line;	/**< The line to print. */
+  u32 length;	/**< The length of the line without terminating NUL. */
+} unix_cli_banner_t;
+
+#define _(a) { .line = (u8 *)(a), .length = sizeof(a) - 1 }
+/** Plain welcome banner. */
+static unix_cli_banner_t unix_cli_banner[] = {
+  _("    _______    _        _   _____  ___ \n"),
+  _(" __/ __/ _ \\  (_)__    | | / / _ \\/ _ \\\n"),
+  _(" _/ _// // / / / _ \\   | |/ / ___/ ___/\n"),
+  _(" /_/ /____(_)_/\\___/   |___/_/  /_/    \n"),
+  _("\n")
+};
+
+/** ANSI color welcome banner. */
+static unix_cli_banner_t unix_cli_banner_color[] = {
+  _(ANSI_BRED "    _______    _     " ANSI_RESET "   _   _____  ___ \n"),
+  _(ANSI_BRED " __/ __/ _ \\  (_)__ " ANSI_RESET "   | | / / _ \\/ _ \\\n"),
+  _(ANSI_BRED " _/ _// // / / / _ \\" ANSI_RESET "   | |/ / ___/ ___/\n"),
+  _(ANSI_BRED " /_/ /____(_)_/\\___/" ANSI_RESET "   |___/_/  /_/    \n"),
+  _("\n")
+};
+
+#undef _
+
+/** Pager index entry: one display line within a buffered string */
+typedef struct
+{
+  /** Index into pager_vector of the string holding this display line */
+  u32 line;
+
+  /** Byte offset of the display line within that string */
+  u32 offset;
+
+  /** Length of the display line in bytes, including any final newline */
+  u32 length;
+} unix_cli_pager_index_t;
+
+
+/** Unix CLI session. */
+typedef struct
+{
+  /** Index of this session's clib_file_t in the file_main file pool */
+  u32 clib_file_index;
+
+  /** Vector of output pending write to file descriptor. */
+  u8 *output_vector;
+
+  /** Vector of input saved by Unix input node to be processed by
+     CLI process. */
+  u8 *input_vector;
+
+  /** This session has command history. */
+  u8 has_history;
+  /** Array of vectors of commands in the history. */
+  u8 **command_history;
+  /** The command currently pointed at by the history cursor. */
+  u8 *current_command;
+  /** How far from the end of the history array the user has browsed. */
+  i32 excursion;
+
+  /** Maximum number of history entries this session will store. */
+  u32 history_limit;
+
+  /** Current command line counter */
+  u32 command_number;
+
+  /** The string being searched for in the history. */
+  u8 *search_key;
+  /** If non-zero then the CLI is searching in the history array.
+   * - @c -1 means search backwards.
+   * - @c 1 means search forwards.
+   */
+  int search_mode;
+
+  /** Position of the insert cursor on the current input line */
+  u32 cursor;
+
+  /** Line mode or char mode */
+  u8 line_mode;
+
+  /** If set, each newline in cooked output is sent as CR + LF */
+  u8 crlf_mode;
+
+  /** Can we do ANSI output? */
+  u8 ansi_capable;
+
+  /** Has the session started? */
+  u8 started;
+
+  /** Non-zero bypasses the pager; set to 2 when the buffer limit is hit */
+  u8 no_pager;
+
+  /** Whether the session is interactive or not.
+   * Controls things like initial banner, the CLI prompt etc.  */
+  u8 is_interactive;
+
+  /** Whether the session is attached to a socket. */
+  u8 is_socket;
+
+  /** If EPIPE has been detected, prevent further write-related
+   * activity on the descriptor.
+   */
+  u8 has_epipe;
+
+  /** Pager buffer: vector of buffered output strings */
+  u8 **pager_vector;
+
+  /** Index of line fragments in the pager buffer */
+  unix_cli_pager_index_t *pager_index;
+
+  /** Index into pager_index of the line at the top of the page */
+  u32 pager_start;
+
+  /** Terminal width; pager lines are split at this many bytes */
+  u32 width;
+
+  /** Terminal height; output bypasses the pager while this is 0 */
+  u32 height;
+
+  /** Process node that is signalled with this session's events */
+  u32 process_node_index;
+} unix_cli_file_t;
+
+/** Discard all pager state held by a CLI session.
+ * @param f The CLI session whose pager buffer and index are released.
+ */
+always_inline void
+unix_cli_pager_reset (unix_cli_file_t * f)
+{
+  u8 **entry;
+
+  /* Free every buffered string before the vector that holds them */
+  vec_foreach (entry, f->pager_vector)
+  {
+    vec_free (*entry);
+  }
+  vec_free (f->pager_vector);
+  f->pager_vector = 0;
+
+  vec_free (f->pager_index);
+  f->pager_index = 0;
+  f->pager_start = 0;
+}
+
+/** Free the buffers owned by a CLI session.
+ * @param f The CLI session; its pager state and I/O vectors are released.
+ */
+always_inline void
+unix_cli_file_free (unix_cli_file_t * f)
+{
+  unix_cli_pager_reset (f);
+  vec_free (f->input_vector);
+  vec_free (f->output_vector);
+}
+
+/** Results of matching input bytes against a CLI action list */
+typedef enum
+{
+  UNIX_CLI_PARSE_ACTION_NOACTION = 0,	/**< No action */
+  UNIX_CLI_PARSE_ACTION_CRLF,		/**< Carriage return, newline or enter */
+  UNIX_CLI_PARSE_ACTION_TAB,		/**< Tab key */
+  UNIX_CLI_PARSE_ACTION_ERASE,		/**< Erase cursor left */
+  UNIX_CLI_PARSE_ACTION_ERASERIGHT,	/**< Erase cursor right */
+  UNIX_CLI_PARSE_ACTION_UP,		/**< Up arrow */
+  UNIX_CLI_PARSE_ACTION_DOWN,		/**< Down arrow */
+  UNIX_CLI_PARSE_ACTION_LEFT,		/**< Left arrow */
+  UNIX_CLI_PARSE_ACTION_RIGHT,		/**< Right arrow */
+  UNIX_CLI_PARSE_ACTION_HOME,		/**< Home key (jump to start of line) */
+  UNIX_CLI_PARSE_ACTION_END,		/**< End key (jump to end of line) */
+  UNIX_CLI_PARSE_ACTION_WORDLEFT,	/**< Jump cursor to start of left word */
+  UNIX_CLI_PARSE_ACTION_WORDRIGHT,	/**< Jump cursor to start of right word */
+  UNIX_CLI_PARSE_ACTION_ERASELINELEFT,	/**< Erase line to left of cursor */
+  UNIX_CLI_PARSE_ACTION_ERASELINERIGHT,	/**< Erase line to right & including cursor */
+  UNIX_CLI_PARSE_ACTION_CLEAR,		/**< Clear the terminal */
+  UNIX_CLI_PARSE_ACTION_REVSEARCH,	/**< Search backwards in command history */
+  UNIX_CLI_PARSE_ACTION_FWDSEARCH,	/**< Search forwards in command history */
+  UNIX_CLI_PARSE_ACTION_YANK,		/**< Undo last erase action */
+  UNIX_CLI_PARSE_ACTION_TELNETIAC,	/**< Telnet control code */
+
+  UNIX_CLI_PARSE_ACTION_PAGER_CRLF,	/**< Enter pressed (CR, CRLF, LF, etc) */
+  UNIX_CLI_PARSE_ACTION_PAGER_QUIT,	/**< Exit the pager session */
+  UNIX_CLI_PARSE_ACTION_PAGER_NEXT,	/**< Scroll to next page */
+  UNIX_CLI_PARSE_ACTION_PAGER_DN,	/**< Scroll to next line */
+  UNIX_CLI_PARSE_ACTION_PAGER_UP,	/**< Scroll to previous line */
+  UNIX_CLI_PARSE_ACTION_PAGER_TOP,	/**< Scroll to first line */
+  UNIX_CLI_PARSE_ACTION_PAGER_BOTTOM,	/**< Scroll to last line */
+  UNIX_CLI_PARSE_ACTION_PAGER_PGDN,	/**< Scroll to next page */
+  UNIX_CLI_PARSE_ACTION_PAGER_PGUP,	/**< Scroll to previous page */
+  UNIX_CLI_PARSE_ACTION_PAGER_REDRAW,	/**< Clear and redraw the page on the terminal */
+  UNIX_CLI_PARSE_ACTION_PAGER_SEARCH,	/**< Search the pager buffer */
+
+  UNIX_CLI_PARSE_ACTION_PARTIALMATCH,	/**< Input is a proper prefix of at least one pattern */
+  UNIX_CLI_PARSE_ACTION_NOMATCH		/**< Action parser did not find any match */
+} unix_cli_parse_action_t;
+
+/** @brief Mapping of input buffer strings to action values.
+ * @note This won't work as a hash since we need to be able to do
+ *       partial matches on the string.
+ */
+typedef struct
+{
+  u8 *input;			    /**< Input string to match; NULL ends a list. */
+  u32 len;			    /**< Length of input without final NUL. */
+  unix_cli_parse_action_t action;   /**< Action to take when matched. */
+} unix_cli_parse_actions_t;
+
+/** @brief Given a capital ASCII letter character return a @c NUL terminated
+ * string with the control code for that letter.
+ *
+ * @param c An ASCII character.
+ * @return A @c NUL terminated string of type @c u8[].
+ *
+ * @par Example
+ *     @c CTL('A') returns <code>{ 0x01, 0x00 }</code> as a @c u8[].
+ */
+#define CTL(c) (u8[]){ (c) - '@', 0 }
+
+#define _(a,b) { .input = (u8 *)(a), .len = sizeof(a) - 1, .action = (b) }
+/**
+ * CLI input patterns, NULL-terminated; first full match in list order wins.
+ * @showinitializer
+ */
+static unix_cli_parse_actions_t unix_cli_parse_strings[] = {
+  /* Line handling */
+  _("\r\n", UNIX_CLI_PARSE_ACTION_CRLF),	/* Must be before '\r' */
+  _("\n", UNIX_CLI_PARSE_ACTION_CRLF),
+  _("\r\0", UNIX_CLI_PARSE_ACTION_CRLF),	/* Telnet does this */
+  _("\r", UNIX_CLI_PARSE_ACTION_CRLF),
+
+  /* Unix shell control codes */
+  _(CTL ('B'), UNIX_CLI_PARSE_ACTION_LEFT),
+  _(CTL ('F'), UNIX_CLI_PARSE_ACTION_RIGHT),
+  _(CTL ('P'), UNIX_CLI_PARSE_ACTION_UP),
+  _(CTL ('N'), UNIX_CLI_PARSE_ACTION_DOWN),
+  _(CTL ('A'), UNIX_CLI_PARSE_ACTION_HOME),
+  _(CTL ('E'), UNIX_CLI_PARSE_ACTION_END),
+  _(CTL ('D'), UNIX_CLI_PARSE_ACTION_ERASERIGHT),
+  _(CTL ('U'), UNIX_CLI_PARSE_ACTION_ERASELINELEFT),
+  _(CTL ('K'), UNIX_CLI_PARSE_ACTION_ERASELINERIGHT),
+  _(CTL ('Y'), UNIX_CLI_PARSE_ACTION_YANK),
+  _(CTL ('L'), UNIX_CLI_PARSE_ACTION_CLEAR),
+  _(ESC "b", UNIX_CLI_PARSE_ACTION_WORDLEFT),	/* Alt-B */
+  _(ESC "f", UNIX_CLI_PARSE_ACTION_WORDRIGHT),	/* Alt-F */
+  _("\b", UNIX_CLI_PARSE_ACTION_ERASE),	/* ^H */
+  _("\x7f", UNIX_CLI_PARSE_ACTION_ERASE),	/* Backspace */
+  _("\t", UNIX_CLI_PARSE_ACTION_TAB),	/* ^I */
+
+  /* VT100 Normal mode - Broadest support */
+  _(CSI "A", UNIX_CLI_PARSE_ACTION_UP),
+  _(CSI "B", UNIX_CLI_PARSE_ACTION_DOWN),
+  _(CSI "C", UNIX_CLI_PARSE_ACTION_RIGHT),
+  _(CSI "D", UNIX_CLI_PARSE_ACTION_LEFT),
+  _(CSI "H", UNIX_CLI_PARSE_ACTION_HOME),
+  _(CSI "F", UNIX_CLI_PARSE_ACTION_END),
+  _(CSI "3~", UNIX_CLI_PARSE_ACTION_ERASERIGHT),	/* Delete */
+  _(CSI "1;5D", UNIX_CLI_PARSE_ACTION_WORDLEFT),	/* C-Left */
+  _(CSI "1;5C", UNIX_CLI_PARSE_ACTION_WORDRIGHT),	/* C-Right */
+
+  /* VT100 Application mode - Some Gnome Terminal functions use these */
+  _(ESC "OA", UNIX_CLI_PARSE_ACTION_UP),
+  _(ESC "OB", UNIX_CLI_PARSE_ACTION_DOWN),
+  _(ESC "OC", UNIX_CLI_PARSE_ACTION_RIGHT),
+  _(ESC "OD", UNIX_CLI_PARSE_ACTION_LEFT),
+  _(ESC "OH", UNIX_CLI_PARSE_ACTION_HOME),
+  _(ESC "OF", UNIX_CLI_PARSE_ACTION_END),
+
+  /* ANSI X3.41-1974 - sent by Microsoft Telnet and PuTTY */
+  _(CSI "1~", UNIX_CLI_PARSE_ACTION_HOME),
+  _(CSI "4~", UNIX_CLI_PARSE_ACTION_END),
+
+  /* Emacs-ish history search */
+  _(CTL ('S'), UNIX_CLI_PARSE_ACTION_FWDSEARCH),
+  _(CTL ('R'), UNIX_CLI_PARSE_ACTION_REVSEARCH),
+
+  /* Other protocol things */
+  _("\xff", UNIX_CLI_PARSE_ACTION_TELNETIAC),	/* IAC */
+  _("\0", UNIX_CLI_PARSE_ACTION_NOACTION),	/* NUL */
+  _(NULL, UNIX_CLI_PARSE_ACTION_NOMATCH)	/* End-of-list marker */
+};
+
+/**
+ * Patterns to match when a CLI session is in the pager; NULL-terminated.
+ * @showinitializer
+ */
+static unix_cli_parse_actions_t unix_cli_parse_pager[] = {
+  /* Line handling */
+  _("\r\n", UNIX_CLI_PARSE_ACTION_PAGER_CRLF),	/* Must be before '\r' */
+  _("\n", UNIX_CLI_PARSE_ACTION_PAGER_CRLF),
+  _("\r\0", UNIX_CLI_PARSE_ACTION_PAGER_CRLF),	/* Telnet does this */
+  _("\r", UNIX_CLI_PARSE_ACTION_PAGER_CRLF),
+
+  /* Pager commands */
+  _(" ", UNIX_CLI_PARSE_ACTION_PAGER_NEXT),
+  _("q", UNIX_CLI_PARSE_ACTION_PAGER_QUIT),
+  _(CTL ('L'), UNIX_CLI_PARSE_ACTION_PAGER_REDRAW),
+  _(CTL ('R'), UNIX_CLI_PARSE_ACTION_PAGER_REDRAW),
+  _("/", UNIX_CLI_PARSE_ACTION_PAGER_SEARCH),
+
+  /* VT100 */
+  _(CSI "A", UNIX_CLI_PARSE_ACTION_PAGER_UP),
+  _(CSI "B", UNIX_CLI_PARSE_ACTION_PAGER_DN),
+  _(CSI "H", UNIX_CLI_PARSE_ACTION_PAGER_TOP),
+  _(CSI "F", UNIX_CLI_PARSE_ACTION_PAGER_BOTTOM),
+
+  /* VT100 Application mode */
+  _(ESC "OA", UNIX_CLI_PARSE_ACTION_PAGER_UP),
+  _(ESC "OB", UNIX_CLI_PARSE_ACTION_PAGER_DN),
+  _(ESC "OH", UNIX_CLI_PARSE_ACTION_PAGER_TOP),
+  _(ESC "OF", UNIX_CLI_PARSE_ACTION_PAGER_BOTTOM),
+
+  /* ANSI X3.41-1974 */
+  _(CSI "1~", UNIX_CLI_PARSE_ACTION_PAGER_TOP),
+  _(CSI "4~", UNIX_CLI_PARSE_ACTION_PAGER_BOTTOM),
+  _(CSI "5~", UNIX_CLI_PARSE_ACTION_PAGER_PGUP),
+  _(CSI "6~", UNIX_CLI_PARSE_ACTION_PAGER_PGDN),
+
+  /* Other protocol things */
+  _("\xff", UNIX_CLI_PARSE_ACTION_TELNETIAC),	/* IAC */
+  _("\0", UNIX_CLI_PARSE_ACTION_NOACTION),	/* NUL */
+  _(NULL, UNIX_CLI_PARSE_ACTION_NOMATCH)	/* End-of-list marker */
+};
+
+#undef _
+
+/** CLI session events. */
+typedef enum
+{
+  UNIX_CLI_PROCESS_EVENT_READ_READY,  /**< A file descriptor has data to be read. */
+  UNIX_CLI_PROCESS_EVENT_QUIT,	      /**< A CLI session wants to close. */
+} unix_cli_process_event_type_t;
+
+/** CLI global state. */
+typedef struct
+{
+  /** Prompt string for CLI; a vector, sent to interactive sessions. */
+  u8 *cli_prompt;
+
+  /** Pool of CLI sessions, indexed by CLI file index. */
+  unix_cli_file_t *cli_file_pool;
+
+  /** NOTE(review): by its name, process node indices free for reuse. */
+  u32 *unused_cli_process_node_indices;
+
+  /** The session index of the stdin cli */
+  u32 stdin_cli_file_index;
+
+  /** File pool index of current input. */
+  u32 current_input_file_index;
+} unix_cli_main_t;
+
+/** CLI global state */
+static unix_cli_main_t unix_cli_main;
+
+/**
+ * @brief Look up the leading bytes of an input buffer in an action list.
+ *
+ * Walks the NULL-terminated @ref unix_cli_parse_actions_t list @a a in order
+ * and compares each entry with the first bytes of @a input. The first entry
+ * whose whole pattern is a prefix of @a input decides the result; its length
+ * is stored in @a *matched. Entries longer than @a ilen cannot match yet,
+ * but if one of them begins with all @a ilen bytes received so far it is
+ * remembered as a partial match, which is reported only if no entry matches.
+ *
+ * @param[in]  a        Action list to search; ends at a NULL @c input.
+ * @param[in]  input    Bytes to look up.
+ * @param[in]  ilen     Number of valid bytes in @a input.
+ * @param[out] matched  Receives the matched pattern length on a complete
+ *                      match; left untouched otherwise.
+ *
+ * @return The action of the first completely matching entry, or
+ *         @ref UNIX_CLI_PARSE_ACTION_PARTIALMATCH when nothing matched
+ *         completely but @a input is a proper prefix of at least one
+ *         pattern (the caller should keep collecting bytes), or
+ *         @ref UNIX_CLI_PARSE_ACTION_NOMATCH when neither applies.
+ * @note With an @a ilen of zero every pattern counts as a partial match.
+ */
+static unix_cli_parse_action_t
+unix_cli_match_action (unix_cli_parse_actions_t * a,
+		       u8 * input, u32 ilen, i32 * matched)
+{
+  unix_cli_parse_actions_t *entry;
+  u8 saw_prefix = 0;
+
+  /* The list is terminated by an entry with a NULL input string */
+  for (entry = a; entry->input; entry++)
+    {
+      if (ilen < entry->len)
+	{
+	  /* Too few bytes for a full match; if what we have so far agrees
+	   * with this entry then more input may yet complete it. */
+	  if (memcmp (input, entry->input, ilen) == 0)
+	    saw_prefix = 1;
+	  continue;
+	}
+
+      /* Enough bytes: the first entry that the input starts with wins */
+      if (memcmp (input, entry->input, entry->len) == 0)
+	{
+	  *matched = entry->len;
+	  return entry->action;
+	}
+    }
+
+  /* No entry matched completely */
+  if (saw_prefix)
+    return UNIX_CLI_PARSE_ACTION_PARTIALMATCH;
+
+  return UNIX_CLI_PARSE_ACTION_NOMATCH;
+}
+
+
+/** Append bytes to a session's pending output and, if the file was not
+ * already flagged as having data to write, flag it and tell the I/O system.
+ */
+static void
+unix_cli_add_pending_output (clib_file_t * uf,
+			     unix_cli_file_t * cf,
+			     u8 * buffer, uword buffer_bytes)
+{
+  vec_add (cf->output_vector, buffer, buffer_bytes);
+  /* Nothing queued at all: leave the flags as they are */
+  if (vec_len (cf->output_vector) == 0)
+    return;
+
+  /* Only notify the I/O system when the flag actually changes */
+  if (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE)
+    return;
+  uf->flags |= UNIX_FILE_DATA_AVAILABLE_TO_WRITE;
+  file_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY);
+}
+
+/** Remove the first @c n_bytes from a session's pending output and, once
+ * nothing is left, clear the data-to-write flag and tell the I/O system.
+ */
+static void
+unix_cli_del_pending_output (clib_file_t * uf,
+			     unix_cli_file_t * cf, uword n_bytes)
+{
+  vec_delete (cf->output_vector, n_bytes, 0);
+  /* Bytes still queued: keep the file flagged for writing */
+  if (vec_len (cf->output_vector) > 0)
+    return;
+
+  /* Only notify the I/O system when the flag actually changes */
+  if (!(uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE))
+    return;
+  uf->flags &= ~UNIX_FILE_DATA_AVAILABLE_TO_WRITE;
+  file_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY);
+}
+
+/** @brief Bounded search for a byte, in the spirit of strchr.
+ * Scans at most @c len bytes of @c str for the first byte equal to @c chr
+ * and reports where it was found.
+ *
+ * Unlike strchr a miss is not signalled with a null pointer: the function
+ * returns @c len, i.e. the position just past the scanned bytes.
+ *
+ * @param chr The byte value to search for.
+ * @param str The buffer in which to search for the value.
+ * @param len The number of bytes of @c str to examine.
+ *
+ * @return The index of the first occurrence of @c chr, or @c len when
+ *          @c chr does not occur in the first @c len bytes.
+ */
+always_inline word
+unix_vlib_findchr (u8 chr, u8 * str, word len)
+{
+  word pos = 0;
+
+  /* Advance until the byte is found or the window is exhausted */
+  while (pos < len && str[pos] != chr)
+    pos++;
+
+  return pos < len ? pos : len;
+}
+
+/** @brief Write a buffer to a CLI session now, or queue it for later.
+ * The buffer is written straight to the session's file descriptor unless
+ * output is already pending, in which case it is queued behind it. Bytes
+ * left over after a short write or EAGAIN are queued too. Once EPIPE has
+ * been seen the session is told to quit and further output is discarded.
+ *
+ * No CRLF translation happens here; \c unix_vlib_cli_output_cooked does
+ * that and then calls this function. Call it directly only for output that
+ * must not be cooked.
+ *
+ * @param cf Unix CLI session of the desired stream to write to.
+ * @param uf The Unix file structure of the desired stream to write to.
+ * @param buffer Pointer to the bytes to write.
+ * @param buffer_bytes The number of bytes from \c buffer to write.
+ */
+static void
+unix_vlib_cli_output_raw (unix_cli_file_t * cf,
+			  clib_file_t * uf, u8 * buffer, uword buffer_bytes)
+{
+  int sent = 0;
+
+  /* After EPIPE the descriptor is dead; drop the output silently */
+  if (cf->has_epipe)
+    return;
+
+  /* Write directly only when nothing is queued ahead of this buffer.
+   * Sockets go through send() so MSG_NOSIGNAL can suppress SIGPIPE. */
+  if (vec_len (cf->output_vector) == 0)
+    sent = cf->is_socket ?
+      send (uf->file_descriptor, buffer, buffer_bytes, MSG_NOSIGNAL) :
+      write (uf->file_descriptor, buffer, buffer_bytes);
+
+  if (sent < 0 && errno == EPIPE)
+    {
+      /* The peer closed the connection: remember that and ask the
+       * session's process to quit */
+      cf->has_epipe = 1;
+      vlib_process_signal_event (unix_main.vlib_main,
+				 cf->process_node_index,
+				 UNIX_CLI_PROCESS_EVENT_QUIT,
+				 uf->private_data);
+      return;
+    }
+
+  if (sent < 0 && errno != EAGAIN)
+    {
+      clib_unix_warning ("write");
+      return;
+    }
+
+  /* Either EAGAIN, a short write, or data was already pending: queue
+   * the unsent remainder for later */
+  if ((word) sent < (word) buffer_bytes)
+    {
+      uword done = sent < 0 ? 0 : sent;
+      unix_cli_add_pending_output (uf, cf, buffer + done, buffer_bytes - done);
+    }
+}
+
+/** @brief Output a buffer to the CLI, sending CR LF for each newline if the
+ * session is in CRLF mode; otherwise the buffer is passed through untouched.
+ * @param cf Unix CLI session of the desired stream to write to.
+ * @param uf The Unix file structure of the desired stream to write to.
+ * @param buffer Pointer to the bytes to write.
+ * @param buffer_bytes The number of bytes from \c buffer to write.
+ */
+static void
+unix_vlib_cli_output_cooked (unix_cli_file_t * cf,
+			     clib_file_t * uf,
+			     u8 * buffer, uword buffer_bytes)
+{
+  word pos = 0;
+
+  if (!cf->crlf_mode)
+    {
+      /* No translation needed: pass a non-empty buffer through whole */
+      if (buffer_bytes > 0)
+	unix_vlib_cli_output_raw (cf, uf, buffer, buffer_bytes);
+      return;
+    }
+
+  /* Emit the text between newlines as-is and a CR LF pair in place of
+   * each newline */
+  while (pos < buffer_bytes)
+    {
+      /* Next newline at or after pos, or the end of the buffer */
+      word nl = pos + unix_vlib_findchr ('\n', buffer + pos,
+					 buffer_bytes - pos);
+
+      /* The text in front of it (possibly empty) */
+      unix_vlib_cli_output_raw (cf, uf, buffer + pos, nl - pos);
+
+      if (nl < buffer_bytes)
+	{
+	  unix_vlib_cli_output_raw (cf, uf, (u8 *) "\r\n", 2);
+	  nl++;			/* step over the \n just replaced */
+	}
+
+      pos = nl;
+    }
+}
+
+/** @brief Send the CLI prompt, uncooked, to an interactive session */
+static void
+unix_cli_cli_prompt (unix_cli_file_t * cf, clib_file_t * uf)
+{
+  u8 *prompt = unix_cli_main.cli_prompt;
+  /* Non-interactive sessions never see a prompt */
+  if (!cf->is_interactive)
+    return;
+  unix_vlib_cli_output_raw (cf, uf, prompt, vec_len (prompt));
+}
+
+/** @brief Output the "-- more --" prompt with the visible line range */
+static void
+unix_cli_pager_prompt (unix_cli_file_t * cf, clib_file_t * uf)
+{
+  const char *bold = cf->ansi_capable ? ANSI_BOLD : "";
+  const char *reset = cf->ansi_capable ? ANSI_RESET : "";
+  u32 last = cf->pager_start + (cf->height - 1);
+  u8 *text;
+
+  /* The last visible line cannot lie beyond the end of the index */
+  if (last > vec_len (cf->pager_index))
+    last = vec_len (cf->pager_index);
+
+  /* Leading CR so the prompt always starts in the first column */
+  text = format (0, "\r%s-- more -- (%d-%d/%d)%s",
+		 bold, cf->pager_start + 1, last,
+		 vec_len (cf->pager_index), reset);
+
+  unix_vlib_cli_output_cooked (cf, uf, text, vec_len (text));
+
+  vec_free (text);
+}
+
+/** @brief Output "-- message --" in pager-prompt style, then @c postfix */
+static void
+unix_cli_pager_message (unix_cli_file_t * cf, clib_file_t * uf,
+			char *message, char *postfix)
+{
+  /* Bold the message only on ANSI-capable terminals */
+  const char *bold = cf->ansi_capable ? ANSI_BOLD : "";
+  const char *reset = cf->ansi_capable ? ANSI_RESET : "";
+  u8 *text;
+
+  text = format (0, "\r%s-- %s --%s%s", bold, message, reset, postfix);
+
+  unix_vlib_cli_output_cooked (cf, uf, text, vec_len (text));
+  vec_free (text);
+}
+
+/** @brief Erase the printed pager prompt, leaving the cursor in column 1 */
+static void
+unix_cli_pager_prompt_erase (unix_cli_file_t * cf, clib_file_t * uf)
+{
+  u32 i;
+
+  unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\r", 1);
+
+  if (cf->ansi_capable)
+    {
+      unix_vlib_cli_output_cooked (cf, uf, (u8 *) ANSI_CLEARLINE,
+				   sizeof (ANSI_CLEARLINE) - 1);
+      return;
+    }
+
+  /* No ANSI: overwrite with width - 1 spaces. Counting from 1 keeps a
+   * width of 0 from wrapping the unsigned bound around to 2^32 - 1. */
+  for (i = 1; i < cf->width; i++)
+    unix_vlib_cli_output_cooked (cf, uf, (u8 *) " ", 1);
+  unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\r", 1);
+}
+
+/** @brief Move the cursor to column @c x of row @c y with an ANSI sequence */
+static void
+unix_cli_ansi_cursor (unix_cli_file_t * cf, clib_file_t * uf, u16 x, u16 y)
+{
+  u8 *seq;
+
+  /* CSI <row> ; <column> H - note the row is sent first */
+  seq = format (0, "%s%d;%dH", CSI, y, x);
+
+  unix_vlib_cli_output_cooked (cf, uf, seq, vec_len (seq));
+  vec_free (seq);
+}
+
+/** Repaint the page of pager output currently on screen.
+ * @param cf CLI session whose pager buffer is redrawn.
+ * @param uf Unix file of the CLI session.
+ */
+static void
+unix_cli_pager_redraw (unix_cli_file_t * cf, clib_file_t * uf)
+{
+  unix_cli_pager_index_t *last = NULL;
+  u8 *text = NULL;
+  word row;
+
+  /* Nothing is indexed, so there is no page to repaint */
+  if (vec_len (cf->pager_index) == 0)
+    return;
+
+  /* Start from a clean slate: clear the whole screen when ANSI is
+   * available, otherwise at least blank the line the prompt was on */
+  if (!cf->ansi_capable)
+    unix_cli_pager_prompt_erase (cf, uf);
+  else
+    unix_vlib_cli_output_cooked (cf, uf, (u8 *) ANSI_CLEAR,
+				 sizeof (ANSI_CLEAR) - 1);
+
+  /* Send height - 1 fragments from the page start, or fewer if the
+   * index runs out first */
+  for (row = 0; row < cf->height - 1; row++)
+    {
+      if (row + cf->pager_start >= vec_len (cf->pager_index))
+	break;
+
+      last = &cf->pager_index[cf->pager_start + row];
+      text = cf->pager_vector[last->line] + last->offset;
+
+      unix_vlib_cli_output_cooked (cf, uf, text, last->length);
+    }
+
+  /* Keep the prompt on a line of its own when the final fragment sent
+   * did not end in a newline */
+  if (last != NULL && text[last->length - 1] != '\n')
+    unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+
+  unix_cli_pager_prompt (cf, uf);
+}
+
+/** @brief Process and add a line to the pager index.
+ * In normal operation this function copies the @c len_or_index bytes at
+ * @c line into the pager buffer and then iterates over the contents,
+ * adding one index entry for each display line discovered within it.
+ * A display line ends after a newline ("<code>\\n</code>") or once it is
+ * as wide as the terminal. A terminal width of 0 disables wrapping.
+ *
+ * If instead @c line is @c NULL then @c len_or_index is taken to mean the
+ * index of an existing line in the pager buffer; this simply means that the
+ * input line does not need to be cloned since we already have it. This is
+ * typical if we are reindexing the pager buffer.
+ *
+ * @param cf           The CLI session whose pager we are adding to.
+ * @param line         The string of text to be indexed into the pager buffer.
+ *                     If @c line is @c NULL then the mode of operation
+ *                     changes slightly; see the description above.
+ * @param len_or_index If @c line is a pointer to a string then this parameter
+ *                     indicates the length of that string; Otherwise this
+ *                     value provides the index in the pager buffer of an
+ *                     existing string to be indexed.
+ */
+static void
+unix_cli_pager_add_line (unix_cli_file_t * cf, u8 * line, word len_or_index)
+{
+  u8 *p;
+  word i, j, k, line_index, len;
+  /* Width 0 must not yield zero-length fragments (endless loop): never wrap */
+  u32 width = cf->width ? cf->width : ~0;
+  unix_cli_pager_index_t *pi;
+
+  if (line == NULL)
+    {
+      /* Use a line already in the pager buffer */
+      line_index = len_or_index;
+      p = cf->pager_vector[line_index];
+      len = vec_len (p);
+    }
+  else
+    {
+      len = len_or_index;
+      /* Add a copy of the raw string to the pager buffer */
+      p = vec_new (u8, len);
+      clib_memcpy (p, line, len);
+
+      /* store in pager buffer */
+      line_index = vec_len (cf->pager_vector);
+      vec_add1 (cf->pager_vector, p);
+    }
+
+  i = 0;
+  while (i < len)
+    {
+      /* Find the next line, or run to terminal width, or run to EOL */
+      int l = len - i;
+      j = unix_vlib_findchr ((u8) '\n', p, l < width ? l : width);
+
+      if (j < l && p[j] == '\n')	/* incl \n */
+	j++;
+
+      /* Add the line to the index */
+      k = vec_len (cf->pager_index);
+      vec_validate (cf->pager_index, k);
+      pi = &cf->pager_index[k];
+
+      pi->line = line_index;
+      pi->offset = i;
+      pi->length = j;
+
+      i += j;
+      p += j;
+    }
+}
+
+/** @brief Reindex entire pager buffer.
+ * Resets the current pager index and then re-adds the lines in the pager
+ * buffer to the index, splitting them at the session's current width.
+ *
+ * The page start is then moved to the new index entry that contains the
+ * text previously at the top of the page, or clamped if there is none.
+ *
+ * @param cf The CLI session whose pager buffer should be reindexed.
+ */
+static void
+unix_cli_pager_reindex (unix_cli_file_t * cf)
+{
+  word i, old_line = -1, old_offset = 0;
+  unix_cli_pager_index_t *pi;
+
+  /* If there is nothing in the pager buffer then make sure the index
+   * is empty and move on.
+   */
+  if (cf->pager_vector == 0)
+    {
+      vec_reset_length (cf->pager_index);
+      return;
+    }
+
+  /* Remember the buffer line and offset shown at the top of the page so
+   * it can be found again. Skip this when the index has no such entry
+   * (e.g. only empty strings are buffered) rather than read past it. */
+  if (cf->pager_start < vec_len (cf->pager_index))
+    {
+      pi = &cf->pager_index[cf->pager_start];
+      old_line = pi->line;
+      old_offset = pi->offset;
+    }
+
+  /* Re-add the buffered lines to the index */
+  vec_reset_length (cf->pager_index);
+  vec_foreach_index (i, cf->pager_vector)
+  {
+    unix_cli_pager_add_line (cf, NULL, i);
+  }
+
+  /* Find the new fragment that contains the remembered offset */
+  vec_foreach_index (i, cf->pager_index)
+  {
+    pi = &cf->pager_index[i];
+    if (pi->line == old_line &&
+	pi->offset <= old_offset && pi->offset + pi->length > old_offset)
+      {
+	cf->pager_start = i;
+	break;
+      }
+  }
+
+  /* In case the start line was not found, ensure the pager start index
+   * is within bounds */
+  if (cf->pager_start >= vec_len (cf->pager_index))
+    {
+      if (!cf->height || vec_len (cf->pager_index) < (cf->height - 1))
+	cf->pager_start = 0;
+      else
+	cf->pager_start = vec_len (cf->pager_index) - (cf->height - 1);
+    }
+}
+
+/** VLIB CLI output function.
+ *
+ * If the terminal has a pager configured then this function takes care
+ * of collating output into the pager buffer; ensuring only the first page
+ * is displayed and any lines in excess of the first page are buffered.
+ *
+ * If the index grows past the configured pager buffer limit then the pager
+ * is cancelled (no_pager is set to 2), the lines below the current page are
+ * sent to the terminal and the pager buffer is reset.
+ *
+ * The pager is bypassed and output goes straight to the terminal when the
+ * session's no_pager is set, the buffer limit is 0 or the height is 0.
+ *
+ * @param cli_file_index Index of the CLI session where this output is
+ *                       directed.
+ * @param buffer         String of printable bytes to be output.
+ * @param buffer_bytes   The number of bytes in @c buffer to be output.
+ */
+static void
+unix_vlib_cli_output (uword cli_file_index, u8 * buffer, uword buffer_bytes)
+{
+  unix_main_t *um = &unix_main;
+  clib_file_main_t *fm = &file_main;
+  unix_cli_main_t *cm = &unix_cli_main;
+  unix_cli_file_t *cf;
+  clib_file_t *uf;
+
+  cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index);
+  uf = pool_elt_at_index (fm->file_pool, cf->clib_file_index);
+
+  if (cf->no_pager || um->cli_pager_buffer_limit == 0 || cf->height == 0)
+    {
+      unix_vlib_cli_output_cooked (cf, uf, buffer, buffer_bytes);
+    }
+  else
+    {
+      word row = vec_len (cf->pager_index);
+      u8 *line;
+      unix_cli_pager_index_t *pi;
+
+      /* Index and add the output lines to the pager buffer. */
+      unix_cli_pager_add_line (cf, buffer, buffer_bytes);
+
+      /* Now iterate over the index entries just added, starting at the
+       * index length recorded before the call above.
+       */
+      while (row < vec_len (cf->pager_index))
+	{
+	  if (row < cf->height - 1)
+	    {
+	      /* Still within the first page: output this line */
+	      pi = &cf->pager_index[row];
+	      line = cf->pager_vector[pi->line] + pi->offset;
+	      unix_vlib_cli_output_cooked (cf, uf, line, pi->length);
+
+	      /* if the last line didn't end in newline, and we're at the
+	       * bottom of the page, add a newline */
+	      if (line[pi->length - 1] != '\n' && row == cf->height - 2)
+		unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+	    }
+	  else
+	    {
+	      /* Below the page: refresh the prompt when row is a multiple of 10 */
+	      if (!(row % 10))
+		unix_cli_pager_prompt (cf, uf);
+	    }
+	  row++;
+	}
+
+      /* Check if we went over the pager buffer limit */
+      if (vec_len (cf->pager_index) > um->cli_pager_buffer_limit)
+	{
+	  /* Stop using the pager for the remainder of this CLI command */
+	  cf->no_pager = 2;
+
+	  /* If we likely printed the prompt, erase it */
+	  if (vec_len (cf->pager_index) > cf->height - 1)
+	    unix_cli_pager_prompt_erase (cf, uf);
+
+	  /* Send the buffered lines that lie below the current page */
+	  for (row = cf->pager_start + (cf->height - 1);
+	       row < vec_len (cf->pager_index); row++)
+	    {
+	      pi = &cf->pager_index[row];
+	      line = cf->pager_vector[pi->line] + pi->offset;
+	      unix_vlib_cli_output_cooked (cf, uf, line, pi->length);
+	    }
+
+	  unix_cli_pager_reset (cf);
+	}
+    }
+}
+
+/** Identify whether a terminal type is ANSI capable.
+ *
+ * Compares the string given in @c term with a list of terminal types known
+ * to support ANSI escape sequences. The comparison ignores case and must
+ * match a whole name: a string that is empty, or that is only the leading
+ * part of a known name, is not recognized.
+ *
+ * This list contains, for example, @c xterm, @c screen and @c ansi.
+ *
+ * @param term A string with a terminal type in it. It is not required to
+ *             be NUL terminated; only @c len bytes are examined.
+ * @param len The length of the string in @c term.
+ *
+ * @return @c 1 if the terminal type is recognized as supporting ANSI
+ *         terminal sequences; @c 0 otherwise.
+ */
+static u8
+unix_cli_terminal_type_ansi (u8 * term, uword len)
+{
+  /* This may later be better done as a hash of some sort. */
+
+  /* 'a' is always a string literal, so sizeof (a) - 1 is its length.
+   * The length test matters: strncasecmp() alone only compares the first
+   * 'len' bytes, which would accept any prefix of 'a' (and everything
+   * when len is 0). */
+#define _(a) do { \
+    if (len == sizeof (a) - 1 && \
+	strncasecmp (a, (char *) term, (size_t) len) == 0) return 1; \
+  } while(0)
+
+  _("xterm");
+  _("xterm-color");
+  _("xterm-256color");		/* iTerm on Mac */
+  _("screen");
+  _("screen-256color");		/* Screen and tmux */
+  _("ansi");			/* Microsoft Telnet */
+#undef _
+
+  return 0;
+}
+
+/** Identify whether a terminal type is non-interactive.
+ *
+ * Compares the string given in @c term with a list of terminal types known
+ * to be non-interactive, as sent by tools such as @c vppctl . The
+ * comparison ignores case and must match a whole name: a string that is
+ * empty, or that is only the leading part of a known name, is not
+ * recognized.
+ *
+ * This list contains, for example, @c vppctl.
+ *
+ * @param term A string with a terminal type in it. It is not required to
+ *             be NUL terminated; only @c len bytes are examined.
+ * @param len The length of the string in @c term.
+ *
+ * @return @c 1 if the terminal type is recognized as being non-interactive;
+ *         @c 0 otherwise.
+ */
+static u8
+unix_cli_terminal_type_noninteractive (u8 * term, uword len)
+{
+  /* This may later be better done as a hash of some sort. */
+
+  /* 'a' is always a string literal, so sizeof (a) - 1 is its length.
+   * The length test matters: strncasecmp() alone only compares the first
+   * 'len' bytes, which would accept any prefix of 'a' (and everything
+   * when len is 0). */
+#define _(a) do { \
+    if (len == sizeof (a) - 1 && \
+	strncasecmp (a, (char *) term, (size_t) len) == 0) return 1; \
+  } while(0)
+
+  _("vppctl");
+#undef _
+
+  return 0;
+}
+
+/** Mark a CLI session as non-interactive and switch off the features
+ * that only make sense for interactive use.
+ *
+ * @param cf The CLI session to modify.
+ */
+static void
+unix_cli_set_session_noninteractive (unix_cli_file_t * cf)
+{
+  /* No command history is kept */
+  cf->has_history = 0;
+  cf->history_limit = 0;
+
+  /* Output is never paged */
+  cf->no_pager = 1;
+
+  /* Line mode is switched on and the session flagged accordingly */
+  cf->line_mode = 1;
+  cf->is_interactive = 0;
+}
+
+/** @brief Emit initial welcome banner and prompt on a connection.
+ *
+ * The session is always marked as started. Non-interactive sessions get
+ * nothing further; interactive sessions get the banner (unless banners are
+ * disabled in the unix configuration) followed by the CLI prompt.
+ *
+ * @param cm The CLI main structure (not used by this function).
+ * @param cf The CLI session to welcome.
+ */
+static void
+unix_cli_file_welcome (unix_cli_main_t * cm, unix_cli_file_t * cf)
+{
+  unix_main_t *um = &unix_main;
+  clib_file_main_t *fm = &file_main;
+  clib_file_t *uf = pool_elt_at_index (fm->file_pool, cf->clib_file_index);
+  unix_cli_banner_t *first = 0, *cur;
+  int n_lines = 0;
+
+  /* Getting here at all means the session has started */
+  cf->started = 1;
+
+  /* Non-interactive sessions get neither banner nor prompt */
+  if (!(cf->is_interactive))
+    return;
+
+  /*
+   * Put the first bytes directly into the buffer so that further output is
+   * queued until everything is ready. (otherwise the initial prompt can
+   * appear midway through VPP initialization)
+   */
+  unix_cli_add_pending_output (uf, cf, (u8 *) "\r", 1);
+
+  /* Choose the colour or plain banner; leave it empty when disabled */
+  if (!um->cli_no_banner)
+    {
+      first = cf->ansi_capable ? unix_cli_banner_color : unix_cli_banner;
+      n_lines = cf->ansi_capable ?
+	ARRAY_LEN (unix_cli_banner_color) : ARRAY_LEN (unix_cli_banner);
+    }
+
+  for (cur = first; cur < first + n_lines; cur++)
+    unix_vlib_cli_output_cooked (cf, uf, cur->line, cur->length);
+
+  /* Prompt. */
+  unix_cli_cli_prompt (cf, uf);
+}
+
+/** @brief Timer-driven failsafe that makes sure a session is welcomed.
+ *
+ * Intended for telnet sessions that fail to negotiate the terminal type:
+ * if the session still exists and has not been started yet, the welcome
+ * banner and prompt are sent now.
+ *
+ * @param arg   Index of the session in the CLI file pool.
+ * @param delay Ignored.
+ */
+static void
+unix_cli_file_welcome_timer (any arg, f64 delay)
+{
+  unix_cli_main_t *cm = &unix_cli_main;
+  uword session_index = (uword) arg;
+  unix_cli_file_t *cf;
+
+  (void) delay;
+
+  /* The connection may have closed already */
+  if (pool_is_free_index (cm->cli_file_pool, session_index))
+    return;
+
+  cf = pool_elt_at_index (cm->cli_file_pool, session_index);
+
+  /* Nothing to do if the welcome was already sent */
+  if (cf->started)
+    return;
+
+  unix_cli_file_welcome (cm, cf);
+}
+
+/** @brief A mostly no-op Telnet state machine.
+ * Process Telnet command bytes in a way that ensures we're mostly
+ * transparent to the Telnet protocol. That is, it's mostly a no-op.
+ *
+ * The commands that have an effect are:
+ * - a terminal type sub-option, which updates the session's ANSI
+ *   capability, may switch the session to non-interactive, and sends the
+ *   welcome banner if the session has not started yet;
+ * - a window size sub-option, which updates the session's width and
+ *   height (clamped to the supported range) and redraws the pager;
+ * - "Are You There", which produces a short visible response.
+ *
+ * @param um           The unix main structure (not used by this function).
+ * @param cf           The CLI session the bytes were received on.
+ * @param uf           The file structure used for output to the session.
+ * @param input_vector The input bytes, starting at the IAC byte.
+ * @param len          The number of valid bytes at @c input_vector.
+ *
+ * @return -1 if we need more bytes, otherwise the number of bytes to
+ *         consume from the input_vector, not including the initial IAC
+ *         byte. This is 0 for an unknown command, where only the IAC byte
+ *         itself is to be eaten.
+ */
+static i32
+unix_cli_process_telnet (unix_main_t * um,
+			 unix_cli_file_t * cf,
+			 clib_file_t * uf, u8 * input_vector, uword len)
+{
+  /* Input_vector starts at IAC byte.
+   * See if we have a complete message; if not, return -1 so we wait for more.
+   * if we have a complete message, consume those bytes from the vector.
+   */
+  i32 consume = 0;
+
+  if (len == 1)
+    return -1;			/* want more bytes */
+
+  switch (input_vector[1])
+    {
+    case IAC:
+      /* two IAC's in a row means to pass through 0xff.
+       * since that makes no sense here, just consume it.
+       */
+      consume = 1;
+      break;
+
+    case WILL:
+    case WONT:
+    case DO:
+    case DONT:
+      /* Expect 3 bytes. Use the caller-supplied length here, as everywhere
+       * else in this function: input_vector starts at the IAC byte and so
+       * is not necessarily the base of a vector, in which case vec_len()
+       * on it would be meaningless. */
+      if (len < 3)
+	return -1;		/* want more bytes */
+
+      consume = 2;
+      break;
+
+    case SB:
+      {
+	/* Sub option - search ahead for IAC SE to end it */
+	i32 i;
+	for (i = 3; i < len && i < UNIX_CLI_MAX_DEPTH_TELNET; i++)
+	  {
+	    if (input_vector[i - 1] == IAC && input_vector[i] == SE)
+	      {
+		/* We have a complete message; see if we care about it */
+		switch (input_vector[2])
+		  {
+		  case TELOPT_TTYPE:
+		    if (input_vector[3] != 0)
+		      break;
+		    {
+		      /* See if the terminal type is recognized. The name
+		       * sits between the 4 byte header and the trailing
+		       * IAC SE, the SE being at index i. */
+		      u8 *term = input_vector + 4;
+		      uword term_len = i - 5;
+
+		      /* See if the terminal type is ANSI capable */
+		      cf->ansi_capable =
+			unix_cli_terminal_type_ansi (term, term_len);
+
+		      /* See if the terminal type indicates non-interactive */
+		      if (unix_cli_terminal_type_noninteractive
+			  (term, term_len))
+			unix_cli_set_session_noninteractive (cf);
+		    }
+
+		    /* If session not started, we can release the pause */
+		    if (!cf->started)
+		      /* Send the welcome banner and initial prompt */
+		      unix_cli_file_welcome (&unix_cli_main, cf);
+		    break;
+
+		  case TELOPT_NAWS:
+		    /* Window size */
+		    if (i != 8)	/* check message is correct size */
+		      break;
+
+		    /* Width and height are 16 bit values in network byte
+		     * order. Assemble them bytewise: they sit at odd
+		     * offsets, so a direct u16 load would be misaligned. */
+		    cf->width = (input_vector[3] << 8) | input_vector[4];
+		    if (cf->width > UNIX_CLI_MAX_TERMINAL_WIDTH)
+		      cf->width = UNIX_CLI_MAX_TERMINAL_WIDTH;
+		    if (cf->width < UNIX_CLI_MIN_TERMINAL_WIDTH)
+		      cf->width = UNIX_CLI_MIN_TERMINAL_WIDTH;
+
+		    cf->height = (input_vector[5] << 8) | input_vector[6];
+		    if (cf->height > UNIX_CLI_MAX_TERMINAL_HEIGHT)
+		      cf->height = UNIX_CLI_MAX_TERMINAL_HEIGHT;
+		    if (cf->height < UNIX_CLI_MIN_TERMINAL_HEIGHT)
+		      cf->height = UNIX_CLI_MIN_TERMINAL_HEIGHT;
+
+		    /* reindex pager buffer */
+		    unix_cli_pager_reindex (cf);
+		    /* redraw page */
+		    unix_cli_pager_redraw (cf, uf);
+		    break;
+
+		  default:
+		    break;
+		  }
+		/* Consume it all */
+		consume = i;
+		break;
+	      }
+	  }
+
+	if (i == UNIX_CLI_MAX_DEPTH_TELNET)
+	  consume = 1;		/* hit max search depth, advance one byte */
+
+	if (consume == 0)
+	  return -1;		/* want more bytes */
+
+	break;
+      }
+
+    case GA:
+    case EL:
+    case EC:
+    case AO:
+    case IP:
+    case BREAK:
+    case DM:
+    case NOP:
+    case SE:
+    case EOR:
+    case ABORT:
+    case SUSP:
+    case xEOF:
+      /* Simple one-byte messages */
+      consume = 1;
+      break;
+
+    case AYT:
+      /* Are You There - trigger a visible response */
+      consume = 1;
+      unix_vlib_cli_output_cooked (cf, uf, (u8 *) "fd.io VPP\n", 10);
+      break;
+
+    default:
+      /* Unknown command! Eat the IAC byte */
+      break;
+    }
+
+  return consume;
+}
+
+/** @brief Process actionable input.
+ * Based on the \c action process the input; this typically involves
+ * searching the command history or editing the current command line.
+ */
+static int
+unix_cli_line_process_one (unix_cli_main_t * cm,
+			   unix_main_t * um,
+			   unix_cli_file_t * cf,
+			   clib_file_t * uf,
+			   u8 input, unix_cli_parse_action_t action)
+{
+  u8 *prev;
+  u8 *save = 0;
+  u8 **possible_commands;
+  int j, delta;
+
+  switch (action)
+    {
+    case UNIX_CLI_PARSE_ACTION_NOACTION:
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_REVSEARCH:
+    case UNIX_CLI_PARSE_ACTION_FWDSEARCH:
+      if (!cf->has_history || !cf->history_limit)
+	break;
+      if (cf->search_mode == 0)
+	{
+	  /* Erase the current command (if any) */
+	  for (j = 0; j < (vec_len (cf->current_command)); j++)
+	    unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b \b", 3);
+
+	  vec_reset_length (cf->search_key);
+	  vec_reset_length (cf->current_command);
+	  if (action == UNIX_CLI_PARSE_ACTION_REVSEARCH)
+	    cf->search_mode = -1;
+	  else
+	    cf->search_mode = 1;
+	  cf->cursor = 0;
+	}
+      else
+	{
+	  if (action == UNIX_CLI_PARSE_ACTION_REVSEARCH)
+	    cf->search_mode = -1;
+	  else
+	    cf->search_mode = 1;
+
+	  cf->excursion += cf->search_mode;
+	  goto search_again;
+	}
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_ERASELINELEFT:
+      /* Erase the command from the cursor to the start */
+
+      /* Shimmy forwards to the new end of line position */
+      delta = vec_len (cf->current_command) - cf->cursor;
+      for (j = cf->cursor; j > delta; j--)
+	unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+      /* Zap from here to the end of what is currently displayed */
+      for (; j < (vec_len (cf->current_command)); j++)
+	unix_vlib_cli_output_cooked (cf, uf, (u8 *) " ", 1);
+      /* Get back to the start of the line */
+      for (j = 0; j < (vec_len (cf->current_command)); j++)
+	unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+
+      j = vec_len (cf->current_command) - cf->cursor;
+      memmove (cf->current_command, cf->current_command + cf->cursor, j);
+      _vec_len (cf->current_command) = j;
+
+      /* Print the new contents */
+      unix_vlib_cli_output_cooked (cf, uf, cf->current_command, j);
+      /* Shimmy back to the start */
+      for (j = 0; j < (vec_len (cf->current_command)); j++)
+	unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+      cf->cursor = 0;
+
+      cf->search_mode = 0;
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_ERASELINERIGHT:
+      /* Erase the command from the cursor to the end */
+
+      /* Zap from cursor to end of what is currently displayed */
+      for (j = cf->cursor; j < (vec_len (cf->current_command)); j++)
+	unix_vlib_cli_output_cooked (cf, uf, (u8 *) " ", 1);
+      /* Get back to where we were */
+      for (j = cf->cursor; j < (vec_len (cf->current_command)); j++)
+	unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+
+      /* Truncate the line at the cursor */
+      _vec_len (cf->current_command) = cf->cursor;
+
+      cf->search_mode = 0;
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_LEFT:
+      if (cf->cursor > 0)
+	{
+	  unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+	  cf->cursor--;
+	}
+
+      cf->search_mode = 0;
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_RIGHT:
+      if (cf->cursor < vec_len (cf->current_command))
+	{
+	  /* have to emit the character under the cursor */
+	  unix_vlib_cli_output_cooked (cf, uf,
+				       cf->current_command + cf->cursor, 1);
+	  cf->cursor++;
+	}
+
+      cf->search_mode = 0;
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_UP:
+    case UNIX_CLI_PARSE_ACTION_DOWN:
+      if (!cf->has_history || !cf->history_limit)
+	break;
+      cf->search_mode = 0;
+      /* Erase the command */
+      for (j = cf->cursor; j < (vec_len (cf->current_command)); j++)
+	unix_vlib_cli_output_cooked (cf, uf, (u8 *) " ", 1);
+      for (j = 0; j < (vec_len (cf->current_command)); j++)
+	unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b \b", 3);
+      vec_reset_length (cf->current_command);
+      if (vec_len (cf->command_history))
+	{
+	  if (action == UNIX_CLI_PARSE_ACTION_UP)
+	    delta = -1;
+	  else
+	    delta = 1;
+
+	  cf->excursion += delta;
+
+	  if (cf->excursion == vec_len (cf->command_history))
+	    {
+	      /* down-arrowed to last entry - want a blank line */
+	      _vec_len (cf->current_command) = 0;
+	    }
+	  else if (cf->excursion < 0)
+	    {
+	      /* up-arrowed over the start to the end, want a blank line */
+	      cf->excursion = vec_len (cf->command_history);
+	      _vec_len (cf->current_command) = 0;
+	    }
+	  else
+	    {
+	      if (cf->excursion > (i32) vec_len (cf->command_history) - 1)
+		/* down-arrowed past end - wrap to start */
+		cf->excursion = 0;
+
+	      /* Print the command at the current position */
+	      prev = cf->command_history[cf->excursion];
+	      vec_validate (cf->current_command, vec_len (prev) - 1);
+
+	      clib_memcpy (cf->current_command, prev, vec_len (prev));
+	      _vec_len (cf->current_command) = vec_len (prev);
+	      unix_vlib_cli_output_cooked (cf, uf, cf->current_command,
+					   vec_len (cf->current_command));
+	    }
+	}
+      cf->cursor = vec_len (cf->current_command);
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_HOME:
+      if (vec_len (cf->current_command) && cf->cursor > 0)
+	{
+	  while (cf->cursor)
+	    {
+	      unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+	      cf->cursor--;
+	    }
+	}
+
+      cf->search_mode = 0;
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_END:
+      if (vec_len (cf->current_command) &&
+	  cf->cursor < vec_len (cf->current_command))
+	{
+	  unix_vlib_cli_output_cooked (cf, uf,
+				       cf->current_command + cf->cursor,
+				       vec_len (cf->current_command) -
+				       cf->cursor);
+	  cf->cursor = vec_len (cf->current_command);
+	}
+
+      cf->search_mode = 0;
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_WORDLEFT:
+      if (vec_len (cf->current_command) && cf->cursor > 0)
+	{
+	  j = cf->cursor;
+
+	  unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+	  j--;
+
+	  while (j && isspace (cf->current_command[j]))
+	    {
+	      unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+	      j--;
+	    }
+	  while (j && !isspace (cf->current_command[j]))
+	    {
+	      if (isspace (cf->current_command[j - 1]))
+		break;
+	      unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+	      j--;
+	    }
+
+	  cf->cursor = j;
+	}
+
+      cf->search_mode = 0;
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_WORDRIGHT:
+      if (vec_len (cf->current_command) &&
+	  cf->cursor < vec_len (cf->current_command))
+	{
+	  int e = vec_len (cf->current_command);
+	  j = cf->cursor;
+	  while (j < e && !isspace (cf->current_command[j]))
+	    j++;
+	  while (j < e && isspace (cf->current_command[j]))
+	    j++;
+	  unix_vlib_cli_output_cooked (cf, uf,
+				       cf->current_command + cf->cursor,
+				       j - cf->cursor);
+	  cf->cursor = j;
+	}
+
+      cf->search_mode = 0;
+      break;
+
+
+    case UNIX_CLI_PARSE_ACTION_ERASE:
+      if (vec_len (cf->current_command))
+	{
+	  if (cf->cursor == vec_len (cf->current_command))
+	    {
+	      unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b \b", 3);
+	      _vec_len (cf->current_command)--;
+	      cf->cursor--;
+	    }
+	  else if (cf->cursor > 0)
+	    {
+	      /* shift everything at & to the right of the cursor left by 1 */
+	      j = vec_len (cf->current_command) - cf->cursor;
+	      memmove (cf->current_command + cf->cursor - 1,
+		       cf->current_command + cf->cursor, j);
+	      _vec_len (cf->current_command)--;
+	      cf->cursor--;
+	      /* redraw the rest of the line */
+	      unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+	      unix_vlib_cli_output_cooked (cf, uf,
+					   cf->current_command + cf->cursor,
+					   j);
+	      unix_vlib_cli_output_cooked (cf, uf, (u8 *) " \b\b", 3);
+	      /* and shift the terminal cursor back where it should be */
+	      while (--j)
+		unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+	    }
+	}
+      cf->search_mode = 0;
+      cf->excursion = 0;
+      vec_reset_length (cf->search_key);
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_ERASERIGHT:
+      if (vec_len (cf->current_command))
+	{
+	  if (cf->cursor < vec_len (cf->current_command))
+	    {
+	      /* shift everything to the right of the cursor left by 1 */
+	      j = vec_len (cf->current_command) - cf->cursor - 1;
+	      memmove (cf->current_command + cf->cursor,
+		       cf->current_command + cf->cursor + 1, j);
+	      _vec_len (cf->current_command)--;
+	      /* redraw the rest of the line */
+	      unix_vlib_cli_output_cooked (cf, uf,
+					   cf->current_command + cf->cursor,
+					   j);
+	      unix_vlib_cli_output_cooked (cf, uf, (u8 *) " \b", 2);
+	      /* and shift the terminal cursor back where it should be */
+	      if (j)
+		{
+		  unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+		  while (--j)
+		    unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+		}
+	    }
+	}
+      else if (input == 'D' - '@')
+	{
+	  /* ^D with no command entered = quit */
+	  unix_vlib_cli_output_cooked (cf, uf, (u8 *) "quit\n", 5);
+	  vlib_process_signal_event (um->vlib_main,
+				     vlib_current_process (um->vlib_main),
+				     UNIX_CLI_PROCESS_EVENT_QUIT,
+				     cf - cm->cli_file_pool);
+	}
+      cf->search_mode = 0;
+      cf->excursion = 0;
+      vec_reset_length (cf->search_key);
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_CLEAR:
+      /* If we're in ANSI mode, clear the screen.
+       * Then redraw the prompt and any existing command input, then put
+       * the cursor back where it was in that line.
+       */
+      if (cf->ansi_capable)
+	unix_vlib_cli_output_cooked (cf, uf,
+				     (u8 *) ANSI_CLEAR,
+				     sizeof (ANSI_CLEAR) - 1);
+      else
+	unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+
+      unix_vlib_cli_output_raw (cf, uf,
+				cm->cli_prompt, vec_len (cm->cli_prompt));
+      unix_vlib_cli_output_raw (cf, uf,
+				cf->current_command,
+				vec_len (cf->current_command));
+      for (j = cf->cursor; j < vec_len (cf->current_command); j++)
+	unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_TAB:
+      if (cf->cursor < vec_len (cf->current_command))
+	{
+	  /* if we are in the middle of a line, complete only if
+	   * the cursor points to whitespace */
+	  if (isspace (cf->current_command[cf->cursor]))
+	    {
+	      /* save and clear any input that is after the cursor */
+	      vec_resize (save, vec_len (cf->current_command) - cf->cursor);
+	      clib_memcpy (save, cf->current_command + cf->cursor,
+			   vec_len (cf->current_command) - cf->cursor);
+	      _vec_len (cf->current_command) = cf->cursor;
+	    }
+	  else
+	    {
+	      unix_vlib_cli_output_raw (cf, uf, (u8 *) "\a", 1);
+	      break;
+	    }
+	}
+      possible_commands =
+	vlib_cli_get_possible_completions (cf->current_command);
+      if (vec_len (possible_commands) == 1)
+	{
+	  u32 j = cf->cursor;
+	  u8 *completed = possible_commands[0];
+
+	  /* find the last word of current_command */
+	  while (j >= 1 && !isspace (cf->current_command[j - 1]))
+	    {
+	      j--;
+	      unix_vlib_cli_output_raw (cf, uf, (u8 *) "\b", 1);
+	    }
+	  _vec_len (cf->current_command) = j;
+
+	  /* replace it with the newly expanded command */
+	  vec_append (cf->current_command, completed);
+
+	  /* echo to the terminal */
+	  unix_vlib_cli_output_raw (cf, uf, completed, vec_len (completed));
+
+	  /* add one trailing space if needed */
+	  if (vec_len (save) == 0)
+	    {
+	      vec_add1 (cf->current_command, ' ');
+	      unix_vlib_cli_output_raw (cf, uf, (u8 *) " ", 1);
+	    }
+
+	  cf->cursor = vec_len (cf->current_command);
+
+	}
+      else if (vec_len (possible_commands) >= 2)
+	{
+	  u8 **possible_command;
+	  uword max_command_len = 0, min_command_len = ~0;
+	  u32 i, j;
+
+	  vec_foreach (possible_command, possible_commands)
+	  {
+	    if (vec_len (*possible_command) > max_command_len)
+	      {
+		max_command_len = vec_len (*possible_command);
+	      }
+	    if (vec_len (*possible_command) < min_command_len)
+	      {
+		min_command_len = vec_len (*possible_command);
+	      }
+	  }
+
+	  unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+
+	  i = 0;
+	  vec_foreach (possible_command, possible_commands)
+	  {
+	    if (i + max_command_len >= cf->width)
+	      {
+		unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+		i = 0;
+	      }
+	    unix_vlib_cli_output_raw (cf, uf, *possible_command,
+				      vec_len (*possible_command));
+	    for (j = vec_len (*possible_command); j < max_command_len + 2;
+		 j++)
+	      {
+		unix_vlib_cli_output_raw (cf, uf, (u8 *) " ", 1);
+	      }
+	    i += max_command_len + 2;
+	  }
+
+	  unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+
+	  /* rewrite prompt */
+	  unix_cli_cli_prompt (cf, uf);
+	  unix_vlib_cli_output_raw (cf, uf, cf->current_command,
+				    vec_len (cf->current_command));
+
+	  /* count length of last word */
+	  j = cf->cursor;
+	  i = 0;
+	  while (j >= 1 && !isspace (cf->current_command[j - 1]))
+	    {
+	      j--;
+	      i++;
+	    }
+
+	  /* determine smallest common command */
+	  for (; i < min_command_len; i++)
+	    {
+	      u8 common = '\0';
+	      int stop = 0;
+	      vec_foreach (possible_command, possible_commands)
+	      {
+		if (common == '\0')
+		  {
+		    common = (*possible_command)[i];
+		  }
+		else if (common != (*possible_command)[i])
+		  {
+		    stop = 1;
+		    break;
+		  }
+	      }
+	      if (!stop)
+		{
+		  vec_add1 (cf->current_command, common);
+		  cf->cursor++;
+		  unix_vlib_cli_output_raw (cf, uf, (u8 *) & common, 1);
+		}
+	      else
+		{
+		  break;
+		}
+	    }
+	}
+      else
+	{
+	  unix_vlib_cli_output_raw (cf, uf, (u8 *) "\a", 1);
+	}
+
+      if (vec_len (save) > 0)
+	{
+	  /* restore remaining input if tab was hit in the middle of a line */
+	  unix_vlib_cli_output_raw (cf, uf, save, vec_len (save));
+	  for (j = 0; j < vec_len (save); j++)
+	    {
+	      unix_vlib_cli_output_raw (cf, uf, (u8 *) "\b", 1);
+	    }
+	  vec_append (cf->current_command, save);
+	  vec_free (save);
+	}
+      vec_free (possible_commands);
+
+      break;
+    case UNIX_CLI_PARSE_ACTION_YANK:
+      /* TODO */
+      break;
+
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_QUIT:
+    pager_quit:
+      unix_cli_pager_prompt_erase (cf, uf);
+      unix_cli_pager_reset (cf);
+      unix_cli_cli_prompt (cf, uf);
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_NEXT:
+    case UNIX_CLI_PARSE_ACTION_PAGER_PGDN:
+      /* show next page of the buffer */
+      if (cf->height + cf->pager_start < vec_len (cf->pager_index))
+	{
+	  u8 *line = NULL;
+	  unix_cli_pager_index_t *pi = NULL;
+
+	  int m = cf->pager_start + (cf->height - 1);
+	  unix_cli_pager_prompt_erase (cf, uf);
+	  for (j = m;
+	       j < vec_len (cf->pager_index) && cf->pager_start < m;
+	       j++, cf->pager_start++)
+	    {
+	      pi = &cf->pager_index[j];
+	      line = cf->pager_vector[pi->line] + pi->offset;
+	      unix_vlib_cli_output_cooked (cf, uf, line, pi->length);
+	    }
+	  /* if the last line didn't end in newline, add a newline */
+	  if (pi && line[pi->length - 1] != '\n')
+	    unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+	  unix_cli_pager_prompt (cf, uf);
+	}
+      else
+	{
+	  if (action == UNIX_CLI_PARSE_ACTION_PAGER_NEXT)
+	    /* no more in buffer, exit, but only if it was <space> */
+	    goto pager_quit;
+	}
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_DN:
+    case UNIX_CLI_PARSE_ACTION_PAGER_CRLF:
+      /* display the next line of the buffer */
+      if (cf->pager_start < vec_len (cf->pager_index) - (cf->height - 1))
+	{
+	  u8 *line;
+	  unix_cli_pager_index_t *pi;
+
+	  unix_cli_pager_prompt_erase (cf, uf);
+	  pi = &cf->pager_index[cf->pager_start + (cf->height - 1)];
+	  line = cf->pager_vector[pi->line] + pi->offset;
+	  unix_vlib_cli_output_cooked (cf, uf, line, pi->length);
+	  cf->pager_start++;
+	  /* if the last line didn't end in newline, add a newline */
+	  if (line[pi->length - 1] != '\n')
+	    unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+	  unix_cli_pager_prompt (cf, uf);
+	}
+      else
+	{
+	  if (action == UNIX_CLI_PARSE_ACTION_PAGER_CRLF)
+	    /* no more in buffer, exit, but only if it was <enter> */
+	    goto pager_quit;
+	}
+
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_UP:
+      /* scroll the page back one line */
+      if (cf->pager_start > 0)
+	{
+	  u8 *line = NULL;
+	  unix_cli_pager_index_t *pi = NULL;
+
+	  cf->pager_start--;
+	  if (cf->ansi_capable)
+	    {
+	      pi = &cf->pager_index[cf->pager_start];
+	      line = cf->pager_vector[pi->line] + pi->offset;
+	      unix_cli_pager_prompt_erase (cf, uf);
+	      unix_vlib_cli_output_cooked (cf, uf, (u8 *) ANSI_SCROLLDN,
+					   sizeof (ANSI_SCROLLDN) - 1);
+	      unix_vlib_cli_output_cooked (cf, uf, (u8 *) ANSI_SAVECURSOR,
+					   sizeof (ANSI_SAVECURSOR) - 1);
+	      unix_cli_ansi_cursor (cf, uf, 1, 1);
+	      unix_vlib_cli_output_cooked (cf, uf, (u8 *) ANSI_CLEARLINE,
+					   sizeof (ANSI_CLEARLINE) - 1);
+	      unix_vlib_cli_output_cooked (cf, uf, line, pi->length);
+	      unix_vlib_cli_output_cooked (cf, uf, (u8 *) ANSI_RESTCURSOR,
+					   sizeof (ANSI_RESTCURSOR) - 1);
+	      unix_cli_pager_prompt_erase (cf, uf);
+	      unix_cli_pager_prompt (cf, uf);
+	    }
+	  else
+	    {
+	      int m = cf->pager_start + (cf->height - 1);
+	      unix_cli_pager_prompt_erase (cf, uf);
+	      for (j = cf->pager_start;
+		   j < vec_len (cf->pager_index) && j < m; j++)
+		{
+		  pi = &cf->pager_index[j];
+		  line = cf->pager_vector[pi->line] + pi->offset;
+		  unix_vlib_cli_output_cooked (cf, uf, line, pi->length);
+		}
+	      /* if the last line didn't end in newline, add a newline */
+	      if (pi && line[pi->length - 1] != '\n')
+		unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+	      unix_cli_pager_prompt (cf, uf);
+	    }
+	}
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_TOP:
+      /* back to the first page of the buffer */
+      if (cf->pager_start > 0)
+	{
+	  u8 *line = NULL;
+	  unix_cli_pager_index_t *pi = NULL;
+
+	  cf->pager_start = 0;
+	  int m = cf->pager_start + (cf->height - 1);
+	  unix_cli_pager_prompt_erase (cf, uf);
+	  for (j = cf->pager_start; j < vec_len (cf->pager_index) && j < m;
+	       j++)
+	    {
+	      pi = &cf->pager_index[j];
+	      line = cf->pager_vector[pi->line] + pi->offset;
+	      unix_vlib_cli_output_cooked (cf, uf, line, pi->length);
+	    }
+	  /* if the last line didn't end in newline, add a newline */
+	  if (pi && line[pi->length - 1] != '\n')
+	    unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+	  unix_cli_pager_prompt (cf, uf);
+	}
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_BOTTOM:
+      /* skip to the last page of the buffer */
+      if (cf->pager_start < vec_len (cf->pager_index) - (cf->height - 1))
+	{
+	  u8 *line = NULL;
+	  unix_cli_pager_index_t *pi = NULL;
+
+	  cf->pager_start = vec_len (cf->pager_index) - (cf->height - 1);
+	  unix_cli_pager_prompt_erase (cf, uf);
+	  unix_cli_pager_message (cf, uf, "skipping", "\n");
+	  for (j = cf->pager_start; j < vec_len (cf->pager_index); j++)
+	    {
+	      pi = &cf->pager_index[j];
+	      line = cf->pager_vector[pi->line] + pi->offset;
+	      unix_vlib_cli_output_cooked (cf, uf, line, pi->length);
+	    }
+	  /* if the last line didn't end in newline, add a newline */
+	  if (pi && line[pi->length - 1] != '\n')
+	    unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+	  unix_cli_pager_prompt (cf, uf);
+	}
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_PGUP:
+      /* wander back one page in the buffer */
+      if (cf->pager_start > 0)
+	{
+	  u8 *line = NULL;
+	  unix_cli_pager_index_t *pi = NULL;
+	  int m;
+
+	  if (cf->pager_start >= cf->height)
+	    cf->pager_start -= cf->height - 1;
+	  else
+	    cf->pager_start = 0;
+	  m = cf->pager_start + cf->height - 1;
+	  unix_cli_pager_prompt_erase (cf, uf);
+	  for (j = cf->pager_start; j < vec_len (cf->pager_index) && j < m;
+	       j++)
+	    {
+	      pi = &cf->pager_index[j];
+	      line = cf->pager_vector[pi->line] + pi->offset;
+	      unix_vlib_cli_output_cooked (cf, uf, line, pi->length);
+	    }
+	  /* if the last line didn't end in newline, add a newline */
+	  if (pi && line[pi->length - 1] != '\n')
+	    unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+	  unix_cli_pager_prompt (cf, uf);
+	}
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_REDRAW:
+      /* Redraw the current pager screen */
+      unix_cli_pager_redraw (cf, uf);
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_SEARCH:
+      /* search forwards in the buffer */
+      break;
+
+
+    case UNIX_CLI_PARSE_ACTION_CRLF:
+    crlf:
+      unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+
+      if (cf->has_history && cf->history_limit)
+	{
+	  if (cf->command_history
+	      && vec_len (cf->command_history) >= cf->history_limit)
+	    {
+	      vec_free (cf->command_history[0]);
+	      vec_delete (cf->command_history, 1, 0);
+	    }
+	  /* Don't add blank lines to the cmd history */
+	  if (vec_len (cf->current_command))
+	    {
+	      /* Don't duplicate the previous command */
+	      j = vec_len (cf->command_history);
+	      if (j == 0 ||
+		  (vec_len (cf->current_command) !=
+		   vec_len (cf->command_history[j - 1])
+		   || memcmp (cf->current_command, cf->command_history[j - 1],
+			      vec_len (cf->current_command)) != 0))
+		{
+		  /* copy the command to the history */
+		  u8 *c = 0;
+		  vec_append (c, cf->current_command);
+		  vec_add1 (cf->command_history, c);
+		  cf->command_number++;
+		}
+	    }
+	  cf->excursion = vec_len (cf->command_history);
+	}
+
+      cf->search_mode = 0;
+      vec_reset_length (cf->search_key);
+      cf->cursor = 0;
+
+      return 0;
+
+    case UNIX_CLI_PARSE_ACTION_PARTIALMATCH:
+    case UNIX_CLI_PARSE_ACTION_NOMATCH:
+      if (vec_len (cf->pager_index))
+	{
+	  /* no-op for now */
+	}
+      else if (cf->has_history && cf->search_mode && isprint (input))
+	{
+	  int k, limit, offset;
+	  u8 *item;
+
+	  vec_add1 (cf->search_key, input);
+
+	search_again:
+	  for (j = 0; j < vec_len (cf->command_history); j++)
+	    {
+	      if (cf->excursion > (i32) vec_len (cf->command_history) - 1)
+		cf->excursion = 0;
+	      else if (cf->excursion < 0)
+		cf->excursion = vec_len (cf->command_history) - 1;
+
+	      item = cf->command_history[cf->excursion];
+
+	      limit = (vec_len (cf->search_key) > vec_len (item)) ?
+		vec_len (item) : vec_len (cf->search_key);
+
+	      for (offset = 0; offset <= vec_len (item) - limit; offset++)
+		{
+		  for (k = 0; k < limit; k++)
+		    {
+		      if (item[k + offset] != cf->search_key[k])
+			goto next_offset;
+		    }
+		  goto found_at_offset;
+
+		next_offset:
+		  ;
+		}
+	      goto next;
+
+	    found_at_offset:
+	      for (j = 0; j < vec_len (cf->current_command); j++)
+		unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b \b", 3);
+
+	      vec_validate (cf->current_command, vec_len (item) - 1);
+	      clib_memcpy (cf->current_command, item, vec_len (item));
+	      _vec_len (cf->current_command) = vec_len (item);
+
+	      unix_vlib_cli_output_cooked (cf, uf, cf->current_command,
+					   vec_len (cf->current_command));
+	      cf->cursor = vec_len (cf->current_command);
+	      goto found;
+
+	    next:
+	      cf->excursion += cf->search_mode;
+	    }
+
+	  unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\nNo match...", 12);
+	  vec_reset_length (cf->search_key);
+	  vec_reset_length (cf->current_command);
+	  cf->search_mode = 0;
+	  cf->cursor = 0;
+	  goto crlf;
+	}
+      else if (isprint (input))	/* skip any errant control codes */
+	{
+	  if (cf->cursor == vec_len (cf->current_command))
+	    {
+	      /* Append to end */
+	      vec_add1 (cf->current_command, input);
+	      cf->cursor++;
+
+	      /* Echo the character back to the client */
+	      unix_vlib_cli_output_raw (cf, uf, &input, 1);
+	    }
+	  else
+	    {
+	      /* Insert at cursor: resize +1 byte, move everything over */
+	      j = vec_len (cf->current_command) - cf->cursor;
+	      vec_add1 (cf->current_command, (u8) 'A');
+	      memmove (cf->current_command + cf->cursor + 1,
+		       cf->current_command + cf->cursor, j);
+	      cf->current_command[cf->cursor] = input;
+	      /* Redraw the line */
+	      j++;
+	      unix_vlib_cli_output_raw (cf, uf,
+					cf->current_command + cf->cursor, j);
+	      /* Put terminal cursor back */
+	      while (--j)
+		unix_vlib_cli_output_raw (cf, uf, (u8 *) "\b", 1);
+	      cf->cursor++;
+	    }
+	}
+      else
+	{
+	  /* no-op - not printable or otherwise not actionable */
+	}
+
+    found:
+
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_TELNETIAC:
+      break;
+    }
+  return 1;
+}
+
+/** @brief Process input bytes on a stream to provide line editing and
+ * command history in the CLI.
+ * @param cm,um,fm The CLI, unix and file main structures.
+ * @param cf       The CLI session whose @c input_vector is consumed.
+ * @return 0 when a command is ready in @c cf->current_command, 1 when
+ *         more input is needed first. */
+static int
+unix_cli_line_edit (unix_cli_main_t * cm, unix_main_t * um,
+		    clib_file_main_t * fm, unix_cli_file_t * cf)
+{
+  clib_file_t *uf = pool_elt_at_index (fm->file_pool, cf->clib_file_index);
+  int i;
+
+  for (i = 0; i < vec_len (cf->input_vector); i++)
+    {
+      unix_cli_parse_action_t action;
+      i32 matched = 0;
+      unix_cli_parse_actions_t *a;
+
+      /* If we're in the pager mode, search the pager actions */
+      a = vec_len (cf->pager_index) ?
+	unix_cli_parse_pager : unix_cli_parse_strings;
+
+      /* See if the input buffer is some sort of control code */
+      action = unix_cli_match_action (a, &cf->input_vector[i],
+				      vec_len (cf->input_vector) - i,
+				      &matched);
+
+      switch (action)
+	{
+	case UNIX_CLI_PARSE_ACTION_PARTIALMATCH:
+	  /* We need more bytes than the input buffer currently has. The
+	   * bytes before here have been processed, so shift the remaining
+	   * contents to the start of the input buffer. */
+	  if (i)
+	    {
+	      vec_delete (cf->input_vector, i, 0);
+	    }
+	  return 1;		/* wait for more */
+
+	case UNIX_CLI_PARSE_ACTION_TELNETIAC:
+	  /* process telnet options */
+	  matched = unix_cli_process_telnet (um, cf, uf,
+					     cf->input_vector + i,
+					     vec_len (cf->input_vector) - i);
+	  if (matched < 0)
+	    {
+	      /* Partial match again: keep only the unprocessed bytes, at
+	       * the start of the input buffer, until the rest arrives. */
+	      if (i)
+		{
+		  vec_delete (cf->input_vector, i, 0);
+		}
+	      return 1;		/* wait for more */
+	    }
+	  break;
+
+	default:
+	  /* If telnet option processing switched us to line mode, get us
+	   * out of here!
+	   */
+	  if (cf->line_mode)
+	    {
+	      vec_delete (cf->input_vector, i, 0);
+	      /* current_command is about to alias input_vector; free the
+	       * old edit buffer first so that it is not leaked. */
+	      if (cf->current_command != cf->input_vector)
+		vec_free (cf->current_command);
+	      cf->current_command = cf->input_vector;
+	      return 0;
+	    }
+
+	  /* process the action */
+	  if (!unix_cli_line_process_one (cm, um, cf, uf,
+					  cf->input_vector[i], action))
+	    {
+	      /* CRLF found. Consume the bytes from the input_vector */
+	      vec_delete (cf->input_vector, i + matched, 0);
+	      /* And tell our caller to execute cf->current_command */
+	      return 0;
+	    }
+	}
+
+      i += matched;
+    }
+
+  vec_reset_length (cf->input_vector);
+  return 1;
+}
+
+/** @brief Process input to a CLI session, running each complete command found. */
+static void
+unix_cli_process_input (unix_cli_main_t * cm, uword cli_file_index)
+{
+  unix_main_t *um = &unix_main;
+  clib_file_main_t *fm = &file_main;
+  clib_file_t *uf;		/* fetched only after the command has run */
+  unix_cli_file_t *cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index);
+  unformat_input_t input;
+  int vlib_parse_eval (u8 *);	/* local prototype for the disabled call below */
+
+  cm->current_input_file_index = cli_file_index;
+
+more:				/* re-entered while input remains after a command */
+  /* Try vlibplex first.  Someday... */
+  if (0 && vlib_parse_eval (cf->input_vector) == 0)	/* "0 &&": never runs */
+    goto done;
+
+
+  if (cf->line_mode)
+    {
+      /* just treat whatever we got as a complete line of input */
+      cf->current_command = cf->input_vector;
+    }
+  else
+    {
+      /* Line edit, echo, etc. */
+      if (unix_cli_line_edit (cm, um, fm, cf))
+	/* want more input */
+	return;
+    }
+
+  if (um->log_fd)
+    {
+      static u8 *lv;		/* kept between calls; reset, not freed */
+      vec_reset_length (lv);
+      lv = format (lv, "%U[%d]: %v",
+		   format_timeval, 0 /* current bat-time */ ,
+		   0 /* current bat-format */ ,
+		   cli_file_index, cf->current_command);
+      int rv __attribute__ ((unused)) = write (um->log_fd, lv, vec_len (lv));
+    }
+
+  /* Build an unformat structure around our command */
+  unformat_init_vector (&input, cf->current_command);
+
+  /* Remove leading white space from input. */
+  (void) unformat (&input, "");
+
+  cf->pager_start = 0;		/* start a new pager session */
+
+  /* Only hand the command to the CLI if something is left to parse */
+  if (unformat_check_input (&input) != UNFORMAT_END_OF_INPUT)
+    vlib_cli_input (um->vlib_main, &input, unix_vlib_cli_output,
+		    cli_file_index);
+
+  /* Zero buffer since otherwise unformat_free will call vec_free on it. */
+  input.buffer = 0;
+
+  unformat_free (&input);
+
+  /* Re-fetch pointer since pool may have moved. */
+  cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index);
+  uf = pool_elt_at_index (fm->file_pool, cf->clib_file_index);
+
+done:
+  /* reset vector; we'll re-use it later  */
+  if (cf->line_mode)
+    {
+      vec_reset_length (cf->input_vector);
+      cf->current_command = 0;	/* it was only an alias of input_vector */
+    }
+  else
+    {
+      vec_reset_length (cf->current_command);
+    }
+
+  if (cf->no_pager == 2)
+    {
+      /* Pager was programmatically disabled */
+      unix_cli_pager_message (cf, uf, "pager buffer overflowed", "\n");
+      cf->no_pager = um->cli_no_pager;	/* back to the configured setting */
+    }
+
+  if (vec_len (cf->pager_index) == 0
+      || vec_len (cf->pager_index) < cf->height)
+    {
+      /* There was no need for the pager */
+      unix_cli_pager_reset (cf);
+
+      /* Prompt. */
+      unix_cli_cli_prompt (cf, uf);
+    }
+  else
+    {
+      /* Display the pager prompt */
+      unix_cli_pager_prompt (cf, uf);
+    }
+
+  /* Any residual data in the input vector? */
+  if (vec_len (cf->input_vector))
+    goto more;
+
+  /* For non-interactive sessions send a NUL byte.
+   * Specifically this is because vppctl needs to see some traffic in
+   * order to move on to closing the session. Commands with no output
+   * would thus cause vppctl to hang indefinitely in non-interactive mode
+   * since there is also no prompt sent after the command completes.
+   */
+  if (!cf->is_interactive)
+    unix_vlib_cli_output_raw (cf, uf, (u8 *) "\0", 1);
+}
+
+/** Tear down a CLI session and release what it holds.
+ * @note When the session being destroyed is the one on @c stdin this
+ *       jumps out of the main loop instead, which shuts VPP down.
+ */
+static void
+unix_cli_kill (unix_cli_main_t * cm, uword cli_file_index)
+{
+  clib_file_main_t *fm = &file_main;
+  unix_cli_file_t *cf =
+    pool_elt_at_index (cm->cli_file_pool, cli_file_index);
+  clib_file_t *uf = pool_elt_at_index (fm->file_pool, cf->clib_file_index);
+  int n;
+
+  if (uf->file_descriptor == STDIN_FILENO)
+    {
+      /* Quit/EOF on the console session means the program quits. */
+      unix_main_t *um = &unix_main;
+      clib_longjmp (&um->vlib_main->main_loop_exit, VLIB_MAIN_LOOP_EXIT_CLI);
+    }
+
+  /* Line editing state: edit buffer, search key, each history entry. */
+  vec_free (cf->current_command);
+  vec_free (cf->search_key);
+  for (n = 0; n < vec_len (cf->command_history); n++)
+    vec_free (cf->command_history[n]);
+  vec_free (cf->command_history);
+
+  /* Delete the clib file, then free the session and return it to
+   * the pool. */
+  clib_file_del (fm, uf);
+  unix_cli_file_free (cf);
+  pool_put (cm->cli_file_pool, cf);
+}
+
+/** CLI session process: handles the events signalled to a session. */
+static uword
+unix_cli_process (vlib_main_t * vm,
+		  vlib_node_runtime_t * rt, vlib_frame_t * f)
+{
+  unix_cli_main_t *cm = &unix_cli_main;
+  uword *data = 0;
+  uword k;
+
+  for (;;)
+    {
+      unix_cli_process_event_type_t event_type;
+      vlib_process_wait_for_event (vm);
+      event_type = vlib_process_get_events (vm, &data);
+
+      if (event_type == UNIX_CLI_PROCESS_EVENT_QUIT)
+	{
+	  /* Destroy the signalled sessions, then retire this process. */
+	  for (k = 0; k < vec_len (data); k++)
+	    unix_cli_kill (cm, data[k]);
+	  break;
+	}
+      else if (event_type == UNIX_CLI_PROCESS_EVENT_READ_READY)
+	{
+	  /* Each event datum is the index of a session with input. */
+	  for (k = 0; k < vec_len (data); k++)
+	    unix_cli_process_input (cm, data[k]);
+	}
+
+      /* Empty the event data vector but keep it for the next round. */
+      if (data)
+	_vec_len (data) = 0;
+    }
+
+  vec_free (data);
+
+  /* Disable the node and remember its index so it can be re-used. */
+  vlib_node_set_state (vm, rt->node_index, VLIB_NODE_STATE_DISABLED);
+  vec_add1 (cm->unused_cli_process_node_indices, rt->node_index);
+
+  return 0;
+}
+
+/** File descriptor callback: the session's descriptor can be written to
+ * without blocking, so try to send the pending output vector. */
+static clib_error_t *
+unix_cli_write_ready (clib_file_t * uf)
+{
+  unix_cli_main_t *cm = &unix_cli_main;
+  unix_cli_file_t *cf =
+    pool_elt_at_index (cm->cli_file_pool, uf->private_data);
+  unix_main_t *um = &unix_main;
+  int n_sent;
+
+  /* Sockets are written with send() so that MSG_NOSIGNAL can be given,
+   * which prevents SIGPIPE; anything else gets a plain write(). */
+  if (!cf->is_socket)
+    n_sent = write (uf->file_descriptor,
+		    cf->output_vector, vec_len (cf->output_vector));
+  else
+    n_sent = send (uf->file_descriptor, cf->output_vector,
+		   vec_len (cf->output_vector), MSG_NOSIGNAL);
+
+  if (n_sent > 0)
+    {
+      /* Let the pending-output helper account for the bytes written. */
+      unix_cli_del_pending_output (uf, cf, n_sent);
+      return 0;
+    }
+
+  /* Nothing written, or the descriptor would block: not an error. */
+  if (n_sent == 0 || errno == EAGAIN)
+    return 0;
+
+  if (errno != EPIPE)
+    return clib_error_return_unix (0, "write");
+
+  /* The connection closed on us: flag it and signal a quit event
+   * to the session's process. */
+  cf->has_epipe = 1;
+  vlib_process_signal_event (um->vlib_main, cf->process_node_index,
+			     UNIX_CLI_PROCESS_EVENT_QUIT, uf->private_data);
+
+  return 0;
+}
+
+/** Called when a CLI session file descriptor has data to be read.
+ * Appends what is readable to the session's input vector, then signals the
+ * session's process with READ_READY, or with QUIT if read() returned 0. */
+static clib_error_t *
+unix_cli_read_ready (clib_file_t * uf)
+{
+  unix_main_t *um = &unix_main;
+  unix_cli_main_t *cm = &unix_cli_main;
+  unix_cli_file_t *cf;
+  uword l, n_total = 0;
+  int n, n_read, n_try;
+
+  cf = pool_elt_at_index (cm->cli_file_pool, uf->private_data);
+
+  n = n_try = 4096;
+  while (n == n_try)		/* a full chunk: there may be more */
+    {
+      l = vec_len (cf->input_vector);
+      vec_resize (cf->input_vector, l + n_try);
+      n = read (uf->file_descriptor, cf->input_vector + l, n_try);
+
+      /* Trim the unfilled space, even when bailing out on an error */
+      n_read = n < 0 ? 0 : n;
+      _vec_len (cf->input_vector) = l + n_read;
+      n_total += n_read;
+
+      if (n < 0 && errno != EAGAIN)
+	return clib_error_return_unix (0, "read");
+    }
+
+  /* Also report data when a read after a full chunk hit EAGAIN */
+  if (n == 0 || n_total > 0)
+    vlib_process_signal_event (um->vlib_main, cf->process_node_index,
+			       (n == 0 ? UNIX_CLI_PROCESS_EVENT_QUIT
+				: UNIX_CLI_PROCESS_EVENT_READ_READY),
+			       /* event data */ uf->private_data);
+  return /* no error */ 0;
+}
+
+/** Called when a CLI session file descriptor has an error condition. */
+static clib_error_t *
+unix_cli_error_detected (clib_file_t * uf)
+{
+  unix_main_t *um = &unix_main;
+  unix_cli_main_t *cm = &unix_cli_main;
+  unix_cli_file_t *cf;
+
+  cf = pool_elt_at_index (cm->cli_file_pool, uf->private_data);
+  cf->has_epipe = 1;		/* prevent writes while the close is pending */
+  vlib_process_signal_event (um->vlib_main,
+			     cf->process_node_index,
+			     UNIX_CLI_PROCESS_EVENT_QUIT,
+			     /* event data */ uf->private_data);
+
+  return /* no error */ 0;
+}
+
+/** Store a new CLI session, giving it a process node (re-used if possible).
+ * @param name The name of the session; the node is named "unix-cli-<name>".
+ * @param fd   The file descriptor for the session I/O.
+ * @return The session ID, which is its index in @c cm->cli_file_pool.
+ */
+static u32
+unix_cli_file_add (unix_cli_main_t * cm, char *name, int fd)
+{
+  unix_main_t *um = &unix_main;
+  clib_file_main_t *fm = &file_main;
+  unix_cli_file_t *cf;
+  clib_file_t template = { 0 };
+  vlib_main_t *vm = um->vlib_main;
+  vlib_node_t *n;
+
+  name = (char *) format (0, "unix-cli-%s", name);
+
+  if (vec_len (cm->unused_cli_process_node_indices) > 0)
+    {
+      uword l = vec_len (cm->unused_cli_process_node_indices);
+
+      /* Find node and give it new name. */
+      n = vlib_get_node (vm, cm->unused_cli_process_node_indices[l - 1]);
+      vec_free (n->name);
+      n->name = (u8 *) name;	/* the node now owns the name vector */
+
+      vlib_node_set_state (vm, n->index, VLIB_NODE_STATE_POLLING);
+
+      _vec_len (cm->unused_cli_process_node_indices) = l - 1;	/* pop it */
+    }
+  else
+    {
+      /* No spare node: register a new process node for this session */
+      static vlib_node_registration_t r = {
+	.function = unix_cli_process,
+	.type = VLIB_NODE_TYPE_PROCESS,
+	.process_log2_n_stack_bytes = 16,
+      };
+
+      r.name = name;
+      vlib_register_node (vm, &r);
+      vec_free (name);		/* presumably copied by the registration - confirm */
+
+      n = vlib_get_node (vm, r.index);
+    }
+
+  pool_get (cm->cli_file_pool, cf);
+  memset (cf, 0, sizeof (*cf));
+
+  template.read_function = unix_cli_read_ready;
+  template.write_function = unix_cli_write_ready;
+  template.error_function = unix_cli_error_detected;
+  template.file_descriptor = fd;
+  template.private_data = cf - cm->cli_file_pool;	/* pool index of cf */
+
+  cf->process_node_index = n->index;
+  cf->clib_file_index = clib_file_add (fm, &template);
+  cf->output_vector = 0;	/* (already zero from the memset above) */
+  cf->input_vector = 0;
+
+  vlib_start_process (vm, n->runtime_index);
+
+  vlib_process_t *p = vlib_get_process_from_node (vm, n);
+  p->output_function = unix_vlib_cli_output;
+  p->output_function_arg = cf - cm->cli_file_pool;
+
+  return cf - cm->cli_file_pool;
+}
+
+/** Telnet listening socket has a new connection; accept it as a session. */
+static clib_error_t *
+unix_cli_listen_read_ready (clib_file_t * uf)
+{
+  unix_main_t *um = &unix_main;
+  clib_file_main_t *fm = &file_main;
+  unix_cli_main_t *cm = &unix_cli_main;
+  clib_socket_t *s = &um->cli_listen_socket;
+  clib_socket_t client;
+  char *client_name;
+  clib_error_t *error;
+  unix_cli_file_t *cf;
+  u32 cf_index;
+
+  error = clib_socket_accept (s, &client);
+  if (error)
+    return error;
+
+  client_name = (char *) format (0, "%U%c", format_sockaddr, &client.peer, 0);
+
+  cf_index = unix_cli_file_add (cm, client_name, client.fd);
+  cf = pool_elt_at_index (cm->cli_file_pool, cf_index);
+  cf->is_socket = 1;
+
+  /* No longer need CLIB version of socket. */
+  clib_socket_free (&client);
+  vec_free (client_name);
+
+  /* if we're supposed to run telnet session in character mode (default) */
+  if (um->cli_line_mode == 0)
+    {
+      /*
+       * Set telnet client character mode, echo on, suppress "go-ahead".
+       * Technically these should be negotiated, but this works.
+       */
+      u8 charmode_option[] = {
+	IAC, WONT, TELOPT_LINEMODE,	/* server will do char-by-char */
+	IAC, DONT, TELOPT_LINEMODE,	/* client should do char-by-char */
+	IAC, WILL, TELOPT_SGA,	/* server will suppress GA */
+	IAC, DO, TELOPT_SGA,	/* client should suppress Go Ahead */
+	IAC, WILL, TELOPT_ECHO,	/* server will do echo */
+	IAC, DONT, TELOPT_ECHO,	/* client should not echo */
+	IAC, DO, TELOPT_TTYPE,	/* client should tell us its term type */
+	IAC, SB, TELOPT_TTYPE, 1, IAC, SE,	/* now tell me ttype */
+	IAC, DO, TELOPT_NAWS,	/* client should tell us its window sz */
+	IAC, SB, TELOPT_NAWS, 1, IAC, SE,	/* now tell me window size */
+      };
+
+      /* Enable history on this CLI */
+      cf->history_limit = um->cli_history_limit;
+      cf->has_history = cf->history_limit != 0;
+
+      /* This is an interactive session until we decide otherwise */
+      cf->is_interactive = 1;
+
+      /* Make sure this session is in char-by-char mode */
+      cf->line_mode = 0;
+
+      /* We need CRLF */
+      cf->crlf_mode = 1;
+
+      /* Setup the pager */
+      cf->no_pager = um->cli_no_pager;
+
+      /* Send the telnet options (uf now refers to the new session's file) */
+      uf = pool_elt_at_index (fm->file_pool, cf->clib_file_index);
+      unix_vlib_cli_output_raw (cf, uf, charmode_option,
+				ARRAY_LEN (charmode_option));
+
+      /* In case the client doesn't negotiate terminal type, use
+       * a timer to kick off the initial prompt. */
+      timer_call (unix_cli_file_welcome_timer, cf_index, 1);
+    }
+
+  return error;			/* always 0 here; failures returned above */
+}
+
+/** SIGWINCH handler: the system terminal has informed us that the window
+ * size has changed. The new size is fetched and clamped, then the pager
+ * buffer is reindexed and the current page redrawn.
+ */
+static void
+unix_cli_resize_interrupt (int signum)
+{
+  unix_cli_main_t *cm = &unix_cli_main;
+  unix_cli_file_t *cf =
+    pool_elt_at_index (cm->cli_file_pool, cm->stdin_cli_file_index);
+  clib_file_t *uf =
+    pool_elt_at_index (file_main.file_pool, cf->clib_file_index);
+  struct winsize ws;
+
+  (void) signum;		/* not needed */
+
+  if (ioctl (STDIN_FILENO, TIOCGWINSZ, &ws) < 0)
+    {
+      /* Not expected to fail; if it does, ws can't be trusted. */
+      clib_unix_warning ("TIOCGWINSZ");
+      return;
+    }
+
+  /* Take the new height, held within the supported bounds */
+  cf->height = ws.ws_row;
+  if (cf->height > UNIX_CLI_MAX_TERMINAL_HEIGHT)
+    cf->height = UNIX_CLI_MAX_TERMINAL_HEIGHT;
+  if (cf->height < UNIX_CLI_MIN_TERMINAL_HEIGHT)
+    cf->height = UNIX_CLI_MIN_TERMINAL_HEIGHT;
+
+  /* Likewise for the new width */
+  cf->width = ws.ws_col;
+  if (cf->width > UNIX_CLI_MAX_TERMINAL_WIDTH)
+    cf->width = UNIX_CLI_MAX_TERMINAL_WIDTH;
+  if (cf->width < UNIX_CLI_MIN_TERMINAL_WIDTH)
+    cf->width = UNIX_CLI_MIN_TERMINAL_WIDTH;
+
+  /* Reindex the pager buffer for the new size and redraw the page */
+  unix_cli_pager_reindex (cf);
+  unix_cli_pager_redraw (cf, uf);
+}
+
+/** Handle configuration directives in the @em unix section.
+ * Sets up the @c stdin session when running interactively and starts
+ * listening on the CLI socket when one is configured. */
+static clib_error_t *
+unix_cli_config (vlib_main_t * vm, unformat_input_t * input)
+{
+  unix_main_t *um = &unix_main;
+  clib_file_main_t *fm = &file_main;
+  unix_cli_main_t *cm = &unix_cli_main;
+  int flags;
+  clib_error_t *error = 0;
+  unix_cli_file_t *cf;
+  u32 cf_index;
+  struct termios tio;
+  struct sigaction sa;
+  struct winsize ws;
+  u8 *term;
+
+  /* We depend on unix flags being set. */
+  if ((error = vlib_call_config_function (vm, unix_config)))
+    return error;
+
+  if (um->flags & UNIX_FLAG_INTERACTIVE)
+    {
+      /* Set stdin to be non-blocking. */
+      if ((flags = fcntl (STDIN_FILENO, F_GETFL, 0)) < 0)
+	flags = 0;
+      (void) fcntl (STDIN_FILENO, F_SETFL, flags | O_NONBLOCK);
+
+      cf_index = unix_cli_file_add (cm, "stdin", STDIN_FILENO);
+      cf = pool_elt_at_index (cm->cli_file_pool, cf_index);
+      cm->stdin_cli_file_index = cf_index;
+
+      /* If stdin is a tty and we are using character mode, enable
+       * history on the CLI and set the tty line discipline accordingly. */
+      if (isatty (STDIN_FILENO) && um->cli_line_mode == 0)
+	{
+	  /* Capture terminal resize events */
+	  memset (&sa, 0, sizeof (sa));
+	  sa.sa_handler = unix_cli_resize_interrupt;
+	  if (sigaction (SIGWINCH, &sa, 0) < 0)
+	    clib_panic ("sigaction");
+
+	  /* Retrieve the current terminal size; on failure act as if the
+	   * terminal reported no size rather than read ws uninitialized */
+	  if (ioctl (STDIN_FILENO, TIOCGWINSZ, &ws) < 0)
+	    memset (&ws, 0, sizeof (ws));
+	  cf->width = ws.ws_col;
+	  cf->height = ws.ws_row;
+	  if (cf->width == 0 || cf->height == 0)
+	    {
+	      /*
+	       * We have a tty, but no size. Use defaults.
+	       * vpp "unix interactive" inside emacs + gdb ends up here.
+	       */
+	      cf->width = 80;
+	      cf->height = 24;
+	    }
+
+	  /* Setup the history */
+	  cf->history_limit = um->cli_history_limit;
+	  cf->has_history = cf->history_limit != 0;
+
+	  /* Setup the pager */
+	  cf->no_pager = um->cli_no_pager;
+
+	  /* This is an interactive session until we decide otherwise */
+	  cf->is_interactive = 1;
+
+	  /* We're going to be in char by char mode */
+	  cf->line_mode = 0;
+
+	  /* Save the original tty state so we can restore it later */
+	  if (tcgetattr (STDIN_FILENO, &um->tio_stdin) == 0)
+	    {
+	      um->tio_isset = 1;
+	      /* Tweak the tty settings, starting from the saved state */
+	      tio = um->tio_stdin;
+	      /* echo off, canonical mode off, ext'd input processing off */
+	      tio.c_lflag &= ~(ECHO | ICANON | IEXTEN);
+	      tio.c_cc[VMIN] = 1;	/* 1 byte at a time */
+	      tio.c_cc[VTIME] = 0;	/* no timer */
+	      tcsetattr (STDIN_FILENO, TCSAFLUSH, &tio);
+	    }
+
+	  /* See if we can do ANSI/VT100 output */
+	  term = (u8 *) getenv ("TERM");
+	  if (term != NULL)
+	    {
+	      int len = strlen ((char *) term);
+	      cf->ansi_capable = unix_cli_terminal_type_ansi (term, len);
+	      if (unix_cli_terminal_type_noninteractive (term, len))
+		unix_cli_set_session_noninteractive (cf);
+	    }
+	}
+      else
+	{
+	  /* No tty, so make sure the session doesn't have tty-like features */
+	  unix_cli_set_session_noninteractive (cf);
+	}
+
+      /* Send banner and initial prompt */
+      unix_cli_file_welcome (cm, cf);
+    }
+
+  /* If we have socket config, LISTEN, otherwise, don't */
+  clib_socket_t *s = &um->cli_listen_socket;
+  if (s->config && s->config[0] != 0)
+    {
+      /* CLI listen. */
+      clib_file_t template = { 0 };
+
+      /* mkdir of the socket file's directory, only under /run  */
+      if (strncmp (s->config, "/run", 4) == 0)
+	{
+	  u8 *tmp = format (0, "%s", s->config);
+	  int i = vec_len (tmp);
+	  while (i && tmp[--i] != '/')
+	    ;
+	  tmp[i] = 0;		/* cut the path at its last '/' */
+	  if (i)
+	    vlib_unix_recursive_mkdir ((char *) tmp);
+	  vec_free (tmp);
+	}
+
+      s->flags = CLIB_SOCKET_F_IS_SERVER |	/* listen, don't connect */
+	CLIB_SOCKET_F_ALLOW_GROUP_WRITE;	/* PF_LOCAL socket only */
+      error = clib_socket_init (s);
+      if (error)
+	return error;
+
+      template.read_function = unix_cli_listen_read_ready;
+      template.file_descriptor = s->fd;
+      clib_file_add (fm, &template);
+    }
+
+  /* Set CLI prompt. */
+  if (!cm->cli_prompt)
+    cm->cli_prompt = format (0, "VLIB: ");
+
+  return 0;
+}
+
+/*?
+ * This module has no configurable parameters.
+?*/
+VLIB_CONFIG_FUNCTION (unix_cli_config, "unix-cli");
+
+/** Main loop exit function: puts the system terminal back into the
+ * state that was saved earlier, if any was saved.
+ */
+static clib_error_t *
+unix_cli_exit (vlib_main_t * vm)
+{
+  /* Nothing to restore unless stdin is a tty whose state we saved */
+  if (!isatty (STDIN_FILENO) || !unix_main.tio_isset)
+    return 0;
+
+  tcsetattr (STDIN_FILENO, TCSAFLUSH, &unix_main.tio_stdin);
+
+  return 0;
+}
+
+VLIB_MAIN_LOOP_EXIT_FUNCTION (unix_cli_exit);
+
+/** Set the CLI prompt.
+ * @param prompt The C string to set the prompt to; may be empty. A space
+ *               is appended unless the string already ends with one.
+ * @note This setting is global: all current and future CLI sessions. */
+void
+vlib_unix_cli_set_prompt (char *prompt)
+{
+  unix_cli_main_t *cm = &unix_cli_main;
+  size_t len = strlen (prompt);
+  /* len check: an empty prompt has no last character to inspect */
+  char *fmt = (len > 0 && prompt[len - 1] == ' ') ? "%s" : "%s ";
+  vec_free (cm->cli_prompt);
+  cm->cli_prompt = format (0, fmt, prompt);
+}
+
+/** CLI command handler: ends the terminal session that issued it.
+ * @note If that session is the one on stdin then VPP is
+ *       shut down as well.
+ */
+static clib_error_t *
+unix_cli_quit (vlib_main_t * vm,
+	       unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  unix_cli_main_t *cm = &unix_cli_main;
+  unix_cli_file_t *cf;
+  cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index);
+
+  /* Cosmetic: keep the final prompt from being shown before we die */
+  cf->started = 1;
+  cf->is_interactive = 0;
+
+  /* Signal a quit event, for this session, to the current process */
+  vlib_process_signal_event (vm, vlib_current_process (vm),
+			     UNIX_CLI_PROCESS_EVENT_QUIT,
+			     cm->current_input_file_index);
+  return 0;
+}
+
+/*?
+ * Terminates the current CLI session.
+ *
+ * If VPP is running in @em interactive mode and this is the console session
+ * (that is, the session on @c stdin) then this will also terminate VPP.
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (unix_cli_quit_command, static) = {
+  .path = "quit",
+  .short_help = "Exit CLI",
+  .function = unix_cli_quit,
+};
+/* *INDENT-ON* */
+
+/** CLI command to execute a VPP command script.
+ * The argument names the script, which must be a regular file; its
+ * contents are passed to the CLI input parser.
+ * @return 0, or an error if the name is missing or the file unusable. */
+static clib_error_t *
+unix_cli_exec (vlib_main_t * vm,
+	       unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  char *file_name = 0;
+  int fd = -1;
+  unformat_input_t sub_input;
+  clib_error_t *error = 0;
+
+  if (!unformat (input, "%s", &file_name))
+    {
+      error = clib_error_return (0, "expecting file name, got `%U'",
+				 format_unformat_error, input);
+      goto done;
+    }
+
+  fd = open (file_name, O_RDONLY);
+  if (fd < 0)
+    {
+      error = clib_error_return_unix (0, "failed to open `%s'", file_name);
+      goto done;
+    }
+
+  /* Make sure its a regular file. */
+  {
+    struct stat s;
+
+    if (fstat (fd, &s) < 0)
+      {
+	error = clib_error_return_unix (0, "failed to stat `%s'", file_name);
+	goto done;
+      }
+
+    if (!(S_ISREG (s.st_mode) || S_ISLNK (s.st_mode)))
+      {
+	error = clib_error_return (0, "not a regular file `%s'", file_name);
+	goto done;
+      }
+  }
+
+  unformat_init_unix_file (&sub_input, fd);
+
+  vlib_cli_input (vm, &sub_input, 0, 0);
+  unformat_free (&sub_input);
+
+done:
+  /* fd is -1 until open() succeeds; 0 is a valid descriptor too */
+  if (fd >= 0)
+    close (fd);
+  vec_free (file_name);
+
+  return error;
+}
+
+/*?
+ * Executes a sequence of CLI commands which are read from a file.
+ *
+ * If a command is unrecognised or otherwise invalid then the usual CLI
+ * feedback will be generated, however execution of subsequent commands
+ * from the file will continue.
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cli_exec, static) = {
+  .path = "exec",
+  .short_help = "Execute commands from file",
+  .function = unix_cli_exec,
+  .is_mp_safe = 1,
+};
+/* *INDENT-ON* */
+
+/** CLI command handler: shows the recorded Unix system call errors.
+ * An optional integer argument limits the number of errors shown.
+ * @return 0, or an error if the argument given is not an integer. */
+static clib_error_t *
+unix_show_errors (vlib_main_t * vm,
+		  unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  unix_main_t *um = &unix_main;
+  unix_error_history_t *unix_errors = 0;
+  int n_errors_to_show = 1 << 30;
+  int i;
+
+  /* Anything on the command line must be the number of errors to show */
+  if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT
+      && !unformat (input, "%d", &n_errors_to_show))
+    return clib_error_return (0,
+			      "expecting integer number of errors to show, got `%U'",
+			      format_unformat_error, input);
+
+  n_errors_to_show =
+    clib_min (ARRAY_LEN (um->error_history), n_errors_to_show);
+
+  /* Start at the slot before error_history_index, wrapping at slot 0 */
+  if (um->error_history_index > 0)
+    i = um->error_history_index - 1;
+  else
+    i = ARRAY_LEN (um->error_history) - 1;
+
+  /* Walk backwards around the history, copying entries until enough
+   * have been gathered or a slot with no error recorded is found */
+  for (; n_errors_to_show > 0; n_errors_to_show--)
+    {
+      unix_error_history_t *eh = &um->error_history[i];
+
+      if (!eh->error)
+	break;
+
+      vec_add1 (unix_errors, *eh);
+      if (i == 0)
+	i = ARRAY_LEN (um->error_history) - 1;
+      else
+	i--;
+    }
+
+  if (vec_len (unix_errors) > 0)
+    {
+      vlib_cli_output (vm, "%Ld total errors seen", um->n_total_errors);
+
+      /* Print in the reverse of the order in which they were copied */
+      i = vec_len (unix_errors);
+      while (--i >= 0)
+	{
+	  unix_error_history_t *eh = vec_elt_at_index (unix_errors, i);
+	  vlib_cli_output (vm, "%U: %U",
+			   format_time_interval, "h:m:s:u", eh->time,
+			   format_clib_error, eh->error);
+	}
+      vlib_cli_output (vm, "%U: time now",
+		       format_time_interval, "h:m:s:u", vlib_time_now (vm));
+    }
+  else
+    vlib_cli_output (vm, "no Unix errors so far");
+
+  vec_free (unix_errors);
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cli_unix_show_errors, static) = {
+  .path = "show unix-errors",
+  .short_help = "Show Unix system call error history",
+  .function = unix_show_errors,
+};
+/* *INDENT-ON* */
+
+/** CLI command handler: prints the command history of the session that
+ * issued the command, each entry prefixed with its command number. */
+static clib_error_t *
+unix_cli_show_history (vlib_main_t * vm,
+		       unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  unix_cli_main_t *cm = &unix_cli_main;
+  unix_cli_file_t *cf =
+    pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index);
+  int first, k;
+
+  if (!cf->is_interactive)
+    return clib_error_return (0, "invalid for non-interactive sessions");
+
+  if (!cf->has_history || !cf->history_limit)
+    {
+      vlib_cli_output (vm, "History not enabled.\n");
+      return 0;
+    }
+
+  /* Number to print against the first entry held in the history */
+  first = 1 + cf->command_number - vec_len (cf->command_history);
+  for (k = 0; k < vec_len (cf->command_history); k++)
+    vlib_cli_output (vm, "%d  %v\n", first + k, cf->command_history[k]);
+
+  return 0;
+}
+
+/*?
+ * Displays the command history for the current session, if any.
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cli_unix_cli_show_history, static) = {
+  .path = "history",
+  .short_help = "Show current session command history",
+  .function = unix_cli_show_history,
+};
+/* *INDENT-ON* */
+
+/** CLI command to show the terminal settings of the current session. */
+static clib_error_t *
+unix_cli_show_terminal (vlib_main_t * vm,
+			unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  unix_main_t *um = &unix_main;
+  unix_cli_main_t *cm = &unix_cli_main;
+  unix_cli_file_t *cf;
+  vlib_node_t *n;
+
+  cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index);
+  n = vlib_get_node (vm, cf->process_node_index);	/* its name is shown */
+
+  vlib_cli_output (vm, "Terminal name:   %v\n", n->name);
+  vlib_cli_output (vm, "Terminal mode:   %s\n", cf->line_mode ?
+		   "line-by-line" : "char-by-char");
+  vlib_cli_output (vm, "Terminal width:  %d\n", cf->width);
+  vlib_cli_output (vm, "Terminal height: %d\n", cf->height);
+  vlib_cli_output (vm, "ANSI capable:    %s\n",
+		   cf->ansi_capable ? "yes" : "no");
+  vlib_cli_output (vm, "Interactive:     %s\n",
+		   cf->is_interactive ? "yes" : "no");
+  vlib_cli_output (vm, "History enabled: %s%s\n",
+		   cf->has_history ? "yes" : "no", !cf->has_history
+		   || cf->history_limit ? "" :
+		   " (disabled by history limit)");	/* has_history, limit 0 */
+  if (cf->has_history)
+    vlib_cli_output (vm, "History limit:   %d\n", cf->history_limit);
+  vlib_cli_output (vm, "Pager enabled:   %s%s%s\n",
+		   cf->no_pager ? "no" : "yes",
+		   cf->no_pager
+		   || cf->height ? "" : " (disabled by terminal height)",
+		   cf->no_pager
+		   || um->cli_pager_buffer_limit ? "" :
+		   " (disabled by buffer limit)");	/* pager on, limit 0 */
+  if (!cf->no_pager)
+    vlib_cli_output (vm, "Pager limit:     %d\n", um->cli_pager_buffer_limit);
+  vlib_cli_output (vm, "CRLF mode:       %s\n",
+		   cf->crlf_mode ? "CR+LF" : "LF");
+
+  return 0;
+}
+
+/*?
+ * Displays various information about the state of the current terminal
+ * session.
+ *
+ * @cliexpar
+ * @cliexstart{show terminal}
+ * Terminal name:   unix-cli-stdin
+ * Terminal mode:   char-by-char
+ * Terminal width:  123
+ * Terminal height: 48
+ * ANSI capable:    yes
+ * Interactive:     yes
+ * History enabled: yes
+ * History limit:   50
+ * Pager enabled:   yes
+ * Pager limit:     100000
+ * CRLF mode:       LF
+ * @cliexend
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cli_unix_cli_show_terminal, static) = {
+  .path = "show terminal",
+  .short_help = "Show current session terminal settings",
+  .function = unix_cli_show_terminal,
+};
+/* *INDENT-ON* */
+
+/**
+ * CLI command to display a list of CLI sessions.
+ *
+ * Prints a header row followed by one row per entry in the CLI file pool:
+ * process node index, file descriptor, node name and five flag letters.
+ * Each flag letter is upper-case when the flag is set and lower-case when
+ * it is clear. Always returns 0.
+ */
+static clib_error_t *
+unix_cli_show_cli_sessions (vlib_main_t * vm,
+			    unformat_input_t * input,
+			    vlib_cli_command_t * cmd)
+{
+  unix_cli_main_t *cli_main = &unix_cli_main;
+  clib_file_main_t *files = &file_main;
+  unix_cli_file_t *session;
+
+  vlib_cli_output (vm, "%-5s %-5s %-20s %s", "PNI", "FD", "Name", "Flags");
+
+/* Upper-case the letter when the flag is set, lower-case it otherwise. */
+#define flag_letter(is_set, letter) ( (is_set) ? toupper((letter)) : tolower((letter)) )
+  /* *INDENT-OFF* */
+  pool_foreach (session, cli_main->cli_file_pool, ({
+    clib_file_t *file =
+      pool_elt_at_index (files->file_pool, session->clib_file_index);
+    vlib_node_t *node = vlib_get_node (vm, session->process_node_index);
+
+    vlib_cli_output (vm,
+		     "%-5d %-5d %-20v %c%c%c%c%c\n",
+		     session->process_node_index,
+		     file->file_descriptor,
+		     node->name,
+		     flag_letter (session->is_interactive, 'i'),
+		     flag_letter (session->is_socket, 's'),
+		     flag_letter (session->line_mode, 'l'),
+		     flag_letter (session->has_epipe, 'p'),
+		     flag_letter (session->ansi_capable, 'a'));
+  }));
+  /* *INDENT-ON* */
+#undef flag_letter
+
+  return 0;
+}
+
+/*?
+ * Displays a summary of all the current CLI sessions.
+ *
+ * Typically used to diagnose connection issues with the CLI
+ * socket.
+ *
+ * @cliexpar
+ * @cliexstart{show cli-sessions}
+ * PNI   FD    Name                 Flags
+ * 343   0     unix-cli-stdin       IslpA
+ * 344   7     unix-cli-local:20    ISlpA
+ * 346   8     unix-cli-local:21    iSLpa
+ * @cliexend
+ *
+ * In this example we have the debug console of the running process
+ * on stdin/out, we have an interactive socket session and we also
+ * have a non-interactive socket session.
+ *
+ * Fields:
+ *
+ * - @em PNI: Process node index.
+ * - @em FD: Unix file descriptor.
+ * - @em Name: Name of the session.
+ * - @em Flags: Various flags that describe the state of the session.
+ *
+ * @em Flags are always printed in the same order; each letter is shown in
+ * upper-case when the flag is set and in lower-case when it is clear:
+ *
+ * - @em I Interactive session.
+ * - @em i Non-interactive session.
+ * - @em S Connected by socket.
+ * - @em s Not a socket, likely stdin.
+ * - @em L Line-by-line mode.
+ * - @em l Char-by-char mode.
+ * - @em P EPIPE detected on connection; it will close soon.
+ * - @em p No EPIPE detected on the connection.
+ * - @em A ANSI-capable terminal.
+ * - @em a Terminal is not marked as ANSI-capable.
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cli_unix_cli_show_cli_sessions, static) = {
+  .path = "show cli-sessions",
+  .short_help = "Show current CLI sessions",
+  .function = unix_cli_show_cli_sessions,
+};
+/* *INDENT-ON* */
+
+/**
+ * CLI command to set terminal pager settings.
+ *
+ * Accepts any sequence of "on", "off" and "limit <n>". "on" and "off"
+ * change the current session only; "limit" writes the pager buffer limit
+ * held in unix_main, which is shared by every session.
+ *
+ * @return 0 on success (including when no arguments were given), or an
+ *         error for a non-interactive session or an unknown argument.
+ */
+static clib_error_t *
+unix_cli_set_terminal_pager (vlib_main_t * vm,
+			     unformat_input_t * input,
+			     vlib_cli_command_t * cmd)
+{
+  unix_main_t *um = &unix_main;
+  unix_cli_main_t *cli_main = &unix_cli_main;
+  unformat_input_t line_storage;
+  unformat_input_t *line = &line_storage;
+  unix_cli_file_t *session;
+  clib_error_t *result = 0;
+
+  session = pool_elt_at_index (cli_main->cli_file_pool,
+			       cli_main->current_input_file_index);
+
+  if (!session->is_interactive)
+    return clib_error_return (0, "invalid for non-interactive sessions");
+
+  /* No arguments at all: nothing to change and nothing to free. */
+  if (!unformat_user (input, unformat_line_input, line))
+    return 0;
+
+  /* Stop at the first argument that cannot be parsed. */
+  while (result == 0
+	 && unformat_check_input (line) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line, "on"))
+	{
+	  session->no_pager = 0;
+	}
+      else if (unformat (line, "off"))
+	{
+	  session->no_pager = 1;
+	}
+      else if (unformat (line, "limit %u", &um->cli_pager_buffer_limit))
+	{
+	  vlib_cli_output (vm,
+			   "Pager limit set to %u lines; note, this is global.\n",
+			   um->cli_pager_buffer_limit);
+	}
+      else
+	{
+	  result = clib_error_return (0, "unknown parameter: `%U`",
+				      format_unformat_error, line);
+	}
+    }
+
+  unformat_free (line);
+
+  return result;
+}
+
+/*?
+ * Enables or disables the terminal pager for this session. Generally
+ * this defaults to enabled.
+ *
+ * Additionally allows the pager buffer size to be set; though note that
+ * this value is set globally and not per session.
+ *
+ * The command is rejected when issued from a non-interactive session.
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cli_unix_cli_set_terminal_pager, static) = {
+  .path = "set terminal pager",
+  .short_help = "set terminal pager [on|off] [limit <lines>]",
+  .function = unix_cli_set_terminal_pager,
+};
+/* *INDENT-ON* */
+
+/**
+ * CLI command to set terminal history settings.
+ *
+ * Accepts any sequence of "on", "off" and "limit <n>" for the current
+ * session. After each accepted argument the stored history is trimmed so
+ * that it holds no more than the effective limit (0 when history is off),
+ * dropping the oldest entries first.
+ *
+ * @return 0 on success (including when no arguments were given), or an
+ *         error for a non-interactive session or an unknown argument.
+ */
+static clib_error_t *
+unix_cli_set_terminal_history (vlib_main_t * vm,
+			       unformat_input_t * input,
+			       vlib_cli_command_t * cmd)
+{
+  unix_cli_main_t *cm = &unix_cli_main;
+  unix_cli_file_t *cf;
+  unformat_input_t _line_input, *line_input = &_line_input;
+  u32 limit;
+  clib_error_t *error = 0;
+
+  cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index);
+
+  if (!cf->is_interactive)
+    return clib_error_return (0, "invalid for non-interactive sessions");
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (line_input, "on"))
+	cf->has_history = 1;
+      else if (unformat (line_input, "off"))
+	cf->has_history = 0;
+      else if (unformat (line_input, "limit %u", &cf->history_limit))
+	;
+      else
+	{
+	  error = clib_error_return (0, "unknown parameter: `%U`",
+				     format_unformat_error, line_input);
+	  goto done;
+	}
+
+      /* If we reduced history size, or turned it off, purge the history */
+      limit = cf->has_history ? cf->history_limit : 0;
+
+      /*
+       * Compute the number of surplus entries once and remove them in a
+       * single pass. Looping "while length >= limit" cannot terminate
+       * safely when limit is 0, because an emptied vector still has a
+       * non-NULL pointer and a length of 0.
+       */
+      if (vec_len (cf->command_history) > limit)
+	{
+	  u32 n_surplus = vec_len (cf->command_history) - limit;
+	  u32 i;
+
+	  /* The oldest entries live at the front of the vector. */
+	  for (i = 0; i < n_surplus; i++)
+	    vec_free (cf->command_history[i]);
+
+	  vec_delete (cf->command_history, n_surplus, 0);
+	}
+    }
+
+done:
+  unformat_free (line_input);
+
+  return error;
+}
+
+/*?
+ * Enables or disables the command history function of the current
+ * terminal. Generally this defaults to enabled.
+ *
+ * This command also allows the maximum size of the history buffer for
+ * this session to be altered.
+ *
+ * The command is rejected when issued from a non-interactive session.
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cli_unix_cli_set_terminal_history, static) = {
+  .path = "set terminal history",
+  .short_help = "set terminal history [on|off] [limit <lines>]",
+  .function = unix_cli_set_terminal_history,
+};
+/* *INDENT-ON* */
+
+/**
+ * CLI command to set terminal ANSI settings.
+ *
+ * Expects "on" or "off" and records the choice in the current session's
+ * ansi_capable flag.
+ *
+ * @return 0 on success, or an error for a non-interactive session or any
+ *         other argument.
+ */
+static clib_error_t *
+unix_cli_set_terminal_ansi (vlib_main_t * vm,
+			    unformat_input_t * input,
+			    vlib_cli_command_t * cmd)
+{
+  unix_cli_main_t *cli_main = &unix_cli_main;
+  unix_cli_file_t *session;
+  int use_ansi;
+
+  session = pool_elt_at_index (cli_main->cli_file_pool,
+			       cli_main->current_input_file_index);
+
+  if (!session->is_interactive)
+    return clib_error_return (0, "invalid for non-interactive sessions");
+
+  /* Decide first, then store; anything but on/off is an error. */
+  if (unformat (input, "on"))
+    use_ansi = 1;
+  else if (unformat (input, "off"))
+    use_ansi = 0;
+  else
+    return clib_error_return (0, "unknown parameter: `%U`",
+			      format_unformat_error, input);
+
+  session->ansi_capable = use_ansi;
+
+  return 0;
+}
+
+/*?
+ * Enables or disables the use of ANSI control sequences by this terminal.
+ * The default will vary based on terminal detection at the start of the
+ * session.
+ *
+ * ANSI control sequences are used in a small number of places to provide,
+ * for example, color text output and to control the cursor in the pager.
+ *
+ * The command is rejected when issued from a non-interactive session.
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cli_unix_cli_set_terminal_ansi, static) = {
+  .path = "set terminal ansi",
+  .short_help = "set terminal ansi [on|off]",
+  .function = unix_cli_set_terminal_ansi,
+};
+/* *INDENT-ON* */
+
+/**
+ * Init hook for the Unix CLI. It performs no work and always reports
+ * success.
+ */
+static clib_error_t *
+unix_cli_init (vlib_main_t * vm)
+{
+  clib_error_t *no_error = 0;
+
+  return no_error;
+}
+
+VLIB_INIT_FUNCTION (unix_cli_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/unix/dir.dox b/src/vlib/unix/dir.dox
new file mode 100644
index 00000000..1380fa56
--- /dev/null
+++ b/src/vlib/unix/dir.dox
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2016 Comcast Cable Communications Management, LLC.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Doxygen directory documentation */
+
+/**
+@dir
+@brief VLIB Unix interface
+
+VLIB application library Unix interface layer.
+
+*/
+/*? %%clicmd:group_label Unix Interface %% ?*/
+/*? %%syscfg:group_label Unix Interface %% ?*/
+
diff --git a/src/vlib/unix/input.c b/src/vlib/unix/input.c
new file mode 100644
index 00000000..ecd31791
--- /dev/null
+++ b/src/vlib/unix/input.c
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * input.c: Unix file input
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <signal.h>
+#include <vppinfra/tw_timer_1t_3w_1024sl_ov.h>
+
+/* FIXME autoconf */
+#define HAVE_LINUX_EPOLL
+
+#ifdef HAVE_LINUX_EPOLL
+
+#include <sys/epoll.h>
+
+typedef struct			/* state of the epoll-based file poller in this file */
+{
+  int epoll_fd;			/* descriptor returned by epoll_create () */
+  struct epoll_event *epoll_events;	/* vector filled in by epoll_pwait ()/epoll_wait () */
+
+  /* Statistics, updated by linux_epoll_input () after each wait that did not fail. */
+  u64 epoll_files_ready;	/* running sum of the ready counts returned */
+  u64 epoll_waits;		/* number of such waits */
+} linux_epoll_main_t;
+
+static linux_epoll_main_t linux_epoll_main;
+
+/**
+ * Apply an add/modify/delete request for a clib file to the epoll set.
+ *
+ * The file is always watched for input; output readiness and
+ * edge-triggering are added according to the file's flags. The event's
+ * user data carries the file's index in the file pool. An unknown update
+ * type is logged and ignored; an epoll_ctl () failure is logged.
+ */
+static void
+linux_epoll_file_update (clib_file_t * f, unix_file_update_type_t update_type)
+{
+  clib_file_main_t *fm = &file_main;
+  linux_epoll_main_t *em = &linux_epoll_main;
+  struct epoll_event event;
+  int epoll_op;
+
+  /* Translate the request first so that bad input bails out early. */
+  if (update_type == UNIX_FILE_UPDATE_ADD)
+    epoll_op = EPOLL_CTL_ADD;
+  else if (update_type == UNIX_FILE_UPDATE_MODIFY)
+    epoll_op = EPOLL_CTL_MOD;
+  else if (update_type == UNIX_FILE_UPDATE_DELETE)
+    epoll_op = EPOLL_CTL_DEL;
+  else
+    {
+      clib_warning ("unknown update_type %d", update_type);
+      return;
+    }
+
+  memset (&event, 0, sizeof (event));
+  event.data.u32 = f - fm->file_pool;
+  event.events = EPOLLIN;
+
+  if (f->flags & UNIX_FILE_EVENT_EDGE_TRIGGERED)
+    event.events |= EPOLLET;
+  if (f->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE)
+    event.events |= EPOLLOUT;
+
+  if (epoll_ctl (em->epoll_fd, epoll_op, f->file_descriptor, &event) < 0)
+    clib_unix_warning ("epoll_ctl");
+}
+
+/**
+ * Pre-input node function: wait on the epoll set, then run the callbacks
+ * of every file that became ready.
+ *
+ * When the last main loop handled fewer than two vectors, the API queue
+ * is empty and no input node is polling, the wait may sleep for up to
+ * 10 ms (bounded by the next timer-wheel expiry). Otherwise the wait does
+ * not sleep and the node asks to be skipped for 1024 main loops.
+ *
+ * Errors returned by file callbacks are handed to unix_save_error ().
+ * A fatal wait error panics; a non-fatal one (e.g. EINTR) just returns.
+ *
+ * @return always 0.
+ */
+static uword
+linux_epoll_input (vlib_main_t * vm,
+		   vlib_node_runtime_t * node, vlib_frame_t * frame)
+{
+  /* Zero-initialised: handed to epoll_pwait () as the signal mask. */
+  static sigset_t unblock_all_signals;
+  unix_main_t *um = &unix_main;
+  clib_file_main_t *fm = &file_main;
+  linux_epoll_main_t *em = &linux_epoll_main;
+  vlib_node_main_t *nm = &vm->node_main;
+  f64 vector_rate = vlib_last_vectors_per_main_loop (vm);
+  int max_timeout_ms = 10;
+  int timeout_ms = 0;
+  int n_fds_ready;
+  int ev_index;
+
+  if (vector_rate < 2 && vm->api_queue_nonempty == 0
+      && nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] == 0)
+    {
+      /* Idle: work out how long we may sleep. */
+      u32 ticks_until_expiration = TW (tw_timer_first_expires_in_ticks)
+	((TWT (tw_timer_wheel) *) nm->timing_wheel);
+
+      if (ticks_until_expiration == TW_SLOTS_PER_RING)
+	{
+	  /* Nothing on the fast wheel: take the longest sleep. */
+	  timeout_ms = max_timeout_ms;
+	}
+      else
+	{
+	  f64 timeout = (f64) ticks_until_expiration *1e-5;
+
+	  if (timeout < 1e-3)
+	    timeout_ms = 0;
+	  else
+	    {
+	      /* Clamp to the 1..10 ms range. */
+	      timeout_ms = timeout * 1e3;
+	      timeout_ms = clib_max (1, timeout_ms);
+	      timeout_ms = clib_min (max_timeout_ms, timeout_ms);
+	    }
+	}
+      node->input_main_loops_per_call = 0;
+    }
+  else
+    {
+      /* Busy: stay away for a respectable number of dispatch cycles. */
+      node->input_main_loops_per_call = 1024;
+    }
+
+  /* Any signal is allowed to cut the sleep short. */
+  n_fds_ready = epoll_pwait (em->epoll_fd,
+			     em->epoll_events,
+			     vec_len (em->epoll_events),
+			     timeout_ms, &unblock_all_signals);
+
+  /* Kernels too old to provide epoll_pwait: use plain epoll_wait. */
+  if (n_fds_ready < 0 && errno == ENOSYS)
+    n_fds_ready = epoll_wait (em->epoll_fd,
+			      em->epoll_events,
+			      vec_len (em->epoll_events), timeout_ms);
+
+  if (n_fds_ready < 0)
+    {
+      if (unix_error_is_fatal (errno))
+	vlib_panic_with_error (vm, clib_error_return_unix (0, "epoll_wait"));
+
+      /* non fatal error (e.g. EINTR). */
+      return 0;
+    }
+
+  em->epoll_waits += 1;
+  em->epoll_files_ready += n_fds_ready;
+
+  for (ev_index = 0; ev_index < n_fds_ready; ev_index++)
+    {
+      struct epoll_event *ev = &em->epoll_events[ev_index];
+      u32 file_index = ev->data.u32;
+      clib_file_t *f = pool_elt_at_index (fm->file_pool, file_index);
+      clib_error_t *errors[4];
+      int n_errors = 0;
+      int k;
+
+      if (PREDICT_FALSE (ev->events & EPOLLERR))
+	{
+	  /* Error condition: delegate if possible, else drop the fd. */
+	  if (f->error_function)
+	    {
+	      errors[n_errors] = f->error_function (f);
+	      n_errors += errors[n_errors] != 0;
+	    }
+	  else
+	    close (f->file_descriptor);
+	}
+      else
+	{
+	  if (ev->events & EPOLLIN)
+	    {
+	      errors[n_errors] = f->read_function (f);
+	      n_errors += errors[n_errors] != 0;
+	    }
+	  if (ev->events & EPOLLOUT)
+	    {
+	      errors[n_errors] = f->write_function (f);
+	      n_errors += errors[n_errors] != 0;
+	    }
+	}
+
+      ASSERT (n_errors < ARRAY_LEN (errors));
+      for (k = 0; k < n_errors; k++)
+	unix_save_error (um, errors[k]);
+    }
+
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (linux_epoll_input_node,static) = {
+  .function = linux_epoll_input,
+  .type = VLIB_NODE_TYPE_PRE_INPUT,
+  .name = "unix-epoll-input",
+};
+/* *INDENT-ON* */
+
+/**
+ * Set up the epoll poller: allocate the event vector, create the epoll
+ * descriptor and install linux_epoll_file_update () as the file-update
+ * hook of file_main.
+ *
+ * @return 0 on success, or a unix error if epoll_create () fails (the
+ *         hook is not installed in that case).
+ */
+clib_error_t *
+linux_epoll_input_init (vlib_main_t * vm)
+{
+  linux_epoll_main_t *epoll_main = &linux_epoll_main;
+
+  /* Room for VLIB_FRAME_SIZE events per wait. */
+  vec_resize (epoll_main->epoll_events, VLIB_FRAME_SIZE);
+
+  epoll_main->epoll_fd = epoll_create (vec_len (epoll_main->epoll_events));
+  if (epoll_main->epoll_fd < 0)
+    return clib_error_return_unix (0, "epoll_create");
+
+  file_main.file_update = linux_epoll_file_update;
+
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (linux_epoll_input_init);
+
+#endif /* HAVE_LINUX_EPOLL */
+
+/**
+ * Init hook for Unix file input: runs linux_epoll_input_init through the
+ * init-function mechanism and passes its result back.
+ */
+static clib_error_t *
+unix_input_init (vlib_main_t * vm)
+{
+  clib_error_t *error;
+
+  error = vlib_call_init_function (vm, linux_epoll_input_init);
+
+  return error;
+}
+
+VLIB_INIT_FUNCTION (unix_input_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c
new file mode 100644
index 00000000..f286c870
--- /dev/null
+++ b/src/vlib/unix/main.c
@@ -0,0 +1,642 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * main.c: Unix main routine
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <vlib/unix/plugin.h>
+
+#include <signal.h>
+#include <sys/ucontext.h>
+#include <syslog.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <unistd.h>
+
+/** Default CLI pager limit if not configured in startup.conf */
+#define UNIX_CLI_DEFAULT_PAGER_LIMIT 100000
+
+/** Default CLI history depth if not configured in startup.conf */
+#define UNIX_CLI_DEFAULT_HISTORY 50
+
+char *vlib_default_runtime_dir __attribute__ ((weak));
+char *vlib_default_runtime_dir = "vlib";
+
+unix_main_t unix_main;
+clib_file_main_t file_main;
+
+/**
+ * Init hook for the Unix layer: records the vlib main in unix_main and
+ * then runs unix_input_init through the init-function mechanism.
+ *
+ * @return whatever unix_input_init reports.
+ */
+static clib_error_t *
+unix_main_init (vlib_main_t * vm)
+{
+  clib_error_t *error;
+
+  unix_main.vlib_main = vm;
+  error = vlib_call_init_function (vm, unix_input_init);
+
+  return error;
+}
+
+VLIB_INIT_FUNCTION (unix_main_init);
+
+/**
+ * Handler installed for the signals caught by setup_signal_handlers ().
+ *
+ * Builds a message naming the signal and the PC (plus the faulting
+ * address for SIGSEGV) and then either:
+ *  - SIGTERM with the main-loop exit point set: logs to syslog and asks
+ *    the main loop to exit now;
+ *  - SIGTERM otherwise, SIGQUIT, SIGINT, SIGILL, SIGBUS, SIGSEGV, SIGHUP,
+ *    SIGFPE: logs the message to syslog and calls os_exit (1);
+ *  - any other signal: emits the message as a warning and returns.
+ *
+ * @param signum signal number.
+ * @param si     signal information; only si_addr is read, for SIGSEGV.
+ * @param uc     interrupted context, used to format the PC.
+ */
+static void
+unix_signal_handler (int signum, siginfo_t * si, ucontext_t * uc)
+{
+  uword fatal = 0;
+  u8 *msg = 0;
+
+  msg = format (msg, "received signal %U, PC %U",
+		format_signal, signum, format_ucontext_pc, uc);
+
+  if (signum == SIGSEGV)
+    msg = format (msg, ", faulting address %p", si->si_addr);
+
+  switch (signum)
+    {
+      /* these (caught) signals cause the application to exit */
+    case SIGTERM:
+      if (unix_main.vlib_main->main_loop_exit_set)
+	{
+	  syslog (LOG_ERR | LOG_DAEMON, "received SIGTERM, exiting...");
+	  unix_main.vlib_main->main_loop_exit_now = 1;
+	  /* Graceful path: the main loop will do the exiting. */
+	  break;
+	}
+      /* No exit point yet: treat SIGTERM like the fatal signals below. */
+      /* fall through */
+    case SIGQUIT:
+    case SIGINT:
+    case SIGILL:
+    case SIGBUS:
+    case SIGSEGV:
+    case SIGHUP:
+    case SIGFPE:
+      fatal = 1;
+      break;
+
+      /* by default, print a message and continue */
+    default:
+      fatal = 0;
+      break;
+    }
+
+  /* Null terminate. */
+  vec_add1 (msg, 0);
+
+  if (fatal)
+    {
+      syslog (LOG_ERR | LOG_DAEMON, "%s", msg);
+      os_exit (1);
+    }
+  else
+    clib_warning ("%s", msg);
+
+  vec_free (msg);
+}
+
+/**
+ * Install signal dispositions for signals 1 through 31.
+ *
+ * SIGABRT, SIGKILL, SIGSTOP, SIGUSR1 and SIGUSR2 are left untouched.
+ * SIGPIPE and SIGCHLD are ignored. Every other signal is routed to
+ * unix_signal_handler ().
+ *
+ * @param um unused.
+ * @return 0 on success, or a unix error naming the first signal for
+ *         which sigaction () failed.
+ */
+static clib_error_t *
+setup_signal_handlers (unix_main_t * um)
+{
+  struct sigaction action;
+  uword signo;
+
+  for (signo = 1; signo < 32; signo++)
+    {
+      /* these signals take the default action */
+      if (signo == SIGABRT || signo == SIGKILL || signo == SIGSTOP
+	  || signo == SIGUSR1 || signo == SIGUSR2)
+	continue;
+
+      memset (&action, 0, sizeof (action));
+      action.sa_flags = SA_SIGINFO;
+
+      if (signo == SIGPIPE || signo == SIGCHLD)
+	/* ignore SIGPIPE, SIGCHLD */
+	action.sa_sigaction = (void *) SIG_IGN;
+      else
+	/* catch and handle all other signals */
+	action.sa_sigaction = (void *) unix_signal_handler;
+
+      if (sigaction (signo, &action, 0) < 0)
+	return clib_error_return_unix (0, "sigaction %U", format_signal,
+				       signo);
+    }
+
+  return 0;
+}
+
+/**
+ * Error handler registered by unix_config () for non-interactive runs.
+ *
+ * When the interactive flag is set the message is written to file
+ * descriptor 2 as is. Otherwise the last byte of the message is
+ * temporarily replaced by a NUL so it can be passed to syslog (), and
+ * restored afterwards.
+ *
+ * @param arg     the unix_main_t registered with the handler.
+ * @param msg     message bytes.
+ * @param msg_len number of bytes in msg.
+ */
+static void
+unix_error_handler (void *arg, u8 * msg, int msg_len)
+{
+  unix_main_t *um = arg;
+  u8 *last_byte;
+  char saved;
+
+  if (um->flags & UNIX_FLAG_INTERACTIVE)
+    {
+      /* Echo to stderr when interactive. */
+      CLIB_UNUSED (int r) = write (2, msg, msg_len);
+      return;
+    }
+
+  last_byte = &msg[msg_len - 1];
+  saved = *last_byte;
+
+  /* Null Terminate. */
+  *last_byte = 0;
+
+  syslog (LOG_ERR | LOG_DAEMON, "%s", msg);
+
+  *last_byte = saved;
+}
+
+/**
+ * Send an error's message to syslog when not running interactively.
+ *
+ * Does nothing when the interactive flag is set, when error is NULL, or
+ * when the error carries no message text. Otherwise the last byte of the
+ * message is temporarily replaced by a NUL so it can be passed to
+ * syslog (), and restored afterwards.
+ *
+ * @param vm    unused.
+ * @param error error to report; may be NULL.
+ */
+void
+vlib_unix_error_report (vlib_main_t * vm, clib_error_t * error)
+{
+  unix_main_t *um = &unix_main;
+
+  if (um->flags & UNIX_FLAG_INTERACTIVE || error == 0)
+    return;
+
+  {
+    char save;
+    u8 *msg;
+    u32 msg_len;
+
+    msg = error->what;
+    msg_len = vec_len (msg);
+
+    /*
+     * Nothing to log, and msg_len - 1 would wrap around and index far
+     * outside the vector.
+     */
+    if (msg_len == 0)
+      return;
+
+    /* Null Terminate. */
+    save = msg[msg_len - 1];
+    msg[msg_len - 1] = 0;
+
+    syslog (LOG_ERR | LOG_DAEMON, "%s", msg);
+
+    msg[msg_len - 1] = save;
+  }
+}
+
+/**
+ * Process node that runs the startup configuration file through the CLI.
+ *
+ * Suspends for 2 seconds, then polls every 0.1 seconds until
+ * unix_config () has completed. If a startup config filename was
+ * configured, the file is read in 4096-byte chunks, copied to the log
+ * file when one is open, and executed via vlib_cli_input ().
+ *
+ * A file that cannot be opened, stat'ed or read, or that is not a regular
+ * file, is reported with a warning and skipped.
+ *
+ * @return always 0.
+ */
+static uword
+startup_config_process (vlib_main_t * vm,
+			vlib_node_runtime_t * rt, vlib_frame_t * f)
+{
+  unix_main_t *um = &unix_main;
+  u8 *buf = 0;
+  uword l;
+  /* Signed, so a read () failure (-1) is distinguishable from data. */
+  int n;
+
+  vlib_process_suspend (vm, 2.0);
+
+  while (um->unix_config_complete == 0)
+    vlib_process_suspend (vm, 0.1);
+
+  if (um->startup_config_filename)
+    {
+      unformat_input_t sub_input;
+      int fd;
+      struct stat s;
+      char *fn = (char *) um->startup_config_filename;
+
+      fd = open (fn, O_RDONLY);
+      if (fd < 0)
+	{
+	  clib_warning ("failed to open `%s'", fn);
+	  return 0;
+	}
+
+      if (fstat (fd, &s) < 0)
+	{
+	  clib_warning ("failed to stat `%s'", fn);
+	bail:
+	  close (fd);
+	  return 0;
+	}
+
+      if (!(S_ISREG (s.st_mode) || S_ISLNK (s.st_mode)))
+	{
+	  clib_warning ("not a regular file: `%s'", fn);
+	  goto bail;
+	}
+
+      while (1)
+	{
+	  l = vec_len (buf);
+	  vec_resize (buf, 4096);
+	  n = read (fd, buf + l, 4096);
+
+	  if (n <= 0)
+	    {
+	      /* Drop the chunk just added; it holds no file data. */
+	      _vec_len (buf) = l;
+
+	      if (n < 0)
+		{
+		  /* Do not execute a partially read configuration. */
+		  clib_unix_warning ("failed to read `%s'", fn);
+		  vec_free (buf);
+		  goto bail;
+		}
+	      break;
+	    }
+
+	  _vec_len (buf) = l + n;
+	  if (n < 4096)
+	    break;
+	}
+
+      if (um->log_fd && vec_len (buf))
+	{
+	  u8 *lv = 0;
+	  lv = format (lv, "%U: ***** Startup Config *****\n%v",
+		       format_timeval, 0 /* current bat-time */ ,
+		       0 /* current bat-format */ ,
+		       buf);
+	  {
+	    int rv __attribute__ ((unused)) =
+	      write (um->log_fd, lv, vec_len (lv));
+	  }
+	  vec_reset_length (lv);
+	  lv = format (lv, "%U: ***** End Startup Config *****\n",
+		       format_timeval, 0 /* current bat-time */ ,
+		       0 /* current bat-format */ );
+	  {
+	    int rv __attribute__ ((unused)) =
+	      write (um->log_fd, lv, vec_len (lv));
+	  }
+	  vec_free (lv);
+	}
+
+      if (vec_len (buf))
+	{
+	  unformat_init_vector (&sub_input, buf);
+	  vlib_cli_input (vm, &sub_input, 0, 0);
+	  /* frees buf for us */
+	  unformat_free (&sub_input);
+	}
+      else
+	{
+	  /* Empty file: the vector was allocated but never handed over. */
+	  vec_free (buf);
+	}
+      close (fd);
+    }
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_REGISTER_NODE (startup_config_node,static) = {
+    .function = startup_config_process,
+    .type = VLIB_NODE_TYPE_PROCESS,
+    .name = "startup-config-process",
+};
+/* *INDENT-ON* */
+
+/**
+ * Early config function for the "unix" section.
+ *
+ * Parses the section's keywords into unix_main, then in order:
+ *  - picks a default runtime directory when none was given;
+ *  - installs the signal handlers;
+ *  - validates and opens the pid file, if one was requested;
+ *  - when not interactive, opens syslog, registers the error handler
+ *    and, unless "nodaemon" was given, daemonizes;
+ *  - writes the (post-daemonize) pid into the pid file;
+ *  - marks the unix configuration as complete.
+ *
+ * @return 0 on success, or an error for an unknown keyword, a bad
+ *         coredump-size, or a failing setegid / sigaction / pid file /
+ *         daemon step.
+ */
+static clib_error_t *
+unix_config (vlib_main_t * vm, unformat_input_t * input)
+{
+  unix_main_t *um = &unix_main;
+  clib_error_t *error = 0;
+  gid_t gid;
+  int pidfd = -1;
+
+  /* Defaults */
+  um->cli_pager_buffer_limit = UNIX_CLI_DEFAULT_PAGER_LIMIT;
+  um->cli_history_limit = UNIX_CLI_DEFAULT_HISTORY;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      char *cli_prompt;
+      if (unformat (input, "interactive"))
+	um->flags |= UNIX_FLAG_INTERACTIVE;
+      else if (unformat (input, "nodaemon"))
+	um->flags |= UNIX_FLAG_NODAEMON;
+      else if (unformat (input, "cli-prompt %s", &cli_prompt))
+	vlib_unix_cli_set_prompt (cli_prompt);
+      else
+	if (unformat (input, "cli-listen %s", &um->cli_listen_socket.config))
+	;
+      else if (unformat (input, "runtime-dir %s", &um->runtime_dir))
+	;
+      else if (unformat (input, "cli-line-mode"))
+	um->cli_line_mode = 1;
+      else if (unformat (input, "cli-no-banner"))
+	um->cli_no_banner = 1;
+      else if (unformat (input, "cli-no-pager"))
+	um->cli_no_pager = 1;
+      else if (unformat (input, "cli-pager-buffer-limit %d",
+			 &um->cli_pager_buffer_limit))
+	;
+      else
+	if (unformat (input, "cli-history-limit %d", &um->cli_history_limit))
+	;
+      else if (unformat (input, "coredump-size"))
+	{
+	  uword coredump_size = 0;
+	  if (unformat (input, "unlimited"))
+	    {
+	      coredump_size = RLIM_INFINITY;
+	    }
+	  else
+	    if (!unformat (input, "%U", unformat_memory_size, &coredump_size))
+	    {
+	      return clib_error_return (0,
+					"invalid coredump-size parameter `%U'",
+					format_unformat_error, input);
+	    }
+	  const struct rlimit new_limit = { coredump_size, coredump_size };
+	  if (0 != setrlimit (RLIMIT_CORE, &new_limit))
+	    {
+	      /* Name the call that actually failed. */
+	      clib_unix_warning ("setrlimit() failed");
+	    }
+	}
+      else if (unformat (input, "full-coredump"))
+	{
+	  int fd;
+
+	  fd = open ("/proc/self/coredump_filter", O_WRONLY);
+	  if (fd >= 0)
+	    {
+	      if (write (fd, "0x6f\n", 5) != 5)
+		clib_unix_warning ("coredump filter write failed!");
+	      close (fd);
+	    }
+	  else
+	    clib_unix_warning ("couldn't open /proc/self/coredump_filter");
+	}
+      else if (unformat (input, "startup-config %s",
+			 &um->startup_config_filename))
+	;
+      else if (unformat (input, "exec %s", &um->startup_config_filename))
+	;
+      else if (unformat (input, "log %s", &um->log_filename))
+	{
+	  um->log_fd = open ((char *) um->log_filename,
+			     O_CREAT | O_WRONLY | O_APPEND, 0644);
+	  if (um->log_fd < 0)
+	    {
+	      clib_warning ("couldn't open log '%s'\n", um->log_filename);
+	      /* 0 means "no log file" to the code that reads log_fd. */
+	      um->log_fd = 0;
+	    }
+	  else
+	    {
+	      u8 *lv = 0;
+	      lv = format (0, "%U: ***** Start: PID %d *****\n",
+			   format_timeval, 0 /* current bat-time */ ,
+			   0 /* current bat-format */ ,
+			   getpid ());
+	      {
+		int rv __attribute__ ((unused)) =
+		  write (um->log_fd, lv, vec_len (lv));
+	      }
+	      vec_free (lv);
+	    }
+	}
+      else if (unformat (input, "gid %U", unformat_unix_gid, &gid))
+	{
+	  if (setegid (gid) == -1)
+	    return clib_error_return_unix (0, "setegid");
+	}
+      else if (unformat (input, "pidfile %s", &um->pidfile))
+	;
+      else
+	return clib_error_return (0, "unknown input `%U'",
+				  format_unformat_error, input);
+    }
+
+  if (um->runtime_dir == 0)
+    {
+      /* Root gets /run/<dir>; everyone else /run/user/<uid>/<dir>. */
+      uid_t uid = geteuid ();
+      if (uid == 00)
+	um->runtime_dir = format (0, "/run/%s%c",
+				  vlib_default_runtime_dir, 0);
+      else
+	um->runtime_dir = format (0, "/run/user/%u/%s%c", uid,
+				  vlib_default_runtime_dir, 0);
+    }
+
+  error = setup_signal_handlers (um);
+  if (error)
+    return error;
+
+  if (um->pidfile)
+    {
+      if ((error = vlib_unix_validate_runtime_file (um,
+						    (char *) um->pidfile,
+						    &um->pidfile)))
+	return error;
+
+      if (((pidfd = open ((char *) um->pidfile,
+			  O_CREAT | O_WRONLY | O_TRUNC, 0644)) < 0))
+	{
+	  return clib_error_return_unix (0, "open");
+	}
+    }
+
+  if (!(um->flags & UNIX_FLAG_INTERACTIVE))
+    {
+      openlog (vm->name, LOG_CONS | LOG_PERROR | LOG_PID, LOG_DAEMON);
+      clib_error_register_handler (unix_error_handler, um);
+
+      if (!(um->flags & UNIX_FLAG_NODAEMON)
+	  && daemon ( /* chdir to / */ 0,
+		     /* stdin/stdout/stderr -> /dev/null */ 0) < 0)
+	{
+	  /*
+	   * Hand the failure to the caller instead of building an error
+	   * object and dropping it on the floor.
+	   */
+	  if (pidfd >= 0)
+	    close (pidfd);
+	  return clib_error_return_unix (0, "daemon () fails");
+	}
+    }
+
+  /* Written after daemonizing so the file holds the surviving pid. */
+  if (pidfd >= 0)
+    {
+      u8 *lv = format (0, "%d", getpid ());
+      if (write (pidfd, (char *) lv, vec_len (lv)) != vec_len (lv))
+	{
+	  vec_free (lv);
+	  close (pidfd);
+	  return clib_error_return_unix (0, "write");
+	}
+      vec_free (lv);
+      close (pidfd);
+    }
+
+  um->unix_config_complete = 1;
+
+  return 0;
+}
+
+/* unix { ... } configuration. */
+/*?
+ *
+ * @cfgcmd{interactive}
+ * Attach CLI to stdin/out and provide a debugging command line interface.
+ * Implies @c nodaemon.
+ *
+ * @cfgcmd{nodaemon}
+ * Do not fork or background the VPP process. Typically used when invoking
+ * VPP applications from a process monitor.
+ *
+ * @cfgcmd{exec, &lt;filename&gt;}
+ * @par <code>startup-config &lt;filename&gt;</code>
+ * Read startup operational configuration from @c filename.
+ * The contents of the file will be performed as though entered at the CLI.
+ * The two keywords are aliases for the same function; if both are specified,
+ * only the last will have an effect.
+ *
+ * @cfgcmd{log, &lt;filename&gt;}
+ * Logs the startup configuration and all subsequent CLI commands in
+ * @c filename.
+ * Very useful in situations where folks don't remember or can't be bothered
+ * to include CLI commands in bug reports.
+ *
+ * @cfgcmd{pidfile, &lt;filename&gt;}
+ * Writes the pid of the main thread in @c filename.
+ *
+ * @cfgcmd{full-coredump}
+ * Ask the Linux kernel to dump all memory-mapped address regions, instead
+ * of just text+data+bss.
+ *
+ * @cfgcmd{runtime-dir}
+ * Define directory where VPP is going to store all runtime files.
+ * Default is /run/vpp when running as root, and
+ * /run/user/&lt;uid&gt;/vpp otherwise.
+ *
+ * @cfgcmd{cli-listen, &lt;address:port&gt;}
+ * Bind the CLI to listen at the address and port given. @c localhost
+ * on TCP port @c 5002, given as <tt>cli-listen localhost:5002</tt>,
+ * is typical.
+ *
+ * @cfgcmd{cli-line-mode}
+ * Disable character-by-character I/O on stdin. Useful when combined with,
+ * for example, <tt>emacs M-x gud-gdb</tt>.
+ *
+ * @cfgcmd{cli-prompt, &lt;string&gt;}
+ * Configure the CLI prompt to be @c string.
+ *
+ * @cfgcmd{cli-history-limit, &lt;nn&gt;}
+ * Limit command history to @c nn lines. A value of @c 0
+ * disables command history. Default value: @c 50
+ *
+ * @cfgcmd{cli-no-banner}
+ * Disable the login banner on stdin and Telnet connections.
+ *
+ * @cfgcmd{cli-no-pager}
+ * Disable the output pager.
+ *
+ * @cfgcmd{cli-pager-buffer-limit, &lt;nn&gt;}
+ * Limit pager buffer to @c nn lines of output.
+ * A value of @c 0 disables the pager. Default value: @c 100000
+?*/
+VLIB_EARLY_CONFIG_FUNCTION (unix_config, "unix");	/* unix_config () handles the "unix { ... }" section */
+
+/**
+ * Main-loop exit hook: closes the connection to syslog.
+ *
+ * @return always 0.
+ */
+static clib_error_t *
+unix_exit (vlib_main_t * vm)
+{
+  clib_error_t *no_error = 0;
+
+  closelog ();
+
+  return no_error;
+}
+
+VLIB_MAIN_LOOP_EXIT_FUNCTION (unix_exit);
+
+u8 **vlib_thread_stacks;	/* vector of stack base pointers, indexed by thread index */
+
+/**
+ * Entry point run on thread 0's own stack: builds an unformat input from
+ * the saved argv, runs vlib_main () with it and frees the input.
+ *
+ * @param arg the vlib_main_t pointer, passed as a uword.
+ * @return the value returned by vlib_main ().
+ */
+static uword
+thread0 (uword arg)
+{
+  vlib_main_t *vm = (vlib_main_t *) arg;
+  unformat_input_t command_line;
+  int status;
+
+  unformat_init_command_line (&command_line, (char **) vm->argv);
+  status = vlib_main (vm, &command_line);
+  unformat_free (&command_line);
+
+  return status;
+}
+
+/**
+ * Allocate the stack for a thread and record it in vlib_thread_stacks.
+ *
+ * The stack is VLIB_THREAD_STACK_SIZE bytes, aligned to the same value.
+ * Its lowest page is made read-only so that running off the end of the
+ * stack faults; a failure to do so is only logged.
+ *
+ * @param thread_index slot in vlib_thread_stacks to fill.
+ * @return the base address of the new stack.
+ */
+u8 *
+vlib_thread_stack_init (uword thread_index)
+{
+  u8 *stack_base;
+
+  vec_validate (vlib_thread_stacks, thread_index);
+
+  stack_base = clib_mem_alloc_aligned
+    (VLIB_THREAD_STACK_SIZE, VLIB_THREAD_STACK_SIZE);
+  vlib_thread_stacks[thread_index] = stack_base;
+
+  /* Guard page: no writes to the bottom page of the stack. */
+  if (mprotect (stack_base, clib_mem_get_page_size (), PROT_READ) < 0)
+    clib_unix_warning ("thread stack");
+
+  return stack_base;
+}
+
+/**
+ * Unix entry point for a vlib application.
+ *
+ * Records argv and the heap in the global vlib main, then parses the
+ * command line twice: once for plugin configuration, once for the early
+ * config functions (after plugin early init). Finally it allocates
+ * thread 0's stack and runs thread0 () on it.
+ *
+ * @return 1 if plugin configuration or an early config function fails,
+ *         the non-zero result of plugin early init if it fails,
+ *         otherwise the value returned by thread0 ().
+ */
+int
+vlib_unix_main (int argc, char *argv[])
+{
+  vlib_main_t *vm = &vlib_global_main;	/* one and only time for this! */
+  unformat_input_t command_line;
+  clib_error_t *error;
+  int status;
+
+  vm->argv = (u8 **) argv;
+  vm->name = argv[0];
+  vm->heap_base = clib_mem_get_heap ();
+  ASSERT (vm->heap_base);
+
+  /* Pass 1: plugin configuration. */
+  unformat_init_command_line (&command_line, (char **) vm->argv);
+  error = vlib_plugin_config (vm, &command_line);
+  if (error)
+    {
+      clib_error_report (error);
+      return 1;
+    }
+  unformat_free (&command_line);
+
+  status = vlib_plugin_early_init (vm);
+  if (status)
+    return status;
+
+  /* Pass 2: early config functions. */
+  unformat_init_command_line (&command_line, (char **) vm->argv);
+  if (!vm->init_functions_called)
+    vm->init_functions_called = hash_create (0, /* value bytes */ 0);
+  error = vlib_call_all_config_functions (vm, &command_line, 1 /* early */ );
+  if (error)
+    {
+      clib_error_report (error);
+      return 1;
+    }
+  unformat_free (&command_line);
+
+  vlib_thread_stack_init (0);
+
+  __os_thread_index = 0;
+
+  /* The stack pointer handed over is the end of the allocation. */
+  status = clib_calljmp (thread0, (uword) vm,
+			 (void *) (vlib_thread_stacks[0] +
+				   VLIB_THREAD_STACK_SIZE));
+  return status;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/unix/mc_socket.c b/src/vlib/unix/mc_socket.c
new file mode 100644
index 00000000..3f1cd99d
--- /dev/null
+++ b/src/vlib/unix/mc_socket.c
@@ -0,0 +1,1050 @@
+/*
+ * mc_socket.c: socket based multicast for vlib mc
+ *
+ * Copyright (c) 2010 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/unix/mc_socket.h>
+
+#include <sys/ioctl.h>		/* for FIONBIO */
+#include <netinet/tcp.h>	/* for TCP_NODELAY */
+#include <net/if.h>		/* for struct ifreq */
+
+/*
+ * format() callback: renders a peer id (passed as a u64 vararg) as
+ * "<ip4 address>:<port in hex>".
+ */
+static u8 *
+format_socket_peer_id (u8 * s, va_list * args)
+{
+  mc_peer_id_t id;
+  u32 address, port;
+
+  id.as_u64 = va_arg (*args, u64);
+  address = mc_socket_peer_id_get_address (id);
+  port = mc_socket_peer_id_get_port (id);
+
+  return format (s, "%U:%04x", format_network_address, AF_INET, &address,
+		 ntohs (port));
+}
+
+/*
+ * Signature of a per-message handler as invoked by msg_handler():
+ * it receives the mc main, a pointer to the message payload inside the
+ * buffer, and the index of the buffer that carries it.
+ */
+typedef void (mc_msg_handler_t) (mc_main_t * mcm, void *msg,
+				 u32 buffer_index);
+
+/*
+ * Dispatch the message held in <buffer_index> to handler <_h>
+ * (an mc_msg_handler_t).  Unless <handler_frees_buffer> is set, the
+ * buffer is freed here after the handler returns.
+ */
+always_inline void
+msg_handler (mc_main_t * mcm,
+	     u32 buffer_index, u32 handler_frees_buffer, void *_h)
+{
+  mc_msg_handler_t *handler = _h;
+  vlib_main_t *vm = mcm->vlib_main;
+  vlib_buffer_t *buf = vlib_get_buffer (vm, buffer_index);
+  void *payload = vlib_buffer_get_current (buf);
+
+  handler (mcm, payload, buffer_index);
+
+  if (handler_frees_buffer)
+    return;
+
+  vlib_buffer_free_one (vm, buffer_index);
+}
+
+/*
+ * Walk the buffer chain starting at <buffer_index> and append one iovec
+ * per buffer (current data pointer / current length) to *iovs_return.
+ * Returns the total number of bytes described by the appended iovecs.
+ */
+static uword
+append_buffer_index_to_iovec (vlib_main_t * vm,
+			      u32 buffer_index, struct iovec **iovs_return)
+{
+  struct iovec *iov;
+  vlib_buffer_t *buf;
+  u32 next = buffer_index;
+  u32 total_bytes = 0;
+  int more;
+
+  do
+    {
+      buf = vlib_get_buffer (vm, next);
+      vec_add2 (*iovs_return, iov, 1);
+      iov->iov_base = vlib_buffer_get_current (buf);
+      iov->iov_len = buf->current_length;
+      total_bytes += iov->iov_len;
+
+      /* Follow the chain only while the buffer says there is a successor. */
+      more = (buf->flags & VLIB_BUFFER_NEXT_PRESENT) != 0;
+      if (more)
+	next = buf->next_buffer;
+    }
+  while (more);
+
+  return total_bytes;
+}
+
+/*
+ * Send the buffer chain <buffer_index> as a single datagram on <socket>
+ * to <tx_addr>, using msm->iovecs as scratch scatter/gather storage.
+ *
+ * The send is retried for as long as sendmsg() fails with EAGAIN; the
+ * number of retries is recorded in the event log.  Any other failure (or
+ * a short send) is reported with a warning only.
+ *
+ * Always returns 0; the buffer is NOT freed here, callers own it.
+ */
+static clib_error_t *
+sendmsg_helper (mc_socket_main_t * msm,
+		int socket, struct sockaddr_in *tx_addr, u32 buffer_index)
+{
+  vlib_main_t *vm = msm->mc_main.vlib_main;
+  struct msghdr h;
+  word n_bytes, n_bytes_tx, n_retries;
+
+  memset (&h, 0, sizeof (h));
+  h.msg_name = tx_addr;
+  h.msg_namelen = sizeof (tx_addr[0]);
+
+  /* Reuse the iovec vector from the previous call. */
+  if (msm->iovecs)
+    _vec_len (msm->iovecs) = 0;
+
+  n_bytes = append_buffer_index_to_iovec (vm, buffer_index, &msm->iovecs);
+  ASSERT (n_bytes <= msm->mc_main.transport.max_packet_size);
+  if (n_bytes > msm->mc_main.transport.max_packet_size)
+    /* %d consumes an int, so narrow the word explicitly. */
+    clib_error ("sending packet larger than interface MTU %d bytes",
+		(int) n_bytes);
+
+  h.msg_iov = msm->iovecs;
+  h.msg_iovlen = vec_len (msm->iovecs);
+
+  /*
+   * errno is only meaningful when sendmsg() itself failed, so test the
+   * return value before looking at it; otherwise a stale EAGAIN could
+   * cause the datagram to be sent again.
+   */
+  n_retries = 0;
+  while ((n_bytes_tx = sendmsg (socket, &h, /* flags */ 0)) < 0
+	 && errno == EAGAIN)
+    n_retries++;
+  if (n_bytes_tx != n_bytes)
+    {
+      clib_unix_warning ("sendmsg");
+      return 0;
+    }
+  if (n_retries)
+    {
+      ELOG_TYPE_DECLARE (e) =
+      {
+      .format = "sendmsg-helper: %d retries",.format_args = "i4",};
+      struct
+      {
+	u32 retries;
+      } *ed = 0;
+
+      ed = ELOG_DATA (&vm->elog_main, e);
+      ed->retries = n_retries;
+    }
+  return 0;
+}
+
+/*
+ * Transport tx_buffer callback: send <buffer_index> on the multicast
+ * socket selected by <type>.  The buffer is freed afterwards for every
+ * transport type except MC_TRANSPORT_USER_REQUEST_TO_RELAY.
+ */
+static clib_error_t *
+tx_buffer (void *transport, mc_transport_type_t type, u32 buffer_index)
+{
+  mc_socket_main_t *msm = (mc_socket_main_t *) transport;
+  mc_multicast_socket_t *sock = &msm->multicast_sockets[type];
+  clib_error_t *rv;
+
+  rv = sendmsg_helper (msm, sock->socket, &sock->tx_addr, buffer_index);
+
+  if (type == MC_TRANSPORT_USER_REQUEST_TO_RELAY)
+    return rv;
+
+  vlib_buffer_free_one (msm->mc_main.vlib_main, buffer_index);
+  return rv;
+}
+
+/*
+ * Transport tx_ack callback: unicast <buffer_index> from the ack socket
+ * to the address/port encoded in <dest_peer_id>, then free the buffer.
+ */
+static clib_error_t *
+tx_ack (void *transport, mc_peer_id_t dest_peer_id, u32 buffer_index)
+{
+  mc_socket_main_t *msm = (mc_socket_main_t *) transport;
+  struct sockaddr_in dest;
+  clib_error_t *rv;
+
+  memset (&dest, 0, sizeof (dest));
+  dest.sin_family = AF_INET;
+  dest.sin_addr.s_addr = mc_socket_peer_id_get_address (dest_peer_id);
+  dest.sin_port = mc_socket_peer_id_get_port (dest_peer_id);
+
+  rv = sendmsg_helper (msm, msm->ack_socket, &dest, buffer_index);
+  vlib_buffer_free_one (msm->mc_main.vlib_main, buffer_index);
+
+  return rv;
+}
+
+/*
+ * Receive one message from <socket> into VLIB buffers.
+ *
+ *  rx_addr       if non-zero, handed to recvmsg() as msg_name so the
+ *                sender address is stored there.
+ *  buffer_index  out: index of the first buffer of the received chain,
+ *                or ~0 when the message was dropped.
+ *  drop_message  if non-zero the message is read and discarded; the
+ *                staged buffers stay in msm->rx_buffers for reuse.
+ *
+ * Buffers are staged from the END of msm->rx_buffers; the buffers that
+ * actually received data are chained together and removed from the
+ * vector.  Returns 0 on success, or an error if recvmsg() fails or not
+ * enough buffers could be allocated to cover one MTU.
+ */
+static clib_error_t *
+recvmsg_helper (mc_socket_main_t * msm,
+		int socket,
+		struct sockaddr_in *rx_addr,
+		u32 * buffer_index, u32 drop_message)
+{
+  vlib_main_t *vm = msm->mc_main.vlib_main;
+  vlib_buffer_t *b;
+  uword n_left, n_alloc, n_mtu, i, i_rx;
+  const uword buffer_size = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES;
+  word n_bytes_left;
+
+  /* Make sure we have at least a MTU worth of buffers. */
+  n_mtu = msm->rx_mtu_n_buffers;
+  n_left = vec_len (msm->rx_buffers);
+  if (n_left < n_mtu)
+    {
+      uword max_alloc = 8 * n_mtu;
+      vec_validate (msm->rx_buffers, max_alloc - 1);
+      n_alloc =
+	vlib_buffer_alloc (vm, msm->rx_buffers + n_left, max_alloc - n_left);
+      _vec_len (msm->rx_buffers) = n_left + n_alloc;
+    }
+
+  /*
+   * The allocator may hand back fewer buffers than requested.  Without
+   * this check the index arithmetic below would underflow and read
+   * outside rx_buffers.
+   */
+  if (vec_len (msm->rx_buffers) < n_mtu)
+    return clib_error_return (0, "recvmsg: not enough rx buffers");
+
+  vec_validate (msm->iovecs, n_mtu - 1);
+
+  /* Allocate RX buffers from end of rx_buffers.
+     Turn them into iovecs to pass to recvmsg. */
+  i_rx = vec_len (msm->rx_buffers) - 1;
+  for (i = 0; i < n_mtu; i++)
+    {
+      b = vlib_get_buffer (vm, msm->rx_buffers[i_rx - i]);
+      msm->iovecs[i].iov_base = b->data;
+      msm->iovecs[i].iov_len = buffer_size;
+    }
+  _vec_len (msm->iovecs) = n_mtu;
+
+  {
+    struct msghdr h;
+
+    memset (&h, 0, sizeof (h));
+    if (rx_addr)
+      {
+	h.msg_name = rx_addr;
+	h.msg_namelen = sizeof (rx_addr[0]);
+      }
+    h.msg_iov = msm->iovecs;
+    h.msg_iovlen = vec_len (msm->iovecs);
+
+    n_bytes_left = recvmsg (socket, &h, 0);
+    if (n_bytes_left < 0)
+      return clib_error_return_unix (0, "recvmsg");
+  }
+
+  if (drop_message)
+    {
+      *buffer_index = ~0;
+      return 0;
+    }
+
+  /* Chain together as many buffers as the message occupies. */
+  *buffer_index = msm->rx_buffers[i_rx];
+  while (1)
+    {
+      b = vlib_get_buffer (vm, msm->rx_buffers[i_rx]);
+
+      b->flags = 0;
+      b->current_data = 0;
+      b->current_length =
+	n_bytes_left < buffer_size ? n_bytes_left : buffer_size;
+
+      n_bytes_left -= buffer_size;
+
+      if (n_bytes_left <= 0)
+	break;
+
+      i_rx--;
+      b->flags |= VLIB_BUFFER_NEXT_PRESENT;
+      b->next_buffer = msm->rx_buffers[i_rx];
+    }
+
+  /* Everything from i_rx upwards now belongs to the returned chain. */
+  _vec_len (msm->rx_buffers) = i_rx;
+
+  return 0 /* no error */ ;
+}
+
+/*
+ * Read-ready callback for the mastership multicast socket: receive one
+ * message and pass it to mc_msg_master_assert_handler.  The buffer is
+ * freed by msg_handler() after the handler runs.
+ */
+static clib_error_t *
+mastership_socket_read_ready (clib_file_t * uf)
+{
+  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
+  int fd = msm->multicast_sockets[MC_TRANSPORT_MASTERSHIP].socket;
+  clib_error_t *error;
+  u32 bi;
+
+  error = recvmsg_helper (msm, fd, /* rx_addr */ 0, &bi,
+			  /* drop_message */ 0);
+  if (error)
+    return error;
+
+  msg_handler (&msm->mc_main, bi, /* handler_frees_buffer */ 0,
+	       mc_msg_master_assert_handler);
+  return 0;
+}
+
+/*
+ * Read-ready callback for the "to relay" multicast socket.
+ *
+ * If this node is not the relay master the message is read and dropped.
+ * Otherwise the message is stamped with the next global sequence number
+ * (network byte order), re-sent on the "from relay" socket, and the
+ * buffer is freed.
+ */
+static clib_error_t *
+to_relay_socket_read_ready (clib_file_t * uf)
+{
+  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
+  mc_main_t *mcm = &msm->mc_main;
+  vlib_main_t *vm = mcm->vlib_main;
+  mc_multicast_socket_t *rx_sock =
+    &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_TO_RELAY];
+  mc_multicast_socket_t *tx_sock =
+    &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY];
+  u32 we_are_master = mcm->relay_state == MC_RELAY_STATE_MASTER;
+  mc_msg_user_request_t *req;
+  vlib_buffer_t *buf;
+  clib_error_t *error;
+  u32 bi;
+
+  error = recvmsg_helper (msm, rx_sock->socket, /* rx_addr */ 0, &bi,
+			  /* drop_message */ !we_are_master);
+
+  /* Nothing more to do on failure, or when the message was dropped. */
+  if (error || !we_are_master)
+    return error;
+
+  buf = vlib_get_buffer (vm, bi);
+  req = vlib_buffer_get_current (buf);
+  req->global_sequence = clib_host_to_net_u32 (mcm->relay_global_sequence);
+  mcm->relay_global_sequence++;
+
+  error = sendmsg_helper (msm, tx_sock->socket, &tx_sock->tx_addr, bi);
+  vlib_buffer_free_one (vm, bi);
+
+  return error;
+}
+
+/*
+ * Read-ready callback for the "from relay" multicast socket: receive one
+ * message and pass it to mc_msg_user_request_handler, which takes over
+ * responsibility for freeing the buffer.
+ */
+static clib_error_t *
+from_relay_socket_read_ready (clib_file_t * uf)
+{
+  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
+  int fd =
+    msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY].socket;
+  clib_error_t *error;
+  u32 bi;
+
+  error = recvmsg_helper (msm, fd, /* rx_addr */ 0, &bi,
+			  /* drop_message */ 0);
+  if (error)
+    return error;
+
+  msg_handler (&msm->mc_main, bi, /* handler_frees_buffer */ 1,
+	       mc_msg_user_request_handler);
+  return 0;
+}
+
+/*
+ * Read-ready callback for the join multicast socket.
+ *
+ * Receives one message and dispatches on its type field (converted from
+ * network byte order) to the join/leave request handler or the join
+ * reply handler; in both cases msg_handler() frees the buffer.  A
+ * message of any other type trips an ASSERT in debug images and is
+ * otherwise discarded.
+ */
+static clib_error_t *
+join_socket_read_ready (clib_file_t * uf)
+{
+  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
+  mc_main_t *mcm = &msm->mc_main;
+  vlib_main_t *vm = mcm->vlib_main;
+  mc_multicast_socket_t *ms = &msm->multicast_sockets[MC_TRANSPORT_JOIN];
+  clib_error_t *error;
+  u32 bi;
+
+  error = recvmsg_helper (msm, ms->socket, /* rx_addr */ 0, &bi,
+			  /* drop_message */ 0);
+  if (!error)
+    {
+      vlib_buffer_t *b = vlib_get_buffer (vm, bi);
+      mc_msg_join_or_leave_request_t *mp = vlib_buffer_get_current (b);
+
+      /* host_to_net and net_to_host are the same byte swap. */
+      switch (clib_host_to_net_u32 (mp->type))
+	{
+	case MC_MSG_TYPE_join_or_leave_request:
+	  msg_handler (mcm, bi, /* handler_frees_buffer */ 0,
+		       mc_msg_join_or_leave_request_handler);
+	  break;
+
+	case MC_MSG_TYPE_join_reply:
+	  msg_handler (mcm, bi, /* handler_frees_buffer */ 0,
+		       mc_msg_join_reply_handler);
+	  break;
+
+	default:
+	  ASSERT (0);
+	  /* Nobody else owns the buffer: free it so it is not leaked
+	     when ASSERT is compiled out. */
+	  vlib_buffer_free_one (vm, bi);
+	  break;
+	}
+    }
+  return error;
+}
+
+/*
+ * Read-ready callback for the unicast ack socket: receive one message
+ * and pass it to mc_msg_user_ack_handler.  The buffer is freed by
+ * msg_handler() after the handler runs.
+ */
+static clib_error_t *
+ack_socket_read_ready (clib_file_t * uf)
+{
+  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
+  clib_error_t *error;
+  u32 bi;
+
+  error = recvmsg_helper (msm, msm->ack_socket, /* rx_addr */ 0, &bi,
+			  /* drop_message */ 0);
+  if (error)
+    return error;
+
+  msg_handler (&msm->mc_main, bi, /* handler_frees_buffer */ 0,
+	       mc_msg_user_ack_handler);
+  return 0;
+}
+
+/*
+ * Tear down catchup connection <c>: drop its fd -> index mapping, remove
+ * its file <uf> from <um>, free both data vectors and return <c> to the
+ * catchup pool.
+ */
+static void
+catchup_cleanup (mc_socket_main_t * msm,
+		 mc_socket_catchup_t * c, clib_file_main_t * um,
+		 clib_file_t * uf)
+{
+  /* Read the descriptor before the file object is deleted. */
+  int fd = uf->file_descriptor;
+
+  hash_unset (msm->catchup_index_by_file_descriptor, fd);
+  clib_file_del (um, uf);
+
+  vec_free (c->output_vector);
+  vec_free (c->input_vector);
+  pool_put (msm->catchups, c);
+}
+
+/*
+ * Look up the catchup connection registered for <file_descriptor>.
+ * Returns 0 when the descriptor is not in the hash.
+ */
+static mc_socket_catchup_t *
+find_catchup_from_file_descriptor (mc_socket_main_t * msm,
+				   int file_descriptor)
+{
+  uword *slot;
+
+  slot = hash_get (msm->catchup_index_by_file_descriptor, file_descriptor);
+  if (!slot)
+    return 0;
+
+  return pool_elt_at_index (msm->catchups, slot[0]);
+}
+
+/*
+ * Common read-ready handler for TCP catchup sockets.
+ *
+ * Appends up to 4096 newly read bytes to the connection's input_vector.
+ * Once the peer signals EOF (read() returns 0) and some input has been
+ * accumulated, the whole vector is handed to the catchup request handler
+ * (is_server != 0) or the catchup reply handler (is_server == 0).
+ *
+ * Returns 0, or a unix error if read() fails with anything other than
+ * EAGAIN; in that case the connection is cleaned up first.
+ *
+ * NOTE(review): the result of find_catchup_from_file_descriptor() is
+ * dereferenced without a null check -- confirm a missing mapping cannot
+ * happen here.
+ */
+static clib_error_t *
+catchup_socket_read_ready (clib_file_t * uf, int is_server)
+{
+  clib_file_main_t *um = &file_main;
+  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
+  mc_main_t *mcm = &msm->mc_main;
+  mc_socket_catchup_t *c =
+    find_catchup_from_file_descriptor (msm, uf->file_descriptor);
+  word l, n, is_eof;
+
+  /* Grow the vector by 4096 bytes and read into the new tail. */
+  l = vec_len (c->input_vector);
+  vec_resize (c->input_vector, 4096);
+  n =
+    read (uf->file_descriptor, c->input_vector + l,
+	  vec_len (c->input_vector) - l);
+  is_eof = n == 0;
+
+  if (n < 0)
+    {
+      /* EAGAIN is treated as "nothing read" (and is not EOF). */
+      if (errno == EAGAIN)
+	n = 0;
+      else
+	{
+	  /* NOTE(review): cleanup runs before errno is captured by
+	     clib_error_return_unix -- confirm errno is still valid. */
+	  catchup_cleanup (msm, c, um, uf);
+	  return clib_error_return_unix (0, "read");
+	}
+    }
+
+  /* Trim the vector back to the bytes actually received so far. */
+  _vec_len (c->input_vector) = l + n;
+
+  if (is_eof && vec_len (c->input_vector) > 0)
+    {
+      if (is_server)
+	{
+	  /* Server keeps the connection; the vector is reset for reuse. */
+	  mc_msg_catchup_request_handler (mcm, (void *) c->input_vector,
+					  c - msm->catchups);
+	  _vec_len (c->input_vector) = 0;
+	}
+      else
+	{
+	  /* Client is done once the reply has been delivered. */
+	  mc_msg_catchup_reply_handler (mcm, (void *) c->input_vector,
+					c - msm->catchups);
+	  c->input_vector = 0;	/* reply handler is responsible for freeing vector */
+	  catchup_cleanup (msm, c, um, uf);
+	}
+    }
+
+  return 0 /* no error */ ;
+}
+
+/* Read-ready callback for the server side of a catchup connection. */
+static clib_error_t *
+catchup_server_read_ready (clib_file_t * uf)
+{
+  const int is_server = 1;
+
+  return catchup_socket_read_ready (uf, is_server);
+}
+
+/*
+ * Read-ready callback for the client side of a catchup connection.
+ * Logs an event when MC_EVENT_LOGGING is enabled, then defers to the
+ * common read handler.
+ */
+static clib_error_t *
+catchup_client_read_ready (clib_file_t * uf)
+{
+  const int is_server = 0;
+
+  if (MC_EVENT_LOGGING)
+    {
+      mc_socket_main_t *sock_main = (mc_socket_main_t *) uf->private_data;
+      vlib_main_t *vm = sock_main->mc_main.vlib_main;
+
+      ELOG_TYPE (e, "catchup_client_read_ready");
+      ELOG (&vm->elog_main, e, 0);
+    }
+
+  return catchup_socket_read_ready (uf, is_server);
+}
+
+/*
+ * Common write-ready handler for TCP catchup sockets.
+ *
+ * On the first call after a non-blocking connect() it checks SO_ERROR to
+ * learn whether the connect succeeded.  It then writes the remainder of
+ * output_vector in chunks of at most (rx_mtu_n_bytes - 64) bytes.
+ *
+ * When everything has been written:
+ *   - client (is_server == 0): stop polling for writability and shut
+ *     down the write side so the peer sees EOF; the connection stays
+ *     open for the reply.
+ *   - server (is_server != 0): the connection is cleaned up.
+ * Any failure also cleans up the connection (label error_quit) and the
+ * error is returned.
+ *
+ * NOTE(review): the result of find_catchup_from_file_descriptor() is
+ * dereferenced without a null check -- confirm a missing mapping cannot
+ * happen here.
+ */
+static clib_error_t *
+catchup_socket_write_ready (clib_file_t * uf, int is_server)
+{
+  clib_file_main_t *um = &file_main;
+  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
+  mc_socket_catchup_t *c =
+    find_catchup_from_file_descriptor (msm, uf->file_descriptor);
+  clib_error_t *error = 0;
+  int n;
+
+  if (c->connect_in_progress)
+    {
+      u32 len, value;
+
+      /* Writability after connect(): fetch the deferred connect result. */
+      c->connect_in_progress = 0;
+      len = sizeof (value);
+      if (getsockopt (c->socket, SOL_SOCKET, SO_ERROR, &value, &len) < 0)
+	{
+	  error = clib_error_return_unix (0, "getsockopt SO_ERROR");
+	  goto error_quit;
+	}
+      if (value != 0)
+	{
+	  error =
+	    clib_error_return_code (0, value, CLIB_ERROR_ERRNO_VALID,
+				    "connect fails");
+	  goto error_quit;
+	}
+    }
+
+  while (1)
+    {
+      u32 n_this_write;
+
+      n_this_write =
+	clib_min (vec_len (c->output_vector) - c->output_vector_n_written,
+		  msm->rx_mtu_n_bytes -
+		  64 /* ip + tcp + option allowance */ );
+
+      /* n_this_write is unsigned, so this is effectively "== 0". */
+      if (n_this_write <= 0)
+	break;
+
+      /* Retry immediately for as long as the socket reports EAGAIN. */
+      do
+	{
+	  n = write (uf->file_descriptor,
+		     c->output_vector + c->output_vector_n_written,
+		     n_this_write);
+	}
+      while (n < 0 && errno == EAGAIN);
+
+      if (n < 0)
+	{
+	  error = clib_error_return_unix (0, "write");
+	  goto error_quit;
+	}
+      c->output_vector_n_written += n;
+    }
+
+  if (c->output_vector_n_written >= vec_len (c->output_vector))
+    {
+      if (!is_server)
+	{
+	  uf->flags &= ~UNIX_FILE_DATA_AVAILABLE_TO_WRITE;
+	  file_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY);
+	  /* Send EOF to other side. */
+	  shutdown (uf->file_descriptor, SHUT_WR);
+	  return error;
+	}
+      else
+	{
+	  /* Reached by fall-through (server finished writing) and by
+	     goto from every failure path above, client or server. */
+	error_quit:
+	  catchup_cleanup (msm, c, um, uf);
+	}
+    }
+  return error;
+}
+
+/* Write-ready callback for the server side of a catchup connection. */
+static clib_error_t *
+catchup_server_write_ready (clib_file_t * uf)
+{
+  const int is_server = 1;
+
+  return catchup_socket_write_ready (uf, is_server);
+}
+
+/* Write-ready callback for the client side of a catchup connection. */
+static clib_error_t *
+catchup_client_write_ready (clib_file_t * uf)
+{
+  const int is_server = 0;
+
+  return catchup_socket_write_ready (uf, is_server);
+}
+
+/*
+ * Error callback shared by both sides of a catchup connection: tear the
+ * connection down and report a generic error.
+ */
+static clib_error_t *
+catchup_socket_error_ready (clib_file_t * uf)
+{
+  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
+  mc_socket_catchup_t *conn;
+
+  conn = find_catchup_from_file_descriptor (msm, uf->file_descriptor);
+  catchup_cleanup (msm, conn, &file_main, uf);
+
+  return clib_error_return (0, "error");
+}
+
+/*
+ * Read-ready callback for the TCP catchup listen socket.
+ *
+ * Accepts one incoming connection, allocates a catchup pool entry for
+ * it, makes the new socket non-blocking with Nagle disabled, registers
+ * it with the file poller using the server-side callbacks, and records
+ * the fd -> pool index mapping.
+ *
+ * Returns 0 on success or a unix error if accept() fails (the pool entry
+ * is released in that case).
+ */
+static clib_error_t *
+catchup_listen_read_ready (clib_file_t * uf)
+{
+  mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data;
+  struct sockaddr_in client_addr;
+  /* accept() takes a socklen_t *; do not pun an int through a cast. */
+  socklen_t client_len = sizeof (client_addr);
+  mc_socket_catchup_t *c;
+  clib_file_t template = { 0 };
+
+  pool_get (msm->catchups, c);
+  memset (c, 0, sizeof (c[0]));
+
+  c->socket = accept (uf->file_descriptor,
+		      (struct sockaddr *) &client_addr, &client_len);
+
+  if (c->socket < 0)
+    {
+      pool_put (msm->catchups, c);
+      return clib_error_return_unix (0, "accept");
+    }
+
+  /*
+   * Whether an accepted socket inherits the listener's non-blocking
+   * flag is platform dependent (Linux does not), and the catchup
+   * read/write handlers expect EAGAIN semantics, so set it explicitly.
+   */
+  {
+    int one = 1;
+    if (ioctl (c->socket, FIONBIO, &one) < 0)
+      clib_unix_warning ("catchup socket: set FIONBIO");
+  }
+
+  if (MC_EVENT_LOGGING)
+    {
+      mc_main_t *mcm = &msm->mc_main;
+      vlib_main_t *vm = mcm->vlib_main;
+
+      ELOG_TYPE_DECLARE (e) =
+      {
+      .format = "catchup accepted from 0x%lx",.format_args = "i4",};
+      struct
+      {
+	u32 addr;
+      } *ed = 0;
+
+      ed = ELOG_DATA (&vm->elog_main, e);
+      ed->addr = ntohl (client_addr.sin_addr.s_addr);
+    }
+
+  /* Disable the Nagle algorithm, ship catchup pkts immediately */
+  {
+    int one = 1;
+    if ((setsockopt (c->socket, IPPROTO_TCP,
+		     TCP_NODELAY, (void *) &one, sizeof (one))) < 0)
+      {
+	clib_unix_warning ("catchup socket: set TCP_NODELAY");
+      }
+  }
+
+  template.read_function = catchup_server_read_ready;
+  template.write_function = catchup_server_write_ready;
+  template.error_function = catchup_socket_error_ready;
+  template.file_descriptor = c->socket;
+  template.private_data = pointer_to_uword (msm);
+  c->clib_file_index = clib_file_add (&file_main, &template);
+  hash_set (msm->catchup_index_by_file_descriptor, c->socket,
+	    c - msm->catchups);
+
+  return 0;
+}
+
+/*
+ * Bind <sock> to the first port at or above <port> (on INADDR_ANY) for
+ * which bind() succeeds.  Returns the port that was bound, in host byte
+ * order, or -1 if every port up to 65535 was refused.
+ */
+static word
+find_and_bind_to_free_port (word sock, word port)
+{
+  const word port_limit = 1 << 16;
+  struct sockaddr_in a;
+
+  while (port < port_limit)
+    {
+      memset (&a, 0, sizeof (a));	/* Warnings be gone */
+
+      a.sin_family = PF_INET;
+      a.sin_addr.s_addr = INADDR_ANY;
+      a.sin_port = htons (port);
+
+      if (bind (sock, (struct sockaddr *) &a, sizeof (a)) >= 0)
+	return port;
+
+      port++;
+    }
+
+  return -1;
+}
+
+/*
+ * Create and configure one multicast UDP socket.
+ *
+ *  msm       supplies the multicast TTL (defaulted to 1 when zero), the
+ *            multicast group address and the local interface address.
+ *  ms        receives the socket descriptor and the group/port tx address.
+ *  type      label used as a prefix in error messages.
+ *  udp_port  port to bind to, host byte order.
+ *
+ * The socket gets the multicast TTL, SO_REUSEADDR, is bound to the group
+ * address, joins the group on the configured interface, is made
+ * non-blocking and has its send buffer raised to 1 MiB.
+ *
+ * Returns 0 on success.  On failure the socket is closed, ms->socket is
+ * set to -1 and a unix error is returned.
+ */
+static clib_error_t *
+setup_mutlicast_socket (mc_socket_main_t * msm,
+			mc_multicast_socket_t * ms,
+			char *type, uword udp_port)
+{
+  int one = 1;
+  struct ip_mreq mcast_req;
+  clib_error_t *error = 0;
+
+  if (!msm->multicast_ttl)
+    msm->multicast_ttl = 1;
+
+  /* mastership (multicast) TX socket */
+  if ((ms->socket = socket (PF_INET, SOCK_DGRAM, IPPROTO_UDP)) < 0)
+    return clib_error_return_unix (0, "%s socket", type);
+
+  {
+    u8 ttl = msm->multicast_ttl;
+
+    if ((setsockopt (ms->socket, IPPROTO_IP,
+		     IP_MULTICAST_TTL, (void *) &ttl, sizeof (ttl))) < 0)
+      {
+	error = clib_error_return_unix (0, "%s set multicast ttl", type);
+	goto fail;
+      }
+  }
+
+  if (setsockopt (ms->socket, SOL_SOCKET, SO_REUSEADDR, &one, sizeof (one)) <
+      0)
+    {
+      error = clib_error_return_unix (0, "%s setsockopt SO_REUSEADDR", type);
+      goto fail;
+    }
+
+  memset (&ms->tx_addr, 0, sizeof (ms->tx_addr));
+  ms->tx_addr.sin_family = AF_INET;
+  ms->tx_addr.sin_addr.s_addr =
+    htonl (msm->multicast_tx_ip4_address_host_byte_order);
+  ms->tx_addr.sin_port = htons (udp_port);
+
+  if (bind (ms->socket, (struct sockaddr *) &ms->tx_addr,
+	    sizeof (ms->tx_addr)) < 0)
+    {
+      error = clib_error_return_unix (0, "%s bind", type);
+      goto fail;
+    }
+
+  memset (&mcast_req, 0, sizeof (mcast_req));
+  mcast_req.imr_multiaddr.s_addr =
+    htonl (msm->multicast_tx_ip4_address_host_byte_order);
+  mcast_req.imr_interface.s_addr = msm->if_ip4_address_net_byte_order;
+
+  if ((setsockopt (ms->socket, IPPROTO_IP,
+		   IP_ADD_MEMBERSHIP, (void *) &mcast_req,
+		   sizeof (mcast_req))) < 0)
+    {
+      error = clib_error_return_unix (0, "%s IP_ADD_MEMBERSHIP setsockopt",
+				      type);
+      goto fail;
+    }
+
+  if (ioctl (ms->socket, FIONBIO, &one) < 0)
+    {
+      error = clib_error_return_unix (0, "%s set FIONBIO", type);
+      goto fail;
+    }
+
+  /* FIXME remove this when we support tx_ready. */
+  {
+    u32 len = 1 << 20;
+    socklen_t sl = sizeof (len);
+    if (setsockopt (ms->socket, SOL_SOCKET, SO_SNDBUF, &len, sl) < 0)
+      clib_unix_error ("setsockopt");
+  }
+
+  return 0;
+
+fail:
+  /* The error was built before close() so errno is already captured. */
+  close (ms->socket);
+  ms->socket = -1;
+  return error;
+}
+
+/*
+ * Create every socket used by the socket multicast transport and register
+ * them with the file poller.
+ *
+ * Starting at base_multicast_udp_port_host_byte_order (defaulted so that
+ * all ports fit just below 0xffff), consecutive ports are used for the
+ * mastership, join, to-relay and from-relay multicast sockets.  The UDP
+ * ack socket and the TCP catchup listen socket are then bound to the
+ * first free port at or after the following port numbers; the ports
+ * actually obtained are stored in ack_udp_port / catchup_tcp_port.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static clib_error_t *
+socket_setup (mc_socket_main_t * msm)
+{
+  int one = 1;
+  clib_error_t *error;
+  u32 port;
+  word bound_port;
+
+  if (!msm->base_multicast_udp_port_host_byte_order)
+    msm->base_multicast_udp_port_host_byte_order =
+      0xffff - ((MC_N_TRANSPORT_TYPE + 2 /* ack socket, catchup socket */ )
+		- 1);
+
+  port = msm->base_multicast_udp_port_host_byte_order;
+
+  error = setup_mutlicast_socket (msm,
+				  &msm->multicast_sockets
+				  [MC_TRANSPORT_MASTERSHIP], "mastership",
+				  port++);
+  if (error)
+    return error;
+
+  error = setup_mutlicast_socket (msm,
+				  &msm->multicast_sockets[MC_TRANSPORT_JOIN],
+				  "join", port++);
+  if (error)
+    return error;
+
+  error = setup_mutlicast_socket (msm,
+				  &msm->multicast_sockets
+				  [MC_TRANSPORT_USER_REQUEST_TO_RELAY],
+				  "to relay", port++);
+  if (error)
+    return error;
+
+  error = setup_mutlicast_socket (msm,
+				  &msm->multicast_sockets
+				  [MC_TRANSPORT_USER_REQUEST_FROM_RELAY],
+				  "from relay", port++);
+  if (error)
+    return error;
+
+  /* ACK rx socket */
+  msm->ack_socket = socket (PF_INET, SOCK_DGRAM, IPPROTO_UDP);
+  if (msm->ack_socket < 0)
+    return clib_error_return_unix (0, "ack socket");
+
+  /* -1 means no port could be bound; do not store it as a port number. */
+  bound_port = find_and_bind_to_free_port (msm->ack_socket, port++);
+  if (bound_port < 0)
+    return clib_error_return_unix (0, "ack socket bind");
+  msm->ack_udp_port = bound_port;
+
+  if (ioctl (msm->ack_socket, FIONBIO, &one) < 0)
+    return clib_error_return_unix (0, "ack socket FIONBIO");
+
+  msm->catchup_server_socket = socket (AF_INET, SOCK_STREAM, 0);
+  if (msm->catchup_server_socket < 0)
+    return clib_error_return_unix (0, "catchup server socket");
+
+  bound_port =
+    find_and_bind_to_free_port (msm->catchup_server_socket, port++);
+  if (bound_port < 0)
+    return clib_error_return_unix (0, "catchup server socket bind");
+  msm->catchup_tcp_port = bound_port;
+
+  if (ioctl (msm->catchup_server_socket, FIONBIO, &one) < 0)
+    return clib_error_return_unix (0, "catchup server socket FIONBIO");
+
+  if (listen (msm->catchup_server_socket, 5) < 0)
+    return clib_error_return_unix (0, "catchup server socket listen");
+
+  /* epoll setup for multicast mastership socket */
+  {
+    clib_file_t template = { 0 };
+
+    template.read_function = mastership_socket_read_ready;
+    template.file_descriptor =
+      msm->multicast_sockets[MC_TRANSPORT_MASTERSHIP].socket;
+    template.private_data = (uword) msm;
+    clib_file_add (&file_main, &template);
+
+    /* epoll setup for multicast to_relay socket */
+    template.read_function = to_relay_socket_read_ready;
+    template.file_descriptor =
+      msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_TO_RELAY].socket;
+    template.private_data = (uword) msm;
+    clib_file_add (&file_main, &template);
+
+    /* epoll setup for multicast from_relay socket */
+    template.read_function = from_relay_socket_read_ready;
+    template.file_descriptor =
+      msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY].socket;
+    template.private_data = (uword) msm;
+    clib_file_add (&file_main, &template);
+
+    /* epoll setup for multicast join socket */
+    template.read_function = join_socket_read_ready;
+    template.file_descriptor =
+      msm->multicast_sockets[MC_TRANSPORT_JOIN].socket;
+    template.private_data = (uword) msm;
+    clib_file_add (&file_main, &template);
+
+    /* epoll setup for ack rx socket */
+    template.read_function = ack_socket_read_ready;
+    template.file_descriptor = msm->ack_socket;
+    template.private_data = (uword) msm;
+    clib_file_add (&file_main, &template);
+
+    /* epoll setup for TCP catchup server */
+    template.read_function = catchup_listen_read_ready;
+    template.file_descriptor = msm->catchup_server_socket;
+    template.private_data = (uword) msm;
+    clib_file_add (&file_main, &template);
+  }
+
+  return 0;
+}
+
+/*
+ * Queue output on catchup connection <c>.
+ *
+ * If <set_output_vector> is non-zero it becomes the connection's output
+ * vector and 0 is returned.  Otherwise <n_bytes> are appended to the
+ * existing output vector and a pointer to the newly added bytes is
+ * returned for the caller to fill in.
+ *
+ * When there is output pending and the file is not already flagged as
+ * having data to write, the flag is set and the poller is updated.
+ */
+static void *
+catchup_add_pending_output (mc_socket_catchup_t * c, uword n_bytes,
+			    u8 * set_output_vector)
+{
+  clib_file_t *uf = pool_elt_at_index (file_main.file_pool,
+				       c->clib_file_index);
+  u8 *appended = 0;
+
+  if (!set_output_vector)
+    vec_add2 (c->output_vector, appended, n_bytes);
+  else
+    c->output_vector = set_output_vector;
+
+  if (vec_len (c->output_vector) > 0
+      && !(uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE))
+    {
+      uf->flags |= UNIX_FILE_DATA_AVAILABLE_TO_WRITE;
+      file_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY);
+    }
+
+  return appended;
+}
+
+/*
+ * Transport catchup_request_fun callback: open a non-blocking TCP
+ * connection to <catchup_peer_id> and queue a catchup request for
+ * <stream_index> on it.
+ *
+ * Returns the pool index of the new catchup connection.  On failure a
+ * warning is logged, every resource acquired so far is released and 0 is
+ * returned.
+ *
+ * NOTE(review): 0 is also a valid pool index, so callers cannot tell
+ * failure from "first connection" by the return value alone.
+ */
+static uword
+catchup_request_fun (void *transport_main,
+		     u32 stream_index, mc_peer_id_t catchup_peer_id)
+{
+  mc_socket_main_t *msm = (mc_socket_main_t *) transport_main;
+  mc_main_t *mcm = &msm->mc_main;
+  vlib_main_t *vm = mcm->vlib_main;
+  mc_socket_catchup_t *c;
+  struct sockaddr_in addr;
+  clib_file_main_t *um = &file_main;
+  int one = 1;
+
+  pool_get (msm->catchups, c);
+  memset (c, 0, sizeof (*c));
+
+  c->socket = socket (AF_INET, SOCK_STREAM, 0);
+  if (c->socket < 0)
+    {
+      clib_unix_warning ("socket");
+      goto fail_release;
+    }
+
+  if (ioctl (c->socket, FIONBIO, &one) < 0)
+    {
+      clib_unix_warning ("FIONBIO");
+      goto fail_close;
+    }
+
+  memset (&addr, 0, sizeof (addr));
+  addr.sin_family = AF_INET;
+  addr.sin_addr.s_addr = mc_socket_peer_id_get_address (catchup_peer_id);
+  addr.sin_port = mc_socket_peer_id_get_port (catchup_peer_id);
+
+  /* The write-ready handler checks SO_ERROR when this is set. */
+  c->connect_in_progress = 1;
+
+  if (MC_EVENT_LOGGING)
+    {
+      ELOG_TYPE_DECLARE (e) =
+      {
+      .format = "connecting to peer 0x%Lx",.format_args = "i8",};
+      struct
+      {
+	u64 peer;
+      } *ed;
+      ed = ELOG_DATA (&vm->elog_main, e);
+      ed->peer = catchup_peer_id.as_u64;
+    }
+
+  if (connect (c->socket, (const void *) &addr, sizeof (addr))
+      < 0 && errno != EINPROGRESS)
+    {
+      /* format_socket_peer_id reads a u64 vararg, so pass the u64. */
+      clib_unix_warning ("connect to %U fails",
+			 format_socket_peer_id, catchup_peer_id.as_u64);
+      goto fail_close;
+    }
+
+  {
+    clib_file_t template = { 0 };
+
+    template.read_function = catchup_client_read_ready;
+    template.write_function = catchup_client_write_ready;
+    template.error_function = catchup_socket_error_ready;
+    template.file_descriptor = c->socket;
+    template.private_data = (uword) msm;
+    c->clib_file_index = clib_file_add (um, &template);
+
+    hash_set (msm->catchup_index_by_file_descriptor, c->socket,
+	      c - msm->catchups);
+  }
+
+  {
+    mc_msg_catchup_request_t *mp;
+    mp = catchup_add_pending_output (c, sizeof (mp[0]),	/* set_output_vector */
+				     0);
+    mp->peer_id = msm->mc_main.transport.our_catchup_peer_id;
+    mp->stream_index = stream_index;
+    mc_byte_swap_msg_catchup_request (mp);
+  }
+
+  return c - msm->catchups;
+
+fail_close:
+  close (c->socket);
+fail_release:
+  /* Give the pool entry back so failed attempts do not accumulate. */
+  pool_put (msm->catchups, c);
+  return 0;
+}
+
+/*
+ * Transport catchup_send_fun callback: make <data> the pending output of
+ * the catchup connection whose pool index is <opaque>.
+ */
+static void
+catchup_send_fun (void *transport_main, uword opaque, u8 * data)
+{
+  mc_socket_main_t *msm = (mc_socket_main_t *) transport_main;
+  mc_socket_catchup_t *conn;
+
+  conn = pool_elt_at_index (msm->catchups, opaque);
+  catchup_add_pending_output (conn, /* n_bytes */ 0, data);
+}
+
+/*
+ * Look up the IPv4 address and MTU of network interface <if_name>.
+ *
+ *  ip4_address  out: interface address as returned by SIOCGIFADDR
+ *               (network byte order).
+ *  mtu          out, optional: interface MTU minus 28 bytes of IP4 + UDP
+ *               header.
+ *
+ * Returns 0 on success, -1 if the socket cannot be created or either
+ * ioctl fails.  Failures are logged as warnings so that callers probing
+ * several interface names can move on to the next candidate.
+ * NOTE(review): the previous clib_unix_error() calls appear to be the
+ * fatal variant, which would have prevented the -1 return -- confirm.
+ */
+static int
+find_interface_ip4_address (char *if_name, u32 * ip4_address, u32 * mtu)
+{
+  int fd;
+  struct ifreq ifr;
+  struct sockaddr_in *sa;
+
+  /* Dig up our IP address */
+  /* The second argument is a socket type, not an address family. */
+  fd = socket (PF_INET, SOCK_DGRAM, 0);
+  if (fd < 0)
+    {
+      clib_unix_warning ("socket");
+      return -1;
+    }
+
+  /* Zero first: strncpy() below copies at most size - 1 bytes and relies
+     on the final byte already being NUL. */
+  memset (&ifr, 0, sizeof (ifr));
+  ifr.ifr_addr.sa_family = AF_INET;
+  strncpy (ifr.ifr_name, if_name, sizeof (ifr.ifr_name) - 1);
+  if (ioctl (fd, SIOCGIFADDR, &ifr) < 0)
+    {
+      clib_unix_warning ("ioctl(SIOCGIFADDR)");
+      close (fd);
+      return -1;
+    }
+
+  sa = (void *) &ifr.ifr_addr;
+  clib_memcpy (ip4_address, &sa->sin_addr.s_addr, sizeof (ip4_address[0]));
+
+  if (ioctl (fd, SIOCGIFMTU, &ifr) < 0)
+    {
+      close (fd);
+      return -1;
+    }
+  if (mtu)
+    *mtu = ifr.ifr_mtu - ( /* IP4 header */ 20 + /* UDP header */ 8);
+
+  close (fd);
+
+  return 0;
+}
+
+/*
+ * Initialise the socket-based multicast transport.
+ *
+ * Picks the multicast group (default 239.255.0.7), resolves the local
+ * interface -- either msm->multicast_interface_name if set, or the first
+ * entry of <intfc_probe_list> (<n_intfcs_to_probe> entries) whose address
+ * can be found -- derives the receive MTU in bytes and buffers, creates
+ * all sockets, fills in the transport callbacks and peer ids, and calls
+ * mc_main_init().
+ *
+ * Returns 0 on success, or an error if no interface address can be found
+ * or socket setup fails.
+ */
+clib_error_t *
+mc_socket_main_init (mc_socket_main_t * msm, char **intfc_probe_list,
+		     int n_intfcs_to_probe)
+{
+  mc_main_t *mcm = &msm->mc_main;
+  clib_error_t *error;
+  u32 mtu, if_addr;
+  u32 found = 0;
+  u32 i;
+
+  /* 239.255.0.7 */
+  if (!msm->multicast_tx_ip4_address_host_byte_order)
+    msm->multicast_tx_ip4_address_host_byte_order = 0xefff0007;
+
+  if (msm->multicast_interface_name)
+    {
+      found =
+	!find_interface_ip4_address (msm->multicast_interface_name,
+				     &if_addr, &mtu);
+    }
+  else
+    {
+      /* Probe candidates in order; the first that resolves wins. */
+      for (i = 0; !found && i < n_intfcs_to_probe; i++)
+	{
+	  if (find_interface_ip4_address (intfc_probe_list[i], &if_addr,
+					  &mtu))
+	    continue;
+	  found = 1;
+	  msm->multicast_interface_name = intfc_probe_list[i];
+	}
+    }
+
+  if (!found)
+    return clib_error_return (0, "can't find interface ip4 address");
+
+  msm->if_ip4_address_net_byte_order = if_addr;
+
+  /* Number of default-size buffers needed for one MTU, rounded up. */
+  msm->rx_mtu_n_bytes = mtu;
+  msm->rx_mtu_n_buffers =
+    msm->rx_mtu_n_bytes / VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES;
+  if ((msm->rx_mtu_n_bytes % VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES) != 0)
+    msm->rx_mtu_n_buffers += 1;
+
+  error = socket_setup (msm);
+  if (error)
+    return error;
+
+  mcm->transport.our_ack_peer_id =
+    mc_socket_set_peer_id (msm->if_ip4_address_net_byte_order,
+			   msm->ack_udp_port);
+  mcm->transport.our_catchup_peer_id =
+    mc_socket_set_peer_id (msm->if_ip4_address_net_byte_order,
+			   msm->catchup_tcp_port);
+
+  mcm->transport.tx_buffer = tx_buffer;
+  mcm->transport.tx_ack = tx_ack;
+  mcm->transport.catchup_request_fun = catchup_request_fun;
+  mcm->transport.catchup_send_fun = catchup_send_fun;
+  mcm->transport.format_peer_id = format_socket_peer_id;
+  mcm->transport.opaque = msm;
+  mcm->transport.max_packet_size = mtu;
+
+  mc_main_init (mcm, "socket");
+
+  return error;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/unix/mc_socket.h b/src/vlib/unix/mc_socket.h
new file mode 100644
index 00000000..3686c824
--- /dev/null
+++ b/src/vlib/unix/mc_socket.h
@@ -0,0 +1,137 @@
+/*
+ * mc_socket.h: socket based multicast for vlib mc
+ *
+ * Copyright (c) 2010 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __included_mc_socket_h__
+#define __included_mc_socket_h__
+
+#include <vlib/unix/unix.h>
+#include <netinet/in.h>
+
+/* One multicast UDP socket together with the address it transmits to. */
+typedef struct
+{
+  /* Socket file descriptor. */
+  int socket;
+  /* Destination (multicast group address and port) used when sending. */
+  struct sockaddr_in tx_addr;
+} mc_multicast_socket_t;
+
+/* TCP catchup socket: state of one catchup connection (either side). */
+typedef struct
+{
+  /* Connected TCP socket file descriptor. */
+  int socket;
+  /* Index of the clib_file_t registered for this socket. */
+  u32 clib_file_index;
+
+  /* Bytes received so far on this connection. */
+  u8 *input_vector;
+  /* Bytes queued for transmission. */
+  u8 *output_vector;
+  /* Number of bytes of output_vector already written to the socket. */
+  u32 output_vector_n_written;
+
+  /* Non-zero while a non-blocking connect() has not yet been checked. */
+  u32 connect_in_progress;
+} mc_socket_catchup_t;
+
+/* Main state of the socket-based multicast transport. */
+typedef struct mc_socket_main_t
+{
+  /* Generic mc state; the transport callbacks live in mc_main.transport. */
+  mc_main_t mc_main;
+
+  /* Multicast mastership/to-relay/from-relay sockets. */
+  mc_multicast_socket_t multicast_sockets[MC_N_TRANSPORT_TYPE];
+
+  /* Unicast UDP ack sockets */
+  int ack_socket;
+
+  /* TCP catchup server socket */
+  int catchup_server_socket;
+
+  /* Pool of stream-private catchup sockets */
+  mc_socket_catchup_t *catchups;
+
+  /* Hash: socket file descriptor -> index into the catchups pool. */
+  uword *catchup_index_by_file_descriptor;
+
+  /* Receive MTU in bytes. */
+  u32 rx_mtu_n_bytes;
+
+  /* Receive MTU in VLIB buffers (rx_mtu_n_bytes rounded up to buffers). */
+  u32 rx_mtu_n_buffers;
+
+  /* Vector of RX VLIB buffers. */
+  u32 *rx_buffers;
+  /* Vector of scatter/gather descriptors for sending/receiving VLIB buffers
+     via kernel. */
+  struct iovec *iovecs;
+
+  /* IP address of interface to use for multicast. */
+  u32 if_ip4_address_net_byte_order;
+
+  /* Ports the ack and catchup sockets were actually bound to
+     (host byte order). */
+  u32 ack_udp_port;
+  u32 catchup_tcp_port;
+
+  /* Interface on which to listen for multicasts. */
+  char *multicast_interface_name;
+
+  /* Multicast address to use (e.g. 0xefff0000).
+     Host byte order. */
+  u32 multicast_tx_ip4_address_host_byte_order;
+
+  /* TTL to use for multicasts. */
+  u32 multicast_ttl;
+
+  /* Multicast ports for mastership, joins, etc. will be chosen
+     starting at the given port in host byte order.
+     A total of MC_N_TRANSPORT_TYPE ports will be used for the multicast
+     sockets; the ack and catchup sockets look for free ports after
+     those. */
+  u32 base_multicast_udp_port_host_byte_order;
+} mc_socket_main_t;
+
+/*
+ * Extract the IPv4 address stored in bytes 0..3 of peer id <i>
+ * (most significant byte first) and return it in network byte order.
+ */
+always_inline u32
+mc_socket_peer_id_get_address (mc_peer_id_t i)
+{
+  /* Widen each byte before shifting: a u8 promotes to int, and shifting
+     a value >= 0x80 left by 24 would overflow into the sign bit. */
+  u32 a = (((u32) i.as_u8[0] << 24)
+	   | ((u32) i.as_u8[1] << 16)
+	   | ((u32) i.as_u8[2] << 8) | ((u32) i.as_u8[3] << 0));
+  return clib_host_to_net_u32 (a);
+}
+
+/*
+ * Extract the port stored in bytes 4..5 of peer id <i> (high byte first)
+ * and return it in network byte order.
+ */
+always_inline u32
+mc_socket_peer_id_get_port (mc_peer_id_t i)
+{
+  u16 port_host_order = (i.as_u8[4] << 8) | i.as_u8[5];
+
+  return clib_host_to_net_u16 (port_host_order);
+}
+
+/*
+ * Build a peer id from an IPv4 address (network byte order) and a port
+ * (host byte order).  Bytes 0..3 hold the address and bytes 4..5 the
+ * port, both most significant byte first; bytes 6..7 are zero.
+ */
+static_always_inline mc_peer_id_t
+mc_socket_set_peer_id (u32 address_net_byte_order, u32 port_host_byte_order)
+{
+  mc_peer_id_t id;
+  u32 addr = ntohl (address_net_byte_order);
+  u32 port = port_host_byte_order;
+  int n;
+
+  for (n = 0; n < 4; n++)
+    id.as_u8[n] = (addr >> (24 - 8 * n)) & 0xff;
+
+  id.as_u8[4] = (port >> 8) & 0xff;
+  id.as_u8[5] = port & 0xff;
+
+  id.as_u8[6] = 0;
+  id.as_u8[7] = 0;
+
+  return id;
+}
+
+/*
+ * Initialise the socket multicast transport described by <msm>.
+ * <intfc_probe_list> holds <n_intfcs_to_probe> candidate interface names.
+ * Returns a clib_error_t * describing the failure, if any.
+ */
+clib_error_t *mc_socket_main_init (mc_socket_main_t * msm,
+				   char **intfc_probe_list,
+				   int n_intfcs_to_probe);
+#endif /* __included_mc_socket_h__ */
+
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/unix/plugin.c b/src/vlib/unix/plugin.c
new file mode 100644
index 00000000..c2741aaa
--- /dev/null
+++ b/src/vlib/unix/plugin.c
@@ -0,0 +1,553 @@
+/*
+ * plugin.c: plugin handling
+ *
+ * Copyright (c) 2011 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/unix/plugin.h>
+#include <vppinfra/elf.h>
+#include <dlfcn.h>
+#include <dirent.h>
+
+/* Global plugin state: plugin table, name hash, search path and
+   per-plugin configuration. */
+plugin_main_t vlib_plugin_main;
+
+/* Weak, empty-string defaults.
+   vlib_plugin_path seeds the search path in vlib_plugin_early_init () when
+   no path was configured; vlib_plugin_app_version is the string a plugin's
+   version_required is compared against in load_one_plugin (). */
+char *vlib_plugin_path __attribute__ ((weak));
+char *vlib_plugin_path = "";
+char *vlib_plugin_app_version __attribute__ ((weak));
+char *vlib_plugin_app_version = "";
+
+/* Look up symbol_name in the plugin registered under plugin_name.
+   Returns 0 when no such plugin is in the name hash; otherwise returns
+   whatever dlsym () yields for the plugin's handle. */
+void *
+vlib_get_plugin_symbol (char *plugin_name, char *symbol_name)
+{
+  plugin_main_t *pm = &vlib_plugin_main;
+  plugin_info_t *info;
+  uword *slot = hash_get_mem (pm->plugin_by_name_hash, plugin_name);
+
+  if (slot == 0)
+    return 0;
+
+  info = vec_elt_at_index (pm->plugin_info, slot[0]);
+  return dlsym (info->handle, symbol_name);
+}
+
+/* Copy a fixed-size character array into a freshly allocated,
+   NUL-terminated vector.  Copying stops after the first NUL byte or after
+   len bytes, whichever comes first; a terminator is appended when none was
+   found.  The first byte is always read, so array must hold at least one
+   byte. */
+static char *
+str_array_to_vec (char *array, int len)
+{
+  char *vec = 0;
+  int idx = 0;
+  char ch;
+
+  for (;;)
+    {
+      ch = array[idx];
+      vec_add1 (vec, ch);
+
+      /* The terminator itself has just been copied: done. */
+      if (ch == 0)
+	return vec;
+
+      if (++idx >= len)
+	break;
+    }
+
+  /* Ran off the end of the array without seeing a NUL. */
+  vec_add1 (vec, 0);
+  return vec;
+}
+
+/*
+ * Validate one candidate plugin file and load it.
+ *
+ * The file is first inspected as an ELF image: it must contain a
+ * ".vlib_plugin_registration" section whose size is exactly
+ * sizeof (vlib_plugin_registration_t).  The per-plugin configuration (if
+ * any) together with the registration's default_disabled flag decides
+ * whether the plugin is loaded at all.  A non-empty version_required must
+ * be a prefix of vlib_plugin_app_version unless skip_version_check was
+ * configured for the plugin.  Only then is the file dlopen ()'d and its
+ * optional early_init hook invoked.
+ *
+ * Returns 0 when the plugin was loaded and -1 when it was skipped or
+ * rejected.  A dlopen () failure, a missing registration symbol or an
+ * early_init error ends the process through os_exit (1).
+ *
+ * from_early_init is accepted but not used by this function.
+ */
+static int
+load_one_plugin (plugin_main_t * pm, plugin_info_t * pi, int from_early_init)
+{
+  void *handle;
+  clib_error_t *error;
+  elf_main_t em = { 0 };
+  elf_section_t *section;
+  u8 *data = 0;
+  char *version_required;
+  vlib_plugin_registration_t *reg;
+  plugin_config_t *pc = 0;
+  uword *p;
+
+  error = elf_read_file (&em, (char *) pi->filename);
+  if (error)
+    {
+      /* Not readable as ELF; drop the error instead of leaking it. */
+      clib_error_free_vector (error);
+      return -1;
+    }
+
+  error = elf_get_section_by_name (&em, ".vlib_plugin_registration",
+				   &section);
+  if (error)
+    {
+      clib_warning ("Not a plugin: %s\n", (char *) pi->name);
+      /* Release both the error and the parsed ELF state. */
+      clib_error_free_vector (error);
+      goto error;
+    }
+
+  data = elf_get_section_contents (&em, section->index, 1);
+  reg = (vlib_plugin_registration_t *) data;
+
+  if (vec_len (data) != sizeof (*reg))
+    {
+      clib_warning ("vlib_plugin_registration size mismatch in plugin %s\n",
+		    (char *) pi->name);
+      goto error;
+    }
+
+  /* Explicit configuration wins over the plugin's own default. */
+  p = hash_get_mem (pm->config_index_by_name, pi->name);
+  if (p)
+    {
+      pc = vec_elt_at_index (pm->configs, p[0]);
+      if (pc->is_disabled)
+	{
+	  clib_warning ("Plugin disabled: %s", pi->name);
+	  goto error;
+	}
+      if (reg->default_disabled && pc->is_enabled == 0)
+	{
+	  clib_warning ("Plugin disabled (default): %s", pi->name);
+	  goto error;
+	}
+    }
+  else if (reg->default_disabled)
+    {
+      clib_warning ("Plugin disabled (default): %s", pi->name);
+      goto error;
+    }
+
+  /* version_required is a fixed-size array that need not be terminated;
+     work on a NUL-terminated copy. */
+  version_required = str_array_to_vec ((char *) &reg->version_required,
+				       sizeof (reg->version_required));
+
+  if ((strlen (version_required) > 0) &&
+      (strncmp (vlib_plugin_app_version, version_required,
+		strlen (version_required))))
+    {
+      /* Print the terminated copy, not the raw array. */
+      clib_warning ("Plugin %s version mismatch: %s != %s",
+		    pi->name, vlib_plugin_app_version, version_required);
+      if (!(pc && pc->skip_version_check == 1))
+	{
+	  vec_free (version_required);
+	  goto error;
+	}
+    }
+
+  /* From here on 'reg' (which points into 'data') is stale until it is
+     re-resolved through dlsym () below. */
+  vec_free (version_required);
+  vec_free (data);
+  elf_main_free (&em);
+
+  handle = dlopen ((char *) pi->filename, RTLD_LAZY);
+
+  if (handle == 0)
+    {
+      clib_warning ("%s", dlerror ());
+      clib_warning ("Failed to load plugin '%s'", pi->name);
+      os_exit (1);
+    }
+
+  pi->handle = handle;
+
+  reg = dlsym (pi->handle, "vlib_plugin_registration");
+
+  if (reg == 0)
+    {
+      /* This should never happen unless somebody changes registration macro */
+      clib_warning ("Missing plugin registration in plugin '%s'", pi->name);
+      os_exit (1);
+    }
+
+  pi->reg = reg;
+  pi->version = str_array_to_vec ((char *) &reg->version,
+				  sizeof (reg->version));
+
+  if (reg->early_init)
+    {
+      clib_error_t *(*ei) (vlib_main_t *);
+      void *h;
+
+      /* early_init names a function exported by the plugin. */
+      h = dlsym (pi->handle, reg->early_init);
+      if (h)
+	{
+	  ei = h;
+	  error = (*ei) (pm->vlib_main);
+	  if (error)
+	    {
+	      clib_error_report (error);
+	      os_exit (1);
+	    }
+	}
+      else
+	clib_warning ("Plugin %s: early init function %s set but not found",
+		      (char *) pi->name, reg->early_init);
+    }
+
+  if (reg->description)
+    clib_warning ("Loaded plugin: %s (%s)", pi->name, reg->description);
+  else
+    clib_warning ("Loaded plugin: %s", pi->name);
+
+  return 0;
+error:
+  vec_free (data);
+  elf_main_free (&em);
+  return -1;
+}
+
+/* Split pm->plugin_path at every ':' into a vector of NUL-terminated
+   directory name vectors.  Every byte of the path vector other than ':' is
+   copied verbatim.  The caller owns the result and each of its elements. */
+static u8 **
+split_plugin_path (plugin_main_t * pm)
+{
+  u8 **dirs = 0;
+  u8 *cur = 0;
+  u8 *src = pm->plugin_path;
+  int pos;
+
+  for (pos = 0; pos < vec_len (pm->plugin_path); pos++)
+    {
+      if (src[pos] == ':')
+	{
+	  /* Separator: close the current component and start a new one. */
+	  vec_add1 (cur, 0);
+	  vec_add1 (dirs, cur);
+	  cur = 0;
+	}
+      else
+	vec_add1 (cur, src[pos]);
+    }
+
+  /* Trailing component without a closing ':'. */
+  if (cur != 0)
+    {
+      vec_add1 (cur, 0);
+      vec_add1 (dirs, cur);
+    }
+  return dirs;
+}
+
+/* Comparison callback for vec_sort_with_function (): orders two
+   plugin_info_t elements by strcmp () of their names. */
+static int
+plugin_name_sort_cmp (void *a1, void *a2)
+{
+  plugin_info_t *lhs = a1, *rhs = a2;
+  char *lhs_name = (char *) lhs->name;
+  char *rhs_name = (char *) rhs->name;
+
+  return strcmp (lhs_name, rhs_name);
+}
+
+/*
+ * Scan every directory of the plugin path for "*.so" regular files, record
+ * the ones not yet present in the name hash, then try to load every entry
+ * of pm->plugin_info in name order.  Entries that fail to load are removed
+ * from the table and the name hash is rebuilt to match the final indices.
+ *
+ * When pm->plugin_name_filter is set, only directory entries whose name
+ * starts with the filter bytes are considered.
+ *
+ * Always returns 0.
+ */
+int
+vlib_load_new_plugins (plugin_main_t * pm, int from_early_init)
+{
+  DIR *dp;
+  struct dirent *entry;
+  struct stat statb;
+  uword *p;
+  plugin_info_t *pi;
+  u8 **plugin_path;
+  u32 *load_fail_indices = 0;
+  int i;
+
+  plugin_path = split_plugin_path (pm);
+
+  for (i = 0; i < vec_len (plugin_path); i++)
+    {
+      dp = opendir ((char *) plugin_path[i]);
+
+      if (dp == 0)
+	{
+	  /* Directory cannot be opened: skip it, but still release the
+	     path element so it is not leaked. */
+	  vec_free (plugin_path[i]);
+	  continue;
+	}
+
+      while ((entry = readdir (dp)))
+	{
+	  u8 *plugin_name;
+	  u8 *filename;
+	  char *ext;
+
+	  if (pm->plugin_name_filter)
+	    {
+	      int j;
+	      for (j = 0; j < vec_len (pm->plugin_name_filter); j++)
+		if (entry->d_name[j] != pm->plugin_name_filter[j])
+		  goto next;
+	    }
+
+	  filename = format (0, "%s/%s%c", plugin_path[i], entry->d_name, 0);
+
+	  /* Only accept readable regular files whose last extension is
+	     ".so"; directories and other objects are not plugins. */
+	  ext = strrchr ((const char *) filename, '.');
+	  if (!ext || (strcmp (ext, ".so") != 0) ||
+	      stat ((char *) filename, &statb) < 0 ||
+	      !S_ISREG (statb.st_mode))
+	    {
+	      vec_free (filename);
+	      continue;
+	    }
+
+	  plugin_name = format (0, "%s%c", entry->d_name, 0);
+	  /* Have we seen this plugin already? */
+	  p = hash_get_mem (pm->plugin_by_name_hash, plugin_name);
+	  if (p == 0)
+	    {
+	      /* No, add it to the plugin vector; the table now owns both
+	         strings. */
+	      vec_add2 (pm->plugin_info, pi, 1);
+	      pi->name = plugin_name;
+	      pi->filename = filename;
+	      pi->file_info = statb;
+	      hash_set_mem (pm->plugin_by_name_hash, plugin_name,
+			    pi - pm->plugin_info);
+	    }
+	  else
+	    {
+	      /* Yes: nothing keeps the freshly built strings, free them. */
+	      vec_free (plugin_name);
+	      vec_free (filename);
+	    }
+	next:
+	  ;
+	}
+      closedir (dp);
+      vec_free (plugin_path[i]);
+    }
+  vec_free (plugin_path);
+
+
+  /*
+   * Sort the plugins by name. This is important.
+   * API traces contain absolute message numbers.
+   * Loading plugins in directory (vs. alphabetical) order
+   * makes trace replay incredibly fragile.
+   */
+  vec_sort_with_function (pm->plugin_info, plugin_name_sort_cmp);
+
+  /*
+   * Attempt to load the plugins
+   */
+  for (i = 0; i < vec_len (pm->plugin_info); i++)
+    {
+      pi = vec_elt_at_index (pm->plugin_info, i);
+
+      if (load_one_plugin (pm, pi, from_early_init))
+	{
+	  /* Make a note of any which fail to load */
+	  vec_add1 (load_fail_indices, i);
+	  hash_unset_mem (pm->plugin_by_name_hash, pi->name);
+	  vec_free (pi->name);
+	  vec_free (pi->filename);
+	}
+    }
+
+  /* Remove plugin info vector elements corresponding to load failures.
+     Walk the failures back to front so earlier indices stay valid. */
+  if (vec_len (load_fail_indices) > 0)
+    {
+      for (i = vec_len (load_fail_indices) - 1; i >= 0; i--)
+	vec_delete (pm->plugin_info, 1, load_fail_indices[i]);
+      vec_free (load_fail_indices);
+    }
+
+  /* Recreate the plugin name hash: sorting and deletion moved entries, so
+     the stored indices must be refreshed. */
+  for (i = 0; i < vec_len (pm->plugin_info); i++)
+    {
+      pi = vec_elt_at_index (pm->plugin_info, i);
+      hash_unset_mem (pm->plugin_by_name_hash, pi->name);
+      hash_set_mem (pm->plugin_by_name_hash, pi->name, pi - pm->plugin_info);
+    }
+
+  return 0;
+}
+
+/* First-time plugin setup: remember vm, fall back to vlib_plugin_path when
+   no search path was configured, create the plugin name hash and load
+   whatever plugins are found.  Returns vlib_load_new_plugins ()'s result. */
+int
+vlib_plugin_early_init (vlib_main_t * vm)
+{
+  plugin_main_t *pm = &vlib_plugin_main;
+
+  pm->vlib_main = vm;
+
+  if (!pm->plugin_path)
+    pm->plugin_path = format (0, "%s%c", vlib_plugin_path, 0);
+
+  clib_warning ("plugin path %s", pm->plugin_path);
+
+  pm->plugin_by_name_hash = hash_create_string (0, sizeof (uword));
+
+  return vlib_load_new_plugins (pm, 1 /* from_early_init */ );
+}
+
+/* CLI handler for "show plugins": prints the plugin path followed by one
+   numbered row (name, version, description) per entry of the plugin name
+   hash.  Always returns 0. */
+static clib_error_t *
+vlib_plugins_show_cmd_fn (vlib_main_t * vm,
+			  unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  plugin_main_t *pm = &vlib_plugin_main;
+  plugin_info_t *info;
+  u8 *out = 0;
+  u8 *name = 0;
+  uword slot = 0;
+  int row = 1;
+
+  out = format (out, " Plugin path is: %s\n\n", pm->plugin_path);
+  out = format (out, "     %-41s%-33s%s\n",
+		"Plugin", "Version", "Description");
+
+  /* *INDENT-OFF* */
+  hash_foreach_mem (name, slot, pm->plugin_by_name_hash,
+    {
+      if (name != 0)
+        {
+          const char *text;
+
+          info = vec_elt_at_index (pm->plugin_info, slot);
+          /* Plugins without a description get an empty column. */
+          text = info->reg->description ? info->reg->description : "";
+          out = format (out, "%3d. %-40s %-32s %s\n", row, name,
+			info->version, text);
+          row++;
+        }
+    });
+  /* *INDENT-ON* */
+
+  vlib_cli_output (vm, "%v", out);
+  vec_free (out);
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (plugins_show_cmd, static) =
+{
+  .path = "show plugins",
+  .short_help = "show loaded plugins",
+  .function = vlib_plugins_show_cmd_fn,
+};
+/* *INDENT-ON* */
+
+/* Record the configuration of one plugin.
+   input may contain any mix of "enable", "disable" and
+   "skip-version-check".  On success a plugin_config_t is appended to
+   pm->configs, indexed by name in pm->config_index_by_name, and name itself
+   is retained (not copied).  On error nothing is stored and the error is
+   returned: duplicate plugin, unknown keyword, or both enable and disable
+   given. */
+static clib_error_t *
+config_one_plugin (vlib_main_t * vm, char *name, unformat_input_t * input)
+{
+  plugin_main_t *pm = &vlib_plugin_main;
+  plugin_config_t *cfg;
+  int want_enable = 0;
+  int want_disable = 0;
+  int no_version_check = 0;
+
+  /* The config hash is created on first use. */
+  if (pm->config_index_by_name == 0)
+    pm->config_index_by_name = hash_create_string (0, sizeof (uword));
+
+  if (hash_get_mem (pm->config_index_by_name, name))
+    return clib_error_return (0, "plugin '%s' already configured", name);
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "enable"))
+	want_enable = 1;
+      else if (unformat (input, "disable"))
+	want_disable = 1;
+      else if (unformat (input, "skip-version-check"))
+	no_version_check = 1;
+      else
+	return clib_error_return (0, "unknown input '%U'",
+				  format_unformat_error, input);
+    }
+
+  if (want_enable && want_disable)
+    return clib_error_return (0, "please specify either enable or disable"
+			      " for plugin '%s'", name);
+
+  vec_add2 (pm->configs, cfg, 1);
+  hash_set_mem (pm->config_index_by_name, name, cfg - pm->configs);
+  cfg->is_enabled = want_enable;
+  cfg->is_disabled = want_disable;
+  cfg->skip_version_check = no_version_check;
+  cfg->name = name;
+
+  return 0;
+}
+
+/*
+ * Parse the plugin related part of the startup configuration.
+ *
+ * Pass 1 walks input as "<name> <value>" pairs and concatenates the values
+ * of every pair named "plugins" (separated by a blank) into a private
+ * input.  Pass 2 parses that private input:
+ *   path <string>             sets pm->plugin_path
+ *   plugin <name> <sub-input> hands the sub-input to config_one_plugin ()
+ *
+ * Pass 2 runs even when pass 1 stopped on an error, over whatever was
+ * collected so far.
+ *
+ * Returns 0 on success or an error describing the offending input.
+ */
+clib_error_t *
+vlib_plugin_config (vlib_main_t * vm, unformat_input_t * input)
+{
+  plugin_main_t *pm = &vlib_plugin_main;
+  clib_error_t *error = 0;
+  unformat_input_t in;
+
+  unformat_init (&in, 0, 0);
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      u8 *s, *v;
+      if (unformat (input, "%s %v", &s, &v))
+	{
+	  /* Length 8 includes the terminator, so this is an exact match. */
+	  if (strncmp ((const char *) s, "plugins", 8) == 0)
+	    {
+	      if (vec_len (in.buffer) > 0)
+		vec_add1 (in.buffer, ' ');
+	      vec_add (in.buffer, v, vec_len (v));
+	    }
+	}
+      else
+	{
+	  error = clib_error_return (0, "unknown input '%U'",
+				     format_unformat_error, input);
+	  goto done;
+	}
+
+      vec_free (v);
+      vec_free (s);
+    }
+done:
+  input = &in;
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      unformat_input_t sub_input;
+      u8 *s = 0;
+      if (unformat (input, "path %s", &s))
+	pm->plugin_path = s;
+      else if (unformat (input, "plugin %s %U", &s,
+			 unformat_vlib_cli_sub_input, &sub_input))
+	{
+	  error = config_one_plugin (vm, (char *) s, &sub_input);
+	  unformat_free (&sub_input);
+	  if (error)
+	    {
+	      /* config_one_plugin () keeps the name only on success;
+	         on failure it is still ours and must be released. */
+	      vec_free (s);
+	      goto done2;
+	    }
+	}
+      else
+	{
+	  error = clib_error_return (0, "unknown input '%U'",
+				     format_unformat_error, input);
+	  vec_free (s);
+	  goto done2;
+	}
+    }
+
+done2:
+  unformat_free (&in);
+  return error;
+}
+
+/* Config handler registered for the "plugins" section.  That section is
+   already consumed by vlib_plugin_config () prior to plugin load, so this
+   handler only swallows input: it reads one string, frees it and reports
+   success.  Input that cannot be read as a string yields an error. */
+static clib_error_t *
+plugins_config (vlib_main_t * vm, unformat_input_t * input)
+{
+  u8 *discard;
+
+  if (unformat_check_input (input) == UNFORMAT_END_OF_INPUT)
+    return 0;
+
+  if (!unformat (input, "%s", &discard))
+    return clib_error_return (0, "unknown input '%U'",
+			      format_unformat_error, input);
+
+  vec_free (discard);
+  return 0;
+}
+
+VLIB_CONFIG_FUNCTION (plugins_config, "plugins");
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/unix/plugin.h b/src/vlib/unix/plugin.h
new file mode 100644
index 00000000..d9801ec4
--- /dev/null
+++ b/src/vlib/unix/plugin.h
@@ -0,0 +1,126 @@
+/*
+ * plugin.h: plugin handling
+ *
+ * Copyright (c) 2011 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __included_plugin_h__
+#define __included_plugin_h__
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+/*
+ * vlib plugin scheme
+ *
+ * Almost anything which can be made to work in a vlib unix
+ * application will also work in a vlib plugin.
+ *
+ * The elf-section magic which registers static objects
+ * works so long as plugins are present when the vlib unix process
+ * starts. But wait: there's more...
+ *
+ * If an application calls vlib_load_new_plugins() -- possibly after
+ * changing vlib_plugin_main.plugin_path / vlib_plugin_main.plugin_name_filter,
+ * -- new plugins will be loaded. That, in turn, allows considerable
+ * flexibility in terms of adding feature code or fixing bugs without
+ * requiring the data-plane process to restart.
+ *
+ * When the plugin mechanism loads a plugin, it uses dlsym to locate
+ * and call the plugin's function vlib_plugin_register() if it exists.
+ * A plugin which expects to be loaded after the vlib application
+ * starts uses this callback to modify the application. If vlib_plugin_register
+ * returns non-zero, the plugin mechanism dlclose()'s the plugin.
+ *
+ * Applications control the plugin search path and name filter by
+ * declaring the variables vlib_plugin_path and vlib_plugin_name_filter.
+ * libvlib.la supplies weak references for these symbols which
+ * effectively disable the scheme. In order for the elf-section magic to
+ * work, static plugins must be loaded at the earliest possible moment.
+ *
+ * An application can change these parameters at any time and call
+ * vlib_load_new_plugins().
+ */
+
+/* Registration record a plugin places in its ".vlib_plugin_registration"
+   ELF section (see VLIB_PLUGIN_REGISTER).  The loader reads it from the
+   file before dlopen () and requires the section size to equal
+   sizeof (vlib_plugin_registration_t). */
+/* *INDENT-OFF* */
+typedef CLIB_PACKED(struct {
+  /* non-zero: load only when explicitly enabled in the configuration */
+  u8 default_disabled;
+  /* plugin version; need not be NUL-terminated when all 32 bytes are used */
+  const char version[32];
+  /* if non-empty, must be a prefix of vlib_plugin_app_version */
+  const char version_required[32];
+  /* optional name of a function looked up with dlsym () and called after
+     loading */
+  const char *early_init;
+  /* optional human-readable description */
+  const char *description;
+}) vlib_plugin_registration_t;
+/* *INDENT-ON* */
+
+/* One discovered plugin (element of plugin_main_t.plugin_info). */
+typedef struct
+{
+  /* directory entry name, NUL-terminated vector; key of the name hash */
+  u8 *name;
+  /* "<directory>/<name>", NUL-terminated vector */
+  u8 *filename;
+  /* stat () result taken when the file was discovered */
+  struct stat file_info;
+  /* dlopen () handle */
+  void *handle;
+
+  /* plugin registration */
+  vlib_plugin_registration_t *reg;
+  /* NUL-terminated copy of reg->version */
+  char *version;
+} plugin_info_t;
+
+/* Per-plugin startup configuration (element of plugin_main_t.configs). */
+typedef struct
+{
+  /* plugin name as given in the configuration */
+  char *name;
+  /* "disable" was given: never load the plugin */
+  u8 is_disabled;
+  /* "enable" was given: load even when the plugin is default_disabled */
+  u8 is_enabled;
+  /* "skip-version-check" was given: a version mismatch is only warned
+     about */
+  u8 skip_version_check;
+} plugin_config_t;
+
+/* Global plugin subsystem state; the single instance is vlib_plugin_main. */
+typedef struct
+{
+  /* loaded plugin info */
+  plugin_info_t *plugin_info;
+  /* plugin name -> index into plugin_info */
+  uword *plugin_by_name_hash;
+
+  /* path and name filter */
+  /* ':'-separated list of directories to scan */
+  u8 *plugin_path;
+  /* if set, only directory entries starting with these bytes are
+     considered */
+  u8 *plugin_name_filter;
+
+  /* plugin configs and hash by name */
+  plugin_config_t *configs;
+  uword *config_index_by_name;
+
+  /* usual */
+  vlib_main_t *vlib_main;
+} plugin_main_t;
+
+/* The single plugin_main_t instance, defined in plugin.c. */
+extern plugin_main_t vlib_plugin_main;
+
+/* Parse the "plugins" startup configuration: search path and per-plugin
+   enable / disable / skip-version-check settings. */
+clib_error_t *vlib_plugin_config (vlib_main_t * vm, unformat_input_t * input);
+/* Create the plugin name hash, default the search path if unset and load
+   the plugins found. */
+int vlib_plugin_early_init (vlib_main_t * vm);
+/* Scan the plugin path and load the plugins found there. */
+int vlib_load_new_plugins (plugin_main_t * pm, int from_early_init);
+/* dlsym () symbol_name in the named plugin; 0 if the plugin is unknown. */
+void *vlib_get_plugin_symbol (char *plugin_name, char *symbol_name);
+
+/* Declares the plugin's registration object, named
+   vlib_plugin_registration, in the ".vlib_plugin_registration" section.
+   The macro ends without an initializer, so the use site is expected to
+   continue with "= { ... };". */
+#define VLIB_PLUGIN_REGISTER() \
+  vlib_plugin_registration_t vlib_plugin_registration \
+  __attribute__((__section__(".vlib_plugin_registration")))
+
+#endif /* __included_plugin_h__ */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h
new file mode 100644
index 00000000..4c8566b7
--- /dev/null
+++ b/src/vlib/unix/unix.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * unix.h: Unix specific main state
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_unix_unix_h
+#define included_unix_unix_h
+
+#include <vppinfra/file.h>
+#include <vppinfra/socket.h>
+#include <termios.h>
+
+/* One slot of the unix error history (see unix_save_error). */
+typedef struct
+{
+  /* vlib_time_now () at the moment the error was saved */
+  f64 time;
+  /* the saved error; freed when the slot is reused */
+  clib_error_t *error;
+} unix_error_history_t;
+
+/* Unix specific main state; the single global instance is unix_main. */
+typedef struct
+{
+  /* Back pointer to main structure. */
+  vlib_main_t *vlib_main;
+
+  u32 flags;
+  /* Run interactively or as daemon (background process). */
+#define UNIX_FLAG_INTERACTIVE (1 << 0)
+#define UNIX_FLAG_NODAEMON (1 << 1)
+
+  /* CLI listen socket. */
+  clib_socket_t cli_listen_socket;
+
+  /* Circular buffer of last unix errors. */
+  unix_error_history_t error_history[128];
+  /* Slot the next saved error will occupy; wraps to 0 at the end. */
+  u32 error_history_index;
+  /* Count of all errors ever saved, including overwritten ones. */
+  u64 n_total_errors;
+
+  /* startup-config filename */
+  u8 *startup_config_filename;
+
+  /* runtime directory path */
+  u8 *runtime_dir;
+
+  /* pidfile filename */
+  u8 *pidfile;
+
+  /* unix config complete */
+  volatile int unix_config_complete;
+
+  /* CLI log file. GIGO. */
+  u8 *log_filename;
+  int log_fd;
+
+  /* Don't put CLI connections into character mode */
+  int cli_line_mode;
+
+  /* Maximum amount of command line history to keep per session */
+  u32 cli_history_limit;
+
+  /* Suppress the welcome banner at CLI session start */
+  int cli_no_banner;
+
+  /* Maximum pager buffer size */
+  u32 cli_pager_buffer_limit;
+
+  /* Suppress the pager */
+  int cli_no_pager;
+
+  /* Store the original state of stdin when it's a tty */
+  struct termios tio_stdin;
+  int tio_isset;
+} unix_main_t;
+
+/* Global main structure. */
+extern unix_main_t unix_main;
+extern clib_file_main_t file_main;
+
+always_inline void
+unix_save_error (unix_main_t * um, clib_error_t * error)
+{
+  unix_error_history_t *eh = um->error_history + um->error_history_index;
+  clib_error_free_vector (eh->error);
+  eh->error = error;
+  eh->time = vlib_time_now (um->vlib_main);
+  um->n_total_errors += 1;
+  if (++um->error_history_index >= ARRAY_LEN (um->error_history))
+    um->error_history_index = 0;
+}
+
+/* Main function for Unix VLIB. */
+int vlib_unix_main (int argc, char *argv[]);
+
+clib_error_t *unix_physmem_init (vlib_main_t * vm);
+
+/* Set prompt for CLI. */
+void vlib_unix_cli_set_prompt (char *prompt);
+
+/* Accessor for the global unix_main. */
+static inline unix_main_t *
+vlib_unix_get_main (void)
+{
+  unix_main_t *um = &unix_main;
+
+  return um;
+}
+
+/* Return unix_main.runtime_dir as a char pointer. */
+static inline char *
+vlib_unix_get_runtime_dir (void)
+{
+  u8 *dir = unix_main.runtime_dir;
+
+  return (char *) dir;
+}
+
+/* thread stack array; vec_len = max number of threads */
+extern u8 **vlib_thread_stacks;
+
+/* utils */
+
+/* Call f (arg, "<dir_name>/<entry>", "<entry>") for each entry of dir_name
+   and stop at the first error f returns.  Directories are skipped unless
+   scan_dirs is set.  A non-existent dir_name is not an error. */
+clib_error_t *foreach_directory_file (char *dir_name,
+				      clib_error_t * (*f) (void *arg,
+							   u8 * path_name,
+							   u8 * file_name),
+				      void *arg, int scan_dirs);
+
+/* mkdir () path and every missing leading component of it (mode 0755). */
+clib_error_t *vlib_unix_recursive_mkdir (char *path);
+
+/* Resolve path against um->runtime_dir, create its parent directories and
+   return the NUL-terminated result in *full_path. */
+clib_error_t *vlib_unix_validate_runtime_file (unix_main_t * um,
+					       const char *path,
+					       u8 ** full_path);
+
+#endif /* included_unix_unix_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/unix/util.c b/src/vlib/unix/util.c
new file mode 100644
index 00000000..5472751e
--- /dev/null
+++ b/src/vlib/unix/util.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * util.c: Unix directory and runtime file utilities.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vlib/unix/unix.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <dirent.h>
+
+/*
+ * Invoke f once for every entry of directory dir_name.
+ *
+ * f receives arg, the entry's path ("<dir_name>/<entry>") and the bare
+ * entry name.  Both strings are vectors WITHOUT a trailing NUL, and their
+ * storage is reused for the next entry, so f must not keep the pointers.
+ *
+ * With scan_dirs == 0 entries whose d_type is DT_DIR are skipped; with
+ * scan_dirs != 0 only the "." and ".." directories are skipped.  The
+ * decision relies solely on d_type, so an entry reported with any other
+ * type is always passed to f.
+ *
+ * Returns 0 when the directory does not exist (ENOENT) or when all
+ * callbacks succeeded, a unix error when the directory cannot be opened,
+ * or the first error returned by f (iteration stops there).
+ */
+clib_error_t *
+foreach_directory_file (char *dir_name,
+			clib_error_t * (*f) (void *arg, u8 * path_name,
+					     u8 * file_name), void *arg,
+			int scan_dirs)
+{
+  DIR *d;
+  struct dirent *e;
+  clib_error_t *error = 0;
+  u8 *s, *t;
+
+  d = opendir (dir_name);
+  if (!d)
+    {
+      if (errno == ENOENT)
+	return 0;
+      return clib_error_return_unix (0, "open `%s'", dir_name);
+    }
+
+  s = t = 0;
+  while (1)
+    {
+      e = readdir (d);
+      if (!e)
+	break;
+      if (scan_dirs)
+	{
+	  if (e->d_type == DT_DIR
+	      && (!strcmp (e->d_name, ".") || !strcmp (e->d_name, "..")))
+	    continue;
+	}
+      else
+	{
+	  if (e->d_type == DT_DIR)
+	    continue;
+	}
+
+      s = format (s, "%s/%s", dir_name, e->d_name);
+      t = format (t, "%s", e->d_name);
+      error = f (arg, s, t);
+      /* Keep the allocations, just reset the lengths for the next entry. */
+      _vec_len (s) = 0;
+      _vec_len (t) = 0;
+
+      if (error)
+	break;
+    }
+
+  /* Both scratch vectors are ours; previously only s was released. */
+  vec_free (s);
+  vec_free (t);
+  closedir (d);
+
+  return error;
+}
+
+/* Create directory path together with every leading component of it.
+   Each prefix of path ending just before a '/' (except a leading '/') is
+   passed to mkdir () with mode 0755, then path itself.  EEXIST is not an
+   error.  Returns 0 on success or a unix error naming the component that
+   could not be created. */
+clib_error_t *
+vlib_unix_recursive_mkdir (char *path)
+{
+  clib_error_t *err = 0;
+  char *prefix = 0;
+  char *cur;
+
+  for (cur = path; *cur != 0; cur++)
+    {
+      if (prefix != 0 && *cur == '/')
+	{
+	  /* Terminate the prefix just long enough to hand it to mkdir. */
+	  vec_add1 (prefix, 0);
+	  if (mkdir (prefix, 0755) != 0 && errno != EEXIST)
+	    {
+	      err = clib_error_return_unix (0, "mkdir '%s'", prefix);
+	      goto done;
+	    }
+	  _vec_len (prefix) -= 1;
+	}
+      vec_add1 (prefix, *cur);
+    }
+
+  /* Finally the full path itself. */
+  if (mkdir (path, 0755) != 0 && errno != EEXIST)
+    err = clib_error_return_unix (0, "mkdir '%s'", path);
+
+done:
+  vec_free (prefix);
+
+  return err;
+}
+
+/*
+ * Turn path into a file name inside the runtime directory and make sure
+ * its parent directories exist.
+ *
+ * - An empty path is rejected.
+ * - Any ".." path component is rejected, wherever it appears.
+ * - An absolute path must be the runtime directory itself or lie below it;
+ *   a sibling that merely shares the prefix (e.g. "<runtime_dir>-x/f") is
+ *   rejected.
+ * - A relative path is appended to the runtime directory.
+ *
+ * On success *full_path receives a NUL-terminated vector owned by the
+ * caller and 0 is returned.  When the directories cannot be created the
+ * error is returned and *full_path is set to 0.  When path is rejected the
+ * error is returned and *full_path is left untouched.
+ */
+clib_error_t *
+vlib_unix_validate_runtime_file (unix_main_t * um,
+				 const char *path, u8 ** full_path)
+{
+  u8 *fp = 0;
+  char *last_slash = 0;
+  clib_error_t *error;
+  uword path_len = strlen (path);
+
+  if (path[0] == '\0')
+    {
+      return clib_error_return (0, "path is an empty string");
+    }
+  else if (strcmp (path, "..") == 0 || strncmp (path, "../", 3) == 0
+	   || strstr (path, "/../")
+	   || (path_len >= 3 && strcmp (path + path_len - 3, "/..") == 0))
+    {
+      /* ".." as the only, first, middle or last component */
+      return clib_error_return (0, "'..' not allowed in runtime path");
+    }
+  else if (path[0] == '/')
+    {
+      uword rt_len = strlen ((char *) um->runtime_dir);
+
+      /* Absolute path. Has to start with runtime directory, and the match
+         must end on a component boundary: either the runtime directory
+         ends in '/', or the next character of path is '/' or the end. */
+      if (strncmp ((char *) um->runtime_dir, path, rt_len) != 0
+	  || (rt_len > 0 && um->runtime_dir[rt_len - 1] != '/'
+	      && path[rt_len] != '/' && path[rt_len] != '\0'))
+	{
+	  return clib_error_return (0,
+				    "file %s is not in runtime directory %s",
+				    path, um->runtime_dir);
+	}
+      fp = format (0, "%s%c", path, '\0');
+    }
+  else
+    {
+      /* Relative path, just append to runtime */
+      fp = format (0, "%s/%s%c", um->runtime_dir, path, '\0');
+    }
+
+  /* We don't want to create a directory out of the last file */
+  if ((last_slash = strrchr ((char *) fp, '/')) != NULL)
+    *last_slash = '\0';
+
+  error = vlib_unix_recursive_mkdir ((char *) fp);
+
+  /* Restore the separator removed above. */
+  if (last_slash != NULL)
+    *last_slash = '/';
+
+  if (error)
+    vec_free (fp);
+
+  *full_path = fp;
+  return error;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/vlib.h b/src/vlib/vlib.h
new file mode 100644
index 00000000..eed5c5bc
--- /dev/null
+++ b/src/vlib/vlib.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * vlib.h: top-level include file
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_h
+#define included_vlib_h
+
+#include <vppinfra/clib.h>
+#include <vppinfra/elf_clib.h>
+
+/* Generic definitions. */
+#include <vlib/defs.h>
+
+/* Forward declarations of structs to avoid circular dependencies. */
+struct vlib_main_t;
+
+/* All includes in alphabetical order. */
+#include <vlib/physmem.h>
+#include <vlib/buffer.h>
+#include <vlib/cli.h>
+#include <vlib/counter.h>
+#include <vlib/error.h>
+#include <vlib/init.h>
+#include <vlib/mc.h>
+#include <vlib/node.h>
+#include <vlib/trace.h>
+
+/* Main include depends on other vlib/ includes so we put it last. */
+#include <vlib/main.h>
+
+/* Inline/extern function declarations. */
+#include <vlib/threads.h>
+#include <vlib/physmem_funcs.h>
+#include <vlib/buffer_funcs.h>
+#include <vlib/cli_funcs.h>
+#include <vlib/error_funcs.h>
+#include <vlib/format_funcs.h>
+#include <vlib/node_funcs.h>
+#include <vlib/trace_funcs.h>
+#include <vlib/global_funcs.h>
+
+#include <vlib/buffer_node.h>
+
+#endif /* included_vlib_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/vlib_process_doc.h b/src/vlib/vlib_process_doc.h
new file mode 100644
index 00000000..a47c5e4b
--- /dev/null
+++ b/src/vlib/vlib_process_doc.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+#error do not #include this file!
+
+/** \file
+
+    Cooperative multi-tasking thread support.
+
+    Vlib provides a lightweight cooperative multi-tasking thread
+    model. Context switching costs a setjmp/longjmp pair.  It's not
+    unreasonable to put vlib threads to sleep for 10us.
+
+    The graph node scheduler invokes these processes in much the same
+    way as traditional vector-processing run-to-completion graph
+    nodes; plus-or-minus a setjmp/longjmp pair required to switch
+    stacks. Simply set the vlib_node_registration_t type field to
+    VLIB_NODE_TYPE_PROCESS. Process is a misnomer; these are threads.
+
+    As of this writing, the default stack size is 1<<15;
+    32kb. Initialize the node registration's
+    process_log2_n_stack_bytes member as needed. The graph node
+    dispatcher makes some effort to detect stack overrun. We map a
+    no-access page below each thread stack.
+
+    Process node dispatch functions are expected to be while(1) { }
+    loops which suspend when not otherwise occupied, and which must
+    not run for unreasonably long periods of time.  Unreasonably long
+    is an application-dependent concept. Over the years, we have
+    constructed frame-size sensitive control-plane nodes which will
+    use a much higher fraction of the available CPU bandwidth when the
+    frame size is low. Classic example: modifying forwarding
+    tables. So long as the table-builder leaves the forwarding tables
+    in a valid state, one can suspend the table builder to avoid
+    dropping packets as a result of control-plane activity.
+
+    Process nodes can suspend for fixed amounts of time, or until another
+    entity signals an event, or both. See the example below.
+
+    When running in VLIB process context, one must pay strict attention to
+    loop invariant issues. If one walks a data structure and calls a
+    function which may suspend, one had best know by construction that it
+    cannot change. Often, it's best to simply make a snapshot copy of a
+    data structure, walk the copy at leisure, then free the copy.
+
+    Here's an example:
+
+    <code><pre>
+    \#define EXAMPLE_POLL_PERIOD 10.0
+
+    static uword
+    example_process (vlib_main_t * vm, vlib_node_runtime_t * rt,
+                     vlib_frame_t * f)
+    {
+      f64 poll_time_remaining;
+      uword event_type, *event_data = 0;
+
+      poll_time_remaining = EXAMPLE_POLL_PERIOD;
+      while (1)
+        {
+          int i;
+
+           // Sleep until next periodic call due,
+           // or until we receive event(s)
+           //
+          poll_time_remaining =
+    	    vlib_process_wait_for_event_or_clock (vm, poll_time_remaining);
+
+          event_type = vlib_process_get_events (vm, &event_data);
+          switch (event_type)
+     	    {
+       	    case ~0:		// no events => timeout
+      	      break;
+
+            case EVENT1:
+    	      for (i = 0; i < vec_len (event_data); i++)
+    	        handle_event1 (vm, event_data[i]);
+    	      break;
+
+    	    case EVENT2:
+    	      for (i = 0; i < vec_len (event_data); i++)
+    	        handle_event2 (vm, event_data[i]);
+    	      break;
+
+              // ... and so forth for each event type
+
+            default:
+              // This should never happen...
+    	      clib_warning ("BUG: unhandled event type %d",
+                            event_type);
+    	      break;
+      	    }
+          vec_reset_length (event_data);
+
+          // Timer expired, call periodic function
+          if (vlib_process_suspend_time_is_zero (poll_time_remaining))
+    	    {
+    	      example_periodic (vm);
+    	      poll_time_remaining = EXAMPLE_POLL_PERIOD;
+    	    }
+        }
+      // NOTREACHED
+      return 0;
+    }
+
+    static VLIB_REGISTER_NODE (example_node) = {
+      .function = example_process,
+      .type = VLIB_NODE_TYPE_PROCESS,
+      .name = "example-process",
+    };
+    </pre></code>
+
+    In this example, the VLIB process node waits for an event to
+    occur, or for 10 seconds to elapse. The code demuxes on the event
+    type, calling the appropriate handler function.
+
+    Each call to vlib_process_get_events returns a vector of
+    per-event-type data passed to successive vlib_process_signal_event
+    calls; vec_len (event_data) >= 1.  It is an error to process only
+    event_data[0].
+
+    Resetting the event_data vector-length to 0 by calling
+    vec_reset_length (event_data) - instead of calling vec_free (...)
+    - means that the event scheme doesn't burn cycles continuously
+    allocating and freeing the event data vector. This is a common
+    coding pattern, well worth using when appropriate.
+*/
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */