From 7cd468a3d7dee7d6c92f69a0bb7061ae208ec727 Mon Sep 17 00:00:00 2001
From: Damjan Marion <damarion@cisco.com>
Date: Mon, 19 Dec 2016 23:05:39 +0100
Subject: Reorganize source tree to use single autotools instance

Change-Id: I7b51f88292e057c6443b12224486f2d0c9f8ae23
Signed-off-by: Damjan Marion <damarion@cisco.com>
---
 src/vlib/threads.c | 1492 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1492 insertions(+)
 create mode 100644 src/vlib/threads.c

(limited to 'src/vlib/threads.c')

diff --git a/src/vlib/threads.c b/src/vlib/threads.c
new file mode 100644
index 00000000..c5e58bc0
--- /dev/null
+++ b/src/vlib/threads.c
@@ -0,0 +1,1492 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#define _GNU_SOURCE
+
+#include <signal.h>
+#include <math.h>
+#include <vppinfra/format.h>
+#include <vlib/vlib.h>
+
+#include <vlib/threads.h>
+#include <vlib/unix/cj.h>
+
+
+#if DPDK==1
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_eal.h>
+#include <rte_launch.h>
+#include <rte_lcore.h>
+#endif
+DECLARE_CJ_GLOBAL_LOG;
+
+#define FRAME_QUEUE_NELTS 32
+
+
+#if DPDK==1
+/*
+ *  Weak definitions of DPDK symbols used in this file.
+ *  Needed for linking test programs without DPDK libs.
+ */
+unsigned __thread __attribute__ ((weak)) RTE_PER_LCORE (_lcore_id);
+struct lcore_config __attribute__ ((weak)) lcore_config[];
+unsigned __attribute__ ((weak)) rte_socket_id ();
+int __attribute__ ((weak)) rte_eal_remote_launch ();
+#endif
+/*
+ * Return the number of elements in vector p, as reported by vec_len().
+ * NOTE(review): looks like a convenience wrapper for use from a
+ * debugger -- confirm.
+ */
+u32
+vl (void *p)
+{
+  u32 n_elts = vec_len (p);
+
+  return n_elts;
+}
+
+/* Vector of thread descriptors; element 0 is filled in for the main thread by vlib_thread_init(). */
+vlib_worker_thread_t *vlib_worker_threads;
+/* Thread subsystem state: registrations, cpu bitmaps and scheduler settings used throughout this file. */
+vlib_thread_main_t vlib_thread_main;
+
+/*
+ * Work out which vlib thread is executing by locating the current stack
+ * address relative to vlib_thread_stacks[0].
+ *
+ * Returns 0 when no thread stacks exist, or when the computed slot lies
+ * outside the stack vector.
+ */
+uword
+os_get_cpu_number (void)
+{
+  void *stack_addr;
+  uword slot;
+  u32 n_stacks = vec_len (vlib_thread_stacks);
+
+  if (n_stacks == 0)
+    return 0;
+
+  /* The address of any local variable lies within the current stack. */
+  stack_addr = &stack_addr;
+
+  slot = ((uword) stack_addr - (uword) vlib_thread_stacks[0])
+    >> VLIB_LOG2_THREAD_STACK_SIZE;
+
+  /* "processes" have their own stacks, and they always run in thread 0 */
+  if (slot >= n_stacks)
+    return 0;
+
+  return slot;
+}
+
+/*
+ * Report the number of vlib threads, taken as the length of
+ * vlib_thread_stacks; 1 is returned while that vector is still empty.
+ */
+uword
+os_get_ncpus (void)
+{
+  u32 n_stacks = vec_len (vlib_thread_stacks);
+
+  return n_stacks ? n_stacks : 1;
+}
+
+/*
+ * Give the calling thread the supplied name via pthread_setname_np().
+ * A non-zero result from that call is logged with clib_warning().
+ */
+void
+vlib_set_thread_name (char *name)
+{
+  int pthread_setname_np (pthread_t __target_thread, const char *__name);
+  pthread_t self = pthread_self ();
+  int rc;
+
+  if (!self)
+    return;
+
+  rc = pthread_setname_np (self, name);
+  if (rc != 0)
+    clib_warning ("pthread_setname_np returned %d", rc);
+}
+
+/*
+ * Comparator handed to vec_sort_with_function(): orders thread
+ * registrations by ascending no_data_structure_clone value.
+ */
+static int
+sort_registrations_by_no_clone (void *a0, void *a1)
+{
+  vlib_thread_registration_t *reg0 = *(vlib_thread_registration_t **) a0;
+  vlib_thread_registration_t *reg1 = *(vlib_thread_registration_t **) a1;
+  i32 flag0 = (i32) reg0->no_data_structure_clone;
+  i32 flag1 = (i32) reg1->no_data_structure_clone;
+
+  return flag0 - flag1;
+}
+
+/*
+ * Read the first line (at most 255 characters) of the named file and
+ * parse it with unformat_bitmap_list.
+ *
+ * Returns the parsed bitmap; 0 is returned when the file cannot be
+ * opened or no line could be read.  A parse failure is logged.
+ */
+static uword *
+vlib_sysfs_list_to_bitmap (char *filename)
+{
+  enum { LINE_BYTES = 256 };
+  uword *bitmap = 0;
+  u8 *line = 0;
+  FILE *fp = fopen (filename, "r");
+
+  if (fp == NULL)
+    return bitmap;
+
+  vec_validate (line, LINE_BYTES - 1);
+  if (fgets ((char *) line, LINE_BYTES, fp) != NULL)
+    {
+      unformat_input_t in;
+
+      unformat_init_string (&in, (char *) line, strlen ((char *) line));
+      if (unformat (&in, "%U", unformat_bitmap_list, &bitmap) != 1)
+	clib_warning ("unformat_bitmap_list failed");
+      unformat_free (&in);
+    }
+  vec_free (line);
+  fclose (fp);
+
+  return bitmap;
+}
+
+
+/* Called early in the init sequence */
+
+/*
+ * Assign cpu cores to the main thread and to every registered thread
+ * type, and size vlib_worker_threads accordingly.
+ *
+ * Reads the online cpu / node lists from sysfs, removes tm->skip_cores
+ * cores and the main core from the available set, pins the main thread
+ * (non-DPDK builds only), applies the configured scheduler policy and
+ * priority to the main thread, then gives each non-pthread registration
+ * either its configured coremask or the next free cores.
+ *
+ * Returns 0 on success, or a clib_error_t when a requested core is not
+ * available.
+ *
+ * NOTE(review): avail_cpu is not freed on the error-return paths.
+ */
+clib_error_t *
+vlib_thread_init (vlib_main_t * vm)
+{
+  vlib_thread_main_t *tm = &vlib_thread_main;
+  vlib_worker_thread_t *w;
+  vlib_thread_registration_t *tr;
+  u32 n_vlib_mains = 1;
+  u32 first_index = 1;
+  u32 i;
+  uword *avail_cpu;
+
+  /* get bitmaps of active cpu cores and sockets */
+  tm->cpu_core_bitmap =
+    vlib_sysfs_list_to_bitmap ("/sys/devices/system/cpu/online");
+  tm->cpu_socket_bitmap =
+    vlib_sysfs_list_to_bitmap ("/sys/devices/system/node/online");
+
+  /* Working copy: a core is cleared from it as soon as it is handed out */
+  avail_cpu = clib_bitmap_dup (tm->cpu_core_bitmap);
+
+  /* skip cores */
+  for (i = 0; i < tm->skip_cores; i++)
+    {
+      uword c = clib_bitmap_first_set (avail_cpu);
+      if (c == ~0)
+	return clib_error_return (0, "no available cpus to skip");
+
+      avail_cpu = clib_bitmap_set (avail_cpu, c, 0);
+    }
+
+  /* grab cpu for main thread */
+  if (!tm->main_lcore)
+    {
+      tm->main_lcore = clib_bitmap_first_set (avail_cpu);
+      /* NOTE(review): the (u8) cast suggests main_lcore is 8 bits wide -- confirm */
+      if (tm->main_lcore == (u8) ~ 0)
+	return clib_error_return (0, "no available cpus to be used for the"
+				  " main thread");
+    }
+  else
+    {
+      if (clib_bitmap_get (avail_cpu, tm->main_lcore) == 0)
+	return clib_error_return (0, "cpu %u is not available to be used"
+				  " for the main thread", tm->main_lcore);
+    }
+  avail_cpu = clib_bitmap_set (avail_cpu, tm->main_lcore, 0);
+
+  /* assume that there is socket 0 only if there is no data from sysfs */
+  if (!tm->cpu_socket_bitmap)
+    tm->cpu_socket_bitmap = clib_bitmap_set (0, 0, 1);
+
+  /* pin main thread to main_lcore  */
+#if DPDK==0
+  {
+    cpu_set_t cpuset;
+    CPU_ZERO (&cpuset);
+    CPU_SET (tm->main_lcore, &cpuset);
+    pthread_setaffinity_np (pthread_self (), sizeof (cpu_set_t), &cpuset);
+  }
+#endif
+
+  /* as many threads as stacks... */
+  vec_validate_aligned (vlib_worker_threads, vec_len (vlib_thread_stacks) - 1,
+			CLIB_CACHE_LINE_BYTES);
+
+  /* Preallocate thread 0 */
+  _vec_len (vlib_worker_threads) = 1;
+  w = vlib_worker_threads;
+  w->thread_mheap = clib_mem_get_heap ();
+  w->thread_stack = vlib_thread_stacks[0];
+  w->lcore_id = tm->main_lcore;
+  w->lwp = syscall (SYS_gettid);
+  w->thread_id = pthread_self ();
+  tm->n_vlib_mains = 1;
+
+  /* ~0 means "not configured" for both the policy and the priority */
+  if (tm->sched_policy != ~0)
+    {
+      struct sched_param sched_param;
+      if (!sched_getparam (w->lwp, &sched_param))
+	{
+	  if (tm->sched_priority != ~0)
+	    sched_param.sched_priority = tm->sched_priority;
+	  sched_setscheduler (w->lwp, tm->sched_policy, &sched_param);
+	}
+    }
+
+  /* assign threads to cores and set n_vlib_mains */
+  tr = tm->next;
+
+  /* Flatten the registration linked list into a vector so it can be sorted */
+  while (tr)
+    {
+      vec_add1 (tm->registrations, tr);
+      tr = tr->next;
+    }
+
+  vec_sort_with_function (tm->registrations, sort_registrations_by_no_clone);
+
+  for (i = 0; i < vec_len (tm->registrations); i++)
+    {
+      int j;
+      tr = tm->registrations[i];
+      tr->first_index = first_index;
+      first_index += tr->count;
+      /* only registrations that clone data structures get a vlib_main */
+      n_vlib_mains += (tr->no_data_structure_clone == 0) ? tr->count : 0;
+
+      /* construct coremask */
+      if (tr->use_pthreads || !tr->count)
+	continue;
+
+      if (tr->coremask)
+	{
+	  /* configured coremask: every requested core must still be free */
+	  uword c;
+          /* *INDENT-OFF* */
+          clib_bitmap_foreach (c, tr->coremask, ({
+            if (clib_bitmap_get(avail_cpu, c) == 0)
+              return clib_error_return (0, "cpu %u is not available to be used"
+                                        " for the '%s' thread",c, tr->name);
+
+            avail_cpu = clib_bitmap_set(avail_cpu, c, 0);
+          }));
+/* *INDENT-ON* */
+
+	}
+      else
+	{
+	  /* no coremask configured: take the lowest free cores, one per thread */
+	  for (j = 0; j < tr->count; j++)
+	    {
+	      uword c = clib_bitmap_first_set (avail_cpu);
+	      if (c == ~0)
+		return clib_error_return (0,
+					  "no available cpus to be used for"
+					  " the '%s' thread", tr->name);
+
+	      avail_cpu = clib_bitmap_set (avail_cpu, c, 0);
+	      tr->coremask = clib_bitmap_set (tr->coremask, c, 1);
+	    }
+	}
+    }
+
+  clib_bitmap_free (avail_cpu);
+
+  tm->n_vlib_mains = n_vlib_mains;
+
+  /* first_index is now 1 + the total number of registered threads */
+  vec_validate_aligned (vlib_worker_threads, first_index - 1,
+			CLIB_CACHE_LINE_BYTES);
+
+  return 0;
+}
+
+/*
+ * Append one descriptor to vlib_worker_threads and bind it to the thread
+ * stack at the same index.  Logs a warning and exits the process when
+ * every available stack already has a descriptor.
+ */
+vlib_worker_thread_t *
+vlib_alloc_thread (vlib_main_t * vm)
+{
+  vlib_worker_thread_t *new_thread;
+
+  if (vec_len (vlib_thread_stacks) <= vec_len (vlib_worker_threads))
+    {
+      clib_warning ("out of worker threads... Quitting...");
+      exit (1);
+    }
+
+  vec_add2 (vlib_worker_threads, new_thread, 1);
+  new_thread->thread_stack =
+    vlib_thread_stacks[new_thread - vlib_worker_threads];
+
+  return new_thread;
+}
+
+/*
+ * Allocate and zero a cache-line aligned frame queue with nelts ring
+ * elements.
+ *
+ * nelts must be a positive power of 2; the process aborts otherwise,
+ * before anything is allocated.  Alignment problems in the queue layout
+ * are reported on stderr but are not fatal.
+ *
+ * Returns the new queue.
+ */
+vlib_frame_queue_t *
+vlib_frame_queue_alloc (int nelts)
+{
+  vlib_frame_queue_t *fq;
+
+  /* Reject bad sizes up front rather than after sizing the element vector */
+  if (nelts <= 0 || (nelts & (nelts - 1)))
+    {
+      fformat (stderr, "FATAL: nelts MUST be a power of 2\n");
+      abort ();
+    }
+
+  fq = clib_mem_alloc_aligned (sizeof (*fq), CLIB_CACHE_LINE_BYTES);
+  memset (fq, 0, sizeof (*fq));
+  fq->nelts = nelts;
+  fq->vector_threshold = 128;	// packets
+  vec_validate_aligned (fq->elts, nelts - 1, CLIB_CACHE_LINE_BYTES);
+
+  /* Layout sanity checks: warn, but carry on */
+  if (((uword) & fq->tail) & (CLIB_CACHE_LINE_BYTES - 1))
+    fformat (stderr, "WARNING: fq->tail unaligned\n");
+  if (((uword) & fq->head) & (CLIB_CACHE_LINE_BYTES - 1))
+    fformat (stderr, "WARNING: fq->head unaligned\n");
+  if (((uword) fq->elts) & (CLIB_CACHE_LINE_BYTES - 1))
+    fformat (stderr, "WARNING: fq->elts unaligned\n");
+
+  /* sizeof yields size_t; cast so the argument matches the %d conversion */
+  if (sizeof (fq->elts[0]) % CLIB_CACHE_LINE_BYTES)
+    fformat (stderr, "WARNING: fq->elts[0] size %d\n",
+	     (int) sizeof (fq->elts[0]));
+
+  return (fq);
+}
+
+/*
+ * Weak, do-nothing default for vl_msg_api_handler_no_free(); a non-weak
+ * definition linked in from elsewhere takes precedence over this stub.
+ */
+void vl_msg_api_handler_no_free (void *) __attribute__ ((weak));
+void
+vl_msg_api_handler_no_free (void *v)
+{
+}
+
+/* Turned off, save as reference material... */
+#if 0
+/*
+ * Disabled reference code (inside #if 0): drain the frame queue that
+ * belongs to thread_id.  Each valid element is handled according to its
+ * msg_type, marked invalid, and the queue head is advanced.
+ *
+ * Returns the number of elements processed.
+ */
+static inline int
+vlib_frame_queue_dequeue_internal (int thread_id,
+				   vlib_main_t * vm, vlib_node_main_t * nm)
+{
+  vlib_frame_queue_t *fq = vlib_frame_queues[thread_id];
+  vlib_frame_queue_elt_t *elt;
+  vlib_frame_t *f;
+  vlib_pending_frame_t *p;
+  vlib_node_runtime_t *r;
+  u32 node_runtime_index;
+  int msg_type;
+  u64 before;
+  int processed = 0;
+
+  ASSERT (vm == vlib_mains[thread_id]);
+
+  while (1)
+    {
+      /* head == tail: nothing queued */
+      if (fq->head == fq->tail)
+	return processed;
+
+      elt = fq->elts + ((fq->head + 1) & (fq->nelts - 1));
+
+      /* slot claimed by a producer but not yet published */
+      if (!elt->valid)
+	return processed;
+
+      before = clib_cpu_time_now ();
+
+      f = elt->frame;
+      node_runtime_index = elt->node_runtime_index;
+      msg_type = elt->msg_type;
+
+      switch (msg_type)
+	{
+	case VLIB_FRAME_QUEUE_ELT_FREE_BUFFERS:
+	  vlib_buffer_free (vm, vlib_frame_vector_args (f), f->n_vectors);
+	  /* note fallthrough... */
+	case VLIB_FRAME_QUEUE_ELT_FREE_FRAME:
+	  r = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL],
+				node_runtime_index);
+	  vlib_frame_free (vm, r, f);
+	  break;
+	case VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME:
+	  vec_add2 (vm->node_main.pending_frames, p, 1);
+	  f->flags |= (VLIB_FRAME_PENDING | VLIB_FRAME_FREE_AFTER_DISPATCH);
+	  p->node_runtime_index = elt->node_runtime_index;
+	  p->frame_index = vlib_frame_index (vm, f);
+	  p->next_frame_index = VLIB_PENDING_FRAME_NO_NEXT_FRAME;
+	  fq->dequeue_vectors += (u64) f->n_vectors;
+	  break;
+	case VLIB_FRAME_QUEUE_ELT_API_MSG:
+	  vl_msg_api_handler_no_free (f);
+	  break;
+	default:
+	  clib_warning ("bogus frame queue message, type %d", msg_type);
+	  break;
+	}
+      elt->valid = 0;
+      fq->dequeues++;
+      fq->dequeue_ticks += clib_cpu_time_now () - before;
+      /* make the cleared valid flag visible before the head moves on */
+      CLIB_MEMORY_BARRIER ();
+      fq->head++;
+      processed++;
+    }
+  ASSERT (0);
+  return processed;
+}
+
+/*
+ * Disabled reference code (inside #if 0): non-inline entry point that
+ * forwards to vlib_frame_queue_dequeue_internal() and returns its result.
+ */
+int
+vlib_frame_queue_dequeue (int thread_id,
+			  vlib_main_t * vm, vlib_node_main_t * nm)
+{
+  return vlib_frame_queue_dequeue_internal (thread_id, vm, nm);
+}
+
+/*
+ * Disabled reference code (inside #if 0): claim the next tail slot of
+ * frame queue frame_queue_index, spin until the ring has room and the
+ * slot is free, then publish frame with the given node runtime index and
+ * message type.
+ *
+ * Returns frame->n_vectors as sampled before the element is marked valid.
+ */
+int
+vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index,
+			  u32 frame_queue_index, vlib_frame_t * frame,
+			  vlib_frame_queue_msg_type_t type)
+{
+  vlib_frame_queue_t *fq = vlib_frame_queues[frame_queue_index];
+  vlib_frame_queue_elt_t *elt;
+  u32 save_count;
+  u64 new_tail;
+  u64 before = clib_cpu_time_now ();
+
+  ASSERT (fq);
+
+  /* atomically reserve a slot number */
+  new_tail = __sync_add_and_fetch (&fq->tail, 1);
+
+  /* Wait until a ring slot is available */
+  while (new_tail >= fq->head + fq->nelts)
+    {
+      f64 b4 = vlib_time_now_ticks (vm, before);
+      vlib_worker_thread_barrier_check (vm, b4);
+      /* Bad idea. Dequeue -> enqueue -> dequeue -> trouble */
+      // vlib_frame_queue_dequeue (vm->cpu_index, vm, nm);
+    }
+
+  elt = fq->elts + (new_tail & (fq->nelts - 1));
+
+  /* this would be very bad... */
+  while (elt->valid)
+    {
+    }
+
+  /* Once we enqueue the frame, frame->n_vectors is owned elsewhere... */
+  save_count = frame->n_vectors;
+
+  elt->frame = frame;
+  elt->node_runtime_index = node_runtime_index;
+  elt->msg_type = type;
+  /* element contents are written before the valid flag is set */
+  CLIB_MEMORY_BARRIER ();
+  elt->valid = 1;
+
+  return save_count;
+}
+#endif /* 0 */
+
+/*
+ * To be called by vlib worker threads upon startup.
+ *
+ * Switches the calling thread to the heap recorded in w, names the thread
+ * "<prefix>_<short_name>_<instance>" when both a thread prefix and a
+ * registration short name exist, and, for registrations that do not use
+ * pthreads, checks in at the initial barrier and spins until
+ * wait_at_barrier is cleared.
+ */
+void
+vlib_worker_thread_init (vlib_worker_thread_t * w)
+{
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+
+  /*
+   * Note: disabling signals in worker threads as follows
+   * prevents the api post-mortem dump scheme from working
+   * {
+   *    sigset_t s;
+   *    sigfillset (&s);
+   *    pthread_sigmask (SIG_SETMASK, &s, 0);
+   *  }
+   */
+
+  clib_mem_set_heap (w->thread_mheap);
+
+  if (vec_len (tm->thread_prefix) && w->registration->short_name)
+    {
+      /* the trailing '\0' makes the vector usable as a C string */
+      w->name = format (0, "%v_%s_%d%c", tm->thread_prefix,
+			w->registration->short_name, w->instance_id, '\0');
+      vlib_set_thread_name ((char *) w->name);
+    }
+
+  if (!w->registration->use_pthreads)
+    {
+
+      /* Initial barrier sync, for both worker and i/o threads */
+      clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, 1);
+
+      /* busy-wait until the barrier flag drops to 0 */
+      while (*vlib_worker_threads->wait_at_barrier)
+	;
+
+      clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, -1);
+    }
+}
+
+/*
+ * Thread entry point.  Records the kernel thread id and the pthread
+ * handle in the descriptor passed as arg, then uses clib_calljmp() to run
+ * w->thread_function with arg on the descriptor's own stack.
+ */
+void *
+vlib_worker_thread_bootstrap_fn (void *arg)
+{
+  vlib_worker_thread_t *w = arg;
+  uword result;
+
+  w->thread_id = pthread_self ();
+  w->lwp = syscall (SYS_gettid);
+
+  result = clib_calljmp ((uword (*)(uword)) w->thread_function,
+			 (uword) arg,
+			 w->thread_stack + VLIB_THREAD_STACK_SIZE);
+
+  /* NOTREACHED, we hope */
+  return (void *) result;
+}
+
+/*
+ * Start the thread described by w on core lcore_id, running fp.
+ *
+ * In DPDK builds, registrations that do not use pthreads are launched
+ * with rte_eal_remote_launch() (or fail with -1 when DPDK is not linked);
+ * everything else gets a pthread whose affinity is set to lcore_id.
+ *
+ * Returns 0 on success and a negative value on failure.  The pthread
+ * calls report errors as positive error numbers, so those are negated
+ * here; the caller in this file only treats "< 0" as a failure.
+ */
+static int
+vlib_launch_thread (void *fp, vlib_worker_thread_t * w, unsigned lcore_id)
+{
+  void *(*fp_arg) (void *) = fp;
+
+  w->lcore_id = lcore_id;
+#if DPDK==1
+  if (!w->registration->use_pthreads)
+    if (rte_eal_remote_launch)	/* do we have dpdk linked */
+      return rte_eal_remote_launch (fp, (void *) w, lcore_id);
+    else
+      return -1;
+  else
+#endif
+    {
+      int ret;
+      pthread_t worker;
+      cpu_set_t cpuset;
+      CPU_ZERO (&cpuset);
+      CPU_SET (lcore_id, &cpuset);
+
+      ret = pthread_create (&worker, NULL /* attr */ , fp_arg, (void *) w);
+      if (ret == 0)
+	ret = pthread_setaffinity_np (worker, sizeof (cpu_set_t), &cpuset);
+
+      /* normalise positive pthread error numbers to negative */
+      return ret > 0 ? -ret : ret;
+    }
+}
+
+/*
+ * Build the worker thread descriptors, clone the per-thread vlib_main_t
+ * state where required, and launch the threads.  Registered through
+ * VLIB_MAIN_LOOP_ENTER_FUNCTION below.
+ *
+ * For every registration that wants its own data structures, the main
+ * vlib_main_t is copied and its node, frame, error-counter and buffer
+ * free-list state is forked for the new thread.  Threads are then started
+ * either as pthreads (on lcore 0) or on the cores listed in the
+ * registration's coremask, and an initial barrier sync / release is
+ * performed.
+ *
+ * Always returns 0; launch failures are only logged.
+ */
+static clib_error_t *
+start_workers (vlib_main_t * vm)
+{
+  int i, j;
+  vlib_worker_thread_t *w;
+  vlib_main_t *vm_clone;
+  void *oldheap;
+  vlib_thread_main_t *tm = &vlib_thread_main;
+  vlib_thread_registration_t *tr;
+  vlib_node_runtime_t *rt;
+  u32 n_vlib_mains = tm->n_vlib_mains;
+  u32 worker_thread_index;
+  u8 *main_heap = clib_mem_get_per_cpu_heap ();
+  mheap_t *main_heap_header = mheap_header (main_heap);
+
+  /* start over: descriptors are re-added below, main thread first */
+  vec_reset_length (vlib_worker_threads);
+
+  /* Set up the main thread */
+  vec_add2_aligned (vlib_worker_threads, w, 1, CLIB_CACHE_LINE_BYTES);
+  w->elog_track.name = "main thread";
+  elog_track_register (&vm->elog_main, &w->elog_track);
+
+  if (vec_len (tm->thread_prefix))
+    {
+      w->name = format (0, "%v_main%c", tm->thread_prefix, '\0');
+      vlib_set_thread_name ((char *) w->name);
+    }
+
+  /*
+   * Truth of the matter: we always use at least two
+   * threads. So, make the main heap thread-safe
+   * and make the event log thread-safe.
+   */
+  main_heap_header->flags |= MHEAP_FLAG_THREAD_SAFE;
+  vm->elog_main.lock =
+    clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES);
+  vm->elog_main.lock[0] = 0;
+
+  if (n_vlib_mains > 1)
+    {
+      /* reserve room for all vlib_mains, then refill starting with vm */
+      vec_validate (vlib_mains, tm->n_vlib_mains - 1);
+      _vec_len (vlib_mains) = 0;
+      vec_add1 (vlib_mains, vm);
+
+      vlib_worker_threads->wait_at_barrier =
+	clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES);
+      vlib_worker_threads->workers_at_barrier =
+	clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES);
+
+      /* Ask for an initial barrier sync */
+      *vlib_worker_threads->workers_at_barrier = 0;
+      *vlib_worker_threads->wait_at_barrier = 1;
+
+      worker_thread_index = 1;
+
+      for (i = 0; i < vec_len (tm->registrations); i++)
+	{
+	  vlib_node_main_t *nm, *nm_clone;
+	  vlib_buffer_main_t *bm_clone;
+	  vlib_buffer_free_list_t *fl_clone, *fl_orig;
+	  vlib_buffer_free_list_t *orig_freelist_pool;
+	  int k;
+
+	  tr = tm->registrations[i];
+
+	  if (tr->count == 0)
+	    continue;
+
+	  for (k = 0; k < tr->count; k++)
+	    {
+	      vec_add2 (vlib_worker_threads, w, 1);
+	      /* private heap when the registration asks for one, else share main's */
+	      if (tr->mheap_size)
+		w->thread_mheap =
+		  mheap_alloc (0 /* use VM */ , tr->mheap_size);
+	      else
+		w->thread_mheap = main_heap;
+	      w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads];
+	      w->thread_function = tr->function;
+	      w->thread_function_arg = w;
+	      w->instance_id = k;
+	      w->registration = tr;
+
+	      w->elog_track.name =
+		(char *) format (0, "%s %d", tr->name, k + 1);
+	      vec_add1 (w->elog_track.name, 0);
+	      elog_track_register (&vm->elog_main, &w->elog_track);
+
+	      /* such threads get a descriptor but no vlib_main clone */
+	      if (tr->no_data_structure_clone)
+		continue;
+
+	      /* Fork vlib_global_main et al. Look for bugs here */
+	      oldheap = clib_mem_set_heap (w->thread_mheap);
+
+	      vm_clone = clib_mem_alloc (sizeof (*vm_clone));
+	      clib_memcpy (vm_clone, vlib_mains[0], sizeof (*vm_clone));
+
+	      vm_clone->cpu_index = worker_thread_index;
+	      vm_clone->heap_base = w->thread_mheap;
+	      vm_clone->mbuf_alloc_list = 0;
+	      memset (&vm_clone->random_buffer, 0,
+		      sizeof (vm_clone->random_buffer));
+
+	      nm = &vlib_mains[0]->node_main;
+	      nm_clone = &vm_clone->node_main;
+	      /* fork next frames array, preserving node runtime indices */
+	      nm_clone->next_frames = vec_dup (nm->next_frames);
+	      for (j = 0; j < vec_len (nm_clone->next_frames); j++)
+		{
+		  vlib_next_frame_t *nf = &nm_clone->next_frames[j];
+		  u32 save_node_runtime_index;
+		  u32 save_flags;
+
+		  save_node_runtime_index = nf->node_runtime_index;
+		  save_flags = nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH;
+		  vlib_next_frame_init (nf);
+		  nf->node_runtime_index = save_node_runtime_index;
+		  nf->flags = save_flags;
+		}
+
+	      /* fork the frame dispatch queue */
+	      nm_clone->pending_frames = 0;
+	      vec_validate (nm_clone->pending_frames, 10);	/* $$$$$?????? */
+	      _vec_len (nm_clone->pending_frames) = 0;
+
+	      /* fork nodes */
+	      nm_clone->nodes = 0;
+	      for (j = 0; j < vec_len (nm->nodes); j++)
+		{
+		  vlib_node_t *n;
+		  n = clib_mem_alloc_no_fail (sizeof (*n));
+		  clib_memcpy (n, nm->nodes[j], sizeof (*n));
+		  /* none of the copied nodes have enqueue rights given out */
+		  n->owner_node_index = VLIB_INVALID_NODE_INDEX;
+		  memset (&n->stats_total, 0, sizeof (n->stats_total));
+		  memset (&n->stats_last_clear, 0,
+			  sizeof (n->stats_last_clear));
+		  vec_add1 (nm_clone->nodes, n);
+		}
+	      nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] =
+		vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]);
+
+	      nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] =
+		vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]);
+	      vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT])
+		rt->cpu_index = vm_clone->cpu_index;
+
+	      nm_clone->processes = vec_dup (nm->processes);
+
+	      /* zap the (per worker) frame freelists, etc */
+	      nm_clone->frame_sizes = 0;
+	      nm_clone->frame_size_hash = 0;
+
+	      /* Packet trace buffers are guaranteed to be empty, nothing to do here */
+
+	      clib_mem_set_heap (oldheap);
+	      vec_add1 (vlib_mains, vm_clone);
+
+	      vm_clone->error_main.counters =
+		vec_dup (vlib_mains[0]->error_main.counters);
+	      vm_clone->error_main.counters_last_clear =
+		vec_dup (vlib_mains[0]->error_main.counters_last_clear);
+
+	      /* Fork the vlib_buffer_main_t free lists, etc. */
+	      bm_clone = vec_dup (vm_clone->buffer_main);
+	      vm_clone->buffer_main = bm_clone;
+
+	      orig_freelist_pool = bm_clone->buffer_free_list_pool;
+	      bm_clone->buffer_free_list_pool = 0;
+
+            /* *INDENT-OFF* */
+            pool_foreach (fl_orig, orig_freelist_pool,
+                          ({
+                            pool_get_aligned (bm_clone->buffer_free_list_pool,
+                                              fl_clone, CLIB_CACHE_LINE_BYTES);
+                            ASSERT (fl_orig - orig_freelist_pool
+                                    == fl_clone - bm_clone->buffer_free_list_pool);
+
+                            fl_clone[0] = fl_orig[0];
+                            fl_clone->aligned_buffers = 0;
+                            fl_clone->unaligned_buffers = 0;
+                            fl_clone->n_alloc = 0;
+                          }));
+/* *INDENT-ON* */
+
+	      worker_thread_index++;
+	    }
+	}
+    }
+  else
+    {
+      /* only have non-data-structure copy threads to create... */
+      for (i = 0; i < vec_len (tm->registrations); i++)
+	{
+	  tr = tm->registrations[i];
+
+	  for (j = 0; j < tr->count; j++)
+	    {
+	      vec_add2 (vlib_worker_threads, w, 1);
+	      if (tr->mheap_size)
+		w->thread_mheap =
+		  mheap_alloc (0 /* use VM */ , tr->mheap_size);
+	      else
+		w->thread_mheap = main_heap;
+	      w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads];
+	      w->thread_function = tr->function;
+	      w->thread_function_arg = w;
+	      w->instance_id = j;
+	      w->elog_track.name =
+		(char *) format (0, "%s %d", tr->name, j + 1);
+	      w->registration = tr;
+	      vec_add1 (w->elog_track.name, 0);
+	      elog_track_register (&vm->elog_main, &w->elog_track);
+	    }
+	}
+    }
+
+  /* Second pass: launch the threads, walking the descriptors in creation order */
+  worker_thread_index = 1;
+
+  for (i = 0; i < vec_len (tm->registrations); i++)
+    {
+      int j;
+
+      tr = tm->registrations[i];
+
+      if (tr->use_pthreads || tm->use_pthreads)
+	{
+	  for (j = 0; j < tr->count; j++)
+	    {
+	      w = vlib_worker_threads + worker_thread_index++;
+	      if (vlib_launch_thread (vlib_worker_thread_bootstrap_fn, w, 0) <
+		  0)
+		clib_warning ("Couldn't start '%s' pthread ", tr->name);
+	    }
+	}
+      else
+	{
+	  /* one thread per core set in the registration's coremask */
+	  uword c;
+            /* *INDENT-OFF* */
+            clib_bitmap_foreach (c, tr->coremask, ({
+              w = vlib_worker_threads + worker_thread_index++;
+              if (vlib_launch_thread (vlib_worker_thread_bootstrap_fn, w, c) < 0)
+                clib_warning ("Couldn't start DPDK lcore %d", c);
+
+            }));
+/* *INDENT-ON* */
+	}
+    }
+  vlib_worker_thread_barrier_sync (vm);
+  vlib_worker_thread_barrier_release (vm);
+  return 0;
+}
+
+VLIB_MAIN_LOOP_ENTER_FUNCTION (start_workers);
+
+/*
+ * Bring every worker's cloned node graph back in line with the main
+ * thread's graph.
+ *
+ * Must run on thread 0 with the worker barrier held (both are asserted).
+ * Node statistics are synced first so that nothing is lost.  Then, for
+ * each worker, the error counters are resized and the next-frame vector,
+ * the node clones and the internal / input runtime vectors are rebuilt
+ * from the main copy.  Existing per-worker stats and node state are
+ * carried over; nodes that are new to the worker start with zeroed stats.
+ *
+ * Does nothing when vlib_mains is empty.
+ */
+void
+vlib_worker_thread_node_runtime_update (void)
+{
+  int i, j;
+  vlib_worker_thread_t *w;
+  vlib_main_t *vm;
+  vlib_node_main_t *nm, *nm_clone;
+  vlib_node_t **old_nodes_clone;
+  vlib_main_t *vm_clone;
+  vlib_node_runtime_t *rt, *old_rt;
+  void *oldheap;
+  never_inline void
+    vlib_node_runtime_sync_stats (vlib_main_t * vm,
+				  vlib_node_runtime_t * r,
+				  uword n_calls,
+				  uword n_vectors, uword n_clocks);
+
+  ASSERT (os_get_cpu_number () == 0);
+
+  if (vec_len (vlib_mains) == 0)
+    return;
+
+  vm = vlib_mains[0];
+  nm = &vm->node_main;
+
+  ASSERT (*vlib_worker_threads->wait_at_barrier == 1);
+
+  /*
+   * Scrape all runtime stats, so we don't lose node runtime(s) with
+   * pending counts, or throw away worker / io thread counts.
+   */
+  for (j = 0; j < vec_len (nm->nodes); j++)
+    {
+      vlib_node_t *n;
+      n = nm->nodes[j];
+      vlib_node_sync_stats (vm, n);
+    }
+
+  for (i = 1; i < vec_len (vlib_mains); i++)
+    {
+      vlib_node_t *n;
+
+      vm_clone = vlib_mains[i];
+      nm_clone = &vm_clone->node_main;
+
+      for (j = 0; j < vec_len (nm_clone->nodes); j++)
+	{
+	  n = nm_clone->nodes[j];
+
+	  rt = vlib_node_get_runtime (vm_clone, n->index);
+	  vlib_node_runtime_sync_stats (vm_clone, rt, 0, 0, 0);
+	}
+    }
+
+  for (i = 1; i < vec_len (vlib_mains); i++)
+    {
+      w = vlib_worker_threads + i;
+      /* allocate the rebuilt structures from the worker's own heap */
+      oldheap = clib_mem_set_heap (w->thread_mheap);
+
+      vm_clone = vlib_mains[i];
+
+      /* Re-clone error heap */
+      u64 *old_counters = vm_clone->error_main.counters;
+      u64 *old_counters_all_clear = vm_clone->error_main.counters_last_clear;
+      clib_memcpy (&vm_clone->error_main, &vm->error_main,
+		   sizeof (vm->error_main));
+      /* keep the worker's own counter vectors, grown to the main length */
+      j = vec_len (vm->error_main.counters) - 1;
+      vec_validate_aligned (old_counters, j, CLIB_CACHE_LINE_BYTES);
+      vec_validate_aligned (old_counters_all_clear, j, CLIB_CACHE_LINE_BYTES);
+      vm_clone->error_main.counters = old_counters;
+      vm_clone->error_main.counters_last_clear = old_counters_all_clear;
+
+      nm_clone = &vm_clone->node_main;
+      vec_free (nm_clone->next_frames);
+      nm_clone->next_frames = vec_dup (nm->next_frames);
+
+      for (j = 0; j < vec_len (nm_clone->next_frames); j++)
+	{
+	  vlib_next_frame_t *nf = &nm_clone->next_frames[j];
+	  u32 save_node_runtime_index;
+	  u32 save_flags;
+
+	  save_node_runtime_index = nf->node_runtime_index;
+	  save_flags = nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH;
+	  vlib_next_frame_init (nf);
+	  nf->node_runtime_index = save_node_runtime_index;
+	  nf->flags = save_flags;
+	}
+
+      old_nodes_clone = nm_clone->nodes;
+      nm_clone->nodes = 0;
+
+      /* re-fork nodes */
+      for (j = 0; j < vec_len (nm->nodes); j++)
+	{
+	  vlib_node_t *old_n_clone;
+	  vlib_node_t *new_n, *new_n_clone;
+
+	  new_n = nm->nodes[j];
+
+	  new_n_clone = clib_mem_alloc_no_fail (sizeof (*new_n_clone));
+	  clib_memcpy (new_n_clone, new_n, sizeof (*new_n));
+	  /* none of the copied nodes have enqueue rights given out */
+	  new_n_clone->owner_node_index = VLIB_INVALID_NODE_INDEX;
+
+	  if (j >= vec_len (old_nodes_clone))
+	    {
+	      /* new node, set to zero */
+	      memset (&new_n_clone->stats_total, 0,
+		      sizeof (new_n_clone->stats_total));
+	      memset (&new_n_clone->stats_last_clear, 0,
+		      sizeof (new_n_clone->stats_last_clear));
+	    }
+	  else
+	    {
+	      /* only index the old clone vector once j is known to be in range */
+	      old_n_clone = old_nodes_clone[j];
+
+	      /* Copy stats if the old data is valid */
+	      clib_memcpy (&new_n_clone->stats_total,
+			   &old_n_clone->stats_total,
+			   sizeof (new_n_clone->stats_total));
+	      clib_memcpy (&new_n_clone->stats_last_clear,
+			   &old_n_clone->stats_last_clear,
+			   sizeof (new_n_clone->stats_last_clear));
+
+	      /* keep previous node state */
+	      new_n_clone->state = old_n_clone->state;
+	    }
+	  vec_add1 (nm_clone->nodes, new_n_clone);
+	}
+      /* Free the old node clone */
+      for (j = 0; j < vec_len (old_nodes_clone); j++)
+	clib_mem_free (old_nodes_clone[j]);
+      vec_free (old_nodes_clone);
+
+      vec_free (nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]);
+
+      nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] =
+	vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]);
+
+      /* clone input node runtime */
+      old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT];
+
+      nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] =
+	vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]);
+
+      vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT])
+      {
+	rt->cpu_index = vm_clone->cpu_index;
+      }
+
+      /* restore the per-worker state of the input nodes that already existed */
+      for (j = 0; j < vec_len (old_rt); j++)
+	{
+	  rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index);
+	  rt->state = old_rt[j].state;
+	}
+
+      vec_free (old_rt);
+
+      nm_clone->processes = vec_dup (nm->processes);
+
+      clib_mem_set_heap (oldheap);
+
+      // vnet_main_fork_fixup (i);
+    }
+}
+
+/*
+ * unformat helper: matches one of the scheduler policy names listed by
+ * foreach_sched_policy and stores the corresponding SCHED_POLICY_* value
+ * in the u32 pointed to by the first va_arg.
+ *
+ * Returns 1 when a policy name matched, 0 otherwise.
+ */
+u32
+unformat_sched_policy (unformat_input_t * input, va_list * args)
+{
+  u32 *r = va_arg (*args, u32 *);
+
+  /* "if (0);" seeds the else-if chain that the _ macro expands into */
+  if (0);
+#define _(v,f,s) else if (unformat (input, s)) *r = SCHED_POLICY_##f;
+  foreach_sched_policy
+#undef _
+    else
+    return 0;
+  return 1;
+}
+
+/*
+ * Early config handler for the "cpu" section.
+ *
+ * Recognised keywords: use-pthreads, thread-prefix, main-core, skip-cores,
+ * coremask-<type>, corelist-<type>, scheduler-policy, scheduler-priority,
+ * and "<type> <count>" for a registered thread type.  Parsing stops at the
+ * first input that matches none of these.
+ *
+ * After parsing, a configured scheduler priority is clamped to the range
+ * valid for SCHED_FIFO / SCHED_RR and rejected for any other policy, the
+ * thread prefix defaults to "vpp", and the stack / pthread / EAL thread
+ * totals are accumulated from the registrations.
+ *
+ * Returns 0 on success or a clib_error_t describing the bad option.
+ *
+ * NOTE(review): the name vectors produced by the %s conversions are not
+ * freed anywhere in this function -- confirm whether that is intended.
+ */
+static clib_error_t *
+cpu_config (vlib_main_t * vm, unformat_input_t * input)
+{
+  vlib_thread_registration_t *tr;
+  uword *p;
+  vlib_thread_main_t *tm = &vlib_thread_main;
+  u8 *name;
+  u64 coremask;
+  uword *bitmap;
+  u32 count;
+
+  tm->thread_registrations_by_name = hash_create_string (0, sizeof (uword));
+
+  tm->n_thread_stacks = 1;	/* account for main thread */
+  /* ~0 marks both scheduler settings as "not configured" */
+  tm->sched_policy = ~0;
+  tm->sched_priority = ~0;
+
+  tr = tm->next;
+
+  /* index every registration by name for the lookups below */
+  while (tr)
+    {
+      hash_set_mem (tm->thread_registrations_by_name, tr->name, (uword) tr);
+      tr = tr->next;
+    }
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "use-pthreads"))
+	tm->use_pthreads = 1;
+      else if (unformat (input, "thread-prefix %v", &tm->thread_prefix))
+	;
+      /* NOTE(review): vlib_thread_init() compares main_lcore with (u8) ~0; confirm the field is wide enough for %u */
+      else if (unformat (input, "main-core %u", &tm->main_lcore))
+	;
+      else if (unformat (input, "skip-cores %u", &tm->skip_cores))
+	;
+      else if (unformat (input, "coremask-%s %llx", &name, &coremask))
+	{
+	  p = hash_get_mem (tm->thread_registrations_by_name, name);
+	  if (p == 0)
+	    return clib_error_return (0, "no such thread type '%s'", name);
+
+	  tr = (vlib_thread_registration_t *) p[0];
+
+	  if (tr->use_pthreads)
+	    return clib_error_return (0,
+				      "coremask cannot be set for '%s' threads",
+				      name);
+
+	  tr->coremask = clib_bitmap_set_multiple
+	    (tr->coremask, 0, coremask, BITS (coremask));
+	  /* the thread count follows from the number of cores selected */
+	  tr->count = clib_bitmap_count_set_bits (tr->coremask);
+	}
+      else if (unformat (input, "corelist-%s %U", &name, unformat_bitmap_list,
+			 &bitmap))
+	{
+	  p = hash_get_mem (tm->thread_registrations_by_name, name);
+	  if (p == 0)
+	    return clib_error_return (0, "no such thread type '%s'", name);
+
+	  tr = (vlib_thread_registration_t *) p[0];
+
+	  if (tr->use_pthreads)
+	    return clib_error_return (0,
+				      "corelist cannot be set for '%s' threads",
+				      name);
+
+	  tr->coremask = bitmap;
+	  tr->count = clib_bitmap_count_set_bits (tr->coremask);
+	}
+      else
+	if (unformat
+	    (input, "scheduler-policy %U", unformat_sched_policy,
+	     &tm->sched_policy))
+	;
+      else if (unformat (input, "scheduler-priority %u", &tm->sched_priority))
+	;
+      /* generic "<thread type> <count>" form; must come after the keywords above */
+      else if (unformat (input, "%s %u", &name, &count))
+	{
+	  p = hash_get_mem (tm->thread_registrations_by_name, name);
+	  if (p == 0)
+	    return clib_error_return (0, "no such thread type 3 '%s'", name);
+
+	  tr = (vlib_thread_registration_t *) p[0];
+	  if (tr->fixed_count)
+	    return clib_error_return
+	      (0, "number of %s threads not configurable", tr->name);
+	  tr->count = count;
+	}
+      else
+	break;
+    }
+
+  if (tm->sched_priority != ~0)
+    {
+      if (tm->sched_policy == SCHED_FIFO || tm->sched_policy == SCHED_RR)
+	{
+	  /* clamp the requested priority into the policy's valid range */
+	  u32 prio_max = sched_get_priority_max (tm->sched_policy);
+	  u32 prio_min = sched_get_priority_min (tm->sched_policy);
+	  if (tm->sched_priority > prio_max)
+	    tm->sched_priority = prio_max;
+	  if (tm->sched_priority < prio_min)
+	    tm->sched_priority = prio_min;
+	}
+      else
+	{
+	  return clib_error_return
+	    (0,
+	     "scheduling priority (%d) is not allowed for `normal` scheduling policy",
+	     tm->sched_priority);
+	}
+    }
+  tr = tm->next;
+
+  if (!tm->thread_prefix)
+    tm->thread_prefix = format (0, "vpp");
+
+  /* total up stacks and thread kinds across all registrations */
+  while (tr)
+    {
+      tm->n_thread_stacks += tr->count;
+      tm->n_pthreads += tr->count * tr->use_pthreads;
+      tm->n_eal_threads += tr->count * (tr->use_pthreads == 0);
+      tr = tr->next;
+    }
+
+  return 0;
+}
+
+VLIB_EARLY_CONFIG_FUNCTION (cpu_config, "cpu");
+/* Fallback __sync_*_8 definitions for other CPUs: log the call and abort. */
+#if !defined (__x86_64__) && !defined (__aarch64__) && !defined (__powerpc64__) && !defined(__arm__)
+void
+__sync_fetch_and_add_8 (void)
+{
+  fformat (stderr, "%s called\n", __FUNCTION__);
+  abort ();
+}
+
+void
+__sync_add_and_fetch_8 (void)
+{
+  fformat (stderr, "%s called\n", __FUNCTION__);
+  abort ();
+}
+#endif
+/* Weak no-op default; a strong vnet_main_fixup elsewhere would replace it. */
+void vnet_main_fixup (vlib_fork_fixup_t which) __attribute__ ((weak));
+void
+vnet_main_fixup (vlib_fork_fixup_t which)
+{
+}
+
+/*
+ * Run the fixup selected by 'which' between a barrier sync and release.
+ * Does nothing when vlib_mains is unset; asserts it runs on cpu 0.
+ */
+void
+vlib_worker_thread_fork_fixup (vlib_fork_fixup_t which)
+{
+  vlib_main_t *main_vm = vlib_get_main ();
+
+  if (!vlib_mains)
+    return;
+
+  ASSERT (os_get_cpu_number () == 0);
+  vlib_worker_thread_barrier_sync (main_vm);
+
+  /* Only one fixup kind is handled; any other value trips the assert. */
+  if (which == VLIB_WORKER_THREAD_FORK_FIXUP_NEW_SW_IF_INDEX)
+    vnet_main_fixup (which);
+  else
+    ASSERT (0);
+  vlib_worker_thread_barrier_release (main_vm);
+}
+
+/*
+ * Set wait_at_barrier and spin until workers_at_barrier equals
+ * vec_len (vlib_mains) - 1; panic after BARRIER_SYNC_TIMEOUT.
+ */
+void
+vlib_worker_thread_barrier_sync (vlib_main_t * vm)
+{
+  f64 give_up_at;
+  u32 n_workers;
+
+  if (vlib_mains == 0)
+    return;
+  n_workers = vec_len (vlib_mains) - 1;
+
+  /* Tolerate recursive calls: nested ones only bump recursion_level. */
+  vlib_worker_threads->recursion_level += 1;
+  if (vlib_worker_threads->recursion_level > 1)
+    return;
+
+  vlib_worker_threads->barrier_sync_count += 1;
+  ASSERT (os_get_cpu_number () == 0);
+  give_up_at = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT;
+
+  *vlib_worker_threads->wait_at_barrier = 1;
+  while (*vlib_worker_threads->workers_at_barrier != n_workers)
+    if (vlib_time_now (vm) > give_up_at)
+      {
+	fformat (stderr, "%s: worker thread deadlock\n", __FUNCTION__);
+	os_panic ();
+      }
+}
+
+/* Undo one barrier_sync; the outermost call clears wait_at_barrier. */
+void
+vlib_worker_thread_barrier_release (vlib_main_t * vm)
+{
+  f64 give_up_at;
+
+  if (vlib_mains == 0)
+    return;
+
+  /* Still inside a nested sync: leave wait_at_barrier set. */
+  vlib_worker_threads->recursion_level -= 1;
+  if (vlib_worker_threads->recursion_level > 0)
+    return;
+
+  give_up_at = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT;
+  *vlib_worker_threads->wait_at_barrier = 0;
+  /* Spin until workers_at_barrier reads zero; panic past the deadline. */
+  while (*vlib_worker_threads->workers_at_barrier > 0)
+    if (vlib_time_now (vm) > give_up_at)
+      {
+	fformat (stderr, "%s: worker thread deadlock\n", __FUNCTION__);
+	os_panic ();
+      }
+}
+
+/*
+ * Drain this thread's frame queue: copy each valid element's buffer
+ * indices into a frame for fqm->node_index.  Stops when the queue is
+ * empty or vector_threshold is reached; returns elements processed.
+ */
+static inline int
+vlib_frame_queue_dequeue_internal (vlib_main_t * vm,
+				   vlib_frame_queue_main_t * fqm)
+{
+  u32 thread_id = vm->cpu_index;
+  vlib_frame_queue_t *fq = fqm->vlib_frame_queues[thread_id];
+  vlib_frame_queue_elt_t *elt;
+  u32 *from, *to;
+  vlib_frame_t *f;
+  int msg_type;
+  int processed = 0;
+  u32 n_left_to_node;
+  u32 vectors = 0;
+
+  ASSERT (fq);
+  ASSERT (vm == vlib_mains[thread_id]);
+  /* node_index == ~0: no target node (presumably unset); do nothing. */
+  if (PREDICT_FALSE (fqm->node_index == ~0))
+    return 0;
+  /*
+   * Gather trace data for frame queues
+   */
+  if (PREDICT_FALSE (fq->trace))
+    {
+      frame_queue_trace_t *fqt;
+      frame_queue_nelt_counter_t *fqh;
+      u32 elix;
+
+      fqt = &fqm->frame_queue_traces[thread_id];
+
+      fqt->nelts = fq->nelts;
+      fqt->head = fq->head;
+      fqt->head_hint = fq->head_hint;
+      fqt->tail = fq->tail;
+      fqt->threshold = fq->vector_threshold;
+      fqt->n_in_use = fqt->tail - fqt->head;
+      if (fqt->n_in_use >= fqt->nelts)
+	{
+	  /* if beyond max then use max */
+	  fqt->n_in_use = fqt->nelts - 1;
+	}
+
+      /* Record the number of elements in use in the histogram */
+      fqh = &fqm->frame_queue_histogram[thread_id];
+      fqh->count[fqt->n_in_use]++;
+
+      /* Record a snapshot of the elements in use */
+      for (elix = 0; elix < fqt->nelts; elix++)
+	{
+	  elt = fq->elts + ((fq->head + 1 + elix) & (fq->nelts - 1));
+	  if (1 || elt->valid)	/* "1 ||" makes this always true */
+	    {
+	      fqt->n_vectors[elix] = elt->n_vectors;
+	    }
+	}
+      fqt->written = 1;
+    }
+  /* Consume slots in order, starting with the one after head. */
+  while (1)
+    {
+      if (fq->head == fq->tail)
+	{
+	  fq->head_hint = fq->head;
+	  return processed;
+	}
+      /* The nelts - 1 mask assumes nelts is a power of two - TODO confirm. */
+      elt = fq->elts + ((fq->head + 1) & (fq->nelts - 1));
+      /* Next slot is not marked valid yet: stop and report progress. */
+      if (!elt->valid)
+	{
+	  fq->head_hint = fq->head;
+	  return processed;
+	}
+
+      from = elt->buffer_index;
+      msg_type = elt->msg_type;
+
+      ASSERT (msg_type == VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME);
+      ASSERT (elt->n_vectors <= VLIB_FRAME_SIZE);
+
+      f = vlib_get_frame_to_node (vm, fqm->node_index);
+
+      to = vlib_frame_vector_args (f);
+
+      n_left_to_node = elt->n_vectors;
+      /* Copy the buffer indices four at a time, then one by one. */
+      while (n_left_to_node >= 4)
+	{
+	  to[0] = from[0];
+	  to[1] = from[1];
+	  to[2] = from[2];
+	  to[3] = from[3];
+	  to += 4;
+	  from += 4;
+	  n_left_to_node -= 4;
+	}
+
+      while (n_left_to_node > 0)
+	{
+	  to[0] = from[0];
+	  to++;
+	  from++;
+	  n_left_to_node--;
+	}
+
+      vectors += elt->n_vectors;
+      f->n_vectors = elt->n_vectors;
+      vlib_put_frame_to_node (vm, fqm->node_index, f);
+      /* Clear the slot, then (after the barrier) advance head past it. */
+      elt->valid = 0;
+      elt->n_vectors = 0;
+      elt->msg_type = 0xfefefefe;
+      CLIB_MEMORY_BARRIER ();
+      fq->head++;
+      processed++;
+
+      /*
+       * Limit the number of packets pushed into the graph
+       */
+      if (vectors >= fq->vector_threshold)
+	{
+	  fq->head_hint = fq->head;
+	  return processed;
+	}
+    }
+  ASSERT (0);			/* not reached: loop exits only via return */
+  return processed;
+}
+/* Worker thread main loop; never returns (the while (1) has no exit). */
+static_always_inline void
+vlib_worker_thread_internal (vlib_main_t * vm)
+{
+  vlib_node_main_t *nm = &vm->node_main;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  u64 cpu_time_now = clib_cpu_time_now ();
+  vlib_frame_queue_main_t *fqm;
+
+  vec_alloc (nm->pending_interrupt_node_runtime_indices, 32);
+
+  while (1)
+    {
+      vlib_worker_thread_barrier_check ();
+      /* Pull handed-off frames from every frame-queue main. */
+      vec_foreach (fqm, tm->frame_queue_mains)
+	vlib_frame_queue_dequeue_internal (vm, fqm);
+      /* Dispatch every input node in POLLING state. */
+      vlib_node_runtime_t *n;
+      vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_INPUT])
+      {
+	cpu_time_now = dispatch_node (vm, n, VLIB_NODE_TYPE_INPUT,
+				      VLIB_NODE_STATE_POLLING, /* frame */ 0,
+				      cpu_time_now);
+      }
+
+      /* Next handle interrupts. */
+      {
+	uword l = _vec_len (nm->pending_interrupt_node_runtime_indices);
+	uword i;
+	if (l > 0)
+	  {
+	    _vec_len (nm->pending_interrupt_node_runtime_indices) = 0;
+	    for (i = 0; i < l; i++)
+	      {
+		n = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT],
+				      nm->
+				      pending_interrupt_node_runtime_indices
+				      [i]);
+		cpu_time_now =
+		  dispatch_node (vm, n, VLIB_NODE_TYPE_INPUT,
+				 VLIB_NODE_STATE_INTERRUPT,
+				 /* frame */ 0,
+				 cpu_time_now);
+	      }
+	  }
+      }
+      /* Drain pending_frames; length is re-read, so new entries run too. */
+      if (_vec_len (nm->pending_frames))
+	{
+	  int i;
+	  cpu_time_now = clib_cpu_time_now ();
+	  for (i = 0; i < _vec_len (nm->pending_frames); i++)
+	    {
+	      vlib_pending_frame_t *p;
+
+	      p = nm->pending_frames + i;
+
+	      cpu_time_now = dispatch_pending_node (vm, p, cpu_time_now);
+	    }
+	  _vec_len (nm->pending_frames) = 0;
+	}
+      vlib_increment_main_loop_counter (vm);
+
+      /* Record time stamp in case there are no enabled nodes and above
+         calls do not update time stamp. */
+      cpu_time_now = clib_cpu_time_now ();
+    }
+}
+
+/* Thread entry point: per-thread init, then the worker loop (no return). */
+void
+vlib_worker_thread_fn (void *arg)
+{
+  vlib_worker_thread_t *worker = arg;
+  vlib_main_t *this_vm = vlib_get_main ();
+
+  ASSERT (this_vm->cpu_index == os_get_cpu_number ());
+
+  vlib_worker_thread_init (worker);
+  clib_time_init (&this_vm->clib_time);
+  clib_mem_set_heap (worker->thread_mheap);
+
+#if DPDK > 0
+  /* Wait (checking barriers) until the dpdk init sequence is complete */
+  vlib_thread_main_t *thread_main = vlib_get_thread_main ();
+  while (!thread_main->worker_thread_release)
+    vlib_worker_thread_barrier_check ();
+#endif
+  vlib_worker_thread_internal (this_vm);
+}
+/* Thread registration named "workers"; entry is vlib_worker_thread_fn. */
+/* *INDENT-OFF* */
+VLIB_REGISTER_THREAD (worker_thread_reg, static) = {
+  .name = "workers",
+  .short_name = "wk",
+  .function = vlib_worker_thread_fn,
+};
+/* *INDENT-ON* */
+
+/* Append a frame-queue main for node_index holding n_vlib_mains queues;
+   nelts of 0 means FRAME_QUEUE_NELTS.  Returns the new entry's index. */
+u32
+vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts)
+{
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+  vlib_frame_queue_main_t *new_fqm;
+  int thread;
+
+  if (!frame_queue_nelts)
+    frame_queue_nelts = FRAME_QUEUE_NELTS;
+
+  vec_add2 (tm->frame_queue_mains, new_fqm, 1);
+  new_fqm->node_index = node_index;
+
+  /* Size the vector once, then rewind its length and refill it. */
+  vec_validate (new_fqm->vlib_frame_queues, tm->n_vlib_mains - 1);
+  _vec_len (new_fqm->vlib_frame_queues) = 0;
+  for (thread = 0; thread < tm->n_vlib_mains; thread++)
+    {
+      vlib_frame_queue_t *queue = vlib_frame_queue_alloc (frame_queue_nelts);
+      vec_add1 (new_fqm->vlib_frame_queues, queue);
+    }
+  return new_fqm - tm->frame_queue_mains;
+}
+/* Init hook that does no work; it always returns 0 (no error object). */
+clib_error_t *
+threads_init (vlib_main_t * vm)
+{
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (threads_init);
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
-- 
cgit 1.2.3-korg


From 878c609889dcdc58538d40d8b3f662320f88573d Mon Sep 17 00:00:00 2001
From: Damjan Marion <damarion@cisco.com>
Date: Wed, 4 Jan 2017 13:19:27 +0100
Subject: vlib: add buffer and thread callbacks

Change-Id: I8e2e8f94a884ab2f9909d0c83ba00edd38cdab77
Signed-off-by: Damjan Marion <damarion@cisco.com>
---
 src/plugins/flowperpkt/flowperpkt.c |   2 +-
 src/vlib.am                         |   1 +
 src/vlib/buffer.c                   | 736 +++---------------------------------
 src/vlib/buffer.h                   |  45 ++-
 src/vlib/buffer_funcs.h             |  82 ++--
 src/vlib/buffer_serialize.c         | 248 ++++++++++++
 src/vlib/main.c                     |   7 +-
 src/vlib/threads.c                  | 112 +++---
 src/vlib/threads.h                  |  17 +-
 src/vlib/threads_cli.c              |  25 --
 src/vlib/unix/physmem.c             |  15 +-
 src/vnet.am                         |   2 +
 src/vnet/devices/dpdk/buffer.c      | 729 +++++++++++++++++++++++++++++++++++
 src/vnet/devices/dpdk/cli.c         |   4 +-
 src/vnet/devices/dpdk/device.c      |   7 +-
 src/vnet/devices/dpdk/dpdk.h        |   3 +
 src/vnet/devices/dpdk/dpdk_priv.h   |   3 +
 src/vnet/devices/dpdk/init.c        |   6 +-
 src/vnet/devices/dpdk/thread.c      |  85 +++++
 src/vnet/sr/sr_replicate.c          |   7 +-
 20 files changed, 1304 insertions(+), 832 deletions(-)
 create mode 100644 src/vlib/buffer_serialize.c
 create mode 100644 src/vnet/devices/dpdk/buffer.c
 create mode 100644 src/vnet/devices/dpdk/thread.c

(limited to 'src/vlib/threads.c')

diff --git a/src/plugins/flowperpkt/flowperpkt.c b/src/plugins/flowperpkt/flowperpkt.c
index fb71d5b0..cc351599 100644
--- a/src/plugins/flowperpkt/flowperpkt.c
+++ b/src/plugins/flowperpkt/flowperpkt.c
@@ -643,7 +643,7 @@ flowperpkt_init (vlib_main_t * vm)
   vec_free (name);
 
   /* Decide how many worker threads we have */
-  num_threads = 1 /* main thread */  + tm->n_eal_threads;
+  num_threads = 1 /* main thread */  + tm->n_threads;
 
   /* Allocate per worker thread vectors */
   vec_validate (fm->ipv4_buffers_per_worker, num_threads - 1);
diff --git a/src/vlib.am b/src/vlib.am
index 0154d841..c21f88c4 100644
--- a/src/vlib.am
+++ b/src/vlib.am
@@ -23,6 +23,7 @@ vlib/config.h:
 
 libvlib_la_SOURCES =				\
   vlib/buffer.c					\
+  vlib/buffer_serialize.c			\
   vlib/cli.c					\
   vlib/cli.h					\
   vlib/config.h					\
diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c
index 4bf6d125..0b0e6054 100644
--- a/src/vlib/buffer.c
+++ b/src/vlib/buffer.c
@@ -38,50 +38,13 @@
  */
 
 /**
- * @cond (!DPDK)
  * @file
  *
  * Allocate/free network buffers.
  */
 
-#if DPDK > 0
-#include <rte_config.h>
-
-#include <rte_common.h>
-#include <rte_log.h>
-#include <rte_memory.h>
-#include <rte_memzone.h>
-#include <rte_tailq.h>
-#include <rte_eal.h>
-#include <rte_per_lcore.h>
-#include <rte_launch.h>
-#include <rte_atomic.h>
-#include <rte_cycles.h>
-#include <rte_prefetch.h>
-#include <rte_lcore.h>
-#include <rte_per_lcore.h>
-#include <rte_branch_prediction.h>
-#include <rte_interrupts.h>
-#include <rte_pci.h>
-#include <rte_random.h>
-#include <rte_debug.h>
-#include <rte_ether.h>
-#include <rte_ethdev.h>
-#include <rte_ring.h>
-#include <rte_mempool.h>
-#include <rte_mbuf.h>
-#include <rte_version.h>
-#endif
-
 #include <vlib/vlib.h>
 
-#if DPDK > 0
-#pragma weak rte_mem_virt2phy
-#pragma weak rte_eal_has_hugepages
-#pragma weak rte_socket_id
-#pragma weak rte_pktmbuf_pool_create
-#endif
-
 uword
 vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm,
 				       vlib_buffer_t * b_first)
@@ -103,7 +66,6 @@ u8 *
 format_vlib_buffer (u8 * s, va_list * args)
 {
   vlib_buffer_t *b = va_arg (*args, vlib_buffer_t *);
-#if DPDK > 0
   uword indent = format_get_indent (s);
 
   s = format (s, "current data %d, length %d, free-list %d",
@@ -126,18 +88,6 @@ format_vlib_buffer (u8 * s, va_list * args)
 		  format_white_space, indent, next_buffer, b->current_length);
     }
 
-#else
-
-  s = format (s, "current data %d, length %d, free-list %d",
-	      b->current_data, b->current_length, b->free_list_index);
-
-  if (b->flags & VLIB_BUFFER_IS_TRACED)
-    s = format (s, ", trace 0x%x", b->trace_index);
-
-  if (b->flags & VLIB_BUFFER_NEXT_PRESENT)
-    s = format (s, ", next-buffer 0x%x", b->next_buffer);
-#endif
-
   return s;
 }
 
@@ -153,7 +103,6 @@ format_vlib_buffer_and_data (u8 * s, va_list * args)
   return s;
 }
 
-#if DPDK == 0
 static u8 *
 format_vlib_buffer_known_state (u8 * s, va_list * args)
 {
@@ -181,7 +130,6 @@ format_vlib_buffer_known_state (u8 * s, va_list * args)
 
   return format (s, "%s", t);
 }
-#endif
 
 u8 *
 format_vlib_buffer_contents (u8 * s, va_list * va)
@@ -200,7 +148,6 @@ format_vlib_buffer_contents (u8 * s, va_list * va)
   return s;
 }
 
-#if DPDK == 0
 static u8 *
 vlib_validate_buffer_helper (vlib_main_t * vm,
 			     u32 bi,
@@ -217,11 +164,10 @@ vlib_validate_buffer_helper (vlib_main_t * vm,
 
   if ((signed) b->current_data < (signed) -VLIB_BUFFER_PRE_DATA_SIZE)
     return format (0, "current data %d before pre-data", b->current_data);
-#if DPDK == 0
+
   if (b->current_data + b->current_length > fl->n_data_bytes)
     return format (0, "%d-%d beyond end of buffer %d",
 		   b->current_data, b->current_length, fl->n_data_bytes);
-#endif
 
   if (follow_buffer_next && (b->flags & VLIB_BUFFER_NEXT_PRESENT))
     {
@@ -311,14 +257,12 @@ done:
   hash_free (hash);
   return result;
 }
-#endif
 
 vlib_main_t **vlib_mains;
 
-#if DPDK == 0
 /* When dubugging validate that given buffers are either known allocated
    or known free. */
-static void
+static void __attribute__ ((unused))
 vlib_buffer_validate_alloc_free (vlib_main_t * vm,
 				 u32 * buffers,
 				 uword n_buffers,
@@ -359,7 +303,6 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm,
 	 is_free ? VLIB_BUFFER_KNOWN_FREE : VLIB_BUFFER_KNOWN_ALLOCATED);
     }
 }
-#endif
 
 #define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32))
 
@@ -463,7 +406,6 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm,
 {
   vlib_buffer_main_t *bm = vm->buffer_main;
   vlib_buffer_free_list_t *f;
-#if DPDK > 0
   int i;
 
   ASSERT (os_get_cpu_number () == 0);
@@ -519,47 +461,6 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm,
       wf->unaligned_buffers = 0;
       wf->n_alloc = 0;
     }
-#else
-
-  if (!is_default && pool_elts (bm->buffer_free_list_pool) == 0)
-    {
-      u32 default_free_free_list_index;
-
-      default_free_free_list_index = vlib_buffer_create_free_list_helper (vm,
-									  /* default buffer size */
-									  VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES,
-									  /* is_public */
-									  1,
-									  /* is_default */
-									  1,
-									  (u8
-									   *)
-									  "default");
-      ASSERT (default_free_free_list_index ==
-	      VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
-
-      if (n_data_bytes == VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES && is_public)
-	return default_free_free_list_index;
-    }
-
-  pool_get_aligned (bm->buffer_free_list_pool, f, CLIB_CACHE_LINE_BYTES);
-
-  memset (f, 0, sizeof (f[0]));
-  f->index = f - bm->buffer_free_list_pool;
-  f->n_data_bytes = vlib_buffer_round_size (n_data_bytes);
-  f->min_n_buffers_each_physmem_alloc = 256;
-  f->name = clib_mem_is_heap_object (name) ? name : format (0, "%s", name);
-
-  /* Setup free buffer template. */
-  f->buffer_init_template.free_list_index = f->index;
-
-  if (is_public)
-    {
-      uword *p = hash_get (bm->free_list_by_size, f->n_data_bytes);
-      if (!p)
-	hash_set (bm->free_list_by_size, f->n_data_bytes, f->index);
-    }
-#endif
 
   return f->index;
 }
@@ -609,50 +510,30 @@ static void
 del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f)
 {
   u32 i;
-#if DPDK > 0
-  struct rte_mbuf *mb;
-  vlib_buffer_t *b;
-
-  for (i = 0; i < vec_len (f->unaligned_buffers); i++)
-    {
-      b = vlib_get_buffer (vm, f->unaligned_buffers[i]);
-      mb = rte_mbuf_from_vlib_buffer (b);
-      ASSERT (rte_mbuf_refcnt_read (mb) == 1);
-      rte_pktmbuf_free (mb);
-    }
-  for (i = 0; i < vec_len (f->aligned_buffers); i++)
-    {
-      b = vlib_get_buffer (vm, f->aligned_buffers[i]);
-      mb = rte_mbuf_from_vlib_buffer (b);
-      ASSERT (rte_mbuf_refcnt_read (mb) == 1);
-      rte_pktmbuf_free (mb);
-    }
-  vec_free (f->name);
-#else
 
   for (i = 0; i < vec_len (f->buffer_memory_allocated); i++)
     vm->os_physmem_free (f->buffer_memory_allocated[i]);
   vec_free (f->name);
   vec_free (f->buffer_memory_allocated);
-#endif
   vec_free (f->unaligned_buffers);
   vec_free (f->aligned_buffers);
 }
 
 /* Add buffer free list. */
 void
-vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index)
+vlib_buffer_delete_free_list_internal (vlib_main_t * vm, u32 free_list_index)
 {
   vlib_buffer_main_t *bm = vm->buffer_main;
   vlib_buffer_free_list_t *f;
   u32 merge_index;
-#if DPDK > 0
   int i;
 
   ASSERT (os_get_cpu_number () == 0);
 
   f = vlib_buffer_get_free_list (vm, free_list_index);
 
+  ASSERT (vec_len (f->unaligned_buffers) + vec_len (f->aligned_buffers) ==
+	  f->n_alloc);
   merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes);
   if (merge_index != ~0 && merge_index != free_list_index)
     {
@@ -674,26 +555,6 @@ vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index)
       memset (f, 0xab, sizeof (f[0]));
       pool_put (bm->buffer_free_list_pool, f);
     }
-#else
-
-  f = vlib_buffer_get_free_list (vm, free_list_index);
-
-  ASSERT (vec_len (f->unaligned_buffers) + vec_len (f->aligned_buffers) ==
-	  f->n_alloc);
-  merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes);
-  if (merge_index != ~0 && merge_index != free_list_index)
-    {
-      merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool,
-					   merge_index), f);
-    }
-
-  del_free_list (vm, f);
-
-  /* Poison it. */
-  memset (f, 0xab, sizeof (f[0]));
-
-  pool_put (bm->buffer_free_list_pool, f);
-#endif
 }
 
 /* Make sure free list has at least given number of free buffers. */
@@ -701,63 +562,6 @@ static uword
 fill_free_list (vlib_main_t * vm,
 		vlib_buffer_free_list_t * fl, uword min_free_buffers)
 {
-#if DPDK > 0
-  vlib_buffer_t *b;
-  int n, i;
-  u32 bi;
-  u32 n_remaining = 0, n_alloc = 0;
-  unsigned socket_id = rte_socket_id ? rte_socket_id () : 0;
-  struct rte_mempool *rmp = vm->buffer_main->pktmbuf_pools[socket_id];
-  struct rte_mbuf *mb;
-
-  /* Too early? */
-  if (PREDICT_FALSE (rmp == 0))
-    return 0;
-
-  trim_aligned (fl);
-
-  /* Already have enough free buffers on free list? */
-  n = min_free_buffers - vec_len (fl->aligned_buffers);
-  if (n <= 0)
-    return min_free_buffers;
-
-  /* Always allocate round number of buffers. */
-  n = round_pow2 (n, BUFFERS_PER_COPY);
-
-  /* Always allocate new buffers in reasonably large sized chunks. */
-  n = clib_max (n, fl->min_n_buffers_each_physmem_alloc);
-
-  vec_validate (vm->mbuf_alloc_list, n - 1);
-
-  if (rte_mempool_get_bulk (rmp, vm->mbuf_alloc_list, n) < 0)
-    return 0;
-
-  _vec_len (vm->mbuf_alloc_list) = n;
-
-  for (i = 0; i < n; i++)
-    {
-      mb = vm->mbuf_alloc_list[i];
-
-      ASSERT (rte_mbuf_refcnt_read (mb) == 0);
-      rte_mbuf_refcnt_set (mb, 1);
-
-      b = vlib_buffer_from_rte_mbuf (mb);
-      bi = vlib_get_buffer_index (vm, b);
-
-      vec_add1_aligned (fl->aligned_buffers, bi, sizeof (vlib_copy_unit_t));
-      n_alloc++;
-      n_remaining--;
-
-      vlib_buffer_init_for_free_list (b, fl);
-
-      if (fl->buffer_init_function)
-	fl->buffer_init_function (vm, fl, &bi, 1);
-    }
-
-  fl->n_alloc += n;
-
-  return n;
-#else
   vlib_buffer_t *buffers, *b;
   int n, n_bytes, i;
   u32 *bi;
@@ -824,7 +628,6 @@ fill_free_list (vlib_main_t * vm,
 	fl->buffer_init_function (vm, fl, bi, n_this_chunk);
     }
   return n_alloc;
-#endif
 }
 
 always_inline uword
@@ -833,6 +636,7 @@ copy_alignment (u32 * x)
   return (pointer_to_uword (x) / sizeof (x[0])) % BUFFERS_PER_COPY;
 }
 
+
 static u32
 alloc_from_free_list (vlib_main_t * vm,
 		      vlib_buffer_free_list_t * free_list,
@@ -842,10 +646,6 @@ alloc_from_free_list (vlib_main_t * vm,
   uword u_len, n_left;
   uword n_unaligned_start, n_unaligned_end, n_filled;
 
-#if DPDK == 0
-  ASSERT (os_get_cpu_number () == 0);
-
-#endif
   n_left = n_alloc_buffers;
   dst = alloc_buffers;
   n_unaligned_start = ((BUFFERS_PER_COPY - copy_alignment (dst))
@@ -945,25 +745,21 @@ alloc_from_free_list (vlib_main_t * vm,
   else
     _vec_len (free_list->unaligned_buffers) = u_len;
 
-#if DPDK == 0
   /* Verify that buffers are known free. */
   vlib_buffer_validate_alloc_free (vm, alloc_buffers,
 				   n_alloc_buffers, VLIB_BUFFER_KNOWN_FREE);
-#endif
 
   return n_alloc_buffers;
 }
 
+
 /* Allocate a given number of buffers into given array.
    Returns number actually allocated which will be either zero or
    number requested. */
-u32
-vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers)
+static u32
+vlib_buffer_alloc_internal (vlib_main_t * vm, u32 * buffers, u32 n_buffers)
 {
   vlib_buffer_main_t *bm = vm->buffer_main;
-#if DPDK == 0
-  ASSERT (os_get_cpu_number () == 0);
-#endif
 
   return alloc_from_free_list
     (vm,
@@ -972,10 +768,10 @@ vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers)
      buffers, n_buffers);
 }
 
-u32
-vlib_buffer_alloc_from_free_list (vlib_main_t * vm,
-				  u32 * buffers,
-				  u32 n_buffers, u32 free_list_index)
+static u32
+vlib_buffer_alloc_from_free_list_internal (vlib_main_t * vm,
+					   u32 * buffers,
+					   u32 n_buffers, u32 free_list_index)
 {
   vlib_buffer_main_t *bm = vm->buffer_main;
   vlib_buffer_free_list_t *f;
@@ -1016,81 +812,10 @@ vlib_set_buffer_free_callback (vlib_main_t * vm, void *fp)
   return rv;
 }
 
-#if DPDK == 0
-void vnet_buffer_free_dpdk_mb (vlib_buffer_t * b) __attribute__ ((weak));
-void
-vnet_buffer_free_dpdk_mb (vlib_buffer_t * b)
-{
-}
-
-#endif
 static_always_inline void
 vlib_buffer_free_inline (vlib_main_t * vm,
 			 u32 * buffers, u32 n_buffers, u32 follow_buffer_next)
 {
-#if DPDK > 0
-  vlib_buffer_main_t *bm = vm->buffer_main;
-  vlib_buffer_free_list_t *fl;
-  u32 fi;
-  int i;
-  u32 (*cb) (vlib_main_t * vm, u32 * buffers, u32 n_buffers,
-	     u32 follow_buffer_next);
-
-  cb = bm->buffer_free_callback;
-
-  if (PREDICT_FALSE (cb != 0))
-    n_buffers = (*cb) (vm, buffers, n_buffers, follow_buffer_next);
-
-  if (!n_buffers)
-    return;
-
-  for (i = 0; i < n_buffers; i++)
-    {
-      vlib_buffer_t *b;
-      struct rte_mbuf *mb;
-
-      b = vlib_get_buffer (vm, buffers[i]);
-
-      fl = buffer_get_free_list (vm, b, &fi);
-
-      /* The only current use of this callback: multicast recycle */
-      if (PREDICT_FALSE (fl->buffers_added_to_freelist_function != 0))
-	{
-	  int j;
-
-	  add_buffer_to_free_list
-	    (vm, fl, buffers[i], (b->flags & VLIB_BUFFER_RECYCLE) == 0);
-
-	  for (j = 0; j < vec_len (bm->announce_list); j++)
-	    {
-	      if (fl == bm->announce_list[j])
-		goto already_announced;
-	    }
-	  vec_add1 (bm->announce_list, fl);
-	already_announced:
-	  ;
-	}
-      else
-	{
-	  if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_RECYCLE) == 0))
-	    {
-	      mb = rte_mbuf_from_vlib_buffer (b);
-	      ASSERT (rte_mbuf_refcnt_read (mb) == 1);
-	      rte_pktmbuf_free (mb);
-	    }
-	}
-    }
-  if (vec_len (bm->announce_list))
-    {
-      vlib_buffer_free_list_t *fl;
-      for (i = 0; i < vec_len (bm->announce_list); i++)
-	{
-	  fl = bm->announce_list[i];
-	  fl->buffers_added_to_freelist_function (vm, fl);
-	}
-      _vec_len (bm->announce_list) = 0;
-    }
-#else
   vlib_buffer_main_t *bm = vm->buffer_main;
   vlib_buffer_free_list_t *fl;
   static u32 *next_to_free[2];	/* smp bad */
@@ -1315,26 +1040,25 @@ again:
 	}
       _vec_len (announce_list) = 0;
     }
-#endif
 }
 
-void
-vlib_buffer_free (vlib_main_t * vm, u32 * buffers, u32 n_buffers)
+static void
+vlib_buffer_free_internal (vlib_main_t * vm, u32 * buffers, u32 n_buffers)
 {
   vlib_buffer_free_inline (vm, buffers, n_buffers,	/* follow_buffer_next */
 			   1);
 }
 
-void
-vlib_buffer_free_no_next (vlib_main_t * vm, u32 * buffers, u32 n_buffers)
+static void
+vlib_buffer_free_no_next_internal (vlib_main_t * vm, u32 * buffers,
+				   u32 n_buffers)
 {
   vlib_buffer_free_inline (vm, buffers, n_buffers,	/* follow_buffer_next */
 			   0);
 }
 
-#if DPDK == 0
 /* Copy template packet data into buffers as they are allocated. */
-static void
+static void __attribute__ ((unused))
 vlib_packet_template_buffer_init (vlib_main_t * vm,
 				  vlib_buffer_free_list_t * fl,
 				  u32 * buffers, u32 n_buffers)
@@ -1352,7 +1076,6 @@ vlib_packet_template_buffer_init (vlib_main_t * vm,
 		   b->current_length);
     }
 }
-#endif
 
 void
 vlib_packet_template_init (vlib_main_t * vm,
@@ -1362,28 +1085,22 @@ vlib_packet_template_init (vlib_main_t * vm,
 			   uword min_n_buffers_each_physmem_alloc,
 			   char *fmt, ...)
 {
-#if DPDK > 0
+  vlib_buffer_main_t *bm = vm->buffer_main;
   va_list va;
   __attribute__ ((unused)) u8 *name;
+  vlib_buffer_free_list_t *fl;
 
   va_start (va, fmt);
   name = va_format (0, fmt, &va);
   va_end (va);
 
-  vlib_worker_thread_barrier_sync (vm);
-  memset (t, 0, sizeof (t[0]));
-
-  vec_add (t->packet_data, packet_data, n_packet_data_bytes);
+  if (bm->cb.vlib_packet_template_init_cb)
+    bm->cb.vlib_packet_template_init_cb (vm, (void *) t, packet_data,
+					 n_packet_data_bytes,
+					 min_n_buffers_each_physmem_alloc,
+					 name);
 
-  vlib_worker_thread_barrier_release (vm);
-#else
-  vlib_buffer_free_list_t *fl;
-  va_list va;
-  u8 *name;
-
-  va_start (va, fmt);
-  name = va_format (0, fmt, &va);
-  va_end (va);
+  vlib_worker_thread_barrier_sync (vm);
 
   memset (t, 0, sizeof (t[0]));
 
@@ -1406,7 +1123,7 @@ vlib_packet_template_init (vlib_main_t * vm,
   fl->buffer_init_template.current_data = 0;
   fl->buffer_init_template.current_length = n_packet_data_bytes;
   fl->buffer_init_template.flags = 0;
-#endif
+  vlib_worker_thread_barrier_release (vm);
 }
 
 void *
@@ -1429,7 +1146,6 @@ vlib_packet_template_get_packet (vlib_main_t * vm,
   return b->data;
 }
 
-#if DPDK == 0
 void
 vlib_packet_template_get_packet_helper (vlib_main_t * vm,
 					vlib_packet_template_t * t)
@@ -1447,7 +1163,6 @@ vlib_packet_template_get_packet_helper (vlib_main_t * vm,
   _vec_len (t->free_buffers) = n_alloc;
 }
 
-#endif
 /* Append given data to end of buffer, possibly allocating new buffers. */
 u32
 vlib_buffer_add_data (vlib_main_t * vm,
@@ -1541,328 +1256,11 @@ vlib_buffer_chain_append_data_with_alloc (vlib_main_t * vm,
   return copied;
 }
 
-#if DPDK > 0
-clib_error_t *
-vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs,
-			 unsigned socket_id)
-{
-  vlib_buffer_main_t *bm = vm->buffer_main;
-  vlib_physmem_main_t *vpm = &vm->physmem_main;
-  struct rte_mempool *rmp;
-  int i;
-
-  if (!rte_pktmbuf_pool_create)
-    return clib_error_return (0, "not linked with DPDK");
-
-  vec_validate_aligned (bm->pktmbuf_pools, socket_id, CLIB_CACHE_LINE_BYTES);
-
-  /* pool already exists, nothing to do */
-  if (bm->pktmbuf_pools[socket_id])
-    return 0;
-
-  u8 *pool_name = format (0, "mbuf_pool_socket%u%c", socket_id, 0);
-
-  rmp = rte_pktmbuf_pool_create ((char *) pool_name,	/* pool name */
-				 num_mbufs,	/* number of mbufs */
-				 512,	/* cache size */
-				 VLIB_BUFFER_HDR_SIZE,	/* priv size */
-				 VLIB_BUFFER_PRE_DATA_SIZE + VLIB_BUFFER_DATA_SIZE,	/* dataroom size */
-				 socket_id);	/* cpu socket */
-
-  if (rmp)
-    {
-      {
-	uword this_pool_end;
-	uword this_pool_start;
-	uword this_pool_size;
-	uword save_vpm_start, save_vpm_end, save_vpm_size;
-	struct rte_mempool_memhdr *memhdr;
-
-	this_pool_start = ~0ULL;
-	this_pool_end = 0LL;
-
-	STAILQ_FOREACH (memhdr, &rmp->mem_list, next)
-	{
-	  if (((uword) (memhdr->addr + memhdr->len)) > this_pool_end)
-	    this_pool_end = (uword) (memhdr->addr + memhdr->len);
-	  if (((uword) memhdr->addr) < this_pool_start)
-	    this_pool_start = (uword) (memhdr->addr);
-	}
-	ASSERT (this_pool_start < ~0ULL && this_pool_end > 0);
-	this_pool_size = this_pool_end - this_pool_start;
-
-	if (CLIB_DEBUG > 1)
-	  {
-	    clib_warning ("%s: pool start %llx pool end %llx pool size %lld",
-			  pool_name, this_pool_start, this_pool_end,
-			  this_pool_size);
-	    clib_warning
-	      ("before: virtual.start %llx virtual.end %llx virtual.size %lld",
-	       vpm->virtual.start, vpm->virtual.end, vpm->virtual.size);
-	  }
-
-	save_vpm_start = vpm->virtual.start;
-	save_vpm_end = vpm->virtual.end;
-	save_vpm_size = vpm->virtual.size;
-
-	if ((this_pool_start < vpm->virtual.start) || vpm->virtual.start == 0)
-	  vpm->virtual.start = this_pool_start;
-	if (this_pool_end > vpm->virtual.end)
-	  vpm->virtual.end = this_pool_end;
-
-	vpm->virtual.size = vpm->virtual.end - vpm->virtual.start;
-
-	if (CLIB_DEBUG > 1)
-	  {
-	    clib_warning
-	      ("after: virtual.start %llx virtual.end %llx virtual.size %lld",
-	       vpm->virtual.start, vpm->virtual.end, vpm->virtual.size);
-	  }
-
-	/* check if fits into buffer index range */
-	if ((u64) vpm->virtual.size >
-	    ((u64) 1 << (32 + CLIB_LOG2_CACHE_LINE_BYTES)))
-	  {
-	    clib_warning ("physmem: virtual size out of range!");
-	    vpm->virtual.start = save_vpm_start;
-	    vpm->virtual.end = save_vpm_end;
-	    vpm->virtual.size = save_vpm_size;
-	    rmp = 0;
-	  }
-      }
-      if (rmp)
-	{
-	  bm->pktmbuf_pools[socket_id] = rmp;
-	  vec_free (pool_name);
-	  return 0;
-	}
-    }
-
-  vec_free (pool_name);
-
-  /* no usable pool for this socket, try to use pool from another one */
-  for (i = 0; i < vec_len (bm->pktmbuf_pools); i++)
-    {
-      if (bm->pktmbuf_pools[i])
-	{
-	  clib_warning
-	    ("WARNING: Failed to allocate mempool for CPU socket %u. "
-	     "Threads running on socket %u will use socket %u mempool.",
-	     socket_id, socket_id, i);
-	  bm->pktmbuf_pools[socket_id] = bm->pktmbuf_pools[i];
-	  return 0;
-	}
-    }
-
-  return clib_error_return (0, "failed to allocate mempool on socket %u",
-			    socket_id);
-}
-#endif
-
-static void
-vlib_serialize_tx (serialize_main_header_t * m, serialize_stream_t * s)
-{
-  vlib_main_t *vm;
-  vlib_serialize_buffer_main_t *sm;
-  uword n, n_bytes_to_write;
-  vlib_buffer_t *last;
-
-  n_bytes_to_write = s->current_buffer_index;
-  sm =
-    uword_to_pointer (s->data_function_opaque,
-		      vlib_serialize_buffer_main_t *);
-  vm = sm->vlib_main;
-
-  ASSERT (sm->tx.max_n_data_bytes_per_chain > 0);
-  if (serialize_stream_is_end_of_stream (s)
-      || sm->tx.n_total_data_bytes + n_bytes_to_write >
-      sm->tx.max_n_data_bytes_per_chain)
-    {
-      vlib_process_t *p = vlib_get_current_process (vm);
-
-      last = vlib_get_buffer (vm, sm->last_buffer);
-      last->current_length = n_bytes_to_write;
-
-      vlib_set_next_frame_buffer (vm, &p->node_runtime, sm->tx.next_index,
-				  sm->first_buffer);
-
-      sm->first_buffer = sm->last_buffer = ~0;
-      sm->tx.n_total_data_bytes = 0;
-    }
-
-  else if (n_bytes_to_write == 0 && s->n_buffer_bytes == 0)
-    {
-      ASSERT (sm->first_buffer == ~0);
-      ASSERT (sm->last_buffer == ~0);
-      n =
-	vlib_buffer_alloc_from_free_list (vm, &sm->first_buffer, 1,
-					  sm->tx.free_list_index);
-      if (n != 1)
-	serialize_error (m,
-			 clib_error_create
-			 ("vlib_buffer_alloc_from_free_list fails"));
-      sm->last_buffer = sm->first_buffer;
-      s->n_buffer_bytes =
-	vlib_buffer_free_list_buffer_size (vm, sm->tx.free_list_index);
-    }
-
-  if (n_bytes_to_write > 0)
-    {
-      vlib_buffer_t *prev = vlib_get_buffer (vm, sm->last_buffer);
-      n =
-	vlib_buffer_alloc_from_free_list (vm, &sm->last_buffer, 1,
-					  sm->tx.free_list_index);
-      if (n != 1)
-	serialize_error (m,
-			 clib_error_create
-			 ("vlib_buffer_alloc_from_free_list fails"));
-      sm->tx.n_total_data_bytes += n_bytes_to_write;
-      prev->current_length = n_bytes_to_write;
-      prev->next_buffer = sm->last_buffer;
-      prev->flags |= VLIB_BUFFER_NEXT_PRESENT;
-    }
-
-  if (sm->last_buffer != ~0)
-    {
-      last = vlib_get_buffer (vm, sm->last_buffer);
-      s->buffer = vlib_buffer_get_current (last);
-      s->current_buffer_index = 0;
-      ASSERT (last->current_data == s->current_buffer_index);
-    }
-}
-
-static void
-vlib_serialize_rx (serialize_main_header_t * m, serialize_stream_t * s)
-{
-  vlib_main_t *vm;
-  vlib_serialize_buffer_main_t *sm;
-  vlib_buffer_t *last;
-
-  sm =
-    uword_to_pointer (s->data_function_opaque,
-		      vlib_serialize_buffer_main_t *);
-  vm = sm->vlib_main;
-
-  if (serialize_stream_is_end_of_stream (s))
-    return;
-
-  if (sm->last_buffer != ~0)
-    {
-      last = vlib_get_buffer (vm, sm->last_buffer);
-
-      if (last->flags & VLIB_BUFFER_NEXT_PRESENT)
-	sm->last_buffer = last->next_buffer;
-      else
-	{
-	  vlib_buffer_free (vm, &sm->first_buffer, /* count */ 1);
-	  sm->first_buffer = sm->last_buffer = ~0;
-	}
-    }
-
-  if (sm->last_buffer == ~0)
-    {
-      while (clib_fifo_elts (sm->rx.buffer_fifo) == 0)
-	{
-	  sm->rx.ready_one_time_event =
-	    vlib_process_create_one_time_event (vm, vlib_current_process (vm),
-						~0);
-	  vlib_process_wait_for_one_time_event (vm, /* no event data */ 0,
-						sm->rx.ready_one_time_event);
-	}
-
-      clib_fifo_sub1 (sm->rx.buffer_fifo, sm->first_buffer);
-      sm->last_buffer = sm->first_buffer;
-    }
-
-  ASSERT (sm->last_buffer != ~0);
-
-  last = vlib_get_buffer (vm, sm->last_buffer);
-  s->current_buffer_index = 0;
-  s->buffer = vlib_buffer_get_current (last);
-  s->n_buffer_bytes = last->current_length;
-}
-
-static void
-serialize_open_vlib_helper (serialize_main_t * m,
-			    vlib_main_t * vm,
-			    vlib_serialize_buffer_main_t * sm, uword is_read)
-{
-  /* Initialize serialize main but save overflow buffer for re-use between calls. */
-  {
-    u8 *save = m->stream.overflow_buffer;
-    memset (m, 0, sizeof (m[0]));
-    m->stream.overflow_buffer = save;
-    if (save)
-      _vec_len (save) = 0;
-  }
-
-  sm->first_buffer = sm->last_buffer = ~0;
-  if (is_read)
-    clib_fifo_reset (sm->rx.buffer_fifo);
-  else
-    sm->tx.n_total_data_bytes = 0;
-  sm->vlib_main = vm;
-  m->header.data_function = is_read ? vlib_serialize_rx : vlib_serialize_tx;
-  m->stream.data_function_opaque = pointer_to_uword (sm);
-}
-
-void
-serialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm,
-			    vlib_serialize_buffer_main_t * sm)
-{
-  serialize_open_vlib_helper (m, vm, sm, /* is_read */ 0);
-}
-
-void
-unserialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm,
-			      vlib_serialize_buffer_main_t * sm)
-{
-  serialize_open_vlib_helper (m, vm, sm, /* is_read */ 1);
-}
-
-u32
-serialize_close_vlib_buffer (serialize_main_t * m)
-{
-  vlib_serialize_buffer_main_t *sm
-    = uword_to_pointer (m->stream.data_function_opaque,
-			vlib_serialize_buffer_main_t *);
-  vlib_buffer_t *last;
-  serialize_stream_t *s = &m->stream;
-
-  last = vlib_get_buffer (sm->vlib_main, sm->last_buffer);
-  last->current_length = s->current_buffer_index;
-
-  if (vec_len (s->overflow_buffer) > 0)
-    {
-      sm->last_buffer
-	= vlib_buffer_add_data (sm->vlib_main, sm->tx.free_list_index,
-				sm->last_buffer == ~0 ? 0 : sm->last_buffer,
-				s->overflow_buffer,
-				vec_len (s->overflow_buffer));
-      _vec_len (s->overflow_buffer) = 0;
-    }
-
-  return sm->first_buffer;
-}
-
-void
-unserialize_close_vlib_buffer (serialize_main_t * m)
-{
-  vlib_serialize_buffer_main_t *sm
-    = uword_to_pointer (m->stream.data_function_opaque,
-			vlib_serialize_buffer_main_t *);
-  if (sm->first_buffer != ~0)
-    vlib_buffer_free_one (sm->vlib_main, sm->first_buffer);
-  clib_fifo_reset (sm->rx.buffer_fifo);
-  if (m->stream.overflow_buffer)
-    _vec_len (m->stream.overflow_buffer) = 0;
-}
 
 static u8 *
 format_vlib_buffer_free_list (u8 * s, va_list * va)
 {
   vlib_buffer_free_list_t *f = va_arg (*va, vlib_buffer_free_list_t *);
-#if DPDK > 0
   u32 threadnum = va_arg (*va, u32);
   uword bytes_alloc, bytes_free, n_free, size;
 
@@ -1877,21 +1275,6 @@ format_vlib_buffer_free_list (u8 * s, va_list * va)
   bytes_free = size * n_free;
 
   s = format (s, "%7d%30s%12d%12d%=12U%=12U%=12d%=12d", threadnum,
-#else
-  uword bytes_alloc, bytes_free, n_free, size;
-
-  if (!f)
-    return format (s, "%=30s%=12s%=12s%=12s%=12s%=12s%=12s",
-		   "Name", "Index", "Size", "Alloc", "Free", "#Alloc",
-		   "#Free");
-
-  size = sizeof (vlib_buffer_t) + f->n_data_bytes;
-  n_free = vec_len (f->aligned_buffers) + vec_len (f->unaligned_buffers);
-  bytes_alloc = size * f->n_alloc;
-  bytes_free = size * n_free;
-
-  s = format (s, "%30s%12d%12d%=12U%=12U%=12d%=12d",
-#endif
 	      f->name, f->index, f->n_data_bytes,
 	      format_memory_size, bytes_alloc,
 	      format_memory_size, bytes_free, f->n_alloc, n_free);
@@ -1903,7 +1286,6 @@ static clib_error_t *
 show_buffers (vlib_main_t * vm,
 	      unformat_input_t * input, vlib_cli_command_t * cmd)
 {
-#if DPDK > 0
   vlib_buffer_main_t *bm;
   vlib_buffer_free_list_t *f;
   vlib_main_t *curr_vm;
@@ -1926,18 +1308,6 @@ show_buffers (vlib_main_t * vm,
     }
   while (vm_index < vec_len (vlib_mains));
 
-#else
-  vlib_buffer_main_t *bm = vm->buffer_main;
-  vlib_buffer_free_list_t *f;
-
-  vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, 0);
-  /* *INDENT-OFF* */
-  pool_foreach (f, bm->buffer_free_list_pool, ({
-    vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, f);
-  }));
-/* *INDENT-ON* */
-
-#endif
   return 0;
 }
 
@@ -1949,34 +1319,38 @@ VLIB_CLI_COMMAND (show_buffers_command, static) = {
 };
 /* *INDENT-ON* */
 
-#if DPDK > 0
-#if CLIB_DEBUG > 0
-
-u32 *vlib_buffer_state_validation_lock;
-uword *vlib_buffer_state_validation_hash;
-void *vlib_buffer_state_heap;
-
-static clib_error_t *
-buffer_state_validation_init (vlib_main_t * vm)
+void
+vlib_buffer_cb_init (struct vlib_main_t *vm)
 {
-  void *oldheap;
-
-  vlib_buffer_state_heap = mheap_alloc (0, 10 << 20);
-
-  oldheap = clib_mem_set_heap (vlib_buffer_state_heap);
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  bm->cb.vlib_buffer_alloc_cb = &vlib_buffer_alloc_internal;
+  bm->cb.vlib_buffer_alloc_from_free_list_cb =
+    &vlib_buffer_alloc_from_free_list_internal;
+  bm->cb.vlib_buffer_free_cb = &vlib_buffer_free_internal;
+  bm->cb.vlib_buffer_free_no_next_cb = &vlib_buffer_free_no_next_internal;
+  bm->cb.vlib_buffer_delete_free_list_cb =
+    &vlib_buffer_delete_free_list_internal;
+  bm->extern_buffer_mgmt = 0;
+}
 
-  vlib_buffer_state_validation_hash = hash_create (0, sizeof (uword));
-  vec_validate_aligned (vlib_buffer_state_validation_lock, 0,
-			CLIB_CACHE_LINE_BYTES);
-  clib_mem_set_heap (oldheap);
+int
+vlib_buffer_cb_register (struct vlib_main_t *vm, vlib_buffer_callbacks_t * cb)
+{
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  if (bm->extern_buffer_mgmt)
+    return -1;
+
+#define _(x) bm->cb.x = cb->x
+  _(vlib_buffer_alloc_cb);
+  _(vlib_buffer_alloc_from_free_list_cb);
+  _(vlib_buffer_free_cb);
+  _(vlib_buffer_free_no_next_cb);
+  _(vlib_buffer_delete_free_list_cb);
+#undef _
+  bm->extern_buffer_mgmt = 1;
   return 0;
 }
 
-VLIB_INIT_FUNCTION (buffer_state_validation_init);
-#endif
-#endif
-
-
 /** @endcond */
 /*
  * fd.io coding-style-patch-verification: ON
diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h
index 5f1e62f0..d270c08a 100644
--- a/src/vlib/buffer.h
+++ b/src/vlib/buffer.h
@@ -46,15 +46,9 @@
 #include <vppinfra/vector.h>
 #include <vlib/error.h>		/* for vlib_error_t */
 
-#if DPDK > 0
-#include <rte_config.h>
-#define VLIB_BUFFER_DATA_SIZE		(2048)
-#define VLIB_BUFFER_PRE_DATA_SIZE	RTE_PKTMBUF_HEADROOM
-#else
 #include <vlib/config.h>	/* for __PRE_DATA_SIZE */
-#define VLIB_BUFFER_DATA_SIZE		(512)
+#define VLIB_BUFFER_DATA_SIZE		(2048)
 #define VLIB_BUFFER_PRE_DATA_SIZE	__PRE_DATA_SIZE
-#endif
 
 #if defined (CLIB_HAVE_VEC128) || defined (__aarch64__)
 typedef u8x16 vlib_copy_unit_t;
@@ -296,6 +290,27 @@ typedef struct vlib_buffer_free_list_t
   uword buffer_init_function_opaque;
 } __attribute__ ((aligned (16))) vlib_buffer_free_list_t;
 
+typedef struct
+{
+  u32 (*vlib_buffer_alloc_cb) (struct vlib_main_t * vm, u32 * buffers,
+			       u32 n_buffers);
+  u32 (*vlib_buffer_alloc_from_free_list_cb) (struct vlib_main_t * vm,
+					      u32 * buffers, u32 n_buffers,
+					      u32 free_list_index);
+  void (*vlib_buffer_free_cb) (struct vlib_main_t * vm, u32 * buffers,
+			       u32 n_buffers);
+  void (*vlib_buffer_free_no_next_cb) (struct vlib_main_t * vm, u32 * buffers,
+				       u32 n_buffers);
+  void (*vlib_packet_template_init_cb) (struct vlib_main_t * vm, void *t,
+					void *packet_data,
+					uword n_packet_data_bytes,
+					uword
+					min_n_buffers_each_physmem_alloc,
+					u8 * name);
+  void (*vlib_buffer_delete_free_list_cb) (struct vlib_main_t * vm,
+					   u32 free_list_index);
+} vlib_buffer_callbacks_t;
+
 typedef struct
 {
   /* Buffer free callback, for subversive activities */
@@ -323,12 +338,15 @@ typedef struct
   /* List of free-lists needing Blue Light Special announcements */
   vlib_buffer_free_list_t **announce_list;
 
-  /*  Vector of rte_mempools per socket */
-#if DPDK == 1
-  struct rte_mempool **pktmbuf_pools;
-#endif
+  /* Callbacks */
+  vlib_buffer_callbacks_t cb;
+  int extern_buffer_mgmt;
 } vlib_buffer_main_t;
 
+void vlib_buffer_cb_init (struct vlib_main_t *vm);
+int vlib_buffer_cb_register (struct vlib_main_t *vm,
+			     vlib_buffer_callbacks_t * cb);
+
 typedef struct
 {
   struct vlib_main_t *vlib_main;
@@ -385,11 +403,6 @@ serialize_vlib_buffer_n_bytes (serialize_main_t * m)
     vec_len (s->overflow_buffer);
 }
 
-#if DPDK > 0
-#define rte_mbuf_from_vlib_buffer(x) (((struct rte_mbuf *)x) - 1)
-#define vlib_buffer_from_rte_mbuf(x) ((vlib_buffer_t *)(x+1))
-#endif
-
 /*
  */
 
diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h
index 75716eca..15d93c16 100644
--- a/src/vlib/buffer_funcs.h
+++ b/src/vlib/buffer_funcs.h
@@ -195,8 +195,6 @@ do {                                                             \
 } while (0)
 #endif
 
-#if DPDK == 0
-
 typedef enum
 {
   /* Index is unknown. */
@@ -232,8 +230,6 @@ vlib_buffer_set_known_state (vlib_main_t * vm,
 u8 *vlib_validate_buffer (vlib_main_t * vm, u32 buffer_index,
 			  uword follow_chain);
 
-#endif /* DPDK == 0 */
-
 clib_error_t *vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs,
 				       unsigned socket_id);
 
@@ -245,7 +241,15 @@ clib_error_t *vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs,
     @return - (u32) number of buffers actually allocated, may be
     less than the number requested or zero
 */
-u32 vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers);
+always_inline u32
+vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers)
+{
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  /* An allocation callback must be installed in the buffer main. */
+  ASSERT (bm->cb.vlib_buffer_alloc_cb);
+  /* Delegate to that callback and hand its return value back unchanged. */
+  return bm->cb.vlib_buffer_alloc_cb (vm, buffers, n_buffers);
+}
 
 always_inline u32
 vlib_buffer_round_size (u32 size)
@@ -261,9 +265,18 @@ vlib_buffer_round_size (u32 size)
     @return - (u32) number of buffers actually allocated, may be
     less than the number requested or zero
 */
-u32 vlib_buffer_alloc_from_free_list (vlib_main_t * vm,
-				      u32 * buffers,
-				      u32 n_buffers, u32 free_list_index);
+always_inline u32
+vlib_buffer_alloc_from_free_list (vlib_main_t * vm,
+				  u32 * buffers,
+				  u32 n_buffers, u32 free_list_index)
+{
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  /* A per-free-list allocation callback must be installed. */
+  ASSERT (bm->cb.vlib_buffer_alloc_from_free_list_cb);
+  /* Delegate to that callback and hand its return value back unchanged. */
+  return bm->cb.vlib_buffer_alloc_from_free_list_cb (vm, buffers, n_buffers,
+						     free_list_index);
+}
 
 /** \brief Free buffers
     Frees the entire buffer chain for each buffer
@@ -273,11 +286,19 @@ u32 vlib_buffer_alloc_from_free_list (vlib_main_t * vm,
     @param n_buffers - (u32) number of buffers to free
 
 */
-void vlib_buffer_free (vlib_main_t * vm,
-		       /* pointer to first buffer */
-		       u32 * buffers,
-		       /* number of buffers to free */
-		       u32 n_buffers);
+always_inline void
+vlib_buffer_free (vlib_main_t * vm,
+		  /* pointer to first buffer */
+		  u32 * buffers,
+		  /* number of buffers to free */
+		  u32 n_buffers)
+{
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  /* A free callback must be installed in the buffer main. */
+  ASSERT (bm->cb.vlib_buffer_free_cb);
+  /* Plain call: a void function must not 'return' an expression in C. */
+  bm->cb.vlib_buffer_free_cb (vm, buffers, n_buffers);
+}
 
 /** \brief Free buffers, does not free the buffer chain for each buffer
 
@@ -286,11 +307,19 @@ void vlib_buffer_free (vlib_main_t * vm,
     @param n_buffers - (u32) number of buffers to free
 
 */
-void vlib_buffer_free_no_next (vlib_main_t * vm,
-			       /* pointer to first buffer */
-			       u32 * buffers,
-			       /* number of buffers to free */
-			       u32 n_buffers);
+always_inline void
+vlib_buffer_free_no_next (vlib_main_t * vm,
+			  /* pointer to first buffer */
+			  u32 * buffers,
+			  /* number of buffers to free */
+			  u32 n_buffers)
+{
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  /* A free-no-next callback must be installed in the buffer main. */
+  ASSERT (bm->cb.vlib_buffer_free_no_next_cb);
+  /* Plain call: a void function must not 'return' an expression in C. */
+  bm->cb.vlib_buffer_free_no_next_cb (vm, buffers, n_buffers);
+}
 
 /** \brief Free one buffer
     Shorthand to free a single buffer chain.
@@ -307,7 +336,15 @@ vlib_buffer_free_one (vlib_main_t * vm, u32 buffer_index)
 /* Add/delete buffer free lists. */
 u32 vlib_buffer_create_free_list (vlib_main_t * vm, u32 n_data_bytes,
 				  char *fmt, ...);
-void vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index);
+always_inline void
+vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index)
+{
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  /* A delete-free-list callback must be installed in the buffer main. */
+  ASSERT (bm->cb.vlib_buffer_delete_free_list_cb);
+  /* Delegate the deletion of free_list_index to that callback. */
+  bm->cb.vlib_buffer_delete_free_list_cb (vm, free_list_index);
+}
 
 /* Find already existing public free list with given size or create one. */
 u32 vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes,
@@ -453,11 +490,6 @@ vlib_buffer_copy (vlib_main_t * vm, vlib_buffer_t * b)
   return fd;
 }
 
-/*
- * vlib_buffer_chain_* functions provide a way to create long buffers.
- * When DPDK is enabled, the 'hidden' DPDK header is taken care of transparently.
- */
-
 /* Initializes the buffer as an empty packet with no chained buffers. */
 always_inline void
 vlib_buffer_chain_init (vlib_buffer_t * first)
@@ -537,8 +569,6 @@ typedef struct
   /* Vector of packet data. */
   u8 *packet_data;
 
-  /* Note: the next three fields are unused if DPDK == 1 */
-
   /* Number of buffers to allocate in each call to physmem
      allocator. */
   u32 min_n_buffers_each_physmem_alloc;
diff --git a/src/vlib/buffer_serialize.c b/src/vlib/buffer_serialize.c
new file mode 100644
index 00000000..96a5f0a0
--- /dev/null
+++ b/src/vlib/buffer_serialize.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * buffer_serialize.c: serialize/unserialize via chains of vlib buffers.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+
+static void
+vlib_serialize_tx (serialize_main_header_t * m, serialize_stream_t * s)
+{
+  vlib_main_t *vm;
+  vlib_serialize_buffer_main_t *sm;
+  uword n, n_bytes_to_write;
+  vlib_buffer_t *last;
+  /* Stream data function for writing; sm comes from the stream's opaque. */
+  n_bytes_to_write = s->current_buffer_index;
+  sm =
+    uword_to_pointer (s->data_function_opaque,
+		      vlib_serialize_buffer_main_t *);
+  vm = sm->vlib_main;
+
+  ASSERT (sm->tx.max_n_data_bytes_per_chain > 0);
+  if (serialize_stream_is_end_of_stream (s)
+      || sm->tx.n_total_data_bytes + n_bytes_to_write >
+      sm->tx.max_n_data_bytes_per_chain)
+    {
+      vlib_process_t *p = vlib_get_current_process (vm);
+      /* Stream end or limit hit: finalize tail, hand chain to next_index. */
+      last = vlib_get_buffer (vm, sm->last_buffer);
+      last->current_length = n_bytes_to_write;
+
+      vlib_set_next_frame_buffer (vm, &p->node_runtime, sm->tx.next_index,
+				  sm->first_buffer);
+      /* Forget the chain that was just handed off. */
+      sm->first_buffer = sm->last_buffer = ~0;
+      sm->tx.n_total_data_bytes = 0;
+    }
+  /* Nothing written and no buffer space yet: allocate the first buffer. */
+  else if (n_bytes_to_write == 0 && s->n_buffer_bytes == 0)
+    {
+      ASSERT (sm->first_buffer == ~0);
+      ASSERT (sm->last_buffer == ~0);
+      n =
+	vlib_buffer_alloc_from_free_list (vm, &sm->first_buffer, 1,
+					  sm->tx.free_list_index);
+      if (n != 1)
+	serialize_error (m,
+			 clib_error_create
+			 ("vlib_buffer_alloc_from_free_list fails"));
+      sm->last_buffer = sm->first_buffer;
+      s->n_buffer_bytes =
+	vlib_buffer_free_list_buffer_size (vm, sm->tx.free_list_index);
+    }
+  /* Link in a new tail. NOTE(review): last_buffer is ~0 after a flush. */
+  if (n_bytes_to_write > 0)
+    {
+      vlib_buffer_t *prev = vlib_get_buffer (vm, sm->last_buffer);
+      n =
+	vlib_buffer_alloc_from_free_list (vm, &sm->last_buffer, 1,
+					  sm->tx.free_list_index);
+      if (n != 1)
+	serialize_error (m,
+			 clib_error_create
+			 ("vlib_buffer_alloc_from_free_list fails"));
+      sm->tx.n_total_data_bytes += n_bytes_to_write;
+      prev->current_length = n_bytes_to_write;
+      prev->next_buffer = sm->last_buffer;
+      prev->flags |= VLIB_BUFFER_NEXT_PRESENT;
+    }
+  /* Point the stream at the start of the current tail buffer, if any. */
+  if (sm->last_buffer != ~0)
+    {
+      last = vlib_get_buffer (vm, sm->last_buffer);
+      s->buffer = vlib_buffer_get_current (last);
+      s->current_buffer_index = 0;
+      ASSERT (last->current_data == s->current_buffer_index);
+    }
+}
+
+static void
+vlib_serialize_rx (serialize_main_header_t * m, serialize_stream_t * s)
+{
+  vlib_main_t *vm;
+  vlib_serialize_buffer_main_t *sm;
+  vlib_buffer_t *last;
+  /* Stream data function for reading; sm comes from the stream's opaque. */
+  sm =
+    uword_to_pointer (s->data_function_opaque,
+		      vlib_serialize_buffer_main_t *);
+  vm = sm->vlib_main;
+
+  if (serialize_stream_is_end_of_stream (s))
+    return;
+  /* Step to the next buffer of the chain, or free the chain when done. */
+  if (sm->last_buffer != ~0)
+    {
+      last = vlib_get_buffer (vm, sm->last_buffer);
+
+      if (last->flags & VLIB_BUFFER_NEXT_PRESENT)
+	sm->last_buffer = last->next_buffer;
+      else
+	{
+	  vlib_buffer_free (vm, &sm->first_buffer, /* count */ 1);
+	  sm->first_buffer = sm->last_buffer = ~0;
+	}
+    }
+  /* No chain in hand: wait on a one-time event until the fifo has one. */
+  if (sm->last_buffer == ~0)
+    {
+      while (clib_fifo_elts (sm->rx.buffer_fifo) == 0)
+	{
+	  sm->rx.ready_one_time_event =
+	    vlib_process_create_one_time_event (vm, vlib_current_process (vm),
+						~0);
+	  vlib_process_wait_for_one_time_event (vm, /* no event data */ 0,
+						sm->rx.ready_one_time_event);
+	}
+
+      clib_fifo_sub1 (sm->rx.buffer_fifo, sm->first_buffer);
+      sm->last_buffer = sm->first_buffer;
+    }
+
+  ASSERT (sm->last_buffer != ~0);
+  /* Expose the buffer's current data and length to the stream. */
+  last = vlib_get_buffer (vm, sm->last_buffer);
+  s->current_buffer_index = 0;
+  s->buffer = vlib_buffer_get_current (last);
+  s->n_buffer_bytes = last->current_length;
+}
+
+static void
+serialize_open_vlib_helper (serialize_main_t * m,
+			    vlib_main_t * vm,
+			    vlib_serialize_buffer_main_t * sm, uword is_read)
+{
+  /* Initialize serialize main but save overflow buffer for re-use between calls. */
+  {
+    u8 *save = m->stream.overflow_buffer;
+    memset (m, 0, sizeof (m[0]));
+    m->stream.overflow_buffer = save;
+    if (save)
+      _vec_len (save) = 0;
+  }
+  /* No chain yet (~0); reset the direction being opened and hook up sm. */
+  sm->first_buffer = sm->last_buffer = ~0;
+  if (is_read)
+    clib_fifo_reset (sm->rx.buffer_fifo);
+  else
+    sm->tx.n_total_data_bytes = 0;
+  sm->vlib_main = vm;
+  m->header.data_function = is_read ? vlib_serialize_rx : vlib_serialize_tx;
+  m->stream.data_function_opaque = pointer_to_uword (sm);
+}
+
+void
+serialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm,
+			    vlib_serialize_buffer_main_t * sm)
+{
+  serialize_open_vlib_helper (m, vm, sm, /* is_read = 0: write (tx) side */ 0);
+}
+
+void
+unserialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm,
+			      vlib_serialize_buffer_main_t * sm)
+{
+  serialize_open_vlib_helper (m, vm, sm, /* is_read = 1: read (rx) side */ 1);
+}
+
+u32
+serialize_close_vlib_buffer (serialize_main_t * m)
+{
+  vlib_serialize_buffer_main_t *sm
+    = uword_to_pointer (m->stream.data_function_opaque,
+			vlib_serialize_buffer_main_t *);
+  serialize_stream_t *s = &m->stream;
+  /* Finish a write: set the tail buffer's length unless there is none (~0). */
+  if (sm->last_buffer != ~0)
+    vlib_get_buffer (sm->vlib_main, sm->last_buffer)->current_length =
+      s->current_buffer_index;
+  /* Move any bytes left in the stream's overflow vector into the chain. */
+  if (vec_len (s->overflow_buffer) > 0)
+    {
+      sm->last_buffer
+	= vlib_buffer_add_data (sm->vlib_main, sm->tx.free_list_index,
+				sm->last_buffer == ~0 ? 0 : sm->last_buffer,
+				s->overflow_buffer,
+				vec_len (s->overflow_buffer));
+      _vec_len (s->overflow_buffer) = 0;
+    }
+
+  return sm->first_buffer;
+}
+/* Finish a read: free the chain in hand, empty the rx fifo and overflow. */
+void
+unserialize_close_vlib_buffer (serialize_main_t * m)
+{
+  vlib_serialize_buffer_main_t *sm
+    = uword_to_pointer (m->stream.data_function_opaque,
+			vlib_serialize_buffer_main_t *);
+  if (sm->first_buffer != ~0)
+    vlib_buffer_free_one (sm->vlib_main, sm->first_buffer);
+  clib_fifo_reset (sm->rx.buffer_fifo);
+  if (m->stream.overflow_buffer)
+    _vec_len (m->stream.overflow_buffer) = 0;
+}
+
+/** @endcond */
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/main.c b/src/vlib/main.c
index 6c6cad98..09f34bbd 100644
--- a/src/vlib/main.c
+++ b/src/vlib/main.c
@@ -465,7 +465,7 @@ vlib_put_next_frame (vlib_main_t * vm,
   vlib_frame_t *f;
   u32 n_vectors_in_frame;
 
-  if (DPDK == 0 && CLIB_DEBUG > 0)
+  if (vm->buffer_main->extern_buffer_mgmt == 0 && CLIB_DEBUG > 0)
     vlib_put_next_frame_validate (vm, r, next_index, n_vectors_left);
 
   nf = vlib_node_runtime_get_next_frame (vm, r, next_index);
@@ -1012,8 +1012,8 @@ dispatch_node (vlib_main_t * vm,
 
       /* When in interrupt mode and vector rate crosses threshold switch to
          polling mode. */
-      if ((DPDK == 0 && dispatch_state == VLIB_NODE_STATE_INTERRUPT)
-	  || (DPDK == 0 && dispatch_state == VLIB_NODE_STATE_POLLING
+      if ((dispatch_state == VLIB_NODE_STATE_INTERRUPT)
+	  || (dispatch_state == VLIB_NODE_STATE_POLLING
 	      && (node->flags
 		  & VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE)))
 	{
@@ -1615,6 +1615,7 @@ vlib_main (vlib_main_t * volatile vm, unformat_input_t * input)
     vm->name = "VLIB";
 
   vec_validate (vm->buffer_main, 0);
+  vlib_buffer_cb_init (vm);
 
   if ((error = vlib_thread_init (vm)))
     {
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index c5e58bc0..b3bbd30e 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -22,29 +22,10 @@
 #include <vlib/threads.h>
 #include <vlib/unix/cj.h>
 
-
-#if DPDK==1
-#include <rte_config.h>
-#include <rte_common.h>
-#include <rte_eal.h>
-#include <rte_launch.h>
-#include <rte_lcore.h>
-#endif
 DECLARE_CJ_GLOBAL_LOG;
 
 #define FRAME_QUEUE_NELTS 32
 
-
-#if DPDK==1
-/*
- *  Weak definitions of DPDK symbols used in this file.
- *  Needed for linking test programs without DPDK libs.
- */
-unsigned __thread __attribute__ ((weak)) RTE_PER_LCORE (_lcore_id);
-struct lcore_config __attribute__ ((weak)) lcore_config[];
-unsigned __attribute__ ((weak)) rte_socket_id ();
-int __attribute__ ((weak)) rte_eal_remote_launch ();
-#endif
 u32
 vl (void *p)
 {
@@ -194,14 +175,17 @@ vlib_thread_init (vlib_main_t * vm)
     tm->cpu_socket_bitmap = clib_bitmap_set (0, 0, 1);
 
   /* pin main thread to main_lcore  */
-#if DPDK==0
-  {
-    cpu_set_t cpuset;
-    CPU_ZERO (&cpuset);
-    CPU_SET (tm->main_lcore, &cpuset);
-    pthread_setaffinity_np (pthread_self (), sizeof (cpu_set_t), &cpuset);
-  }
-#endif
+  if (tm->cb.vlib_thread_set_lcore_cb)
+    {
+      tm->cb.vlib_thread_set_lcore_cb (0, tm->main_lcore);
+    }
+  else
+    {
+      cpu_set_t cpuset;
+      CPU_ZERO (&cpuset);
+      CPU_SET (tm->main_lcore, &cpuset);
+      pthread_setaffinity_np (pthread_self (), sizeof (cpu_set_t), &cpuset);
+    }
 
   /* as many threads as stacks... */
   vec_validate_aligned (vlib_worker_threads, vec_len (vlib_thread_stacks) - 1,
@@ -520,32 +504,29 @@ vlib_worker_thread_bootstrap_fn (void *arg)
   return rv;
 }
 
-static int
-vlib_launch_thread (void *fp, vlib_worker_thread_t * w, unsigned lcore_id)
+static clib_error_t *
+vlib_launch_thread_int (void *fp, vlib_worker_thread_t * w, unsigned lcore_id)
 {
+  vlib_thread_main_t *tm = &vlib_thread_main;
   void *(*fp_arg) (void *) = fp;
 
   w->lcore_id = lcore_id;
-#if DPDK==1
-  if (!w->registration->use_pthreads)
-    if (rte_eal_remote_launch)	/* do we have dpdk linked */
-      return rte_eal_remote_launch (fp, (void *) w, lcore_id);
-    else
-      return -1;
+  if (tm->cb.vlib_launch_thread_cb && !w->registration->use_pthreads)
+    return tm->cb.vlib_launch_thread_cb (fp, (void *) w, lcore_id);
   else
-#endif
     {
-      int ret;
       pthread_t worker;
       cpu_set_t cpuset;
       CPU_ZERO (&cpuset);
       CPU_SET (lcore_id, &cpuset);
 
-      ret = pthread_create (&worker, NULL /* attr */ , fp_arg, (void *) w);
-      if (ret == 0)
-	return pthread_setaffinity_np (worker, sizeof (cpu_set_t), &cpuset);
-      else
-	return ret;
+      if (pthread_create (&worker, NULL /* attr */ , fp_arg, (void *) w))
+	return clib_error_return_unix (0, "pthread_create");
+
+      if (pthread_setaffinity_np (worker, sizeof (cpu_set_t), &cpuset))
+	return clib_error_return_unix (0, "pthread_setaffinity_np");
+
+      return 0;
     }
 }
 
@@ -769,6 +750,7 @@ start_workers (vlib_main_t * vm)
 
   for (i = 0; i < vec_len (tm->registrations); i++)
     {
+      clib_error_t *err;
       int j;
 
       tr = tm->registrations[i];
@@ -778,22 +760,24 @@ start_workers (vlib_main_t * vm)
 	  for (j = 0; j < tr->count; j++)
 	    {
 	      w = vlib_worker_threads + worker_thread_index++;
-	      if (vlib_launch_thread (vlib_worker_thread_bootstrap_fn, w, 0) <
-		  0)
-		clib_warning ("Couldn't start '%s' pthread ", tr->name);
+	      err = vlib_launch_thread_int (vlib_worker_thread_bootstrap_fn,
+					    w, 0);
+	      if (err)
+		clib_error_report (err);
 	    }
 	}
       else
 	{
 	  uword c;
-            /* *INDENT-OFF* */
-            clib_bitmap_foreach (c, tr->coremask, ({
-              w = vlib_worker_threads + worker_thread_index++;
-              if (vlib_launch_thread (vlib_worker_thread_bootstrap_fn, w, c) < 0)
-                clib_warning ("Couldn't start DPDK lcore %d", c);
-
-            }));
-/* *INDENT-ON* */
+          /* *INDENT-OFF* */
+          clib_bitmap_foreach (c, tr->coremask, ({
+            w = vlib_worker_threads + worker_thread_index++;
+	    err = vlib_launch_thread_int (vlib_worker_thread_bootstrap_fn,
+					  w, c);
+	    if (err)
+	      clib_error_report (err);
+          }));
+          /* *INDENT-ON* */
 	}
     }
   vlib_worker_thread_barrier_sync (vm);
@@ -1105,7 +1089,7 @@ cpu_config (vlib_main_t * vm, unformat_input_t * input)
     {
       tm->n_thread_stacks += tr->count;
       tm->n_pthreads += tr->count * tr->use_pthreads;
-      tm->n_eal_threads += tr->count * (tr->use_pthreads == 0);
+      tm->n_threads += tr->count * (tr->use_pthreads == 0);
       tr = tr->next;
     }
 
@@ -1423,6 +1407,7 @@ void
 vlib_worker_thread_fn (void *arg)
 {
   vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
   vlib_main_t *vm = vlib_get_main ();
 
   ASSERT (vm->cpu_index == os_get_cpu_number ());
@@ -1431,12 +1416,9 @@ vlib_worker_thread_fn (void *arg)
   clib_time_init (&vm->clib_time);
   clib_mem_set_heap (w->thread_mheap);
 
-#if DPDK > 0
   /* Wait until the dpdk init sequence is complete */
-  vlib_thread_main_t *tm = vlib_get_thread_main ();
-  while (tm->worker_thread_release == 0)
+  while (tm->extern_thread_mgmt && tm->worker_thread_release == 0)
     vlib_worker_thread_barrier_check ();
-#endif
 
   vlib_worker_thread_internal (vm);
 }
@@ -1475,6 +1457,20 @@ vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts)
   return (fqm - tm->frame_queue_mains);
 }
 
+/* Install external thread callbacks once; returns -1 if already registered. */
+int
+vlib_thread_cb_register (struct vlib_main_t *vm, vlib_thread_callbacks_t * cb)
+{
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
+
+  if (tm->extern_thread_mgmt)
+    return -1;
+
+  tm->cb = *cb;			/* all hooks, incl. vlib_thread_set_lcore_cb */
+  tm->extern_thread_mgmt = 1;
+  return 0;
+}
+
 clib_error_t *
 threads_init (vlib_main_t * vm)
 {
diff --git a/src/vlib/threads.h b/src/vlib/threads.h
index 34ab5be8..75a5a281 100644
--- a/src/vlib/threads.h
+++ b/src/vlib/threads.h
@@ -263,6 +263,13 @@ typedef enum
     SCHED_POLICY_N,
 } sched_policy_t;
 
+typedef struct
+{
+  clib_error_t *(*vlib_launch_thread_cb) (void *fp, vlib_worker_thread_t * w,
+					  unsigned lcore_id);
+  clib_error_t *(*vlib_thread_set_lcore_cb) (u32 thread, u16 lcore);
+} vlib_thread_callbacks_t;
+
 typedef struct
 {
   /* Link list of registrations, built by constructors */
@@ -290,8 +297,8 @@ typedef struct
   /* Number of pthreads */
   u32 n_pthreads;
 
-  /* Number of DPDK eal threads */
-  u32 n_eal_threads;
+  /* Number of threads */
+  u32 n_threads;
 
   /* Number of cores to skip, must match the core mask */
   u32 skip_cores;
@@ -320,6 +327,9 @@ typedef struct
   /* scheduling policy priority */
   u32 sched_priority;
 
+  /* callbacks */
+  vlib_thread_callbacks_t cb;
+  int extern_thread_mgmt;
 } vlib_thread_main_t;
 
 extern vlib_thread_main_t vlib_thread_main;
@@ -459,6 +469,9 @@ vlib_get_worker_handoff_queue_elt (u32 frame_queue_index,
   return elt;
 }
 
+int vlib_thread_cb_register (struct vlib_main_t *vm,
+			     vlib_thread_callbacks_t * cb);
+
 #endif /* included_vlib_threads_h */
 
 /*
diff --git a/src/vlib/threads_cli.c b/src/vlib/threads_cli.c
index ee632279..b64028c4 100644
--- a/src/vlib/threads_cli.c
+++ b/src/vlib/threads_cli.c
@@ -20,14 +20,6 @@
 #include <vlib/threads.h>
 #include <vlib/unix/unix.h>
 
-#if DPDK==1
-#include <rte_config.h>
-#include <rte_common.h>
-#include <rte_eal.h>
-#include <rte_launch.h>
-#include <rte_lcore.h>
-#endif
-
 static u8 *
 format_sched_policy_and_priority (u8 * s, va_list * args)
 {
@@ -116,23 +108,6 @@ show_threads_fn (vlib_main_t * vm,
 	  vec_free (p);
 
 	  line = format (line, "%-7u%-7u%-7u%", lcore, core_id, socket_id);
-#if DPDK==1
-	  ASSERT (lcore <= RTE_MAX_LCORE);
-	  switch (lcore_config[lcore].state)
-	    {
-	    case WAIT:
-	      line = format (line, "wait");
-	      break;
-	    case RUNNING:
-	      line = format (line, "running");
-	      break;
-	    case FINISHED:
-	      line = format (line, "finished");
-	      break;
-	    default:
-	      line = format (line, "unknown");
-	    }
-#endif
 	}
       else
 	{
diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c
index 80ab7b9d..8d10ad2e 100644
--- a/src/vlib/unix/physmem.c
+++ b/src/vlib/unix/physmem.c
@@ -45,13 +45,13 @@ static void *
 unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes,
 			    uword alignment)
 {
+  vlib_main_t *vm = vlib_get_main ();
   physmem_main_t *pm = &physmem_main;
   uword lo_offset, hi_offset;
   uword *to_free = 0;
 
-#if DPDK > 0
-  clib_warning ("unsafe alloc!");
-#endif
+  if (vm->buffer_main->extern_buffer_mgmt)
+    clib_warning ("unsafe alloc!");
 
   /* IO memory is always at least cache aligned. */
   alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES);
@@ -269,16 +269,17 @@ static clib_error_t *
 show_physmem (vlib_main_t * vm,
 	      unformat_input_t * input, vlib_cli_command_t * cmd)
 {
-#if DPDK > 0
-  vlib_cli_output (vm, "Not supported with DPDK drivers.");
-#else
   physmem_main_t *pm = &physmem_main;
+  if (vm->buffer_main->extern_buffer_mgmt)
+    {
+      vlib_cli_output (vm, "Not supported with external buffer management.");
+      return 0;
+    }
 
   if (pm->heap)
     vlib_cli_output (vm, "%U", format_mheap, pm->heap, /* verbose */ 1);
   else
     vlib_cli_output (vm, "No physmem allocated.");
-#endif
   return 0;
 }
 
diff --git a/src/vnet.am b/src/vnet.am
index 665a16ea..47c5eda7 100644
--- a/src/vnet.am
+++ b/src/vnet.am
@@ -761,11 +761,13 @@ nobase_include_HEADERS +=			\
 ########################################
 if WITH_DPDK
 libvnet_la_SOURCES +=				\
+  vnet/devices/dpdk/buffer.c			\
   vnet/devices/dpdk/dpdk_priv.h		\
   vnet/devices/dpdk/device.c		\
   vnet/devices/dpdk/format.c		\
   vnet/devices/dpdk/init.c			\
   vnet/devices/dpdk/node.c			\
+  vnet/devices/dpdk/thread.c			\
   vnet/devices/dpdk/hqos.c			\
   vnet/devices/dpdk/cli.c			\
   vnet/devices/dpdk/dpdk_api.c
diff --git a/src/vnet/devices/dpdk/buffer.c b/src/vnet/devices/dpdk/buffer.c
new file mode 100644
index 00000000..214a9162
--- /dev/null
+++ b/src/vnet/devices/dpdk/buffer.c
@@ -0,0 +1,729 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * buffer.c: allocate/free network buffers.
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file
+ *
+ * Allocate/free network buffers.
+ */
+
+#include <rte_config.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_tailq.h>
+#include <rte_eal.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_atomic.h>
+#include <rte_cycles.h>
+#include <rte_prefetch.h>
+#include <rte_lcore.h>
+#include <rte_per_lcore.h>
+#include <rte_branch_prediction.h>
+#include <rte_interrupts.h>
+#include <rte_pci.h>
+#include <rte_random.h>
+#include <rte_debug.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_ring.h>
+#include <rte_mempool.h>
+#include <rte_mbuf.h>
+#include <rte_version.h>
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+#include <vnet/devices/dpdk/dpdk_priv.h>
+
+
+STATIC_ASSERT (VLIB_BUFFER_PRE_DATA_SIZE == RTE_PKTMBUF_HEADROOM,
+	       "VLIB_BUFFER_PRE_DATA_SIZE must be equal to RTE_PKTMBUF_HEADROOM");
+
+#define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32))
+
+/* Ensure the unaligned vector holds at least n_unaligned_buffers entries. */
+static void
+fill_unaligned (vlib_main_t * vm,
+		vlib_buffer_free_list_t * free_list,
+		uword n_unaligned_buffers)
+{
+  word n_aligned = vec_len (free_list->aligned_buffers);
+  word n_unaligned = vec_len (free_list->unaligned_buffers);
+
+  /* The aligned vector only ever holds whole copy units. */
+  ASSERT (n_aligned % BUFFERS_PER_COPY == 0);
+
+  ASSERT (n_aligned >= n_unaligned_buffers);
+
+  for (; n_unaligned < n_unaligned_buffers; n_unaligned += BUFFERS_PER_COPY)
+    {
+      /* Peel one copy unit (BUFFERS_PER_COPY indices) off the tail of the
+         aligned vector and append it to the unaligned vector. */
+      n_aligned -= BUFFERS_PER_COPY;
+      vec_add (free_list->unaligned_buffers,
+	       free_list->aligned_buffers + n_aligned, BUFFERS_PER_COPY);
+    }
+  /* Commit the shortened aligned length. */
+  _vec_len (free_list->aligned_buffers) = n_aligned;
+}
+
+/* Re-establish whole-copy-unit sizing of the aligned vector after frees. */
+static void
+trim_aligned (vlib_buffer_free_list_t * f)
+{
+  uword len, n_tail;
+
+  /* First fold any parked unaligned indices back into the aligned vector. */
+  len = vec_len (f->unaligned_buffers);
+  if (len > 0)
+    {
+      vec_add_aligned (f->aligned_buffers, f->unaligned_buffers, len,
+		       /* align */ sizeof (vlib_copy_unit_t));
+
+      _vec_len (f->unaligned_buffers) = 0;
+    }
+
+  /* Anything past a multiple of BUFFERS_PER_COPY is parked until next trim. */
+  len = vec_len (f->aligned_buffers);
+  n_tail = len % BUFFERS_PER_COPY;
+  if (n_tail == 0)
+    return;
+
+  /* Copy the odd tail to the unaligned vector ... */
+  len -= n_tail;
+  vec_add (f->unaligned_buffers, f->aligned_buffers + len, n_tail);
+  /* ... and shorten the aligned vector accordingly. */
+  _vec_len (f->aligned_buffers) = len;
+}
+
+static void
+merge_free_lists (vlib_buffer_free_list_t * dst,
+		  vlib_buffer_free_list_t * src)
+{
+  uword count;
+
+  /* Normalize both lists, then append everything src holds onto dst. */
+  trim_aligned (src);
+  trim_aligned (dst);
+  count = vec_len (src->aligned_buffers);
+  if (count > 0)
+    {
+      u32 *tail;
+      vec_add2_aligned (dst->aligned_buffers, tail, count,
+			/* align */ sizeof (vlib_copy_unit_t));
+      clib_memcpy (tail, src->aligned_buffers, count * sizeof (tail[0]));
+      vec_free (src->aligned_buffers);
+    }
+
+  count = vec_len (src->unaligned_buffers);
+  if (count > 0)
+    {
+      vec_add (dst->unaligned_buffers, src->unaligned_buffers, count);
+      vec_free (src->unaligned_buffers);
+    }
+}
+
+always_inline u32
+dpdk_buffer_get_free_list_with_size (vlib_main_t * vm, u32 size)
+{
+  /* Index of the free list registered for the rounded size, or ~0. */
+  uword *entry = hash_get (vm->buffer_main->free_list_by_size,
+			   vlib_buffer_round_size (size));
+
+  return entry ? entry[0] : ~0;
+}
+
+static void
+del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f)
+{
+  /* Hand every buffer back to DPDK: unaligned indices first, then the
+     aligned ones. */
+  u32 *vectors[2] = { f->unaligned_buffers, f->aligned_buffers };
+  u32 v;
+  u32 i;
+
+  for (v = 0; v < 2; v++)
+    for (i = 0; i < vec_len (vectors[v]); i++)
+      {
+	/* The rte_mbuf header sits immediately before the vlib buffer. */
+	vlib_buffer_t *b = vlib_get_buffer (vm, vectors[v][i]);
+	struct rte_mbuf *mb = rte_mbuf_from_vlib_buffer (b);
+
+	ASSERT (rte_mbuf_refcnt_read (mb) == 1);
+	rte_pktmbuf_free (mb);
+      }
+
+  /* Finally release the name and both index vectors themselves. */
+  vec_free (f->name);
+  vec_free (f->unaligned_buffers);
+  vec_free (f->aligned_buffers);
+}
+
+/* Delete a buffer free list everywhere; asserts it runs on cpu index 0. */
+static void
+dpdk_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index)
+{
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  vlib_buffer_free_list_t *f;
+  u32 merge_index;
+  int i;
+
+  ASSERT (os_get_cpu_number () == 0);
+  f = vlib_buffer_get_free_list (vm, free_list_index);
+
+  /* Hand our buffers to another list registered for the same size, if any. */
+  merge_index = dpdk_buffer_get_free_list_with_size (vm, f->n_data_bytes);
+  if (merge_index != ~0 && merge_index != free_list_index)
+    {
+      merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool,
+					   merge_index), f);
+    }
+
+  del_free_list (vm, f);
+
+  /* Poison it. */
+  memset (f, 0xab, sizeof (f[0]));
+  pool_put (bm->buffer_free_list_pool, f);
+
+  /* Poison and release the same pool slot in every other vlib_main. */
+  for (i = 1; i < vec_len (vlib_mains); i++)
+    {
+      bm = vlib_mains[i]->buffer_main;
+      f = vlib_buffer_get_free_list (vlib_mains[i], free_list_index);
+      memset (f, 0xab, sizeof (f[0]));
+      pool_put (bm->buffer_free_list_pool, f);
+    }
+}
+
+/* Make sure free list has at least given number of free buffers.
+   Returns 0 when this socket has no mempool yet or the bulk get fails,
+   min_free_buffers when enough aligned buffers are already queued, and
+   otherwise the number of buffers newly taken from the mempool. */
+static uword
+fill_free_list (vlib_main_t * vm,
+		vlib_buffer_free_list_t * fl, uword min_free_buffers)
+{
+  dpdk_main_t *dm = &dpdk_main;
+  vlib_buffer_t *b;
+  int n, i;
+  u32 bi;
+  unsigned socket_id = rte_socket_id ();
+  struct rte_mempool *rmp = dm->pktmbuf_pools[socket_id];
+  struct rte_mbuf *mb;
+
+  /* Too early? */
+  if (PREDICT_FALSE (rmp == 0))
+    return 0;
+
+  trim_aligned (fl);
+
+  /* Already have enough free buffers on free list? */
+  n = min_free_buffers - vec_len (fl->aligned_buffers);
+  if (n <= 0)
+    return min_free_buffers;
+
+  /* Always allocate round number of buffers. */
+  n = round_pow2 (n, BUFFERS_PER_COPY);
+
+  /* Always allocate new buffers in reasonably large sized chunks. */
+  n = clib_max (n, fl->min_n_buffers_each_physmem_alloc);
+
+  vec_validate (vm->mbuf_alloc_list, n - 1);
+
+  if (rte_mempool_get_bulk (rmp, vm->mbuf_alloc_list, n) < 0)
+    return 0;
+
+  _vec_len (vm->mbuf_alloc_list) = n;
+
+  for (i = 0; i < n; i++)
+    {
+      mb = vm->mbuf_alloc_list[i];
+
+      ASSERT (rte_mbuf_refcnt_read (mb) == 0);
+      rte_mbuf_refcnt_set (mb, 1);
+
+      b = vlib_buffer_from_rte_mbuf (mb);
+      bi = vlib_get_buffer_index (vm, b);
+
+      vec_add1_aligned (fl->aligned_buffers, bi, sizeof (vlib_copy_unit_t));
+
+      vlib_buffer_init_for_free_list (b, fl);
+
+      if (fl->buffer_init_function)
+	fl->buffer_init_function (vm, fl, &bi, 1);
+    }
+
+  fl->n_alloc += n;
+
+  return n;
+}
+
+always_inline uword
+copy_alignment (u32 * x)
+{				/* slot of x, in x[0]-sized units, within a copy unit */
+  return (pointer_to_uword (x) / sizeof (x[0])) % BUFFERS_PER_COPY;
+}
+
+static u32
+alloc_from_free_list (vlib_main_t * vm,
+		      vlib_buffer_free_list_t * free_list,
+		      u32 * alloc_buffers, u32 n_alloc_buffers)
+{
+  u32 *dst, *u_src;
+  uword u_len, n_left;
+  uword n_unaligned_start, n_unaligned_end, n_filled;
+  /* Copy order: unaligned head run, whole copy units, unaligned tail run. */
+  n_left = n_alloc_buffers;
+  dst = alloc_buffers;
+  n_unaligned_start = ((BUFFERS_PER_COPY - copy_alignment (dst))
+		       & (BUFFERS_PER_COPY - 1));
+  /* Top up the free list; return 0 if fill_free_list () reports nothing. */
+  n_filled = fill_free_list (vm, free_list, n_alloc_buffers);
+  if (n_filled == 0)
+    return 0;
+  /* Never hand out more than fill_free_list () reported. */
+  n_left = n_filled < n_left ? n_filled : n_left;
+  n_alloc_buffers = n_left;
+
+  if (n_unaligned_start >= n_left)
+    {
+      n_unaligned_start = n_left;
+      n_unaligned_end = 0;
+    }
+  else
+    n_unaligned_end = copy_alignment (dst + n_alloc_buffers);
+  /* Stage enough unaligned indices to cover the head and tail runs. */
+  fill_unaligned (vm, free_list, n_unaligned_start + n_unaligned_end);
+
+  u_len = vec_len (free_list->unaligned_buffers);
+  u_src = free_list->unaligned_buffers + u_len - 1;
+
+  if (n_unaligned_start)
+    {
+      uword n_copy = n_unaligned_start;
+      if (n_copy > n_left)
+	n_copy = n_left;
+      n_left -= n_copy;
+
+      while (n_copy > 0)
+	{
+	  *dst++ = *u_src--;
+	  n_copy--;
+	  u_len--;
+	}
+
+      /* Now dst should be aligned. */
+      if (n_left > 0)
+	ASSERT (pointer_to_uword (dst) % sizeof (vlib_copy_unit_t) == 0);
+    }
+
+  /* Aligned copy. */
+  {
+    vlib_copy_unit_t *d, *s;
+    uword n_copy;
+
+    if (vec_len (free_list->aligned_buffers) <
+	((n_left / BUFFERS_PER_COPY) * BUFFERS_PER_COPY))
+      abort ();
+
+    n_copy = n_left / BUFFERS_PER_COPY;
+    n_left = n_left % BUFFERS_PER_COPY;
+
+    /* Remove buffers from aligned free list. */
+    _vec_len (free_list->aligned_buffers) -= n_copy * BUFFERS_PER_COPY;
+
+    s = (vlib_copy_unit_t *) vec_end (free_list->aligned_buffers);
+    d = (vlib_copy_unit_t *) dst;
+
+    /* Fast path loop. */
+    while (n_copy >= 4)
+      {
+	d[0] = s[0];
+	d[1] = s[1];
+	d[2] = s[2];
+	d[3] = s[3];
+	n_copy -= 4;
+	s += 4;
+	d += 4;
+      }
+
+    while (n_copy >= 1)
+      {
+	d[0] = s[0];
+	n_copy -= 1;
+	s += 1;
+	d += 1;
+      }
+
+    dst = (void *) d;
+  }
+
+  /* Unaligned copy. */
+  ASSERT (n_unaligned_end == n_left);
+  while (n_left > 0)
+    {
+      *dst++ = *u_src--;
+      n_left--;
+      u_len--;
+    }
+
+  if (!free_list->unaligned_buffers)
+    ASSERT (u_len == 0);
+  else
+    _vec_len (free_list->unaligned_buffers) = u_len;
+
+  return n_alloc_buffers;
+}
+
+/* Allocate n_buffers buffer indices from the default free list into
+   buffers[].  Returns the count reported by alloc_from_free_list (),
+   which is zero when the free list could not be filled. */
+u32
+dpdk_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers)
+{
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  vlib_buffer_free_list_t *default_fl;
+
+  default_fl = pool_elt_at_index (bm->buffer_free_list_pool,
+				  VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+
+  return alloc_from_free_list (vm, default_fl, buffers, n_buffers);
+}
+
+
+u32
+dpdk_buffer_alloc_from_free_list (vlib_main_t * vm,
+				  u32 * buffers,
+				  u32 n_buffers, u32 free_list_index)
+{
+  /* Allocate from the free list selected by free_list_index. */
+  vlib_buffer_free_list_t *fl =
+    pool_elt_at_index (vm->buffer_main->buffer_free_list_pool, free_list_index);
+  return alloc_from_free_list (vm, fl, buffers, n_buffers);
+}
+
+always_inline void
+add_buffer_to_free_list (vlib_main_t * vm,
+			 vlib_buffer_free_list_t * f,
+			 u32 buffer_index, u8 do_init)
+{
+  /* Optionally re-initialize the buffer, then push its index onto f. */
+  vlib_buffer_t *buf = vlib_get_buffer (vm, buffer_index);
+  if (PREDICT_TRUE (do_init))
+    vlib_buffer_init_for_free_list (buf, f);
+  vec_add1_aligned (f->aligned_buffers, buffer_index,
+		    sizeof (vlib_copy_unit_t));
+}
+
+always_inline vlib_buffer_free_list_t *
+buffer_get_free_list (vlib_main_t * vm, vlib_buffer_t * b, u32 * index)
+{
+  /* Report the buffer's free list index and return the list itself. */
+  u32 fi = b->free_list_index;
+
+  *index = fi;
+  return pool_elt_at_index (vm->buffer_main->buffer_free_list_pool, fi);
+}
+
+static_always_inline void
+vlib_buffer_free_inline (vlib_main_t * vm,
+			 u32 * buffers, u32 n_buffers, u32 follow_buffer_next)
+{
+  vlib_buffer_main_t *bm = vm->buffer_main;
+  vlib_buffer_free_list_t *fl;
+  u32 fi;
+  int i;
+  u32 (*cb) (vlib_main_t * vm, u32 * buffers, u32 n_buffers,
+	     u32 follow_buffer_next);
+  /* An optional callback runs first; its return value replaces n_buffers. */
+  cb = bm->buffer_free_callback;
+
+  if (PREDICT_FALSE (cb != 0))
+    n_buffers = (*cb) (vm, buffers, n_buffers, follow_buffer_next);
+
+  if (!n_buffers)
+    return;
+  /* NOTE(review): follow_buffer_next is only forwarded to the callback. */
+  for (i = 0; i < n_buffers; i++)
+    {
+      vlib_buffer_t *b;
+      struct rte_mbuf *mb;
+
+      b = vlib_get_buffer (vm, buffers[i]);
+
+      fl = buffer_get_free_list (vm, b, &fi);
+
+      /* The only current use of this callback: multicast recycle */
+      if (PREDICT_FALSE (fl->buffers_added_to_freelist_function != 0))
+	{
+	  int j;
+
+	  add_buffer_to_free_list
+	    (vm, fl, buffers[i], (b->flags & VLIB_BUFFER_RECYCLE) == 0);
+
+	  for (j = 0; j < vec_len (bm->announce_list); j++)
+	    {
+	      if (fl == bm->announce_list[j])
+		goto already_announced;
+	    }
+	  vec_add1 (bm->announce_list, fl);
+	already_announced:
+	  ;
+	}
+      else
+	{
+	  if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_RECYCLE) == 0))
+	    {
+	      mb = rte_mbuf_from_vlib_buffer (b);
+	      ASSERT (rte_mbuf_refcnt_read (mb) == 1);
+	      rte_pktmbuf_free (mb);
+	    }
+	}
+    }
+  if (vec_len (bm->announce_list))
+    {
+      vlib_buffer_free_list_t *fl;
+      for (i = 0; i < vec_len (bm->announce_list); i++)
+	{
+	  fl = bm->announce_list[i];
+	  fl->buffers_added_to_freelist_function (vm, fl);
+	}
+      _vec_len (bm->announce_list) = 0;
+    }
+}
+
+static void
+dpdk_buffer_free (vlib_main_t * vm, u32 * buffers, u32 n_buffers)
+{
+  /* Free callback variant that passes follow_buffer_next = 1. */
+  vlib_buffer_free_inline (vm, buffers, n_buffers, 1);
+}
+
+static void
+dpdk_buffer_free_no_next (vlib_main_t * vm, u32 * buffers, u32 n_buffers)
+{
+  /* Free callback variant that passes follow_buffer_next = 0. */
+  vlib_buffer_free_inline (vm, buffers, n_buffers, 0);
+}
+
+static void
+dpdk_packet_template_init (vlib_main_t * vm,
+			   void *vt,
+			   void *packet_data,
+			   uword n_packet_data_bytes,
+			   uword min_n_buffers_each_physmem_alloc, u8 * name)
+{
+  vlib_packet_template_t *tmpl = vt;
+
+  /* Between barrier sync and release, reset the template and copy in the
+     packet bytes; min_n_buffers_each_physmem_alloc and name are unused. */
+  vlib_worker_thread_barrier_sync (vm);
+  memset (tmpl, 0, sizeof (*tmpl));
+  vec_add (tmpl->packet_data, packet_data, n_packet_data_bytes);
+  vlib_worker_thread_barrier_release (vm);
+}
+
+clib_error_t *
+vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs,
+			 unsigned socket_id)
+{
+  dpdk_main_t *dm = &dpdk_main;
+  vlib_physmem_main_t *vpm = &vm->physmem_main;
+  struct rte_mempool *rmp;
+  int i;
+  /* Returns 0 on success, including reuse of another socket's pool. */
+  vec_validate_aligned (dm->pktmbuf_pools, socket_id, CLIB_CACHE_LINE_BYTES);
+
+  /* pool already exists, nothing to do */
+  if (dm->pktmbuf_pools[socket_id])
+    return 0;
+
+  u8 *pool_name = format (0, "mbuf_pool_socket%u%c", socket_id, 0);
+
+  rmp = rte_pktmbuf_pool_create ((char *) pool_name,	/* pool name */
+				 num_mbufs,	/* number of mbufs */
+				 512,	/* cache size */
+				 VLIB_BUFFER_HDR_SIZE,	/* priv size */
+				 VLIB_BUFFER_PRE_DATA_SIZE + VLIB_BUFFER_DATA_SIZE,	/* dataroom size */
+				 socket_id);	/* cpu socket */
+
+  if (rmp)
+    {
+      {
+	uword this_pool_end;
+	uword this_pool_start;
+	uword this_pool_size;
+	uword save_vpm_start, save_vpm_end, save_vpm_size;
+	struct rte_mempool_memhdr *memhdr;
+
+	this_pool_start = ~0ULL;
+	this_pool_end = 0LL;
+	/* Find the lowest and highest addresses covered by the pool's memory. */
+	STAILQ_FOREACH (memhdr, &rmp->mem_list, next)
+	{
+	  if (((uword) (memhdr->addr + memhdr->len)) > this_pool_end)
+	    this_pool_end = (uword) (memhdr->addr + memhdr->len);
+	  if (((uword) memhdr->addr) < this_pool_start)
+	    this_pool_start = (uword) (memhdr->addr);
+	}
+	ASSERT (this_pool_start < ~0ULL && this_pool_end > 0);
+	this_pool_size = this_pool_end - this_pool_start;
+
+	if (CLIB_DEBUG > 1)
+	  {
+	    clib_warning ("%s: pool start %llx pool end %llx pool size %lld",
+			  pool_name, this_pool_start, this_pool_end,
+			  this_pool_size);
+	    clib_warning
+	      ("before: virtual.start %llx virtual.end %llx virtual.size %lld",
+	       vpm->virtual.start, vpm->virtual.end, vpm->virtual.size);
+	  }
+	/* Remember the current range so it can be restored on failure. */
+	save_vpm_start = vpm->virtual.start;
+	save_vpm_end = vpm->virtual.end;
+	save_vpm_size = vpm->virtual.size;
+	/* Grow the physmem virtual range to include this pool. */
+	if ((this_pool_start < vpm->virtual.start) || vpm->virtual.start == 0)
+	  vpm->virtual.start = this_pool_start;
+	if (this_pool_end > vpm->virtual.end)
+	  vpm->virtual.end = this_pool_end;
+
+	vpm->virtual.size = vpm->virtual.end - vpm->virtual.start;
+
+	if (CLIB_DEBUG > 1)
+	  {
+	    clib_warning
+	      ("after: virtual.start %llx virtual.end %llx virtual.size %lld",
+	       vpm->virtual.start, vpm->virtual.end, vpm->virtual.size);
+	  }
+
+	/* check if fits into buffer index range */
+	if ((u64) vpm->virtual.size >
+	    ((u64) 1 << (32 + CLIB_LOG2_CACHE_LINE_BYTES)))
+	  {
+	    clib_warning ("physmem: virtual size out of range!");
+	    vpm->virtual.start = save_vpm_start;
+	    vpm->virtual.end = save_vpm_end;
+	    vpm->virtual.size = save_vpm_size;
+	    rmp = 0;
+	  }
+      }
+      if (rmp)
+	{
+	  dm->pktmbuf_pools[socket_id] = rmp;
+	  vec_free (pool_name);
+	  return 0;
+	}
+    }
+
+  vec_free (pool_name);
+
+  /* no usable pool for this socket, try to use pool from another one */
+  for (i = 0; i < vec_len (dm->pktmbuf_pools); i++)
+    {
+      if (dm->pktmbuf_pools[i])
+	{
+	  clib_warning
+	    ("WARNING: Failed to allocate mempool for CPU socket %u. "
+	     "Threads running on socket %u will use socket %u mempool.",
+	     socket_id, socket_id, i);
+	  dm->pktmbuf_pools[socket_id] = dm->pktmbuf_pools[i];
+	  return 0;
+	}
+    }
+
+  return clib_error_return (0, "failed to allocate mempool on socket %u",
+			    socket_id);
+}
+
+#if CLIB_DEBUG > 0
+
+u32 *vlib_buffer_state_validation_lock;
+uword *vlib_buffer_state_validation_hash;
+void *vlib_buffer_state_heap;
+
+static clib_error_t *
+buffer_state_validation_init (vlib_main_t * vm)
+{
+  void *saved_heap;
+
+  /* Build the validation hash and lock vector on a dedicated heap, then
+     switch back to whichever heap was active on entry. */
+  vlib_buffer_state_heap = mheap_alloc (0, 10 << 20);
+  saved_heap = clib_mem_set_heap (vlib_buffer_state_heap);
+  vlib_buffer_state_validation_hash = hash_create (0, sizeof (uword));
+  vec_validate_aligned (vlib_buffer_state_validation_lock, 0,
+			CLIB_CACHE_LINE_BYTES);
+  clib_mem_set_heap (saved_heap);
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (buffer_state_validation_init);
+#endif
+
+static vlib_buffer_callbacks_t callbacks = {	/* passed to vlib_buffer_cb_register () */
+  .vlib_buffer_alloc_cb = &dpdk_buffer_alloc,
+  .vlib_buffer_alloc_from_free_list_cb = &dpdk_buffer_alloc_from_free_list,
+  .vlib_buffer_free_cb = &dpdk_buffer_free,
+  .vlib_buffer_free_no_next_cb = &dpdk_buffer_free_no_next,
+  .vlib_packet_template_init_cb = &dpdk_packet_template_init,
+  .vlib_buffer_delete_free_list_cb = &dpdk_buffer_delete_free_list,
+};
+
+static clib_error_t *
+dpdk_buffer_init (vlib_main_t * vm)
+{
+  vlib_buffer_cb_register (vm, &callbacks);	/* NOTE(review): result unchecked */
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (dpdk_buffer_init);
+
+/** @endcond */
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/devices/dpdk/cli.c b/src/vnet/devices/dpdk/cli.c
index 538a00fd..22bd4b4f 100644
--- a/src/vnet/devices/dpdk/cli.c
+++ b/src/vnet/devices/dpdk/cli.c
@@ -164,9 +164,9 @@ show_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input,
   struct rte_mempool *rmp;
   int i;
 
-  for (i = 0; i < vec_len (vm->buffer_main->pktmbuf_pools); i++)
+  for (i = 0; i < vec_len (dpdk_main.pktmbuf_pools); i++)
     {
-      rmp = vm->buffer_main->pktmbuf_pools[i];
+      rmp = dpdk_main.pktmbuf_pools[i];
       if (rmp)
 	{
 	  unsigned count = rte_mempool_avail_count (rmp);
diff --git a/src/vnet/devices/dpdk/device.c b/src/vnet/devices/dpdk/device.c
index b22fbf2e..0deab6aa 100644
--- a/src/vnet/devices/dpdk/device.c
+++ b/src/vnet/devices/dpdk/device.c
@@ -87,19 +87,18 @@ dpdk_set_mc_filter (vnet_hw_interface_t * hi,
 struct rte_mbuf *
 dpdk_replicate_packet_mb (vlib_buffer_t * b)
 {
-  vlib_main_t *vm = vlib_get_main ();
-  vlib_buffer_main_t *bm = vm->buffer_main;
+  dpdk_main_t *dm = &dpdk_main;
   struct rte_mbuf **mbufs = 0, *s, *d;
   u8 nb_segs;
   unsigned socket_id = rte_socket_id ();
   int i;
 
-  ASSERT (bm->pktmbuf_pools[socket_id]);
+  ASSERT (dm->pktmbuf_pools[socket_id]);
   s = rte_mbuf_from_vlib_buffer (b);
   nb_segs = s->nb_segs;
   vec_validate (mbufs, nb_segs - 1);
 
-  if (rte_pktmbuf_alloc_bulk (bm->pktmbuf_pools[socket_id], mbufs, nb_segs))
+  if (rte_pktmbuf_alloc_bulk (dm->pktmbuf_pools[socket_id], mbufs, nb_segs))
     {
       vec_free (mbufs);
       return 0;
diff --git a/src/vnet/devices/dpdk/dpdk.h b/src/vnet/devices/dpdk/dpdk.h
index e0436031..066ec6fa 100644
--- a/src/vnet/devices/dpdk/dpdk.h
+++ b/src/vnet/devices/dpdk/dpdk.h
@@ -425,6 +425,9 @@ typedef struct
   vlib_main_t *vlib_main;
   vnet_main_t *vnet_main;
   dpdk_config_main_t *conf;
+
+  /* mempool */
+  struct rte_mempool **pktmbuf_pools;
 } dpdk_main_t;
 
 dpdk_main_t dpdk_main;
diff --git a/src/vnet/devices/dpdk/dpdk_priv.h b/src/vnet/devices/dpdk/dpdk_priv.h
index 0c81dbc3..dd40ff48 100644
--- a/src/vnet/devices/dpdk/dpdk_priv.h
+++ b/src/vnet/devices/dpdk/dpdk_priv.h
@@ -13,6 +13,9 @@
  * limitations under the License.
  */
 
+#define rte_mbuf_from_vlib_buffer(x) (((struct rte_mbuf *)x) - 1)
+#define vlib_buffer_from_rte_mbuf(x) ((vlib_buffer_t *)(x+1))
+
 #define DPDK_NB_RX_DESC_DEFAULT   1024
 #define DPDK_NB_TX_DESC_DEFAULT   1024
 #define DPDK_NB_RX_DESC_VIRTIO    256
diff --git a/src/vnet/devices/dpdk/init.c b/src/vnet/devices/dpdk/init.c
index 60689463..4c040d20 100755
--- a/src/vnet/devices/dpdk/init.c
+++ b/src/vnet/devices/dpdk/init.c
@@ -64,8 +64,6 @@ static struct rte_eth_conf port_conf_template = {
 clib_error_t *
 dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd)
 {
-  vlib_main_t *vm = vlib_get_main ();
-  vlib_buffer_main_t *bm = vm->buffer_main;
   int rv;
   int j;
 
@@ -107,7 +105,7 @@ dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd)
 
       rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc,
 				   xd->cpu_socket, 0,
-				   bm->
+				   dm->
 				   pktmbuf_pools[xd->cpu_socket_id_by_queue
 						 [j]]);
 
@@ -115,7 +113,7 @@ dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd)
       if (rv < 0)
 	rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc,
 				     SOCKET_ID_ANY, 0,
-				     bm->
+				     dm->
 				     pktmbuf_pools[xd->cpu_socket_id_by_queue
 						   [j]]);
       if (rv < 0)
diff --git a/src/vnet/devices/dpdk/thread.c b/src/vnet/devices/dpdk/thread.c
new file mode 100644
index 00000000..475dd142
--- /dev/null
+++ b/src/vnet/devices/dpdk/thread.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <rte_config.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_tailq.h>
+#include <rte_eal.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_atomic.h>
+#include <rte_cycles.h>
+#include <rte_prefetch.h>
+#include <rte_lcore.h>
+#include <rte_per_lcore.h>
+#include <rte_branch_prediction.h>
+#include <rte_interrupts.h>
+#include <rte_pci.h>
+#include <rte_random.h>
+#include <rte_debug.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_ring.h>
+#include <rte_mempool.h>
+#include <rte_mbuf.h>
+#include <rte_version.h>
+
+#include <vlib/vlib.h>
+#include <vnet/vnet.h>
+#include <vnet/devices/dpdk/dpdk.h>
+#include <vnet/devices/dpdk/dpdk_priv.h>
+
+/* Pass fp and w to rte_eal_remote_launch() for lcore_id; non-zero => error. */
+static clib_error_t *
+dpdk_launch_thread (void *fp, vlib_worker_thread_t * w, unsigned lcore_id)
+{
+  if (rte_eal_remote_launch (fp, (void *) w, lcore_id) != 0)
+    return clib_error_return (0, "Failed to launch thread %u", lcore_id);
+
+  return 0;
+}
+
+static clib_error_t *
+dpdk_thread_set_lcore (u32 thread, u16 lcore)
+{
+  return 0;			/* no-op: both arguments are ignored, always returns 0 */
+}
+
+static vlib_thread_callbacks_t callbacks = {
+  .vlib_launch_thread_cb = &dpdk_launch_thread,	/* wraps rte_eal_remote_launch() */
+  .vlib_thread_set_lcore_cb = &dpdk_thread_set_lcore,	/* currently a no-op */
+};
+
+static clib_error_t *
+dpdk_thread_init (vlib_main_t * vm)
+{
+  vlib_thread_cb_register (vm, &callbacks);	/* hand the DPDK callback table to vlib */
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (dpdk_thread_init);
+
+/** @endcond */
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vnet/sr/sr_replicate.c b/src/vnet/sr/sr_replicate.c
index 5f9de504..fa5a68c3 100644
--- a/src/vnet/sr/sr_replicate.c
+++ b/src/vnet/sr/sr_replicate.c
@@ -30,6 +30,7 @@
 #include <vnet/pg/pg.h>
 #include <vnet/sr/sr.h>
 #include <vnet/devices/dpdk/dpdk.h>
+#include <vnet/devices/dpdk/dpdk_priv.h>
 #include <vnet/ip/ip.h>
 #include <vnet/fib/ip6_fib.h>
 
@@ -142,6 +143,7 @@ static uword
 sr_replicate_node_fn (vlib_main_t * vm,
 		      vlib_node_runtime_t * node, vlib_frame_t * frame)
 {
+  dpdk_main_t *dm = &dpdk_main;
   u32 n_left_from, *from, *to_next;
   sr_replicate_next_t next_index;
   int pkts_replicated = 0;
@@ -149,7 +151,6 @@ sr_replicate_node_fn (vlib_main_t * vm,
   int no_buffer_drops = 0;
   vlib_buffer_free_list_t *fl;
   unsigned socket_id = rte_socket_id ();
-  vlib_buffer_main_t *bm = vm->buffer_main;
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
@@ -246,13 +247,13 @@ sr_replicate_node_fn (vlib_main_t * vm,
 	      vlib_buffer_t *clone0_c, *clone_b0;
 
 	      t0 = vec_elt_at_index (sm->tunnels, pol0->tunnel_indices[i]);
-	      hdr_mb0 = rte_pktmbuf_alloc (bm->pktmbuf_pools[socket_id]);
+	      hdr_mb0 = rte_pktmbuf_alloc (dm->pktmbuf_pools[socket_id]);
 
 	      if (i < (num_replicas - 1))
 		{
 		  /* Not the last tunnel to process */
 		  clone0 = rte_pktmbuf_clone
-		    (orig_mb0, bm->pktmbuf_pools[socket_id]);
+		    (orig_mb0, dm->pktmbuf_pools[socket_id]);
 		  if (clone0 == 0)
 		    goto clone_fail;
 		  nb_seg = 0;
-- 
cgit 1.2.3-korg


From bd69a5f24c6e83e9101f203dd124864fb2877a17 Mon Sep 17 00:00:00 2001
From: Damjan Marion <damarion@cisco.com>
Date: Sun, 5 Feb 2017 23:44:42 +0100
Subject: vlib: remove algned/unaligned buffers scheme

Change-Id: I4433eaed3f4e201edc329c4842cbbf74beb19a9a
Signed-off-by: Damjan Marion <damarion@cisco.com>
---
 src/vlib/buffer.c              | 220 +++++------------------------------------
 src/vlib/buffer.h              |  13 +--
 src/vlib/buffer_funcs.h        |  53 +++-------
 src/vlib/threads.c             |   3 +-
 src/vnet/devices/dpdk/buffer.c | 131 +++---------------------
 src/vnet/replication.c         |  23 +----
 6 files changed, 57 insertions(+), 386 deletions(-)

(limited to 'src/vlib/threads.c')

diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c
index ea4960e2..95b4344f 100644
--- a/src/vlib/buffer.c
+++ b/src/vlib/buffer.c
@@ -304,63 +304,6 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm,
     }
 }
 
-#define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32))
-
-/* Make sure we have at least given number of unaligned buffers. */
-void
-vlib_buffer_free_list_fill_unaligned (vlib_main_t * vm,
-				      vlib_buffer_free_list_t * free_list,
-				      uword n_unaligned_buffers)
-{
-  word la = vec_len (free_list->aligned_buffers);
-  word lu = vec_len (free_list->unaligned_buffers);
-
-  /* Aligned come in aligned copy-sized chunks. */
-  ASSERT (la % BUFFERS_PER_COPY == 0);
-
-  ASSERT (la >= n_unaligned_buffers);
-
-  while (lu < n_unaligned_buffers)
-    {
-      /* Copy 4 buffers from end of aligned vector to unaligned vector. */
-      vec_add (free_list->unaligned_buffers,
-	       free_list->aligned_buffers + la - BUFFERS_PER_COPY,
-	       BUFFERS_PER_COPY);
-      la -= BUFFERS_PER_COPY;
-      lu += BUFFERS_PER_COPY;
-    }
-  _vec_len (free_list->aligned_buffers) = la;
-}
-
-/* After free aligned buffers may not contain even sized chunks. */
-void
-vlib_buffer_free_list_trim_aligned (vlib_buffer_free_list_t * f)
-{
-  uword l, n_trim;
-
-  /* Add unaligned to aligned before trim. */
-  l = vec_len (f->unaligned_buffers);
-  if (l > 0)
-    {
-      vec_add_aligned (f->aligned_buffers, f->unaligned_buffers, l,
-		       /* align */ sizeof (vlib_copy_unit_t));
-
-      _vec_len (f->unaligned_buffers) = 0;
-    }
-
-  /* Remove unaligned buffers from end of aligned vector and save for next trim. */
-  l = vec_len (f->aligned_buffers);
-  n_trim = l % BUFFERS_PER_COPY;
-  if (n_trim)
-    {
-      /* Trim aligned -> unaligned. */
-      vec_add (f->unaligned_buffers, f->aligned_buffers + l - n_trim, n_trim);
-
-      /* Remove from aligned. */
-      _vec_len (f->aligned_buffers) = l - n_trim;
-    }
-}
-
 void
 vlib_buffer_merge_free_lists (vlib_buffer_free_list_t * dst,
 			      vlib_buffer_free_list_t * src)
@@ -368,23 +311,12 @@ vlib_buffer_merge_free_lists (vlib_buffer_free_list_t * dst,
   uword l;
   u32 *d;
 
-  vlib_buffer_free_list_trim_aligned (src);
-  vlib_buffer_free_list_trim_aligned (dst);
-
-  l = vec_len (src->aligned_buffers);
-  if (l > 0)
-    {
-      vec_add2_aligned (dst->aligned_buffers, d, l,
-			/* align */ sizeof (vlib_copy_unit_t));
-      clib_memcpy (d, src->aligned_buffers, l * sizeof (d[0]));
-      vec_free (src->aligned_buffers);
-    }
-
-  l = vec_len (src->unaligned_buffers);
+  l = vec_len (src->buffers);
   if (l > 0)
     {
-      vec_add (dst->unaligned_buffers, src->unaligned_buffers, l);
-      vec_free (src->unaligned_buffers);
+      vec_add2_aligned (dst->buffers, d, l, CLIB_CACHE_LINE_BYTES);
+      clib_memcpy (d, src->buffers, l * sizeof (d[0]));
+      vec_free (src->buffers);
     }
 }
 
@@ -447,8 +379,7 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm,
       ASSERT (f - bm->buffer_free_list_pool ==
 	      wf - wbm->buffer_free_list_pool);
       wf[0] = f[0];
-      wf->aligned_buffers = 0;
-      wf->unaligned_buffers = 0;
+      wf->buffers = 0;
       wf->n_alloc = 0;
     }
 
@@ -505,8 +436,7 @@ del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f)
     vm->os_physmem_free (f->buffer_memory_allocated[i]);
   vec_free (f->name);
   vec_free (f->buffer_memory_allocated);
-  vec_free (f->unaligned_buffers);
-  vec_free (f->aligned_buffers);
+  vec_free (f->buffers);
 }
 
 /* Add buffer free list. */
@@ -522,8 +452,7 @@ vlib_buffer_delete_free_list_internal (vlib_main_t * vm, u32 free_list_index)
 
   f = vlib_buffer_get_free_list (vm, free_list_index);
 
-  ASSERT (vec_len (f->unaligned_buffers) + vec_len (f->aligned_buffers) ==
-	  f->n_alloc);
+  ASSERT (vec_len (f->buffers) == f->n_alloc);
   merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes);
   if (merge_index != ~0 && merge_index != free_list_index)
     {
@@ -558,15 +487,13 @@ fill_free_list (vlib_main_t * vm,
   u32 *bi;
   u32 n_remaining, n_alloc, n_this_chunk;
 
-  vlib_buffer_free_list_trim_aligned (fl);
-
   /* Already have enough free buffers on free list? */
-  n = min_free_buffers - vec_len (fl->aligned_buffers);
+  n = min_free_buffers - vec_len (fl->buffers);
   if (n <= 0)
     return min_free_buffers;
 
   /* Always allocate round number of buffers. */
-  n = round_pow2 (n, BUFFERS_PER_COPY);
+  n = round_pow2 (n, CLIB_CACHE_LINE_BYTES / sizeof (u32));
 
   /* Always allocate new buffers in reasonably large sized chunks. */
   n = clib_max (n, fl->min_n_buffers_each_physmem_alloc);
@@ -594,8 +521,7 @@ fill_free_list (vlib_main_t * vm,
       n_remaining -= n_this_chunk;
 
       b = buffers;
-      vec_add2_aligned (fl->aligned_buffers, bi, n_this_chunk,
-			sizeof (vlib_copy_unit_t));
+      vec_add2_aligned (fl->buffers, bi, n_this_chunk, CLIB_CACHE_LINE_BYTES);
       for (i = 0; i < n_this_chunk; i++)
 	{
 	  bi[i] = vlib_get_buffer_index (vm, b);
@@ -621,121 +547,28 @@ fill_free_list (vlib_main_t * vm,
   return n_alloc;
 }
 
-always_inline uword
-copy_alignment (u32 * x)
-{
-  return (pointer_to_uword (x) / sizeof (x[0])) % BUFFERS_PER_COPY;
-}
-
-
 static u32
 alloc_from_free_list (vlib_main_t * vm,
 		      vlib_buffer_free_list_t * free_list,
 		      u32 * alloc_buffers, u32 n_alloc_buffers)
 {
-  u32 *dst, *u_src;
-  uword u_len, n_left;
-  uword n_unaligned_start, n_unaligned_end, n_filled;
+  u32 *dst, *src;
+  uword len;
+  uword n_filled;
 
-  n_left = n_alloc_buffers;
   dst = alloc_buffers;
-  n_unaligned_start = ((BUFFERS_PER_COPY - copy_alignment (dst))
-		       & (BUFFERS_PER_COPY - 1));
 
   n_filled = fill_free_list (vm, free_list, n_alloc_buffers);
   if (n_filled == 0)
     return 0;
 
-  n_left = n_filled < n_left ? n_filled : n_left;
-  n_alloc_buffers = n_left;
-
-  if (n_unaligned_start >= n_left)
-    {
-      n_unaligned_start = n_left;
-      n_unaligned_end = 0;
-    }
-  else
-    n_unaligned_end = copy_alignment (dst + n_alloc_buffers);
-
-  vlib_buffer_free_list_fill_unaligned (vm, free_list,
-					n_unaligned_start + n_unaligned_end);
-
-  u_len = vec_len (free_list->unaligned_buffers);
-  u_src = free_list->unaligned_buffers + u_len - 1;
+  len = vec_len (free_list->buffers);
+  ASSERT (len >= n_alloc_buffers);
 
-  if (n_unaligned_start)
-    {
-      uword n_copy = n_unaligned_start;
-      if (n_copy > n_left)
-	n_copy = n_left;
-      n_left -= n_copy;
-
-      while (n_copy > 0)
-	{
-	  *dst++ = *u_src--;
-	  n_copy--;
-	  u_len--;
-	}
-
-      /* Now dst should be aligned. */
-      if (n_left > 0)
-	ASSERT (pointer_to_uword (dst) % sizeof (vlib_copy_unit_t) == 0);
-    }
-
-  /* Aligned copy. */
-  {
-    vlib_copy_unit_t *d, *s;
-    uword n_copy;
-
-    if (vec_len (free_list->aligned_buffers) <
-	((n_left / BUFFERS_PER_COPY) * BUFFERS_PER_COPY))
-      abort ();
-
-    n_copy = n_left / BUFFERS_PER_COPY;
-    n_left = n_left % BUFFERS_PER_COPY;
-
-    /* Remove buffers from aligned free list. */
-    _vec_len (free_list->aligned_buffers) -= n_copy * BUFFERS_PER_COPY;
-
-    s = (vlib_copy_unit_t *) vec_end (free_list->aligned_buffers);
-    d = (vlib_copy_unit_t *) dst;
-
-    /* Fast path loop. */
-    while (n_copy >= 4)
-      {
-	d[0] = s[0];
-	d[1] = s[1];
-	d[2] = s[2];
-	d[3] = s[3];
-	n_copy -= 4;
-	s += 4;
-	d += 4;
-      }
-
-    while (n_copy >= 1)
-      {
-	d[0] = s[0];
-	n_copy -= 1;
-	s += 1;
-	d += 1;
-      }
-
-    dst = (void *) d;
-  }
-
-  /* Unaligned copy. */
-  ASSERT (n_unaligned_end == n_left);
-  while (n_left > 0)
-    {
-      *dst++ = *u_src--;
-      n_left--;
-      u_len--;
-    }
+  src = free_list->buffers + len - n_alloc_buffers;
+  clib_memcpy (dst, src, n_alloc_buffers * sizeof (u32));
 
-  if (!free_list->unaligned_buffers)
-    ASSERT (u_len == 0);
-  else
-    _vec_len (free_list->unaligned_buffers) = u_len;
+  _vec_len (free_list->buffers) -= n_alloc_buffers;
 
   /* Verify that buffers are known free. */
   vlib_buffer_validate_alloc_free (vm, alloc_buffers,
@@ -831,8 +664,7 @@ again:
   vlib_buffer_validate_alloc_free (vm, b,
 				   n_left, VLIB_BUFFER_KNOWN_ALLOCATED);
 
-  vec_add2_aligned (fl->aligned_buffers, f, n_left,
-		    /* align */ sizeof (vlib_copy_unit_t));
+  vec_add2_aligned (fl->buffers, f, n_left, CLIB_CACHE_LINE_BYTES);
 
   n = next_to_free[i_next_to_free];
   while (n_left >= 4)
@@ -890,7 +722,7 @@ again:
       f -= 2;
       n -= free_next0 + free_next1;
 
-      _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers;
+      _vec_len (fl->buffers) = f - fl->buffers;
 
       fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0);
       fl1 = pool_elt_at_index (bm->buffer_free_list_pool, fi1);
@@ -924,8 +756,7 @@ again:
 	  fl = pool_elt_at_index (bm->buffer_free_list_pool, fi);
 	}
 
-      vec_add2_aligned (fl->aligned_buffers, f, n_left,
-			/* align */ sizeof (vlib_copy_unit_t));
+      vec_add2_aligned (fl->buffers, f, n_left, CLIB_CACHE_LINE_BYTES);
     }
 
   while (n_left >= 1)
@@ -968,7 +799,7 @@ again:
       f -= 1;
       n -= free_next0;
 
-      _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers;
+      _vec_len (fl->buffers) = f - fl->buffers;
 
       fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0);
 
@@ -986,8 +817,7 @@ again:
       fi = fi0;
       fl = pool_elt_at_index (bm->buffer_free_list_pool, fi);
 
-      vec_add2_aligned (fl->aligned_buffers, f, n_left,
-			/* align */ sizeof (vlib_copy_unit_t));
+      vec_add2_aligned (fl->buffers, f, n_left, CLIB_CACHE_LINE_BYTES);
     }
 
   if (follow_buffer_next && ((n_left = n - next_to_free[i_next_to_free]) > 0))
@@ -997,7 +827,7 @@ again:
       goto again;
     }
 
-  _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers;
+  _vec_len (fl->buffers) = f - fl->buffers;
 
   if (vec_len (announce_list))
     {
@@ -1239,7 +1069,7 @@ format_vlib_buffer_free_list (u8 * s, va_list * va)
 		   "#Alloc", "#Free");
 
   size = sizeof (vlib_buffer_t) + f->n_data_bytes;
-  n_free = vec_len (f->aligned_buffers) + vec_len (f->unaligned_buffers);
+  n_free = vec_len (f->buffers);
   bytes_alloc = size * f->n_alloc;
   bytes_free = size * n_free;
 
diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h
index d270c08a..fffb50c8 100644
--- a/src/vlib/buffer.h
+++ b/src/vlib/buffer.h
@@ -50,12 +50,6 @@
 #define VLIB_BUFFER_DATA_SIZE		(2048)
 #define VLIB_BUFFER_PRE_DATA_SIZE	__PRE_DATA_SIZE
 
-#if defined (CLIB_HAVE_VEC128) || defined (__aarch64__)
-typedef u8x16 vlib_copy_unit_t;
-#else
-typedef u64 vlib_copy_unit_t;
-#endif
-
 /** \file
     vlib buffer structure definition and a few select
     access methods. This structure and the buffer allocation
@@ -262,11 +256,8 @@ typedef struct vlib_buffer_free_list_t
   /* Total number of buffers allocated from this free list. */
   u32 n_alloc;
 
-  /* Vector of free buffers.  Each element is a byte offset into I/O heap.
-     Aligned vectors always has naturally aligned vlib_copy_unit_t sized chunks
-     of buffer indices.  Unaligned vector has any left over.  This is meant to
-     speed up copy routines. */
-  u32 *aligned_buffers, *unaligned_buffers;
+  /* Vector of free buffers.  Each element is a byte offset into I/O heap. */
+  u32 *buffers;
 
   /* Memory chunks allocated for this free list
      recorded here so they can be freed when free list
diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h
index 543a903c..fd051de5 100644
--- a/src/vlib/buffer_funcs.h
+++ b/src/vlib/buffer_funcs.h
@@ -350,10 +350,6 @@ vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index)
 u32 vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes,
 					 char *fmt, ...);
 
-
-/* After free aligned buffers may not contain even sized chunks. */
-void vlib_buffer_free_list_trim_aligned (vlib_buffer_free_list_t * f);
-
 /* Merge two free lists */
 void vlib_buffer_merge_free_lists (vlib_buffer_free_list_t * dst,
 				   vlib_buffer_free_list_t * src);
@@ -664,23 +660,14 @@ unserialize_vlib_buffer_n_bytes (serialize_main_t * m)
   return n;
 }
 
-typedef union
-{
-  vlib_buffer_t b;
-  vlib_copy_unit_t i[sizeof (vlib_buffer_t) / sizeof (vlib_copy_unit_t)];
-}
-vlib_buffer_union_t;
-
 /* Set a buffer quickly into "uninitialized" state.  We want this to
    be extremely cheap and arrange for all fields that need to be
    initialized to be in the first 128 bits of the buffer. */
 always_inline void
-vlib_buffer_init_for_free_list (vlib_buffer_t * _dst,
+vlib_buffer_init_for_free_list (vlib_buffer_t * dst,
 				vlib_buffer_free_list_t * fl)
 {
-  vlib_buffer_union_t *dst = (vlib_buffer_union_t *) _dst;
-  vlib_buffer_union_t *src =
-    (vlib_buffer_union_t *) & fl->buffer_init_template;
+  vlib_buffer_t *src = &fl->buffer_init_template;
 
   /* Make sure vlib_buffer_t is cacheline aligned and sized */
   ASSERT (STRUCT_OFFSET_OF (vlib_buffer_t, cacheline0) == 0);
@@ -692,21 +679,14 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * _dst,
   /* Make sure buffer template is sane. */
   ASSERT (fl->index == fl->buffer_init_template.free_list_index);
 
-  /* Copy template from src->current_data thru src->free_list_index */
-  dst->i[0] = src->i[0];
-  if (1 * sizeof (dst->i[0]) < 16)
-    dst->i[1] = src->i[1];
-  if (2 * sizeof (dst->i[0]) < 16)
-    dst->i[2] = src->i[2];
-
   /* Make sure it really worked. */
-#define _(f) ASSERT (dst->b.f == src->b.f)
+#define _(f) dst->f = src->f
   _(current_data);
   _(current_length);
   _(flags);
   _(free_list_index);
 #undef _
-  ASSERT (dst->b.total_length_not_including_first_buffer == 0);
+  ASSERT (dst->total_length_not_including_first_buffer == 0);
 }
 
 always_inline void
@@ -718,39 +698,28 @@ vlib_buffer_add_to_free_list (vlib_main_t * vm,
   b = vlib_get_buffer (vm, buffer_index);
   if (PREDICT_TRUE (do_init))
     vlib_buffer_init_for_free_list (b, f);
-  vec_add1_aligned (f->aligned_buffers, buffer_index,
-		    sizeof (vlib_copy_unit_t));
+  vec_add1_aligned (f->buffers, buffer_index, CLIB_CACHE_LINE_BYTES);
 }
 
 always_inline void
-vlib_buffer_init_two_for_free_list (vlib_buffer_t * _dst0,
-				    vlib_buffer_t * _dst1,
+vlib_buffer_init_two_for_free_list (vlib_buffer_t * dst0,
+				    vlib_buffer_t * dst1,
 				    vlib_buffer_free_list_t * fl)
 {
-  vlib_buffer_union_t *dst0 = (vlib_buffer_union_t *) _dst0;
-  vlib_buffer_union_t *dst1 = (vlib_buffer_union_t *) _dst1;
-  vlib_buffer_union_t *src =
-    (vlib_buffer_union_t *) & fl->buffer_init_template;
+  vlib_buffer_t *src = &fl->buffer_init_template;
 
   /* Make sure buffer template is sane. */
   ASSERT (fl->index == fl->buffer_init_template.free_list_index);
 
-  /* Copy template from src->current_data thru src->free_list_index */
-  dst0->i[0] = dst1->i[0] = src->i[0];
-  if (1 * sizeof (dst0->i[0]) < 16)
-    dst0->i[1] = dst1->i[1] = src->i[1];
-  if (2 * sizeof (dst0->i[0]) < 16)
-    dst0->i[2] = dst1->i[2] = src->i[2];
-
   /* Make sure it really worked. */
-#define _(f) ASSERT (dst0->b.f == src->b.f && dst1->b.f == src->b.f)
+#define _(f) dst0->f = src->f;  dst1->f = src->f
   _(current_data);
   _(current_length);
   _(flags);
   _(free_list_index);
 #undef _
-  ASSERT (dst0->b.total_length_not_including_first_buffer == 0);
-  ASSERT (dst1->b.total_length_not_including_first_buffer == 0);
+  ASSERT (dst0->total_length_not_including_first_buffer == 0);
+  ASSERT (dst1->total_length_not_including_first_buffer == 0);
 }
 
 #if CLIB_DEBUG > 0
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index b3bbd30e..e3ea3c9c 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -708,8 +708,7 @@ start_workers (vlib_main_t * vm)
                                     == fl_clone - bm_clone->buffer_free_list_pool);
 
                             fl_clone[0] = fl_orig[0];
-                            fl_clone->aligned_buffers = 0;
-                            fl_clone->unaligned_buffers = 0;
+                            fl_clone->buffers = 0;
                             fl_clone->n_alloc = 0;
                           }));
 /* *INDENT-ON* */
diff --git a/src/vnet/devices/dpdk/buffer.c b/src/vnet/devices/dpdk/buffer.c
index 038f46d9..43ceb91e 100644
--- a/src/vnet/devices/dpdk/buffer.c
+++ b/src/vnet/devices/dpdk/buffer.c
@@ -79,8 +79,6 @@
 STATIC_ASSERT (VLIB_BUFFER_PRE_DATA_SIZE == RTE_PKTMBUF_HEADROOM,
 	       "VLIB_BUFFER_PRE_DATA_SIZE must be equal to RTE_PKTMBUF_HEADROOM");
 
-#define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32))
-
 static void
 del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f)
 {
@@ -88,23 +86,15 @@ del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f)
   struct rte_mbuf *mb;
   vlib_buffer_t *b;
 
-  for (i = 0; i < vec_len (f->unaligned_buffers); i++)
+  for (i = 0; i < vec_len (f->buffers); i++)
     {
-      b = vlib_get_buffer (vm, f->unaligned_buffers[i]);
-      mb = rte_mbuf_from_vlib_buffer (b);
-      ASSERT (rte_mbuf_refcnt_read (mb) == 1);
-      rte_pktmbuf_free (mb);
-    }
-  for (i = 0; i < vec_len (f->aligned_buffers); i++)
-    {
-      b = vlib_get_buffer (vm, f->aligned_buffers[i]);
+      b = vlib_get_buffer (vm, f->buffers[i]);
       mb = rte_mbuf_from_vlib_buffer (b);
       ASSERT (rte_mbuf_refcnt_read (mb) == 1);
       rte_pktmbuf_free (mb);
     }
   vec_free (f->name);
-  vec_free (f->unaligned_buffers);
-  vec_free (f->aligned_buffers);
+  vec_free (f->buffers);
 }
 
 /* Add buffer free list. */
@@ -162,15 +152,13 @@ fill_free_list (vlib_main_t * vm,
   if (PREDICT_FALSE (rmp == 0))
     return 0;
 
-  vlib_buffer_free_list_trim_aligned (fl);
-
   /* Already have enough free buffers on free list? */
-  n = min_free_buffers - vec_len (fl->aligned_buffers);
+  n = min_free_buffers - vec_len (fl->buffers);
   if (n <= 0)
     return min_free_buffers;
 
   /* Always allocate round number of buffers. */
-  n = round_pow2 (n, BUFFERS_PER_COPY);
+  n = round_pow2 (n, CLIB_CACHE_LINE_BYTES / sizeof (u32));
 
   /* Always allocate new buffers in reasonably large sized chunks. */
   n = clib_max (n, fl->min_n_buffers_each_physmem_alloc);
@@ -192,7 +180,7 @@ fill_free_list (vlib_main_t * vm,
       b = vlib_buffer_from_rte_mbuf (mb);
       bi = vlib_get_buffer_index (vm, b);
 
-      vec_add1_aligned (fl->aligned_buffers, bi, sizeof (vlib_copy_unit_t));
+      vec_add1_aligned (fl->buffers, bi, CLIB_CACHE_LINE_BYTES);
       n_alloc++;
       n_remaining--;
 
@@ -207,120 +195,27 @@ fill_free_list (vlib_main_t * vm,
   return n;
 }
 
-always_inline uword
-copy_alignment (u32 * x)
-{
-  return (pointer_to_uword (x) / sizeof (x[0])) % BUFFERS_PER_COPY;
-}
-
 static u32
 alloc_from_free_list (vlib_main_t * vm,
 		      vlib_buffer_free_list_t * free_list,
 		      u32 * alloc_buffers, u32 n_alloc_buffers)
 {
-  u32 *dst, *u_src;
-  uword u_len, n_left;
-  uword n_unaligned_start, n_unaligned_end, n_filled;
+  u32 *dst, *src;
+  uword len, n_filled;
 
-  n_left = n_alloc_buffers;
   dst = alloc_buffers;
-  n_unaligned_start = ((BUFFERS_PER_COPY - copy_alignment (dst))
-		       & (BUFFERS_PER_COPY - 1));
 
   n_filled = fill_free_list (vm, free_list, n_alloc_buffers);
   if (n_filled == 0)
     return 0;
 
-  n_left = n_filled < n_left ? n_filled : n_left;
-  n_alloc_buffers = n_left;
-
-  if (n_unaligned_start >= n_left)
-    {
-      n_unaligned_start = n_left;
-      n_unaligned_end = 0;
-    }
-  else
-    n_unaligned_end = copy_alignment (dst + n_alloc_buffers);
-
-  vlib_buffer_free_list_fill_unaligned (vm, free_list,
-					n_unaligned_start + n_unaligned_end);
-
-  u_len = vec_len (free_list->unaligned_buffers);
-  u_src = free_list->unaligned_buffers + u_len - 1;
-
-  if (n_unaligned_start)
-    {
-      uword n_copy = n_unaligned_start;
-      if (n_copy > n_left)
-	n_copy = n_left;
-      n_left -= n_copy;
-
-      while (n_copy > 0)
-	{
-	  *dst++ = *u_src--;
-	  n_copy--;
-	  u_len--;
-	}
-
-      /* Now dst should be aligned. */
-      if (n_left > 0)
-	ASSERT (pointer_to_uword (dst) % sizeof (vlib_copy_unit_t) == 0);
-    }
-
-  /* Aligned copy. */
-  {
-    vlib_copy_unit_t *d, *s;
-    uword n_copy;
-
-    if (vec_len (free_list->aligned_buffers) <
-	((n_left / BUFFERS_PER_COPY) * BUFFERS_PER_COPY))
-      abort ();
-
-    n_copy = n_left / BUFFERS_PER_COPY;
-    n_left = n_left % BUFFERS_PER_COPY;
-
-    /* Remove buffers from aligned free list. */
-    _vec_len (free_list->aligned_buffers) -= n_copy * BUFFERS_PER_COPY;
-
-    s = (vlib_copy_unit_t *) vec_end (free_list->aligned_buffers);
-    d = (vlib_copy_unit_t *) dst;
+  len = vec_len (free_list->buffers);
+  ASSERT (len >= n_alloc_buffers);
 
-    /* Fast path loop. */
-    while (n_copy >= 4)
-      {
-	d[0] = s[0];
-	d[1] = s[1];
-	d[2] = s[2];
-	d[3] = s[3];
-	n_copy -= 4;
-	s += 4;
-	d += 4;
-      }
-
-    while (n_copy >= 1)
-      {
-	d[0] = s[0];
-	n_copy -= 1;
-	s += 1;
-	d += 1;
-      }
-
-    dst = (void *) d;
-  }
-
-  /* Unaligned copy. */
-  ASSERT (n_unaligned_end == n_left);
-  while (n_left > 0)
-    {
-      *dst++ = *u_src--;
-      n_left--;
-      u_len--;
-    }
+  src = free_list->buffers + len - n_alloc_buffers;
+  clib_memcpy (dst, src, n_alloc_buffers * sizeof (u32));
 
-  if (!free_list->unaligned_buffers)
-    ASSERT (u_len == 0);
-  else
-    _vec_len (free_list->unaligned_buffers) = u_len;
+  _vec_len (free_list->buffers) -= n_alloc_buffers;
 
   return n_alloc_buffers;
 }
diff --git a/src/vnet/replication.c b/src/vnet/replication.c
index 561c86cd..02755195 100644
--- a/src/vnet/replication.c
+++ b/src/vnet/replication.c
@@ -168,32 +168,20 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl)
    * Note: this could be sped up if the node index were stuffed into
    * the freelist itself.
    */
-  if (vec_len (fl->aligned_buffers) > 0)
+  if (vec_len (fl->buffers) > 0)
     {
-      bi0 = fl->aligned_buffers[0];
-      b0 = vlib_get_buffer (vm, bi0);
-      ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->recycle_count);
-      feature_node_index = ctx->recycle_node_index;
-    }
-  else if (vec_len (fl->unaligned_buffers) > 0)
-    {
-      bi0 = fl->unaligned_buffers[0];
+      bi0 = fl->buffers[0];
       b0 = vlib_get_buffer (vm, bi0);
       ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->recycle_count);
       feature_node_index = ctx->recycle_node_index;
     }
 
-  /* aligned, unaligned buffers */
+  /* buffers */
   for (i = 0; i < 2; i++)
     {
       if (i == 0)
 	{
-	  from = fl->aligned_buffers;
-	  n_left_from = vec_len (from);
-	}
-      else
-	{
-	  from = fl->unaligned_buffers;
+	  from = fl->buffers;
 	  n_left_from = vec_len (from);
 	}
 
@@ -245,8 +233,7 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl)
 	}
     }
 
-  vec_reset_length (fl->aligned_buffers);
-  vec_reset_length (fl->unaligned_buffers);
+  vec_reset_length (fl->buffers);
 
   if (f)
     {
-- 
cgit 1.2.3-korg


From 80f54e20270ed0628ee725e3e3c515731a0188f2 Mon Sep 17 00:00:00 2001
From: Dave Barach <dave@barachs.net>
Date: Wed, 8 Mar 2017 19:08:56 -0500
Subject: vlib_mains == 0 special cases be gone

Clean up spurious binary API client link dependency on libvlib.so,
which managed to hide behind vlib_mains == 0 checks reached by
VLIB_xxx_FUNCTION macros.

Change-Id: I5df1f8ab07dca1944250e643ccf06e60a8462325
Signed-off-by: Dave Barach <dave@barachs.net>
---
 src/plugins/dpdk/ipsec/ipsec.c       |   8 +-
 src/vlib-api.am                      |   4 +-
 src/vlib/buffer.c                    |  27 +-
 src/vlib/global_funcs.h              |   2 +-
 src/vlib/node_cli.c                  |  28 +-
 src/vlib/node_funcs.h                |   4 +-
 src/vlib/threads.c                   |  16 +-
 src/vlib/threads.h                   |  43 ++-
 src/vlibapi/api.h                    |   4 +-
 src/vlibapi/api_shared.c             | 530 ++---------------------------------
 src/vlibapi/node_serialize.c         |  15 +-
 src/vlibmemory/memory_vlib.c         | 471 +++++++++++++++++++++++++++++++
 src/vnet/devices/virtio/vhost-user.c |   9 +-
 src/vpp-api-test.am                  |   2 -
 src/vpp/api/api.c                    |   1 -
 src/vpp/api/gmon.c                   |   9 +-
 16 files changed, 575 insertions(+), 598 deletions(-)

(limited to 'src/vlib/threads.c')

diff --git a/src/plugins/dpdk/ipsec/ipsec.c b/src/plugins/dpdk/ipsec/ipsec.c
index 16bec20a..b0aaaaec 100644
--- a/src/plugins/dpdk/ipsec/ipsec.c
+++ b/src/plugins/dpdk/ipsec/ipsec.c
@@ -380,13 +380,9 @@ dpdk_ipsec_process (vlib_main_t * vm, vlib_node_runtime_t * rt,
   im->cb.check_support_cb = dpdk_ipsec_check_support;
   im->cb.add_del_sa_sess_cb = add_del_sa_sess;
 
-  if (vec_len (vlib_mains) == 0)
-    vlib_node_set_state (&vlib_global_main, dpdk_crypto_input_node.index,
+  for (i = 1; i < tm->n_vlib_mains; i++)
+    vlib_node_set_state (vlib_mains[i], dpdk_crypto_input_node.index,
 			 VLIB_NODE_STATE_POLLING);
-  else
-    for (i = 1; i < tm->n_vlib_mains; i++)
-      vlib_node_set_state (vlib_mains[i], dpdk_crypto_input_node.index,
-			   VLIB_NODE_STATE_POLLING);
 
   /* TODO cryptodev counters */
 
diff --git a/src/vlib-api.am b/src/vlib-api.am
index c05929b1..4e1dae99 100644
--- a/src/vlib-api.am
+++ b/src/vlib-api.am
@@ -14,7 +14,7 @@
 lib_LTLIBRARIES += libvlibmemory.la libvlibapi.la libvlibmemoryclient.la \
 	           libvlibsocket.la
 
-libvlibmemory_la_DEPENDENCIES = libvppinfra.la libsvm.la libvlib.la
+libvlibmemory_la_DEPENDENCIES = libvppinfra.la libsvm.la 
 libvlibmemory_la_LIBADD = $(libvlibmemory_la_DEPENDENCIES) -lpthread
 libvlibmemory_la_SOURCES =			\
 	vlibmemory/api.h			\
@@ -26,7 +26,7 @@ libvlibmemory_la_SOURCES =			\
 	vlibmemory/unix_shared_memory_queue.c	\
 	vlibmemory/unix_shared_memory_queue.h
 
-libvlibapi_la_DEPENDENCIES = libvppinfra.la libvlib.la libvlibmemory.la
+libvlibapi_la_DEPENDENCIES = libvppinfra.la 
 libvlibapi_la_LIBADD = $(libvlibapi_la_DEPENDENCIES)
 libvlibapi_la_SOURCES = 			\
 	vlibapi/api.h				\
diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c
index 9f26bec7..6ba82584 100644
--- a/src/vlib/buffer.c
+++ b/src/vlib/buffer.c
@@ -261,7 +261,28 @@ done:
   return result;
 }
 
-vlib_main_t **vlib_mains;
+/*
+ * Hand-craft a static vector w/ length 1, so vec_len(vlib_mains) == 1
+ * and vlib_mains[0] == &vlib_global_main from the beginning of time.
+ *
+ * The only place which should ever expand vlib_mains is start_workers()
+ * in threads.c. It knows about the bootstrap vector.
+ */
+/* *INDENT-OFF* */
+static struct
+{
+  vec_header_t h;		/* vector header; its len is preset to 1 below */
+  vlib_main_t *vm;		/* the single element; vlib_mains points at this field */
+} __attribute__ ((packed)) __bootstrap_vlib_main_vector	/* NOTE(review): packed presumably keeps vm directly after h - confirm */
+  __attribute__ ((aligned (CLIB_CACHE_LINE_BYTES))) =
+{
+  .h.len = 1,
+  .vm = &vlib_global_main,
+};
+/* *INDENT-ON* */
+
+vlib_main_t **vlib_mains = &__bootstrap_vlib_main_vector.vm;	/* element 0 of the bootstrap vector */
+
 
 /* When dubugging validate that given buffers are either known allocated
    or known free. */
@@ -280,7 +301,7 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm,
   ASSERT (os_get_cpu_number () == 0);
 
   /* smp disaster check */
-  if (vlib_mains)
+  if (vec_len (vlib_mains) > 1)
     ASSERT (vm == vlib_mains[0]);
 
   is_free = expected_state == VLIB_BUFFER_KNOWN_ALLOCATED;
@@ -956,7 +977,7 @@ show_buffers (vlib_main_t * vm,
 
   do
     {
-      curr_vm = vec_len (vlib_mains) ? vlib_mains[vm_index] : vm;
+      curr_vm = vlib_mains[vm_index];
       bm = curr_vm->buffer_main;
 
     /* *INDENT-OFF* */
diff --git a/src/vlib/global_funcs.h b/src/vlib/global_funcs.h
index bbdbdef5..f51ec381 100644
--- a/src/vlib/global_funcs.h
+++ b/src/vlib/global_funcs.h
@@ -23,7 +23,7 @@ always_inline vlib_main_t *
 vlib_get_main (void)
 {
   vlib_main_t *vm;
-  vm = vlib_mains ? vlib_mains[os_get_cpu_number ()] : &vlib_global_main;
+  vm = vlib_mains[os_get_cpu_number ()];
   ASSERT (vm);
   return vm;
 }
diff --git a/src/vlib/node_cli.c b/src/vlib/node_cli.c
index 05d0f0b5..62ab2e64 100644
--- a/src/vlib/node_cli.c
+++ b/src/vlib/node_cli.c
@@ -248,16 +248,11 @@ show_node_runtime (vlib_main_t * vm,
       if (unformat (input, "max") || unformat (input, "m"))
 	max = 1;
 
-      if (vec_len (vlib_mains) == 0)
-	vec_add1 (stat_vms, vm);
-      else
+      for (i = 0; i < vec_len (vlib_mains); i++)
 	{
-	  for (i = 0; i < vec_len (vlib_mains); i++)
-	    {
-	      stat_vm = vlib_mains[i];
-	      if (stat_vm)
-		vec_add1 (stat_vms, stat_vm);
-	    }
+	  stat_vm = vlib_mains[i];
+	  if (stat_vm)
+	    vec_add1 (stat_vms, stat_vm);
 	}
 
       /*
@@ -331,7 +326,7 @@ show_node_runtime (vlib_main_t * vm,
 		}
 	    }
 
-	  if (vec_len (vlib_mains))
+	  if (vec_len (vlib_mains) > 1)
 	    {
 	      vlib_worker_thread_t *w = vlib_worker_threads + j;
 	      if (j > 0)
@@ -404,16 +399,11 @@ clear_node_runtime (vlib_main_t * vm,
   vlib_main_t **stat_vms = 0, *stat_vm;
   vlib_node_runtime_t *r;
 
-  if (vec_len (vlib_mains) == 0)
-    vec_add1 (stat_vms, vm);
-  else
+  for (i = 0; i < vec_len (vlib_mains); i++)
     {
-      for (i = 0; i < vec_len (vlib_mains); i++)
-	{
-	  stat_vm = vlib_mains[i];
-	  if (stat_vm)
-	    vec_add1 (stat_vms, stat_vm);
-	}
+      stat_vm = vlib_mains[i];
+      if (stat_vm)
+	vec_add1 (stat_vms, stat_vm);
     }
 
   vlib_worker_thread_barrier_sync (vm);
diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h
index f49a8d6f..8ccfc438 100644
--- a/src/vlib/node_funcs.h
+++ b/src/vlib/node_funcs.h
@@ -201,7 +201,7 @@ vlib_get_frame_no_check (vlib_main_t * vm, uword frame_index)
   vlib_frame_t *f;
   u32 cpu_index = frame_index & VLIB_CPU_MASK;
   u32 offset = frame_index & VLIB_OFFSET_MASK;
-  vm = vlib_mains ? vlib_mains[cpu_index] : vm;
+  vm = vlib_mains[cpu_index];
   f = vm->heap_base + offset;
   return f;
 }
@@ -213,7 +213,7 @@ vlib_frame_index_no_check (vlib_main_t * vm, vlib_frame_t * f)
 
   ASSERT (((uword) f & VLIB_CPU_MASK) == 0);
 
-  vm = vlib_mains ? vlib_mains[f->cpu_index] : vm;
+  vm = vlib_mains[f->cpu_index];
 
   i = ((u8 *) f - (u8 *) vm->heap_base);
   return i | f->cpu_index;
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index e3ea3c9c..4676be97 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -570,9 +570,13 @@ start_workers (vlib_main_t * vm)
 
   if (n_vlib_mains > 1)
     {
-      vec_validate (vlib_mains, tm->n_vlib_mains - 1);
+      /* Replace hand-crafted length-1 vector with a real vector */
+      vlib_mains = 0;
+
+      vec_validate_aligned (vlib_mains, tm->n_vlib_mains - 1,
+			    CLIB_CACHE_LINE_BYTES);
       _vec_len (vlib_mains) = 0;
-      vec_add1 (vlib_mains, vm);
+      vec_add1_aligned (vlib_mains, vm, CLIB_CACHE_LINE_BYTES);
 
       vlib_worker_threads->wait_at_barrier =
 	clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES);
@@ -685,7 +689,7 @@ start_workers (vlib_main_t * vm)
 	      /* Packet trace buffers are guaranteed to be empty, nothing to do here */
 
 	      clib_mem_set_heap (oldheap);
-	      vec_add1 (vlib_mains, vm_clone);
+	      vec_add1_aligned (vlib_mains, vm_clone, CLIB_CACHE_LINE_BYTES);
 
 	      vm_clone->error_main.counters =
 		vec_dup (vlib_mains[0]->error_main.counters);
@@ -805,7 +809,7 @@ vlib_worker_thread_node_runtime_update (void)
 
   ASSERT (os_get_cpu_number () == 0);
 
-  if (vec_len (vlib_mains) == 0)
+  if (vec_len (vlib_mains) == 1)
     return;
 
   vm = vlib_mains[0];
@@ -1148,7 +1152,7 @@ vlib_worker_thread_barrier_sync (vlib_main_t * vm)
   f64 deadline;
   u32 count;
 
-  if (!vlib_mains)
+  if (vec_len (vlib_mains) < 2)
     return;
 
   count = vec_len (vlib_mains) - 1;
@@ -1179,7 +1183,7 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm)
 {
   f64 deadline;
 
-  if (!vlib_mains)
+  if (vec_len (vlib_mains) < 2)
     return;
 
   if (--vlib_worker_threads[0].recursion_level > 0)
diff --git a/src/vlib/threads.h b/src/vlib/threads.h
index 75a5a281..a032311c 100644
--- a/src/vlib/threads.h
+++ b/src/vlib/threads.h
@@ -222,30 +222,25 @@ vlib_worker_thread_barrier_check (void)
     }
 }
 
-#define foreach_vlib_main(body)			                        \
-do {                                                                    \
-    vlib_main_t ** __vlib_mains = 0, *this_vlib_main;                   \
-    int ii;                                                             \
-                                                                        \
-    if (vec_len (vlib_mains) == 0)                                      \
-        vec_add1 (__vlib_mains, &vlib_global_main);                     \
-    else                                                                \
-    {                                                                   \
-        for (ii = 0; ii < vec_len (vlib_mains); ii++)                   \
-        {                                                               \
-            this_vlib_main = vlib_mains[ii];                            \
-            if (this_vlib_main)                                         \
-                vec_add1 (__vlib_mains, this_vlib_main);                \
-        }                                                               \
-    }                                                                   \
-                                                                        \
-    for (ii = 0; ii < vec_len (__vlib_mains); ii++)                     \
-    {                                                                   \
-        this_vlib_main = __vlib_mains[ii];                              \
-        /* body uses this_vlib_main... */                               \
-        (body);                                                         \
-    }                                                                   \
-    vec_free (__vlib_mains);                                            \
+#define foreach_vlib_main(body)                         \
+do {                                                    \
+  vlib_main_t ** __vlib_mains = 0, *this_vlib_main;     \
+  int ii;                                               \
+  /* Snapshot non-NULL mains before running body. */    \
+  for (ii = 0; ii < vec_len (vlib_mains); ii++)         \
+    {                                                   \
+      this_vlib_main = vlib_mains[ii];                  \
+      if (this_vlib_main)                               \
+        vec_add1 (__vlib_mains, this_vlib_main);        \
+    }                                                   \
+  /* Evaluate body once per snapshotted main. */        \
+  for (ii = 0; ii < vec_len (__vlib_mains); ii++)       \
+    {                                                   \
+      this_vlib_main = __vlib_mains[ii];                \
+      /* body uses this_vlib_main... */                 \
+      (body);                                           \
+    }                                                   \
+  vec_free (__vlib_mains);                              \
 } while (0);
 
 #define foreach_sched_policy \
diff --git a/src/vlibapi/api.h b/src/vlibapi/api.h
index 2cbeb63c..87a56121 100644
--- a/src/vlibapi/api.h
+++ b/src/vlibapi/api.h
@@ -252,11 +252,13 @@ void vl_msg_api_queue_handler (unix_shared_memory_queue_t * q);
 vl_api_trace_t *vl_msg_api_trace_get (api_main_t * am,
 				      vl_api_trace_which_t which);
 
+void vl_msg_api_barrier_sync (void) __attribute__ ((weak));
+void vl_msg_api_barrier_release (void) __attribute__ ((weak));
 void vl_msg_api_free (void *);
 void vl_noop_handler (void *mp);
-clib_error_t *vl_api_init (vlib_main_t * vm);
 void vl_msg_api_increment_missing_client_counter (void);
 void vl_msg_api_post_mortem_dump (void);
+void vl_msg_api_post_mortem_dump_enable_disable (int enable);
 void vl_msg_api_register_pd_handler (void *handler,
 				     u16 msg_id_host_byte_order);
 int vl_msg_api_pd_handler (void *mp, int rv);
diff --git a/src/vlibapi/api_shared.c b/src/vlibapi/api_shared.c
index 69ba10c1..6774e3dd 100644
--- a/src/vlibapi/api_shared.c
+++ b/src/vlibapi/api_shared.c
@@ -23,11 +23,6 @@
 #include <stdlib.h>
 #include <stddef.h>
 #include <string.h>
-#include <sys/types.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
 #include <vppinfra/format.h>
 #include <vppinfra/byte_order.h>
 #include <vppinfra/error.h>
@@ -36,19 +31,14 @@
 #include <vlibapi/api.h>
 #include <vppinfra/elog.h>
 
-api_main_t api_main;
-
-void vl_msg_api_barrier_sync (void) __attribute__ ((weak));
-void
-vl_msg_api_barrier_sync (void)
-{
-}
-
-void vl_msg_api_barrier_release (void) __attribute__ ((weak));
-void
-vl_msg_api_barrier_release (void)
-{
-}
+/* *INDENT-OFF* */
+api_main_t api_main =
+  {
+    .region_name = "/unset",
+    .api_uid = -1,	/* eventually passed to fchown: -1, not 0 => "root" */
+    .api_gid = -1,	/* likewise for the group id */
+  };
+/* *INDENT-ON* */
 
 void
 vl_msg_api_increment_missing_client_counter (void)
@@ -57,14 +47,6 @@ vl_msg_api_increment_missing_client_counter (void)
   am->missing_clients++;
 }
 
-typedef enum
-{
-  DUMP,
-  CUSTOM_DUMP,
-  REPLAY,
-  INITIALIZERS,
-} vl_api_replay_t;
-
 int
 vl_msg_api_rx_trace_enabled (api_main_t * am)
 {
@@ -397,6 +379,16 @@ vl_msg_api_trace_configure (api_main_t * am, vl_api_trace_which_t which,
   return 0;
 }
 
+void
+vl_msg_api_barrier_sync (void)
+{				/* no-op default; declared weak in api.h */
+}
+
+void
+vl_msg_api_barrier_release (void)
+{				/* no-op default; declared weak in api.h */
+}
+
 always_inline void
 msg_handler_internal (api_main_t * am,
 		      void *the_msg, int trace_it, int do_it, int free_it)
@@ -748,495 +740,15 @@ vl_noop_handler (void *mp)
 {
 }
 
-clib_error_t *
-vl_api_init (vlib_main_t * vm)
-{
-  static u8 once;
-  api_main_t *am = &api_main;
-
-  if (once)
-    return 0;
-
-  once = 1;
-
-  am->region_name = "/unset";
-  /*
-   * Eventually passed to fchown, -1 => "current user"
-   * instead of 0 => "root". A very fine disctinction at best.
-   */
-  if (am->api_uid == 0)
-    am->api_uid = -1;
-  if (am->api_gid == 0)
-    am->api_gid = -1;
-
-  return (0);
-}
-
-void vl_msg_api_custom_dump_configure (api_main_t * am)
-  __attribute__ ((weak));
-void
-vl_msg_api_custom_dump_configure (api_main_t * am)
-{
-}
-
-VLIB_INIT_FUNCTION (vl_api_init);
-
-static void
-vl_msg_api_process_file (vlib_main_t * vm, u8 * filename,
-			 u32 first_index, u32 last_index,
-			 vl_api_replay_t which)
-{
-  vl_api_trace_file_header_t *hp;
-  int i, fd;
-  struct stat statb;
-  size_t file_size;
-  u8 *msg;
-  u8 endian_swap_needed = 0;
-  api_main_t *am = &api_main;
-  u8 *tmpbuf = 0;
-  u32 nitems;
-  void **saved_print_handlers = 0;
-
-  fd = open ((char *) filename, O_RDONLY);
-
-  if (fd < 0)
-    {
-      vlib_cli_output (vm, "Couldn't open %s\n", filename);
-      return;
-    }
-
-  if (fstat (fd, &statb) < 0)
-    {
-      vlib_cli_output (vm, "Couldn't stat %s\n", filename);
-      close (fd);
-      return;
-    }
-
-  if (!(statb.st_mode & S_IFREG) || (statb.st_size < sizeof (*hp)))
-    {
-      vlib_cli_output (vm, "File not plausible: %s\n", filename);
-      close (fd);
-      return;
-    }
-
-  file_size = statb.st_size;
-  file_size = (file_size + 4095) & ~(4096);
-
-  hp = mmap (0, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
-
-  if (hp == (vl_api_trace_file_header_t *) MAP_FAILED)
-    {
-      vlib_cli_output (vm, "mmap failed: %s\n", filename);
-      close (fd);
-      return;
-    }
-  close (fd);
-
-  if ((clib_arch_is_little_endian && hp->endian == VL_API_BIG_ENDIAN)
-      || (clib_arch_is_big_endian && hp->endian == VL_API_LITTLE_ENDIAN))
-    endian_swap_needed = 1;
-
-  if (endian_swap_needed)
-    nitems = ntohl (hp->nitems);
-  else
-    nitems = hp->nitems;
-
-  if (last_index == (u32) ~ 0)
-    {
-      last_index = nitems - 1;
-    }
-
-  if (first_index >= nitems || last_index >= nitems)
-    {
-      vlib_cli_output (vm, "Range (%d, %d) outside file range (0, %d)\n",
-		       first_index, last_index, nitems - 1);
-      munmap (hp, file_size);
-      return;
-    }
-  if (hp->wrapped)
-    vlib_cli_output (vm,
-		     "Note: wrapped/incomplete trace, results may vary\n");
-
-  if (which == CUSTOM_DUMP)
-    {
-      saved_print_handlers = (void **) vec_dup (am->msg_print_handlers);
-      vl_msg_api_custom_dump_configure (am);
-    }
-
-
-  msg = (u8 *) (hp + 1);
-
-  for (i = 0; i < first_index; i++)
-    {
-      trace_cfg_t *cfgp;
-      int size;
-      u16 msg_id;
-
-      size = clib_host_to_net_u32 (*(u32 *) msg);
-      msg += sizeof (u32);
-
-      if (clib_arch_is_little_endian)
-	msg_id = ntohs (*((u16 *) msg));
-      else
-	msg_id = *((u16 *) msg);
-
-      cfgp = am->api_trace_cfg + msg_id;
-      if (!cfgp)
-	{
-	  vlib_cli_output (vm, "Ugh: msg id %d no trace config\n", msg_id);
-	  munmap (hp, file_size);
-	  return;
-	}
-      msg += size;
-    }
-
-  if (which == REPLAY)
-    am->replay_in_progress = 1;
-
-  for (; i <= last_index; i++)
-    {
-      trace_cfg_t *cfgp;
-      u16 *msg_idp;
-      u16 msg_id;
-      int size;
-
-      if (which == DUMP)
-	vlib_cli_output (vm, "---------- trace %d -----------\n", i);
-
-      size = clib_host_to_net_u32 (*(u32 *) msg);
-      msg += sizeof (u32);
-
-      if (clib_arch_is_little_endian)
-	msg_id = ntohs (*((u16 *) msg));
-      else
-	msg_id = *((u16 *) msg);
-
-      cfgp = am->api_trace_cfg + msg_id;
-      if (!cfgp)
-	{
-	  vlib_cli_output (vm, "Ugh: msg id %d no trace config\n", msg_id);
-	  munmap (hp, file_size);
-	  vec_free (tmpbuf);
-	  am->replay_in_progress = 0;
-	  return;
-	}
-
-      /* Copy the buffer (from the read-only mmap'ed file) */
-      vec_validate (tmpbuf, size - 1 + sizeof (uword));
-      clib_memcpy (tmpbuf + sizeof (uword), msg, size);
-      memset (tmpbuf, 0xf, sizeof (uword));
-
-      /*
-       * Endian swap if needed. All msg data is supposed to be
-       * in network byte order. All msg handlers are supposed to
-       * know that. The generic message dumpers don't know that.
-       * One could fix apigen, I suppose.
-       */
-      if ((which == DUMP && clib_arch_is_little_endian) || endian_swap_needed)
-	{
-	  void (*endian_fp) (void *);
-	  if (msg_id >= vec_len (am->msg_endian_handlers)
-	      || (am->msg_endian_handlers[msg_id] == 0))
-	    {
-	      vlib_cli_output (vm, "Ugh: msg id %d no endian swap\n", msg_id);
-	      munmap (hp, file_size);
-	      vec_free (tmpbuf);
-	      am->replay_in_progress = 0;
-	      return;
-	    }
-	  endian_fp = am->msg_endian_handlers[msg_id];
-	  (*endian_fp) (tmpbuf + sizeof (uword));
-	}
-
-      /* msg_id always in network byte order */
-      if (clib_arch_is_little_endian)
-	{
-	  msg_idp = (u16 *) (tmpbuf + sizeof (uword));
-	  *msg_idp = msg_id;
-	}
-
-      switch (which)
-	{
-	case CUSTOM_DUMP:
-	case DUMP:
-	  if (msg_id < vec_len (am->msg_print_handlers) &&
-	      am->msg_print_handlers[msg_id])
-	    {
-	      u8 *(*print_fp) (void *, void *);
-
-	      print_fp = (void *) am->msg_print_handlers[msg_id];
-	      (*print_fp) (tmpbuf + sizeof (uword), vm);
-	    }
-	  else
-	    {
-	      vlib_cli_output (vm, "Skipping msg id %d: no print fcn\n",
-			       msg_id);
-	      break;
-	    }
-	  break;
-
-	case INITIALIZERS:
-	  if (msg_id < vec_len (am->msg_print_handlers) &&
-	      am->msg_print_handlers[msg_id])
-	    {
-	      u8 *s;
-	      int j;
-	      u8 *(*print_fp) (void *, void *);
-
-	      print_fp = (void *) am->msg_print_handlers[msg_id];
-
-	      vlib_cli_output (vm, "/*");
-
-	      (*print_fp) (tmpbuf + sizeof (uword), vm);
-	      vlib_cli_output (vm, "*/\n");
-
-	      s = format (0, "static u8 * vl_api_%s_%d[%d] = {",
-			  am->msg_names[msg_id], i,
-			  am->api_trace_cfg[msg_id].size);
-
-	      for (j = 0; j < am->api_trace_cfg[msg_id].size; j++)
-		{
-		  if ((j & 7) == 0)
-		    s = format (s, "\n    ");
-		  s = format (s, "0x%02x,", tmpbuf[sizeof (uword) + j]);
-		}
-	      s = format (s, "\n};\n%c", 0);
-	      vlib_cli_output (vm, (char *) s);
-	      vec_free (s);
-	    }
-	  break;
-
-	case REPLAY:
-	  if (msg_id < vec_len (am->msg_print_handlers) &&
-	      am->msg_print_handlers[msg_id] && cfgp->replay_enable)
-	    {
-	      void (*handler) (void *);
-
-	      handler = (void *) am->msg_handlers[msg_id];
-
-	      if (!am->is_mp_safe[msg_id])
-		vl_msg_api_barrier_sync ();
-	      (*handler) (tmpbuf + sizeof (uword));
-	      if (!am->is_mp_safe[msg_id])
-		vl_msg_api_barrier_release ();
-	    }
-	  else
-	    {
-	      if (cfgp->replay_enable)
-		vlib_cli_output (vm, "Skipping msg id %d: no handler\n",
-				 msg_id);
-	      break;
-	    }
-	  break;
-	}
-
-      _vec_len (tmpbuf) = 0;
-      msg += size;
-    }
-
-  if (saved_print_handlers)
-    {
-      clib_memcpy (am->msg_print_handlers, saved_print_handlers,
-		   vec_len (am->msg_print_handlers) * sizeof (void *));
-      vec_free (saved_print_handlers);
-    }
-
-  munmap (hp, file_size);
-  vec_free (tmpbuf);
-  am->replay_in_progress = 0;
-}
-
-u8 *
-format_vl_msg_api_trace_status (u8 * s, va_list * args)
-{
-  api_main_t *am = va_arg (*args, api_main_t *);
-  vl_api_trace_which_t which = va_arg (*args, vl_api_trace_which_t);
-  vl_api_trace_t *tp;
-  char *trace_name;
-
-  switch (which)
-    {
-    case VL_API_TRACE_TX:
-      tp = am->tx_trace;
-      trace_name = "TX trace";
-      break;
-
-    case VL_API_TRACE_RX:
-      tp = am->rx_trace;
-      trace_name = "RX trace";
-      break;
-
-    default:
-      abort ();
-    }
-
-  if (tp == 0)
-    {
-      s = format (s, "%s: not yet configured.\n", trace_name);
-      return s;
-    }
-
-  s = format (s, "%s: used %d of %d items, %s enabled, %s wrapped\n",
-	      trace_name, vec_len (tp->traces), tp->nitems,
-	      tp->enabled ? "is" : "is not", tp->wrapped ? "has" : "has not");
-  return s;
-}
 
 static u8 post_mortem_dump_enabled;
 
-static clib_error_t *
-api_trace_command_fn (vlib_main_t * vm,
-		      unformat_input_t * input, vlib_cli_command_t * cmd)
-{
-  u32 nitems = 256 << 10;
-  api_main_t *am = &api_main;
-  vl_api_trace_which_t which = VL_API_TRACE_RX;
-  u8 *filename;
-  u32 first = 0;
-  u32 last = (u32) ~ 0;
-  FILE *fp;
-  int rv;
-
-  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
-    {
-      if (unformat (input, "on") || unformat (input, "enable"))
-	{
-	  if (unformat (input, "nitems %d", &nitems))
-	    ;
-	  vl_msg_api_trace_configure (am, which, nitems);
-	  vl_msg_api_trace_onoff (am, which, 1 /* on */ );
-	}
-      else if (unformat (input, "off"))
-	{
-	  vl_msg_api_trace_onoff (am, which, 0);
-	}
-      else if (unformat (input, "save %s", &filename))
-	{
-	  u8 *chroot_filename;
-	  if (strstr ((char *) filename, "..")
-	      || index ((char *) filename, '/'))
-	    {
-	      vlib_cli_output (vm, "illegal characters in filename '%s'",
-			       filename);
-	      return 0;
-	    }
-
-	  chroot_filename = format (0, "/tmp/%s%c", filename, 0);
-
-	  vec_free (filename);
-
-	  fp = fopen ((char *) chroot_filename, "w");
-	  if (fp == NULL)
-	    {
-	      vlib_cli_output (vm, "Couldn't create %s\n", chroot_filename);
-	      return 0;
-	    }
-	  rv = vl_msg_api_trace_save (am, which, fp);
-	  fclose (fp);
-	  if (rv == -1)
-	    vlib_cli_output (vm, "API Trace data not present\n");
-	  else if (rv == -2)
-	    vlib_cli_output (vm, "File for writing is closed\n");
-	  else if (rv == -10)
-	    vlib_cli_output (vm, "Error while writing header to file\n");
-	  else if (rv == -11)
-	    vlib_cli_output (vm, "Error while writing trace to file\n");
-	  else if (rv == -12)
-	    vlib_cli_output (vm,
-			     "Error while writing end of buffer trace to file\n");
-	  else if (rv == -13)
-	    vlib_cli_output (vm,
-			     "Error while writing start of buffer trace to file\n");
-	  else if (rv < 0)
-	    vlib_cli_output (vm, "Unkown error while saving: %d", rv);
-	  else
-	    vlib_cli_output (vm, "API trace saved to %s\n", chroot_filename);
-	  vec_free (chroot_filename);
-	}
-      else if (unformat (input, "dump %s", &filename))
-	{
-	  vl_msg_api_process_file (vm, filename, first, last, DUMP);
-	}
-      else if (unformat (input, "custom-dump %s", &filename))
-	{
-	  vl_msg_api_process_file (vm, filename, first, last, CUSTOM_DUMP);
-	}
-      else if (unformat (input, "replay %s", &filename))
-	{
-	  vl_msg_api_process_file (vm, filename, first, last, REPLAY);
-	}
-      else if (unformat (input, "initializers %s", &filename))
-	{
-	  vl_msg_api_process_file (vm, filename, first, last, INITIALIZERS);
-	}
-      else if (unformat (input, "tx"))
-	{
-	  which = VL_API_TRACE_TX;
-	}
-      else if (unformat (input, "first %d", &first))
-	{
-	  ;
-	}
-      else if (unformat (input, "last %d", &last))
-	{
-	  ;
-	}
-      else if (unformat (input, "status"))
-	{
-	  vlib_cli_output (vm, "%U", format_vl_msg_api_trace_status,
-			   am, which);
-	}
-      else if (unformat (input, "free"))
-	{
-	  vl_msg_api_trace_onoff (am, which, 0);
-	  vl_msg_api_trace_free (am, which);
-	}
-      else if (unformat (input, "post-mortem-on"))
-	post_mortem_dump_enabled = 1;
-      else if (unformat (input, "post-mortem-off"))
-	post_mortem_dump_enabled = 0;
-      else
-	return clib_error_return (0, "unknown input `%U'",
-				  format_unformat_error, input);
-    }
-  return 0;
-}
-
-/* *INDENT-OFF* */
-VLIB_CLI_COMMAND (api_trace_command, static) = {
-    .path = "api trace",
-    .short_help =
-    "api trace [on|off][dump|save|replay <file>][status][free][post-mortem-on]",
-    .function = api_trace_command_fn,
-};
-/* *INDENT-ON* */
-
-static clib_error_t *
-api_config_fn (vlib_main_t * vm, unformat_input_t * input)
+void
+vl_msg_api_post_mortem_dump_enable_disable (int enable)
 {
-  u32 nitems = 256 << 10;
-  vl_api_trace_which_t which = VL_API_TRACE_RX;
-  api_main_t *am = &api_main;
-
-  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
-    {
-      if (unformat (input, "on") || unformat (input, "enable"))
-	{
-	  if (unformat (input, "nitems %d", &nitems))
-	    ;
-	  vl_msg_api_trace_configure (am, which, nitems);
-	  vl_msg_api_trace_onoff (am, which, 1 /* on */ );
-	  post_mortem_dump_enabled = 1;
-	}
-      else
-	return clib_error_return (0, "unknown input `%U'",
-				  format_unformat_error, input);
-    }
-  return 0;
+  post_mortem_dump_enabled = enable;
 }
 
-VLIB_CONFIG_FUNCTION (api_config_fn, "api-trace");
-
 void
 vl_msg_api_post_mortem_dump (void)
 {
diff --git a/src/vlibapi/node_serialize.c b/src/vlibapi/node_serialize.c
index 4dc1a7d2..50e5c41c 100644
--- a/src/vlibapi/node_serialize.c
+++ b/src/vlibapi/node_serialize.c
@@ -73,16 +73,11 @@ vlib_node_serialize (vlib_node_main_t * nm, u8 * vector,
 
   if (vec_len (stat_vms) == 0)
     {
-      if (vec_len (vlib_mains) == 0)
-	vec_add1 (stat_vms, vm);
-      else
+      for (i = 0; i < vec_len (vlib_mains); i++)
 	{
-	  for (i = 0; i < vec_len (vlib_mains); i++)
-	    {
-	      stat_vm = vlib_mains[i];
-	      if (stat_vm)
-		vec_add1 (stat_vms, stat_vm);
-	    }
+	  stat_vm = vlib_mains[i];
+	  if (stat_vm)
+	    vec_add1 (stat_vms, stat_vm);
 	}
     }
 
@@ -286,7 +281,7 @@ vlib_node_unserialize (u8 * vector)
   return nodes_by_thread;
 }
 
-#if CLIB_DEBUG > 0
+#if TEST_CODE
 
 static clib_error_t *
 test_node_serialize_command_fn (vlib_main_t * vm,
diff --git a/src/vlibmemory/memory_vlib.c b/src/vlibmemory/memory_vlib.c
index 3a7415c0..d2e05968 100644
--- a/src/vlibmemory/memory_vlib.c
+++ b/src/vlibmemory/memory_vlib.c
@@ -22,6 +22,8 @@
 #include <string.h>
 #include <unistd.h>
 #include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
 #include <signal.h>
 #include <pthread.h>
 #include <vppinfra/vec.h>
@@ -1437,6 +1439,475 @@ rpc_api_hookup (vlib_main_t * vm)
 
 VLIB_API_INIT_FUNCTION (rpc_api_hookup);
 
+typedef enum
+{
+  DUMP,			/* print each message via its print handler */
+  CUSTOM_DUMP,		/* as DUMP, after vl_msg_api_custom_dump_configure () */
+  REPLAY,		/* call each message's handler again */
+  INITIALIZERS,		/* print each message as a C byte-array initializer */
+} vl_api_replay_t;
+
+u8 *
+format_vl_msg_api_trace_status (u8 * s, va_list * args)
+{				/* %U formatter; varargs: am, then which */
+  api_main_t *am = va_arg (*args, api_main_t *);
+  vl_api_trace_which_t which = va_arg (*args, vl_api_trace_which_t);
+  vl_api_trace_t *tp;
+  char *trace_name;
+
+  switch (which)
+    {
+    case VL_API_TRACE_TX:
+      tp = am->tx_trace;
+      trace_name = "TX trace";
+      break;
+
+    case VL_API_TRACE_RX:
+      tp = am->rx_trace;
+      trace_name = "RX trace";
+      break;
+
+    default:
+      abort ();			/* neither TX nor RX was requested */
+    }
+  /* A null trace pointer is reported as "not yet configured" */
+  if (tp == 0)
+    {
+      s = format (s, "%s: not yet configured.\n", trace_name);
+      return s;
+    }
+
+  s = format (s, "%s: used %d of %d items, %s enabled, %s wrapped\n",
+	      trace_name, vec_len (tp->traces), tp->nitems,
+	      tp->enabled ? "is" : "is not", tp->wrapped ? "has" : "has not");
+  return s;
+}
+
+void vl_msg_api_custom_dump_configure (api_main_t * am)
+  __attribute__ ((weak));
+void
+vl_msg_api_custom_dump_configure (api_main_t * am)
+{				/* weak no-op; invoked for CUSTOM_DUMP */
+}
+
+/*
+ * Walk a saved API trace file and, for records first_index..last_index,
+ * print them (DUMP, CUSTOM_DUMP), emit C initializers (INITIALIZERS) or
+ * re-run their handlers (REPLAY).  last_index == ~0 selects the last record.
+ * Failures are reported through vlib_cli_output (); nothing is returned.
+ */
+static void
+vl_msg_api_process_file (vlib_main_t * vm, u8 * filename,
+			 u32 first_index, u32 last_index,
+			 vl_api_replay_t which)
+{
+  vl_api_trace_file_header_t *hp;
+  int i, fd;
+  struct stat statb;
+  size_t file_size;
+  u8 *msg;
+  u8 endian_swap_needed = 0;
+  api_main_t *am = &api_main;
+  u8 *tmpbuf = 0;
+  u32 nitems;
+  void **saved_print_handlers = 0;
+
+  fd = open ((char *) filename, O_RDONLY);
+
+  if (fd < 0)
+    {
+      vlib_cli_output (vm, "Couldn't open %s\n", filename);
+      return;
+    }
+
+  if (fstat (fd, &statb) < 0)
+    {
+      vlib_cli_output (vm, "Couldn't stat %s\n", filename);
+      close (fd);
+      return;
+    }
+
+  if (!(statb.st_mode & S_IFREG) || (statb.st_size < sizeof (*hp)))
+    {
+      vlib_cli_output (vm, "File not plausible: %s\n", filename);
+      close (fd);
+      return;
+    }
+
+  /* Round the mapping length up to a whole number of 4kB pages */
+  file_size = statb.st_size;
+  file_size = (file_size + 4095) & ~((size_t) 4095);
+
+  hp = mmap (0, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
+
+  if (hp == (vl_api_trace_file_header_t *) MAP_FAILED)
+    {
+      vlib_cli_output (vm, "mmap failed: %s\n", filename);
+      close (fd);
+      return;
+    }
+  close (fd);
+
+  if ((clib_arch_is_little_endian && hp->endian == VL_API_BIG_ENDIAN)
+      || (clib_arch_is_big_endian && hp->endian == VL_API_LITTLE_ENDIAN))
+    endian_swap_needed = 1;
+
+  if (endian_swap_needed)
+    nitems = ntohl (hp->nitems);
+  else
+    nitems = hp->nitems;
+
+  if (last_index == (u32) ~ 0)
+    last_index = nitems - 1;
+
+  if (first_index >= nitems || last_index >= nitems)
+    {
+      vlib_cli_output (vm, "Range (%d, %d) outside file range (0, %d)\n",
+		       first_index, last_index, nitems - 1);
+      munmap (hp, file_size);
+      return;
+    }
+  if (hp->wrapped)
+    vlib_cli_output (vm,
+		     "Note: wrapped/incomplete trace, results may vary\n");
+
+  if (which == CUSTOM_DUMP)
+    {
+      saved_print_handlers = (void **) vec_dup (am->msg_print_handlers);
+      vl_msg_api_custom_dump_configure (am);
+    }
+
+  /* Records follow the header: a u32 size, then that many message bytes */
+  msg = (u8 *) (hp + 1);
+
+  /* Skip the records in front of first_index */
+  for (i = 0; i < first_index; i++)
+    {
+      trace_cfg_t *cfgp;
+      int size;
+      u16 msg_id;
+
+      size = clib_host_to_net_u32 (*(u32 *) msg);
+      msg += sizeof (u32);
+
+      if (clib_arch_is_little_endian)
+	msg_id = ntohs (*((u16 *) msg));
+      else
+	msg_id = *((u16 *) msg);
+
+      cfgp = am->api_trace_cfg + msg_id;
+      if (!cfgp)	/* NOTE(review): only NULL for a NULL table and msg_id 0 */
+	{
+	  vlib_cli_output (vm, "Ugh: msg id %d no trace config\n", msg_id);
+	  munmap (hp, file_size);
+	  return;
+	}
+      msg += size;
+    }
+
+  if (which == REPLAY)
+    am->replay_in_progress = 1;
+
+  for (; i <= last_index; i++)
+    {
+      trace_cfg_t *cfgp;
+      u16 *msg_idp;
+      u16 msg_id;
+      int size;
+
+      if (which == DUMP)
+	vlib_cli_output (vm, "---------- trace %d -----------\n", i);
+
+      size = clib_host_to_net_u32 (*(u32 *) msg);
+      msg += sizeof (u32);
+
+      if (clib_arch_is_little_endian)
+	msg_id = ntohs (*((u16 *) msg));
+      else
+	msg_id = *((u16 *) msg);
+
+      cfgp = am->api_trace_cfg + msg_id;
+      if (!cfgp)	/* NOTE(review): only NULL for a NULL table and msg_id 0 */
+	{
+	  vlib_cli_output (vm, "Ugh: msg id %d no trace config\n", msg_id);
+	  munmap (hp, file_size);
+	  vec_free (tmpbuf);
+	  am->replay_in_progress = 0;
+	  return;
+	}
+
+      /* Copy the buffer (from the read-only mmap'ed file) */
+      vec_validate (tmpbuf, size - 1 + sizeof (uword));
+      clib_memcpy (tmpbuf + sizeof (uword), msg, size);
+      memset (tmpbuf, 0xf, sizeof (uword));
+
+      /*
+       * Endian swap if needed. All msg data is supposed to be
+       * in network byte order. All msg handlers are supposed to
+       * know that. The generic message dumpers don't know that.
+       * One could fix apigen, I suppose.
+       */
+      if ((which == DUMP && clib_arch_is_little_endian) || endian_swap_needed)
+	{
+	  void (*endian_fp) (void *);
+	  if (msg_id >= vec_len (am->msg_endian_handlers)
+	      || (am->msg_endian_handlers[msg_id] == 0))
+	    {
+	      vlib_cli_output (vm, "Ugh: msg id %d no endian swap\n", msg_id);
+	      munmap (hp, file_size);
+	      vec_free (tmpbuf);
+	      am->replay_in_progress = 0;
+	      return;
+	    }
+	  endian_fp = am->msg_endian_handlers[msg_id];
+	  (*endian_fp) (tmpbuf + sizeof (uword));
+	}
+
+      /* msg_id always in network byte order */
+      if (clib_arch_is_little_endian)
+	{
+	  msg_idp = (u16 *) (tmpbuf + sizeof (uword));
+	  *msg_idp = msg_id;
+	}
+
+      switch (which)
+	{
+	case CUSTOM_DUMP:
+	case DUMP:
+	  if (msg_id < vec_len (am->msg_print_handlers) &&
+	      am->msg_print_handlers[msg_id])
+	    {
+	      u8 *(*print_fp) (void *, void *);
+
+	      print_fp = (void *) am->msg_print_handlers[msg_id];
+	      (*print_fp) (tmpbuf + sizeof (uword), vm);
+	    }
+	  else
+	    vlib_cli_output (vm, "Skipping msg id %d: no print fcn\n",
+			     msg_id);
+	  break;
+
+	case INITIALIZERS:
+	  if (msg_id < vec_len (am->msg_print_handlers) &&
+	      am->msg_print_handlers[msg_id])
+	    {
+	      u8 *s;
+	      int j;
+	      u8 *(*print_fp) (void *, void *);
+
+	      print_fp = (void *) am->msg_print_handlers[msg_id];
+
+	      vlib_cli_output (vm, "/*");
+	      (*print_fp) (tmpbuf + sizeof (uword), vm);
+	      vlib_cli_output (vm, "*/\n");
+
+	      s = format (0, "static u8 * vl_api_%s_%d[%d] = {",
+			  am->msg_names[msg_id], i,
+			  am->api_trace_cfg[msg_id].size);
+
+	      for (j = 0; j < am->api_trace_cfg[msg_id].size; j++)
+		{
+		  if ((j & 7) == 0)
+		    s = format (s, "\n    ");
+		  s = format (s, "0x%02x,", tmpbuf[sizeof (uword) + j]);
+		}
+	      s = format (s, "\n};\n%c", 0);
+	      vlib_cli_output (vm, "%s", s);	/* s is data, not a format */
+	      vec_free (s);
+	    }
+	  break;
+
+	case REPLAY:
+	  /* NOTE(review): guards on msg_print_handlers, calls msg_handlers */
+	  if (msg_id < vec_len (am->msg_print_handlers) &&
+	      am->msg_print_handlers[msg_id] && cfgp->replay_enable)
+	    {
+	      void (*handler) (void *);
+
+	      handler = (void *) am->msg_handlers[msg_id];
+
+	      /* Handlers not flagged mp-safe run between barrier calls */
+	      if (!am->is_mp_safe[msg_id])
+		vl_msg_api_barrier_sync ();
+	      (*handler) (tmpbuf + sizeof (uword));
+	      if (!am->is_mp_safe[msg_id])
+		vl_msg_api_barrier_release ();
+	    }
+	  else if (cfgp->replay_enable)
+	    vlib_cli_output (vm, "Skipping msg id %d: no handler\n",
+			     msg_id);
+	  break;
+	}
+
+      _vec_len (tmpbuf) = 0;
+      msg += size;
+    }
+
+  if (saved_print_handlers)
+    {
+      clib_memcpy (am->msg_print_handlers, saved_print_handlers,
+		   vec_len (am->msg_print_handlers) * sizeof (void *));
+      vec_free (saved_print_handlers);
+    }
+
+  munmap (hp, file_size);
+  vec_free (tmpbuf);
+  am->replay_in_progress = 0;
+}
+
+/*
+ * CLI handler for "api trace".
+ *
+ * Keywords are consumed in the order given, so the modifiers "tx",
+ * "first <n>" and "last <n>" only influence the actions that follow
+ * them on the command line.  The trace direction starts out as
+ * VL_API_TRACE_RX.
+ *
+ * "save <file>" rejects names containing ".." or '/' and writes the
+ * trace to /tmp/<file>.  A rejected name or a failed fopen() is
+ * reported on the CLI and ends processing with a 0 return.
+ *
+ * Returns 0 when the input has been consumed, or a clib error for an
+ * unrecognized keyword.
+ */
+static clib_error_t *
+api_trace_command_fn (vlib_main_t * vm,
+		      unformat_input_t * input, vlib_cli_command_t * cmd)
+{
+  u32 nitems = 256 << 10;
+  api_main_t *am = &api_main;
+  vl_api_trace_which_t which = VL_API_TRACE_RX;
+  u8 *filename;
+  u32 first = 0;
+  u32 last = (u32) ~ 0;
+  FILE *fp;
+  int rv;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat (input, "on") || unformat (input, "enable"))
+	{
+	  /* optional size override; the default is kept when absent */
+	  if (unformat (input, "nitems %d", &nitems))
+	    ;
+	  vl_msg_api_trace_configure (am, which, nitems);
+	  vl_msg_api_trace_onoff (am, which, 1 /* on */ );
+	}
+      else if (unformat (input, "off"))
+	{
+	  vl_msg_api_trace_onoff (am, which, 0);
+	}
+      else if (unformat (input, "save %s", &filename))
+	{
+	  u8 *chroot_filename;
+	  if (strstr ((char *) filename, "..")
+	      || index ((char *) filename, '/'))
+	    {
+	      vlib_cli_output (vm, "illegal characters in filename '%s'",
+			       filename);
+	      /* release the parsed name before bailing out */
+	      vec_free (filename);
+	      return 0;
+	    }
+
+	  chroot_filename = format (0, "/tmp/%s%c", filename, 0);
+
+	  vec_free (filename);
+
+	  fp = fopen ((char *) chroot_filename, "w");
+	  if (fp == NULL)
+	    {
+	      vlib_cli_output (vm, "Couldn't create %s\n", chroot_filename);
+	      /* the formatted path is ours to free on this path too */
+	      vec_free (chroot_filename);
+	      return 0;
+	    }
+	  rv = vl_msg_api_trace_save (am, which, fp);
+	  fclose (fp);
+	  if (rv == -1)
+	    vlib_cli_output (vm, "API Trace data not present\n");
+	  else if (rv == -2)
+	    vlib_cli_output (vm, "File for writing is closed\n");
+	  else if (rv == -10)
+	    vlib_cli_output (vm, "Error while writing header to file\n");
+	  else if (rv == -11)
+	    vlib_cli_output (vm, "Error while writing trace to file\n");
+	  else if (rv == -12)
+	    vlib_cli_output (vm,
+			     "Error while writing end of buffer trace to file\n");
+	  else if (rv == -13)
+	    vlib_cli_output (vm,
+			     "Error while writing start of buffer trace to file\n");
+	  else if (rv < 0)
+	    vlib_cli_output (vm, "Unkown error while saving: %d", rv);
+	  else
+	    vlib_cli_output (vm, "API trace saved to %s\n", chroot_filename);
+	  vec_free (chroot_filename);
+	}
+      /* NOTE(review): the four file-processing branches below do not free
+         filename here; confirm whether vl_msg_api_process_file owns it. */
+      else if (unformat (input, "dump %s", &filename))
+	{
+	  vl_msg_api_process_file (vm, filename, first, last, DUMP);
+	}
+      else if (unformat (input, "custom-dump %s", &filename))
+	{
+	  vl_msg_api_process_file (vm, filename, first, last, CUSTOM_DUMP);
+	}
+      else if (unformat (input, "replay %s", &filename))
+	{
+	  vl_msg_api_process_file (vm, filename, first, last, REPLAY);
+	}
+      else if (unformat (input, "initializers %s", &filename))
+	{
+	  vl_msg_api_process_file (vm, filename, first, last, INITIALIZERS);
+	}
+      else if (unformat (input, "tx"))
+	{
+	  which = VL_API_TRACE_TX;
+	}
+      else if (unformat (input, "first %d", &first))
+	{
+	  ;
+	}
+      else if (unformat (input, "last %d", &last))
+	{
+	  ;
+	}
+      else if (unformat (input, "status"))
+	{
+	  vlib_cli_output (vm, "%U", format_vl_msg_api_trace_status,
+			   am, which);
+	}
+      else if (unformat (input, "free"))
+	{
+	  /* stop tracing before the trace storage is released */
+	  vl_msg_api_trace_onoff (am, which, 0);
+	  vl_msg_api_trace_free (am, which);
+	}
+      else if (unformat (input, "post-mortem-on"))
+	vl_msg_api_post_mortem_dump_enable_disable (1 /* enable */ );
+      else if (unformat (input, "post-mortem-off"))
+	vl_msg_api_post_mortem_dump_enable_disable (0 /* enable */ );
+      else
+	return clib_error_return (0, "unknown input `%U'",
+				  format_unformat_error, input);
+    }
+  return 0;
+}
+
+/* *INDENT-OFF* */
+/* "api trace" CLI registration; short_help lists every keyword that
+   api_trace_command_fn parses. */
+VLIB_CLI_COMMAND (api_trace_command, static) = {
+    .path = "api trace",
+    .short_help =
+    "api trace [tx] [first <n>] [last <n>] [on|enable [nitems <n>]] [off]"
+    " [save|dump|custom-dump|replay|initializers <file>]"
+    " [status] [free] [post-mortem-on|post-mortem-off]",
+    .function = api_trace_command_fn,
+};
+/* *INDENT-ON* */
+
+/*
+ * Configuration handler registered for "api-trace".
+ *
+ * Each "on" / "enable" keyword, optionally followed by "nitems <n>",
+ * configures and switches on RX API tracing and enables the
+ * post-mortem dump.  Any other input yields a clib error.
+ */
+static clib_error_t *
+api_config_fn (vlib_main_t * vm, unformat_input_t * input)
+{
+  api_main_t *apim = &api_main;
+  vl_api_trace_which_t direction = VL_API_TRACE_RX;
+  u32 trace_size = 256 << 10;
+
+  while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+    {
+      /* Only the two enabling keywords are understood here. */
+      if (!unformat (input, "on") && !unformat (input, "enable"))
+	return clib_error_return (0, "unknown input `%U'",
+				  format_unformat_error, input);
+
+      /* Optional size override; the current value is kept when absent. */
+      (void) unformat (input, "nitems %d", &trace_size);
+
+      vl_msg_api_trace_configure (apim, direction, trace_size);
+      vl_msg_api_trace_onoff (apim, direction, 1 /* on */ );
+      vl_msg_api_post_mortem_dump_enable_disable (1 /* enable */ );
+    }
+  return 0;
+}
+
+VLIB_CONFIG_FUNCTION (api_config_fn, "api-trace");
+
 /*
  * fd.io coding-style-patch-verification: ON
  *
diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c
index b6b4c04a..100ec613 100644
--- a/src/vnet/devices/virtio/vhost-user.c
+++ b/src/vnet/devices/virtio/vhost-user.c
@@ -374,8 +374,7 @@ vhost_user_rx_thread_placement ()
   for (i = vum->input_cpu_first_index;
        i < vum->input_cpu_first_index + vum->input_cpu_count; i++)
     {
-      vlib_node_set_state (vlib_mains ? vlib_mains[i] : &vlib_global_main,
-			   vhost_user_input_node.index,
+      vlib_node_set_state (vlib_mains[i], vhost_user_input_node.index,
 			   VLIB_NODE_STATE_DISABLED);
       vec_add1 (workers, i);
     }
@@ -406,9 +405,9 @@ vhost_user_rx_thread_placement ()
 	  iaq.qid = qid;
 	  iaq.vhost_iface_index = vui - vum->vhost_user_interfaces;
 	  vec_add1 (vhc->rx_queues, iaq);
-	  vlib_node_set_state (vlib_mains ? vlib_mains[cpu_index] :
-	      &vlib_global_main, vhost_user_input_node.index,
-	      VLIB_NODE_STATE_POLLING);
+	  vlib_node_set_state (vlib_mains[cpu_index],
+                               vhost_user_input_node.index,
+                               VLIB_NODE_STATE_POLLING);
 	}
   });
   /* *INDENT-ON* */
diff --git a/src/vpp-api-test.am b/src/vpp-api-test.am
index f0d5df62..ceab687c 100644
--- a/src/vpp-api-test.am
+++ b/src/vpp-api-test.am
@@ -34,14 +34,12 @@ vpp_json_test_SOURCES = \
   vat/json_test.c
 
 vpp_api_test_LDADD = \
-  libvlib.la				\
   libvlibmemoryclient.la		\
   libsvm.la				\
   libvatplugin.la			\
   libvppinfra.la 			\
   libvlibapi.la				\
   libvlibmemory.la			\
-  libvnet.la				\
   -lpthread -lm -lrt -ldl -lcrypto
 
 vpp_api_test_LDFLAGS = -Wl,--export-dynamic
diff --git a/src/vpp/api/api.c b/src/vpp/api/api.c
index 828394ed..c85dc680 100644
--- a/src/vpp/api/api.c
+++ b/src/vpp/api/api.c
@@ -2143,7 +2143,6 @@ vpe_api_init (vlib_main_t * vm)
   am->oam_events_registration_hash = hash_create (0, sizeof (uword));
   am->bfd_events_registration_hash = hash_create (0, sizeof (uword));
 
-  vl_api_init (vm);
   vl_set_memory_region_name ("/vpe-api");
   vl_enable_disable_memory_api (vm, 1 /* enable it */ );
 
diff --git a/src/vpp/api/gmon.c b/src/vpp/api/gmon.c
index 610f40ed..277be8c0 100644
--- a/src/vpp/api/gmon.c
+++ b/src/vpp/api/gmon.c
@@ -122,13 +122,8 @@ gmon_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f)
   /* Initial wait for the world to settle down */
   vlib_process_suspend (vm, 5.0);
 
-  if (vec_len (vlib_mains) == 0)
-    vec_add1 (gm->my_vlib_mains, &vlib_global_main);
-  else
-    {
-      for (i = 0; i < vec_len (vlib_mains); i++)
-	vec_add1 (gm->my_vlib_mains, vlib_mains[i]);
-    }
+  for (i = 0; i < vec_len (vlib_mains); i++)
+    vec_add1 (gm->my_vlib_mains, vlib_mains[i]);
 
   while (1)
     {
-- 
cgit 1.2.3-korg


From e9d52d54361296af520e1ece0c25307a2d86c018 Mon Sep 17 00:00:00 2001
From: Damjan Marion <damarion@cisco.com>
Date: Thu, 9 Mar 2017 15:42:26 +0100
Subject: vlib: deduplicate code in main and worker main loop

Change-Id: Id18d59c9442602633a6310b2001a95bce8b6b232
Signed-off-by: Damjan Marion <damarion@cisco.com>
---
 src/vlib/main.c    | 176 +++++++++++++++++++++++++++++++++--------------------
 src/vlib/main.h    |   2 +
 src/vlib/threads.c |  76 +----------------------
 src/vlib/threads.h |   4 +-
 4 files changed, 116 insertions(+), 142 deletions(-)

(limited to 'src/vlib/threads.c')

diff --git a/src/vlib/main.c b/src/vlib/main.c
index 09f34bbd..91760706 100644
--- a/src/vlib/main.c
+++ b/src/vlib/main.c
@@ -1398,51 +1398,75 @@ dispatch_suspended_process (vlib_main_t * vm,
   return t;
 }
 
-static void
-vlib_main_loop (vlib_main_t * vm)
+static_always_inline void
+vlib_main_or_worker_loop (vlib_main_t * vm, int is_main)
 {
   vlib_node_main_t *nm = &vm->node_main;
+  vlib_thread_main_t *tm = vlib_get_thread_main ();
   uword i;
   u64 cpu_time_now;
+  vlib_frame_queue_main_t *fqm;
 
   /* Initialize pending node vector. */
-  vec_resize (nm->pending_frames, 32);
-  _vec_len (nm->pending_frames) = 0;
+  if (is_main)
+    {
+      vec_resize (nm->pending_frames, 32);
+      _vec_len (nm->pending_frames) = 0;
+    }
 
   /* Mark time of main loop start. */
-  cpu_time_now = vm->clib_time.last_cpu_time;
-  vm->cpu_time_main_loop_start = cpu_time_now;
+  if (is_main)
+    {
+      cpu_time_now = vm->clib_time.last_cpu_time;
+      vm->cpu_time_main_loop_start = cpu_time_now;
+    }
+  else
+    cpu_time_now = clib_cpu_time_now ();
 
   /* Arrange for first level of timing wheel to cover times we care
      most about. */
-  nm->timing_wheel.min_sched_time = 10e-6;
-  nm->timing_wheel.max_sched_time = 10e-3;
-  timing_wheel_init (&nm->timing_wheel,
-		     cpu_time_now, vm->clib_time.clocks_per_second);
+  if (is_main)
+    {
+      nm->timing_wheel.min_sched_time = 10e-6;
+      nm->timing_wheel.max_sched_time = 10e-3;
+      timing_wheel_init (&nm->timing_wheel,
+			 cpu_time_now, vm->clib_time.clocks_per_second);
+      vec_alloc (nm->data_from_advancing_timing_wheel, 32);
+    }
 
   /* Pre-allocate expired nodes. */
-  vec_alloc (nm->data_from_advancing_timing_wheel, 32);
   vec_alloc (nm->pending_interrupt_node_runtime_indices, 32);
 
-  if (!nm->polling_threshold_vector_length)
-    nm->polling_threshold_vector_length = 10;
-  if (!nm->interrupt_threshold_vector_length)
-    nm->interrupt_threshold_vector_length = 5;
+  if (is_main)
+    {
+      if (!nm->polling_threshold_vector_length)
+	nm->polling_threshold_vector_length = 10;
+      if (!nm->interrupt_threshold_vector_length)
+	nm->interrupt_threshold_vector_length = 5;
 
-  nm->current_process_index = ~0;
+      nm->current_process_index = ~0;
+    }
 
   /* Start all processes. */
-  {
-    uword i;
-    for (i = 0; i < vec_len (nm->processes); i++)
-      cpu_time_now =
-	dispatch_process (vm, nm->processes[i], /* frame */ 0, cpu_time_now);
-  }
+  if (is_main)
+    {
+      uword i;
+      for (i = 0; i < vec_len (nm->processes); i++)
+	cpu_time_now = dispatch_process (vm, nm->processes[i], /* frame */ 0,
+					 cpu_time_now);
+    }
 
   while (1)
     {
       vlib_node_runtime_t *n;
 
+      if (!is_main)
+	{
+	  vlib_worker_thread_barrier_check ();
+	  vec_foreach (fqm, tm->frame_queue_mains)
+	    vlib_frame_queue_dequeue (vm, fqm);
+	}
+
       /* Process pre-input nodes. */
       vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_PRE_INPUT])
 	cpu_time_now = dispatch_node (vm, n,
@@ -1459,7 +1483,7 @@ vlib_main_loop (vlib_main_t * vm)
 				      /* frame */ 0,
 				      cpu_time_now);
 
-      if (PREDICT_TRUE (vm->queue_signal_pending == 0))
+      if (PREDICT_TRUE (is_main && vm->queue_signal_pending == 0))
 	vm->queue_signal_callback (vm);
 
       /* Next handle interrupts. */
@@ -1484,58 +1508,64 @@ vlib_main_loop (vlib_main_t * vm)
 	  }
       }
 
-      /* Check if process nodes have expired from timing wheel. */
-      nm->data_from_advancing_timing_wheel
-	= timing_wheel_advance (&nm->timing_wheel, cpu_time_now,
-				nm->data_from_advancing_timing_wheel,
-				&nm->cpu_time_next_process_ready);
-
-      ASSERT (nm->data_from_advancing_timing_wheel != 0);
-      if (PREDICT_FALSE (_vec_len (nm->data_from_advancing_timing_wheel) > 0))
+      if (is_main)
 	{
-	  uword i;
-
-	processes_timing_wheel_data:
-	  for (i = 0; i < _vec_len (nm->data_from_advancing_timing_wheel);
-	       i++)
+	  /* Check if process nodes have expired from timing wheel. */
+	  nm->data_from_advancing_timing_wheel
+	    = timing_wheel_advance (&nm->timing_wheel, cpu_time_now,
+				    nm->data_from_advancing_timing_wheel,
+				    &nm->cpu_time_next_process_ready);
+
+	  ASSERT (nm->data_from_advancing_timing_wheel != 0);
+	  if (PREDICT_FALSE
+	      (_vec_len (nm->data_from_advancing_timing_wheel) > 0))
 	    {
-	      u32 d = nm->data_from_advancing_timing_wheel[i];
-	      u32 di = vlib_timing_wheel_data_get_index (d);
+	      uword i;
 
-	      if (vlib_timing_wheel_data_is_timed_event (d))
+	    processes_timing_wheel_data:
+	      for (i = 0; i < _vec_len (nm->data_from_advancing_timing_wheel);
+		   i++)
 		{
-		  vlib_signal_timed_event_data_t *te =
-		    pool_elt_at_index (nm->signal_timed_event_data_pool, di);
-		  vlib_node_t *n = vlib_get_node (vm, te->process_node_index);
-		  vlib_process_t *p =
-		    vec_elt (nm->processes, n->runtime_index);
-		  void *data;
-		  data =
-		    vlib_process_signal_event_helper (nm, n, p,
-						      te->event_type_index,
-						      te->n_data_elts,
-						      te->n_data_elt_bytes);
-		  if (te->n_data_bytes < sizeof (te->inline_event_data))
-		    clib_memcpy (data, te->inline_event_data,
-				 te->n_data_bytes);
+		  u32 d = nm->data_from_advancing_timing_wheel[i];
+		  u32 di = vlib_timing_wheel_data_get_index (d);
+
+		  if (vlib_timing_wheel_data_is_timed_event (d))
+		    {
+		      vlib_signal_timed_event_data_t *te =
+			pool_elt_at_index (nm->signal_timed_event_data_pool,
+					   di);
+		      vlib_node_t *n =
+			vlib_get_node (vm, te->process_node_index);
+		      vlib_process_t *p =
+			vec_elt (nm->processes, n->runtime_index);
+		      void *data;
+		      data =
+			vlib_process_signal_event_helper (nm, n, p,
+							  te->event_type_index,
+							  te->n_data_elts,
+							  te->n_data_elt_bytes);
+		      if (te->n_data_bytes < sizeof (te->inline_event_data))
+			clib_memcpy (data, te->inline_event_data,
+				     te->n_data_bytes);
+		      else
+			{
+			  clib_memcpy (data, te->event_data_as_vector,
+				       te->n_data_bytes);
+			  vec_free (te->event_data_as_vector);
+			}
+		      pool_put (nm->signal_timed_event_data_pool, te);
+		    }
 		  else
 		    {
-		      clib_memcpy (data, te->event_data_as_vector,
-				   te->n_data_bytes);
-		      vec_free (te->event_data_as_vector);
+		      cpu_time_now = clib_cpu_time_now ();
+		      cpu_time_now =
+			dispatch_suspended_process (vm, di, cpu_time_now);
 		    }
-		  pool_put (nm->signal_timed_event_data_pool, te);
 		}
-	      else
-		{
-		  cpu_time_now = clib_cpu_time_now ();
-		  cpu_time_now =
-		    dispatch_suspended_process (vm, di, cpu_time_now);
-		}
-	    }
 
-	  /* Reset vector. */
-	  _vec_len (nm->data_from_advancing_timing_wheel) = 0;
+	      /* Reset vector. */
+	      _vec_len (nm->data_from_advancing_timing_wheel) = 0;
+	    }
 	}
 
       /* Input nodes may have added work to the pending vector.
@@ -1548,7 +1578,7 @@ vlib_main_loop (vlib_main_t * vm)
       _vec_len (nm->pending_frames) = 0;
 
       /* Pending internal nodes may resume processes. */
-      if (_vec_len (nm->data_from_advancing_timing_wheel) > 0)
+      if (is_main && _vec_len (nm->data_from_advancing_timing_wheel) > 0)
 	goto processes_timing_wheel_data;
 
       vlib_increment_main_loop_counter (vm);
@@ -1559,6 +1589,18 @@ vlib_main_loop (vlib_main_t * vm)
     }
 }
 
+/* Run vlib_main_or_worker_loop with is_main set. */
+static void
+vlib_main_loop (vlib_main_t * vm)
+{
+  const int is_main = 1;
+
+  vlib_main_or_worker_loop (vm, is_main);
+}
+
+/* Run vlib_main_or_worker_loop with is_main cleared. */
+void
+vlib_worker_loop (vlib_main_t * vm)
+{
+  const int is_main = 0;
+
+  vlib_main_or_worker_loop (vm, is_main);
+}
+
 vlib_main_t vlib_global_main;
 
 static clib_error_t *
diff --git a/src/vlib/main.h b/src/vlib/main.h
index d9ac1445..a6d50b39 100644
--- a/src/vlib/main.h
+++ b/src/vlib/main.h
@@ -178,6 +178,8 @@ typedef struct vlib_main_t
 /* Global main structure. */
 extern vlib_main_t vlib_global_main;
 
+void vlib_worker_loop (vlib_main_t * vm);
+
 always_inline f64
 vlib_time_now (vlib_main_t * vm)
 {
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index 4676be97..07dbff33 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -1208,9 +1208,8 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm)
  * If so, pull the packets off the frames and put them to
  * the handoff node.
  */
-static inline int
-vlib_frame_queue_dequeue_internal (vlib_main_t * vm,
-				   vlib_frame_queue_main_t * fqm)
+int
+vlib_frame_queue_dequeue (vlib_main_t * vm, vlib_frame_queue_main_t * fqm)
 {
   u32 thread_id = vm->cpu_index;
   vlib_frame_queue_t *fq = fqm->vlib_frame_queues[thread_id];
@@ -1337,75 +1336,6 @@ vlib_frame_queue_dequeue_internal (vlib_main_t * vm,
   return processed;
 }
 
-static_always_inline void
-vlib_worker_thread_internal (vlib_main_t * vm)
-{
-  vlib_node_main_t *nm = &vm->node_main;
-  vlib_thread_main_t *tm = vlib_get_thread_main ();
-  u64 cpu_time_now = clib_cpu_time_now ();
-  vlib_frame_queue_main_t *fqm;
-
-  vec_alloc (nm->pending_interrupt_node_runtime_indices, 32);
-
-  while (1)
-    {
-      vlib_worker_thread_barrier_check ();
-
-      vec_foreach (fqm, tm->frame_queue_mains)
-	vlib_frame_queue_dequeue_internal (vm, fqm);
-
-      vlib_node_runtime_t *n;
-      vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_INPUT])
-      {
-	cpu_time_now = dispatch_node (vm, n, VLIB_NODE_TYPE_INPUT,
-				      VLIB_NODE_STATE_POLLING, /* frame */ 0,
-				      cpu_time_now);
-      }
-
-      /* Next handle interrupts. */
-      {
-	uword l = _vec_len (nm->pending_interrupt_node_runtime_indices);
-	uword i;
-	if (l > 0)
-	  {
-	    _vec_len (nm->pending_interrupt_node_runtime_indices) = 0;
-	    for (i = 0; i < l; i++)
-	      {
-		n = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT],
-				      nm->
-				      pending_interrupt_node_runtime_indices
-				      [i]);
-		cpu_time_now =
-		  dispatch_node (vm, n, VLIB_NODE_TYPE_INPUT,
-				 VLIB_NODE_STATE_INTERRUPT,
-				 /* frame */ 0,
-				 cpu_time_now);
-	      }
-	  }
-      }
-
-      if (_vec_len (nm->pending_frames))
-	{
-	  int i;
-	  cpu_time_now = clib_cpu_time_now ();
-	  for (i = 0; i < _vec_len (nm->pending_frames); i++)
-	    {
-	      vlib_pending_frame_t *p;
-
-	      p = nm->pending_frames + i;
-
-	      cpu_time_now = dispatch_pending_node (vm, p, cpu_time_now);
-	    }
-	  _vec_len (nm->pending_frames) = 0;
-	}
-      vlib_increment_main_loop_counter (vm);
-
-      /* Record time stamp in case there are no enabled nodes and above
-         calls do not update time stamp. */
-      cpu_time_now = clib_cpu_time_now ();
-    }
-}
-
 void
 vlib_worker_thread_fn (void *arg)
 {
@@ -1423,7 +1353,7 @@ vlib_worker_thread_fn (void *arg)
   while (tm->extern_thread_mgmt && tm->worker_thread_release == 0)
     vlib_worker_thread_barrier_check ();
 
-  vlib_worker_thread_internal (vm);
+  vlib_worker_loop (vm);
 }
 
 /* *INDENT-OFF* */
diff --git a/src/vlib/threads.h b/src/vlib/threads.h
index a032311c..fc1633f6 100644
--- a/src/vlib/threads.h
+++ b/src/vlib/threads.h
@@ -159,8 +159,8 @@ int vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index,
 			      u32 frame_queue_index, vlib_frame_t * frame,
 			      vlib_frame_queue_msg_type_t type);
 
-int vlib_frame_queue_dequeue (int thread_id,
-			      vlib_main_t * vm, vlib_node_main_t * nm);
+int
+vlib_frame_queue_dequeue (vlib_main_t * vm, vlib_frame_queue_main_t * fqm);
 
 u64 dispatch_node (vlib_main_t * vm,
 		   vlib_node_runtime_t * node,
-- 
cgit 1.2.3-korg


From e9f929b52ddb741ec1e4cb2d92c6be1e798933a0 Mon Sep 17 00:00:00 2001
From: Damjan Marion <damarion@cisco.com>
Date: Thu, 16 Mar 2017 11:32:09 +0100
Subject: vlib: make runtime_data thread-local

Change-Id: I4aa3e7e42fb81211de1aed07dc7befee87a1e18b
Signed-off-by: Damjan Marion <damarion@cisco.com>
---
 src/vlib/init.h                  |  1 +
 src/vlib/main.h                  |  1 +
 src/vlib/node.c                  |  4 +-
 src/vlib/node.h                  | 81 +++++++++++++++++++++-------------------
 src/vlib/threads.c               | 61 ++++++++++++++++++++++++++++--
 src/vnet/gre/node.c              | 26 ++++++++++---
 src/vnet/hdlc/node.c             | 27 +++++++++-----
 src/vnet/l2/l2_input_classify.c  | 15 ++++++++
 src/vnet/l2/l2_output_classify.c | 16 ++++++++
 src/vnet/l2tp/l2tp.c             | 10 +++++
 src/vnet/mpls/node.c             | 13 +++++++
 src/vnet/ppp/node.c              | 27 +++++++++-----
 src/vnet/tcp/tcp_syn_filter4.c   | 23 +++++++-----
 src/vnet/udp/udp_local.c         | 74 ++++++++++++++++++++++--------------
 14 files changed, 273 insertions(+), 106 deletions(-)

(limited to 'src/vlib/threads.c')

diff --git a/src/vlib/init.h b/src/vlib/init.h
index 4fa5b304..12db3f90 100644
--- a/src/vlib/init.h
+++ b/src/vlib/init.h
@@ -109,6 +109,7 @@ static void __vlib_add_##tag##_function_##x (void)              \
 }
 
 #define VLIB_INIT_FUNCTION(x) VLIB_DECLARE_INIT_FUNCTION(x,init)
+#define VLIB_WORKER_INIT_FUNCTION(x) VLIB_DECLARE_INIT_FUNCTION(x,worker_init)
 
 #define VLIB_MAIN_LOOP_ENTER_FUNCTION(x) \
   VLIB_DECLARE_INIT_FUNCTION(x,main_loop_enter)
diff --git a/src/vlib/main.h b/src/vlib/main.h
index a6d50b39..98bc823d 100644
--- a/src/vlib/main.h
+++ b/src/vlib/main.h
@@ -162,6 +162,7 @@ typedef struct vlib_main_t
 
   /* List of init functions to call, setup by constructors */
   _vlib_init_function_list_elt_t *init_function_registrations;
+  _vlib_init_function_list_elt_t *worker_init_function_registrations;
   _vlib_init_function_list_elt_t *main_loop_enter_function_registrations;
   _vlib_init_function_list_elt_t *main_loop_exit_function_registrations;
   _vlib_init_function_list_elt_t *api_init_function_registrations;
diff --git a/src/vlib/node.c b/src/vlib/node.c
index c419a13a..dc0a4de5 100644
--- a/src/vlib/node.c
+++ b/src/vlib/node.c
@@ -434,9 +434,7 @@ register_node (vlib_main_t * vm, vlib_node_registration_t * r)
       rt->errors[i] = vlib_error_set (n->index, i);
 
     STATIC_ASSERT_SIZEOF (vlib_node_runtime_t, 128);
-    ASSERT (vec_len (n->runtime_data) <=
-	    sizeof (vlib_node_runtime_t) -
-	    STRUCT_OFFSET_OF (vlib_node_runtime_t, runtime_data));
+    ASSERT (vec_len (n->runtime_data) <= VLIB_NODE_RUNTIME_DATA_SIZE);
 
     if (vec_len (n->runtime_data) > 0)
       clib_memcpy (rt->runtime_data, n->runtime_data,
diff --git a/src/vlib/node.h b/src/vlib/node.h
index b624e9d6..2a532cc3 100644
--- a/src/vlib/node.h
+++ b/src/vlib/node.h
@@ -411,65 +411,68 @@ typedef struct
 
 typedef struct vlib_node_runtime_t
 {
-  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
-  /* Node function to call. */
-  vlib_node_function_t *function;
+  CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);	/**< cacheline mark */
 
-  /* Vector of errors for this node. */
-  vlib_error_t *errors;
+  vlib_node_function_t *function;	/**< Node function to call. */
 
-  /* Number of clock cycles. */
-  u32 clocks_since_last_overflow;
+  vlib_error_t *errors;			/**< Vector of errors for this node. */
 
-  /* Maximum clock cycle for an invocation. */
-  u32 max_clock;
+  u32 clocks_since_last_overflow;	/**< Number of clock cycles. */
 
-  /* Number of vectors in the recorded max_clock. */
-  u32 max_clock_n;
+  u32 max_clock;			/**< Maximum clock cycle for an
+					  invocation. */
 
-  /* Number of calls. */
-  u32 calls_since_last_overflow;
+  u32 max_clock_n;			/**< Number of vectors in the recorded
+					  max_clock. */
 
-  /* Number of vector elements processed by this node. */
-  u32 vectors_since_last_overflow;
+  u32 calls_since_last_overflow;	/**< Number of calls. */
 
-  /* Start of next frames for this node. */
-  u32 next_frame_index;
+  u32 vectors_since_last_overflow;	/**< Number of vector elements
+					  processed by this node. */
 
-  /* Node index. */
-  u32 node_index;
+  u32 next_frame_index;			/**< Start of next frames for this
+					  node. */
 
-  /* For input nodes: decremented on each main loop interation until it reaches zero
-     and function is called.  Allows some input nodes to be called
-     more than others. */
-  u32 input_main_loops_per_call;
+  u32 node_index;			/**< Node index. */
 
-  /* Saved main loop counter of last dispatch of this node. */
-  u32 main_loop_count_last_dispatch;
+  u32 input_main_loops_per_call;	/**< For input nodes: decremented
+					  on each main loop iteration until
+					  it reaches zero and function is
+					  called.  Allows some input nodes to
+					  be called more than others. */
+
+  u32 main_loop_count_last_dispatch;	/**< Saved main loop counter of last
+					  dispatch of this node. */
 
   u32 main_loop_vector_stats[2];
 
-  /* Copy of main node flags. */
-  u16 flags;
+  u16 flags;				/**< Copy of main node flags. */
 
-  /* Input node state. */
-  u16 state;
+  u16 state;				/**< Input node state. */
 
   u16 n_next_nodes;
 
-  /* Next frame index that vector arguments were last enqueued to
-     last time this node ran.  Set to zero before first run
-     of this node. */
-  u16 cached_next_index;
-
-  /* CPU this node runs on */
-  u16 cpu_index;
-
-  /* Function dependent node-runtime. */
-  u8 runtime_data[0];
+  u16 cached_next_index;		/**< Next frame index that vector
+					  arguments were last enqueued to
+					  last time this node ran. Set to
+					  zero before first run of this
+					  node. */
+
+  u16 cpu_index;			/**< CPU this node runs on */
+
+  u8 runtime_data[0];			/**< Function dependent
+					  node-runtime data. This data is
+					  thread local, and it is not
+					  cloned from main thread. It needs
+					  to be initialized for each thread
+					  before it is used unless
+					  runtime_data template exists in
+					  vlib_node_t. */
 }
 vlib_node_runtime_t;
 
+/** Number of bytes from the runtime_data member to the end of
+    vlib_node_runtime_t. */
+#define VLIB_NODE_RUNTIME_DATA_SIZE					\
+  (sizeof (vlib_node_runtime_t)						\
+   - STRUCT_OFFSET_OF (vlib_node_runtime_t, runtime_data))
+
 typedef struct
 {
   /* Number of allocated frames for this scalar/vector size. */
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index 07dbff33..3756c3fa 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -633,6 +633,8 @@ start_workers (vlib_main_t * vm)
 	      vm_clone->cpu_index = worker_thread_index;
 	      vm_clone->heap_base = w->thread_mheap;
 	      vm_clone->mbuf_alloc_list = 0;
+	      vm_clone->init_functions_called =
+		hash_create (0, /* value bytes */ 0);
 	      memset (&vm_clone->random_buffer, 0,
 		      sizeof (vm_clone->random_buffer));
 
@@ -674,11 +676,33 @@ start_workers (vlib_main_t * vm)
 		}
 	      nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] =
 		vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]);
+	      vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL])
+	      {
+		vlib_node_t *n = vlib_get_node (vm, rt->node_index);
+		rt->cpu_index = vm_clone->cpu_index;
+		/* seed the just-duplicated internal runtime from the node */
+		if (n->runtime_data_bytes > 0)
+		  clib_memcpy (rt->runtime_data, n->runtime_data,
+			       VLIB_NODE_RUNTIME_DATA_SIZE);
+		else if (CLIB_DEBUG > 0)
+		  memset (rt->runtime_data, 0xfe,
+			  VLIB_NODE_RUNTIME_DATA_SIZE);
+	      }
 
 	      nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] =
 		vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]);
 	      vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT])
+	      {
+		vlib_node_t *n = vlib_get_node (vm, rt->node_index);
 		rt->cpu_index = vm_clone->cpu_index;
+		/* copy initial runtime_data from node */
+		if (n->runtime_data_bytes > 0)
+		  clib_memcpy (rt->runtime_data, n->runtime_data,
+			       VLIB_NODE_RUNTIME_DATA_SIZE);
+		else if (CLIB_DEBUG > 0)
+		  memset (rt->runtime_data, 0xfe,
+			  VLIB_NODE_RUNTIME_DATA_SIZE);
+	      }
 
 	      nm_clone->processes = vec_dup (nm->processes);
 
@@ -926,26 +950,51 @@ vlib_worker_thread_node_runtime_update (void)
 	clib_mem_free (old_nodes_clone[j]);
       vec_free (old_nodes_clone);
 
-      vec_free (nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]);
 
+      /* re-clone internal nodes */
+      old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL];
       nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] =
 	vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]);
 
-      /* clone input node runtime */
-      old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT];
+      vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL])
+      {
+	vlib_node_t *n = vlib_get_node (vm, rt->node_index);
+	rt->cpu_index = vm_clone->cpu_index;
+	/* copy runtime_data, will be overwritten later for existing rt */
+	clib_memcpy (rt->runtime_data, n->runtime_data,
+		     VLIB_NODE_RUNTIME_DATA_SIZE);
+      }
+
+      for (j = 0; j < vec_len (old_rt); j++)
+	{
+	  rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index);
+	  rt->state = old_rt[j].state;
+	  clib_memcpy (rt->runtime_data, old_rt[j].runtime_data,
+		       VLIB_NODE_RUNTIME_DATA_SIZE);
+	}
 
+      vec_free (old_rt);
+
+      /* re-clone input nodes */
+      old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT];
       nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] =
 	vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]);
 
       vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT])
       {
+	vlib_node_t *n = vlib_get_node (vm, rt->node_index);
 	rt->cpu_index = vm_clone->cpu_index;
+	/* copy runtime_data, will be overwritten later for existing rt */
+	clib_memcpy (rt->runtime_data, n->runtime_data,
+		     VLIB_NODE_RUNTIME_DATA_SIZE);
       }
 
       for (j = 0; j < vec_len (old_rt); j++)
 	{
 	  rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index);
 	  rt->state = old_rt[j].state;
+	  clib_memcpy (rt->runtime_data, old_rt[j].runtime_data,
+		       VLIB_NODE_RUNTIME_DATA_SIZE);
 	}
 
       vec_free (old_rt);
@@ -1342,6 +1391,7 @@ vlib_worker_thread_fn (void *arg)
   vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg;
   vlib_thread_main_t *tm = vlib_get_thread_main ();
   vlib_main_t *vm = vlib_get_main ();
+  clib_error_t *e;
 
   ASSERT (vm->cpu_index == os_get_cpu_number ());
 
@@ -1349,6 +1399,11 @@ vlib_worker_thread_fn (void *arg)
   clib_time_init (&vm->clib_time);
   clib_mem_set_heap (w->thread_mheap);
 
+  e = vlib_call_init_exit_functions
+    (vm, vm->worker_init_function_registrations, 1 /* call_once */ );
+  if (e)
+    clib_error_report (e);
+
   /* Wait until the dpdk init sequence is complete */
   while (tm->extern_thread_mgmt && tm->worker_thread_release == 0)
     vlib_worker_thread_barrier_check ();
diff --git a/src/vnet/gre/node.c b/src/vnet/gre/node.c
index 86f7a6ee..dd16db5e 100644
--- a/src/vnet/gre/node.c
+++ b/src/vnet/gre/node.c
@@ -448,7 +448,6 @@ gre_register_input_protocol (vlib_main_t * vm,
 {
   gre_main_t * em = &gre_main;
   gre_protocol_info_t * pi;
-  gre_input_runtime_t * rt;
   u16 * n;
 
   {
@@ -464,10 +463,13 @@ gre_register_input_protocol (vlib_main_t * vm,
 				       node_index);
 
   /* Setup gre protocol -> next index sparse vector mapping. */
-  rt = vlib_node_get_runtime_data (vm, gre_input_node.index);
-  n = sparse_vec_validate (rt->next_by_protocol, 
-                           clib_host_to_net_u16 (protocol));
-  n[0] = pi->next_index;
+  foreach_vlib_main ({
+    gre_input_runtime_t * rt;
+    rt = vlib_node_get_runtime_data (this_vlib_main, gre_input_node.index);
+    n = sparse_vec_validate (rt->next_by_protocol,
+                             clib_host_to_net_u16 (protocol));
+    n[0] = pi->next_index;
+  });
 }
 
 static void
@@ -529,3 +531,17 @@ static clib_error_t * gre_input_init (vlib_main_t * vm)
 }
 
 VLIB_INIT_FUNCTION (gre_input_init);
+
+/* Worker init hook for gre-input: allocate the protocol -> next index
+   sparse vector in the node runtime data of the given vlib_main. */
+static clib_error_t * gre_input_worker_init (vlib_main_t * vm)
+{
+  gre_input_runtime_t * runtime =
+    vlib_node_get_runtime_data (vm, gre_input_node.index);
+  uword elt_bytes = sizeof (runtime->next_by_protocol[0]);
+  uword index_bits = BITS (((gre_header_t *) 0)->protocol);
+
+  runtime->next_by_protocol = sparse_vec_new (elt_bytes, index_bits);
+
+  return 0;
+}
+
+VLIB_WORKER_INIT_FUNCTION (gre_input_worker_init);
diff --git a/src/vnet/hdlc/node.c b/src/vnet/hdlc/node.c
index 4fe0296a..57e04c85 100644
--- a/src/vnet/hdlc/node.c
+++ b/src/vnet/hdlc/node.c
@@ -285,18 +285,9 @@ VLIB_REGISTER_NODE (hdlc_input_node) = {
   .unformat_buffer = unformat_hdlc_header,
 };
 
-static clib_error_t * hdlc_input_init (vlib_main_t * vm)
+static clib_error_t * hdlc_input_runtime_init (vlib_main_t * vm)
 {
   hdlc_input_runtime_t * rt;
-
-  {
-    clib_error_t * error = vlib_call_init_function (vm, hdlc_init);
-    if (error)
-      clib_error_report (error);
-  }
-
-  hdlc_setup_node (vm, hdlc_input_node.index);
-
   rt = vlib_node_get_runtime_data (vm, hdlc_input_node.index);
 
   rt->next_by_protocol = sparse_vec_new
@@ -313,7 +304,23 @@ static clib_error_t * hdlc_input_init (vlib_main_t * vm)
   return 0;
 }
 
+static clib_error_t * hdlc_input_init (vlib_main_t * vm)
+{
+
+  {
+    clib_error_t * error = vlib_call_init_function (vm, hdlc_init);
+    if (error)
+      clib_error_report (error);
+  }
+
+  hdlc_setup_node (vm, hdlc_input_node.index);
+  hdlc_input_runtime_init (vm);
+
+  return 0;
+}
+
 VLIB_INIT_FUNCTION (hdlc_input_init);
+VLIB_WORKER_INIT_FUNCTION (hdlc_input_runtime_init);
 
 void
 hdlc_register_input_protocol (vlib_main_t * vm,
diff --git a/src/vnet/l2/l2_input_classify.c b/src/vnet/l2/l2_input_classify.c
index 497df192..485b9abd 100644
--- a/src/vnet/l2/l2_input_classify.c
+++ b/src/vnet/l2/l2_input_classify.c
@@ -505,6 +505,21 @@ l2_input_classify_init (vlib_main_t * vm)
 
 VLIB_INIT_FUNCTION (l2_input_classify_init);
 
+clib_error_t *
+l2_input_classify_worker_init (vlib_main_t * vm)
+{
+  l2_input_classify_main_t *cm = &l2_input_classify_main;
+  l2_input_classify_runtime_t *rt;
+
+  rt = vlib_node_get_runtime_data (vm, l2_input_classify_node.index);
+
+  rt->l2cm = cm;
+  rt->vcm = cm->vnet_classify_main;
+
+  return 0;
+}
+
+VLIB_WORKER_INIT_FUNCTION (l2_input_classify_worker_init);
 
 /** Enable/disable l2 input classification on a specific interface. */
 void
diff --git a/src/vnet/l2/l2_output_classify.c b/src/vnet/l2/l2_output_classify.c
index 832be1a1..c1bdaddc 100644
--- a/src/vnet/l2/l2_output_classify.c
+++ b/src/vnet/l2/l2_output_classify.c
@@ -505,6 +505,22 @@ l2_output_classify_init (vlib_main_t * vm)
 
 VLIB_INIT_FUNCTION (l2_output_classify_init);
 
+clib_error_t *
+l2_output_classify_worker_init (vlib_main_t * vm)
+{
+  l2_output_classify_main_t *cm = &l2_output_classify_main;
+  l2_output_classify_runtime_t *rt;
+
+  rt = vlib_node_get_runtime_data (vm, l2_output_classify_node.index);
+
+  rt->l2cm = cm;
+  rt->vcm = cm->vnet_classify_main;
+
+  return 0;
+}
+
+VLIB_WORKER_INIT_FUNCTION (l2_output_classify_worker_init);
+
 /** Enable/disable l2 input classification on a specific interface. */
 void
 vnet_l2_output_classify_enable_disable (u32 sw_if_index, int enable_disable)
diff --git a/src/vnet/l2tp/l2tp.c b/src/vnet/l2tp/l2tp.c
index 2d323397..cb94d7e7 100644
--- a/src/vnet/l2tp/l2tp.c
+++ b/src/vnet/l2tp/l2tp.c
@@ -747,6 +747,16 @@ l2tp_init (vlib_main_t * vm)
 
 VLIB_INIT_FUNCTION (l2tp_init);
 
+clib_error_t *
+l2tp_worker_init (vlib_main_t * vm)
+{
+  l2tp_encap_init (vm);
+
+  return 0;
+}
+
+VLIB_WORKER_INIT_FUNCTION (l2tp_worker_init);
+
 /*
  * fd.io coding-style-patch-verification: ON
  *
diff --git a/src/vnet/mpls/node.c b/src/vnet/mpls/node.c
index 18100912..03bfaf56 100644
--- a/src/vnet/mpls/node.c
+++ b/src/vnet/mpls/node.c
@@ -301,3 +301,16 @@ static clib_error_t * mpls_input_init (vlib_main_t * vm)
 }
 
 VLIB_INIT_FUNCTION (mpls_input_init);
+
+static clib_error_t * mpls_input_worker_init (vlib_main_t * vm)
+{
+  mpls_input_runtime_t * rt;
+  rt = vlib_node_get_runtime_data (vm, mpls_input_node.index);
+  rt->last_label = (u32) ~0;
+  rt->last_inner_fib_index = 0;
+  rt->last_outer_fib_index = 0;
+  rt->mpls_main = &mpls_main;
+  return 0;
+}
+
+VLIB_WORKER_INIT_FUNCTION (mpls_input_worker_init);
diff --git a/src/vnet/ppp/node.c b/src/vnet/ppp/node.c
index 4f1f6a71..2f6e0c33 100644
--- a/src/vnet/ppp/node.c
+++ b/src/vnet/ppp/node.c
@@ -295,18 +295,10 @@ VLIB_REGISTER_NODE (ppp_input_node) = {
 /* *INDENT-ON* */
 
 static clib_error_t *
-ppp_input_init (vlib_main_t * vm)
+ppp_input_runtime_init (vlib_main_t * vm)
 {
   ppp_input_runtime_t *rt;
 
-  {
-    clib_error_t *error = vlib_call_init_function (vm, ppp_init);
-    if (error)
-      clib_error_report (error);
-  }
-
-  ppp_setup_node (vm, ppp_input_node.index);
-
   rt = vlib_node_get_runtime_data (vm, ppp_input_node.index);
 
   rt->next_by_protocol = sparse_vec_new
@@ -323,7 +315,24 @@ ppp_input_init (vlib_main_t * vm)
   return 0;
 }
 
+static clib_error_t *
+ppp_input_init (vlib_main_t * vm)
+{
+
+  {
+    clib_error_t *error = vlib_call_init_function (vm, ppp_init);
+    if (error)
+      clib_error_report (error);
+  }
+
+  ppp_setup_node (vm, ppp_input_node.index);
+  ppp_input_runtime_init (vm);
+
+  return 0;
+}
+
 VLIB_INIT_FUNCTION (ppp_input_init);
+VLIB_WORKER_INIT_FUNCTION (ppp_input_runtime_init);
 
 void
 ppp_register_input_protocol (vlib_main_t * vm,
diff --git a/src/vnet/tcp/tcp_syn_filter4.c b/src/vnet/tcp/tcp_syn_filter4.c
index c7605a30..9b2a8ac7 100644
--- a/src/vnet/tcp/tcp_syn_filter4.c
+++ b/src/vnet/tcp/tcp_syn_filter4.c
@@ -450,18 +450,21 @@ syn_filter_enable_disable (u32 sw_if_index, int enable_disable)
 
   if (enable_disable)
     {
-      vlib_main_t *vm = vlib_get_main ();
       syn_filter4_runtime_t *rt;
 
-      rt = vlib_node_get_runtime_data (vm, syn_filter4_node.index);
-      vec_validate (rt->syn_counts, 1023);
-      /*
-       * Given perfect disperson / optimal hashing results:
-       * Allow 128k (successful) syns/sec. 1024, buckets each of which
-       * absorb 128 syns before filtering. Reset table once a second.
-       * Reality bites, lets try resetting once every 100ms.
-       */
-      rt->reset_interval = 0.1;	/* reset interval in seconds */
+      /* *INDENT-OFF* */
+      foreach_vlib_main ({
+	rt = vlib_node_get_runtime_data (this_vlib_main, syn_filter4_node.index);
+	vec_validate (rt->syn_counts, 1023);
+	/*
+	 * Given perfect dispersion / optimal hashing results:
+	 * Allow 128k (successful) syns/sec. 1024, buckets each of which
+	 * absorb 128 syns before filtering. Reset table once a second.
+	 * Reality bites, lets try resetting once every 100ms.
+	 */
+	rt->reset_interval = 0.1;	/* reset interval in seconds */
+      });
+      /* *INDENT-ON* */
     }
 
   rv = vnet_feature_enable_disable ("ip4-local", "syn-filter-4",
diff --git a/src/vnet/udp/udp_local.c b/src/vnet/udp/udp_local.c
index 6b239f73..3a60b29b 100644
--- a/src/vnet/udp/udp_local.c
+++ b/src/vnet/udp/udp_local.c
@@ -520,11 +520,15 @@ udp_register_dst_port (vlib_main_t * vm,
 				       : udp6_input_node.index, node_index);
 
   /* Setup udp protocol -> next index sparse vector mapping. */
-  rt = vlib_node_get_runtime_data
-    (vm, is_ip4 ? udp4_input_node.index : udp6_input_node.index);
-  n = sparse_vec_validate (rt->next_by_dst_port,
-			   clib_host_to_net_u16 (dst_port));
-  n[0] = pi->next_index;
+  /* *INDENT-OFF* */
+  foreach_vlib_main({
+    rt = vlib_node_get_runtime_data
+      (this_vlib_main, is_ip4 ? udp4_input_node.index : udp6_input_node.index);
+    n = sparse_vec_validate (rt->next_by_dst_port,
+			     clib_host_to_net_u16 (dst_port));
+    n[0] = pi->next_index;
+  });
+  /* *INDENT-ON* */
 }
 
 void
@@ -541,11 +545,15 @@ udp_unregister_dst_port (vlib_main_t * vm, udp_dst_port_t dst_port, u8 is_ip4)
     return;
 
   /* Kill the mapping. Don't bother killing the pi, it may be back. */
-  rt = vlib_node_get_runtime_data
-    (vm, is_ip4 ? udp4_input_node.index : udp6_input_node.index);
-  n = sparse_vec_validate (rt->next_by_dst_port,
-			   clib_host_to_net_u16 (dst_port));
-  n[0] = SPARSE_VEC_INVALID_INDEX;
+  /* *INDENT-OFF* */
+  foreach_vlib_main({
+    rt = vlib_node_get_runtime_data
+      (this_vlib_main, is_ip4 ? udp4_input_node.index : udp6_input_node.index);
+    n = sparse_vec_validate (rt->next_by_dst_port,
+			     clib_host_to_net_u16 (dst_port));
+    n[0] = SPARSE_VEC_INVALID_INDEX;
+  });
+  /* *INDENT-ON* */
 }
 
 void
@@ -604,10 +612,27 @@ udp_setup_node (vlib_main_t * vm, u32 node_index)
   pn->unformat_edit = unformat_pg_udp_header;
 }
 
+static void
+udp_local_node_runtime_init (vlib_main_t * vm)
+{
+  udp_input_runtime_t *rt;
+
+  rt = vlib_node_get_runtime_data (vm, udp4_input_node.index);
+  rt->next_by_dst_port = sparse_vec_new
+    ( /* elt bytes */ sizeof (rt->next_by_dst_port[0]),
+     /* bits in index */ BITS (((udp_header_t *) 0)->dst_port));
+  rt->punt_unknown = 0;
+
+  rt = vlib_node_get_runtime_data (vm, udp6_input_node.index);
+  rt->next_by_dst_port = sparse_vec_new
+    ( /* elt bytes */ sizeof (rt->next_by_dst_port[0]),
+     /* bits in index */ BITS (((udp_header_t *) 0)->dst_port));
+  rt->punt_unknown = 0;
+}
+
 clib_error_t *
 udp_local_init (vlib_main_t * vm)
 {
-  udp_input_runtime_t *rt;
   udp_main_t *um = &udp_main;
   int i;
 
@@ -628,27 +653,13 @@ udp_local_init (vlib_main_t * vm)
   udp_setup_node (vm, udp4_input_node.index);
   udp_setup_node (vm, udp6_input_node.index);
 
-  rt = vlib_node_get_runtime_data (vm, udp4_input_node.index);
-
-  rt->next_by_dst_port = sparse_vec_new
-    ( /* elt bytes */ sizeof (rt->next_by_dst_port[0]),
-     /* bits in index */ BITS (((udp_header_t *) 0)->dst_port));
-
-  rt->punt_unknown = 0;
+  udp_local_node_runtime_init (vm);
 
 #define _(n,s) add_dst_port (um, UDP_DST_PORT_##s, #s, 1 /* is_ip4 */);
   foreach_udp4_dst_port
 #undef _
-    rt = vlib_node_get_runtime_data (vm, udp6_input_node.index);
-
-  rt->next_by_dst_port = sparse_vec_new
-    ( /* elt bytes */ sizeof (rt->next_by_dst_port[0]),
-     /* bits in index */ BITS (((udp_header_t *) 0)->dst_port));
-
-  rt->punt_unknown = 0;
-
 #define _(n,s) add_dst_port (um, UDP_DST_PORT_##s, #s, 0 /* is_ip4 */);
-  foreach_udp6_dst_port
+    foreach_udp6_dst_port
 #undef _
     ip4_register_protocol (IP_PROTOCOL_UDP, udp4_input_node.index);
   /* Note: ip6 differs from ip4, UDP is hotwired to ip6-udp-lookup */
@@ -657,6 +668,15 @@ udp_local_init (vlib_main_t * vm)
 
 VLIB_INIT_FUNCTION (udp_local_init);
 
+clib_error_t *
+udp_local_worker_init (vlib_main_t * vm)
+{
+  udp_local_node_runtime_init (vm);
+  return 0;
+}
+
+VLIB_WORKER_INIT_FUNCTION (udp_local_worker_init);
+
 /*
  * fd.io coding-style-patch-verification: ON
  *
-- 
cgit 1.2.3-korg


From b6f93a1d1acf0f6ad2cdac0f0ea72842f36776a1 Mon Sep 17 00:00:00 2001
From: Damjan Marion <damarion@cisco.com>
Date: Thu, 16 Mar 2017 17:46:41 +0100
Subject: vlib: additional runtime_data checks

Change-Id: I9b6ed9741fae89bdefa6f601398eb63a21155069
Signed-off-by: Damjan Marion <damarion@cisco.com>
---
 src/vlib/threads.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

(limited to 'src/vlib/threads.c')

diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index 3756c3fa..40789f59 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -681,9 +681,10 @@ start_workers (vlib_main_t * vm)
 		vlib_node_t *n = vlib_get_node (vm, rt->node_index);
 		rt->cpu_index = vm_clone->cpu_index;
 		/* copy initial runtime_data from node */
-		if (n->runtime_data_bytes > 0)
+		if (n->runtime_data && n->runtime_data_bytes > 0)
 		  clib_memcpy (rt->runtime_data, n->runtime_data,
-			       VLIB_NODE_RUNTIME_DATA_SIZE);
+			       clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
+					 n->runtime_data_bytes));
 		else if (CLIB_DEBUG > 0)
 		  memset (rt->runtime_data, 0xfe,
 			  VLIB_NODE_RUNTIME_DATA_SIZE);
@@ -696,9 +697,10 @@ start_workers (vlib_main_t * vm)
 		vlib_node_t *n = vlib_get_node (vm, rt->node_index);
 		rt->cpu_index = vm_clone->cpu_index;
 		/* copy initial runtime_data from node */
-		if (n->runtime_data_bytes > 0)
+		if (n->runtime_data && n->runtime_data_bytes > 0)
 		  clib_memcpy (rt->runtime_data, n->runtime_data,
-			       VLIB_NODE_RUNTIME_DATA_SIZE);
+			       clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
+					 n->runtime_data_bytes));
 		else if (CLIB_DEBUG > 0)
 		  memset (rt->runtime_data, 0xfe,
 			  VLIB_NODE_RUNTIME_DATA_SIZE);
@@ -961,8 +963,10 @@ vlib_worker_thread_node_runtime_update (void)
 	vlib_node_t *n = vlib_get_node (vm, rt->node_index);
 	rt->cpu_index = vm_clone->cpu_index;
 	/* copy runtime_data, will be overwritten later for existing rt */
-	clib_memcpy (rt->runtime_data, n->runtime_data,
-		     VLIB_NODE_RUNTIME_DATA_SIZE);
+	if (n->runtime_data && n->runtime_data_bytes > 0)
+	  clib_memcpy (rt->runtime_data, n->runtime_data,
+		       clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
+				 n->runtime_data_bytes));
       }
 
       for (j = 0; j < vec_len (old_rt); j++)
@@ -985,8 +989,10 @@ vlib_worker_thread_node_runtime_update (void)
 	vlib_node_t *n = vlib_get_node (vm, rt->node_index);
 	rt->cpu_index = vm_clone->cpu_index;
 	/* copy runtime_data, will be overwritten later for existing rt */
-	clib_memcpy (rt->runtime_data, n->runtime_data,
-		     VLIB_NODE_RUNTIME_DATA_SIZE);
+	if (n->runtime_data && n->runtime_data_bytes > 0)
+	  clib_memcpy (rt->runtime_data, n->runtime_data,
+		       clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
+				 n->runtime_data_bytes));
       }
 
       for (j = 0; j < vec_len (old_rt); j++)
-- 
cgit 1.2.3-korg


From eb743fad56b32cb20ad2d2cadc4760f9c25be5e1 Mon Sep 17 00:00:00 2001
From: Damjan Marion <damarion@cisco.com>
Date: Mon, 20 Mar 2017 16:34:15 +0100
Subject: vnet: add device-input threadplacement infra

This change adds two new debug CLI commands:

- "show interface placement" to display which
thread (main or worker) is responsible for processing
each interface rx queue

vpp# show interface placement
Thread 0 (vpp_main):
  node af-packet-input:
    host-vpp1 queue 0
Thread 1 (vpp_wk_0):
  node af-packet-input:
    host-virbr0 queue 0
Thread 2 (vpp_wk_1):
  node af-packet-input:
    host-vpp2 queue 0
    host-lxcbr0 queue 0

- "set interface placement" to assign the thread (main or worker)
which processes a specific interface rx queue

vpp# set interface placement host-vpp1 queue 0 main

Change-Id: Id4dd00cf2b05e10fae2125ac7cb4411b446c5e9c
Signed-off-by: Damjan Marion <damarion@cisco.com>
---
 src/vlib/threads.c                     |  14 +-
 src/vnet/devices/af_packet/af_packet.c |  54 +-------
 src/vnet/devices/af_packet/af_packet.h |   6 -
 src/vnet/devices/af_packet/node.c      |  23 ++--
 src/vnet/devices/devices.c             | 240 +++++++++++++++++++++++++++++++++
 src/vnet/devices/devices.h             |  45 +++++++
 src/vnet/interface.h                   |   6 +
 7 files changed, 310 insertions(+), 78 deletions(-)

(limited to 'src/vlib/threads.c')

diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index 40789f59..ef3a24d3 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -685,9 +685,6 @@ start_workers (vlib_main_t * vm)
 		  clib_memcpy (rt->runtime_data, n->runtime_data,
 			       clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
 					 n->runtime_data_bytes));
-		else if (CLIB_DEBUG > 0)
-		  memset (rt->runtime_data, 0xfe,
-			  VLIB_NODE_RUNTIME_DATA_SIZE);
 	      }
 
 	      nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] =
@@ -701,9 +698,6 @@ start_workers (vlib_main_t * vm)
 		  clib_memcpy (rt->runtime_data, n->runtime_data,
 			       clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
 					 n->runtime_data_bytes));
-		else if (CLIB_DEBUG > 0)
-		  memset (rt->runtime_data, 0xfe,
-			  VLIB_NODE_RUNTIME_DATA_SIZE);
 	      }
 
 	      nm_clone->processes = vec_dup (nm->processes);
@@ -1405,15 +1399,15 @@ vlib_worker_thread_fn (void *arg)
   clib_time_init (&vm->clib_time);
   clib_mem_set_heap (w->thread_mheap);
 
+  /* Wait until the dpdk init sequence is complete */
+  while (tm->extern_thread_mgmt && tm->worker_thread_release == 0)
+    vlib_worker_thread_barrier_check ();
+
   e = vlib_call_init_exit_functions
     (vm, vm->worker_init_function_registrations, 1 /* call_once */ );
   if (e)
     clib_error_report (e);
 
-  /* Wait until the dpdk init sequence is complete */
-  while (tm->extern_thread_mgmt && tm->worker_thread_release == 0)
-    vlib_worker_thread_barrier_check ();
-
   vlib_worker_loop (vm);
 }
 
diff --git a/src/vnet/devices/af_packet/af_packet.c b/src/vnet/devices/af_packet/af_packet.c
index e491ba47..5fdc59f2 100644
--- a/src/vnet/devices/af_packet/af_packet.c
+++ b/src/vnet/devices/af_packet/af_packet.c
@@ -67,15 +67,16 @@ af_packet_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi,
 static clib_error_t *
 af_packet_fd_read_ready (unix_file_t * uf)
 {
-  vlib_main_t *vm = vlib_get_main ();
   af_packet_main_t *apm = &af_packet_main;
+  vnet_main_t *vnm = vnet_get_main ();
   u32 idx = uf->private_data;
+  af_packet_if_t *apif = pool_elt_at_index (apm->interfaces, idx);
 
   apm->pending_input_bitmap =
     clib_bitmap_set (apm->pending_input_bitmap, idx, 1);
 
   /* Schedule the rx node */
-  vlib_node_set_interrupt_pending (vm, af_packet_input_node.index);
+  vnet_device_input_set_interrupt_pending (vnm, apif->hw_if_index, 0);
 
   return 0;
 }
@@ -171,31 +172,6 @@ error:
   return ret;
 }
 
-static void
-af_packet_worker_thread_enable ()
-{
-  /* If worker threads are enabled, switch to polling mode */
-  foreach_vlib_main ((
-		       {
-		       vlib_node_set_state (this_vlib_main,
-					    af_packet_input_node.index,
-					    VLIB_NODE_STATE_POLLING);
-		       }));
-
-}
-
-static void
-af_packet_worker_thread_disable ()
-{
-  foreach_vlib_main ((
-		       {
-		       vlib_node_set_state (this_vlib_main,
-					    af_packet_input_node.index,
-					    VLIB_NODE_STATE_INTERRUPT);
-		       }));
-
-}
-
 int
 af_packet_create_if (vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set,
 		     u32 * sw_if_index)
@@ -298,6 +274,9 @@ af_packet_create_if (vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set,
 
   sw = vnet_get_hw_sw_interface (vnm, apif->hw_if_index);
   apif->sw_if_index = sw->sw_if_index;
+  vnet_set_device_input_node (apif->hw_if_index, af_packet_input_node.index);
+  vnet_device_input_assign_thread (apif->hw_if_index, 0,	/* queue */
+				   ~0 /* any cpu */ );
 
   vnet_hw_interface_set_flags (vnm, apif->hw_if_index,
 			       VNET_HW_INTERFACE_FLAG_LINK_UP);
@@ -307,9 +286,6 @@ af_packet_create_if (vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set,
   if (sw_if_index)
     *sw_if_index = apif->sw_if_index;
 
-  if (tm->n_vlib_mains > 1 && pool_elts (apm->interfaces) == 1)
-    af_packet_worker_thread_enable ();
-
   return 0;
 
 error:
@@ -323,7 +299,6 @@ int
 af_packet_delete_if (vlib_main_t * vm, u8 * host_if_name)
 {
   vnet_main_t *vnm = vnet_get_main ();
-  vlib_thread_main_t *tm = vlib_get_thread_main ();
   af_packet_main_t *apm = &af_packet_main;
   af_packet_if_t *apif;
   uword *p;
@@ -373,8 +348,6 @@ af_packet_delete_if (vlib_main_t * vm, u8 * host_if_name)
   ethernet_delete_interface (vnm, apif->hw_if_index);
 
   pool_put (apm->interfaces, apif);
-  if (tm->n_vlib_mains > 1 && pool_elts (apm->interfaces) == 0)
-    af_packet_worker_thread_disable ();
 
   return 0;
 }
@@ -384,24 +357,9 @@ af_packet_init (vlib_main_t * vm)
 {
   af_packet_main_t *apm = &af_packet_main;
   vlib_thread_main_t *tm = vlib_get_thread_main ();
-  vlib_thread_registration_t *tr;
-  uword *p;
 
   memset (apm, 0, sizeof (af_packet_main_t));
 
-  apm->input_cpu_first_index = 0;
-  apm->input_cpu_count = 1;
-
-  /* find out which cpus will be used for input */
-  p = hash_get_mem (tm->thread_registrations_by_name, "workers");
-  tr = p ? (vlib_thread_registration_t *) p[0] : 0;
-
-  if (tr && tr->count > 0)
-    {
-      apm->input_cpu_first_index = tr->first_index;
-      apm->input_cpu_count = tr->count;
-    }
-
   mhash_init_vec_string (&apm->if_index_by_host_if_name, sizeof (uword));
 
   vec_validate_aligned (apm->rx_buffers, tm->n_vlib_mains - 1,
diff --git a/src/vnet/devices/af_packet/af_packet.h b/src/vnet/devices/af_packet/af_packet.h
index e00e5cb4..50ec2378 100644
--- a/src/vnet/devices/af_packet/af_packet.h
+++ b/src/vnet/devices/af_packet/af_packet.h
@@ -51,12 +51,6 @@ typedef struct
 
   /* hash of host interface names */
   mhash_t if_index_by_host_if_name;
-
-  /* first cpu index */
-  u32 input_cpu_first_index;
-
-  /* total cpu count */
-  u32 input_cpu_count;
 } af_packet_main_t;
 
 af_packet_main_t af_packet_main;
diff --git a/src/vnet/devices/af_packet/node.c b/src/vnet/devices/af_packet/node.c
index ab7fd800..ba337f3f 100644
--- a/src/vnet/devices/af_packet/node.c
+++ b/src/vnet/devices/af_packet/node.c
@@ -246,20 +246,18 @@ static uword
 af_packet_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
 		    vlib_frame_t * frame)
 {
-  int i;
   u32 n_rx_packets = 0;
-  u32 cpu_index = os_get_cpu_number ();
   af_packet_main_t *apm = &af_packet_main;
-  af_packet_if_t *apif;
+  vnet_device_input_runtime_t *rt = (void *) node->runtime_data;
+  vnet_device_and_queue_t *dq;
 
-  for (i = 0; i < vec_len (apm->interfaces); i++)
-    {
-      apif = vec_elt_at_index (apm->interfaces, i);
-      if (apif->is_admin_up &&
-	  (i % apm->input_cpu_count) ==
-	  (cpu_index - apm->input_cpu_first_index))
-	n_rx_packets += af_packet_device_input_fn (vm, node, frame, apif);
-    }
+  vec_foreach (dq, rt->devices_and_queues)
+  {
+    af_packet_if_t *apif;
+    apif = vec_elt_at_index (apm->interfaces, dq->dev_instance);
+    if (apif->is_admin_up)
+      n_rx_packets += af_packet_device_input_fn (vm, node, frame, apif);
+  }
 
   return n_rx_packets;
 }
@@ -271,9 +269,6 @@ VLIB_REGISTER_NODE (af_packet_input_node) = {
   .sibling_of = "device-input",
   .format_trace = format_af_packet_input_trace,
   .type = VLIB_NODE_TYPE_INPUT,
-  /**
-   * default state is INTERRUPT mode, switch to POLLING if worker threads are enabled
-   */
   .state = VLIB_NODE_STATE_INTERRUPT,
   .n_errors = AF_PACKET_INPUT_N_ERROR,
   .error_strings = af_packet_input_error_strings,
diff --git a/src/vnet/devices/devices.c b/src/vnet/devices/devices.c
index 38f3002b..41645220 100644
--- a/src/vnet/devices/devices.c
+++ b/src/vnet/devices/devices.c
@@ -32,6 +32,7 @@ device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
 VLIB_REGISTER_NODE (device_input_node) = {
   .function = device_input_fn,
   .name = "device-input",
+  .runtime_data_bytes = sizeof (vnet_device_input_runtime_t),
   .type = VLIB_NODE_TYPE_INPUT,
   .state = VLIB_NODE_STATE_DISABLED,
   .n_next_nodes = VNET_DEVICE_INPUT_N_NEXT_NODES,
@@ -83,18 +84,257 @@ VNET_FEATURE_INIT (ethernet_input, static) = {
 };
 /* *INDENT-ON* */
 
+static int
+vnet_device_queue_sort (void *a1, void *a2)
+{
+  vnet_device_and_queue_t *dq1 = a1;
+  vnet_device_and_queue_t *dq2 = a2;
+
+  if (dq1->dev_instance > dq2->dev_instance)
+    return 1;
+  else if (dq1->dev_instance < dq2->dev_instance)
+    return -1;
+  else if (dq1->queue_id > dq2->queue_id)
+    return 1;
+  else if (dq1->queue_id < dq2->queue_id)
+    return -1;
+  else
+    return 0;
+}
+
+void
+vnet_device_input_assign_thread (u32 hw_if_index,
+				 u16 queue_id, uword cpu_index)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_device_main_t *vdm = &vnet_device_main;
+  vlib_main_t *vm;
+  vnet_device_input_runtime_t *rt;
+  vnet_device_and_queue_t *dq;
+  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+
+  ASSERT (hw->input_node_index > 0);
+
+  if (vdm->first_worker_cpu_index == 0)
+    cpu_index = 0;
+
+  if (cpu_index != 0 &&
+      (cpu_index < vdm->first_worker_cpu_index ||
+       cpu_index > vdm->last_worker_cpu_index))
+    {
+      cpu_index = vdm->next_worker_cpu_index++;
+      if (vdm->next_worker_cpu_index > vdm->last_worker_cpu_index)
+	vdm->next_worker_cpu_index = vdm->first_worker_cpu_index;
+    }
+
+  vm = vlib_mains[cpu_index];
+  rt = vlib_node_get_runtime_data (vm, hw->input_node_index);
+
+  vec_add2 (rt->devices_and_queues, dq, 1);
+  dq->hw_if_index = hw_if_index;
+  dq->dev_instance = hw->dev_instance;
+  dq->queue_id = queue_id;
+
+  vec_sort_with_function (rt->devices_and_queues, vnet_device_queue_sort);
+  vec_validate (hw->input_node_cpu_index_by_queue, queue_id);
+  hw->input_node_cpu_index_by_queue[queue_id] = cpu_index;
+}
+
+static int
+vnet_device_input_unassign_thread (u32 hw_if_index, u16 queue_id,
+				   uword cpu_index)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+  vnet_device_input_runtime_t *rt;
+  vnet_device_and_queue_t *dq;
+  uword old_cpu_index;
+
+  if (hw->input_node_cpu_index_by_queue == 0)
+    return VNET_API_ERROR_INVALID_INTERFACE;
+
+  if (vec_len (hw->input_node_cpu_index_by_queue) < queue_id + 1)
+    return VNET_API_ERROR_INVALID_INTERFACE;
+
+  old_cpu_index = hw->input_node_cpu_index_by_queue[queue_id];
+
+  if (old_cpu_index == cpu_index)
+    return 0;
+
+  rt =
+    vlib_node_get_runtime_data (vlib_mains[old_cpu_index],
+				hw->input_node_index);
+
+  vec_foreach (dq, rt->devices_and_queues)
+    if (dq->hw_if_index == hw_if_index && dq->queue_id == queue_id)
+    {
+      vec_del1 (rt->devices_and_queues, dq - rt->devices_and_queues);
+      goto deleted;
+    }
+
+  return VNET_API_ERROR_INVALID_INTERFACE;
+
+deleted:
+  vec_sort_with_function (rt->devices_and_queues, vnet_device_queue_sort);
+
+  return 0;
+}
+
+static clib_error_t *
+show_device_placement_fn (vlib_main_t * vm, unformat_input_t * input,
+			  vlib_cli_command_t * cmd)
+{
+  u8 *s = 0;
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_device_input_runtime_t *rt;
+  vnet_device_and_queue_t *dq;
+  vlib_node_t *pn = vlib_get_node_by_name (vm, (u8 *) "device-input");
+  uword si;
+  int index = 0;
+
+  /* *INDENT-OFF* */
+  foreach_vlib_main (({
+    clib_bitmap_foreach (si, pn->sibling_bitmap,
+      ({
+        rt = vlib_node_get_runtime_data (this_vlib_main, si);
+
+        if (vec_len (rt->devices_and_queues))
+          s = format (s, "  node %U:\n", format_vlib_node_name, vm, si);
+
+        vec_foreach (dq, rt->devices_and_queues)
+	  {
+	    s = format (s, "    %U queue %u\n",
+			format_vnet_sw_if_index_name, vnm, dq->hw_if_index,
+			dq->queue_id);
+	  }
+      }));
+    if (vec_len (s) > 0)
+      {
+        vlib_cli_output(vm, "Thread %u (%v):\n%v", index,
+			vlib_worker_threads[index].name, s);
+        vec_reset_length (s);
+      }
+    index++;
+  }));
+  /* *INDENT-ON* */
+
+  vec_free (s);
+  return 0;
+}
+
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (memif_delete_command, static) = {
+  .path = "show interface placement",
+  .short_help = "show interface placement",
+  .function = show_device_placement_fn,
+};
+/* *INDENT-ON* */
+
+static clib_error_t *
+set_device_placement (vlib_main_t * vm, unformat_input_t * input,
+		      vlib_cli_command_t * cmd)
+{
+  clib_error_t *error = 0;
+  unformat_input_t _line_input, *line_input = &_line_input;
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_device_main_t *vdm = &vnet_device_main;
+  u32 hw_if_index = (u32) ~ 0;
+  u32 queue_id = (u32) 0;
+  u32 cpu_index = (u32) ~ 0;
+  int rv;
+
+  if (!unformat_user (input, unformat_line_input, line_input))
+    return 0;
+
+  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+    {
+      if (unformat
+	  (line_input, "%U", unformat_vnet_hw_interface, vnm, &hw_if_index))
+	;
+      else if (unformat (line_input, "queue %d", &queue_id))
+	;
+      else if (unformat (line_input, "main", &cpu_index))
+	cpu_index = 0;
+      else if (unformat (line_input, "worker %d", &cpu_index))
+	cpu_index += vdm->first_worker_cpu_index;
+      else
+	{
+	  error = clib_error_return (0, "parse error: '%U'",
+				     format_unformat_error, line_input);
+	  unformat_free (line_input);
+	  return error;
+	}
+    }
+
+  unformat_free (line_input);
+
+  if (hw_if_index == (u32) ~ 0)
+    return clib_error_return (0, "please specify valid interface name");
+
+  if (cpu_index > vdm->last_worker_cpu_index)
+    return clib_error_return (0,
+			      "please specify valid worker thread or main");
+
+  rv = vnet_device_input_unassign_thread (hw_if_index, queue_id, cpu_index);
+
+  if (rv)
+    return clib_error_return (0, "not found");
+
+  vnet_device_input_assign_thread (hw_if_index, queue_id, cpu_index);
+
+  return 0;
+}
+
+/*?
+ * This command is used to assign a given interface, and optionally a
+ * given queue, to a different thread. If the '<em>queue</em>' is not provided,
+ * it defaults to 0.
+ *
+ * @cliexpar
+ * Example of how to display the interface placement:
+ * @cliexstart{show interface placement}
+ * Thread 1 (vpp_wk_0):
+ *   GigabitEthernet0/8/0 queue 0
+ *   GigabitEthernet0/9/0 queue 0
+ * Thread 2 (vpp_wk_1):
+ *   GigabitEthernet0/8/0 queue 1
+ *   GigabitEthernet0/9/0 queue 1
+ * @cliexend
+ * Example of how to assign a interface and queue to a thread:
+ * @cliexcmd{set interface placement GigabitEthernet0/8/0 queue 1 worker 1}
+?*/
+/* *INDENT-OFF* */
+VLIB_CLI_COMMAND (cmd_set_dpdk_if_placement,static) = {
+    .path = "set interface placement",
+    .short_help = "set interface placement <interface> [queue <n>] [thread <n> | main]",
+    .function = set_device_placement,
+};
+/* *INDENT-ON* */
+
 static clib_error_t *
 vnet_device_init (vlib_main_t * vm)
 {
   vnet_device_main_t *vdm = &vnet_device_main;
   vlib_thread_main_t *tm = vlib_get_thread_main ();
+  vlib_thread_registration_t *tr;
+  uword *p;
 
   vec_validate_aligned (vdm->workers, tm->n_vlib_mains - 1,
 			CLIB_CACHE_LINE_BYTES);
+
+  p = hash_get_mem (tm->thread_registrations_by_name, "workers");
+  tr = p ? (vlib_thread_registration_t *) p[0] : 0;
+  if (tr && tr->count > 0)
+    {
+      vdm->first_worker_cpu_index = tr->first_index;
+      vdm->next_worker_cpu_index = tr->first_index;
+      vdm->last_worker_cpu_index = tr->first_index + tr->count - 1;
+    }
   return 0;
 }
 
 VLIB_INIT_FUNCTION (vnet_device_init);
+
 /*
  * fd.io coding-style-patch-verification: ON
  *
diff --git a/src/vnet/devices/devices.h b/src/vnet/devices/devices.h
index a5cbc35e..bbb29fe3 100644
--- a/src/vnet/devices/devices.h
+++ b/src/vnet/devices/devices.h
@@ -50,12 +50,38 @@ typedef struct
 typedef struct
 {
   vnet_device_per_worker_data_t *workers;
+  uword first_worker_cpu_index;
+  uword last_worker_cpu_index;
+  uword next_worker_cpu_index;
 } vnet_device_main_t;
 
+typedef struct
+{
+  u32 hw_if_index;
+  u32 dev_instance;
+  u16 queue_id;
+} vnet_device_and_queue_t;
+
+typedef struct
+{
+  vnet_device_and_queue_t *devices_and_queues;
+} vnet_device_input_runtime_t;
+
 extern vnet_device_main_t vnet_device_main;
 extern vlib_node_registration_t device_input_node;
 extern const u32 device_input_next_node_advance[];
 
+static inline void
+vnet_set_device_input_node (u32 hw_if_index, u32 node_index)
+{
+  vnet_main_t *vnm = vnet_get_main ();
+  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+  hw->input_node_index = node_index;
+}
+
+void vnet_device_input_assign_thread (u32 hw_if_index, u16 queue_id,
+				      uword cpu_index);
+
 static inline u64
 vnet_get_aggregate_rx_packets (void)
 {
@@ -78,6 +104,25 @@ vnet_device_increment_rx_packets (u32 cpu_index, u64 count)
   pwd->aggregate_rx_packets += count;
 }
 
+static_always_inline vnet_device_and_queue_t *
+vnet_get_device_and_queue (vlib_main_t * vm, vlib_node_runtime_t * node)
+{
+  vnet_device_input_runtime_t *rt = (void *) node->runtime_data;
+  return rt->devices_and_queues;
+}
+
+static_always_inline void
+vnet_device_input_set_interrupt_pending (vnet_main_t * vnm, u32 hw_if_index,
+					 u16 queue_id)
+{
+  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
+
+  ASSERT (queue_id < vec_len (hw->input_node_cpu_index_by_queue));
+  u32 cpu_index = hw->input_node_cpu_index_by_queue[queue_id];
+  vlib_node_set_interrupt_pending (vlib_mains[cpu_index],
+				   hw->input_node_index);
+}
+
 #endif /* included_vnet_vnet_device_h */
 
 /*
diff --git a/src/vnet/interface.h b/src/vnet/interface.h
index ef8f9118..a1ea2d61 100644
--- a/src/vnet/interface.h
+++ b/src/vnet/interface.h
@@ -464,6 +464,12 @@ typedef struct vnet_hw_interface_t
 #define VNET_HW_INTERFACE_BOND_INFO_NONE ((uword *) 0)
 #define VNET_HW_INTERFACE_BOND_INFO_SLAVE ((uword *) ~0)
 
+  /* Input node */
+  u32 input_node_index;
+
+  /* input node cpu index by queue */
+  u32 *input_node_cpu_index_by_queue;
+
 } vnet_hw_interface_t;
 
 extern vnet_device_class_t vnet_local_interface_device_class;
-- 
cgit 1.2.3-korg


From 586afd762bfa149f5ca167bd5fd5a0cd59ce94fe Mon Sep 17 00:00:00 2001
From: Damjan Marion <damarion@cisco.com>
Date: Wed, 5 Apr 2017 19:18:20 +0200
Subject: Use thread local storage for thread index

This patch deprecates stack-based thread identification.
It also removes the requirement that thread stacks are adjacent.

Finally, possibly annoying for some folks, it renames
all occurrences of cpu_index and cpu_number to thread
index. Using the word "cpu" is misleading here, as a thread can
be migrated to a different CPU, and it is also not related
to the Linux cpu index.

Change-Id: I68cdaf661e701d2336fc953dcb9978d10a70f7c1
Signed-off-by: Damjan Marion <damarion@cisco.com>
---
 src/examples/srv6-sample-localsid/node.c           |   4 +-
 src/plugins/dpdk/buffer.c                          |   2 +-
 src/plugins/dpdk/device/device.c                   |   8 +-
 src/plugins/dpdk/device/dpdk_priv.h                |   8 +-
 src/plugins/dpdk/device/init.c                     |   2 +-
 src/plugins/dpdk/device/node.c                     |  32 +++---
 src/plugins/dpdk/hqos/hqos.c                       |  16 +--
 src/plugins/dpdk/ipsec/cli.c                       |   8 +-
 src/plugins/dpdk/ipsec/crypto_node.c               |   4 +-
 src/plugins/dpdk/ipsec/esp.h                       |   4 +-
 src/plugins/dpdk/ipsec/esp_decrypt.c               |   4 +-
 src/plugins/dpdk/ipsec/esp_encrypt.c               |   5 +-
 src/plugins/dpdk/ipsec/ipsec.c                     |   2 +-
 src/plugins/dpdk/ipsec/ipsec.h                     |   4 +-
 src/plugins/dpdk/main.c                            |   2 +-
 src/plugins/flowperpkt/l2_node.c                   |   2 +-
 src/plugins/flowperpkt/node.c                      |   2 +-
 src/plugins/ioam/export-common/ioam_export.h       |   6 +-
 .../ioam/ip6/ioam_cache_tunnel_select_node.c       |  16 +--
 src/plugins/ixge/ixge.c                            |   2 +-
 src/plugins/lb/lb.c                                |   8 +-
 src/plugins/lb/node.c                              |  22 ++--
 src/plugins/lb/refcount.c                          |   8 +-
 src/plugins/lb/refcount.h                          |   4 +-
 src/plugins/memif/node.c                           |  35 +++---
 src/plugins/snat/in2out.c                          | 110 +++++++++---------
 src/plugins/snat/out2in.c                          | 102 ++++++++---------
 src/plugins/snat/snat.h                            |  10 +-
 src/vlib/buffer.c                                  |   6 +-
 src/vlib/buffer_funcs.h                            |   4 +-
 src/vlib/cli.c                                     |   6 +-
 src/vlib/counter.h                                 |  16 +--
 src/vlib/error.c                                   |   2 +-
 src/vlib/global_funcs.h                            |   2 +-
 src/vlib/main.c                                    |  14 +--
 src/vlib/main.h                                    |   2 +-
 src/vlib/node.c                                    |   2 +-
 src/vlib/node.h                                    |   6 +-
 src/vlib/node_funcs.h                              |   8 +-
 src/vlib/threads.c                                 |  69 ++++-------
 src/vlib/threads.h                                 |  21 ++--
 src/vlib/unix/cj.c                                 |   7 +-
 src/vlib/unix/cj.h                                 |   2 +-
 src/vlib/unix/main.c                               |  43 +++----
 src/vnet/adj/adj_l2.c                              |   4 +-
 src/vnet/adj/adj_midchain.c                        |   8 +-
 src/vnet/adj/adj_nsh.c                             |   4 +-
 src/vnet/classify/vnet_classify.c                  |  16 +--
 src/vnet/cop/ip4_whitelist.c                       |   8 +-
 src/vnet/cop/ip6_whitelist.c                       |   8 +-
 src/vnet/devices/af_packet/node.c                  |  20 ++--
 src/vnet/devices/devices.c                         |  61 +++++-----
 src/vnet/devices/devices.h                         |  18 +--
 src/vnet/devices/netmap/node.c                     |  24 ++--
 src/vnet/devices/ssvm/node.c                       |   6 +-
 src/vnet/devices/virtio/vhost-user.c               | 127 +++++++++++----------
 src/vnet/dpo/lookup_dpo.c                          |  20 ++--
 src/vnet/dpo/replicate_dpo.c                       |  12 +-
 src/vnet/ethernet/arp.c                            |   2 +-
 src/vnet/ethernet/interface.c                      |   7 +-
 src/vnet/ethernet/node.c                           |  14 +--
 src/vnet/gre/node.c                                |   8 +-
 src/vnet/interface.h                               |   2 +-
 src/vnet/interface_output.c                        |  53 ++++-----
 src/vnet/ip/ip4_forward.c                          |  34 +++---
 src/vnet/ip/ip4_input.c                            |   8 +-
 src/vnet/ip/ip6_forward.c                          |  24 ++--
 src/vnet/ip/ip6_input.c                            |   8 +-
 src/vnet/ip/ip6_neighbor.c                         |   4 +-
 src/vnet/ipsec/esp.h                               |   8 +-
 src/vnet/ipsec/esp_decrypt.c                       |  13 ++-
 src/vnet/ipsec/esp_encrypt.c                       |  13 ++-
 src/vnet/ipsec/ikev2.c                             |  64 ++++++-----
 src/vnet/ipsec/ipsec.h                             |  12 +-
 src/vnet/ipsec/ipsec_if.c                          |   2 +-
 src/vnet/l2/l2_bvi.h                               |   2 +-
 src/vnet/l2/l2_input.c                             |  14 +--
 src/vnet/l2/l2_output.c                            |   6 +-
 src/vnet/l2tp/decap.c                              |   2 +-
 src/vnet/l2tp/encap.c                              |   2 +-
 src/vnet/l2tp/l2tp.c                               |   6 +-
 src/vnet/lisp-gpe/decap.c                          |  16 +--
 src/vnet/lldp/lldp_input.c                         |   2 +-
 src/vnet/map/ip4_map.c                             |  14 +--
 src/vnet/map/ip4_map_t.c                           |  12 +-
 src/vnet/map/ip6_map.c                             |  19 +--
 src/vnet/map/ip6_map_t.c                           |  12 +-
 src/vnet/mpls/mpls_input.c                         |   8 +-
 src/vnet/mpls/mpls_lookup.c                        |  20 ++--
 src/vnet/mpls/mpls_output.c                        |  10 +-
 src/vnet/pg/input.c                                |   4 +-
 src/vnet/replication.c                             |  20 ++--
 src/vnet/replication.h                             |   2 +-
 src/vnet/session/node.c                            |   2 +-
 src/vnet/sr/sr_localsid.c                          |  44 +++----
 src/vnet/tcp/builtin_client.c                      |   2 +-
 src/vnet/tcp/tcp.c                                 |   8 +-
 src/vnet/tcp/tcp_debug.h                           |   2 +-
 src/vnet/tcp/tcp_input.c                           |  10 +-
 src/vnet/tcp/tcp_output.c                          |  20 ++--
 src/vnet/udp/udp_input.c                           |   2 +-
 src/vnet/unix/tapcli.c                             |   2 +-
 src/vnet/unix/tuntap.c                             |   4 +-
 src/vnet/vxlan-gpe/decap.c                         |  10 +-
 src/vnet/vxlan-gpe/encap.c                         |  12 +-
 src/vnet/vxlan/decap.c                             |  10 +-
 src/vnet/vxlan/encap.c                             |  12 +-
 src/vpp/stats/stats.c                              |  14 +--
 src/vpp/stats/stats.h                              |   2 +-
 109 files changed, 790 insertions(+), 791 deletions(-)

(limited to 'src/vlib/threads.c')

diff --git a/src/examples/srv6-sample-localsid/node.c b/src/examples/srv6-sample-localsid/node.c
index 7bae9cd7..e83e2352 100644
--- a/src/examples/srv6-sample-localsid/node.c
+++ b/src/examples/srv6-sample-localsid/node.c
@@ -114,7 +114,7 @@ srv6_localsid_sample_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_fram
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
   next_index = node->cached_next_index;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   while (n_left_from > 0)
   {
@@ -168,7 +168,7 @@ srv6_localsid_sample_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_fram
       /* This increments the SRv6 per LocalSID counters.*/
       vlib_increment_combined_counter
         (((next0 == SRV6_SAMPLE_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : &(sm->sr_ls_valid_counters)),
-        cpu_index,
+        thread_index,
         ls0 - sm->localsids,
         1, vlib_buffer_length_in_chain (vm, b0));
 
diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c
index 2765c292..c80b3fa8 100644
--- a/src/plugins/dpdk/buffer.c
+++ b/src/plugins/dpdk/buffer.c
@@ -132,7 +132,7 @@ dpdk_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index)
   u32 merge_index;
   int i;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   f = vlib_buffer_get_free_list (vm, free_list_index);
 
diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c
index 50b26689..91661246 100644
--- a/src/plugins/dpdk/device/device.c
+++ b/src/plugins/dpdk/device/device.c
@@ -243,7 +243,7 @@ static_always_inline
   ASSERT (ring->tx_tail == 0);
 
   n_retry = 16;
-  queue_id = vm->cpu_index;
+  queue_id = vm->thread_index;
 
   do
     {
@@ -266,7 +266,7 @@ static_always_inline
 	{
 	  /* no wrap, transmit in one burst */
 	  dpdk_device_hqos_per_worker_thread_t *hqos =
-	    &xd->hqos_wt[vm->cpu_index];
+	    &xd->hqos_wt[vm->thread_index];
 
 	  ASSERT (hqos->swq != NULL);
 
@@ -332,7 +332,7 @@ dpdk_buffer_recycle (vlib_main_t * vm, vlib_node_runtime_t * node,
 		     vlib_buffer_t * b, u32 bi, struct rte_mbuf **mbp)
 {
   dpdk_main_t *dm = &dpdk_main;
-  u32 my_cpu = vm->cpu_index;
+  u32 my_cpu = vm->thread_index;
   struct rte_mbuf *mb_new;
 
   if (PREDICT_FALSE (b->flags & VLIB_BUFFER_RECYCLE) == 0)
@@ -376,7 +376,7 @@ dpdk_interface_tx (vlib_main_t * vm,
   tx_ring_hdr_t *ring;
   u32 n_on_ring;
 
-  my_cpu = vm->cpu_index;
+  my_cpu = vm->thread_index;
 
   queue_id = my_cpu;
 
diff --git a/src/plugins/dpdk/device/dpdk_priv.h b/src/plugins/dpdk/device/dpdk_priv.h
index dd40ff48..52b4ca4b 100644
--- a/src/plugins/dpdk/device/dpdk_priv.h
+++ b/src/plugins/dpdk/device/dpdk_priv.h
@@ -79,7 +79,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now)
 {
   vlib_simple_counter_main_t *cm;
   vnet_main_t *vnm = vnet_get_main ();
-  u32 my_cpu = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   u64 rxerrors, last_rxerrors;
 
   /* only update counters for PMD interfaces */
@@ -96,7 +96,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now)
       cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
 			     VNET_INTERFACE_COUNTER_RX_NO_BUF);
 
-      vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
+      vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index,
 				     xd->stats.rx_nombuf -
 				     xd->last_stats.rx_nombuf);
     }
@@ -107,7 +107,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now)
       cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
 			     VNET_INTERFACE_COUNTER_RX_MISS);
 
-      vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
+      vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index,
 				     xd->stats.imissed -
 				     xd->last_stats.imissed);
     }
@@ -119,7 +119,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now)
       cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
 			     VNET_INTERFACE_COUNTER_RX_ERROR);
 
-      vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index,
+      vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index,
 				     rxerrors - last_rxerrors);
     }
 
diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c
index 538db6cb..7eaf8da7 100755
--- a/src/plugins/dpdk/device/init.c
+++ b/src/plugins/dpdk/device/init.c
@@ -324,7 +324,7 @@ dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd)
   int rv;
   int j;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP)
     {
diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c
index e740fd18..b10e0fad 100644
--- a/src/plugins/dpdk/device/node.c
+++ b/src/plugins/dpdk/device/node.c
@@ -283,7 +283,7 @@ dpdk_buffer_init_from_template (void *d0, void *d1, void *d2, void *d3,
  */
 static_always_inline u32
 dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd,
-		   vlib_node_runtime_t * node, u32 cpu_index, u16 queue_id,
+		   vlib_node_runtime_t * node, u32 thread_index, u16 queue_id,
 		   int maybe_multiseg)
 {
   u32 n_buffers;
@@ -294,7 +294,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd,
   uword n_rx_bytes = 0;
   u32 n_trace, trace_cnt __attribute__ ((unused));
   vlib_buffer_free_list_t *fl;
-  vlib_buffer_t *bt = vec_elt_at_index (dm->buffer_templates, cpu_index);
+  vlib_buffer_t *bt = vec_elt_at_index (dm->buffer_templates, thread_index);
 
   if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0)
     return 0;
@@ -306,7 +306,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd,
       return 0;
     }
 
-  vec_reset_length (xd->d_trace_buffers[cpu_index]);
+  vec_reset_length (xd->d_trace_buffers[thread_index]);
   trace_cnt = n_trace = vlib_get_trace_count (vm, node);
 
   if (n_trace > 0)
@@ -318,7 +318,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd,
 	{
 	  struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index++];
 	  vlib_buffer_t *b = vlib_buffer_from_rte_mbuf (mb);
-	  vec_add1 (xd->d_trace_buffers[cpu_index],
+	  vec_add1 (xd->d_trace_buffers[thread_index],
 		    vlib_get_buffer_index (vm, b));
 	}
     }
@@ -546,20 +546,22 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd,
       vlib_put_next_frame (vm, node, next_index, n_left_to_next);
     }
 
-  if (PREDICT_FALSE (vec_len (xd->d_trace_buffers[cpu_index]) > 0))
+  if (PREDICT_FALSE (vec_len (xd->d_trace_buffers[thread_index]) > 0))
     {
-      dpdk_rx_trace (dm, node, xd, queue_id, xd->d_trace_buffers[cpu_index],
-		     vec_len (xd->d_trace_buffers[cpu_index]));
-      vlib_set_trace_count (vm, node, n_trace -
-			    vec_len (xd->d_trace_buffers[cpu_index]));
+      dpdk_rx_trace (dm, node, xd, queue_id,
+		     xd->d_trace_buffers[thread_index],
+		     vec_len (xd->d_trace_buffers[thread_index]));
+      vlib_set_trace_count (vm, node,
+			    n_trace -
+			    vec_len (xd->d_trace_buffers[thread_index]));
     }
 
   vlib_increment_combined_counter
     (vnet_get_main ()->interface_main.combined_sw_if_counters
      + VNET_INTERFACE_COUNTER_RX,
-     cpu_index, xd->vlib_sw_if_index, mb_index, n_rx_bytes);
+     thread_index, xd->vlib_sw_if_index, mb_index, n_rx_bytes);
 
-  vnet_device_increment_rx_packets (cpu_index, mb_index);
+  vnet_device_increment_rx_packets (thread_index, mb_index);
 
   return mb_index;
 }
@@ -630,19 +632,19 @@ dpdk_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f)
   dpdk_device_t *xd;
   uword n_rx_packets = 0;
   dpdk_device_and_queue_t *dq;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   /*
    * Poll all devices on this cpu for input/interrupts.
    */
   /* *INDENT-OFF* */
-  vec_foreach (dq, dm->devices_by_cpu[cpu_index])
+  vec_foreach (dq, dm->devices_by_cpu[thread_index])
     {
       xd = vec_elt_at_index(dm->devices, dq->device);
       if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG)
-        n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id, /* maybe_multiseg */ 1);
+        n_rx_packets += dpdk_device_input (dm, xd, node, thread_index, dq->queue_id, /* maybe_multiseg */ 1);
       else
-        n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id, /* maybe_multiseg */ 0);
+        n_rx_packets += dpdk_device_input (dm, xd, node, thread_index, dq->queue_id, /* maybe_multiseg */ 0);
     }
   /* *INDENT-ON* */
 
diff --git a/src/plugins/dpdk/hqos/hqos.c b/src/plugins/dpdk/hqos/hqos.c
index a288fca7..8b251beb 100644
--- a/src/plugins/dpdk/hqos/hqos.c
+++ b/src/plugins/dpdk/hqos/hqos.c
@@ -397,7 +397,7 @@ static_always_inline void
 dpdk_hqos_thread_internal_hqos_dbg_bypass (vlib_main_t * vm)
 {
   dpdk_main_t *dm = &dpdk_main;
-  u32 cpu_index = vm->cpu_index;
+  u32 thread_index = vm->thread_index;
   u32 dev_pos;
 
   dev_pos = 0;
@@ -405,12 +405,12 @@ dpdk_hqos_thread_internal_hqos_dbg_bypass (vlib_main_t * vm)
     {
       vlib_worker_thread_barrier_check ();
 
-      u32 n_devs = vec_len (dm->devices_by_hqos_cpu[cpu_index]);
+      u32 n_devs = vec_len (dm->devices_by_hqos_cpu[thread_index]);
       if (dev_pos >= n_devs)
 	dev_pos = 0;
 
       dpdk_device_and_queue_t *dq =
-	vec_elt_at_index (dm->devices_by_hqos_cpu[cpu_index], dev_pos);
+	vec_elt_at_index (dm->devices_by_hqos_cpu[thread_index], dev_pos);
       dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device);
 
       dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht;
@@ -479,7 +479,7 @@ static_always_inline void
 dpdk_hqos_thread_internal (vlib_main_t * vm)
 {
   dpdk_main_t *dm = &dpdk_main;
-  u32 cpu_index = vm->cpu_index;
+  u32 thread_index = vm->thread_index;
   u32 dev_pos;
 
   dev_pos = 0;
@@ -487,7 +487,7 @@ dpdk_hqos_thread_internal (vlib_main_t * vm)
     {
       vlib_worker_thread_barrier_check ();
 
-      u32 n_devs = vec_len (dm->devices_by_hqos_cpu[cpu_index]);
+      u32 n_devs = vec_len (dm->devices_by_hqos_cpu[thread_index]);
       if (PREDICT_FALSE (n_devs == 0))
 	{
 	  dev_pos = 0;
@@ -497,7 +497,7 @@ dpdk_hqos_thread_internal (vlib_main_t * vm)
 	dev_pos = 0;
 
       dpdk_device_and_queue_t *dq =
-	vec_elt_at_index (dm->devices_by_hqos_cpu[cpu_index], dev_pos);
+	vec_elt_at_index (dm->devices_by_hqos_cpu[thread_index], dev_pos);
       dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device);
 
       dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht;
@@ -586,7 +586,7 @@ dpdk_hqos_thread (vlib_worker_thread_t * w)
 
   vm = vlib_get_main ();
 
-  ASSERT (vm->cpu_index == os_get_cpu_number ());
+  ASSERT (vm->thread_index == vlib_get_thread_index ());
 
   clib_time_init (&vm->clib_time);
   clib_mem_set_heap (w->thread_mheap);
@@ -595,7 +595,7 @@ dpdk_hqos_thread (vlib_worker_thread_t * w)
   while (tm->worker_thread_release == 0)
     vlib_worker_thread_barrier_check ();
 
-  if (vec_len (dm->devices_by_hqos_cpu[vm->cpu_index]) == 0)
+  if (vec_len (dm->devices_by_hqos_cpu[vm->thread_index]) == 0)
     return
       clib_error
       ("current I/O TX thread does not have any devices assigned to it");
diff --git a/src/plugins/dpdk/ipsec/cli.c b/src/plugins/dpdk/ipsec/cli.c
index cd0a6037..3ae8c9b8 100644
--- a/src/plugins/dpdk/ipsec/cli.c
+++ b/src/plugins/dpdk/ipsec/cli.c
@@ -42,8 +42,8 @@ dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display)
   for (i = 0; i < tm->n_vlib_mains; i++)
     {
       uword key, data;
-      u32 cpu_index = vlib_mains[i]->cpu_index;
-      crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index];
+      u32 thread_index = vlib_mains[i]->thread_index;
+      crypto_worker_main_t *cwm = &dcm->workers_main[thread_index];
       u8 *s = 0;
 
       if (skip_master)
@@ -57,7 +57,7 @@ dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display)
 	  i32 last_cdev = -1;
 	  crypto_qp_data_t *qpd;
 
-	  s = format (s, "%u\t", cpu_index);
+	  s = format (s, "%u\t", thread_index);
 
 	  /* *INDENT-OFF* */
 	  vec_foreach (qpd, cwm->qp_data)
@@ -95,7 +95,7 @@ dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display)
 	    cap.sym.auth.algo = p_key->auth_algo;
 	    check_algo_is_supported (&cap, auth_str);
 	    vlib_cli_output (vm, "%u\t%10s\t%15s\t%3s\t%u\t%u\n",
-			     vlib_mains[i]->cpu_index, cipher_str, auth_str,
+			     vlib_mains[i]->thread_index, cipher_str, auth_str,
 			     p_key->is_outbound ? "out" : "in",
 			     cwm->qp_data[data].dev_id,
 			     cwm->qp_data[data].qp_id);
diff --git a/src/plugins/dpdk/ipsec/crypto_node.c b/src/plugins/dpdk/ipsec/crypto_node.c
index dc3452b2..a3c45902 100644
--- a/src/plugins/dpdk/ipsec/crypto_node.c
+++ b/src/plugins/dpdk/ipsec/crypto_node.c
@@ -171,9 +171,9 @@ static uword
 dpdk_crypto_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
 		      vlib_frame_t * frame)
 {
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
-  crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index];
+  crypto_worker_main_t *cwm = &dcm->workers_main[thread_index];
   crypto_qp_data_t *qpd;
   u32 n_deq = 0;
 
diff --git a/src/plugins/dpdk/ipsec/esp.h b/src/plugins/dpdk/ipsec/esp.h
index 320295b1..56f0c756 100644
--- a/src/plugins/dpdk/ipsec/esp.h
+++ b/src/plugins/dpdk/ipsec/esp.h
@@ -170,9 +170,9 @@ static_always_inline int
 create_sym_sess (ipsec_sa_t * sa, crypto_sa_session_t * sa_sess,
 		 u8 is_outbound)
 {
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
-  crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index];
+  crypto_worker_main_t *cwm = &dcm->workers_main[thread_index];
   struct rte_crypto_sym_xform cipher_xform = { 0 };
   struct rte_crypto_sym_xform auth_xform = { 0 };
   struct rte_crypto_sym_xform *xfs;
diff --git a/src/plugins/dpdk/ipsec/esp_decrypt.c b/src/plugins/dpdk/ipsec/esp_decrypt.c
index 286e03f8..bab76e3b 100644
--- a/src/plugins/dpdk/ipsec/esp_decrypt.c
+++ b/src/plugins/dpdk/ipsec/esp_decrypt.c
@@ -88,7 +88,7 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm,
 {
   u32 n_left_from, *from, *to_next, next_index;
   ipsec_main_t *im = &ipsec_main;
-  u32 cpu_index = os_get_cpu_number();
+  u32 thread_index = vlib_get_thread_index();
   dpdk_crypto_main_t * dcm = &dpdk_crypto_main;
   dpdk_esp_main_t * em = &dpdk_esp_main;
   u32 i;
@@ -104,7 +104,7 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm,
       return n_left_from;
     }
 
-  crypto_worker_main_t *cwm = vec_elt_at_index(dcm->workers_main, cpu_index);
+  crypto_worker_main_t *cwm = vec_elt_at_index(dcm->workers_main, thread_index);
   u32 n_qps = vec_len(cwm->qp_data);
   struct rte_crypto_op ** cops_to_enq[n_qps];
   u32 n_cop_qp[n_qps], * bi_to_enq[n_qps];
diff --git a/src/plugins/dpdk/ipsec/esp_encrypt.c b/src/plugins/dpdk/ipsec/esp_encrypt.c
index 5b03de73..f996d7df 100644
--- a/src/plugins/dpdk/ipsec/esp_encrypt.c
+++ b/src/plugins/dpdk/ipsec/esp_encrypt.c
@@ -93,7 +93,7 @@ dpdk_esp_encrypt_node_fn (vlib_main_t * vm,
 {
   u32 n_left_from, *from, *to_next, next_index;
   ipsec_main_t *im = &ipsec_main;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
   dpdk_esp_main_t *em = &dpdk_esp_main;
   u32 i;
@@ -111,7 +111,8 @@ dpdk_esp_encrypt_node_fn (vlib_main_t * vm,
       return n_left_from;
     }
 
-  crypto_worker_main_t *cwm = vec_elt_at_index (dcm->workers_main, cpu_index);
+  crypto_worker_main_t *cwm =
+    vec_elt_at_index (dcm->workers_main, thread_index);
   u32 n_qps = vec_len (cwm->qp_data);
   struct rte_crypto_op **cops_to_enq[n_qps];
   u32 n_cop_qp[n_qps], *bi_to_enq[n_qps];
diff --git a/src/plugins/dpdk/ipsec/ipsec.c b/src/plugins/dpdk/ipsec/ipsec.c
index b0aaaaec..5d8f4fba 100644
--- a/src/plugins/dpdk/ipsec/ipsec.c
+++ b/src/plugins/dpdk/ipsec/ipsec.c
@@ -289,7 +289,7 @@ dpdk_ipsec_process (vlib_main_t * vm, vlib_node_runtime_t * rt,
 	      if (!map)
 		{
 		  clib_warning ("unable to create hash table for worker %u",
-				vlib_mains[i]->cpu_index);
+				vlib_mains[i]->thread_index);
 		  goto error;
 		}
 	      cwm->algo_qp_map = map;
diff --git a/src/plugins/dpdk/ipsec/ipsec.h b/src/plugins/dpdk/ipsec/ipsec.h
index 28bffc80..f0f793c0 100644
--- a/src/plugins/dpdk/ipsec/ipsec.h
+++ b/src/plugins/dpdk/ipsec/ipsec.h
@@ -95,8 +95,8 @@ static_always_inline void
 crypto_alloc_cops ()
 {
   dpdk_crypto_main_t *dcm = &dpdk_crypto_main;
-  u32 cpu_index = os_get_cpu_number ();
-  crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index];
+  u32 thread_index = vlib_get_thread_index ();
+  crypto_worker_main_t *cwm = &dcm->workers_main[thread_index];
   unsigned socket_id = rte_socket_id ();
   crypto_qp_data_t *qpd;
 
diff --git a/src/plugins/dpdk/main.c b/src/plugins/dpdk/main.c
index 7ee2a785..942b8b2d 100644
--- a/src/plugins/dpdk/main.c
+++ b/src/plugins/dpdk/main.c
@@ -39,7 +39,7 @@ rte_delay_us_override (unsigned us)
    * thread then do not intercept. (Must not be called from an
    * independent pthread).
    */
-  if (os_get_cpu_number () == 0)
+  if (vlib_get_thread_index () == 0)
     {
       /*
        * We're in the vlib main thread or a vlib process. Make sure
diff --git a/src/plugins/flowperpkt/l2_node.c b/src/plugins/flowperpkt/l2_node.c
index 1c2f681e..fdaf81d1 100644
--- a/src/plugins/flowperpkt/l2_node.c
+++ b/src/plugins/flowperpkt/l2_node.c
@@ -102,7 +102,7 @@ add_to_flow_record_l2 (vlib_main_t * vm,
 		       u8 * src_mac, u8 * dst_mac,
 		       u16 ethertype, u64 timestamp, u16 length, int do_flush)
 {
-  u32 my_cpu_number = vm->cpu_index;
+  u32 my_cpu_number = vm->thread_index;
   flow_report_main_t *frm = &flow_report_main;
   ip4_header_t *ip;
   udp_header_t *udp;
diff --git a/src/plugins/flowperpkt/node.c b/src/plugins/flowperpkt/node.c
index f77f087d..0277682d 100644
--- a/src/plugins/flowperpkt/node.c
+++ b/src/plugins/flowperpkt/node.c
@@ -101,7 +101,7 @@ add_to_flow_record_ipv4 (vlib_main_t * vm,
 			 u32 src_address, u32 dst_address,
 			 u8 tos, u64 timestamp, u16 length, int do_flush)
 {
-  u32 my_cpu_number = vm->cpu_index;
+  u32 my_cpu_number = vm->thread_index;
   flow_report_main_t *frm = &flow_report_main;
   ip4_header_t *ip;
   udp_header_t *udp;
diff --git a/src/plugins/ioam/export-common/ioam_export.h b/src/plugins/ioam/export-common/ioam_export.h
index 2bf3fd54..9de0d13b 100644
--- a/src/plugins/ioam/export-common/ioam_export.h
+++ b/src/plugins/ioam/export-common/ioam_export.h
@@ -477,8 +477,8 @@ do {                                                                           \
   from = vlib_frame_vector_args (F);                                           \
   n_left_from = (F)->n_vectors;                                                \
   next_index = (N)->cached_next_index;                                         \
-  while (__sync_lock_test_and_set ((EM)->lockp[(VM)->cpu_index], 1));          \
-  my_buf = ioam_export_get_my_buffer (EM, (VM)->cpu_index);                    \
+  while (__sync_lock_test_and_set ((EM)->lockp[(VM)->thread_index], 1));       \
+  my_buf = ioam_export_get_my_buffer (EM, (VM)->thread_index);                 \
   my_buf->touched_at = vlib_time_now (VM);                                     \
   while (n_left_from > 0)                                                      \
     {                                                                          \
@@ -620,7 +620,7 @@ do {                                                                           \
     }                                                                          \
   vlib_node_increment_counter (VM, export_node.index,                          \
 			       EXPORT_ERROR_RECORDED, pkts_recorded);          \
-  *(EM)->lockp[(VM)->cpu_index] = 0;                                           \
+  *(EM)->lockp[(VM)->thread_index] = 0;                                        \
 } while(0)
 
 #endif /* __included_ioam_export_h__ */
diff --git a/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c b/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c
index a56dc040..0cf742c9 100644
--- a/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c
+++ b/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c
@@ -396,7 +396,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm,
 					      clib_net_to_host_u32
 					      (tcp0->seq_number) + 1,
 					      no_of_responses, now,
-					      vm->cpu_index, &pool_index0))
+					      vm->thread_index, &pool_index0))
 		    {
 		      cache_ts_added++;
 		    }
@@ -419,7 +419,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm,
 	      e2e =
 		(ioam_e2e_cache_option_t *) ((u8 *) hbh0 +
 					     cm->rewrite_pool_index_offset);
-	      e2e->pool_id = (u8) vm->cpu_index;
+	      e2e->pool_id = (u8) vm->thread_index;
 	      e2e->pool_index = pool_index0;
 	      ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *)
 					   ((u8 *) e2e +
@@ -455,7 +455,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm,
 					      clib_net_to_host_u32
 					      (tcp1->seq_number) + 1,
 					      no_of_responses, now,
-					      vm->cpu_index, &pool_index1))
+					      vm->thread_index, &pool_index1))
 		    {
 		      cache_ts_added++;
 		    }
@@ -479,7 +479,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm,
 	      e2e =
 		(ioam_e2e_cache_option_t *) ((u8 *) hbh1 +
 					     cm->rewrite_pool_index_offset);
-	      e2e->pool_id = (u8) vm->cpu_index;
+	      e2e->pool_id = (u8) vm->thread_index;
 	      e2e->pool_index = pool_index1;
 	      ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *)
 					   ((u8 *) e2e +
@@ -562,7 +562,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm,
 					      clib_net_to_host_u32
 					      (tcp0->seq_number) + 1,
 					      no_of_responses, now,
-					      vm->cpu_index, &pool_index0))
+					      vm->thread_index, &pool_index0))
 		    {
 		      cache_ts_added++;
 		    }
@@ -585,7 +585,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm,
 	      e2e =
 		(ioam_e2e_cache_option_t *) ((u8 *) hbh0 +
 					     cm->rewrite_pool_index_offset);
-	      e2e->pool_id = (u8) vm->cpu_index;
+	      e2e->pool_id = (u8) vm->thread_index;
 	      e2e->pool_index = pool_index0;
 	      ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *)
 					   ((u8 *) e2e +
@@ -701,7 +701,7 @@ expired_cache_ts_timer_callback (u32 * expired_timers)
   ioam_cache_main_t *cm = &ioam_cache_main;
   int i;
   u32 pool_index;
-  u32 thread_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   u32 count = 0;
 
   for (i = 0; i < vec_len (expired_timers); i++)
@@ -724,7 +724,7 @@ ioam_cache_ts_timer_tick_node_fn (vlib_main_t * vm,
 				  vlib_frame_t * f)
 {
   ioam_cache_main_t *cm = &ioam_cache_main;
-  u32 my_thread_index = os_get_cpu_number ();
+  u32 my_thread_index = vlib_get_thread_index ();
   struct timespec ts, tsrem;
 
   tw_timer_expire_timers_16t_2w_512sl (&cm->timer_wheels[my_thread_index],
diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c
index f3c5cc09..08f5b692 100644
--- a/src/plugins/ixge/ixge.c
+++ b/src/plugins/ixge/ixge.c
@@ -1887,7 +1887,7 @@ done:
   vlib_increment_combined_counter (vnet_main.
 				   interface_main.combined_sw_if_counters +
 				   VNET_INTERFACE_COUNTER_RX,
-				   0 /* cpu_index */ ,
+				   0 /* thread_index */ ,
 				   xd->vlib_sw_if_index, n_packets,
 				   dq->rx.n_bytes);
 
diff --git a/src/plugins/lb/lb.c b/src/plugins/lb/lb.c
index add81236..addc2a42 100644
--- a/src/plugins/lb/lb.c
+++ b/src/plugins/lb/lb.c
@@ -63,11 +63,11 @@ u8 *format_lb_main (u8 * s, va_list * args)
   s = format(s, " #vips: %u\n", pool_elts(lbm->vips));
   s = format(s, " #ass: %u\n", pool_elts(lbm->ass) - 1);
 
-  u32 cpu_index;
-  for(cpu_index = 0; cpu_index < tm->n_vlib_mains; cpu_index++ ) {
-    lb_hash_t *h = lbm->per_cpu[cpu_index].sticky_ht;
+  u32 thread_index;
+  for(thread_index = 0; thread_index < tm->n_vlib_mains; thread_index++ ) {
+    lb_hash_t *h = lbm->per_cpu[thread_index].sticky_ht;
     if (h) {
-      s = format(s, "core %d\n", cpu_index);
+      s = format(s, "core %d\n", thread_index);
       s = format(s, "  timeout: %ds\n", h->timeout);
       s = format(s, "  usage: %d / %d\n", lb_hash_elts(h, lb_hash_time_now(vlib_get_main())),  lb_hash_size(h));
     }
diff --git a/src/plugins/lb/node.c b/src/plugins/lb/node.c
index 8b763c53..3171148b 100644
--- a/src/plugins/lb/node.c
+++ b/src/plugins/lb/node.c
@@ -60,10 +60,10 @@ format_lb_trace (u8 * s, va_list * args)
   return s;
 }
 
-lb_hash_t *lb_get_sticky_table(u32 cpu_index)
+lb_hash_t *lb_get_sticky_table(u32 thread_index)
 {
   lb_main_t *lbm = &lb_main;
-  lb_hash_t *sticky_ht = lbm->per_cpu[cpu_index].sticky_ht;
+  lb_hash_t *sticky_ht = lbm->per_cpu[thread_index].sticky_ht;
   //Check if size changed
   if (PREDICT_FALSE(sticky_ht && (lbm->per_cpu_sticky_buckets != lb_hash_nbuckets(sticky_ht))))
     {
@@ -71,8 +71,8 @@ lb_hash_t *lb_get_sticky_table(u32 cpu_index)
       lb_hash_bucket_t *b;
       u32 i;
       lb_hash_foreach_entry(sticky_ht, b, i) {
-	vlib_refcount_add(&lbm->as_refcount, cpu_index, b->value[i], -1);
-	vlib_refcount_add(&lbm->as_refcount, cpu_index, 0, 1);
+	vlib_refcount_add(&lbm->as_refcount, thread_index, b->value[i], -1);
+	vlib_refcount_add(&lbm->as_refcount, thread_index, 0, 1);
       }
 
       lb_hash_free(sticky_ht);
@@ -81,8 +81,8 @@ lb_hash_t *lb_get_sticky_table(u32 cpu_index)
 
   //Create if necessary
   if (PREDICT_FALSE(sticky_ht == NULL)) {
-    lbm->per_cpu[cpu_index].sticky_ht = lb_hash_alloc(lbm->per_cpu_sticky_buckets, lbm->flow_timeout);
-    sticky_ht = lbm->per_cpu[cpu_index].sticky_ht;
+    lbm->per_cpu[thread_index].sticky_ht = lb_hash_alloc(lbm->per_cpu_sticky_buckets, lbm->flow_timeout);
+    sticky_ht = lbm->per_cpu[thread_index].sticky_ht;
     clib_warning("Regenerated sticky table %p", sticky_ht);
   }
 
@@ -153,10 +153,10 @@ lb_node_fn (vlib_main_t * vm,
 {
   lb_main_t *lbm = &lb_main;
   u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
-  u32 cpu_index = os_get_cpu_number();
+  u32 thread_index = vlib_get_thread_index();
   u32 lb_time = lb_hash_time_now(vm);
 
-  lb_hash_t *sticky_ht = lb_get_sticky_table(cpu_index);
+  lb_hash_t *sticky_ht = lb_get_sticky_table(thread_index);
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
   next_index = node->cached_next_index;
@@ -240,9 +240,9 @@ lb_node_fn (vlib_main_t * vm,
 	  //Configuration may be changed, vectors resized, etc...
 
 	  //Dereference previously used
-	  vlib_refcount_add(&lbm->as_refcount, cpu_index,
+	  vlib_refcount_add(&lbm->as_refcount, thread_index,
 			    lb_hash_available_value(sticky_ht, hash0, available_index0), -1);
-	  vlib_refcount_add(&lbm->as_refcount, cpu_index,
+	  vlib_refcount_add(&lbm->as_refcount, thread_index,
 			    asindex0, 1);
 
 	  //Add sticky entry
@@ -260,7 +260,7 @@ lb_node_fn (vlib_main_t * vm,
 	}
 
       vlib_increment_simple_counter(&lbm->vip_counters[counter],
-				    cpu_index,
+				    thread_index,
 				    vnet_buffer (p0)->ip.adj_index[VLIB_TX],
 				    1);
 
diff --git a/src/plugins/lb/refcount.c b/src/plugins/lb/refcount.c
index 22415c88..6f01ab5a 100644
--- a/src/plugins/lb/refcount.c
+++ b/src/plugins/lb/refcount.c
@@ -31,10 +31,10 @@ u64 vlib_refcount_get(vlib_refcount_t *r, u32 index)
 {
   u64 count = 0;
   vlib_thread_main_t *tm = vlib_get_thread_main ();
-  u32 cpu_index;
-  for (cpu_index = 0; cpu_index < tm->n_vlib_mains; cpu_index++) {
-    if (r->per_cpu[cpu_index].length > index)
-      count += r->per_cpu[cpu_index].counters[index];
+  u32 thread_index;
+  for (thread_index = 0; thread_index < tm->n_vlib_mains; thread_index++) {
+    if (r->per_cpu[thread_index].length > index)
+      count += r->per_cpu[thread_index].counters[index];
   }
   return count;
 }
diff --git a/src/plugins/lb/refcount.h b/src/plugins/lb/refcount.h
index 8c26e7be..dcfcb3fe 100644
--- a/src/plugins/lb/refcount.h
+++ b/src/plugins/lb/refcount.h
@@ -45,9 +45,9 @@ typedef struct {
 void __vlib_refcount_resize(vlib_refcount_per_cpu_t *per_cpu, u32 size);
 
 static_always_inline
-void vlib_refcount_add(vlib_refcount_t *r, u32 cpu_index, u32 counter_index, i32 v)
+void vlib_refcount_add(vlib_refcount_t *r, u32 thread_index, u32 counter_index, i32 v)
 {
-  vlib_refcount_per_cpu_t *per_cpu = &r->per_cpu[cpu_index];
+  vlib_refcount_per_cpu_t *per_cpu = &r->per_cpu[thread_index];
   if (PREDICT_FALSE(counter_index >= per_cpu->length))
     __vlib_refcount_resize(per_cpu, clib_max(counter_index + 16, per_cpu->length * 2));
 
diff --git a/src/plugins/memif/node.c b/src/plugins/memif/node.c
index 659d5dfb..cee1f3d1 100644
--- a/src/plugins/memif/node.c
+++ b/src/plugins/memif/node.c
@@ -94,7 +94,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
   u32 n_rx_bytes = 0;
   u32 *to_next = 0;
   u32 n_free_bufs;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   u32 bi0, bi1;
   vlib_buffer_t *b0, *b1;
   u16 ring_size = 1 << mif->log2_ring_size;
@@ -105,14 +105,15 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
   if (mif->per_interface_next_index != ~0)
     next_index = mif->per_interface_next_index;
 
-  n_free_bufs = vec_len (nm->rx_buffers[cpu_index]);
+  n_free_bufs = vec_len (nm->rx_buffers[thread_index]);
   if (PREDICT_FALSE (n_free_bufs < ring_size))
     {
-      vec_validate (nm->rx_buffers[cpu_index], ring_size + n_free_bufs - 1);
+      vec_validate (nm->rx_buffers[thread_index],
+		    ring_size + n_free_bufs - 1);
       n_free_bufs +=
-	vlib_buffer_alloc (vm, &nm->rx_buffers[cpu_index][n_free_bufs],
+	vlib_buffer_alloc (vm, &nm->rx_buffers[thread_index][n_free_bufs],
 			   ring_size);
-      _vec_len (nm->rx_buffers[cpu_index]) = n_free_bufs;
+      _vec_len (nm->rx_buffers[thread_index]) = n_free_bufs;
     }
 
   head = ring->head;
@@ -158,15 +159,15 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 			     CLIB_CACHE_LINE_BYTES, LOAD);
 	    }
 	  /* get empty buffer */
-	  u32 last_buf = vec_len (nm->rx_buffers[cpu_index]) - 1;
-	  bi0 = nm->rx_buffers[cpu_index][last_buf];
-	  bi1 = nm->rx_buffers[cpu_index][last_buf - 1];
-	  _vec_len (nm->rx_buffers[cpu_index]) -= 2;
+	  u32 last_buf = vec_len (nm->rx_buffers[thread_index]) - 1;
+	  bi0 = nm->rx_buffers[thread_index][last_buf];
+	  bi1 = nm->rx_buffers[thread_index][last_buf - 1];
+	  _vec_len (nm->rx_buffers[thread_index]) -= 2;
 
 	  if (last_buf > 4)
 	    {
-	      memif_prefetch (vm, nm->rx_buffers[cpu_index][last_buf - 2]);
-	      memif_prefetch (vm, nm->rx_buffers[cpu_index][last_buf - 3]);
+	      memif_prefetch (vm, nm->rx_buffers[thread_index][last_buf - 2]);
+	      memif_prefetch (vm, nm->rx_buffers[thread_index][last_buf - 3]);
 	    }
 
 	  /* enqueue buffer */
@@ -256,9 +257,9 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
       while (num_slots && n_left_to_next)
 	{
 	  /* get empty buffer */
-	  u32 last_buf = vec_len (nm->rx_buffers[cpu_index]) - 1;
-	  bi0 = nm->rx_buffers[cpu_index][last_buf];
-	  _vec_len (nm->rx_buffers[cpu_index]) = last_buf;
+	  u32 last_buf = vec_len (nm->rx_buffers[thread_index]) - 1;
+	  bi0 = nm->rx_buffers[thread_index][last_buf];
+	  _vec_len (nm->rx_buffers[thread_index]) = last_buf;
 
 	  /* enqueue buffer */
 	  to_next[0] = bi0;
@@ -315,7 +316,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
   ring->tail = head;
 
   vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters
-				   + VNET_INTERFACE_COUNTER_RX, cpu_index,
+				   + VNET_INTERFACE_COUNTER_RX, thread_index,
 				   mif->hw_if_index, n_rx_packets,
 				   n_rx_bytes);
 
@@ -327,7 +328,7 @@ memif_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
 		vlib_frame_t * frame)
 {
   u32 n_rx_packets = 0;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   memif_main_t *nm = &memif_main;
   memif_if_t *mif;
 
@@ -337,7 +338,7 @@ memif_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
       if (mif->flags & MEMIF_IF_FLAG_ADMIN_UP &&
 	  mif->flags & MEMIF_IF_FLAG_CONNECTED &&
 	  (mif->if_index % nm->input_cpu_count) ==
-	  (cpu_index - nm->input_cpu_first_index))
+	  (thread_index - nm->input_cpu_first_index))
 	{
 	  if (mif->flags & MEMIF_IF_FLAG_IS_SLAVE)
 	    n_rx_packets +=
diff --git a/src/plugins/snat/in2out.c b/src/plugins/snat/in2out.c
index b4961365..e5ee965f 100644
--- a/src/plugins/snat/in2out.c
+++ b/src/plugins/snat/in2out.c
@@ -212,7 +212,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0,
                       snat_session_t ** sessionp,
                       vlib_node_runtime_t * node,
                       u32 next0,
-                      u32 cpu_index)
+                      u32 thread_index)
 {
   snat_user_t *u;
   snat_user_key_t user_key;
@@ -246,27 +246,27 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0,
   if (clib_bihash_search_8_8 (&sm->user_hash, &kv0, &value0))
     {
       /* no, make a new one */
-      pool_get (sm->per_thread_data[cpu_index].users, u);
+      pool_get (sm->per_thread_data[thread_index].users, u);
       memset (u, 0, sizeof (*u));
       u->addr = ip0->src_address;
       u->fib_index = rx_fib_index0;
 
-      pool_get (sm->per_thread_data[cpu_index].list_pool, per_user_list_head_elt);
+      pool_get (sm->per_thread_data[thread_index].list_pool, per_user_list_head_elt);
 
       u->sessions_per_user_list_head_index = per_user_list_head_elt -
-        sm->per_thread_data[cpu_index].list_pool;
+        sm->per_thread_data[thread_index].list_pool;
 
-      clib_dlist_init (sm->per_thread_data[cpu_index].list_pool,
+      clib_dlist_init (sm->per_thread_data[thread_index].list_pool,
                        u->sessions_per_user_list_head_index);
 
-      kv0.value = u - sm->per_thread_data[cpu_index].users;
+      kv0.value = u - sm->per_thread_data[thread_index].users;
 
       /* add user */
       clib_bihash_add_del_8_8 (&sm->user_hash, &kv0, 1 /* is_add */);
     }
   else
     {
-      u = pool_elt_at_index (sm->per_thread_data[cpu_index].users,
+      u = pool_elt_at_index (sm->per_thread_data[thread_index].users,
                              value0.value);
     }
 
@@ -276,25 +276,25 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0,
       /* Remove the oldest dynamic translation */
       do {
           oldest_per_user_translation_list_index =
-            clib_dlist_remove_head (sm->per_thread_data[cpu_index].list_pool,
+            clib_dlist_remove_head (sm->per_thread_data[thread_index].list_pool,
                                     u->sessions_per_user_list_head_index);
 
           ASSERT (oldest_per_user_translation_list_index != ~0);
 
           /* add it back to the end of the LRU list */
-          clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool,
+          clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool,
                               u->sessions_per_user_list_head_index,
                               oldest_per_user_translation_list_index);
           /* Get the list element */
           oldest_per_user_translation_list_elt =
-            pool_elt_at_index (sm->per_thread_data[cpu_index].list_pool,
+            pool_elt_at_index (sm->per_thread_data[thread_index].list_pool,
                                oldest_per_user_translation_list_index);
 
           /* Get the session index from the list element */
           session_index = oldest_per_user_translation_list_elt->value;
 
           /* Get the session */
-          s = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions,
+          s = pool_elt_at_index (sm->per_thread_data[thread_index].sessions,
                                  session_index);
       } while (snat_is_session_static (s));
 
@@ -346,7 +346,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0,
         }
 
       /* Create a new session */
-      pool_get (sm->per_thread_data[cpu_index].sessions, s);
+      pool_get (sm->per_thread_data[thread_index].sessions, s);
       memset (s, 0, sizeof (*s));
       
       s->outside_address_index = address_index;
@@ -362,22 +362,22 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0,
         }
 
       /* Create list elts */
-      pool_get (sm->per_thread_data[cpu_index].list_pool,
+      pool_get (sm->per_thread_data[thread_index].list_pool,
                 per_user_translation_list_elt);
-      clib_dlist_init (sm->per_thread_data[cpu_index].list_pool,
+      clib_dlist_init (sm->per_thread_data[thread_index].list_pool,
                        per_user_translation_list_elt -
-                       sm->per_thread_data[cpu_index].list_pool);
+                       sm->per_thread_data[thread_index].list_pool);
 
       per_user_translation_list_elt->value =
-        s - sm->per_thread_data[cpu_index].sessions;
+        s - sm->per_thread_data[thread_index].sessions;
       s->per_user_index = per_user_translation_list_elt -
-                          sm->per_thread_data[cpu_index].list_pool;
+                          sm->per_thread_data[thread_index].list_pool;
       s->per_user_list_head_index = u->sessions_per_user_list_head_index;
 
-      clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool,
+      clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool,
                           s->per_user_list_head_index,
                           per_user_translation_list_elt -
-                          sm->per_thread_data[cpu_index].list_pool);
+                          sm->per_thread_data[thread_index].list_pool);
    }
   
   s->in2out = *key0;
@@ -388,12 +388,12 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0,
 
   /* Add to translation hashes */
   kv0.key = s->in2out.as_u64;
-  kv0.value = s - sm->per_thread_data[cpu_index].sessions;
+  kv0.value = s - sm->per_thread_data[thread_index].sessions;
   if (clib_bihash_add_del_8_8 (&sm->in2out, &kv0, 1 /* is_add */))
       clib_warning ("in2out key add failed");
   
   kv0.key = s->out2in.as_u64;
-  kv0.value = s - sm->per_thread_data[cpu_index].sessions;
+  kv0.value = s - sm->per_thread_data[thread_index].sessions;
   
   if (clib_bihash_add_del_8_8 (&sm->out2in, &kv0, 1 /* is_add */))
       clib_warning ("out2in key add failed");
@@ -403,7 +403,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0,
   worker_by_out_key.port = s->out2in.port;
   worker_by_out_key.fib_index = s->out2in.fib_index;
   kv0.key = worker_by_out_key.as_u64;
-  kv0.value = cpu_index;
+  kv0.value = thread_index;
   clib_bihash_add_del_8_8 (&sm->worker_by_out, &kv0, 1);
 
   /* log NAT event */
@@ -465,7 +465,7 @@ snat_in2out_error_t icmp_get_key(icmp46_header_t *icmp0,
  *
  * @param[in,out] sm             SNAT main
  * @param[in,out] node           SNAT node runtime
- * @param[in] cpu_index          CPU index
+ * @param[in] thread_index       thread index
  * @param[in,out] b0             buffer containing packet to be translated
  * @param[out] p_key             address and port before NAT translation
  * @param[out] p_value           address and port after NAT translation
@@ -473,7 +473,7 @@ snat_in2out_error_t icmp_get_key(icmp46_header_t *icmp0,
  * @param d                      optional parameter
  */
 u32 icmp_match_in2out_slow(snat_main_t *sm, vlib_node_runtime_t *node,
-                           u32 cpu_index, vlib_buffer_t *b0,
+                           u32 thread_index, vlib_buffer_t *b0,
                            snat_session_key_t *p_key,
                            snat_session_key_t *p_value,
                            u8 *p_dont_translate, void *d)
@@ -524,13 +524,13 @@ u32 icmp_match_in2out_slow(snat_main_t *sm, vlib_node_runtime_t *node,
         }
 
       next0 = slow_path (sm, b0, ip0, rx_fib_index0, &key0,
-                         &s0, node, next0, cpu_index);
+                         &s0, node, next0, thread_index);
 
       if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP))
         goto out;
     }
   else
-    s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions,
+    s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions,
                             value0.value);
 
 out:
@@ -548,7 +548,7 @@ out:
  *
  * @param[in] sm                 SNAT main
  * @param[in,out] node           SNAT node runtime
- * @param[in] cpu_index          CPU index
+ * @param[in] thread_index       thread index
  * @param[in,out] b0             buffer containing packet to be translated
  * @param[out] p_key             address and port before NAT translation
  * @param[out] p_value           address and port after NAT translation
@@ -556,7 +556,7 @@ out:
  * @param d                      optional parameter
  */
 u32 icmp_match_in2out_fast(snat_main_t *sm, vlib_node_runtime_t *node,
-                           u32 cpu_index, vlib_buffer_t *b0,
+                           u32 thread_index, vlib_buffer_t *b0,
                            snat_session_key_t *p_key,
                            snat_session_key_t *p_value,
                            u8 *p_dont_translate, void *d)
@@ -624,7 +624,7 @@ static inline u32 icmp_in2out (snat_main_t *sm,
                                u32 rx_fib_index0,
                                vlib_node_runtime_t * node,
                                u32 next0,
-                               u32 cpu_index,
+                               u32 thread_index,
                                void *d)
 {
   snat_session_key_t key0, sm0;
@@ -641,7 +641,7 @@ static inline u32 icmp_in2out (snat_main_t *sm,
 
   echo0 = (icmp_echo_header_t *)(icmp0+1);
 
-  next0_tmp = sm->icmp_match_in2out_cb(sm, node, cpu_index, b0,
+  next0_tmp = sm->icmp_match_in2out_cb(sm, node, thread_index, b0,
                                        &key0, &sm0, &dont_translate, d);
   if (next0_tmp != ~0)
     next0 = next0_tmp;
@@ -847,11 +847,11 @@ static inline u32 icmp_in2out_slow_path (snat_main_t *sm,
                                          vlib_node_runtime_t * node,
                                          u32 next0,
                                          f64 now,
-                                         u32 cpu_index,
+                                         u32 thread_index,
                                          snat_session_t ** p_s0)
 {
   next0 = icmp_in2out(sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node,
-                      next0, cpu_index, p_s0);
+                      next0, thread_index, p_s0);
   snat_session_t * s0 = *p_s0;
   if (PREDICT_TRUE(next0 != SNAT_IN2OUT_NEXT_DROP && s0))
     {
@@ -862,9 +862,9 @@ static inline u32 icmp_in2out_slow_path (snat_main_t *sm,
       /* Per-user LRU list maintenance for dynamic translations */
       if (!snat_is_session_static (s0))
         {
-          clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool,
+          clib_dlist_remove (sm->per_thread_data[thread_index].list_pool,
                              s0->per_user_index);
-          clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool,
+          clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool,
                               s0->per_user_list_head_index,
                               s0->per_user_index);
         }
@@ -884,7 +884,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm,
   snat_runtime_t * rt = (snat_runtime_t *)node->runtime_data;
   f64 now = vlib_time_now (vm);
   u32 stats_node_index;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   stats_node_index = is_slow_path ? snat_in2out_slowpath_node.index :
     snat_in2out_node.index;
@@ -977,7 +977,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm,
                 {
                   next0 = icmp_in2out_slow_path 
                     (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, 
-                     node, next0, now, cpu_index, &s0);
+                     node, next0, now, thread_index, &s0);
                   goto trace00;
                 }
             }
@@ -1006,7 +1006,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm,
                     goto trace00;
 
                   next0 = slow_path (sm, b0, ip0, rx_fib_index0, &key0,
-                                     &s0, node, next0, cpu_index);
+                                     &s0, node, next0, thread_index);
                   if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP))
                     goto trace00;
                 }
@@ -1017,7 +1017,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm,
                 }
             }
           else
-            s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions,
+            s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions,
                                     value0.value);
 
           old_addr0 = ip0->src_address.as_u32;
@@ -1063,9 +1063,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm,
           /* Per-user LRU list maintenance for dynamic translation */
           if (!snat_is_session_static (s0))
             {
-              clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool,
+              clib_dlist_remove (sm->per_thread_data[thread_index].list_pool,
                                  s0->per_user_index);
-              clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool,
+              clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool,
                                   s0->per_user_list_head_index,
                                   s0->per_user_index);
             }
@@ -1081,7 +1081,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm,
               t->next_index = next0;
                   t->session_index = ~0;
               if (s0)
-                t->session_index = s0 - sm->per_thread_data[cpu_index].sessions;
+                t->session_index = s0 - sm->per_thread_data[thread_index].sessions;
             }
 
           pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP;
@@ -1117,7 +1117,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm,
                 {
                   next1 = icmp_in2out_slow_path 
                     (sm, b1, ip1, icmp1, sw_if_index1, rx_fib_index1, node,
-                     next1, now, cpu_index, &s1);
+                     next1, now, thread_index, &s1);
                   goto trace01;
                 }
             }
@@ -1146,7 +1146,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm,
                     goto trace01;
 
                   next1 = slow_path (sm, b1, ip1, rx_fib_index1, &key1,
-                                     &s1, node, next1, cpu_index);
+                                     &s1, node, next1, thread_index);
                   if (PREDICT_FALSE (next1 == SNAT_IN2OUT_NEXT_DROP))
                     goto trace01;
                 }
@@ -1157,7 +1157,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm,
                 }
             }
           else
-            s1 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions,
+            s1 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions,
                                     value1.value);
 
           old_addr1 = ip1->src_address.as_u32;
@@ -1203,9 +1203,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm,
           /* Per-user LRU list maintenance for dynamic translation */
           if (!snat_is_session_static (s1))
             {
-              clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool,
+              clib_dlist_remove (sm->per_thread_data[thread_index].list_pool,
                                  s1->per_user_index);
-              clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool,
+              clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool,
                                   s1->per_user_list_head_index,
                                   s1->per_user_index);
             }
@@ -1220,7 +1220,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm,
               t->next_index = next1;
               t->session_index = ~0;
               if (s1)
-                t->session_index = s1 - sm->per_thread_data[cpu_index].sessions;
+                t->session_index = s1 - sm->per_thread_data[thread_index].sessions;
             }
 
           pkts_processed += next1 != SNAT_IN2OUT_NEXT_DROP;
@@ -1292,7 +1292,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm,
                 {
                   next0 = icmp_in2out_slow_path 
                     (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node,
-                     next0, now, cpu_index, &s0);
+                     next0, now, thread_index, &s0);
                   goto trace0;
                 }
             }
@@ -1321,7 +1321,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm,
                     goto trace0;
 
                   next0 = slow_path (sm, b0, ip0, rx_fib_index0, &key0,
-                                     &s0, node, next0, cpu_index);
+                                     &s0, node, next0, thread_index);
 
                   if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP))
                     goto trace0;
@@ -1333,7 +1333,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm,
                 }
             }
           else
-            s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions,
+            s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions,
                                     value0.value);
 
           old_addr0 = ip0->src_address.as_u32;
@@ -1379,9 +1379,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm,
           /* Per-user LRU list maintenance for dynamic translation */
           if (!snat_is_session_static (s0))
             {
-              clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool,
+              clib_dlist_remove (sm->per_thread_data[thread_index].list_pool,
                                  s0->per_user_index);
-              clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool,
+              clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool,
                                   s0->per_user_list_head_index,
                                   s0->per_user_index);
             }
@@ -1397,7 +1397,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm,
               t->next_index = next0;
                   t->session_index = ~0;
               if (s0)
-                t->session_index = s0 - sm->per_thread_data[cpu_index].sessions;
+                t->session_index = s0 - sm->per_thread_data[thread_index].sessions;
             }
 
           pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP;
@@ -2010,7 +2010,7 @@ snat_in2out_worker_handoff_fn (vlib_main_t * vm,
   u32 n_left_to_next_worker = 0, *to_next_worker = 0;
   u32 next_worker_index = 0;
   u32 current_worker_index = ~0;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   ASSERT (vec_len (sm->workers));
 
@@ -2048,7 +2048,7 @@ snat_in2out_worker_handoff_fn (vlib_main_t * vm,
 
       next_worker_index = sm->worker_in2out_cb(ip0, rx_fib_index0);
 
-      if (PREDICT_FALSE (next_worker_index != cpu_index))
+      if (PREDICT_FALSE (next_worker_index != thread_index))
         {
           do_handoff = 1;
 
diff --git a/src/plugins/snat/out2in.c b/src/plugins/snat/out2in.c
index 656e42db..5d308d78 100644
--- a/src/plugins/snat/out2in.c
+++ b/src/plugins/snat/out2in.c
@@ -129,7 +129,7 @@ create_session_for_static_mapping (snat_main_t *sm,
                                    snat_session_key_t in2out,
                                    snat_session_key_t out2in,
                                    vlib_node_runtime_t * node,
-                                   u32 cpu_index)
+                                   u32 thread_index)
 {
   snat_user_t *u;
   snat_user_key_t user_key;
@@ -146,36 +146,36 @@ create_session_for_static_mapping (snat_main_t *sm,
   if (clib_bihash_search_8_8 (&sm->user_hash, &kv0, &value0))
     {
       /* no, make a new one */
-      pool_get (sm->per_thread_data[cpu_index].users, u);
+      pool_get (sm->per_thread_data[thread_index].users, u);
       memset (u, 0, sizeof (*u));
       u->addr = in2out.addr;
       u->fib_index = in2out.fib_index;
 
-      pool_get (sm->per_thread_data[cpu_index].list_pool,
+      pool_get (sm->per_thread_data[thread_index].list_pool,
                 per_user_list_head_elt);
 
       u->sessions_per_user_list_head_index = per_user_list_head_elt -
-        sm->per_thread_data[cpu_index].list_pool;
+        sm->per_thread_data[thread_index].list_pool;
 
-      clib_dlist_init (sm->per_thread_data[cpu_index].list_pool,
+      clib_dlist_init (sm->per_thread_data[thread_index].list_pool,
                        u->sessions_per_user_list_head_index);
 
-      kv0.value = u - sm->per_thread_data[cpu_index].users;
+      kv0.value = u - sm->per_thread_data[thread_index].users;
 
       /* add user */
       clib_bihash_add_del_8_8 (&sm->user_hash, &kv0, 1 /* is_add */);
 
       /* add non-traslated packets worker lookup */
-      kv0.value = cpu_index;
+      kv0.value = thread_index;
       clib_bihash_add_del_8_8 (&sm->worker_by_in, &kv0, 1);
     }
   else
     {
-      u = pool_elt_at_index (sm->per_thread_data[cpu_index].users,
+      u = pool_elt_at_index (sm->per_thread_data[thread_index].users,
                              value0.value);
     }
 
-  pool_get (sm->per_thread_data[cpu_index].sessions, s);
+  pool_get (sm->per_thread_data[thread_index].sessions, s);
   memset (s, 0, sizeof (*s));
 
   s->outside_address_index = ~0;
@@ -183,22 +183,22 @@ create_session_for_static_mapping (snat_main_t *sm,
   u->nstaticsessions++;
 
   /* Create list elts */
-  pool_get (sm->per_thread_data[cpu_index].list_pool,
+  pool_get (sm->per_thread_data[thread_index].list_pool,
             per_user_translation_list_elt);
-  clib_dlist_init (sm->per_thread_data[cpu_index].list_pool,
+  clib_dlist_init (sm->per_thread_data[thread_index].list_pool,
                    per_user_translation_list_elt -
-                   sm->per_thread_data[cpu_index].list_pool);
+                   sm->per_thread_data[thread_index].list_pool);
 
   per_user_translation_list_elt->value =
-    s - sm->per_thread_data[cpu_index].sessions;
+    s - sm->per_thread_data[thread_index].sessions;
   s->per_user_index =
-    per_user_translation_list_elt - sm->per_thread_data[cpu_index].list_pool;
+    per_user_translation_list_elt - sm->per_thread_data[thread_index].list_pool;
   s->per_user_list_head_index = u->sessions_per_user_list_head_index;
 
-  clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool,
+  clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool,
                       s->per_user_list_head_index,
                       per_user_translation_list_elt -
-                      sm->per_thread_data[cpu_index].list_pool);
+                      sm->per_thread_data[thread_index].list_pool);
 
   s->in2out = in2out;
   s->out2in = out2in;
@@ -206,12 +206,12 @@ create_session_for_static_mapping (snat_main_t *sm,
 
   /* Add to translation hashes */
   kv0.key = s->in2out.as_u64;
-  kv0.value = s - sm->per_thread_data[cpu_index].sessions;
+  kv0.value = s - sm->per_thread_data[thread_index].sessions;
   if (clib_bihash_add_del_8_8 (&sm->in2out, &kv0, 1 /* is_add */))
       clib_warning ("in2out key add failed");
 
   kv0.key = s->out2in.as_u64;
-  kv0.value = s - sm->per_thread_data[cpu_index].sessions;
+  kv0.value = s - sm->per_thread_data[thread_index].sessions;
 
   if (clib_bihash_add_del_8_8 (&sm->out2in, &kv0, 1 /* is_add */))
       clib_warning ("out2in key add failed");
@@ -298,7 +298,7 @@ is_interface_addr(snat_main_t *sm, vlib_node_runtime_t *node, u32 sw_if_index0,
  *
  * @param[in,out] sm             SNAT main
  * @param[in,out] node           SNAT node runtime
- * @param[in] cpu_index          CPU index
+ * @param[in] thread_index       thread index
  * @param[in,out] b0             buffer containing packet to be translated
  * @param[out] p_key             address and port before NAT translation
  * @param[out] p_value           address and port after NAT translation
@@ -306,7 +306,7 @@ is_interface_addr(snat_main_t *sm, vlib_node_runtime_t *node, u32 sw_if_index0,
  * @param d                      optional parameter
  */
 u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node,
-                           u32 cpu_index, vlib_buffer_t *b0,
+                           u32 thread_index, vlib_buffer_t *b0,
                            snat_session_key_t *p_key,
                            snat_session_key_t *p_value,
                            u8 *p_dont_translate, void *d)
@@ -366,7 +366,7 @@ u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node,
 
       /* Create session initiated by host from external network */
       s0 = create_session_for_static_mapping(sm, b0, sm0, key0,
-                                             node, cpu_index);
+                                             node, thread_index);
 
       if (!s0)
         {
@@ -375,7 +375,7 @@ u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node,
         }
     }
   else
-    s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions,
+    s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions,
                             value0.value);
 
 out:
@@ -393,7 +393,7 @@ out:
  *
  * @param[in] sm                 SNAT main
  * @param[in,out] node           SNAT node runtime
- * @param[in] cpu_index          CPU index
+ * @param[in] thread_index       thread index
  * @param[in,out] b0             buffer containing packet to be translated
  * @param[out] p_key             address and port before NAT translation
  * @param[out] p_value           address and port after NAT translation
@@ -401,7 +401,7 @@ out:
  * @param d                      optional parameter
  */
 u32 icmp_match_out2in_fast(snat_main_t *sm, vlib_node_runtime_t *node,
-                           u32 cpu_index, vlib_buffer_t *b0,
+                           u32 thread_index, vlib_buffer_t *b0,
                            snat_session_key_t *p_key,
                            snat_session_key_t *p_value,
                            u8 *p_dont_translate, void *d)
@@ -460,7 +460,7 @@ static inline u32 icmp_out2in (snat_main_t *sm,
                                u32 rx_fib_index0,
                                vlib_node_runtime_t * node,
                                u32 next0,
-                               u32 cpu_index,
+                               u32 thread_index,
                                void *d)
 {
   snat_session_key_t key0, sm0;
@@ -477,7 +477,7 @@ static inline u32 icmp_out2in (snat_main_t *sm,
 
   echo0 = (icmp_echo_header_t *)(icmp0+1);
 
-  next0_tmp = sm->icmp_match_out2in_cb(sm, node, cpu_index, b0,
+  next0_tmp = sm->icmp_match_out2in_cb(sm, node, thread_index, b0,
                                        &key0, &sm0, &dont_translate, d);
   if (next0_tmp != ~0)
     next0 = next0_tmp;
@@ -589,11 +589,11 @@ static inline u32 icmp_out2in_slow_path (snat_main_t *sm,
                                          u32 rx_fib_index0,
                                          vlib_node_runtime_t * node,
                                          u32 next0, f64 now,
-                                         u32 cpu_index,
+                                         u32 thread_index,
                                          snat_session_t ** p_s0)
 {
   next0 = icmp_out2in(sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node,
-                      next0, cpu_index, p_s0);
+                      next0, thread_index, p_s0);
   snat_session_t * s0 = *p_s0;
   if (PREDICT_TRUE(next0 != SNAT_OUT2IN_NEXT_DROP && s0))
     {
@@ -604,9 +604,9 @@ static inline u32 icmp_out2in_slow_path (snat_main_t *sm,
       /* Per-user LRU list maintenance for dynamic translation */
       if (!snat_is_session_static (s0))
         {
-          clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool,
+          clib_dlist_remove (sm->per_thread_data[thread_index].list_pool,
                              s0->per_user_index);
-          clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool,
+          clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool,
                               s0->per_user_list_head_index,
                               s0->per_user_index);
         }
@@ -624,7 +624,7 @@ snat_out2in_node_fn (vlib_main_t * vm,
   u32 pkts_processed = 0;
   snat_main_t * sm = &snat_main;
   f64 now = vlib_time_now (vm);
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
@@ -712,7 +712,7 @@ snat_out2in_node_fn (vlib_main_t * vm,
             {
               next0 = icmp_out2in_slow_path 
                 (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, 
-                 next0, now, cpu_index, &s0);
+                 next0, now, thread_index, &s0);
               goto trace0;
             }
 
@@ -743,7 +743,7 @@ snat_out2in_node_fn (vlib_main_t * vm,
 
               /* Create session initiated by host from external network */
               s0 = create_session_for_static_mapping(sm, b0, sm0, key0, node,
-                                                     cpu_index);
+                                                     thread_index);
               if (!s0)
                 {
                   b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION];
@@ -752,7 +752,7 @@ snat_out2in_node_fn (vlib_main_t * vm,
                 }
             }
           else
-            s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions,
+            s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions,
                                     value0.value);
 
           old_addr0 = ip0->dst_address.as_u32;
@@ -796,9 +796,9 @@ snat_out2in_node_fn (vlib_main_t * vm,
           /* Per-user LRU list maintenance for dynamic translation */
           if (!snat_is_session_static (s0))
             {
-              clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool,
+              clib_dlist_remove (sm->per_thread_data[thread_index].list_pool,
                                  s0->per_user_index);
-              clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool,
+              clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool,
                                   s0->per_user_list_head_index,
                                   s0->per_user_index);
             }
@@ -813,7 +813,7 @@ snat_out2in_node_fn (vlib_main_t * vm,
               t->next_index = next0;
               t->session_index = ~0;
               if (s0)
-                t->session_index = s0 - sm->per_thread_data[cpu_index].sessions;
+                t->session_index = s0 - sm->per_thread_data[thread_index].sessions;
             }
 
           pkts_processed += next0 != SNAT_OUT2IN_NEXT_DROP;
@@ -847,7 +847,7 @@ snat_out2in_node_fn (vlib_main_t * vm,
             {
               next1 = icmp_out2in_slow_path 
                 (sm, b1, ip1, icmp1, sw_if_index1, rx_fib_index1, node, 
-                 next1, now, cpu_index, &s1);
+                 next1, now, thread_index, &s1);
               goto trace1;
             }
 
@@ -878,7 +878,7 @@ snat_out2in_node_fn (vlib_main_t * vm,
 
               /* Create session initiated by host from external network */
               s1 = create_session_for_static_mapping(sm, b1, sm1, key1, node,
-                                                     cpu_index);
+                                                     thread_index);
               if (!s1)
                 {
                   b1->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION];
@@ -887,7 +887,7 @@ snat_out2in_node_fn (vlib_main_t * vm,
                 }
             }
           else
-            s1 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions,
+            s1 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions,
                                     value1.value);
 
           old_addr1 = ip1->dst_address.as_u32;
@@ -931,9 +931,9 @@ snat_out2in_node_fn (vlib_main_t * vm,
           /* Per-user LRU list maintenance for dynamic translation */
           if (!snat_is_session_static (s1))
             {
-              clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool,
+              clib_dlist_remove (sm->per_thread_data[thread_index].list_pool,
                                  s1->per_user_index);
-              clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool,
+              clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool,
                                   s1->per_user_list_head_index,
                                   s1->per_user_index);
             }
@@ -948,7 +948,7 @@ snat_out2in_node_fn (vlib_main_t * vm,
               t->next_index = next1;
               t->session_index = ~0;
               if (s1)
-                t->session_index = s1 - sm->per_thread_data[cpu_index].sessions;
+                t->session_index = s1 - sm->per_thread_data[thread_index].sessions;
             }
 
           pkts_processed += next1 != SNAT_OUT2IN_NEXT_DROP;
@@ -1016,7 +1016,7 @@ snat_out2in_node_fn (vlib_main_t * vm,
             {
               next0 = icmp_out2in_slow_path 
                 (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, 
-                 next0, now, cpu_index, &s0);
+                 next0, now, thread_index, &s0);
               goto trace00;
             }
 
@@ -1048,7 +1048,7 @@ snat_out2in_node_fn (vlib_main_t * vm,
 
               /* Create session initiated by host from external network */
               s0 = create_session_for_static_mapping(sm, b0, sm0, key0, node,
-                                                     cpu_index);
+                                                     thread_index);
               if (!s0)
                 {
                   b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION];
@@ -1057,7 +1057,7 @@ snat_out2in_node_fn (vlib_main_t * vm,
                 }
             }
           else
-            s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions,
+            s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions,
                                     value0.value);
 
           old_addr0 = ip0->dst_address.as_u32;
@@ -1101,9 +1101,9 @@ snat_out2in_node_fn (vlib_main_t * vm,
           /* Per-user LRU list maintenance for dynamic translation */
           if (!snat_is_session_static (s0))
             {
-              clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool,
+              clib_dlist_remove (sm->per_thread_data[thread_index].list_pool,
                                  s0->per_user_index);
-              clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool,
+              clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool,
                                   s0->per_user_list_head_index,
                                   s0->per_user_index);
             }
@@ -1118,7 +1118,7 @@ snat_out2in_node_fn (vlib_main_t * vm,
               t->next_index = next0;
               t->session_index = ~0;
               if (s0)
-                t->session_index = s0 - sm->per_thread_data[cpu_index].sessions;
+                t->session_index = s0 - sm->per_thread_data[thread_index].sessions;
             }
 
           pkts_processed += next0 != SNAT_OUT2IN_NEXT_DROP;
@@ -1599,7 +1599,7 @@ snat_out2in_worker_handoff_fn (vlib_main_t * vm,
   u32 n_left_to_next_worker = 0, *to_next_worker = 0;
   u32 next_worker_index = 0;
   u32 current_worker_index = ~0;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   ASSERT (vec_len (sm->workers));
 
@@ -1637,7 +1637,7 @@ snat_out2in_worker_handoff_fn (vlib_main_t * vm,
 
       next_worker_index = sm->worker_out2in_cb(ip0, rx_fib_index0);
 
-      if (PREDICT_FALSE (next_worker_index != cpu_index))
+      if (PREDICT_FALSE (next_worker_index != thread_index))
         {
           do_handoff = 1;
 
diff --git a/src/plugins/snat/snat.h b/src/plugins/snat/snat.h
index 017825c0..f4e1c5c0 100644
--- a/src/plugins/snat/snat.h
+++ b/src/plugins/snat/snat.h
@@ -221,7 +221,7 @@ struct snat_main_s;
 
 typedef u32 snat_icmp_match_function_t (struct snat_main_s *sm,
                                         vlib_node_runtime_t *node,
-                                        u32 cpu_index,
+                                        u32 thread_index,
                                         vlib_buffer_t *b0,
                                         snat_session_key_t *p_key,
                                         snat_session_key_t *p_value,
@@ -402,22 +402,22 @@ typedef struct {
 } tcp_udp_header_t;
 
 u32 icmp_match_in2out_fast(snat_main_t *sm, vlib_node_runtime_t *node,
-                           u32 cpu_index, vlib_buffer_t *b0,
+                           u32 thread_index, vlib_buffer_t *b0,
                            snat_session_key_t *p_key,
                            snat_session_key_t *p_value,
                            u8 *p_dont_translate, void *d);
 u32 icmp_match_in2out_slow(snat_main_t *sm, vlib_node_runtime_t *node,
-                           u32 cpu_index, vlib_buffer_t *b0,
+                           u32 thread_index, vlib_buffer_t *b0,
                            snat_session_key_t *p_key,
                            snat_session_key_t *p_value,
                            u8 *p_dont_translate, void *d);
 u32 icmp_match_out2in_fast(snat_main_t *sm, vlib_node_runtime_t *node,
-                           u32 cpu_index, vlib_buffer_t *b0,
+                           u32 thread_index, vlib_buffer_t *b0,
                            snat_session_key_t *p_key,
                            snat_session_key_t *p_value,
                            u8 *p_dont_translate, void *d);
 u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node,
-                           u32 cpu_index, vlib_buffer_t *b0,
+                           u32 thread_index, vlib_buffer_t *b0,
                            snat_session_key_t *p_key,
                            snat_session_key_t *p_value,
                            u8 *p_dont_translate, void *d);
diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c
index a517a597..be3b41ef 100644
--- a/src/vlib/buffer.c
+++ b/src/vlib/buffer.c
@@ -299,7 +299,7 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm,
   if (CLIB_DEBUG == 0)
     return;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   /* smp disaster check */
   if (vec_len (vlib_mains) > 1)
@@ -355,7 +355,7 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm,
   vlib_buffer_free_list_t *f;
   int i;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   if (!is_default && pool_elts (bm->buffer_free_list_pool) == 0)
     {
@@ -474,7 +474,7 @@ vlib_buffer_delete_free_list_internal (vlib_main_t * vm, u32 free_list_index)
   u32 merge_index;
   int i;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   f = vlib_buffer_get_free_list (vm, free_list_index);
 
diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h
index 394c336a..328660a3 100644
--- a/src/vlib/buffer_funcs.h
+++ b/src/vlib/buffer_funcs.h
@@ -209,7 +209,7 @@ always_inline vlib_buffer_known_state_t
 vlib_buffer_is_known (vlib_main_t * vm, u32 buffer_index)
 {
   vlib_buffer_main_t *bm = vm->buffer_main;
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   uword *p = hash_get (bm->buffer_known_hash, buffer_index);
   return p ? p[0] : VLIB_BUFFER_UNKNOWN;
@@ -221,7 +221,7 @@ vlib_buffer_set_known_state (vlib_main_t * vm,
 			     vlib_buffer_known_state_t state)
 {
   vlib_buffer_main_t *bm = vm->buffer_main;
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
   hash_set (bm->buffer_known_hash, buffer_index, state);
 }
 
diff --git a/src/vlib/cli.c b/src/vlib/cli.c
index f853f655..3cc95076 100644
--- a/src/vlib/cli.c
+++ b/src/vlib/cli.c
@@ -709,7 +709,7 @@ test_heap_validate (vlib_main_t * vm, unformat_input_t * input,
     {
         /* *INDENT-OFF* */
         foreach_vlib_main({
-          heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index];
+          heap = clib_per_cpu_mheaps[this_vlib_main->thread_index];
           mheap = mheap_header(heap);
           mheap->flags |= MHEAP_FLAG_VALIDATE;
           // Turn off small object cache because it delays detection of errors
@@ -722,7 +722,7 @@ test_heap_validate (vlib_main_t * vm, unformat_input_t * input,
     {
         /* *INDENT-OFF* */
         foreach_vlib_main({
-          heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index];
+          heap = clib_per_cpu_mheaps[this_vlib_main->thread_index];
           mheap = mheap_header(heap);
           mheap->flags &= ~MHEAP_FLAG_VALIDATE;
           mheap->flags |= MHEAP_FLAG_SMALL_OBJECT_CACHE;
@@ -733,7 +733,7 @@ test_heap_validate (vlib_main_t * vm, unformat_input_t * input,
     {
         /* *INDENT-OFF* */
         foreach_vlib_main({
-          heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index];
+          heap = clib_per_cpu_mheaps[this_vlib_main->thread_index];
           mheap = mheap_header(heap);
           mheap_validate(heap);
         });
diff --git a/src/vlib/counter.h b/src/vlib/counter.h
index 17a85217..60e2055d 100644
--- a/src/vlib/counter.h
+++ b/src/vlib/counter.h
@@ -70,17 +70,17 @@ u32 vlib_simple_counter_n_counters (const vlib_simple_counter_main_t * cm);
 
 /** Increment a simple counter
     @param cm - (vlib_simple_counter_main_t *) simple counter main pointer
-    @param cpu_index - (u32) the current cpu index
+    @param thread_index - (u32) the current thread index
     @param index - (u32) index of the counter to increment
     @param increment - (u64) quantitiy to add to the counter
 */
 always_inline void
 vlib_increment_simple_counter (vlib_simple_counter_main_t * cm,
-			       u32 cpu_index, u32 index, u64 increment)
+			       u32 thread_index, u32 index, u64 increment)
 {
   counter_t *my_counters;
 
-  my_counters = cm->counters[cpu_index];
+  my_counters = cm->counters[thread_index];
   my_counters[index] += increment;
 }
 
@@ -201,7 +201,7 @@ void vlib_clear_combined_counters (vlib_combined_counter_main_t * cm);
 
 /** Increment a combined counter
     @param cm - (vlib_combined_counter_main_t *) comined counter main pointer
-    @param cpu_index - (u32) the current cpu index
+    @param thread_index - (u32) the current thread index
     @param index - (u32) index of the counter to increment
     @param packet_increment - (u64) number of packets to add to the counter
     @param byte_increment - (u64) number of bytes to add to the counter
@@ -209,13 +209,13 @@ void vlib_clear_combined_counters (vlib_combined_counter_main_t * cm);
 
 always_inline void
 vlib_increment_combined_counter (vlib_combined_counter_main_t * cm,
-				 u32 cpu_index,
+				 u32 thread_index,
 				 u32 index, u64 n_packets, u64 n_bytes)
 {
   vlib_counter_t *my_counters;
 
   /* Use this CPU's counter array */
-  my_counters = cm->counters[cpu_index];
+  my_counters = cm->counters[thread_index];
 
   my_counters[index].packets += n_packets;
   my_counters[index].bytes += n_bytes;
@@ -224,14 +224,14 @@ vlib_increment_combined_counter (vlib_combined_counter_main_t * cm,
 /** Pre-fetch a per-thread combined counter for the given object index */
 always_inline void
 vlib_prefetch_combined_counter (const vlib_combined_counter_main_t * cm,
-				u32 cpu_index, u32 index)
+				u32 thread_index, u32 index)
 {
   vlib_counter_t *cpu_counters;
 
   /*
    * This CPU's index is assumed to already be in cache
    */
-  cpu_counters = cm->counters[cpu_index];
+  cpu_counters = cm->counters[thread_index];
   CLIB_PREFETCH (cpu_counters + index, CLIB_CACHE_LINE_BYTES, STORE);
 }
 
diff --git a/src/vlib/error.c b/src/vlib/error.c
index a2c23176..e4ed4ee3 100644
--- a/src/vlib/error.c
+++ b/src/vlib/error.c
@@ -149,7 +149,7 @@ vlib_register_errors (vlib_main_t * vm,
   vlib_node_t *n = vlib_get_node (vm, node_index);
   uword l;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   /* Free up any previous error strings. */
   if (n->n_errors > 0)
diff --git a/src/vlib/global_funcs.h b/src/vlib/global_funcs.h
index f51ec381..9dd01fbf 100644
--- a/src/vlib/global_funcs.h
+++ b/src/vlib/global_funcs.h
@@ -23,7 +23,7 @@ always_inline vlib_main_t *
 vlib_get_main (void)
 {
   vlib_main_t *vm;
-  vm = vlib_mains[os_get_cpu_number ()];
+  vm = vlib_mains[vlib_get_thread_index ()];
   ASSERT (vm);
   return vm;
 }
diff --git a/src/vlib/main.c b/src/vlib/main.c
index b22203f0..422d3e26 100644
--- a/src/vlib/main.c
+++ b/src/vlib/main.c
@@ -136,18 +136,18 @@ vlib_frame_alloc_to_node (vlib_main_t * vm, u32 to_node_index,
   else
     {
       f = clib_mem_alloc_aligned_no_fail (n, VLIB_FRAME_ALIGN);
-      f->cpu_index = vm->cpu_index;
+      f->thread_index = vm->thread_index;
       fi = vlib_frame_index_no_check (vm, f);
     }
 
   /* Poison frame when debugging. */
   if (CLIB_DEBUG > 0)
     {
-      u32 save_cpu_index = f->cpu_index;
+      u32 save_thread_index = f->thread_index;
 
       memset (f, 0xfe, n);
 
-      f->cpu_index = save_cpu_index;
+      f->thread_index = save_thread_index;
     }
 
   /* Insert magic number. */
@@ -517,7 +517,7 @@ vlib_put_next_frame (vlib_main_t * vm,
 	   * a dangling frame reference. Each thread has its own copy of
 	   * the next_frames vector.
 	   */
-	  if (0 && r->cpu_index != next_runtime->cpu_index)
+	  if (0 && r->thread_index != next_runtime->thread_index)
 	    {
 	      nf->frame_index = ~0;
 	      nf->flags &= ~(VLIB_FRAME_PENDING | VLIB_FRAME_IS_ALLOCATED);
@@ -866,7 +866,7 @@ vlib_elog_main_loop_event (vlib_main_t * vm,
 				  : evm->node_call_elog_event_types,
 				  node_index),
 		/* track */
-		(vm->cpu_index ? &vlib_worker_threads[vm->cpu_index].
+		(vm->thread_index ? &vlib_worker_threads[vm->thread_index].
 		 elog_track : &em->default_track),
 		/* data to log */ n_vectors);
 }
@@ -963,7 +963,7 @@ dispatch_node (vlib_main_t * vm,
 
   vm->cpu_time_last_node_dispatch = last_time_stamp;
 
-  if (1 /* || vm->cpu_index == node->cpu_index */ )
+  if (1 /* || vm->thread_index == node->thread_index */ )
     {
       vlib_main_t *stat_vm;
 
@@ -1029,7 +1029,7 @@ dispatch_node (vlib_main_t * vm,
 	  {
 	    u32 node_name, vector_length, is_polling;
 	  } *ed;
-	  vlib_worker_thread_t *w = vlib_worker_threads + vm->cpu_index;
+	  vlib_worker_thread_t *w = vlib_worker_threads + vm->thread_index;
 #endif
 
 	  if ((dispatch_state == VLIB_NODE_STATE_INTERRUPT
diff --git a/src/vlib/main.h b/src/vlib/main.h
index 0197b4f3..329bf073 100644
--- a/src/vlib/main.h
+++ b/src/vlib/main.h
@@ -156,7 +156,7 @@ typedef struct vlib_main_t
   uword *init_functions_called;
 
   /* to compare with node runtime */
-  u32 cpu_index;
+  u32 thread_index;
 
   void **mbuf_alloc_list;
 
diff --git a/src/vlib/node.c b/src/vlib/node.c
index dc0a4de5..bbd3a42e 100644
--- a/src/vlib/node.c
+++ b/src/vlib/node.c
@@ -99,7 +99,7 @@ vlib_node_runtime_update (vlib_main_t * vm, u32 node_index, u32 next_index)
   vlib_pending_frame_t *pf;
   i32 i, j, n_insert;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   vlib_worker_thread_barrier_sync (vm);
 
diff --git a/src/vlib/node.h b/src/vlib/node.h
index fc7e7da2..1e2f4c38 100644
--- a/src/vlib/node.h
+++ b/src/vlib/node.h
@@ -344,8 +344,8 @@ typedef struct vlib_frame_t
   /* Number of vector elements currently in frame. */
   u16 n_vectors;
 
-  /* Owner cpuid / heap id */
-  u16 cpu_index;
+  /* Owner thread / heap id */
+  u16 thread_index;
 
   /* Scalar and vector arguments to next node. */
   u8 arguments[0];
@@ -459,7 +459,7 @@ typedef struct vlib_node_runtime_t
 					  zero before first run of this
 					  node. */
 
-  u16 cpu_index;			/**< CPU this node runs on */
+  u16 thread_index;			/**< thread this node runs on */
 
   u8 runtime_data[0];			/**< Function dependent
 					  node-runtime data. This data is
diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h
index 1f7d94e1..54e36874 100644
--- a/src/vlib/node_funcs.h
+++ b/src/vlib/node_funcs.h
@@ -201,9 +201,9 @@ always_inline vlib_frame_t *
 vlib_get_frame_no_check (vlib_main_t * vm, uword frame_index)
 {
   vlib_frame_t *f;
-  u32 cpu_index = frame_index & VLIB_CPU_MASK;
+  u32 thread_index = frame_index & VLIB_CPU_MASK;
   u32 offset = frame_index & VLIB_OFFSET_MASK;
-  vm = vlib_mains[cpu_index];
+  vm = vlib_mains[thread_index];
   f = vm->heap_base + offset;
   return f;
 }
@@ -215,10 +215,10 @@ vlib_frame_index_no_check (vlib_main_t * vm, vlib_frame_t * f)
 
   ASSERT (((uword) f & VLIB_CPU_MASK) == 0);
 
-  vm = vlib_mains[f->cpu_index];
+  vm = vlib_mains[f->thread_index];
 
   i = ((u8 *) f - (u8 *) vm->heap_base);
-  return i | f->cpu_index;
+  return i | f->thread_index;
 }
 
 always_inline vlib_frame_t *
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index ef3a24d3..4a111f8d 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -35,27 +35,12 @@ vl (void *p)
 vlib_worker_thread_t *vlib_worker_threads;
 vlib_thread_main_t vlib_thread_main;
 
+__thread uword vlib_thread_index = 0;
+
 uword
 os_get_cpu_number (void)
 {
-  void *sp;
-  uword n;
-  u32 len;
-
-  len = vec_len (vlib_thread_stacks);
-  if (len == 0)
-    return 0;
-
-  /* Get any old stack address. */
-  sp = &sp;
-
-  n = ((uword) sp - (uword) vlib_thread_stacks[0])
-    >> VLIB_LOG2_THREAD_STACK_SIZE;
-
-  /* "processes" have their own stacks, and they always run in thread 0 */
-  n = n >= len ? 0 : n;
-
-  return n;
+  return vlib_thread_index;
 }
 
 uword
@@ -275,21 +260,6 @@ vlib_thread_init (vlib_main_t * vm)
   return 0;
 }
 
-vlib_worker_thread_t *
-vlib_alloc_thread (vlib_main_t * vm)
-{
-  vlib_worker_thread_t *w;
-
-  if (vec_len (vlib_worker_threads) >= vec_len (vlib_thread_stacks))
-    {
-      clib_warning ("out of worker threads... Quitting...");
-      exit (1);
-    }
-  vec_add2 (vlib_worker_threads, w, 1);
-  w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads];
-  return w;
-}
-
 vlib_frame_queue_t *
 vlib_frame_queue_alloc (int nelts)
 {
@@ -427,7 +397,7 @@ vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index,
       f64 b4 = vlib_time_now_ticks (vm, before);
       vlib_worker_thread_barrier_check (vm, b4);
       /* Bad idea. Dequeue -> enqueue -> dequeue -> trouble */
-      // vlib_frame_queue_dequeue (vm->cpu_index, vm, nm);
+      // vlib_frame_queue_dequeue (vm->thread_index, vm, nm);
     }
 
   elt = fq->elts + (new_tail & (fq->nelts - 1));
@@ -497,6 +467,8 @@ vlib_worker_thread_bootstrap_fn (void *arg)
   w->lwp = syscall (SYS_gettid);
   w->thread_id = pthread_self ();
 
+  vlib_thread_index = w - vlib_worker_threads;
+
   rv = (void *) clib_calljmp
     ((uword (*)(uword)) w->thread_function,
      (uword) arg, w->thread_stack + VLIB_THREAD_STACK_SIZE);
@@ -610,7 +582,9 @@ start_workers (vlib_main_t * vm)
 		  mheap_alloc (0 /* use VM */ , tr->mheap_size);
 	      else
 		w->thread_mheap = main_heap;
-	      w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads];
+
+	      w->thread_stack =
+		vlib_thread_stack_init (w - vlib_worker_threads);
 	      w->thread_function = tr->function;
 	      w->thread_function_arg = w;
 	      w->instance_id = k;
@@ -630,7 +604,7 @@ start_workers (vlib_main_t * vm)
 	      vm_clone = clib_mem_alloc (sizeof (*vm_clone));
 	      clib_memcpy (vm_clone, vlib_mains[0], sizeof (*vm_clone));
 
-	      vm_clone->cpu_index = worker_thread_index;
+	      vm_clone->thread_index = worker_thread_index;
 	      vm_clone->heap_base = w->thread_mheap;
 	      vm_clone->mbuf_alloc_list = 0;
 	      vm_clone->init_functions_called =
@@ -679,7 +653,7 @@ start_workers (vlib_main_t * vm)
 	      vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT])
 	      {
 		vlib_node_t *n = vlib_get_node (vm, rt->node_index);
-		rt->cpu_index = vm_clone->cpu_index;
+		rt->thread_index = vm_clone->thread_index;
 		/* copy initial runtime_data from node */
 		if (n->runtime_data && n->runtime_data_bytes > 0)
 		  clib_memcpy (rt->runtime_data, n->runtime_data,
@@ -692,7 +666,7 @@ start_workers (vlib_main_t * vm)
 	      vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT])
 	      {
 		vlib_node_t *n = vlib_get_node (vm, rt->node_index);
-		rt->cpu_index = vm_clone->cpu_index;
+		rt->thread_index = vm_clone->thread_index;
 		/* copy initial runtime_data from node */
 		if (n->runtime_data && n->runtime_data_bytes > 0)
 		  clib_memcpy (rt->runtime_data, n->runtime_data,
@@ -756,7 +730,8 @@ start_workers (vlib_main_t * vm)
 		  mheap_alloc (0 /* use VM */ , tr->mheap_size);
 	      else
 		w->thread_mheap = main_heap;
-	      w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads];
+	      w->thread_stack =
+		vlib_thread_stack_init (w - vlib_worker_threads);
 	      w->thread_function = tr->function;
 	      w->thread_function_arg = w;
 	      w->instance_id = j;
@@ -827,7 +802,7 @@ vlib_worker_thread_node_runtime_update (void)
 				  uword n_calls,
 				  uword n_vectors, uword n_clocks);
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   if (vec_len (vlib_mains) == 1)
     return;
@@ -835,7 +810,7 @@ vlib_worker_thread_node_runtime_update (void)
   vm = vlib_mains[0];
   nm = &vm->node_main;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
   ASSERT (*vlib_worker_threads->wait_at_barrier == 1);
 
   /*
@@ -955,7 +930,7 @@ vlib_worker_thread_node_runtime_update (void)
       vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL])
       {
 	vlib_node_t *n = vlib_get_node (vm, rt->node_index);
-	rt->cpu_index = vm_clone->cpu_index;
+	rt->thread_index = vm_clone->thread_index;
 	/* copy runtime_data, will be overwritten later for existing rt */
 	if (n->runtime_data && n->runtime_data_bytes > 0)
 	  clib_memcpy (rt->runtime_data, n->runtime_data,
@@ -981,7 +956,7 @@ vlib_worker_thread_node_runtime_update (void)
       vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT])
       {
 	vlib_node_t *n = vlib_get_node (vm, rt->node_index);
-	rt->cpu_index = vm_clone->cpu_index;
+	rt->thread_index = vm_clone->thread_index;
 	/* copy runtime_data, will be overwritten later for existing rt */
 	if (n->runtime_data && n->runtime_data_bytes > 0)
 	  clib_memcpy (rt->runtime_data, n->runtime_data,
@@ -1180,7 +1155,7 @@ vlib_worker_thread_fork_fixup (vlib_fork_fixup_t which)
   if (vlib_mains == 0)
     return;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
   vlib_worker_thread_barrier_sync (vm);
 
   switch (which)
@@ -1212,7 +1187,7 @@ vlib_worker_thread_barrier_sync (vlib_main_t * vm)
 
   vlib_worker_threads[0].barrier_sync_count++;
 
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT;
 
@@ -1260,7 +1235,7 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm)
 int
 vlib_frame_queue_dequeue (vlib_main_t * vm, vlib_frame_queue_main_t * fqm)
 {
-  u32 thread_id = vm->cpu_index;
+  u32 thread_id = vm->thread_index;
   vlib_frame_queue_t *fq = fqm->vlib_frame_queues[thread_id];
   vlib_frame_queue_elt_t *elt;
   u32 *from, *to;
@@ -1393,7 +1368,7 @@ vlib_worker_thread_fn (void *arg)
   vlib_main_t *vm = vlib_get_main ();
   clib_error_t *e;
 
-  ASSERT (vm->cpu_index == os_get_cpu_number ());
+  ASSERT (vm->thread_index == vlib_get_thread_index ());
 
   vlib_worker_thread_init (w);
   clib_time_init (&vm->clib_time);
diff --git a/src/vlib/threads.h b/src/vlib/threads.h
index eca4fc26..101d3d4a 100644
--- a/src/vlib/threads.h
+++ b/src/vlib/threads.h
@@ -153,8 +153,6 @@ typedef struct
 /* Called early, in thread 0's context */
 clib_error_t *vlib_thread_init (vlib_main_t * vm);
 
-vlib_worker_thread_t *vlib_alloc_thread (vlib_main_t * vm);
-
 int vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index,
 			      u32 frame_queue_index, vlib_frame_t * frame,
 			      vlib_frame_queue_msg_type_t type);
@@ -183,12 +181,19 @@ u32 vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts);
 void vlib_worker_thread_barrier_sync (vlib_main_t * vm);
 void vlib_worker_thread_barrier_release (vlib_main_t * vm);
 
+extern __thread uword vlib_thread_index;
+static_always_inline uword
+vlib_get_thread_index (void)
+{
+  return vlib_thread_index;
+}
+
 always_inline void
 vlib_smp_unsafe_warning (void)
 {
   if (CLIB_DEBUG > 0)
     {
-      if (os_get_cpu_number ())
+      if (vlib_get_thread_index ())
 	fformat (stderr, "%s: SMP unsafe warning...\n", __FUNCTION__);
     }
 }
@@ -331,21 +336,21 @@ vlib_num_workers ()
 }
 
 always_inline u32
-vlib_get_worker_cpu_index (u32 worker_index)
+vlib_get_worker_thread_index (u32 worker_index)
 {
   return worker_index + 1;
 }
 
 always_inline u32
-vlib_get_worker_index (u32 cpu_index)
+vlib_get_worker_index (u32 thread_index)
 {
-  return cpu_index - 1;
+  return thread_index - 1;
 }
 
 always_inline u32
 vlib_get_current_worker_index ()
 {
-  return os_get_cpu_number () - 1;
+  return vlib_get_thread_index () - 1;
 }
 
 static inline void
@@ -467,6 +472,8 @@ vlib_get_worker_handoff_queue_elt (u32 frame_queue_index,
   return elt;
 }
 
+u8 *vlib_thread_stack_init (uword thread_index);
+
 int vlib_thread_cb_register (struct vlib_main_t *vm,
 			     vlib_thread_callbacks_t * cb);
 
diff --git a/src/vlib/unix/cj.c b/src/vlib/unix/cj.c
index 33ba163a..7c1e9475 100644
--- a/src/vlib/unix/cj.c
+++ b/src/vlib/unix/cj.c
@@ -48,7 +48,7 @@ cj_log (u32 type, void *data0, void *data1)
 
   r = (cj_record_t *) & (cjm->records[new_tail & (cjm->num_records - 1)]);
   r->time = vlib_time_now (cjm->vlib_main);
-  r->cpu = os_get_cpu_number ();
+  r->thread_index = vlib_get_thread_index ();
   r->type = type;
   r->data[0] = pointer_to_uword (data0);
   r->data[1] = pointer_to_uword (data1);
@@ -133,7 +133,8 @@ static inline void
 cj_dump_one_record (cj_record_t * r)
 {
   fprintf (stderr, "[%d]: %10.6f T%02d %llx %llx\n",
-	   r->cpu, r->time, r->type, (long long unsigned int) r->data[0],
+	   r->thread_index, r->time, r->type,
+	   (long long unsigned int) r->data[0],
 	   (long long unsigned int) r->data[1]);
 }
 
@@ -161,7 +162,7 @@ cj_dump_internal (u8 filter0_enable, u64 filter0,
   index = (cjm->tail + 1) & (cjm->num_records - 1);
   r = &(cjm->records[index]);
 
-  if (r->cpu != (u32) ~ 0)
+  if (r->thread_index != (u32) ~ 0)
     {
       /* Yes, dump from tail + 1 to the end */
       for (i = index; i < cjm->num_records; i++)
diff --git a/src/vlib/unix/cj.h b/src/vlib/unix/cj.h
index 67626afe..d0a1d46e 100644
--- a/src/vlib/unix/cj.h
+++ b/src/vlib/unix/cj.h
@@ -23,7 +23,7 @@
 typedef struct
 {
   f64 time;
-  u32 cpu;
+  u32 thread_index;
   u32 type;
   u64 data[2];
 } cj_record_t;
diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c
index 6b96cc0d..db5ddd64 100644
--- a/src/vlib/unix/main.c
+++ b/src/vlib/unix/main.c
@@ -510,13 +510,28 @@ thread0 (uword arg)
   return i;
 }
 
+u8 *
+vlib_thread_stack_init (uword thread_index)
+{
+  vec_validate (vlib_thread_stacks, thread_index);
+  vlib_thread_stacks[thread_index] = clib_mem_alloc_aligned
+    (VLIB_THREAD_STACK_SIZE, VLIB_THREAD_STACK_SIZE);
+
+  /*
+   * Disallow writes to the bottom page of the stack, to
+   * catch stack overflows.
+   */
+  if (mprotect (vlib_thread_stacks[thread_index],
+		clib_mem_get_page_size (), PROT_READ) < 0)
+    clib_unix_warning ("thread stack");
+  return vlib_thread_stacks[thread_index];
+}
+
 int
 vlib_unix_main (int argc, char *argv[])
 {
   vlib_main_t *vm = &vlib_global_main;	/* one and only time for this! */
-  vlib_thread_main_t *tm = &vlib_thread_main;
   unformat_input_t input;
-  u8 *thread_stacks;
   clib_error_t *e;
   int i;
 
@@ -548,29 +563,9 @@ vlib_unix_main (int argc, char *argv[])
     }
   unformat_free (&input);
 
-  /*
-   * allocate n x VLIB_THREAD_STACK_SIZE stacks, aligned to a
-   * VLIB_THREAD_STACK_SIZE boundary
-   * See also: os_get_cpu_number() in vlib/vlib/threads.c
-   */
-  thread_stacks = clib_mem_alloc_aligned
-    ((uword) tm->n_thread_stacks * VLIB_THREAD_STACK_SIZE,
-     VLIB_THREAD_STACK_SIZE);
-
-  vec_validate (vlib_thread_stacks, tm->n_thread_stacks - 1);
-  for (i = 0; i < vec_len (vlib_thread_stacks); i++)
-    {
-      vlib_thread_stacks[i] = thread_stacks;
-
-      /*
-       * Disallow writes to the bottom page of the stack, to
-       * catch stack overflows.
-       */
-      if (mprotect (thread_stacks, clib_mem_get_page_size (), PROT_READ) < 0)
-	clib_unix_warning ("thread stack");
+  vlib_thread_stack_init (0);
 
-      thread_stacks += VLIB_THREAD_STACK_SIZE;
-    }
+  vlib_thread_index = 0;
 
   i = clib_calljmp (thread0, (uword) vm,
 		    (void *) (vlib_thread_stacks[0] +
diff --git a/src/vnet/adj/adj_l2.c b/src/vnet/adj/adj_l2.c
index f68e54e0..20d70dd4 100644
--- a/src/vnet/adj/adj_l2.c
+++ b/src/vnet/adj/adj_l2.c
@@ -52,7 +52,7 @@ adj_l2_rewrite_inline (vlib_main_t * vm,
 {
     u32 * from = vlib_frame_vector_args (frame);
     u32 n_left_from, n_left_to_next, * to_next, next_index;
-    u32 cpu_index = os_get_cpu_number();
+    u32 thread_index = vlib_get_thread_index();
     ethernet_main_t * em = &ethernet_main;
 
     n_left_from = frame->n_vectors;
@@ -93,7 +93,7 @@ adj_l2_rewrite_inline (vlib_main_t * vm,
             vnet_buffer(p0)->sw_if_index[VLIB_TX] = adj0->rewrite_header.sw_if_index;
 
 	    vlib_increment_combined_counter(&adjacency_counters,
-                                            cpu_index,
+                                            thread_index,
                                             adj_index0,
                                             /* packet increment */ 0,
                                             /* byte increment */ rw_len0);
diff --git a/src/vnet/adj/adj_midchain.c b/src/vnet/adj/adj_midchain.c
index e8087f08..5756de43 100644
--- a/src/vnet/adj/adj_midchain.c
+++ b/src/vnet/adj/adj_midchain.c
@@ -49,7 +49,7 @@ adj_midchain_tx_inline (vlib_main_t * vm,
     u32 next_index;
     vnet_main_t *vnm = vnet_get_main ();
     vnet_interface_main_t *im = &vnm->interface_main;
-    u32 cpu_index = vm->cpu_index;
+    u32 thread_index = vm->thread_index;
 
     /* Vector of buffer / pkt indices we're supposed to process */
     from = vlib_frame_vector_args (frame);
@@ -124,13 +124,13 @@ adj_midchain_tx_inline (vlib_main_t * vm,
 	    {
 		vlib_increment_combined_counter (im->combined_sw_if_counters
 						 + VNET_INTERFACE_COUNTER_TX,
-						 cpu_index,
+						 thread_index,
 						 adj0->rewrite_header.sw_if_index,
 						 1,
 						 vlib_buffer_length_in_chain (vm, b0));
 		vlib_increment_combined_counter (im->combined_sw_if_counters
 						 + VNET_INTERFACE_COUNTER_TX,
-						 cpu_index,
+						 thread_index,
 						 adj1->rewrite_header.sw_if_index,
 						 1,
 						 vlib_buffer_length_in_chain (vm, b1));
@@ -181,7 +181,7 @@ adj_midchain_tx_inline (vlib_main_t * vm,
 	    {
 		vlib_increment_combined_counter (im->combined_sw_if_counters
 						 + VNET_INTERFACE_COUNTER_TX,
-						 cpu_index,
+						 thread_index,
 						 adj0->rewrite_header.sw_if_index,
 						 1,
 						 vlib_buffer_length_in_chain (vm, b0));
diff --git a/src/vnet/adj/adj_nsh.c b/src/vnet/adj/adj_nsh.c
index 9a0f9d8b..128570b0 100644
--- a/src/vnet/adj/adj_nsh.c
+++ b/src/vnet/adj/adj_nsh.c
@@ -53,7 +53,7 @@ adj_nsh_rewrite_inline (vlib_main_t * vm,
 {
     u32 * from = vlib_frame_vector_args (frame);
     u32 n_left_from, n_left_to_next, * to_next, next_index;
-    u32 cpu_index = os_get_cpu_number();
+    u32 thread_index = vlib_get_thread_index();
 
     n_left_from = frame->n_vectors;
     next_index = node->cached_next_index;
@@ -94,7 +94,7 @@ adj_nsh_rewrite_inline (vlib_main_t * vm,
             vnet_buffer(p0)->ip.save_rewrite_length = rw_len0;
 
             vlib_increment_combined_counter(&adjacency_counters,
-                                            cpu_index,
+                                            thread_index,
                                             adj_index0,
                                             /* packet increment */ 0,
                                             /* byte increment */ rw_len0);
diff --git a/src/vnet/classify/vnet_classify.c b/src/vnet/classify/vnet_classify.c
index 98842a48..70a189b0 100644
--- a/src/vnet/classify/vnet_classify.c
+++ b/src/vnet/classify/vnet_classify.c
@@ -251,12 +251,12 @@ static inline void make_working_copy
   vnet_classify_entry_##size##_t * working_copy##size = 0;
   foreach_size_in_u32x4;
 #undef _
-  u32 cpu_number = os_get_cpu_number();
+  u32 thread_index = vlib_get_thread_index();
 
-  if (cpu_number >= vec_len (t->working_copies))
+  if (thread_index >= vec_len (t->working_copies))
     {
       oldheap = clib_mem_set_heap (t->mheap);
-      vec_validate (t->working_copies, cpu_number);
+      vec_validate (t->working_copies, thread_index);
       clib_mem_set_heap (oldheap);
     }
 
@@ -265,7 +265,7 @@ static inline void make_working_copy
    * updates from multiple threads will not result in sporadic, spurious
    * lookup failures. 
    */
-  working_copy = t->working_copies[cpu_number];
+  working_copy = t->working_copies[thread_index];
 
   t->saved_bucket.as_u64 = b->as_u64;
   oldheap = clib_mem_set_heap (t->mheap);
@@ -290,7 +290,7 @@ static inline void make_working_copy
         default:
           abort();
         }
-      t->working_copies[cpu_number] = working_copy;
+      t->working_copies[thread_index] = working_copy;
     }
 
   _vec_len(working_copy) = (1<<b->log2_pages)*t->entries_per_page;
@@ -318,7 +318,7 @@ static inline void make_working_copy
   working_bucket.offset = vnet_classify_get_offset (t, working_copy);
   CLIB_MEMORY_BARRIER();
   b->as_u64 = working_bucket.as_u64;
-  t->working_copies[cpu_number] = working_copy;
+  t->working_copies[thread_index] = working_copy;
 }
 
 static vnet_classify_entry_t *
@@ -387,7 +387,7 @@ int vnet_classify_add_del (vnet_classify_table_t * t,
   int i;
   u64 hash, new_hash;
   u32 new_log2_pages;
-  u32 cpu_number = os_get_cpu_number();
+  u32 thread_index = vlib_get_thread_index();
   u8 * key_minus_skip;
 
   ASSERT ((add_v->flags & VNET_CLASSIFY_ENTRY_FREE) == 0);
@@ -498,7 +498,7 @@ int vnet_classify_add_del (vnet_classify_table_t * t,
   new_log2_pages = t->saved_bucket.log2_pages + 1;
 
  expand_again:
-  working_copy = t->working_copies[cpu_number];
+  working_copy = t->working_copies[thread_index];
   new_v = split_and_rehash (t, working_copy, new_log2_pages);
 
   if (new_v == 0)
diff --git a/src/vnet/cop/ip4_whitelist.c b/src/vnet/cop/ip4_whitelist.c
index 6ef3d7d7..1b5e336b 100644
--- a/src/vnet/cop/ip4_whitelist.c
+++ b/src/vnet/cop/ip4_whitelist.c
@@ -60,7 +60,7 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm,
   cop_feature_type_t next_index;
   cop_main_t *cm = &cop_main;
   vlib_combined_counter_main_t * vcm = &load_balance_main.lbm_via_counters;
-  u32 cpu_index = vm->cpu_index;
+  u32 thread_index = vm->thread_index;
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
@@ -177,12 +177,12 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm,
           dpo1 = load_balance_get_bucket_i(lb1, 0);
 
           vlib_increment_combined_counter
-              (vcm, cpu_index, lb_index0, 1,
+              (vcm, thread_index, lb_index0, 1,
                vlib_buffer_length_in_chain (vm, b0)
                + sizeof(ethernet_header_t));
 
           vlib_increment_combined_counter
-              (vcm, cpu_index, lb_index1, 1,
+              (vcm, thread_index, lb_index1, 1,
                vlib_buffer_length_in_chain (vm, b1)
                + sizeof(ethernet_header_t));
 
@@ -273,7 +273,7 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm,
           dpo0 = load_balance_get_bucket_i(lb0, 0);
 
           vlib_increment_combined_counter 
-              (vcm, cpu_index, lb_index0, 1,
+              (vcm, thread_index, lb_index0, 1,
                vlib_buffer_length_in_chain (vm, b0) 
                + sizeof(ethernet_header_t));
 
diff --git a/src/vnet/cop/ip6_whitelist.c b/src/vnet/cop/ip6_whitelist.c
index c2e16ccf..f3fe62e3 100644
--- a/src/vnet/cop/ip6_whitelist.c
+++ b/src/vnet/cop/ip6_whitelist.c
@@ -61,7 +61,7 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm,
   cop_main_t *cm = &cop_main;
   ip6_main_t * im6 = &ip6_main;
   vlib_combined_counter_main_t * vcm = &load_balance_main.lbm_via_counters;
-  u32 cpu_index = vm->cpu_index;
+  u32 thread_index = vm->thread_index;
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
@@ -153,12 +153,12 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm,
           dpo1 = load_balance_get_bucket_i(lb1, 0);
 
           vlib_increment_combined_counter 
-              (vcm, cpu_index, lb_index0, 1,
+              (vcm, thread_index, lb_index0, 1,
                vlib_buffer_length_in_chain (vm, b0) 
                + sizeof(ethernet_header_t));
 
           vlib_increment_combined_counter 
-              (vcm, cpu_index, lb_index1, 1,
+              (vcm, thread_index, lb_index1, 1,
                vlib_buffer_length_in_chain (vm, b1)
                + sizeof(ethernet_header_t));
 
@@ -233,7 +233,7 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm,
           dpo0 = load_balance_get_bucket_i(lb0, 0);
 
           vlib_increment_combined_counter 
-              (vcm, cpu_index, lb_index0, 1,
+              (vcm, thread_index, lb_index0, 1,
                vlib_buffer_length_in_chain (vm, b0) 
                + sizeof(ethernet_header_t));
 
diff --git a/src/vnet/devices/af_packet/node.c b/src/vnet/devices/af_packet/node.c
index ba337f3f..76980102 100644
--- a/src/vnet/devices/af_packet/node.c
+++ b/src/vnet/devices/af_packet/node.c
@@ -124,7 +124,7 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
   u32 frame_num = apif->rx_req->tp_frame_nr;
   u8 *block_start = apif->rx_ring + block * block_size;
   uword n_trace = vlib_get_trace_count (vm, node);
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   u32 n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm,
 							  VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
   u32 min_bufs = apif->rx_req->tp_frame_size / n_buffer_bytes;
@@ -132,15 +132,15 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
   if (apif->per_interface_next_index != ~0)
     next_index = apif->per_interface_next_index;
 
-  n_free_bufs = vec_len (apm->rx_buffers[cpu_index]);
+  n_free_bufs = vec_len (apm->rx_buffers[thread_index]);
   if (PREDICT_FALSE (n_free_bufs < VLIB_FRAME_SIZE))
     {
-      vec_validate (apm->rx_buffers[cpu_index],
+      vec_validate (apm->rx_buffers[thread_index],
 		    VLIB_FRAME_SIZE + n_free_bufs - 1);
       n_free_bufs +=
-	vlib_buffer_alloc (vm, &apm->rx_buffers[cpu_index][n_free_bufs],
+	vlib_buffer_alloc (vm, &apm->rx_buffers[thread_index][n_free_bufs],
 			   VLIB_FRAME_SIZE);
-      _vec_len (apm->rx_buffers[cpu_index]) = n_free_bufs;
+      _vec_len (apm->rx_buffers[thread_index]) = n_free_bufs;
     }
 
   rx_frame = apif->next_rx_frame;
@@ -163,11 +163,11 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
 	    {
 	      /* grab free buffer */
 	      u32 last_empty_buffer =
-		vec_len (apm->rx_buffers[cpu_index]) - 1;
+		vec_len (apm->rx_buffers[thread_index]) - 1;
 	      prev_bi0 = bi0;
-	      bi0 = apm->rx_buffers[cpu_index][last_empty_buffer];
+	      bi0 = apm->rx_buffers[thread_index][last_empty_buffer];
 	      b0 = vlib_get_buffer (vm, bi0);
-	      _vec_len (apm->rx_buffers[cpu_index]) = last_empty_buffer;
+	      _vec_len (apm->rx_buffers[thread_index]) = last_empty_buffer;
 	      n_free_bufs--;
 
 	      /* copy data */
@@ -236,9 +236,9 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
   vlib_increment_combined_counter
     (vnet_get_main ()->interface_main.combined_sw_if_counters
      + VNET_INTERFACE_COUNTER_RX,
-     os_get_cpu_number (), apif->hw_if_index, n_rx_packets, n_rx_bytes);
+     vlib_get_thread_index (), apif->hw_if_index, n_rx_packets, n_rx_bytes);
 
-  vnet_device_increment_rx_packets (cpu_index, n_rx_packets);
+  vnet_device_increment_rx_packets (thread_index, n_rx_packets);
   return n_rx_packets;
 }
 
diff --git a/src/vnet/devices/devices.c b/src/vnet/devices/devices.c
index 41645220..5e5e812c 100644
--- a/src/vnet/devices/devices.c
+++ b/src/vnet/devices/devices.c
@@ -104,7 +104,7 @@ vnet_device_queue_sort (void *a1, void *a2)
 
 void
 vnet_device_input_assign_thread (u32 hw_if_index,
-				 u16 queue_id, uword cpu_index)
+				 u16 queue_id, uword thread_index)
 {
   vnet_main_t *vnm = vnet_get_main ();
   vnet_device_main_t *vdm = &vnet_device_main;
@@ -115,19 +115,19 @@ vnet_device_input_assign_thread (u32 hw_if_index,
 
   ASSERT (hw->input_node_index > 0);
 
-  if (vdm->first_worker_cpu_index == 0)
-    cpu_index = 0;
+  if (vdm->first_worker_thread_index == 0)
+    thread_index = 0;
 
-  if (cpu_index != 0 &&
-      (cpu_index < vdm->first_worker_cpu_index ||
-       cpu_index > vdm->last_worker_cpu_index))
+  if (thread_index != 0 &&
+      (thread_index < vdm->first_worker_thread_index ||
+       thread_index > vdm->last_worker_thread_index))
     {
-      cpu_index = vdm->next_worker_cpu_index++;
-      if (vdm->next_worker_cpu_index > vdm->last_worker_cpu_index)
-	vdm->next_worker_cpu_index = vdm->first_worker_cpu_index;
+      thread_index = vdm->next_worker_thread_index++;
+      if (vdm->next_worker_thread_index > vdm->last_worker_thread_index)
+	vdm->next_worker_thread_index = vdm->first_worker_thread_index;
     }
 
-  vm = vlib_mains[cpu_index];
+  vm = vlib_mains[thread_index];
   rt = vlib_node_get_runtime_data (vm, hw->input_node_index);
 
   vec_add2 (rt->devices_and_queues, dq, 1);
@@ -136,33 +136,33 @@ vnet_device_input_assign_thread (u32 hw_if_index,
   dq->queue_id = queue_id;
 
   vec_sort_with_function (rt->devices_and_queues, vnet_device_queue_sort);
-  vec_validate (hw->input_node_cpu_index_by_queue, queue_id);
-  hw->input_node_cpu_index_by_queue[queue_id] = cpu_index;
+  vec_validate (hw->input_node_thread_index_by_queue, queue_id);
+  hw->input_node_thread_index_by_queue[queue_id] = thread_index;
 }
 
 static int
 vnet_device_input_unassign_thread (u32 hw_if_index, u16 queue_id,
-				   uword cpu_index)
+				   uword thread_index)
 {
   vnet_main_t *vnm = vnet_get_main ();
   vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
   vnet_device_input_runtime_t *rt;
   vnet_device_and_queue_t *dq;
-  uword old_cpu_index;
+  uword old_thread_index;
 
-  if (hw->input_node_cpu_index_by_queue == 0)
+  if (hw->input_node_thread_index_by_queue == 0)
     return VNET_API_ERROR_INVALID_INTERFACE;
 
-  if (vec_len (hw->input_node_cpu_index_by_queue) < queue_id + 1)
+  if (vec_len (hw->input_node_thread_index_by_queue) < queue_id + 1)
     return VNET_API_ERROR_INVALID_INTERFACE;
 
-  old_cpu_index = hw->input_node_cpu_index_by_queue[queue_id];
+  old_thread_index = hw->input_node_thread_index_by_queue[queue_id];
 
-  if (old_cpu_index == cpu_index)
+  if (old_thread_index == thread_index)
     return 0;
 
   rt =
-    vlib_node_get_runtime_data (vlib_mains[old_cpu_index],
+    vlib_node_get_runtime_data (vlib_mains[old_thread_index],
 				hw->input_node_index);
 
   vec_foreach (dq, rt->devices_and_queues)
@@ -240,7 +240,7 @@ set_device_placement (vlib_main_t * vm, unformat_input_t * input,
   vnet_device_main_t *vdm = &vnet_device_main;
   u32 hw_if_index = (u32) ~ 0;
   u32 queue_id = (u32) 0;
-  u32 cpu_index = (u32) ~ 0;
+  u32 thread_index = (u32) ~ 0;
   int rv;
 
   if (!unformat_user (input, unformat_line_input, line_input))
@@ -253,10 +253,10 @@ set_device_placement (vlib_main_t * vm, unformat_input_t * input,
 	;
       else if (unformat (line_input, "queue %d", &queue_id))
 	;
-      else if (unformat (line_input, "main", &cpu_index))
-	cpu_index = 0;
-      else if (unformat (line_input, "worker %d", &cpu_index))
-	cpu_index += vdm->first_worker_cpu_index;
+      else if (unformat (line_input, "main", &thread_index))
+	thread_index = 0;
+      else if (unformat (line_input, "worker %d", &thread_index))
+	thread_index += vdm->first_worker_thread_index;
       else
 	{
 	  error = clib_error_return (0, "parse error: '%U'",
@@ -271,16 +271,17 @@ set_device_placement (vlib_main_t * vm, unformat_input_t * input,
   if (hw_if_index == (u32) ~ 0)
     return clib_error_return (0, "please specify valid interface name");
 
-  if (cpu_index > vdm->last_worker_cpu_index)
+  if (thread_index > vdm->last_worker_thread_index)
     return clib_error_return (0,
 			      "please specify valid worker thread or main");
 
-  rv = vnet_device_input_unassign_thread (hw_if_index, queue_id, cpu_index);
+  rv =
+    vnet_device_input_unassign_thread (hw_if_index, queue_id, thread_index);
 
   if (rv)
     return clib_error_return (0, "not found");
 
-  vnet_device_input_assign_thread (hw_if_index, queue_id, cpu_index);
+  vnet_device_input_assign_thread (hw_if_index, queue_id, thread_index);
 
   return 0;
 }
@@ -326,9 +327,9 @@ vnet_device_init (vlib_main_t * vm)
   tr = p ? (vlib_thread_registration_t *) p[0] : 0;
   if (tr && tr->count > 0)
     {
-      vdm->first_worker_cpu_index = tr->first_index;
-      vdm->next_worker_cpu_index = tr->first_index;
-      vdm->last_worker_cpu_index = tr->first_index + tr->count - 1;
+      vdm->first_worker_thread_index = tr->first_index;
+      vdm->next_worker_thread_index = tr->first_index;
+      vdm->last_worker_thread_index = tr->first_index + tr->count - 1;
     }
   return 0;
 }
diff --git a/src/vnet/devices/devices.h b/src/vnet/devices/devices.h
index bbb29fe3..966f8302 100644
--- a/src/vnet/devices/devices.h
+++ b/src/vnet/devices/devices.h
@@ -50,9 +50,9 @@ typedef struct
 typedef struct
 {
   vnet_device_per_worker_data_t *workers;
-  uword first_worker_cpu_index;
-  uword last_worker_cpu_index;
-  uword next_worker_cpu_index;
+  uword first_worker_thread_index;
+  uword last_worker_thread_index;
+  uword next_worker_thread_index;
 } vnet_device_main_t;
 
 typedef struct
@@ -80,7 +80,7 @@ vnet_set_device_input_node (u32 hw_if_index, u32 node_index)
 }
 
 void vnet_device_input_assign_thread (u32 hw_if_index, u16 queue_id,
-				      uword cpu_index);
+				      uword thread_index);
 
 static inline u64
 vnet_get_aggregate_rx_packets (void)
@@ -95,12 +95,12 @@ vnet_get_aggregate_rx_packets (void)
 }
 
 static inline void
-vnet_device_increment_rx_packets (u32 cpu_index, u64 count)
+vnet_device_increment_rx_packets (u32 thread_index, u64 count)
 {
   vnet_device_main_t *vdm = &vnet_device_main;
   vnet_device_per_worker_data_t *pwd;
 
-  pwd = vec_elt_at_index (vdm->workers, cpu_index);
+  pwd = vec_elt_at_index (vdm->workers, thread_index);
   pwd->aggregate_rx_packets += count;
 }
 
@@ -117,9 +117,9 @@ vnet_device_input_set_interrupt_pending (vnet_main_t * vnm, u32 hw_if_index,
 {
   vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
 
-  ASSERT (queue_id < vec_len (hw->input_node_cpu_index_by_queue));
-  u32 cpu_index = hw->input_node_cpu_index_by_queue[queue_id];
-  vlib_node_set_interrupt_pending (vlib_mains[cpu_index],
+  ASSERT (queue_id < vec_len (hw->input_node_thread_index_by_queue));
+  u32 thread_index = hw->input_node_thread_index_by_queue[queue_id];
+  vlib_node_set_interrupt_pending (vlib_mains[thread_index],
 				   hw->input_node_index);
 }
 
diff --git a/src/vnet/devices/netmap/node.c b/src/vnet/devices/netmap/node.c
index 68ea7832..e120eeae 100644
--- a/src/vnet/devices/netmap/node.c
+++ b/src/vnet/devices/netmap/node.c
@@ -98,22 +98,22 @@ netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
   u32 n_free_bufs;
   struct netmap_ring *ring;
   int cur_ring;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   u32 n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm,
 							  VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
 
   if (nif->per_interface_next_index != ~0)
     next_index = nif->per_interface_next_index;
 
-  n_free_bufs = vec_len (nm->rx_buffers[cpu_index]);
+  n_free_bufs = vec_len (nm->rx_buffers[thread_index]);
   if (PREDICT_FALSE (n_free_bufs < VLIB_FRAME_SIZE))
     {
-      vec_validate (nm->rx_buffers[cpu_index],
+      vec_validate (nm->rx_buffers[thread_index],
 		    VLIB_FRAME_SIZE + n_free_bufs - 1);
       n_free_bufs +=
-	vlib_buffer_alloc (vm, &nm->rx_buffers[cpu_index][n_free_bufs],
+	vlib_buffer_alloc (vm, &nm->rx_buffers[thread_index][n_free_bufs],
 			   VLIB_FRAME_SIZE);
-      _vec_len (nm->rx_buffers[cpu_index]) = n_free_bufs;
+      _vec_len (nm->rx_buffers[thread_index]) = n_free_bufs;
     }
 
   cur_ring = nif->first_rx_ring;
@@ -163,11 +163,11 @@ netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
 		  vlib_buffer_t *b0;
 		  /* grab free buffer */
 		  u32 last_empty_buffer =
-		    vec_len (nm->rx_buffers[cpu_index]) - 1;
+		    vec_len (nm->rx_buffers[thread_index]) - 1;
 		  prev_bi0 = bi0;
-		  bi0 = nm->rx_buffers[cpu_index][last_empty_buffer];
+		  bi0 = nm->rx_buffers[thread_index][last_empty_buffer];
 		  b0 = vlib_get_buffer (vm, bi0);
-		  _vec_len (nm->rx_buffers[cpu_index]) = last_empty_buffer;
+		  _vec_len (nm->rx_buffers[thread_index]) = last_empty_buffer;
 		  n_free_bufs--;
 
 		  /* copy data */
@@ -247,9 +247,9 @@ netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
   vlib_increment_combined_counter
     (vnet_get_main ()->interface_main.combined_sw_if_counters
      + VNET_INTERFACE_COUNTER_RX,
-     os_get_cpu_number (), nif->hw_if_index, n_rx_packets, n_rx_bytes);
+     vlib_get_thread_index (), nif->hw_if_index, n_rx_packets, n_rx_bytes);
 
-  vnet_device_increment_rx_packets (cpu_index, n_rx_packets);
+  vnet_device_increment_rx_packets (thread_index, n_rx_packets);
 
   return n_rx_packets;
 }
@@ -260,7 +260,7 @@ netmap_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
 {
   int i;
   u32 n_rx_packets = 0;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   netmap_main_t *nm = &netmap_main;
   netmap_if_t *nmi;
 
@@ -269,7 +269,7 @@ netmap_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
       nmi = vec_elt_at_index (nm->interfaces, i);
       if (nmi->is_admin_up &&
 	  (i % nm->input_cpu_count) ==
-	  (cpu_index - nm->input_cpu_first_index))
+	  (thread_index - nm->input_cpu_first_index))
 	n_rx_packets += netmap_device_input_fn (vm, node, frame, nmi);
     }
 
diff --git a/src/vnet/devices/ssvm/node.c b/src/vnet/devices/ssvm/node.c
index a6c9dfd7..539b4161 100644
--- a/src/vnet/devices/ssvm/node.c
+++ b/src/vnet/devices/ssvm/node.c
@@ -89,7 +89,7 @@ ssvm_eth_device_input (ssvm_eth_main_t * em,
   ethernet_header_t *eh0;
   u16 type0;
   u32 n_rx_bytes = 0, l3_offset0;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   u32 trace_cnt __attribute__ ((unused)) = vlib_get_trace_count (vm, node);
   volatile u32 *lock;
   u32 *elt_indices;
@@ -284,10 +284,10 @@ out:
 
   vlib_increment_combined_counter
     (vnet_get_main ()->interface_main.combined_sw_if_counters
-     + VNET_INTERFACE_COUNTER_RX, cpu_index,
+     + VNET_INTERFACE_COUNTER_RX, thread_index,
      intfc->vlib_hw_if_index, rx_queue_index, n_rx_bytes);
 
-  vnet_device_increment_rx_packets (cpu_index, rx_queue_index);
+  vnet_device_increment_rx_packets (thread_index, rx_queue_index);
 
   return rx_queue_index;
 }
diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c
index 00807dc0..5e720f65 100644
--- a/src/vnet/devices/virtio/vhost-user.c
+++ b/src/vnet/devices/virtio/vhost-user.c
@@ -331,7 +331,7 @@ vhost_user_tx_thread_placement (vhost_user_intf_t * vui)
 {
   //Let's try to assign one queue to each thread
   u32 qid = 0;
-  u32 cpu_index = 0;
+  u32 thread_index = 0;
   vui->use_tx_spinlock = 0;
   while (1)
     {
@@ -341,20 +341,21 @@ vhost_user_tx_thread_placement (vhost_user_intf_t * vui)
 	  if (!rxvq->started || !rxvq->enabled)
 	    continue;
 
-	  vui->per_cpu_tx_qid[cpu_index] = qid;
-	  cpu_index++;
-	  if (cpu_index == vlib_get_thread_main ()->n_vlib_mains)
+	  vui->per_cpu_tx_qid[thread_index] = qid;
+	  thread_index++;
+	  if (thread_index == vlib_get_thread_main ()->n_vlib_mains)
 	    return;
 	}
       //We need to loop, meaning the spinlock has to be used
       vui->use_tx_spinlock = 1;
-      if (cpu_index == 0)
+      if (thread_index == 0)
 	{
 	  //Could not find a single valid one
-	  for (cpu_index = 0;
-	       cpu_index < vlib_get_thread_main ()->n_vlib_mains; cpu_index++)
+	  for (thread_index = 0;
+	       thread_index < vlib_get_thread_main ()->n_vlib_mains;
+	       thread_index++)
 	    {
-	      vui->per_cpu_tx_qid[cpu_index] = 0;
+	      vui->per_cpu_tx_qid[thread_index] = 0;
 	    }
 	  return;
 	}
@@ -368,7 +369,7 @@ vhost_user_rx_thread_placement ()
   vhost_user_intf_t *vui;
   vhost_cpu_t *vhc;
   u32 *workers = 0;
-  u32 cpu_index;
+  u32 thread_index;
   vlib_main_t *vm;
 
   //Let's list all workers cpu indexes
@@ -400,9 +401,9 @@ vhost_user_rx_thread_placement ()
 	    continue;
 
 	  i %= vec_len (vui_workers);
-	  cpu_index = vui_workers[i];
+	  thread_index = vui_workers[i];
 	  i++;
-	  vhc = &vum->cpus[cpu_index];
+	  vhc = &vum->cpus[thread_index];
 
 	  iaq.qid = qid;
 	  iaq.vhost_iface_index = vui - vum->vhost_user_interfaces;
@@ -429,14 +430,14 @@ vhost_user_rx_thread_placement ()
     vhc->operation_mode = mode;
   }
 
-  for (cpu_index = vum->input_cpu_first_index;
-       cpu_index < vum->input_cpu_first_index + vum->input_cpu_count;
-       cpu_index++)
+  for (thread_index = vum->input_cpu_first_index;
+       thread_index < vum->input_cpu_first_index + vum->input_cpu_count;
+       thread_index++)
     {
       vlib_node_state_t state = VLIB_NODE_STATE_POLLING;
 
-      vhc = &vum->cpus[cpu_index];
-      vm = vlib_mains ? vlib_mains[cpu_index] : &vlib_global_main;
+      vhc = &vum->cpus[thread_index];
+      vm = vlib_mains ? vlib_mains[thread_index] : &vlib_global_main;
       switch (vhc->operation_mode)
 	{
 	case VHOST_USER_INTERRUPT_MODE:
@@ -532,7 +533,7 @@ vhost_user_set_interrupt_pending (vhost_user_intf_t * vui, u32 ifq)
 {
   vhost_user_main_t *vum = &vhost_user_main;
   vhost_cpu_t *vhc;
-  u32 cpu_index;
+  u32 thread_index;
   vhost_iface_and_queue_t *vhiq;
   vlib_main_t *vm;
   u32 ifq2;
@@ -553,8 +554,8 @@ vhost_user_set_interrupt_pending (vhost_user_intf_t * vui, u32 ifq)
 	  if ((vhiq->vhost_iface_index == (ifq >> 8)) &&
 	      (VHOST_VRING_IDX_TX (vhiq->qid) == (ifq & 0xff)))
 	    {
-	      cpu_index = vhc - vum->cpus;
-	      vm = vlib_mains ? vlib_mains[cpu_index] : &vlib_global_main;
+	      thread_index = vhc - vum->cpus;
+	      vm = vlib_mains ? vlib_mains[thread_index] : &vlib_global_main;
 	      /*
 	       * Convert RX virtqueue number in the lower byte to vring
 	       * queue index for the input node process. Top bytes contain
@@ -1592,7 +1593,7 @@ vhost_user_if_input (vlib_main_t * vm,
   u32 n_trace = vlib_get_trace_count (vm, node);
   u16 qsz_mask;
   u32 map_hint = 0;
-  u16 cpu_index = os_get_cpu_number ();
+  u16 thread_index = vlib_get_thread_index ();
   u16 copy_len = 0;
 
   {
@@ -1651,32 +1652,32 @@ vhost_user_if_input (vlib_main_t * vm,
    * in the loop and come back later. This is not an issue as for big packet,
    * processing cost really comes from the memory copy.
    */
-  if (PREDICT_FALSE (vum->cpus[cpu_index].rx_buffers_len < n_left + 1))
+  if (PREDICT_FALSE (vum->cpus[thread_index].rx_buffers_len < n_left + 1))
     {
-      u32 curr_len = vum->cpus[cpu_index].rx_buffers_len;
-      vum->cpus[cpu_index].rx_buffers_len +=
+      u32 curr_len = vum->cpus[thread_index].rx_buffers_len;
+      vum->cpus[thread_index].rx_buffers_len +=
 	vlib_buffer_alloc_from_free_list (vm,
-					  vum->cpus[cpu_index].rx_buffers +
+					  vum->cpus[thread_index].rx_buffers +
 					  curr_len,
 					  VHOST_USER_RX_BUFFERS_N - curr_len,
 					  VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
 
       if (PREDICT_FALSE
-	  (vum->cpus[cpu_index].rx_buffers_len <
+	  (vum->cpus[thread_index].rx_buffers_len <
 	   VHOST_USER_RX_BUFFER_STARVATION))
 	{
 	  /* In case of buffer starvation, discard some packets from the queue
 	   * and log the event.
 	   * We keep doing best effort for the remaining packets. */
-	  u32 flush = (n_left + 1 > vum->cpus[cpu_index].rx_buffers_len) ?
-	    n_left + 1 - vum->cpus[cpu_index].rx_buffers_len : 1;
+	  u32 flush = (n_left + 1 > vum->cpus[thread_index].rx_buffers_len) ?
+	    n_left + 1 - vum->cpus[thread_index].rx_buffers_len : 1;
 	  flush = vhost_user_rx_discard_packet (vm, vui, txvq, flush);
 
 	  n_left -= flush;
 	  vlib_increment_simple_counter (vnet_main.
 					 interface_main.sw_if_counters +
 					 VNET_INTERFACE_COUNTER_DROP,
-					 os_get_cpu_number (),
+					 vlib_get_thread_index (),
 					 vui->sw_if_index, flush);
 
 	  vlib_error_count (vm, vhost_user_input_node.index,
@@ -1696,7 +1697,7 @@ vhost_user_if_input (vlib_main_t * vm,
 	  u32 desc_data_offset;
 	  vring_desc_t *desc_table = txvq->desc;
 
-	  if (PREDICT_FALSE (vum->cpus[cpu_index].rx_buffers_len <= 1))
+	  if (PREDICT_FALSE (vum->cpus[thread_index].rx_buffers_len <= 1))
 	    {
 	      /* Not enough rx_buffers
 	       * Note: We yeld on 1 so we don't need to do an additional
@@ -1707,17 +1708,18 @@ vhost_user_if_input (vlib_main_t * vm,
 	    }
 
 	  desc_current = txvq->avail->ring[txvq->last_avail_idx & qsz_mask];
-	  vum->cpus[cpu_index].rx_buffers_len--;
-	  bi_current = (vum->cpus[cpu_index].rx_buffers)
-	    [vum->cpus[cpu_index].rx_buffers_len];
+	  vum->cpus[thread_index].rx_buffers_len--;
+	  bi_current = (vum->cpus[thread_index].rx_buffers)
+	    [vum->cpus[thread_index].rx_buffers_len];
 	  b_head = b_current = vlib_get_buffer (vm, bi_current);
 	  to_next[0] = bi_current;	//We do that now so we can forget about bi_current
 	  to_next++;
 	  n_left_to_next--;
 
 	  vlib_prefetch_buffer_with_index (vm,
-					   (vum->cpus[cpu_index].rx_buffers)
-					   [vum->cpus[cpu_index].
+					   (vum->
+					    cpus[thread_index].rx_buffers)
+					   [vum->cpus[thread_index].
 					    rx_buffers_len - 1], LOAD);
 
 	  /* Just preset the used descriptor id and length for later */
@@ -1791,7 +1793,7 @@ vhost_user_if_input (vlib_main_t * vm,
 		  (b_current->current_length == VLIB_BUFFER_DATA_SIZE))
 		{
 		  if (PREDICT_FALSE
-		      (vum->cpus[cpu_index].rx_buffers_len == 0))
+		      (vum->cpus[thread_index].rx_buffers_len == 0))
 		    {
 		      /* Cancel speculation */
 		      to_next--;
@@ -1805,17 +1807,18 @@ vhost_user_if_input (vlib_main_t * vm,
 		       * but valid.
 		       */
 		      vhost_user_input_rewind_buffers (vm,
-						       &vum->cpus[cpu_index],
+						       &vum->cpus
+						       [thread_index],
 						       b_head);
 		      n_left = 0;
 		      goto stop;
 		    }
 
 		  /* Get next output */
-		  vum->cpus[cpu_index].rx_buffers_len--;
+		  vum->cpus[thread_index].rx_buffers_len--;
 		  u32 bi_next =
-		    (vum->cpus[cpu_index].rx_buffers)[vum->cpus
-						      [cpu_index].rx_buffers_len];
+		    (vum->cpus[thread_index].rx_buffers)[vum->cpus
+							 [thread_index].rx_buffers_len];
 		  b_current->next_buffer = bi_next;
 		  b_current->flags |= VLIB_BUFFER_NEXT_PRESENT;
 		  bi_current = bi_next;
@@ -1823,7 +1826,7 @@ vhost_user_if_input (vlib_main_t * vm,
 		}
 
 	      /* Prepare a copy order executed later for the data */
-	      vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len];
+	      vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len];
 	      copy_len++;
 	      u32 desc_data_l =
 		desc_table[desc_current].len - desc_data_offset;
@@ -1880,7 +1883,7 @@ vhost_user_if_input (vlib_main_t * vm,
 	  if (PREDICT_FALSE (copy_len >= VHOST_USER_RX_COPY_THRESHOLD))
 	    {
 	      if (PREDICT_FALSE
-		  (vhost_user_input_copy (vui, vum->cpus[cpu_index].copy,
+		  (vhost_user_input_copy (vui, vum->cpus[thread_index].copy,
 					  copy_len, &map_hint)))
 		{
 		  clib_warning
@@ -1905,7 +1908,7 @@ vhost_user_if_input (vlib_main_t * vm,
 
   /* Do the memory copies */
   if (PREDICT_FALSE
-      (vhost_user_input_copy (vui, vum->cpus[cpu_index].copy,
+      (vhost_user_input_copy (vui, vum->cpus[thread_index].copy,
 			      copy_len, &map_hint)))
     {
       clib_warning ("Memory mapping error on interface hw_if_index=%d "
@@ -1933,9 +1936,9 @@ vhost_user_if_input (vlib_main_t * vm,
   vlib_increment_combined_counter
     (vnet_main.interface_main.combined_sw_if_counters
      + VNET_INTERFACE_COUNTER_RX,
-     os_get_cpu_number (), vui->sw_if_index, n_rx_packets, n_rx_bytes);
+     vlib_get_thread_index (), vui->sw_if_index, n_rx_packets, n_rx_bytes);
 
-  vnet_device_increment_rx_packets (cpu_index, n_rx_packets);
+  vnet_device_increment_rx_packets (thread_index, n_rx_packets);
 
   return n_rx_packets;
 }
@@ -1946,15 +1949,15 @@ vhost_user_input (vlib_main_t * vm,
 {
   vhost_user_main_t *vum = &vhost_user_main;
   uword n_rx_packets = 0;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   vhost_iface_and_queue_t *vhiq;
   vhost_user_intf_t *vui;
   vhost_cpu_t *vhc;
 
-  vhc = &vum->cpus[cpu_index];
+  vhc = &vum->cpus[thread_index];
   if (PREDICT_TRUE (vhc->operation_mode == VHOST_USER_POLLING_MODE))
     {
-      vec_foreach (vhiq, vum->cpus[cpu_index].rx_queues)
+      vec_foreach (vhiq, vum->cpus[thread_index].rx_queues)
       {
 	vui = &vum->vhost_user_interfaces[vhiq->vhost_iface_index];
 	n_rx_packets += vhost_user_if_input (vm, vum, vui, vhiq->qid, node);
@@ -2096,7 +2099,7 @@ vhost_user_tx (vlib_main_t * vm,
   vhost_user_vring_t *rxvq;
   u16 qsz_mask;
   u8 error;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   u32 map_hint = 0;
   u8 retry = 8;
   u16 copy_len;
@@ -2116,7 +2119,7 @@ vhost_user_tx (vlib_main_t * vm,
 
   qid =
     VHOST_VRING_IDX_RX (*vec_elt_at_index
-			(vui->per_cpu_tx_qid, os_get_cpu_number ()));
+			(vui->per_cpu_tx_qid, vlib_get_thread_index ()));
   rxvq = &vui->vrings[qid];
   if (PREDICT_FALSE (vui->use_tx_spinlock))
     vhost_user_vring_lock (vui, qid);
@@ -2143,10 +2146,10 @@ retry:
 
       if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
 	{
-	  vum->cpus[cpu_index].current_trace =
+	  vum->cpus[thread_index].current_trace =
 	    vlib_add_trace (vm, node, b0,
-			    sizeof (*vum->cpus[cpu_index].current_trace));
-	  vhost_user_tx_trace (vum->cpus[cpu_index].current_trace,
+			    sizeof (*vum->cpus[thread_index].current_trace));
+	  vhost_user_tx_trace (vum->cpus[thread_index].current_trace,
 			       vui, qid / 2, b0, rxvq);
 	}
 
@@ -2188,14 +2191,14 @@ retry:
       {
 	// Get a header from the header array
 	virtio_net_hdr_mrg_rxbuf_t *hdr =
-	  &vum->cpus[cpu_index].tx_headers[tx_headers_len];
+	  &vum->cpus[thread_index].tx_headers[tx_headers_len];
 	tx_headers_len++;
 	hdr->hdr.flags = 0;
 	hdr->hdr.gso_type = 0;
 	hdr->num_buffers = 1;	//This is local, no need to check
 
 	// Prepare a copy order executed later for the header
-	vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len];
+	vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len];
 	copy_len++;
 	cpy->len = vui->virtio_net_hdr_sz;
 	cpy->dst = buffer_map_addr;
@@ -2220,7 +2223,7 @@ retry:
 	      else if (vui->virtio_net_hdr_sz == 12)	//MRG is available
 		{
 		  virtio_net_hdr_mrg_rxbuf_t *hdr =
-		    &vum->cpus[cpu_index].tx_headers[tx_headers_len - 1];
+		    &vum->cpus[thread_index].tx_headers[tx_headers_len - 1];
 
 		  //Move from available to used buffer
 		  rxvq->used->ring[rxvq->last_used_idx & qsz_mask].id =
@@ -2282,7 +2285,7 @@ retry:
 	    }
 
 	  {
-	    vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len];
+	    vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len];
 	    copy_len++;
 	    cpy->len = bytes_left;
 	    cpy->len = (cpy->len > buffer_len) ? buffer_len : cpy->len;
@@ -2325,8 +2328,8 @@ retry:
 
       if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
 	{
-	  vum->cpus[cpu_index].current_trace->hdr =
-	    vum->cpus[cpu_index].tx_headers[tx_headers_len - 1];
+	  vum->cpus[thread_index].current_trace->hdr =
+	    vum->cpus[thread_index].tx_headers[tx_headers_len - 1];
 	}
 
       n_left--;			//At the end for error counting when 'goto done' is invoked
@@ -2336,7 +2339,7 @@ retry:
 done:
   //Do the memory copies
   if (PREDICT_FALSE
-      (vhost_user_tx_copy (vui, vum->cpus[cpu_index].copy,
+      (vhost_user_tx_copy (vui, vum->cpus[thread_index].copy,
 			   copy_len, &map_hint)))
     {
       clib_warning ("Memory mapping error on interface hw_if_index=%d "
@@ -2386,7 +2389,7 @@ done3:
       vlib_increment_simple_counter
 	(vnet_main.interface_main.sw_if_counters
 	 + VNET_INTERFACE_COUNTER_DROP,
-	 os_get_cpu_number (), vui->sw_if_index, n_left);
+	 vlib_get_thread_index (), vui->sw_if_index, n_left);
     }
 
   vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors);
@@ -2773,11 +2776,11 @@ vhost_user_send_interrupt_process (vlib_main_t * vm,
 	case ~0:
 	  vec_foreach (vhc, vum->cpus)
 	  {
-	    u32 cpu_index = vhc - vum->cpus;
+	    u32 thread_index = vhc - vum->cpus;
 	    f64 next_timeout;
 
 	    next_timeout = timeout;
-	    vec_foreach (vhiq, vum->cpus[cpu_index].rx_queues)
+	    vec_foreach (vhiq, vum->cpus[thread_index].rx_queues)
 	    {
 	      vui = &vum->vhost_user_interfaces[vhiq->vhost_iface_index];
 	      vhost_user_vring_t *rxvq =
diff --git a/src/vnet/dpo/lookup_dpo.c b/src/vnet/dpo/lookup_dpo.c
index e94e871c..97ad0a44 100644
--- a/src/vnet/dpo/lookup_dpo.c
+++ b/src/vnet/dpo/lookup_dpo.c
@@ -266,7 +266,7 @@ lookup_dpo_ip4_inline (vlib_main_t * vm,
                        int table_from_interface)
 {
     u32 n_left_from, next_index, * from, * to_next;
-    u32 cpu_index = os_get_cpu_number();
+    u32 thread_index = vlib_get_thread_index();
     vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters;
 
     from = vlib_frame_vector_args (from_frame);
@@ -407,10 +407,10 @@ lookup_dpo_ip4_inline (vlib_main_t * vm,
 	    vnet_buffer(b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
 
 	    vlib_increment_combined_counter
-		(cm, cpu_index, lbi0, 1,
+		(cm, thread_index, lbi0, 1,
 		 vlib_buffer_length_in_chain (vm, b0));
 	    vlib_increment_combined_counter
-		(cm, cpu_index, lbi1, 1,
+		(cm, thread_index, lbi1, 1,
 		 vlib_buffer_length_in_chain (vm, b1));
 
 	    if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
@@ -511,7 +511,7 @@ lookup_dpo_ip4_inline (vlib_main_t * vm,
 	    vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
 
 	    vlib_increment_combined_counter
-		(cm, cpu_index, lbi0, 1,
+		(cm, thread_index, lbi0, 1,
 		 vlib_buffer_length_in_chain (vm, b0));
 
 	    if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
@@ -606,7 +606,7 @@ lookup_dpo_ip6_inline (vlib_main_t * vm,
 {
     vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters;
     u32 n_left_from, next_index, * from, * to_next;
-    u32 cpu_index = os_get_cpu_number();
+    u32 thread_index = vlib_get_thread_index();
 
     from = vlib_frame_vector_args (from_frame);
     n_left_from = from_frame->n_vectors;
@@ -749,10 +749,10 @@ lookup_dpo_ip6_inline (vlib_main_t * vm,
 	    vnet_buffer(b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
 
 	    vlib_increment_combined_counter
-		(cm, cpu_index, lbi0, 1,
+		(cm, thread_index, lbi0, 1,
 		 vlib_buffer_length_in_chain (vm, b0));
 	    vlib_increment_combined_counter
-		(cm, cpu_index, lbi1, 1,
+		(cm, thread_index, lbi1, 1,
 		 vlib_buffer_length_in_chain (vm, b1));
 
 	    if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
@@ -853,7 +853,7 @@ lookup_dpo_ip6_inline (vlib_main_t * vm,
 	    vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
 
 	    vlib_increment_combined_counter
-		(cm, cpu_index, lbi0, 1,
+		(cm, thread_index, lbi0, 1,
 		 vlib_buffer_length_in_chain (vm, b0));
 
 	    if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
@@ -930,7 +930,7 @@ lookup_dpo_mpls_inline (vlib_main_t * vm,
                        int table_from_interface)
 {
     u32 n_left_from, next_index, * from, * to_next;
-    u32 cpu_index = os_get_cpu_number();
+    u32 thread_index = vlib_get_thread_index();
     vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters;
 
     from = vlib_frame_vector_args (from_frame);
@@ -994,7 +994,7 @@ lookup_dpo_mpls_inline (vlib_main_t * vm,
             vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
 
             vlib_increment_combined_counter
-                (cm, cpu_index, lbi0, 1,
+                (cm, thread_index, lbi0, 1,
                  vlib_buffer_length_in_chain (vm, b0));
 
 	    if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) 
diff --git a/src/vnet/dpo/replicate_dpo.c b/src/vnet/dpo/replicate_dpo.c
index a9f334be..e25ceae9 100644
--- a/src/vnet/dpo/replicate_dpo.c
+++ b/src/vnet/dpo/replicate_dpo.c
@@ -627,7 +627,7 @@ replicate_inline (vlib_main_t * vm,
     vlib_combined_counter_main_t * cm = &replicate_main.repm_counters;
     replicate_main_t * rm = &replicate_main;
     u32 n_left_from, * from, * to_next, next_index;
-    u32 cpu_index = os_get_cpu_number();
+    u32 thread_index = vlib_get_thread_index();
 
     from = vlib_frame_vector_args (frame);
     n_left_from = frame->n_vectors;
@@ -657,12 +657,12 @@ replicate_inline (vlib_main_t * vm,
             rep0 = replicate_get(repi0);
 
             vlib_increment_combined_counter(
-                cm, cpu_index, repi0, 1,
+                cm, thread_index, repi0, 1,
                 vlib_buffer_length_in_chain(vm, b0));
 
-	    vec_validate (rm->clones[cpu_index], rep0->rep_n_buckets - 1);
+	    vec_validate (rm->clones[thread_index], rep0->rep_n_buckets - 1);
 
-	    num_cloned = vlib_buffer_clone (vm, bi0, rm->clones[cpu_index], rep0->rep_n_buckets, 128);
+	    num_cloned = vlib_buffer_clone (vm, bi0, rm->clones[thread_index], rep0->rep_n_buckets, 128);
 
 	    if (num_cloned != rep0->rep_n_buckets)
 	      {
@@ -673,7 +673,7 @@ replicate_inline (vlib_main_t * vm,
 
             for (bucket = 0; bucket < num_cloned; bucket++)
             {
-                ci0 = rm->clones[cpu_index][bucket];
+                ci0 = rm->clones[thread_index][bucket];
                 c0 = vlib_get_buffer(vm, ci0);
 
                 to_next[0] = ci0;
@@ -700,7 +700,7 @@ replicate_inline (vlib_main_t * vm,
 		    vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
 		  }
             }
-	    vec_reset_length (rm->clones[cpu_index]);
+	    vec_reset_length (rm->clones[thread_index]);
         }
 
         vlib_put_next_frame (vm, node, next_index, n_left_to_next);
diff --git a/src/vnet/ethernet/arp.c b/src/vnet/ethernet/arp.c
index ee757505..c74a097e 100644
--- a/src/vnet/ethernet/arp.c
+++ b/src/vnet/ethernet/arp.c
@@ -1771,7 +1771,7 @@ set_ip4_over_ethernet_rpc_callback (vnet_arp_set_ip4_over_ethernet_rpc_args_t
 				    * a)
 {
   vnet_main_t *vm = vnet_get_main ();
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   if (a->flags & ETHERNET_ARP_ARGS_REMOVE)
     vnet_arp_unset_ip4_over_ethernet_internal (vm, a);
diff --git a/src/vnet/ethernet/interface.c b/src/vnet/ethernet/interface.c
index 9894e3c8..335e3f9f 100644
--- a/src/vnet/ethernet/interface.c
+++ b/src/vnet/ethernet/interface.c
@@ -362,7 +362,7 @@ simulated_ethernet_interface_tx (vlib_main_t * vm,
   u32 next_index = VNET_SIMULATED_ETHERNET_TX_NEXT_ETHERNET_INPUT;
   u32 i, next_node_index, bvi_flag, sw_if_index;
   u32 n_pkts = 0, n_bytes = 0;
-  u32 cpu_index = vm->cpu_index;
+  u32 thread_index = vm->thread_index;
   vnet_main_t *vnm = vnet_get_main ();
   vnet_interface_main_t *im = &vnm->interface_main;
   vlib_node_main_t *nm = &vm->node_main;
@@ -420,8 +420,9 @@ simulated_ethernet_interface_tx (vlib_main_t * vm,
 
       /* increment TX interface stat */
       vlib_increment_combined_counter (im->combined_sw_if_counters +
-				       VNET_INTERFACE_COUNTER_TX, cpu_index,
-				       sw_if_index, n_pkts, n_bytes);
+				       VNET_INTERFACE_COUNTER_TX,
+				       thread_index, sw_if_index, n_pkts,
+				       n_bytes);
     }
 
   return n_left_from;
diff --git a/src/vnet/ethernet/node.c b/src/vnet/ethernet/node.c
index b699e381..f7787ed2 100755
--- a/src/vnet/ethernet/node.c
+++ b/src/vnet/ethernet/node.c
@@ -291,7 +291,7 @@ ethernet_input_inline (vlib_main_t * vm,
   vlib_node_runtime_t *error_node;
   u32 n_left_from, next_index, *from, *to_next;
   u32 stats_sw_if_index, stats_n_packets, stats_n_bytes;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   u32 cached_sw_if_index = ~0;
   u32 cached_is_l2 = 0;		/* shut up gcc */
   vnet_hw_interface_t *hi = NULL;	/* used for main interface only */
@@ -510,7 +510,7 @@ ethernet_input_inline (vlib_main_t * vm,
 						     interface_main.combined_sw_if_counters
 						     +
 						     VNET_INTERFACE_COUNTER_RX,
-						     cpu_index,
+						     thread_index,
 						     new_sw_if_index0, 1,
 						     len0);
 		  if (new_sw_if_index1 != old_sw_if_index1
@@ -519,7 +519,7 @@ ethernet_input_inline (vlib_main_t * vm,
 						     interface_main.combined_sw_if_counters
 						     +
 						     VNET_INTERFACE_COUNTER_RX,
-						     cpu_index,
+						     thread_index,
 						     new_sw_if_index1, 1,
 						     len1);
 
@@ -530,7 +530,7 @@ ethernet_input_inline (vlib_main_t * vm,
 			  vlib_increment_combined_counter
 			    (vnm->interface_main.combined_sw_if_counters
 			     + VNET_INTERFACE_COUNTER_RX,
-			     cpu_index,
+			     thread_index,
 			     stats_sw_if_index,
 			     stats_n_packets, stats_n_bytes);
 			  stats_n_packets = stats_n_bytes = 0;
@@ -696,13 +696,13 @@ ethernet_input_inline (vlib_main_t * vm,
 		    vlib_increment_combined_counter
 		      (vnm->interface_main.combined_sw_if_counters
 		       + VNET_INTERFACE_COUNTER_RX,
-		       cpu_index, new_sw_if_index0, 1, len0);
+		       thread_index, new_sw_if_index0, 1, len0);
 		  if (stats_n_packets > 0)
 		    {
 		      vlib_increment_combined_counter
 			(vnm->interface_main.combined_sw_if_counters
 			 + VNET_INTERFACE_COUNTER_RX,
-			 cpu_index,
+			 thread_index,
 			 stats_sw_if_index, stats_n_packets, stats_n_bytes);
 		      stats_n_packets = stats_n_bytes = 0;
 		    }
@@ -734,7 +734,7 @@ ethernet_input_inline (vlib_main_t * vm,
       vlib_increment_combined_counter
 	(vnm->interface_main.combined_sw_if_counters
 	 + VNET_INTERFACE_COUNTER_RX,
-	 cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
+	 thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
       node->runtime_data[0] = stats_sw_if_index;
     }
 
diff --git a/src/vnet/gre/node.c b/src/vnet/gre/node.c
index 2683586e..acf15f24 100644
--- a/src/vnet/gre/node.c
+++ b/src/vnet/gre/node.c
@@ -75,7 +75,7 @@ gre_input (vlib_main_t * vm,
   u64 cached_tunnel_key6[4];
   u32 cached_tunnel_sw_if_index = 0, tunnel_sw_if_index = 0;
 
-  u32 cpu_index = os_get_cpu_number();
+  u32 thread_index = vlib_get_thread_index();
   u32 len;
   vnet_interface_main_t *im = &gm->vnet_main->interface_main;
 
@@ -257,7 +257,7 @@ gre_input (vlib_main_t * vm,
           len = vlib_buffer_length_in_chain (vm, b0);
           vlib_increment_combined_counter (im->combined_sw_if_counters
                                            + VNET_INTERFACE_COUNTER_RX,
-                                           cpu_index,
+                                           thread_index,
                                            tunnel_sw_if_index,
                                            1 /* packets */,
                                            len /* bytes */);
@@ -324,7 +324,7 @@ drop0:
           len = vlib_buffer_length_in_chain (vm, b1);
           vlib_increment_combined_counter (im->combined_sw_if_counters
                                            + VNET_INTERFACE_COUNTER_RX,
-                                           cpu_index,
+                                           thread_index,
                                            tunnel_sw_if_index,
                                            1 /* packets */,
                                            len /* bytes */);
@@ -502,7 +502,7 @@ drop1:
           len = vlib_buffer_length_in_chain (vm, b0);
           vlib_increment_combined_counter (im->combined_sw_if_counters
                                            + VNET_INTERFACE_COUNTER_RX,
-                                           cpu_index,
+                                           thread_index,
                                            tunnel_sw_if_index,
                                            1 /* packets */,
                                            len /* bytes */);
diff --git a/src/vnet/interface.h b/src/vnet/interface.h
index a1ea2d61..08f08b10 100644
--- a/src/vnet/interface.h
+++ b/src/vnet/interface.h
@@ -468,7 +468,7 @@ typedef struct vnet_hw_interface_t
   u32 input_node_index;
 
   /* input node cpu index by queue */
-  u32 *input_node_cpu_index_by_queue;
+  u32 *input_node_thread_index_by_queue;
 
 } vnet_hw_interface_t;
 
diff --git a/src/vnet/interface_output.c b/src/vnet/interface_output.c
index 03f2cdca..663dc309 100644
--- a/src/vnet/interface_output.c
+++ b/src/vnet/interface_output.c
@@ -196,7 +196,7 @@ slow_path (vlib_main_t * vm,
  */
 static_always_inline void
 incr_output_stats (vnet_main_t * vnm,
-		   u32 cpu_index,
+		   u32 thread_index,
 		   u32 length,
 		   u32 sw_if_index,
 		   u32 * last_sw_if_index, u32 * n_packets, u32 * n_bytes)
@@ -216,7 +216,7 @@ incr_output_stats (vnet_main_t * vnm,
 
 	  vlib_increment_combined_counter (im->combined_sw_if_counters
 					   + VNET_INTERFACE_COUNTER_TX,
-					   cpu_index,
+					   thread_index,
 					   *last_sw_if_index,
 					   *n_packets, *n_bytes);
 	}
@@ -240,7 +240,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm,
   u32 n_left_to_tx, *from, *from_end, *to_tx;
   u32 n_bytes, n_buffers, n_packets;
   u32 last_sw_if_index;
-  u32 cpu_index = vm->cpu_index;
+  u32 thread_index = vm->thread_index;
 
   n_buffers = frame->n_vectors;
 
@@ -266,7 +266,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm,
 
       cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
 			     VNET_INTERFACE_COUNTER_TX_ERROR);
-      vlib_increment_simple_counter (cm, cpu_index,
+      vlib_increment_simple_counter (cm, thread_index,
 				     rt->sw_if_index, n_buffers);
       return vlib_error_drop_buffers (vm, node, from,
 				      /* buffer stride */ 1,
@@ -341,18 +341,18 @@ vnet_interface_output_node_flatten (vlib_main_t * vm,
 		  from += 1;
 		  to_tx += n_buffers;
 		  n_left_to_tx -= n_buffers;
-		  incr_output_stats (vnm, cpu_index, n_slow_bytes,
+		  incr_output_stats (vnm, thread_index, n_slow_bytes,
 				     vnet_buffer (b)->sw_if_index[VLIB_TX],
 				     &last_sw_if_index, &n_packets, &n_bytes);
 		}
 	    }
 	  else
 	    {
-	      incr_output_stats (vnm, cpu_index,
+	      incr_output_stats (vnm, thread_index,
 				 vlib_buffer_length_in_chain (vm, b0),
 				 vnet_buffer (b0)->sw_if_index[VLIB_TX],
 				 &last_sw_if_index, &n_packets, &n_bytes);
-	      incr_output_stats (vnm, cpu_index,
+	      incr_output_stats (vnm, thread_index,
 				 vlib_buffer_length_in_chain (vm, b0),
 				 vnet_buffer (b1)->sw_if_index[VLIB_TX],
 				 &last_sw_if_index, &n_packets, &n_bytes);
@@ -396,7 +396,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm,
 	      to_tx += n_buffers;
 	      n_left_to_tx -= n_buffers;
 	    }
-	  incr_output_stats (vnm, cpu_index,
+	  incr_output_stats (vnm, thread_index,
 			     vlib_buffer_length_in_chain (vm, b0),
 			     vnet_buffer (b0)->sw_if_index[VLIB_TX],
 			     &last_sw_if_index, &n_packets, &n_bytes);
@@ -408,7 +408,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm,
     }
 
   /* Final update of interface stats. */
-  incr_output_stats (vnm, cpu_index, 0, ~0,	/* ~0 will flush stats */
+  incr_output_stats (vnm, thread_index, 0, ~0,	/* ~0 will flush stats */
 		     &last_sw_if_index, &n_packets, &n_bytes);
 
   return n_buffers;
@@ -428,7 +428,7 @@ vnet_interface_output_node (vlib_main_t * vm,
   u32 n_left_to_tx, *from, *from_end, *to_tx;
   u32 n_bytes, n_buffers, n_packets;
   u32 n_bytes_b0, n_bytes_b1, n_bytes_b2, n_bytes_b3;
-  u32 cpu_index = vm->cpu_index;
+  u32 thread_index = vm->thread_index;
   vnet_interface_main_t *im = &vnm->interface_main;
   u32 next_index = VNET_INTERFACE_OUTPUT_NEXT_TX;
   u32 current_config_index = ~0;
@@ -458,7 +458,7 @@ vnet_interface_output_node (vlib_main_t * vm,
 
       cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
 			     VNET_INTERFACE_COUNTER_TX_ERROR);
-      vlib_increment_simple_counter (cm, cpu_index,
+      vlib_increment_simple_counter (cm, thread_index,
 				     rt->sw_if_index, n_buffers);
 
       return vlib_error_drop_buffers (vm, node, from,
@@ -558,7 +558,7 @@ vnet_interface_output_node (vlib_main_t * vm,
 	    {
 	      vlib_increment_combined_counter (im->combined_sw_if_counters +
 					       VNET_INTERFACE_COUNTER_TX,
-					       cpu_index, tx_swif0, 1,
+					       thread_index, tx_swif0, 1,
 					       n_bytes_b0);
 	    }
 
@@ -567,7 +567,7 @@ vnet_interface_output_node (vlib_main_t * vm,
 
 	      vlib_increment_combined_counter (im->combined_sw_if_counters +
 					       VNET_INTERFACE_COUNTER_TX,
-					       cpu_index, tx_swif1, 1,
+					       thread_index, tx_swif1, 1,
 					       n_bytes_b1);
 	    }
 
@@ -576,7 +576,7 @@ vnet_interface_output_node (vlib_main_t * vm,
 
 	      vlib_increment_combined_counter (im->combined_sw_if_counters +
 					       VNET_INTERFACE_COUNTER_TX,
-					       cpu_index, tx_swif2, 1,
+					       thread_index, tx_swif2, 1,
 					       n_bytes_b2);
 	    }
 	  if (PREDICT_FALSE (tx_swif3 != rt->sw_if_index))
@@ -584,7 +584,7 @@ vnet_interface_output_node (vlib_main_t * vm,
 
 	      vlib_increment_combined_counter (im->combined_sw_if_counters +
 					       VNET_INTERFACE_COUNTER_TX,
-					       cpu_index, tx_swif3, 1,
+					       thread_index, tx_swif3, 1,
 					       n_bytes_b3);
 	    }
 	}
@@ -623,7 +623,7 @@ vnet_interface_output_node (vlib_main_t * vm,
 
 	      vlib_increment_combined_counter (im->combined_sw_if_counters +
 					       VNET_INTERFACE_COUNTER_TX,
-					       cpu_index, tx_swif0, 1,
+					       thread_index, tx_swif0, 1,
 					       n_bytes_b0);
 	    }
 	}
@@ -634,7 +634,7 @@ vnet_interface_output_node (vlib_main_t * vm,
   /* Update main interface stats. */
   vlib_increment_combined_counter (im->combined_sw_if_counters
 				   + VNET_INTERFACE_COUNTER_TX,
-				   cpu_index,
+				   thread_index,
 				   rt->sw_if_index, n_packets, n_bytes);
   return n_buffers;
 }
@@ -893,7 +893,7 @@ process_drop_punt (vlib_main_t * vm,
   u32 current_sw_if_index, n_errors_current_sw_if_index;
   u64 current_counter;
   vlib_simple_counter_main_t *cm;
-  u32 cpu_index = vm->cpu_index;
+  u32 thread_index = vm->thread_index;
 
   static vlib_error_t memory[VNET_ERROR_N_DISPOSITION];
   static char memory_init[VNET_ERROR_N_DISPOSITION];
@@ -965,19 +965,19 @@ process_drop_punt (vlib_main_t * vm,
 	  current_counter -= 2;
 	  n_errors_current_sw_if_index -= 2;
 
-	  vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
-	  vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1);
+	  vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1);
+	  vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1);
 
 	  /* Increment super-interface drop/punt counters for
 	     sub-interfaces. */
 	  sw_if0 = vnet_get_sw_interface (vnm, sw_if_index0);
 	  vlib_increment_simple_counter
-	    (cm, cpu_index, sw_if0->sup_sw_if_index,
+	    (cm, thread_index, sw_if0->sup_sw_if_index,
 	     sw_if0->sup_sw_if_index != sw_if_index0);
 
 	  sw_if1 = vnet_get_sw_interface (vnm, sw_if_index1);
 	  vlib_increment_simple_counter
-	    (cm, cpu_index, sw_if1->sup_sw_if_index,
+	    (cm, thread_index, sw_if1->sup_sw_if_index,
 	     sw_if1->sup_sw_if_index != sw_if_index1);
 
 	  em->counters[current_counter_index] = current_counter;
@@ -1013,11 +1013,12 @@ process_drop_punt (vlib_main_t * vm,
       sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX];
 
       /* Increment drop/punt counters. */
-      vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
+      vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1);
 
       /* Increment super-interface drop/punt counters for sub-interfaces. */
       sw_if0 = vnet_get_sw_interface (vnm, sw_if_index0);
-      vlib_increment_simple_counter (cm, cpu_index, sw_if0->sup_sw_if_index,
+      vlib_increment_simple_counter (cm, thread_index,
+				     sw_if0->sup_sw_if_index,
 				     sw_if0->sup_sw_if_index != sw_if_index0);
 
       if (PREDICT_FALSE (e0 != current_error))
@@ -1041,12 +1042,12 @@ process_drop_punt (vlib_main_t * vm,
     {
       vnet_sw_interface_t *si;
 
-      vlib_increment_simple_counter (cm, cpu_index, current_sw_if_index,
+      vlib_increment_simple_counter (cm, thread_index, current_sw_if_index,
 				     n_errors_current_sw_if_index);
 
       si = vnet_get_sw_interface (vnm, current_sw_if_index);
       if (si->sup_sw_if_index != current_sw_if_index)
-	vlib_increment_simple_counter (cm, cpu_index, si->sup_sw_if_index,
+	vlib_increment_simple_counter (cm, thread_index, si->sup_sw_if_index,
 				       n_errors_current_sw_if_index);
     }
 
diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c
index ee1703e7..fdfe7f63 100644
--- a/src/vnet/ip/ip4_forward.c
+++ b/src/vnet/ip/ip4_forward.c
@@ -75,7 +75,7 @@ ip4_lookup_inline (vlib_main_t * vm,
   vlib_combined_counter_main_t *cm = &load_balance_main.lbm_to_counters;
   u32 n_left_from, n_left_to_next, *from, *to_next;
   ip_lookup_next_t next;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
@@ -292,19 +292,19 @@ ip4_lookup_inline (vlib_main_t * vm,
 	  vnet_buffer (p3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index;
 
 	  vlib_increment_combined_counter
-	    (cm, cpu_index, lb_index0, 1,
+	    (cm, thread_index, lb_index0, 1,
 	     vlib_buffer_length_in_chain (vm, p0)
 	     + sizeof (ethernet_header_t));
 	  vlib_increment_combined_counter
-	    (cm, cpu_index, lb_index1, 1,
+	    (cm, thread_index, lb_index1, 1,
 	     vlib_buffer_length_in_chain (vm, p1)
 	     + sizeof (ethernet_header_t));
 	  vlib_increment_combined_counter
-	    (cm, cpu_index, lb_index2, 1,
+	    (cm, thread_index, lb_index2, 1,
 	     vlib_buffer_length_in_chain (vm, p2)
 	     + sizeof (ethernet_header_t));
 	  vlib_increment_combined_counter
-	    (cm, cpu_index, lb_index3, 1,
+	    (cm, thread_index, lb_index3, 1,
 	     vlib_buffer_length_in_chain (vm, p3)
 	     + sizeof (ethernet_header_t));
 
@@ -392,7 +392,7 @@ ip4_lookup_inline (vlib_main_t * vm,
 	  vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
 
 	  vlib_increment_combined_counter
-	    (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
+	    (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
 
 	  from += 1;
 	  to_next += 1;
@@ -479,7 +479,7 @@ ip4_load_balance (vlib_main_t * vm,
   vlib_combined_counter_main_t *cm = &load_balance_main.lbm_via_counters;
   u32 n_left_from, n_left_to_next, *from, *to_next;
   ip_lookup_next_t next;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
@@ -584,9 +584,9 @@ ip4_load_balance (vlib_main_t * vm,
 	  vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
 
 	  vlib_increment_combined_counter
-	    (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
+	    (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
 	  vlib_increment_combined_counter
-	    (cm, cpu_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1));
+	    (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1));
 
 	  vlib_validate_buffer_enqueue_x2 (vm, node, next,
 					   to_next, n_left_to_next,
@@ -639,7 +639,7 @@ ip4_load_balance (vlib_main_t * vm,
 	  vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
 
 	  vlib_increment_combined_counter
-	    (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
+	    (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
 
 	  vlib_validate_buffer_enqueue_x1 (vm, node, next,
 					   to_next, n_left_to_next,
@@ -2330,7 +2330,7 @@ ip4_rewrite_inline (vlib_main_t * vm,
 
   n_left_from = frame->n_vectors;
   next_index = node->cached_next_index;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   while (n_left_from > 0)
     {
@@ -2379,9 +2379,9 @@ ip4_rewrite_inline (vlib_main_t * vm,
 	  if (do_counters)
 	    {
 	      vlib_prefetch_combined_counter (&adjacency_counters,
-					      cpu_index, adj_index0);
+					      thread_index, adj_index0);
 	      vlib_prefetch_combined_counter (&adjacency_counters,
-					      cpu_index, adj_index1);
+					      thread_index, adj_index1);
 	    }
 
 	  ip0 = vlib_buffer_get_current (p0);
@@ -2527,13 +2527,13 @@ ip4_rewrite_inline (vlib_main_t * vm,
 	    {
 	      vlib_increment_combined_counter
 		(&adjacency_counters,
-		 cpu_index,
+		 thread_index,
 		 adj_index0, 1,
 		 vlib_buffer_length_in_chain (vm, p0) + rw_len0);
 
 	      vlib_increment_combined_counter
 		(&adjacency_counters,
-		 cpu_index,
+		 thread_index,
 		 adj_index1, 1,
 		 vlib_buffer_length_in_chain (vm, p1) + rw_len1);
 	    }
@@ -2618,7 +2618,7 @@ ip4_rewrite_inline (vlib_main_t * vm,
 
 	  if (do_counters)
 	    vlib_prefetch_combined_counter (&adjacency_counters,
-					    cpu_index, adj_index0);
+					    thread_index, adj_index0);
 
 	  /* Guess we are only writing on simple Ethernet header. */
 	  vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t));
@@ -2637,7 +2637,7 @@ ip4_rewrite_inline (vlib_main_t * vm,
 	  if (do_counters)
 	    vlib_increment_combined_counter
 	      (&adjacency_counters,
-	       cpu_index, adj_index0, 1,
+	       thread_index, adj_index0, 1,
 	       vlib_buffer_length_in_chain (vm, p0) + rw_len0);
 
 	  /* Check MTU of outgoing interface. */
diff --git a/src/vnet/ip/ip4_input.c b/src/vnet/ip/ip4_input.c
index ba200a9f..3b08f4b0 100644
--- a/src/vnet/ip/ip4_input.c
+++ b/src/vnet/ip/ip4_input.c
@@ -85,7 +85,7 @@ ip4_input_inline (vlib_main_t * vm,
   vlib_node_runtime_t *error_node =
     vlib_node_get_runtime (vm, ip4_input_node.index);
   vlib_simple_counter_main_t *cm;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
@@ -178,8 +178,8 @@ ip4_input_inline (vlib_main_t * vm,
 	  vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0);
 	  vnet_feature_arc_start (arc1, sw_if_index1, &next1, p1);
 
-	  vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
-	  vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1);
+	  vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1);
+	  vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1);
 
 	  /* Punt packets with options or wrong version. */
 	  if (PREDICT_FALSE (ip0->ip_version_and_header_length != 0x45))
@@ -299,7 +299,7 @@ ip4_input_inline (vlib_main_t * vm,
 	  vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0;
 	  vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0);
 
-	  vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
+	  vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1);
 
 	  /* Punt packets with options or wrong version. */
 	  if (PREDICT_FALSE (ip0->ip_version_and_header_length != 0x45))
diff --git a/src/vnet/ip/ip6_forward.c b/src/vnet/ip/ip6_forward.c
index c120f12c..c2fc4f87 100644
--- a/src/vnet/ip/ip6_forward.c
+++ b/src/vnet/ip/ip6_forward.c
@@ -74,7 +74,7 @@ ip6_lookup_inline (vlib_main_t * vm,
   vlib_combined_counter_main_t *cm = &load_balance_main.lbm_to_counters;
   u32 n_left_from, n_left_to_next, *from, *to_next;
   ip_lookup_next_t next;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
@@ -185,9 +185,9 @@ ip6_lookup_inline (vlib_main_t * vm,
 	  vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
 
 	  vlib_increment_combined_counter
-	    (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
+	    (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
 	  vlib_increment_combined_counter
-	    (cm, cpu_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1));
+	    (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1));
 
 	  from += 2;
 	  to_next += 2;
@@ -291,7 +291,7 @@ ip6_lookup_inline (vlib_main_t * vm,
 	  vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
 
 	  vlib_increment_combined_counter
-	    (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
+	    (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
 
 	  from += 1;
 	  to_next += 1;
@@ -703,7 +703,7 @@ ip6_load_balance (vlib_main_t * vm,
   vlib_combined_counter_main_t *cm = &load_balance_main.lbm_via_counters;
   u32 n_left_from, n_left_to_next, *from, *to_next;
   ip_lookup_next_t next;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   ip6_main_t *im = &ip6_main;
 
   from = vlib_frame_vector_args (frame);
@@ -824,9 +824,9 @@ ip6_load_balance (vlib_main_t * vm,
 	  vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
 
 	  vlib_increment_combined_counter
-	    (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
+	    (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
 	  vlib_increment_combined_counter
-	    (cm, cpu_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1));
+	    (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1));
 
 	  vlib_validate_buffer_enqueue_x2 (vm, node, next,
 					   to_next, n_left_to_next,
@@ -886,7 +886,7 @@ ip6_load_balance (vlib_main_t * vm,
 	    }
 
 	  vlib_increment_combined_counter
-	    (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
+	    (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0));
 
 	  vlib_validate_buffer_enqueue_x1 (vm, node, next,
 					   to_next, n_left_to_next,
@@ -1897,7 +1897,7 @@ ip6_rewrite_inline (vlib_main_t * vm,
 
   n_left_from = frame->n_vectors;
   next_index = node->cached_next_index;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   while (n_left_from > 0)
     {
@@ -2019,11 +2019,11 @@ ip6_rewrite_inline (vlib_main_t * vm,
 	    {
 	      vlib_increment_combined_counter
 		(&adjacency_counters,
-		 cpu_index, adj_index0, 1,
+		 thread_index, adj_index0, 1,
 		 vlib_buffer_length_in_chain (vm, p0) + rw_len0);
 	      vlib_increment_combined_counter
 		(&adjacency_counters,
-		 cpu_index, adj_index1, 1,
+		 thread_index, adj_index1, 1,
 		 vlib_buffer_length_in_chain (vm, p1) + rw_len1);
 	    }
 
@@ -2156,7 +2156,7 @@ ip6_rewrite_inline (vlib_main_t * vm,
 	    {
 	      vlib_increment_combined_counter
 		(&adjacency_counters,
-		 cpu_index, adj_index0, 1,
+		 thread_index, adj_index0, 1,
 		 vlib_buffer_length_in_chain (vm, p0) + rw_len0);
 	    }
 
diff --git a/src/vnet/ip/ip6_input.c b/src/vnet/ip/ip6_input.c
index 20306088..ffdc4727 100644
--- a/src/vnet/ip/ip6_input.c
+++ b/src/vnet/ip/ip6_input.c
@@ -82,7 +82,7 @@ ip6_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
   vlib_node_runtime_t *error_node =
     vlib_node_get_runtime (vm, ip6_input_node.index);
   vlib_simple_counter_main_t *cm;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
@@ -171,8 +171,8 @@ ip6_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
 	  vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0);
 	  vnet_feature_arc_start (arc1, sw_if_index1, &next1, p1);
 
-	  vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
-	  vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1);
+	  vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1);
+	  vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1);
 
 	  error0 = error1 = IP6_ERROR_NONE;
 
@@ -270,7 +270,7 @@ ip6_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
 	  vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0;
 	  vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0);
 
-	  vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
+	  vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1);
 	  error0 = IP6_ERROR_NONE;
 
 	  /* Version != 6?  Drop it. */
diff --git a/src/vnet/ip/ip6_neighbor.c b/src/vnet/ip/ip6_neighbor.c
index 5d1fb6f8..2af546df 100644
--- a/src/vnet/ip/ip6_neighbor.c
+++ b/src/vnet/ip/ip6_neighbor.c
@@ -581,7 +581,7 @@ vnet_set_ip6_ethernet_neighbor (vlib_main_t * vm,
   u32 next_index;
   pending_resolution_t *pr, *mc;
 
-  if (os_get_cpu_number ())
+  if (vlib_get_thread_index ())
     {
       set_unset_ip6_neighbor_rpc (vm, sw_if_index, a, link_layer_address,
 				  1 /* set new neighbor */ , is_static,
@@ -722,7 +722,7 @@ vnet_unset_ip6_ethernet_neighbor (vlib_main_t * vm,
   uword *p;
   int rv = 0;
 
-  if (os_get_cpu_number ())
+  if (vlib_get_thread_index ())
     {
       set_unset_ip6_neighbor_rpc (vm, sw_if_index, a, link_layer_address,
 				  0 /* unset */ , 0, 0);
diff --git a/src/vnet/ipsec/esp.h b/src/vnet/ipsec/esp.h
index 50cac806..799003b9 100644
--- a/src/vnet/ipsec/esp.h
+++ b/src/vnet/ipsec/esp.h
@@ -282,8 +282,8 @@ hmac_calc (ipsec_integ_alg_t alg,
 	   u8 * data, int data_len, u8 * signature, u8 use_esn, u32 seq_hi)
 {
   esp_main_t *em = &esp_main;
-  u32 cpu_index = os_get_cpu_number ();
-  HMAC_CTX *ctx = &(em->per_thread_data[cpu_index].hmac_ctx);
+  u32 thread_index = vlib_get_thread_index ();
+  HMAC_CTX *ctx = &(em->per_thread_data[thread_index].hmac_ctx);
   const EVP_MD *md = NULL;
   unsigned int len;
 
@@ -292,10 +292,10 @@ hmac_calc (ipsec_integ_alg_t alg,
   if (PREDICT_FALSE (em->esp_integ_algs[alg].md == 0))
     return 0;
 
-  if (PREDICT_FALSE (alg != em->per_thread_data[cpu_index].last_integ_alg))
+  if (PREDICT_FALSE (alg != em->per_thread_data[thread_index].last_integ_alg))
     {
       md = em->esp_integ_algs[alg].md;
-      em->per_thread_data[cpu_index].last_integ_alg = alg;
+      em->per_thread_data[thread_index].last_integ_alg = alg;
     }
 
   HMAC_Init (ctx, key, key_len, md);
diff --git a/src/vnet/ipsec/esp_decrypt.c b/src/vnet/ipsec/esp_decrypt.c
index 7289b260..925d2b45 100644
--- a/src/vnet/ipsec/esp_decrypt.c
+++ b/src/vnet/ipsec/esp_decrypt.c
@@ -85,8 +85,8 @@ esp_decrypt_aes_cbc (ipsec_crypto_alg_t alg,
 		     u8 * in, u8 * out, size_t in_len, u8 * key, u8 * iv)
 {
   esp_main_t *em = &esp_main;
-  u32 cpu_index = os_get_cpu_number ();
-  EVP_CIPHER_CTX *ctx = &(em->per_thread_data[cpu_index].decrypt_ctx);
+  u32 thread_index = vlib_get_thread_index ();
+  EVP_CIPHER_CTX *ctx = &(em->per_thread_data[thread_index].decrypt_ctx);
   const EVP_CIPHER *cipher = NULL;
   int out_len;
 
@@ -95,10 +95,11 @@ esp_decrypt_aes_cbc (ipsec_crypto_alg_t alg,
   if (PREDICT_FALSE (em->esp_crypto_algs[alg].type == 0))
     return;
 
-  if (PREDICT_FALSE (alg != em->per_thread_data[cpu_index].last_decrypt_alg))
+  if (PREDICT_FALSE
+      (alg != em->per_thread_data[thread_index].last_decrypt_alg))
     {
       cipher = em->esp_crypto_algs[alg].type;
-      em->per_thread_data[cpu_index].last_decrypt_alg = alg;
+      em->per_thread_data[thread_index].last_decrypt_alg = alg;
     }
 
   EVP_DecryptInit_ex (ctx, cipher, NULL, key, iv);
@@ -117,11 +118,11 @@ esp_decrypt_node_fn (vlib_main_t * vm,
   u32 *recycle = 0;
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   ipsec_alloc_empty_buffers (vm, im);
 
-  u32 *empty_buffers = im->empty_buffers[cpu_index];
+  u32 *empty_buffers = im->empty_buffers[thread_index];
 
   if (PREDICT_FALSE (vec_len (empty_buffers) < n_left_from))
     {
diff --git a/src/vnet/ipsec/esp_encrypt.c b/src/vnet/ipsec/esp_encrypt.c
index 44ae2297..b2bc4e0b 100644
--- a/src/vnet/ipsec/esp_encrypt.c
+++ b/src/vnet/ipsec/esp_encrypt.c
@@ -88,8 +88,8 @@ esp_encrypt_aes_cbc (ipsec_crypto_alg_t alg,
 		     u8 * in, u8 * out, size_t in_len, u8 * key, u8 * iv)
 {
   esp_main_t *em = &esp_main;
-  u32 cpu_index = os_get_cpu_number ();
-  EVP_CIPHER_CTX *ctx = &(em->per_thread_data[cpu_index].encrypt_ctx);
+  u32 thread_index = vlib_get_thread_index ();
+  EVP_CIPHER_CTX *ctx = &(em->per_thread_data[thread_index].encrypt_ctx);
   const EVP_CIPHER *cipher = NULL;
   int out_len;
 
@@ -98,10 +98,11 @@ esp_encrypt_aes_cbc (ipsec_crypto_alg_t alg,
   if (PREDICT_FALSE (em->esp_crypto_algs[alg].type == IPSEC_CRYPTO_ALG_NONE))
     return;
 
-  if (PREDICT_FALSE (alg != em->per_thread_data[cpu_index].last_encrypt_alg))
+  if (PREDICT_FALSE
+      (alg != em->per_thread_data[thread_index].last_encrypt_alg))
     {
       cipher = em->esp_crypto_algs[alg].type;
-      em->per_thread_data[cpu_index].last_encrypt_alg = alg;
+      em->per_thread_data[thread_index].last_encrypt_alg = alg;
     }
 
   EVP_EncryptInit_ex (ctx, cipher, NULL, key, iv);
@@ -119,11 +120,11 @@ esp_encrypt_node_fn (vlib_main_t * vm,
   n_left_from = from_frame->n_vectors;
   ipsec_main_t *im = &ipsec_main;
   u32 *recycle = 0;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   ipsec_alloc_empty_buffers (vm, im);
 
-  u32 *empty_buffers = im->empty_buffers[cpu_index];
+  u32 *empty_buffers = im->empty_buffers[thread_index];
 
   if (PREDICT_FALSE (vec_len (empty_buffers) < n_left_from))
     {
diff --git a/src/vnet/ipsec/ikev2.c b/src/vnet/ipsec/ikev2.c
index 2c1074d8..3f9978a7 100644
--- a/src/vnet/ipsec/ikev2.c
+++ b/src/vnet/ipsec/ikev2.c
@@ -303,16 +303,16 @@ static void
 ikev2_delete_sa (ikev2_sa_t * sa)
 {
   ikev2_main_t *km = &ikev2_main;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   uword *p;
 
   ikev2_sa_free_all_vec (sa);
 
-  p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi, sa->rspi);
+  p = hash_get (km->per_thread_data[thread_index].sa_by_rspi, sa->rspi);
   if (p)
     {
-      hash_unset (km->per_thread_data[cpu_index].sa_by_rspi, sa->rspi);
-      pool_put (km->per_thread_data[cpu_index].sas, sa);
+      hash_unset (km->per_thread_data[thread_index].sa_by_rspi, sa->rspi);
+      pool_put (km->per_thread_data[thread_index].sas, sa);
     }
 }
 
@@ -776,29 +776,31 @@ ikev2_initial_contact_cleanup (ikev2_sa_t * sa)
   ikev2_sa_t *tmp;
   u32 i, *delete = 0;
   ikev2_child_sa_t *c;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   if (!sa->initial_contact)
     return;
 
   /* find old IKE SAs with the same authenticated identity */
   /* *INDENT-OFF* */
-  pool_foreach (tmp, km->per_thread_data[cpu_index].sas, ({
+  pool_foreach (tmp, km->per_thread_data[thread_index].sas, ({
         if (tmp->i_id.type != sa->i_id.type ||
             vec_len(tmp->i_id.data) != vec_len(sa->i_id.data) ||
             memcmp(sa->i_id.data, tmp->i_id.data, vec_len(sa->i_id.data)))
           continue;
 
         if (sa->rspi != tmp->rspi)
-          vec_add1(delete, tmp - km->per_thread_data[cpu_index].sas);
+          vec_add1(delete, tmp - km->per_thread_data[thread_index].sas);
   }));
   /* *INDENT-ON* */
 
   for (i = 0; i < vec_len (delete); i++)
     {
-      tmp = pool_elt_at_index (km->per_thread_data[cpu_index].sas, delete[i]);
-      vec_foreach (c, tmp->childs)
-	ikev2_delete_tunnel_interface (km->vnet_main, tmp, c);
+      tmp =
+	pool_elt_at_index (km->per_thread_data[thread_index].sas, delete[i]);
+      vec_foreach (c,
+		   tmp->childs) ikev2_delete_tunnel_interface (km->vnet_main,
+							       tmp, c);
       ikev2_delete_sa (tmp);
     }
 
@@ -1922,10 +1924,10 @@ ikev2_retransmit_sa_init (ike_header_t * ike,
 {
   ikev2_main_t *km = &ikev2_main;
   ikev2_sa_t *sa;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   /* *INDENT-OFF* */
-  pool_foreach (sa, km->per_thread_data[cpu_index].sas, ({
+  pool_foreach (sa, km->per_thread_data[thread_index].sas, ({
     if (sa->ispi == clib_net_to_host_u64(ike->ispi) &&
         sa->iaddr.as_u32 == iaddr.as_u32 &&
         sa->raddr.as_u32 == raddr.as_u32)
@@ -2036,7 +2038,7 @@ ikev2_node_fn (vlib_main_t * vm,
   u32 n_left_from, *from, *to_next;
   ikev2_next_t next_index;
   ikev2_main_t *km = &ikev2_main;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
@@ -2134,11 +2136,14 @@ ikev2_node_fn (vlib_main_t * vm,
 		      if (sa0->state == IKEV2_STATE_SA_INIT)
 			{
 			  /* add SA to the pool */
-			  pool_get (km->per_thread_data[cpu_index].sas, sa0);
+			  pool_get (km->per_thread_data[thread_index].sas,
+				    sa0);
 			  clib_memcpy (sa0, &sa, sizeof (*sa0));
-			  hash_set (km->per_thread_data[cpu_index].sa_by_rspi,
+			  hash_set (km->
+				    per_thread_data[thread_index].sa_by_rspi,
 				    sa0->rspi,
-				    sa0 - km->per_thread_data[cpu_index].sas);
+				    sa0 -
+				    km->per_thread_data[thread_index].sas);
 			}
 		      else
 			{
@@ -2169,11 +2174,11 @@ ikev2_node_fn (vlib_main_t * vm,
 		  if (sa0->state == IKEV2_STATE_SA_INIT)
 		    {
 		      /* add SA to the pool */
-		      pool_get (km->per_thread_data[cpu_index].sas, sa0);
+		      pool_get (km->per_thread_data[thread_index].sas, sa0);
 		      clib_memcpy (sa0, &sa, sizeof (*sa0));
-		      hash_set (km->per_thread_data[cpu_index].sa_by_rspi,
+		      hash_set (km->per_thread_data[thread_index].sa_by_rspi,
 				sa0->rspi,
-				sa0 - km->per_thread_data[cpu_index].sas);
+				sa0 - km->per_thread_data[thread_index].sas);
 		    }
 		  else
 		    {
@@ -2184,12 +2189,13 @@ ikev2_node_fn (vlib_main_t * vm,
 	  else if (ike0->exchange == IKEV2_EXCHANGE_IKE_AUTH)
 	    {
 	      uword *p;
-	      p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi,
+	      p = hash_get (km->per_thread_data[thread_index].sa_by_rspi,
 			    clib_net_to_host_u64 (ike0->rspi));
 	      if (p)
 		{
-		  sa0 = pool_elt_at_index (km->per_thread_data[cpu_index].sas,
-					   p[0]);
+		  sa0 =
+		    pool_elt_at_index (km->per_thread_data[thread_index].sas,
+				       p[0]);
 
 		  r = ikev2_retransmit_resp (sa0, ike0);
 		  if (r == 1)
@@ -2240,12 +2246,13 @@ ikev2_node_fn (vlib_main_t * vm,
 	  else if (ike0->exchange == IKEV2_EXCHANGE_INFORMATIONAL)
 	    {
 	      uword *p;
-	      p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi,
+	      p = hash_get (km->per_thread_data[thread_index].sa_by_rspi,
 			    clib_net_to_host_u64 (ike0->rspi));
 	      if (p)
 		{
-		  sa0 = pool_elt_at_index (km->per_thread_data[cpu_index].sas,
-					   p[0]);
+		  sa0 =
+		    pool_elt_at_index (km->per_thread_data[thread_index].sas,
+				       p[0]);
 
 		  r = ikev2_retransmit_resp (sa0, ike0);
 		  if (r == 1)
@@ -2305,12 +2312,13 @@ ikev2_node_fn (vlib_main_t * vm,
 	  else if (ike0->exchange == IKEV2_EXCHANGE_CREATE_CHILD_SA)
 	    {
 	      uword *p;
-	      p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi,
+	      p = hash_get (km->per_thread_data[thread_index].sa_by_rspi,
 			    clib_net_to_host_u64 (ike0->rspi));
 	      if (p)
 		{
-		  sa0 = pool_elt_at_index (km->per_thread_data[cpu_index].sas,
-					   p[0]);
+		  sa0 =
+		    pool_elt_at_index (km->per_thread_data[thread_index].sas,
+				       p[0]);
 
 		  r = ikev2_retransmit_resp (sa0, ike0);
 		  if (r == 1)
diff --git a/src/vnet/ipsec/ipsec.h b/src/vnet/ipsec/ipsec.h
index 58f0f145..c884e360 100644
--- a/src/vnet/ipsec/ipsec.h
+++ b/src/vnet/ipsec/ipsec.h
@@ -324,21 +324,21 @@ int ipsec_set_interface_key (vnet_main_t * vnm, u32 hw_if_index,
 always_inline void
 ipsec_alloc_empty_buffers (vlib_main_t * vm, ipsec_main_t * im)
 {
-  u32 cpu_index = os_get_cpu_number ();
-  uword l = vec_len (im->empty_buffers[cpu_index]);
+  u32 thread_index = vlib_get_thread_index ();
+  uword l = vec_len (im->empty_buffers[thread_index]);
   uword n_alloc = 0;
 
   if (PREDICT_FALSE (l < VLIB_FRAME_SIZE))
     {
-      if (!im->empty_buffers[cpu_index])
+      if (!im->empty_buffers[thread_index])
 	{
-	  vec_alloc (im->empty_buffers[cpu_index], 2 * VLIB_FRAME_SIZE);
+	  vec_alloc (im->empty_buffers[thread_index], 2 * VLIB_FRAME_SIZE);
 	}
 
-      n_alloc = vlib_buffer_alloc (vm, im->empty_buffers[cpu_index] + l,
+      n_alloc = vlib_buffer_alloc (vm, im->empty_buffers[thread_index] + l,
 				   2 * VLIB_FRAME_SIZE - l);
 
-      _vec_len (im->empty_buffers[cpu_index]) = l + n_alloc;
+      _vec_len (im->empty_buffers[thread_index]) = l + n_alloc;
     }
 }
 
diff --git a/src/vnet/ipsec/ipsec_if.c b/src/vnet/ipsec/ipsec_if.c
index dc882004..ed124894 100644
--- a/src/vnet/ipsec/ipsec_if.c
+++ b/src/vnet/ipsec/ipsec_if.c
@@ -99,7 +99,7 @@ static int
 ipsec_add_del_tunnel_if_rpc_callback (ipsec_add_del_tunnel_args_t * a)
 {
   vnet_main_t *vnm = vnet_get_main ();
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   return ipsec_add_del_tunnel_if_internal (vnm, a);
 }
diff --git a/src/vnet/l2/l2_bvi.h b/src/vnet/l2/l2_bvi.h
index dd1130a6..e21a1616 100644
--- a/src/vnet/l2/l2_bvi.h
+++ b/src/vnet/l2/l2_bvi.h
@@ -97,7 +97,7 @@ l2_to_bvi (vlib_main_t * vlib_main,
   vlib_increment_combined_counter
     (vnet_main->interface_main.combined_sw_if_counters
      + VNET_INTERFACE_COUNTER_RX,
-     vlib_main->cpu_index,
+     vlib_main->thread_index,
      vnet_buffer (b0)->sw_if_index[VLIB_RX],
      1, vlib_buffer_length_in_chain (vlib_main, b0));
   return TO_BVI_ERR_OK;
diff --git a/src/vnet/l2/l2_input.c b/src/vnet/l2/l2_input.c
index 041ff38d..e5d6878a 100644
--- a/src/vnet/l2/l2_input.c
+++ b/src/vnet/l2/l2_input.c
@@ -117,7 +117,7 @@ typedef enum
 static_always_inline void
 classify_and_dispatch (vlib_main_t * vm,
 		       vlib_node_runtime_t * node,
-		       u32 cpu_index,
+		       u32 thread_index,
 		       l2input_main_t * msm, vlib_buffer_t * b0, u32 * next0)
 {
   /*
@@ -237,7 +237,7 @@ l2input_node_inline (vlib_main_t * vm,
   u32 n_left_from, *from, *to_next;
   l2input_next_t next_index;
   l2input_main_t *msm = &l2input_main;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;	/* number of packets to process */
@@ -350,10 +350,10 @@ l2input_node_inline (vlib_main_t * vm,
 	  vlib_node_increment_counter (vm, l2input_node.index,
 				       L2INPUT_ERROR_L2INPUT, 4);
 
-	  classify_and_dispatch (vm, node, cpu_index, msm, b0, &next0);
-	  classify_and_dispatch (vm, node, cpu_index, msm, b1, &next1);
-	  classify_and_dispatch (vm, node, cpu_index, msm, b2, &next2);
-	  classify_and_dispatch (vm, node, cpu_index, msm, b3, &next3);
+	  classify_and_dispatch (vm, node, thread_index, msm, b0, &next0);
+	  classify_and_dispatch (vm, node, thread_index, msm, b1, &next1);
+	  classify_and_dispatch (vm, node, thread_index, msm, b2, &next2);
+	  classify_and_dispatch (vm, node, thread_index, msm, b3, &next3);
 
 	  /* verify speculative enqueues, maybe switch current next frame */
 	  /* if next0==next1==next_index then nothing special needs to be done */
@@ -393,7 +393,7 @@ l2input_node_inline (vlib_main_t * vm,
 	  vlib_node_increment_counter (vm, l2input_node.index,
 				       L2INPUT_ERROR_L2INPUT, 1);
 
-	  classify_and_dispatch (vm, node, cpu_index, msm, b0, &next0);
+	  classify_and_dispatch (vm, node, thread_index, msm, b0, &next0);
 
 	  /* verify speculative enqueue, maybe switch current next frame */
 	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
diff --git a/src/vnet/l2/l2_output.c b/src/vnet/l2/l2_output.c
index 00f22571..e17b2a16 100644
--- a/src/vnet/l2/l2_output.c
+++ b/src/vnet/l2/l2_output.c
@@ -643,11 +643,11 @@ l2output_create_output_node_mapping (vlib_main_t * vlib_main, vnet_main_t * vnet
 
   hw0 = vnet_get_sup_hw_interface (vnet_main, sw_if_index);
 
-  uword cpu_number;
+  uword thread_index;
 
-  cpu_number = os_get_cpu_number ();
+  thread_index = vlib_get_thread_index ();
 
-  if (cpu_number)
+  if (thread_index)
     {
       u32 oldflags;
 
diff --git a/src/vnet/l2tp/decap.c b/src/vnet/l2tp/decap.c
index e8986935..46104129 100644
--- a/src/vnet/l2tp/decap.c
+++ b/src/vnet/l2tp/decap.c
@@ -149,7 +149,7 @@ last_stage (vlib_main_t * vm, vlib_node_runtime_t * node, u32 bi)
 
   /* per-mapping byte stats include the ethernet header */
   vlib_increment_combined_counter (&lm->counter_main,
-				   os_get_cpu_number (),
+				   vlib_get_thread_index (),
 				   counter_index, 1 /* packet_increment */ ,
 				   vlib_buffer_length_in_chain (vm, b) +
 				   sizeof (ethernet_header_t));
diff --git a/src/vnet/l2tp/encap.c b/src/vnet/l2tp/encap.c
index ed7a9580..dcdfde4b 100644
--- a/src/vnet/l2tp/encap.c
+++ b/src/vnet/l2tp/encap.c
@@ -124,7 +124,7 @@ last_stage (vlib_main_t * vm, vlib_node_runtime_t * node, u32 bi)
 
   /* per-mapping byte stats include the ethernet header */
   vlib_increment_combined_counter (&lm->counter_main,
-				   os_get_cpu_number (),
+				   vlib_get_thread_index (),
 				   counter_index, 1 /* packet_increment */ ,
 				   vlib_buffer_length_in_chain (vm, b));
 
diff --git a/src/vnet/l2tp/l2tp.c b/src/vnet/l2tp/l2tp.c
index cb94d7e7..3dedc447 100644
--- a/src/vnet/l2tp/l2tp.c
+++ b/src/vnet/l2tp/l2tp.c
@@ -157,7 +157,7 @@ test_counters_command_fn (vlib_main_t * vm,
   u32 session_index;
   u32 counter_index;
   u32 nincr = 0;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   /* *INDENT-OFF* */
   pool_foreach (session, lm->sessions,
@@ -167,11 +167,11 @@ test_counters_command_fn (vlib_main_t * vm,
       session_index_to_counter_index (session_index,
                                       SESSION_COUNTER_USER_TO_NETWORK);
     vlib_increment_combined_counter (&lm->counter_main,
-                                     cpu_index,
+                                     thread_index,
                                      counter_index,
                                      1/*pkt*/, 1111 /*bytes*/);
     vlib_increment_combined_counter (&lm->counter_main,
-                                     cpu_index,
+                                     thread_index,
                                      counter_index+1,
                                      1/*pkt*/, 2222 /*bytes*/);
     nincr++;
diff --git a/src/vnet/lisp-gpe/decap.c b/src/vnet/lisp-gpe/decap.c
index d887a95f..68769710 100644
--- a/src/vnet/lisp-gpe/decap.c
+++ b/src/vnet/lisp-gpe/decap.c
@@ -103,7 +103,7 @@ next_index_to_iface (lisp_gpe_main_t * lgm, u32 next_index)
 }
 
 static_always_inline void
-incr_decap_stats (vnet_main_t * vnm, u32 cpu_index, u32 length,
+incr_decap_stats (vnet_main_t * vnm, u32 thread_index, u32 length,
 		  u32 sw_if_index, u32 * last_sw_if_index, u32 * n_packets,
 		  u32 * n_bytes)
 {
@@ -122,7 +122,7 @@ incr_decap_stats (vnet_main_t * vnm, u32 cpu_index, u32 length,
 
 	  vlib_increment_combined_counter (im->combined_sw_if_counters +
 					   VNET_INTERFACE_COUNTER_RX,
-					   cpu_index, *last_sw_if_index,
+					   thread_index, *last_sw_if_index,
 					   *n_packets, *n_bytes);
 	}
       *last_sw_if_index = sw_if_index;
@@ -150,11 +150,11 @@ static uword
 lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 		       vlib_frame_t * from_frame, u8 is_v4)
 {
-  u32 n_left_from, next_index, *from, *to_next, cpu_index;
+  u32 n_left_from, next_index, *from, *to_next, thread_index;
   u32 n_bytes = 0, n_packets = 0, last_sw_if_index = ~0, drops = 0;
   lisp_gpe_main_t *lgm = vnet_lisp_gpe_get_main ();
 
-  cpu_index = os_get_cpu_number ();
+  thread_index = vlib_get_thread_index ();
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
 
@@ -267,7 +267,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
 	  if (si0)
 	    {
-	      incr_decap_stats (lgm->vnet_main, cpu_index,
+	      incr_decap_stats (lgm->vnet_main, thread_index,
 				vlib_buffer_length_in_chain (vm, b0), si0[0],
 				&last_sw_if_index, &n_packets, &n_bytes);
 	      vnet_buffer (b0)->sw_if_index[VLIB_RX] = si0[0];
@@ -282,7 +282,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
 	  if (si1)
 	    {
-	      incr_decap_stats (lgm->vnet_main, cpu_index,
+	      incr_decap_stats (lgm->vnet_main, thread_index,
 				vlib_buffer_length_in_chain (vm, b1), si1[0],
 				&last_sw_if_index, &n_packets, &n_bytes);
 	      vnet_buffer (b1)->sw_if_index[VLIB_RX] = si1[0];
@@ -397,7 +397,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
 	  if (si0)
 	    {
-	      incr_decap_stats (lgm->vnet_main, cpu_index,
+	      incr_decap_stats (lgm->vnet_main, thread_index,
 				vlib_buffer_length_in_chain (vm, b0), si0[0],
 				&last_sw_if_index, &n_packets, &n_bytes);
 	      vnet_buffer (b0)->sw_if_index[VLIB_RX] = si0[0];
@@ -430,7 +430,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
     }
 
   /* flush iface stats */
-  incr_decap_stats (lgm->vnet_main, cpu_index, 0, ~0, &last_sw_if_index,
+  incr_decap_stats (lgm->vnet_main, thread_index, 0, ~0, &last_sw_if_index,
 		    &n_packets, &n_bytes);
   vlib_node_increment_counter (vm, lisp_gpe_ip4_input_node.index,
 			       LISP_GPE_ERROR_NO_TUNNEL, drops);
diff --git a/src/vnet/lldp/lldp_input.c b/src/vnet/lldp/lldp_input.c
index 762743d0..e88f6fdb 100644
--- a/src/vnet/lldp/lldp_input.c
+++ b/src/vnet/lldp/lldp_input.c
@@ -35,7 +35,7 @@ typedef struct
 static void
 lldp_rpc_update_peer_cb (const lldp_intf_update_t * a)
 {
-  ASSERT (os_get_cpu_number () == 0);
+  ASSERT (vlib_get_thread_index () == 0);
 
   lldp_intf_t *n = lldp_get_intf (&lldp_main, a->hw_if_index);
   if (!n)
diff --git a/src/vnet/map/ip4_map.c b/src/vnet/map/ip4_map.c
index 1a20d704..e39b6f14 100644
--- a/src/vnet/map/ip4_map.c
+++ b/src/vnet/map/ip4_map.c
@@ -248,7 +248,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
   next_index = node->cached_next_index;
   map_main_t *mm = &map_main;
   vlib_combined_counter_main_t *cm = mm->domain_counters;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   while (n_left_from > 0)
     {
@@ -377,7 +377,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
 					       ip40) ?
 		    IP4_MAP_NEXT_IP6_REWRITE : next0;
 		  vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX,
-						   cpu_index,
+						   thread_index,
 						   map_domain_index0, 1,
 						   clib_net_to_host_u16
 						   (ip6h0->payload_length) +
@@ -409,7 +409,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
 					       ip41) ?
 		    IP4_MAP_NEXT_IP6_REWRITE : next1;
 		  vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX,
-						   cpu_index,
+						   thread_index,
 						   map_domain_index1, 1,
 						   clib_net_to_host_u16
 						   (ip6h1->payload_length) +
@@ -520,7 +520,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
 					       ip40) ?
 		    IP4_MAP_NEXT_IP6_REWRITE : next0;
 		  vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX,
-						   cpu_index,
+						   thread_index,
 						   map_domain_index0, 1,
 						   clib_net_to_host_u16
 						   (ip6h0->payload_length) +
@@ -564,7 +564,7 @@ ip4_map_reass (vlib_main_t * vm,
   next_index = node->cached_next_index;
   map_main_t *mm = &map_main;
   vlib_combined_counter_main_t *cm = mm->domain_counters;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   u32 *fragments_to_drop = NULL;
   u32 *fragments_to_loopback = NULL;
 
@@ -694,8 +694,8 @@ ip4_map_reass (vlib_main_t * vm,
 	    {
 	      if (error0 == MAP_ERROR_NONE)
 		vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX,
-						 cpu_index, map_domain_index0,
-						 1,
+						 thread_index,
+						 map_domain_index0, 1,
 						 clib_net_to_host_u16
 						 (ip60->payload_length) + 40);
 	      next0 =
diff --git a/src/vnet/map/ip4_map_t.c b/src/vnet/map/ip4_map_t.c
index b63d76bf..5f2bcbf9 100644
--- a/src/vnet/map/ip4_map_t.c
+++ b/src/vnet/map/ip4_map_t.c
@@ -477,7 +477,7 @@ ip4_map_t_icmp (vlib_main_t * vm,
   n_left_from = frame->n_vectors;
   next_index = node->cached_next_index;
   vlib_combined_counter_main_t *cm = map_main.domain_counters;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   while (n_left_from > 0)
     {
@@ -520,7 +520,7 @@ ip4_map_t_icmp (vlib_main_t * vm,
 	  if (PREDICT_TRUE (error0 == MAP_ERROR_NONE))
 	    {
 	      vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX,
-					       cpu_index,
+					       thread_index,
 					       vnet_buffer (p0)->map_t.
 					       map_domain_index, 1, len0);
 	    }
@@ -1051,7 +1051,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
   n_left_from = frame->n_vectors;
   next_index = node->cached_next_index;
   vlib_combined_counter_main_t *cm = map_main.domain_counters;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   while (n_left_from > 0)
     {
@@ -1158,7 +1158,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
 	      (error0 == MAP_ERROR_NONE && next0 != IP4_MAPT_NEXT_MAPT_ICMP))
 	    {
 	      vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX,
-					       cpu_index,
+					       thread_index,
 					       vnet_buffer (p0)->map_t.
 					       map_domain_index, 1,
 					       clib_net_to_host_u16 (ip40->
@@ -1169,7 +1169,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
 	      (error1 == MAP_ERROR_NONE && next1 != IP4_MAPT_NEXT_MAPT_ICMP))
 	    {
 	      vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX,
-					       cpu_index,
+					       thread_index,
 					       vnet_buffer (p1)->map_t.
 					       map_domain_index, 1,
 					       clib_net_to_host_u16 (ip41->
@@ -1252,7 +1252,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
 	      (error0 == MAP_ERROR_NONE && next0 != IP4_MAPT_NEXT_MAPT_ICMP))
 	    {
 	      vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX,
-					       cpu_index,
+					       thread_index,
 					       vnet_buffer (p0)->map_t.
 					       map_domain_index, 1,
 					       clib_net_to_host_u16 (ip40->
diff --git a/src/vnet/map/ip6_map.c b/src/vnet/map/ip6_map.c
index f7eb768f..63ada962 100644
--- a/src/vnet/map/ip6_map.c
+++ b/src/vnet/map/ip6_map.c
@@ -172,7 +172,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
     vlib_node_get_runtime (vm, ip6_map_node.index);
   map_main_t *mm = &map_main;
   vlib_combined_counter_main_t *cm = mm->domain_counters;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
@@ -319,7 +319,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
 			IP6_MAP_NEXT_IP4_REWRITE : next0;
 		    }
 		  vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX,
-						   cpu_index,
+						   thread_index,
 						   map_domain_index0, 1,
 						   clib_net_to_host_u16
 						   (ip40->length));
@@ -352,7 +352,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
 			IP6_MAP_NEXT_IP4_REWRITE : next1;
 		    }
 		  vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX,
-						   cpu_index,
+						   thread_index,
 						   map_domain_index1, 1,
 						   clib_net_to_host_u16
 						   (ip41->length));
@@ -505,7 +505,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
 			IP6_MAP_NEXT_IP4_REWRITE : next0;
 		    }
 		  vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX,
-						   cpu_index,
+						   thread_index,
 						   map_domain_index0, 1,
 						   clib_net_to_host_u16
 						   (ip40->length));
@@ -820,7 +820,7 @@ ip6_map_ip4_reass (vlib_main_t * vm,
     vlib_node_get_runtime (vm, ip6_map_ip4_reass_node.index);
   map_main_t *mm = &map_main;
   vlib_combined_counter_main_t *cm = mm->domain_counters;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   u32 *fragments_to_drop = NULL;
   u32 *fragments_to_loopback = NULL;
 
@@ -958,8 +958,8 @@ ip6_map_ip4_reass (vlib_main_t * vm,
 	    {
 	      if (error0 == MAP_ERROR_NONE)
 		vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX,
-						 cpu_index, map_domain_index0,
-						 1,
+						 thread_index,
+						 map_domain_index0, 1,
 						 clib_net_to_host_u16
 						 (ip40->length));
 	      next0 =
@@ -1015,7 +1015,7 @@ ip6_map_icmp_relay (vlib_main_t * vm,
   vlib_node_runtime_t *error_node =
     vlib_node_get_runtime (vm, ip6_map_icmp_relay_node.index);
   map_main_t *mm = &map_main;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   u16 *fragment_ids, *fid;
 
   from = vlib_frame_vector_args (frame);
@@ -1143,7 +1143,8 @@ ip6_map_icmp_relay (vlib_main_t * vm,
 	  ip_csum_t sum = ip_incremental_checksum (0, new_icmp40, nlen - 20);
 	  new_icmp40->checksum = ~ip_csum_fold (sum);
 
-	  vlib_increment_simple_counter (&mm->icmp_relayed, cpu_index, 0, 1);
+	  vlib_increment_simple_counter (&mm->icmp_relayed, thread_index, 0,
+					 1);
 
 	error:
 	  if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED))
diff --git a/src/vnet/map/ip6_map_t.c b/src/vnet/map/ip6_map_t.c
index eb3996c2..99151678 100644
--- a/src/vnet/map/ip6_map_t.c
+++ b/src/vnet/map/ip6_map_t.c
@@ -448,7 +448,7 @@ ip6_map_t_icmp (vlib_main_t * vm,
   n_left_from = frame->n_vectors;
   next_index = node->cached_next_index;
   vlib_combined_counter_main_t *cm = map_main.domain_counters;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   while (n_left_from > 0)
     {
@@ -493,7 +493,7 @@ ip6_map_t_icmp (vlib_main_t * vm,
 	  if (PREDICT_TRUE (error0 == MAP_ERROR_NONE))
 	    {
 	      vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX,
-					       cpu_index,
+					       thread_index,
 					       vnet_buffer (p0)->
 					       map_t.map_domain_index, 1,
 					       len0);
@@ -1051,7 +1051,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
   vlib_node_runtime_t *error_node =
     vlib_node_get_runtime (vm, ip6_map_t_node.index);
   vlib_combined_counter_main_t *cm = map_main.domain_counters;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
@@ -1218,7 +1218,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
 	      (error0 == MAP_ERROR_NONE && next0 != IP6_MAPT_NEXT_MAPT_ICMP))
 	    {
 	      vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX,
-					       cpu_index,
+					       thread_index,
 					       vnet_buffer (p0)->
 					       map_t.map_domain_index, 1,
 					       clib_net_to_host_u16
@@ -1229,7 +1229,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
 	      (error1 == MAP_ERROR_NONE && next1 != IP6_MAPT_NEXT_MAPT_ICMP))
 	    {
 	      vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX,
-					       cpu_index,
+					       thread_index,
 					       vnet_buffer (p1)->
 					       map_t.map_domain_index, 1,
 					       clib_net_to_host_u16
@@ -1403,7 +1403,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame)
 	      (error0 == MAP_ERROR_NONE && next0 != IP6_MAPT_NEXT_MAPT_ICMP))
 	    {
 	      vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX,
-					       cpu_index,
+					       thread_index,
 					       vnet_buffer (p0)->
 					       map_t.map_domain_index, 1,
 					       clib_net_to_host_u16
diff --git a/src/vnet/mpls/mpls_input.c b/src/vnet/mpls/mpls_input.c
index 893c4511..1b9bdd05 100644
--- a/src/vnet/mpls/mpls_input.c
+++ b/src/vnet/mpls/mpls_input.c
@@ -76,7 +76,7 @@ mpls_input_inline (vlib_main_t * vm,
   u32 n_left_from, next_index, * from, * to_next;
   mpls_input_runtime_t * rt;
   mpls_main_t * mm;
-  u32 cpu_index = os_get_cpu_number();
+  u32 thread_index = vlib_get_thread_index();
   vlib_simple_counter_main_t * cm;
   vnet_main_t * vnm = vnet_get_main();
 
@@ -151,7 +151,7 @@ mpls_input_inline (vlib_main_t * vm,
               next0 = MPLS_INPUT_NEXT_LOOKUP;
               vnet_feature_arc_start(mm->input_feature_arc_index,
                                      sw_if_index0, &next0, b0);
-              vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
+              vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1);
           }
 
           if (PREDICT_FALSE(h1[3] == 0))
@@ -164,7 +164,7 @@ mpls_input_inline (vlib_main_t * vm,
               next1 = MPLS_INPUT_NEXT_LOOKUP;
               vnet_feature_arc_start(mm->input_feature_arc_index,
                                      sw_if_index1, &next1, b1);
-              vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1);
+              vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1);
           }
 
           if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
@@ -215,7 +215,7 @@ mpls_input_inline (vlib_main_t * vm,
             {
               next0 = MPLS_INPUT_NEXT_LOOKUP;
 	      vnet_feature_arc_start(mm->input_feature_arc_index, sw_if_index0, &next0, b0);
-              vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
+              vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1);
             }
 
           if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) 
diff --git a/src/vnet/mpls/mpls_lookup.c b/src/vnet/mpls/mpls_lookup.c
index 475bb204..ace6a70f 100644
--- a/src/vnet/mpls/mpls_lookup.c
+++ b/src/vnet/mpls/mpls_lookup.c
@@ -67,7 +67,7 @@ mpls_lookup (vlib_main_t * vm,
   vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters;
   u32 n_left_from, next_index, * from, * to_next;
   mpls_main_t * mm = &mpls_main;
-  u32 cpu_index = os_get_cpu_number();
+  u32 thread_index = vlib_get_thread_index();
 
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
@@ -220,16 +220,16 @@ mpls_lookup (vlib_main_t * vm,
           vnet_buffer (b3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index;
 
           vlib_increment_combined_counter
-              (cm, cpu_index, lbi0, 1,
+              (cm, thread_index, lbi0, 1,
                vlib_buffer_length_in_chain (vm, b0));
           vlib_increment_combined_counter
-              (cm, cpu_index, lbi1, 1,
+              (cm, thread_index, lbi1, 1,
                vlib_buffer_length_in_chain (vm, b1));
           vlib_increment_combined_counter
-              (cm, cpu_index, lbi2, 1,
+              (cm, thread_index, lbi2, 1,
                vlib_buffer_length_in_chain (vm, b2));
           vlib_increment_combined_counter
-              (cm, cpu_index, lbi3, 1,
+              (cm, thread_index, lbi3, 1,
                vlib_buffer_length_in_chain (vm, b3));
 
           /*
@@ -351,7 +351,7 @@ mpls_lookup (vlib_main_t * vm,
           vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
 
           vlib_increment_combined_counter
-              (cm, cpu_index, lbi0, 1,
+              (cm, thread_index, lbi0, 1,
                vlib_buffer_length_in_chain (vm, b0));
 
           /*
@@ -440,7 +440,7 @@ mpls_load_balance (vlib_main_t * vm,
 {
   vlib_combined_counter_main_t * cm = &load_balance_main.lbm_via_counters;
   u32 n_left_from, n_left_to_next, * from, * to_next;
-  u32 cpu_index = os_get_cpu_number();
+  u32 thread_index = vlib_get_thread_index();
   u32 next;
 
   from = vlib_frame_vector_args (frame);
@@ -536,10 +536,10 @@ mpls_load_balance (vlib_main_t * vm,
           vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index;
 
           vlib_increment_combined_counter
-              (cm, cpu_index, lbi0, 1,
+              (cm, thread_index, lbi0, 1,
                vlib_buffer_length_in_chain (vm, p0));
           vlib_increment_combined_counter
-              (cm, cpu_index, lbi1, 1,
+              (cm, thread_index, lbi1, 1,
                vlib_buffer_length_in_chain (vm, p1));
 
           if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED))
@@ -597,7 +597,7 @@ mpls_load_balance (vlib_main_t * vm,
           vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
 
           vlib_increment_combined_counter
-              (cm, cpu_index, lbi0, 1,
+              (cm, thread_index, lbi0, 1,
                vlib_buffer_length_in_chain (vm, p0));
 
           vlib_validate_buffer_enqueue_x1 (vm, node, next,
diff --git a/src/vnet/mpls/mpls_output.c b/src/vnet/mpls/mpls_output.c
index 08018fd1..d90dec21 100644
--- a/src/vnet/mpls/mpls_output.c
+++ b/src/vnet/mpls/mpls_output.c
@@ -64,12 +64,12 @@ mpls_output_inline (vlib_main_t * vm,
                     vlib_frame_t * from_frame,
 		    int is_midchain)
 {
-  u32 n_left_from, next_index, * from, * to_next, cpu_index;
+  u32 n_left_from, next_index, * from, * to_next, thread_index;
   vlib_node_runtime_t * error_node;
   u32 n_left_to_next;
   mpls_main_t *mm;
 
-  cpu_index = os_get_cpu_number();
+  thread_index = vlib_get_thread_index();
   error_node = vlib_node_get_runtime (vm, mpls_output_node.index);
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
@@ -137,13 +137,13 @@ mpls_output_inline (vlib_main_t * vm,
           /* Bump the adj counters for packet and bytes */
           vlib_increment_combined_counter
               (&adjacency_counters,
-               cpu_index,
+               thread_index,
                adj_index0,
                1,
                vlib_buffer_length_in_chain (vm, p0) + rw_len0);
           vlib_increment_combined_counter
               (&adjacency_counters,
-               cpu_index,
+               thread_index,
                adj_index1,
                1,
                vlib_buffer_length_in_chain (vm, p1) + rw_len1);
@@ -245,7 +245,7 @@ mpls_output_inline (vlib_main_t * vm,
           
           vlib_increment_combined_counter
               (&adjacency_counters,
-               cpu_index,
+               thread_index,
                adj_index0,
                1,
                vlib_buffer_length_in_chain (vm, p0) + rw_len0);
diff --git a/src/vnet/pg/input.c b/src/vnet/pg/input.c
index 2649798b..597ae060 100644
--- a/src/vnet/pg/input.c
+++ b/src/vnet/pg/input.c
@@ -893,7 +893,7 @@ pg_generate_set_lengths (pg_main_t * pg,
 
     vlib_increment_combined_counter (im->combined_sw_if_counters
 				     + VNET_INTERFACE_COUNTER_RX,
-				     os_get_cpu_number (),
+				     vlib_get_thread_index (),
 				     si->sw_if_index, n_buffers, length_sum);
   }
 
@@ -1266,7 +1266,7 @@ pg_stream_fill_helper (pg_main_t * pg,
 	    l += vlib_buffer_index_length_in_chain (vm, buffers[i]);
 	  vlib_increment_combined_counter (im->combined_sw_if_counters
 					   + VNET_INTERFACE_COUNTER_RX,
-					   os_get_cpu_number (),
+					   vlib_get_thread_index (),
 					   si->sw_if_index, n_alloc, l);
 	  s->current_replay_packet_index += n_alloc;
 	  s->current_replay_packet_index %=
diff --git a/src/vnet/replication.c b/src/vnet/replication.c
index 86d922b5..233a8c2f 100644
--- a/src/vnet/replication.c
+++ b/src/vnet/replication.c
@@ -31,16 +31,16 @@ replication_prep (vlib_main_t * vm,
 {
   replication_main_t *rm = &replication_main;
   replication_context_t *ctx;
-  uword cpu_number = vm->cpu_index;
+  uword thread_index = vm->thread_index;
   ip4_header_t *ip;
   u32 ctx_id;
 
   /* Allocate a context, reserve context 0 */
-  if (PREDICT_FALSE (rm->contexts[cpu_number] == 0))
-    pool_get_aligned (rm->contexts[cpu_number], ctx, CLIB_CACHE_LINE_BYTES);
+  if (PREDICT_FALSE (rm->contexts[thread_index] == 0))
+    pool_get_aligned (rm->contexts[thread_index], ctx, CLIB_CACHE_LINE_BYTES);
 
-  pool_get_aligned (rm->contexts[cpu_number], ctx, CLIB_CACHE_LINE_BYTES);
-  ctx_id = ctx - rm->contexts[cpu_number];
+  pool_get_aligned (rm->contexts[thread_index], ctx, CLIB_CACHE_LINE_BYTES);
+  ctx_id = ctx - rm->contexts[thread_index];
 
   /* Save state from vlib buffer */
   ctx->saved_free_list_index = b0->free_list_index;
@@ -94,11 +94,11 @@ replication_recycle (vlib_main_t * vm, vlib_buffer_t * b0, u32 is_last)
 {
   replication_main_t *rm = &replication_main;
   replication_context_t *ctx;
-  uword cpu_number = vm->cpu_index;
+  uword thread_index = vm->thread_index;
   ip4_header_t *ip;
 
   /* Get access to the replication context */
-  ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->recycle_count);
+  ctx = pool_elt_at_index (rm->contexts[thread_index], b0->recycle_count);
 
   /* Restore vnet buffer state */
   clib_memcpy (vnet_buffer (b0), ctx->vnet_buffer,
@@ -133,7 +133,7 @@ replication_recycle (vlib_main_t * vm, vlib_buffer_t * b0, u32 is_last)
       b0->flags &= ~VLIB_BUFFER_RECYCLE;
 
       /* Free context back to its pool */
-      pool_put (rm->contexts[cpu_number], ctx);
+      pool_put (rm->contexts[thread_index], ctx);
     }
 
   return ctx;
@@ -160,7 +160,7 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl)
   replication_main_t *rm = &replication_main;
   replication_context_t *ctx;
   u32 feature_node_index = 0;
-  uword cpu_number = vm->cpu_index;
+  uword thread_index = vm->thread_index;
 
   /*
    * All buffers in the list are destined to the same recycle node.
@@ -172,7 +172,7 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl)
     {
       bi0 = fl->buffers[0];
       b0 = vlib_get_buffer (vm, bi0);
-      ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->recycle_count);
+      ctx = pool_elt_at_index (rm->contexts[thread_index], b0->recycle_count);
       feature_node_index = ctx->recycle_node_index;
     }
 
diff --git a/src/vnet/replication.h b/src/vnet/replication.h
index 5dc554c9..ce4b3ff1 100644
--- a/src/vnet/replication.h
+++ b/src/vnet/replication.h
@@ -100,7 +100,7 @@ replication_get_ctx (vlib_buffer_t * b0)
   replication_main_t *rm = &replication_main;
 
   return replication_is_recycled (b0) ?
-    pool_elt_at_index (rm->contexts[os_get_cpu_number ()],
+    pool_elt_at_index (rm->contexts[vlib_get_thread_index ()],
 		       b0->recycle_count) : 0;
 }
 
diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c
index b86e87d9..dd211c51 100644
--- a/src/vnet/session/node.c
+++ b/src/vnet/session/node.c
@@ -311,7 +311,7 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
   unix_shared_memory_queue_t *q;
   application_t *app;
   int n_tx_packets = 0;
-  u32 my_thread_index = vm->cpu_index;
+  u32 my_thread_index = vm->thread_index;
   int i, rv;
   f64 now = vlib_time_now (vm);
 
diff --git a/src/vnet/sr/sr_localsid.c b/src/vnet/sr/sr_localsid.c
index 2e3d56de..6d72a506 100755
--- a/src/vnet/sr/sr_localsid.c
+++ b/src/vnet/sr/sr_localsid.c
@@ -887,7 +887,7 @@ sr_localsid_d_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
   next_index = node->cached_next_index;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   while (n_left_from > 0)
     {
@@ -974,26 +974,26 @@ sr_localsid_d_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
 	  vlib_increment_combined_counter
 	    (((next0 ==
 	       SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) :
-	      &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1,
-	     vlib_buffer_length_in_chain (vm, b0));
+	      &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids,
+	     1, vlib_buffer_length_in_chain (vm, b0));
 
 	  vlib_increment_combined_counter
 	    (((next1 ==
 	       SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) :
-	      &(sm->sr_ls_valid_counters)), cpu_index, ls1 - sm->localsids, 1,
-	     vlib_buffer_length_in_chain (vm, b1));
+	      &(sm->sr_ls_valid_counters)), thread_index, ls1 - sm->localsids,
+	     1, vlib_buffer_length_in_chain (vm, b1));
 
 	  vlib_increment_combined_counter
 	    (((next2 ==
 	       SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) :
-	      &(sm->sr_ls_valid_counters)), cpu_index, ls2 - sm->localsids, 1,
-	     vlib_buffer_length_in_chain (vm, b2));
+	      &(sm->sr_ls_valid_counters)), thread_index, ls2 - sm->localsids,
+	     1, vlib_buffer_length_in_chain (vm, b2));
 
 	  vlib_increment_combined_counter
 	    (((next3 ==
 	       SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) :
-	      &(sm->sr_ls_valid_counters)), cpu_index, ls3 - sm->localsids, 1,
-	     vlib_buffer_length_in_chain (vm, b3));
+	      &(sm->sr_ls_valid_counters)), thread_index, ls3 - sm->localsids,
+	     1, vlib_buffer_length_in_chain (vm, b3));
 
 	  vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next,
 					   n_left_to_next, bi0, bi1, bi2, bi3,
@@ -1062,8 +1062,8 @@ sr_localsid_d_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
 	  vlib_increment_combined_counter
 	    (((next0 ==
 	       SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) :
-	      &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1,
-	     vlib_buffer_length_in_chain (vm, b0));
+	      &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids,
+	     1, vlib_buffer_length_in_chain (vm, b0));
 
 	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
 					   n_left_to_next, bi0, next0);
@@ -1103,7 +1103,7 @@ sr_localsid_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
   next_index = node->cached_next_index;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   while (n_left_from > 0)
     {
@@ -1205,26 +1205,26 @@ sr_localsid_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
 	  vlib_increment_combined_counter
 	    (((next0 ==
 	       SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) :
-	      &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1,
-	     vlib_buffer_length_in_chain (vm, b0));
+	      &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids,
+	     1, vlib_buffer_length_in_chain (vm, b0));
 
 	  vlib_increment_combined_counter
 	    (((next1 ==
 	       SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) :
-	      &(sm->sr_ls_valid_counters)), cpu_index, ls1 - sm->localsids, 1,
-	     vlib_buffer_length_in_chain (vm, b1));
+	      &(sm->sr_ls_valid_counters)), thread_index, ls1 - sm->localsids,
+	     1, vlib_buffer_length_in_chain (vm, b1));
 
 	  vlib_increment_combined_counter
 	    (((next2 ==
 	       SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) :
-	      &(sm->sr_ls_valid_counters)), cpu_index, ls2 - sm->localsids, 1,
-	     vlib_buffer_length_in_chain (vm, b2));
+	      &(sm->sr_ls_valid_counters)), thread_index, ls2 - sm->localsids,
+	     1, vlib_buffer_length_in_chain (vm, b2));
 
 	  vlib_increment_combined_counter
 	    (((next3 ==
 	       SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) :
-	      &(sm->sr_ls_valid_counters)), cpu_index, ls3 - sm->localsids, 1,
-	     vlib_buffer_length_in_chain (vm, b3));
+	      &(sm->sr_ls_valid_counters)), thread_index, ls3 - sm->localsids,
+	     1, vlib_buffer_length_in_chain (vm, b3));
 
 	  vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next,
 					   n_left_to_next, bi0, bi1, bi2, bi3,
@@ -1295,8 +1295,8 @@ sr_localsid_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
 	  vlib_increment_combined_counter
 	    (((next0 ==
 	       SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) :
-	      &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1,
-	     vlib_buffer_length_in_chain (vm, b0));
+	      &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids,
+	     1, vlib_buffer_length_in_chain (vm, b0));
 
 	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
 					   n_left_to_next, bi0, next0);
diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c
index e3705060..c1567aa0 100644
--- a/src/vnet/tcp/builtin_client.c
+++ b/src/vnet/tcp/builtin_client.c
@@ -174,7 +174,7 @@ tclient_thread_fn (void *arg)
     pthread_sigmask (SIG_SETMASK, &s, 0);
   }
 
-  clib_per_cpu_mheaps[os_get_cpu_number ()] = clib_per_cpu_mheaps[0];
+  clib_per_cpu_mheaps[vlib_get_thread_index ()] = clib_per_cpu_mheaps[0];
 
   while (1)
     {
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index b2a371e2..b6c34828 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -646,10 +646,10 @@ const static transport_proto_vft_t tcp6_proto = {
 void
 tcp_timer_keep_handler (u32 conn_index)
 {
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   tcp_connection_t *tc;
 
-  tc = tcp_connection_get (conn_index, cpu_index);
+  tc = tcp_connection_get (conn_index, thread_index);
   tc->timers[TCP_TIMER_KEEP] = TCP_TIMER_HANDLE_INVALID;
 
   tcp_connection_close (tc);
@@ -675,10 +675,10 @@ tcp_timer_establish_handler (u32 conn_index)
 void
 tcp_timer_waitclose_handler (u32 conn_index)
 {
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   tcp_connection_t *tc;
 
-  tc = tcp_connection_get (conn_index, cpu_index);
+  tc = tcp_connection_get (conn_index, thread_index);
   tc->timers[TCP_TIMER_WAITCLOSE] = TCP_TIMER_HANDLE_INVALID;
 
   /* Session didn't come back with a close(). Send FIN either way
diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
index 0090e15e..eaca672c 100644
--- a/src/vnet/tcp/tcp_debug.h
+++ b/src/vnet/tcp/tcp_debug.h
@@ -343,7 +343,7 @@ typedef enum _tcp_dbg_evt
     }                                                           	\
   else                                                          	\
     {                                                           	\
-      u32 _thread_index = os_get_cpu_number ();                 	\
+      u32 _thread_index = vlib_get_thread_index ();                 	\
       _tc = tcp_connection_get (_tc_index, _thread_index);      	\
     }                                                           	\
   ELOG_TYPE_DECLARE (_e) =                                      	\
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index a8224dc2..7e9fa47b 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -1142,7 +1142,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 			  vlib_frame_t * from_frame, int is_ip4)
 {
   u32 n_left_from, next_index, *from, *to_next;
-  u32 my_thread_index = vm->cpu_index, errors = 0;
+  u32 my_thread_index = vm->thread_index, errors = 0;
   tcp_main_t *tm = vnet_get_tcp_main ();
 
   from = vlib_frame_vector_args (from_frame);
@@ -1332,7 +1332,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 {
   tcp_main_t *tm = vnet_get_tcp_main ();
   u32 n_left_from, next_index, *from, *to_next;
-  u32 my_thread_index = vm->cpu_index, errors = 0;
+  u32 my_thread_index = vm->thread_index, errors = 0;
   u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP;
 
   from = vlib_frame_vector_args (from_frame);
@@ -1634,7 +1634,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 {
   tcp_main_t *tm = vnet_get_tcp_main ();
   u32 n_left_from, next_index, *from, *to_next;
-  u32 my_thread_index = vm->cpu_index, errors = 0;
+  u32 my_thread_index = vm->thread_index, errors = 0;
 
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
@@ -1989,7 +1989,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 		     vlib_frame_t * from_frame, int is_ip4)
 {
   u32 n_left_from, next_index, *from, *to_next;
-  u32 my_thread_index = vm->cpu_index;
+  u32 my_thread_index = vm->thread_index;
   tcp_main_t *tm = vnet_get_tcp_main ();
   u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP;
 
@@ -2243,7 +2243,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 		    vlib_frame_t * from_frame, int is_ip4)
 {
   u32 n_left_from, next_index, *from, *to_next;
-  u32 my_thread_index = vm->cpu_index;
+  u32 my_thread_index = vm->thread_index;
   tcp_main_t *tm = vnet_get_tcp_main ();
 
   from = vlib_frame_vector_args (from_frame);
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index ea157bd7..e18bfad7 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -387,8 +387,8 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts,
 #define tcp_get_free_buffer_index(tm, bidx)                             \
 do {                                                                    \
   u32 *my_tx_buffers, n_free_buffers;                                   \
-  u32 cpu_index = os_get_cpu_number();                             	\
-  my_tx_buffers = tm->tx_buffers[cpu_index];                            \
+  u32 thread_index = vlib_get_thread_index();                             	\
+  my_tx_buffers = tm->tx_buffers[thread_index];                            \
   if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0))                      \
     {                                                                   \
       n_free_buffers = 32;      /* TODO config or macro */              \
@@ -396,7 +396,7 @@ do {                                                                    \
       _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list (      \
           tm->vlib_main, my_tx_buffers, n_free_buffers,                 \
           VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);                         \
-      tm->tx_buffers[cpu_index] = my_tx_buffers;                        \
+      tm->tx_buffers[thread_index] = my_tx_buffers;                        \
     }                                                                   \
   /* buffer shortage */                                                 \
   if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0))                     \
@@ -408,8 +408,8 @@ do {                                                                    \
 #define tcp_return_buffer(tm)						\
 do {									\
   u32 *my_tx_buffers;							\
-  u32 cpu_index = os_get_cpu_number();                             	\
-  my_tx_buffers = tm->tx_buffers[cpu_index];                          	\
+  u32 thread_index = vlib_get_thread_index();                             	\
+  my_tx_buffers = tm->tx_buffers[thread_index];                          	\
   _vec_len (my_tx_buffers) +=1;						\
 } while (0)
 
@@ -942,7 +942,7 @@ tcp_send_ack (tcp_connection_t * tc)
 void
 tcp_timer_delack_handler (u32 index)
 {
-  u32 thread_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   tcp_connection_t *tc;
 
   tc = tcp_connection_get (index, thread_index);
@@ -1022,7 +1022,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
 {
   tcp_main_t *tm = vnet_get_tcp_main ();
   vlib_main_t *vm = vlib_get_main ();
-  u32 thread_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   tcp_connection_t *tc;
   vlib_buffer_t *b;
   u32 bi, snd_space, n_bytes;
@@ -1152,7 +1152,7 @@ tcp_timer_persist_handler (u32 index)
 {
   tcp_main_t *tm = vnet_get_tcp_main ();
   vlib_main_t *vm = vlib_get_main ();
-  u32 thread_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   tcp_connection_t *tc;
   vlib_buffer_t *b;
   u32 bi, n_bytes;
@@ -1313,7 +1313,7 @@ tcp46_output_inline (vlib_main_t * vm,
 		     vlib_frame_t * from_frame, int is_ip4)
 {
   u32 n_left_from, next_index, *from, *to_next;
-  u32 my_thread_index = vm->cpu_index;
+  u32 my_thread_index = vm->thread_index;
 
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
@@ -1524,7 +1524,7 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 			 vlib_frame_t * from_frame, u8 is_ip4)
 {
   u32 n_left_from, next_index, *from, *to_next;
-  u32 my_thread_index = vm->cpu_index;
+  u32 my_thread_index = vm->thread_index;
 
   from = vlib_frame_vector_args (from_frame);
   n_left_from = from_frame->n_vectors;
diff --git a/src/vnet/udp/udp_input.c b/src/vnet/udp/udp_input.c
index 4b22109b..810278e6 100644
--- a/src/vnet/udp/udp_input.c
+++ b/src/vnet/udp/udp_input.c
@@ -70,7 +70,7 @@ udp4_uri_input_node_fn (vlib_main_t * vm,
   udp4_uri_input_next_t next_index;
   udp_uri_main_t *um = vnet_get_udp_main ();
   session_manager_main_t *smm = vnet_get_session_manager_main ();
-  u32 my_thread_index = vm->cpu_index;
+  u32 my_thread_index = vm->thread_index;
   u8 my_enqueue_epoch;
   u32 *session_indices_to_enqueue;
   static u32 serial_number;
diff --git a/src/vnet/unix/tapcli.c b/src/vnet/unix/tapcli.c
index fb1a8bac..0fc62f6c 100644
--- a/src/vnet/unix/tapcli.c
+++ b/src/vnet/unix/tapcli.c
@@ -366,7 +366,7 @@ static uword tapcli_rx_iface(vlib_main_t * vm,
       vlib_increment_combined_counter (
           vnet_main.interface_main.combined_sw_if_counters
           + VNET_INTERFACE_COUNTER_RX,
-          os_get_cpu_number(), ti->sw_if_index,
+          vlib_get_thread_index(), ti->sw_if_index,
           1, n_bytes_in_packet);
 
       if (PREDICT_FALSE(n_trace > 0)) {
diff --git a/src/vnet/unix/tuntap.c b/src/vnet/unix/tuntap.c
index 2cfcc92f..ac674653 100644
--- a/src/vnet/unix/tuntap.c
+++ b/src/vnet/unix/tuntap.c
@@ -189,7 +189,7 @@ tuntap_tx (vlib_main_t * vm,
   /* Update tuntap interface output stats. */
   vlib_increment_combined_counter (im->combined_sw_if_counters
 				   + VNET_INTERFACE_COUNTER_TX,
-				   vm->cpu_index,
+				   vm->thread_index,
 				   tm->sw_if_index, n_packets, n_bytes);
 
 
@@ -297,7 +297,7 @@ tuntap_rx (vlib_main_t * vm,
     vlib_increment_combined_counter
         (vnet_main.interface_main.combined_sw_if_counters
          + VNET_INTERFACE_COUNTER_RX,
-         os_get_cpu_number(),
+         vlib_get_thread_index(),
          tm->sw_if_index,
          1, n_bytes_in_packet);
 
diff --git a/src/vnet/vxlan-gpe/decap.c b/src/vnet/vxlan-gpe/decap.c
index 22ab4b62..d4fe4231 100644
--- a/src/vnet/vxlan-gpe/decap.c
+++ b/src/vnet/vxlan-gpe/decap.c
@@ -115,7 +115,7 @@ vxlan_gpe_input (vlib_main_t * vm,
   vxlan4_gpe_tunnel_key_t last_key4;
   vxlan6_gpe_tunnel_key_t last_key6;
   u32 pkts_decapsulated = 0;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   u32 stats_sw_if_index, stats_n_packets, stats_n_bytes;
 
   if (is_ip4)
@@ -342,7 +342,7 @@ vxlan_gpe_input (vlib_main_t * vm,
         if (stats_n_packets)
           vlib_increment_combined_counter (
               im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX,
-              cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
+              thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
         stats_n_packets = 1;
         stats_n_bytes = len0;
         stats_sw_if_index = sw_if_index0;
@@ -427,7 +427,7 @@ vxlan_gpe_input (vlib_main_t * vm,
         if (stats_n_packets)
           vlib_increment_combined_counter (
               im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX,
-              cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
+              thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
         stats_n_packets = 1;
         stats_n_bytes = len1;
         stats_sw_if_index = sw_if_index1;
@@ -588,7 +588,7 @@ vxlan_gpe_input (vlib_main_t * vm,
         if (stats_n_packets)
           vlib_increment_combined_counter (
               im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX,
-              cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
+              thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
         stats_n_packets = 1;
         stats_n_bytes = len0;
         stats_sw_if_index = sw_if_index0;
@@ -615,7 +615,7 @@ vxlan_gpe_input (vlib_main_t * vm,
   if (stats_n_packets)
   {
     vlib_increment_combined_counter (
-        im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, cpu_index,
+        im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, thread_index,
         stats_sw_if_index, stats_n_packets, stats_n_bytes);
     node->runtime_data[0] = stats_sw_if_index;
   }
diff --git a/src/vnet/vxlan-gpe/encap.c b/src/vnet/vxlan-gpe/encap.c
index 3a486e56..67ed94b4 100644
--- a/src/vnet/vxlan-gpe/encap.c
+++ b/src/vnet/vxlan-gpe/encap.c
@@ -151,7 +151,7 @@ vxlan_gpe_encap (vlib_main_t * vm,
   vnet_main_t * vnm = ngm->vnet_main;
   vnet_interface_main_t * im = &vnm->interface_main;
   u32 pkts_encapsulated = 0;
-  u32 cpu_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
   u32 stats_sw_if_index, stats_n_packets, stats_n_bytes;
 
   from = vlib_frame_vector_args (from_frame);
@@ -253,7 +253,7 @@ vxlan_gpe_encap (vlib_main_t * vm,
           if (stats_n_packets)
             vlib_increment_combined_counter (
                 im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
-                cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
+                thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
           stats_sw_if_index = sw_if_index0;
           stats_n_packets = 2;
           stats_n_bytes = len0 + len1;
@@ -262,10 +262,10 @@ vxlan_gpe_encap (vlib_main_t * vm,
         {
           vlib_increment_combined_counter (
               im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
-              cpu_index, sw_if_index0, 1, len0);
+              thread_index, sw_if_index0, 1, len0);
           vlib_increment_combined_counter (
               im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
-              cpu_index, sw_if_index1, 1, len1);
+              thread_index, sw_if_index1, 1, len1);
         }
       }
 
@@ -335,7 +335,7 @@ vxlan_gpe_encap (vlib_main_t * vm,
         if (stats_n_packets)
           vlib_increment_combined_counter (
               im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
-              cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
+              thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
         stats_n_packets = 1;
         stats_n_bytes = len0;
         stats_sw_if_index = sw_if_index0;
@@ -359,7 +359,7 @@ vxlan_gpe_encap (vlib_main_t * vm,
   if (stats_n_packets)
   {
     vlib_increment_combined_counter (
-        im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, cpu_index,
+        im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, thread_index,
         stats_sw_if_index, stats_n_packets, stats_n_bytes);
     node->runtime_data[0] = stats_sw_if_index;
   }
diff --git a/src/vnet/vxlan/decap.c b/src/vnet/vxlan/decap.c
index 514b2c99..2acb1f6f 100644
--- a/src/vnet/vxlan/decap.c
+++ b/src/vnet/vxlan/decap.c
@@ -81,7 +81,7 @@ vxlan_input (vlib_main_t * vm,
   vxlan4_tunnel_key_t last_key4;
   vxlan6_tunnel_key_t last_key6;
   u32 pkts_decapsulated = 0;
-  u32 cpu_index = os_get_cpu_number();
+  u32 thread_index = vlib_get_thread_index();
   u32 stats_sw_if_index, stats_n_packets, stats_n_bytes;
 
   if (is_ip4)
@@ -314,7 +314,7 @@ vxlan_input (vlib_main_t * vm,
 	      if (stats_n_packets)
 		vlib_increment_combined_counter 
 		  (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX,
-		   cpu_index, stats_sw_if_index, 
+		   thread_index, stats_sw_if_index, 
 		   stats_n_packets, stats_n_bytes);
 	      stats_n_packets = 1;
 	      stats_n_bytes = len0;
@@ -468,7 +468,7 @@ vxlan_input (vlib_main_t * vm,
 	      if (stats_n_packets)
 		vlib_increment_combined_counter 
 		  (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX,
-		   cpu_index, stats_sw_if_index, 
+		   thread_index, stats_sw_if_index, 
 		   stats_n_packets, stats_n_bytes);
 	      stats_n_packets = 1;
 	      stats_n_bytes = len1;
@@ -674,7 +674,7 @@ vxlan_input (vlib_main_t * vm,
 	      if (stats_n_packets)
 		vlib_increment_combined_counter 
 		  (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX,
-		   cpu_index, stats_sw_if_index, 
+		   thread_index, stats_sw_if_index, 
 		   stats_n_packets, stats_n_bytes);
 	      stats_n_packets = 1;
 	      stats_n_bytes = len0;
@@ -711,7 +711,7 @@ vxlan_input (vlib_main_t * vm,
     {
       vlib_increment_combined_counter 
 	(im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX,
-	 cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
+	 thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
       node->runtime_data[0] = stats_sw_if_index;
     }
 
diff --git a/src/vnet/vxlan/encap.c b/src/vnet/vxlan/encap.c
index 5b63064a..4cfbbc23 100644
--- a/src/vnet/vxlan/encap.c
+++ b/src/vnet/vxlan/encap.c
@@ -77,7 +77,7 @@ vxlan_encap_inline (vlib_main_t * vm,
   vnet_interface_main_t * im = &vnm->interface_main;
   u32 pkts_encapsulated = 0;
   u16 old_l0 = 0, old_l1 = 0;
-  u32 cpu_index = os_get_cpu_number();
+  u32 thread_index = vlib_get_thread_index();
   u32 stats_sw_if_index, stats_n_packets, stats_n_bytes;
   u32 sw_if_index0 = 0, sw_if_index1 = 0;
   u32 next0 = 0, next1 = 0;
@@ -301,7 +301,7 @@ vxlan_encap_inline (vlib_main_t * vm,
 		  if (stats_n_packets) 
 		    vlib_increment_combined_counter 
 		      (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
-		       cpu_index, stats_sw_if_index, 
+		       thread_index, stats_sw_if_index, 
 		       stats_n_packets, stats_n_bytes);
 		  stats_sw_if_index = sw_if_index0;
 		  stats_n_packets = 2;
@@ -311,10 +311,10 @@ vxlan_encap_inline (vlib_main_t * vm,
 	        {
 		  vlib_increment_combined_counter 
 		      (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
-		       cpu_index, sw_if_index0, 1, len0);
+		       thread_index, sw_if_index0, 1, len0);
 		  vlib_increment_combined_counter 
 		      (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
-		       cpu_index, sw_if_index1, 1, len1);
+		       thread_index, sw_if_index1, 1, len1);
 		}
 	    }
 
@@ -464,7 +464,7 @@ vxlan_encap_inline (vlib_main_t * vm,
 	      if (stats_n_packets)
 		vlib_increment_combined_counter 
 		  (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
-		   cpu_index, stats_sw_if_index, 
+		   thread_index, stats_sw_if_index, 
 		   stats_n_packets, stats_n_bytes);
 	      stats_n_packets = 1;
 	      stats_n_bytes = len0;
@@ -496,7 +496,7 @@ vxlan_encap_inline (vlib_main_t * vm,
     {
       vlib_increment_combined_counter 
 	(im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX,
-	 cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
+	 thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes);
       node->runtime_data[0] = stats_sw_if_index;
     }
 
diff --git a/src/vpp/stats/stats.c b/src/vpp/stats/stats.c
index 042d02e2..4309cd51 100644
--- a/src/vpp/stats/stats.c
+++ b/src/vpp/stats/stats.c
@@ -66,14 +66,14 @@ _(VNET_IP6_NBR_COUNTERS, vnet_ip6_nbr_counters)
 void
 dslock (stats_main_t * sm, int release_hint, int tag)
 {
-  u32 thread_id;
+  u32 thread_index;
   data_structure_lock_t *l = sm->data_structure_lock;
 
   if (PREDICT_FALSE (l == 0))
     return;
 
-  thread_id = os_get_cpu_number ();
-  if (l->lock && l->thread_id == thread_id)
+  thread_index = vlib_get_thread_index ();
+  if (l->lock && l->thread_index == thread_index)
     {
       l->count++;
       return;
@@ -85,7 +85,7 @@ dslock (stats_main_t * sm, int release_hint, int tag)
   while (__sync_lock_test_and_set (&l->lock, 1))
     /* zzzz */ ;
   l->tag = tag;
-  l->thread_id = thread_id;
+  l->thread_index = thread_index;
   l->count = 1;
 }
 
@@ -99,14 +99,14 @@ stats_dslock_with_hint (int hint, int tag)
 void
 dsunlock (stats_main_t * sm)
 {
-  u32 thread_id;
+  u32 thread_index;
   data_structure_lock_t *l = sm->data_structure_lock;
 
   if (PREDICT_FALSE (l == 0))
     return;
 
-  thread_id = os_get_cpu_number ();
-  ASSERT (l->lock && l->thread_id == thread_id);
+  thread_index = vlib_get_thread_index ();
+  ASSERT (l->lock && l->thread_index == thread_index);
   l->count--;
   if (l->count == 0)
     {
diff --git a/src/vpp/stats/stats.h b/src/vpp/stats/stats.h
index 118115be..024dc78e 100644
--- a/src/vpp/stats/stats.h
+++ b/src/vpp/stats/stats.h
@@ -30,7 +30,7 @@ typedef struct
 {
   volatile u32 lock;
   volatile u32 release_hint;
-  u32 thread_id;
+  u32 thread_index;
   u32 count;
   int tag;
 } data_structure_lock_t;
-- 
cgit 1.2.3-korg


From 7bee80c823ca77de3aca803fdede77e4c7385a52 Mon Sep 17 00:00:00 2001
From: Damjan Marion <damarion@cisco.com>
Date: Wed, 26 Apr 2017 15:32:12 +0200
Subject: Fix remaining 32-bit compile issues

Change-Id: I9664214652229b663c3e3ba7406b4ede96bfb123
Signed-off-by: Damjan Marion <damarion@cisco.com>
---
 Makefile                                 |  8 ++++----
 build-data/platforms/vpp.mk              |  5 +++++
 src/plugins/dpdk/buffer.c                |  6 +++---
 src/plugins/ixge/ixge.c                  |  5 +++--
 src/svm/svm.c                            | 11 ++++++-----
 src/svm/svmtool.c                        |  4 ++--
 src/tools/vppapigen/gram.y               |  6 +++---
 src/tools/vppapigen/node.c               |  4 ++--
 src/uri/uri_tcp_test.c                   | 21 ++++++++++++---------
 src/uri/uri_udp_test.c                   | 15 +++++++++------
 src/vat/api_format.c                     | 10 +++++-----
 src/vlib/threads.c                       |  2 +-
 src/vlibmemory/memory_client.c           |  2 +-
 src/vlibmemory/memory_vlib.c             |  3 ++-
 src/vnet/devices/virtio/vhost-user.c     |  2 +-
 src/vnet/session/application_interface.c |  2 +-
 src/vnet/session/session_api.c           | 14 +++++++-------
 src/vnet/tcp/builtin_client.c            | 19 +++++++++++--------
 src/vnet/tcp/builtin_server.c            |  2 +-
 src/vppinfra/mheap.c                     |  2 +-
 20 files changed, 80 insertions(+), 63 deletions(-)

(limited to 'src/vlib/threads.c')

diff --git a/Makefile b/Makefile
index 8240e789..b344f377 100644
--- a/Makefile
+++ b/Makefile
@@ -263,9 +263,9 @@ define test
 	  TEST_DIR=$(WS_ROOT)/test \
 	  VPP_TEST_BUILD_DIR=$(BR)/build-$(2)-native \
 	  VPP_TEST_BIN=$(BR)/install-$(2)-native/vpp/bin/vpp \
-	  VPP_TEST_PLUGIN_PATH=$(BR)/install-$(2)-native/vpp/lib64/vpp_plugins \
+	  VPP_TEST_PLUGIN_PATH=$(wildcard $(BR)/install-$(2)-native/vpp/lib*/vpp_plugins) \
 	  VPP_TEST_INSTALL_PATH=$(BR)/install-$(2)-native/ \
-	  LD_LIBRARY_PATH=$(BR)/install-$(2)-native/vpp/lib64/ \
+	  LD_LIBRARY_PATH=$(subst $(subst ,, ),:,$(wildcard $(BR)/install-$(2)-native/vpp/lib*/)) \
 	  EXTENDED_TESTS=$(EXTENDED_TESTS) \
 	  PYTHON=$(PYTHON) \
 	  $(3)
@@ -325,12 +325,12 @@ define run
 	@echo "WARNING: STARTUP_CONF not defined or file doesn't exist."
 	@echo "         Running with minimal startup config: $(MINIMAL_STARTUP_CONF)\n"
 	@cd $(STARTUP_DIR) && \
-	  sudo $(2) $(1)/vpp/bin/vpp $(MINIMAL_STARTUP_CONF) plugin_path $(1)/vpp/lib64/vpp_plugins
+	  sudo $(2) $(1)/vpp/bin/vpp $(MINIMAL_STARTUP_CONF) plugin_path $(wildcard $(1)/vpp/lib*/vpp_plugins)
 endef
 else
 define run
 	@cd $(STARTUP_DIR) && \
-	  sudo $(2) $(1)/vpp/bin/vpp $(shell cat $(STARTUP_CONF) | sed -e 's/#.*//') plugin_path $(1)/vpp/lib64/vpp_plugins
+	  sudo $(2) $(1)/vpp/bin/vpp $(shell cat $(STARTUP_CONF) | sed -e 's/#.*//') plugin_path $(wildcard $(1)/vpp/lib*/vpp_plugins)
 endef
 endif
 
diff --git a/build-data/platforms/vpp.mk b/build-data/platforms/vpp.mk
index 5aafdd76..4577fa2e 100644
--- a/build-data/platforms/vpp.mk
+++ b/build-data/platforms/vpp.mk
@@ -46,6 +46,11 @@ vpp_root_packages = vpp gmod
 # vpp_dpdk_lib_dir = /usr/lib
 # vpp_dpdk_shared_lib = yes
 
+# load balancer plugin is not portable on 32 bit platform
+ifeq ($(MACHINE),i686)
+vpp_configure_args_vpp = --disable-lb-plugin
+endif
+
 vpp_debug_TAG_CFLAGS = -g -O0 -DCLIB_DEBUG -DFORTIFY_SOURCE=2 -march=$(MARCH) \
 	-fstack-protector-all -fPIC -Werror
 vpp_debug_TAG_LDFLAGS = -g -O0 -DCLIB_DEBUG -DFORTIFY_SOURCE=2 -march=$(MARCH) \
diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c
index c80b3fa8..2d4762ab 100644
--- a/src/plugins/dpdk/buffer.c
+++ b/src/plugins/dpdk/buffer.c
@@ -455,8 +455,8 @@ vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs,
 	uword save_vpm_start, save_vpm_end, save_vpm_size;
 	struct rte_mempool_memhdr *memhdr;
 
-	this_pool_start = ~0ULL;
-	this_pool_end = 0LL;
+	this_pool_start = ~0;
+	this_pool_end = 0;
 
 	STAILQ_FOREACH (memhdr, &rmp->mem_list, next)
 	{
@@ -465,7 +465,7 @@ vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs,
 	  if (((uword) memhdr->addr) < this_pool_start)
 	    this_pool_start = (uword) (memhdr->addr);
 	}
-	ASSERT (this_pool_start < ~0ULL && this_pool_end > 0);
+	ASSERT (this_pool_start < ~0 && this_pool_end > 0);
 	this_pool_size = this_pool_end - this_pool_start;
 
 	if (CLIB_DEBUG > 1)
diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c
index 08f5b692..0d287250 100644
--- a/src/plugins/ixge/ixge.c
+++ b/src/plugins/ixge/ixge.c
@@ -20,7 +20,7 @@
  *   Please use supported DPDK driver instead.
  */
 
-#if __x86_64__
+#if __x86_64__ || __i386__
 #include <vppinfra/vector.h>
 
 #ifndef CLIB_HAVE_VEC128
@@ -2929,7 +2929,6 @@ ixge_set_next_node (ixge_rx_next_t next, char *name)
       break;
     }
 }
-#endif
 
 /* *INDENT-OFF* */
 VLIB_PLUGIN_REGISTER () = {
@@ -2937,8 +2936,10 @@ VLIB_PLUGIN_REGISTER () = {
     .default_disabled = 1,
     .description = "Intel 82599 Family Native Driver (experimental)",
 };
+#endif
 
 /* *INDENT-ON* */
+
 /*
  * fd.io coding-style-patch-verification: ON
  *
diff --git a/src/svm/svm.c b/src/svm/svm.c
index 97add5a7..c96135cf 100644
--- a/src/svm/svm.c
+++ b/src/svm/svm.c
@@ -491,7 +491,7 @@ svm_map_region (svm_map_region_args_t * a)
 	  return (0);
 	}
 
-      rp = mmap ((void *) a->baseva, a->size,
+      rp = mmap (uword_to_pointer (a->baseva, void *), a->size,
 		 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, svm_fd, 0);
 
       if (rp == (svm_region_t *) MAP_FAILED)
@@ -533,9 +533,10 @@ svm_map_region (svm_map_region_args_t * a)
       rp->virtual_size = a->size;
 
       rp->region_heap =
-	mheap_alloc_with_flags ((void *) (a->baseva + MMAP_PAGESIZE),
-				(a->pvt_heap_size != 0) ?
-				a->pvt_heap_size : SVM_PVT_MHEAP_SIZE,
+	mheap_alloc_with_flags (uword_to_pointer
+				(a->baseva + MMAP_PAGESIZE, void *),
+				(a->pvt_heap_size !=
+				 0) ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE,
 				MHEAP_FLAG_DISABLE_VM);
       oldheap = svm_push_pvt_heap (rp);
 
@@ -661,7 +662,7 @@ svm_map_region (svm_map_region_args_t * a)
       a->size = rp->virtual_size;
       munmap (rp, MMAP_PAGESIZE);
 
-      rp = (void *) mmap ((void *) a->baseva, a->size,
+      rp = (void *) mmap (uword_to_pointer (a->baseva, void *), a->size,
 			  PROT_READ | PROT_WRITE,
 			  MAP_SHARED | MAP_FIXED, svm_fd, 0);
       if ((uword) rp == (uword) MAP_FAILED)
diff --git a/src/svm/svmtool.c b/src/svm/svmtool.c
index b3195514..01ae4221 100644
--- a/src/svm/svmtool.c
+++ b/src/svm/svmtool.c
@@ -172,7 +172,7 @@ svm_map_region_nolock (svm_map_region_args_t * a)
   a->size = rp->virtual_size;
   munmap (rp, MMAP_PAGESIZE);
 
-  rp = (void *) mmap ((void *) a->baseva, a->size,
+  rp = (void *) mmap (uword_to_pointer (a->baseva, void *), a->size,
 		      PROT_READ | PROT_WRITE,
 		      MAP_SHARED | MAP_FIXED, svm_fd, 0);
   if ((uword) rp == (uword) MAP_FAILED)
@@ -401,7 +401,7 @@ repair (char *chroot_path, int crash_root_region)
   a->size = root_rp->virtual_size;
   munmap (root_rp, MMAP_PAGESIZE);
 
-  root_rp = (void *) mmap ((void *) a->baseva, a->size,
+  root_rp = (void *) mmap (uword_to_pointer (a->baseva, void *), a->size,
 			   PROT_READ | PROT_WRITE,
 			   MAP_SHARED | MAP_FIXED, svm_fd, 0);
   if ((uword) root_rp == (uword) MAP_FAILED)
diff --git a/src/tools/vppapigen/gram.y b/src/tools/vppapigen/gram.y
index 9cea6023..52bb65c5 100644
--- a/src/tools/vppapigen/gram.y
+++ b/src/tools/vppapigen/gram.y
@@ -53,9 +53,9 @@ stmt:     flist defn            {$$ = set_flags($1, $2);}
         | defn                  {$$ = $1;}
           ;
 
-flist:    flist flag            {$$ = (YYSTYPE)(unsigned long long)
-                                     ((unsigned long long) $1 
-                                    | (unsigned long long) $2);}
+flist:    flist flag            {$$ = (YYSTYPE)(unsigned long)
+                                     ((unsigned long) $1 
+                                    | (unsigned long) $2);}
         | flag                  {$$ = $1;}
           ;
 
diff --git a/src/tools/vppapigen/node.c b/src/tools/vppapigen/node.c
index 9f234037..15868ee5 100644
--- a/src/tools/vppapigen/node.c
+++ b/src/tools/vppapigen/node.c
@@ -397,7 +397,7 @@ void node_define_generate (node_t *this, enum passid which, FILE *fp)
 	    fprintf(fp, ",\n");
         }
 	indent_me(fp);
-	fprintf (fp, "{\"crc\" : \"0x%08x\"}\n", (u32)(u64)CDATA3);
+	fprintf (fp, "{\"crc\" : \"0x%08x\"}\n", (u32)(uword)CDATA3);
         indent -= 4;
 	indent_me(fp);
         fprintf(fp, "]");
@@ -1219,7 +1219,7 @@ void generate_msg_name_crc_list (YYSTYPE a1, FILE *fp)
             if (!(np->flags & NODE_FLAG_TYPEONLY)) {
                 fprintf (fp, "\\\n_(VL_API_%s, %s, %08x) ",
                          uppercase (np->data[0]), (i8 *) np->data[0],
-                         (u32)(u64)np->data[3]);
+                         (u32)(uword)np->data[3]);
             }
         }
         np = np->peer;
diff --git a/src/uri/uri_tcp_test.c b/src/uri/uri_tcp_test.c
index b15fd6ce..22f246e5 100755
--- a/src/uri/uri_tcp_test.c
+++ b/src/uri/uri_tcp_test.c
@@ -262,7 +262,8 @@ vl_api_application_attach_reply_t_handler (vl_api_application_attach_reply_t *
     }
 
   utm->our_event_queue =
-    (unix_shared_memory_queue_t *) mp->app_event_queue_address;
+    uword_to_pointer (mp->app_event_queue_address,
+		      unix_shared_memory_queue_t *);
   utm->state = STATE_ATTACHED;
 }
 
@@ -524,8 +525,9 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp)
       return;
     }
 
-  utm->vpp_event_queue = (unix_shared_memory_queue_t *)
-    mp->vpp_event_queue_address;
+  utm->vpp_event_queue =
+    uword_to_pointer (mp->vpp_event_queue_address,
+		      unix_shared_memory_queue_t *);
 
   /*
    * Setup session
@@ -534,9 +536,9 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp)
   pool_get (utm->sessions, session);
   session_index = session - utm->sessions;
 
-  rx_fifo = (svm_fifo_t *) mp->server_rx_fifo;
+  rx_fifo = uword_to_pointer (mp->server_rx_fifo, svm_fifo_t *);
   rx_fifo->client_session_index = session_index;
-  tx_fifo = (svm_fifo_t *) mp->server_tx_fifo;
+  tx_fifo = uword_to_pointer (mp->server_tx_fifo, svm_fifo_t *);
   tx_fifo->client_session_index = session_index;
 
   session->server_rx_fifo = rx_fifo;
@@ -858,16 +860,17 @@ vl_api_accept_session_t_handler (vl_api_accept_session_t * mp)
   ip_str = format (0, "%U", format_ip46_address, &mp->ip, mp->is_ip4);
   clib_warning ("Accepted session from: %s:%d", ip_str,
 		clib_net_to_host_u16 (mp->port));
-  utm->vpp_event_queue = (unix_shared_memory_queue_t *)
-    mp->vpp_event_queue_address;
+  utm->vpp_event_queue =
+    uword_to_pointer (mp->vpp_event_queue_address,
+		      unix_shared_memory_queue_t *);
 
   /* Allocate local session and set it up */
   pool_get (utm->sessions, session);
   session_index = session - utm->sessions;
 
-  rx_fifo = (svm_fifo_t *) mp->server_rx_fifo;
+  rx_fifo = uword_to_pointer (mp->server_rx_fifo, svm_fifo_t *);
   rx_fifo->client_session_index = session_index;
-  tx_fifo = (svm_fifo_t *) mp->server_tx_fifo;
+  tx_fifo = uword_to_pointer (mp->server_tx_fifo, svm_fifo_t *);
   tx_fifo->client_session_index = session_index;
 
   session->server_rx_fifo = rx_fifo;
diff --git a/src/uri/uri_udp_test.c b/src/uri/uri_udp_test.c
index 266215c8..8fb12ed2 100644
--- a/src/uri/uri_udp_test.c
+++ b/src/uri/uri_udp_test.c
@@ -232,7 +232,8 @@ vl_api_application_attach_reply_t_handler (vl_api_application_attach_reply_t *
     }
 
   utm->our_event_queue =
-    (unix_shared_memory_queue_t *) mp->app_event_queue_address;
+    uword_to_pointer (mp->app_event_queue_address,
+		      unix_shared_memory_queue_t *);
 }
 
 static void
@@ -581,7 +582,8 @@ send_reply:
 
   vec_free (a->segment_name);
 
-  client_q = (unix_shared_memory_queue_t *) mp->client_queue_address;
+  client_q =
+    uword_to_pointer (mp->client_queue_address, unix_shared_memory_queue_t *);
   vl_msg_api_send_shmem (client_q, (u8 *) & rmp);
 }
 
@@ -608,14 +610,15 @@ vl_api_accept_session_t_handler (vl_api_accept_session_t * mp)
   if (start_time == 0.0)
     start_time = clib_time_now (&utm->clib_time);
 
-  utm->vpp_event_queue = (unix_shared_memory_queue_t *)
-    mp->vpp_event_queue_address;
+  utm->vpp_event_queue =
+    uword_to_pointer (mp->vpp_event_queue_address,
+		      unix_shared_memory_queue_t *);
 
   pool_get (utm->sessions, session);
 
-  rx_fifo = (svm_fifo_t *) mp->server_rx_fifo;
+  rx_fifo = uword_to_pointer (mp->server_rx_fifo, svm_fifo_t *);
   rx_fifo->client_session_index = session - utm->sessions;
-  tx_fifo = (svm_fifo_t *) mp->server_tx_fifo;
+  tx_fifo = uword_to_pointer (mp->server_tx_fifo, svm_fifo_t *);
   tx_fifo->client_session_index = session - utm->sessions;
 
   session->server_rx_fifo = rx_fifo;
diff --git a/src/vat/api_format.c b/src/vat/api_format.c
index 28b227b4..495b660e 100644
--- a/src/vat/api_format.c
+++ b/src/vat/api_format.c
@@ -1037,7 +1037,7 @@ vl_api_cli_reply_t_handler (vl_api_cli_reply_t * mp)
   i32 retval = ntohl (mp->retval);
 
   vam->retval = retval;
-  vam->shmem_result = (u8 *) mp->reply_in_shmem;
+  vam->shmem_result = uword_to_pointer (mp->reply_in_shmem, u8 *);
   vam->result_ready = 1;
 }
 
@@ -1058,7 +1058,7 @@ vl_api_cli_reply_t_handler_json (vl_api_cli_reply_t * mp)
   pthread_mutex_lock (&am->vlib_rp->mutex);
   oldheap = svm_push_data_heap (am->vlib_rp);
 
-  reply = (u8 *) (mp->reply_in_shmem);
+  reply = uword_to_pointer (mp->reply_in_shmem, u8 *);
   vec_free (reply);
 
   svm_pop_heap (oldheap);
@@ -2405,7 +2405,7 @@ static void vl_api_get_node_graph_reply_t_handler
   if (retval != 0)
     return;
 
-  reply = (u8 *) (mp->reply_in_shmem);
+  reply = uword_to_pointer (mp->reply_in_shmem, u8 *);
   pvt_copy = vec_dup (reply);
 
   /* Toss the shared-memory original... */
@@ -2456,7 +2456,7 @@ static void vl_api_get_node_graph_reply_t_handler_json
   vat_json_object_add_int (&node, "retval", ntohl (mp->retval));
   vat_json_object_add_uint (&node, "reply_in_shmem", mp->reply_in_shmem);
 
-  reply = (u8 *) (mp->reply_in_shmem);
+  reply = uword_to_pointer (mp->reply_in_shmem, u8 *);
 
   /* Toss the shared-memory original... */
   pthread_mutex_lock (&am->vlib_rp->mutex);
@@ -4959,7 +4959,7 @@ exec (vat_main_t * vam)
   svm_pop_heap (oldheap);
   pthread_mutex_unlock (&am->vlib_rp->mutex);
 
-  mp->cmd_in_shmem = (u64) cmd;
+  mp->cmd_in_shmem = pointer_to_uword (cmd);
   S (mp);
   timeout = vat_time_now (vam) + 10.0;
 
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index 4a111f8d..9ccfd3a2 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -1125,7 +1125,7 @@ cpu_config (vlib_main_t * vm, unformat_input_t * input)
 
 VLIB_EARLY_CONFIG_FUNCTION (cpu_config, "cpu");
 
-#if !defined (__x86_64__) && !defined (__aarch64__) && !defined (__powerpc64__) && !defined(__arm__)
+#if !defined (__x86_64__) && !defined (__i386__) && !defined (__aarch64__) && !defined (__powerpc64__) && !defined(__arm__)
 void
 __sync_fetch_and_add_8 (void)
 {
diff --git a/src/vlibmemory/memory_client.c b/src/vlibmemory/memory_client.c
index d48a4fa1..a162d6bb 100644
--- a/src/vlibmemory/memory_client.c
+++ b/src/vlibmemory/memory_client.c
@@ -137,7 +137,7 @@ vl_api_memclnt_create_reply_t_handler (vl_api_memclnt_create_reply_t * mp)
   am->msg_index_by_name_and_crc = hash_create_string (0, sizeof (uword));
 
   /* Recreate the vnet-side API message handler table */
-  tblv = (u8 *) mp->message_table;
+  tblv = uword_to_pointer (mp->message_table, u8 *);
   serialize_open_vector (sm, tblv);
   unserialize_integer (sm, &nmsgs, sizeof (u32));
 
diff --git a/src/vlibmemory/memory_vlib.c b/src/vlibmemory/memory_vlib.c
index 29a5c2c2..acba8b3f 100644
--- a/src/vlibmemory/memory_vlib.c
+++ b/src/vlibmemory/memory_vlib.c
@@ -216,7 +216,8 @@ vl_api_memclnt_create_t_handler (vl_api_memclnt_create_t * mp)
      am->shmem_hdr->application_restarts);
   rp->context = mp->context;
   rp->response = ntohl (rv);
-  rp->message_table = (u64) am->serialized_message_table_in_shmem;
+  rp->message_table =
+    pointer_to_uword (am->serialized_message_table_in_shmem);
 
   vl_msg_api_send_shmem (q, (u8 *) & rp);
 }
diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c
index acc7bf82..6ccc0d87 100644
--- a/src/vnet/devices/virtio/vhost-user.c
+++ b/src/vnet/devices/virtio/vhost-user.c
@@ -719,7 +719,7 @@ vhost_user_log_dirty_pages_2 (vhost_user_intf_t * vui,
     }
   if (is_host_address)
     {
-      addr = (u64) map_user_mem (vui, (uword) addr);
+      addr = pointer_to_uword (map_user_mem (vui, (uword) addr));
     }
   if (PREDICT_FALSE ((addr + len - 1) / VHOST_LOG_PAGE / 8 >= vui->log_size))
     {
diff --git a/src/vnet/session/application_interface.c b/src/vnet/session/application_interface.c
index ad44baa1..f74b0cfe 100644
--- a/src/vnet/session/application_interface.c
+++ b/src/vnet/session/application_interface.c
@@ -247,7 +247,7 @@ vnet_application_attach (vnet_app_attach_args_t * a)
 			      a->session_cb_vft)))
     return rv;
 
-  a->app_event_queue_address = (u64) app->event_queue;
+  a->app_event_queue_address = pointer_to_uword (app->event_queue);
   sm = segment_manager_get (app->first_segment_manager);
   segment_manager_get_segment_info (sm->segment_indices[0],
 				    &seg_name, &a->segment_size);
diff --git a/src/vnet/session/session_api.c b/src/vnet/session/session_api.c
index 5a02a08e..8266922c 100755
--- a/src/vnet/session/session_api.c
+++ b/src/vnet/session/session_api.c
@@ -102,9 +102,9 @@ send_session_accept_callback (stream_session_t * s)
   tc = tp_vft->get_connection (s->connection_index, s->thread_index);
   mp->listener_handle = listen_session_get_handle (listener);
   mp->handle = stream_session_handle (s);
-  mp->server_rx_fifo = (u64) s->server_rx_fifo;
-  mp->server_tx_fifo = (u64) s->server_tx_fifo;
-  mp->vpp_event_queue_address = (u64) vpp_queue;
+  mp->server_rx_fifo = pointer_to_uword (s->server_rx_fifo);
+  mp->server_tx_fifo = pointer_to_uword (s->server_tx_fifo);
+  mp->vpp_event_queue_address = pointer_to_uword (vpp_queue);
   mp->port = tc->rmt_port;
   mp->is_ip4 = tc->is_ip4;
   clib_memcpy (&mp->ip, &tc->rmt_ip, sizeof (tc->rmt_ip));
@@ -172,10 +172,10 @@ send_session_connected_callback (u32 app_index, u32 api_context,
   if (!is_fail)
     {
       vpp_queue = session_manager_get_vpp_event_queue (s->thread_index);
-      mp->server_rx_fifo = (u64) s->server_rx_fifo;
-      mp->server_tx_fifo = (u64) s->server_tx_fifo;
+      mp->server_rx_fifo = pointer_to_uword (s->server_rx_fifo);
+      mp->server_tx_fifo = pointer_to_uword (s->server_tx_fifo);
       mp->handle = stream_session_handle (s);
-      mp->vpp_event_queue_address = (u64) vpp_queue;
+      mp->vpp_event_queue_address = pointer_to_uword (vpp_queue);
       mp->retval = 0;
     }
   else
@@ -225,7 +225,7 @@ redirect_connect_callback (u32 server_api_client_index, void *mp_arg)
     }
 
   /* Tell the server the client's API queue address, so it can reply */
-  mp->client_queue_address = (u64) client_q;
+  mp->client_queue_address = pointer_to_uword (client_q);
   app = application_lookup (mp->client_index);
   if (!app)
     {
diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c
index 32d69a96..6f890874 100644
--- a/src/vnet/tcp/builtin_client.c
+++ b/src/vnet/tcp/builtin_client.c
@@ -274,11 +274,12 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp)
       return;
     }
 
-  tm->our_event_queue = (unix_shared_memory_queue_t *)
-    mp->vpp_event_queue_address;
-
-  tm->vpp_event_queue = (unix_shared_memory_queue_t *)
-    mp->vpp_event_queue_address;
+  tm->our_event_queue =
+    uword_to_pointer (mp->vpp_event_queue_address,
+		      unix_shared_memory_queue_t *);
+  tm->vpp_event_queue =
+    uword_to_pointer (mp->vpp_event_queue_address,
+		      unix_shared_memory_queue_t *);
 
   /*
    * Setup session
@@ -288,9 +289,11 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp)
   session_index = session - tm->sessions;
   session->bytes_to_receive = session->bytes_to_send = tm->bytes_to_send;
 
-  session->server_rx_fifo = (svm_fifo_t *) mp->server_rx_fifo;
+  session->server_rx_fifo =
+    uword_to_pointer (mp->server_rx_fifo, svm_fifo_t *);
   session->server_rx_fifo->client_session_index = session_index;
-  session->server_tx_fifo = (svm_fifo_t *) mp->server_tx_fifo;
+  session->server_tx_fifo =
+    uword_to_pointer (mp->server_tx_fifo, svm_fifo_t *);
   session->server_tx_fifo->client_session_index = session_index;
   session->vpp_session_handle = mp->handle;
 
@@ -321,7 +324,7 @@ create_api_loopback (tclient_main_t * tm)
   memset (mp, 0, sizeof (*mp));
   mp->_vl_msg_id = VL_API_MEMCLNT_CREATE;
   mp->context = 0xFEEDFACE;
-  mp->input_queue = (u64) tm->vl_input_queue;
+  mp->input_queue = pointer_to_uword (tm->vl_input_queue);
   strncpy ((char *) mp->name, "tcp_tester", sizeof (mp->name) - 1);
 
   vl_api_memclnt_create_t_handler (mp);
diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c
index 34682699..621ce02a 100644
--- a/src/vnet/tcp/builtin_server.c
+++ b/src/vnet/tcp/builtin_server.c
@@ -244,7 +244,7 @@ create_api_loopback (vlib_main_t * vm)
   memset (mp, 0, sizeof (*mp));
   mp->_vl_msg_id = VL_API_MEMCLNT_CREATE;
   mp->context = 0xFEEDFACE;
-  mp->input_queue = (u64) bsm->vl_input_queue;
+  mp->input_queue = pointer_to_uword (bsm->vl_input_queue);
   strncpy ((char *) mp->name, "tcp_test_server", sizeof (mp->name) - 1);
 
   vl_api_memclnt_create_t_handler (mp);
diff --git a/src/vppinfra/mheap.c b/src/vppinfra/mheap.c
index b8828f9e..192732db 100644
--- a/src/vppinfra/mheap.c
+++ b/src/vppinfra/mheap.c
@@ -304,7 +304,7 @@ mheap_small_object_cache_mask (mheap_small_object_cache_t * c, uword bin)
   uword mask;
 
 /* $$$$ ELIOT FIXME: add Altivec version of this routine */
-#if !defined (CLIB_HAVE_VEC128) || defined (__ALTIVEC__)
+#if !defined (CLIB_HAVE_VEC128) || defined (__ALTIVEC__) || defined (__i386__)
   mask = 0;
 #else
   u8x16 b = u8x16_splat (bin);
-- 
cgit 1.2.3-korg


From f55f9b851f59264d737d92c6277a87588c565d24 Mon Sep 17 00:00:00 2001
From: Damjan Marion <damarion@cisco.com>
Date: Wed, 10 May 2017 21:06:28 +0200
Subject: completely deprecate os_get_cpu_number, replace new occurrences

Change-Id: I82c663bc0866c6c68ba354104b0bb059387f4b9d
Signed-off-by: Damjan Marion <damarion@cisco.com>
---
 src/plugins/flowperpkt/l2_node.c       | 20 ++++++++++----------
 src/plugins/flowperpkt/node.c          | 20 ++++++++++----------
 src/plugins/snat/in2out.c              |  2 +-
 src/plugins/snat/out2in.c              |  2 +-
 src/vlib/main.h                        |  2 +-
 src/vlib/threads.c                     | 12 ++----------
 src/vlib/threads.h                     |  3 +--
 src/vlib/unix/main.c                   |  2 +-
 src/vlibmemory/memory_vlib.c           |  2 +-
 src/vnet/dpo/interface_dpo.c           |  8 ++++----
 src/vnet/lisp-gpe/lisp_gpe_adjacency.c |  2 +-
 src/vppinfra/bihash_template.c         | 16 ++++++++--------
 src/vppinfra/lock.h                    |  6 +++---
 src/vppinfra/mem.h                     |  6 +++---
 src/vppinfra/mhash.c                   |  2 +-
 src/vppinfra/mhash.h                   |  2 +-
 src/vppinfra/mheap.c                   |  4 ++--
 src/vppinfra/os.h                      | 20 ++++++++++++++++++--
 src/vppinfra/smp.c                     |  2 +-
 src/vppinfra/unix-misc.c               | 19 +++++++------------
 20 files changed, 77 insertions(+), 75 deletions(-)

(limited to 'src/vlib/threads.c')

diff --git a/src/plugins/flowperpkt/l2_node.c b/src/plugins/flowperpkt/l2_node.c
index fdaf81d1..db80e990 100644
--- a/src/plugins/flowperpkt/l2_node.c
+++ b/src/plugins/flowperpkt/l2_node.c
@@ -102,7 +102,7 @@ add_to_flow_record_l2 (vlib_main_t * vm,
 		       u8 * src_mac, u8 * dst_mac,
 		       u16 ethertype, u64 timestamp, u16 length, int do_flush)
 {
-  u32 my_cpu_number = vm->thread_index;
+  u32 my_thread_index = vm->thread_index;
   flow_report_main_t *frm = &flow_report_main;
   ip4_header_t *ip;
   udp_header_t *udp;
@@ -116,7 +116,7 @@ add_to_flow_record_l2 (vlib_main_t * vm,
   vlib_buffer_free_list_t *fl;
 
   /* Find or allocate a buffer */
-  b0 = fm->l2_buffers_per_worker[my_cpu_number];
+  b0 = fm->l2_buffers_per_worker[my_thread_index];
 
   /* Need to allocate a buffer? */
   if (PREDICT_FALSE (b0 == 0))
@@ -130,7 +130,7 @@ add_to_flow_record_l2 (vlib_main_t * vm,
 	return;
 
       /* Initialize the buffer */
-      b0 = fm->l2_buffers_per_worker[my_cpu_number] =
+      b0 = fm->l2_buffers_per_worker[my_thread_index] =
 	vlib_get_buffer (vm, bi0);
       fl =
 	vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
@@ -142,16 +142,16 @@ add_to_flow_record_l2 (vlib_main_t * vm,
     {
       /* use the current buffer */
       bi0 = vlib_get_buffer_index (vm, b0);
-      offset = fm->l2_next_record_offset_per_worker[my_cpu_number];
+      offset = fm->l2_next_record_offset_per_worker[my_thread_index];
     }
 
   /* Find or allocate a frame */
-  f = fm->l2_frames_per_worker[my_cpu_number];
+  f = fm->l2_frames_per_worker[my_thread_index];
   if (PREDICT_FALSE (f == 0))
     {
       u32 *to_next;
       f = vlib_get_frame_to_node (vm, ip4_lookup_node.index);
-      fm->l2_frames_per_worker[my_cpu_number] = f;
+      fm->l2_frames_per_worker[my_thread_index] = f;
 
       /* Enqueue the buffer */
       to_next = vlib_frame_vector_args (f);
@@ -299,13 +299,13 @@ add_to_flow_record_l2 (vlib_main_t * vm,
 	}
 
       vlib_put_frame_to_node (vm, ip4_lookup_node.index,
-			      fm->l2_frames_per_worker[my_cpu_number]);
-      fm->l2_frames_per_worker[my_cpu_number] = 0;
-      fm->l2_buffers_per_worker[my_cpu_number] = 0;
+			      fm->l2_frames_per_worker[my_thread_index]);
+      fm->l2_frames_per_worker[my_thread_index] = 0;
+      fm->l2_buffers_per_worker[my_thread_index] = 0;
       offset = 0;
     }
 
-  fm->l2_next_record_offset_per_worker[my_cpu_number] = offset;
+  fm->l2_next_record_offset_per_worker[my_thread_index] = offset;
 }
 
 void
diff --git a/src/plugins/flowperpkt/node.c b/src/plugins/flowperpkt/node.c
index 0277682d..9bac4166 100644
--- a/src/plugins/flowperpkt/node.c
+++ b/src/plugins/flowperpkt/node.c
@@ -101,7 +101,7 @@ add_to_flow_record_ipv4 (vlib_main_t * vm,
 			 u32 src_address, u32 dst_address,
 			 u8 tos, u64 timestamp, u16 length, int do_flush)
 {
-  u32 my_cpu_number = vm->thread_index;
+  u32 my_thread_index = vm->thread_index;
   flow_report_main_t *frm = &flow_report_main;
   ip4_header_t *ip;
   udp_header_t *udp;
@@ -115,7 +115,7 @@ add_to_flow_record_ipv4 (vlib_main_t * vm,
   vlib_buffer_free_list_t *fl;
 
   /* Find or allocate a buffer */
-  b0 = fm->ipv4_buffers_per_worker[my_cpu_number];
+  b0 = fm->ipv4_buffers_per_worker[my_thread_index];
 
   /* Need to allocate a buffer? */
   if (PREDICT_FALSE (b0 == 0))
@@ -129,7 +129,7 @@ add_to_flow_record_ipv4 (vlib_main_t * vm,
 	return;
 
       /* Initialize the buffer */
-      b0 = fm->ipv4_buffers_per_worker[my_cpu_number] =
+      b0 = fm->ipv4_buffers_per_worker[my_thread_index] =
 	vlib_get_buffer (vm, bi0);
       fl =
 	vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
@@ -141,16 +141,16 @@ add_to_flow_record_ipv4 (vlib_main_t * vm,
     {
       /* use the current buffer */
       bi0 = vlib_get_buffer_index (vm, b0);
-      offset = fm->ipv4_next_record_offset_per_worker[my_cpu_number];
+      offset = fm->ipv4_next_record_offset_per_worker[my_thread_index];
     }
 
   /* Find or allocate a frame */
-  f = fm->ipv4_frames_per_worker[my_cpu_number];
+  f = fm->ipv4_frames_per_worker[my_thread_index];
   if (PREDICT_FALSE (f == 0))
     {
       u32 *to_next;
       f = vlib_get_frame_to_node (vm, ip4_lookup_node.index);
-      fm->ipv4_frames_per_worker[my_cpu_number] = f;
+      fm->ipv4_frames_per_worker[my_thread_index] = f;
 
       /* Enqueue the buffer */
       to_next = vlib_frame_vector_args (f);
@@ -300,13 +300,13 @@ add_to_flow_record_ipv4 (vlib_main_t * vm,
 	}
 
       vlib_put_frame_to_node (vm, ip4_lookup_node.index,
-			      fm->ipv4_frames_per_worker[my_cpu_number]);
-      fm->ipv4_frames_per_worker[my_cpu_number] = 0;
-      fm->ipv4_buffers_per_worker[my_cpu_number] = 0;
+			      fm->ipv4_frames_per_worker[my_thread_index]);
+      fm->ipv4_frames_per_worker[my_thread_index] = 0;
+      fm->ipv4_buffers_per_worker[my_thread_index] = 0;
       offset = 0;
     }
 
-  fm->ipv4_next_record_offset_per_worker[my_cpu_number] = offset;
+  fm->ipv4_next_record_offset_per_worker[my_thread_index] = offset;
 }
 
 void
diff --git a/src/plugins/snat/in2out.c b/src/plugins/snat/in2out.c
index f7d29c69..bc86a7a4 100644
--- a/src/plugins/snat/in2out.c
+++ b/src/plugins/snat/in2out.c
@@ -1514,7 +1514,7 @@ snat_det_in2out_node_fn (vlib_main_t * vm,
   u32 pkts_processed = 0;
   snat_main_t * sm = &snat_main;
   u32 now = (u32) vlib_time_now (vm);
-  u32 thread_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
diff --git a/src/plugins/snat/out2in.c b/src/plugins/snat/out2in.c
index 3d7b106a..824406ab 100644
--- a/src/plugins/snat/out2in.c
+++ b/src/plugins/snat/out2in.c
@@ -1168,7 +1168,7 @@ snat_det_out2in_node_fn (vlib_main_t * vm,
   snat_out2in_next_t next_index;
   u32 pkts_processed = 0;
   snat_main_t * sm = &snat_main;
-  u32 thread_index = os_get_cpu_number ();
+  u32 thread_index = vlib_get_thread_index ();
 
   from = vlib_frame_vector_args (frame);
   n_left_from = frame->n_vectors;
diff --git a/src/vlib/main.h b/src/vlib/main.h
index 329bf073..0e8026d1 100644
--- a/src/vlib/main.h
+++ b/src/vlib/main.h
@@ -320,7 +320,7 @@ always_inline void vlib_set_queue_signal_callback
 /* Main routine. */
 int vlib_main (vlib_main_t * vm, unformat_input_t * input);
 
-/* Thread stacks, for os_get_cpu_number */
+/* Thread stacks, for os_get_thread_index */
 extern u8 **vlib_thread_stacks;
 
 /* Number of thread stacks that the application needs */
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index 9ccfd3a2..b7bc9e26 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -35,16 +35,8 @@ vl (void *p)
 vlib_worker_thread_t *vlib_worker_threads;
 vlib_thread_main_t vlib_thread_main;
 
-__thread uword vlib_thread_index = 0;
-
-uword
-os_get_cpu_number (void)
-{
-  return vlib_thread_index;
-}
-
 uword
-os_get_ncpus (void)
+os_get_nthreads (void)
 {
   u32 len;
 
@@ -467,7 +459,7 @@ vlib_worker_thread_bootstrap_fn (void *arg)
   w->lwp = syscall (SYS_gettid);
   w->thread_id = pthread_self ();
 
-  vlib_thread_index = w - vlib_worker_threads;
+  __os_thread_index = w - vlib_worker_threads;
 
   rv = (void *) clib_calljmp
     ((uword (*)(uword)) w->thread_function,
diff --git a/src/vlib/threads.h b/src/vlib/threads.h
index 101d3d4a..17d35a24 100644
--- a/src/vlib/threads.h
+++ b/src/vlib/threads.h
@@ -181,11 +181,10 @@ u32 vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts);
 void vlib_worker_thread_barrier_sync (vlib_main_t * vm);
 void vlib_worker_thread_barrier_release (vlib_main_t * vm);
 
-extern __thread uword vlib_thread_index;
 static_always_inline uword
 vlib_get_thread_index (void)
 {
-  return vlib_thread_index;
+  return __os_thread_index;
 }
 
 always_inline void
diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c
index db5ddd64..103576db 100644
--- a/src/vlib/unix/main.c
+++ b/src/vlib/unix/main.c
@@ -565,7 +565,7 @@ vlib_unix_main (int argc, char *argv[])
 
   vlib_thread_stack_init (0);
 
-  vlib_thread_index = 0;
+  __os_thread_index = 0;
 
   i = clib_calljmp (thread0, (uword) vm,
 		    (void *) (vlib_thread_stacks[0] +
diff --git a/src/vlibmemory/memory_vlib.c b/src/vlibmemory/memory_vlib.c
index acba8b3f..e5d88732 100644
--- a/src/vlibmemory/memory_vlib.c
+++ b/src/vlibmemory/memory_vlib.c
@@ -1333,7 +1333,7 @@ vl_api_rpc_call_main_thread (void *fp, u8 * data, u32 data_length)
   unix_shared_memory_queue_t *q;
 
   /* Main thread: call the function directly */
-  if (os_get_cpu_number () == 0)
+  if (vlib_get_thread_index () == 0)
     {
       vlib_main_t *vm = vlib_get_main ();
       void (*call_fp) (void *);
diff --git a/src/vnet/dpo/interface_dpo.c b/src/vnet/dpo/interface_dpo.c
index 50ca756f..8d700c23 100644
--- a/src/vnet/dpo/interface_dpo.c
+++ b/src/vnet/dpo/interface_dpo.c
@@ -231,7 +231,7 @@ interface_dpo_inline (vlib_main_t * vm,
                       vlib_frame_t * from_frame)
 {
     u32 n_left_from, next_index, * from, * to_next;
-    u32 cpu_index = os_get_cpu_number();
+    u32 thread_index = vlib_get_thread_index ();
     vnet_interface_main_t *im;
 
     im = &vnet_get_main ()->interface_main;
@@ -274,13 +274,13 @@ interface_dpo_inline (vlib_main_t * vm,
 
             vlib_increment_combined_counter (im->combined_sw_if_counters
                                              + VNET_INTERFACE_COUNTER_RX,
-                                             cpu_index,
+                                             thread_index,
                                              ido0->ido_sw_if_index,
                                              1,
                                              vlib_buffer_length_in_chain (vm, b0));
             vlib_increment_combined_counter (im->combined_sw_if_counters
                                              + VNET_INTERFACE_COUNTER_RX,
-                                             cpu_index,
+                                             thread_index,
                                              ido1->ido_sw_if_index,
                                              1,
                                              vlib_buffer_length_in_chain (vm, b1));
@@ -331,7 +331,7 @@ interface_dpo_inline (vlib_main_t * vm,
             /* Bump the interface's RX coutners */
             vlib_increment_combined_counter (im->combined_sw_if_counters
                                              + VNET_INTERFACE_COUNTER_RX,
-                                             cpu_index,
+                                             thread_index,
                                              ido0->ido_sw_if_index,
                                              1,
                                              vlib_buffer_length_in_chain (vm, b0));
diff --git a/src/vnet/lisp-gpe/lisp_gpe_adjacency.c b/src/vnet/lisp-gpe/lisp_gpe_adjacency.c
index d5f3a28a..7db1c9bb 100644
--- a/src/vnet/lisp-gpe/lisp_gpe_adjacency.c
+++ b/src/vnet/lisp-gpe/lisp_gpe_adjacency.c
@@ -302,7 +302,7 @@ lisp_gpe_increment_stats_counters (lisp_cp_main_t * lcm, ip_adjacency_t * adj,
 
   /* compute payload length starting after GPE */
   u32 bytes = b->current_length - (lisp_data - b->data - b->current_data);
-  vlib_increment_combined_counter (&lgm->counters, os_get_cpu_number (),
+  vlib_increment_combined_counter (&lgm->counters, vlib_get_thread_index (),
 				   p[0], 1, bytes);
 }
 
diff --git a/src/vppinfra/bihash_template.c b/src/vppinfra/bihash_template.c
index d8b97b5f..51fadeb8 100644
--- a/src/vppinfra/bihash_template.c
+++ b/src/vppinfra/bihash_template.c
@@ -96,12 +96,12 @@ BV (make_working_copy) (BVT (clib_bihash) * h, clib_bihash_bucket_t * b)
   clib_bihash_bucket_t working_bucket __attribute__ ((aligned (8)));
   void *oldheap;
   BVT (clib_bihash_value) * working_copy;
-  u32 cpu_number = os_get_cpu_number ();
+  u32 thread_index = os_get_thread_index ();
 
-  if (cpu_number >= vec_len (h->working_copies))
+  if (thread_index >= vec_len (h->working_copies))
     {
       oldheap = clib_mem_set_heap (h->mheap);
-      vec_validate (h->working_copies, cpu_number);
+      vec_validate (h->working_copies, thread_index);
       clib_mem_set_heap (oldheap);
     }
 
@@ -110,7 +110,7 @@ BV (make_working_copy) (BVT (clib_bihash) * h, clib_bihash_bucket_t * b)
    * updates from multiple threads will not result in sporadic, spurious
    * lookup failures.
    */
-  working_copy = h->working_copies[cpu_number];
+  working_copy = h->working_copies[thread_index];
 
   h->saved_bucket.as_u64 = b->as_u64;
   oldheap = clib_mem_set_heap (h->mheap);
@@ -119,7 +119,7 @@ BV (make_working_copy) (BVT (clib_bihash) * h, clib_bihash_bucket_t * b)
     {
       vec_validate_aligned (working_copy, (1 << b->log2_pages) - 1,
 			    sizeof (u64));
-      h->working_copies[cpu_number] = working_copy;
+      h->working_copies[thread_index] = working_copy;
     }
 
   _vec_len (working_copy) = 1 << b->log2_pages;
@@ -132,7 +132,7 @@ BV (make_working_copy) (BVT (clib_bihash) * h, clib_bihash_bucket_t * b)
   working_bucket.offset = BV (clib_bihash_get_offset) (h, working_copy);
   CLIB_MEMORY_BARRIER ();
   b->as_u64 = working_bucket.as_u64;
-  h->working_copies[cpu_number] = working_copy;
+  h->working_copies[thread_index] = working_copy;
 }
 
 static
@@ -233,7 +233,7 @@ int BV (clib_bihash_add_del)
   int i, limit;
   u64 hash, new_hash;
   u32 new_log2_pages;
-  u32 cpu_number = os_get_cpu_number ();
+  u32 thread_index = os_get_thread_index ();
   int mark_bucket_linear;
   int resplit_once;
 
@@ -323,7 +323,7 @@ int BV (clib_bihash_add_del)
   new_log2_pages = h->saved_bucket.log2_pages + 1;
   mark_bucket_linear = 0;
 
-  working_copy = h->working_copies[cpu_number];
+  working_copy = h->working_copies[thread_index];
   resplit_once = 0;
 
   new_v = BV (split_and_rehash) (h, working_copy, new_log2_pages);
diff --git a/src/vppinfra/lock.h b/src/vppinfra/lock.h
index c60ff414..0cd2b4fe 100644
--- a/src/vppinfra/lock.h
+++ b/src/vppinfra/lock.h
@@ -24,7 +24,7 @@ typedef struct
   u32 lock;
 #if CLIB_DEBUG > 0
   pid_t pid;
-  uword cpu_index;
+  uword thread_index;
   void *frame_address;
 #endif
 } *clib_spinlock_t;
@@ -57,7 +57,7 @@ clib_spinlock_lock (clib_spinlock_t * p)
 #if CLIB_DEBUG > 0
   (*p)->frame_address = __builtin_frame_address (0);
   (*p)->pid = getpid ();
-  (*p)->cpu_index = os_get_cpu_number ();
+  (*p)->thread_index = os_get_thread_index ();
 #endif
 }
 
@@ -75,7 +75,7 @@ clib_spinlock_unlock (clib_spinlock_t * p)
 #if CLIB_DEBUG > 0
   (*p)->frame_address = 0;
   (*p)->pid = 0;
-  (*p)->cpu_index = 0;
+  (*p)->thread_index = 0;
 #endif
 }
 
diff --git a/src/vppinfra/mem.h b/src/vppinfra/mem.h
index 1260eab2..63c5ac16 100644
--- a/src/vppinfra/mem.h
+++ b/src/vppinfra/mem.h
@@ -54,14 +54,14 @@ extern void *clib_per_cpu_mheaps[CLIB_MAX_MHEAPS];
 always_inline void *
 clib_mem_get_per_cpu_heap (void)
 {
-  int cpu = os_get_cpu_number ();
+  int cpu = os_get_thread_index ();
   return clib_per_cpu_mheaps[cpu];
 }
 
 always_inline void *
 clib_mem_set_per_cpu_heap (u8 * new_heap)
 {
-  int cpu = os_get_cpu_number ();
+  int cpu = os_get_thread_index ();
   void *old = clib_per_cpu_mheaps[cpu];
   clib_per_cpu_mheaps[cpu] = new_heap;
   return old;
@@ -83,7 +83,7 @@ clib_mem_alloc_aligned_at_offset (uword size, uword align, uword align_offset,
 	align_offset = align;
     }
 
-  cpu = os_get_cpu_number ();
+  cpu = os_get_thread_index ();
   heap = clib_per_cpu_mheaps[cpu];
   heap = mheap_get_aligned (heap, size, align, align_offset, &offset);
   clib_per_cpu_mheaps[cpu] = heap;
diff --git a/src/vppinfra/mhash.c b/src/vppinfra/mhash.c
index c917e164..00b67c49 100644
--- a/src/vppinfra/mhash.c
+++ b/src/vppinfra/mhash.c
@@ -226,7 +226,7 @@ static uword
 mhash_set_tmp_key (mhash_t * h, const void *key)
 {
   u8 *key_tmp;
-  int my_cpu = os_get_cpu_number ();
+  int my_cpu = os_get_thread_index ();
 
   vec_validate (h->key_tmps, my_cpu);
   key_tmp = h->key_tmps[my_cpu];
diff --git a/src/vppinfra/mhash.h b/src/vppinfra/mhash.h
index 102adf4e..7eb19183 100644
--- a/src/vppinfra/mhash.h
+++ b/src/vppinfra/mhash.h
@@ -93,7 +93,7 @@ mhash_key_to_mem (mhash_t * h, uword key)
     {
       u8 *key_tmp;
 
-      int my_cpu = os_get_cpu_number ();
+      int my_cpu = os_get_thread_index ();
       vec_validate (h->key_tmps, my_cpu);
       key_tmp = h->key_tmps[my_cpu];
       return key_tmp;
diff --git a/src/vppinfra/mheap.c b/src/vppinfra/mheap.c
index 192732db..d4010ceb 100644
--- a/src/vppinfra/mheap.c
+++ b/src/vppinfra/mheap.c
@@ -56,7 +56,7 @@ mheap_maybe_lock (void *v)
   mheap_t *h = mheap_header (v);
   if (v && (h->flags & MHEAP_FLAG_THREAD_SAFE))
     {
-      u32 my_cpu = os_get_cpu_number ();
+      u32 my_cpu = os_get_thread_index ();
       if (h->owner_cpu == my_cpu)
 	{
 	  h->recursion_count++;
@@ -77,7 +77,7 @@ mheap_maybe_unlock (void *v)
   mheap_t *h = mheap_header (v);
   if (v && h->flags & MHEAP_FLAG_THREAD_SAFE)
     {
-      ASSERT (os_get_cpu_number () == h->owner_cpu);
+      ASSERT (os_get_thread_index () == h->owner_cpu);
       if (--h->recursion_count == 0)
 	{
 	  h->owner_cpu = ~0;
diff --git a/src/vppinfra/os.h b/src/vppinfra/os.h
index a5c74f8c..33300716 100644
--- a/src/vppinfra/os.h
+++ b/src/vppinfra/os.h
@@ -56,8 +56,24 @@ void os_out_of_memory (void);
 /* Estimate, measure or divine CPU timestamp clock frequency. */
 f64 os_cpu_clock_frequency (void);
 
-uword os_get_cpu_number (void);
-uword os_get_ncpus (void);
+extern __thread uword __os_thread_index;
+
+static_always_inline uword
+os_get_thread_index (void)
+{
+  return __os_thread_index;
+}
+
+static_always_inline uword
+os_get_cpu_number (void) __attribute__ ((deprecated));
+
+static_always_inline uword
+os_get_cpu_number (void)
+{
+  return __os_thread_index;
+}
+
+uword os_get_nthreads (void);
 
 #include <vppinfra/smp.h>
 
diff --git a/src/vppinfra/smp.c b/src/vppinfra/smp.c
index 8ac19960..f603283e 100644
--- a/src/vppinfra/smp.c
+++ b/src/vppinfra/smp.c
@@ -53,7 +53,7 @@ allocate_per_cpu_mheap (uword cpu)
   void *heap;
   uword vm_size, stack_size, mheap_flags;
 
-  ASSERT (os_get_cpu_number () == cpu);
+  ASSERT (os_get_thread_index () == cpu);
 
   vm_size = (uword) 1 << m->log2_n_per_cpu_vm_bytes;
   stack_size = (uword) 1 << m->log2_n_per_cpu_stack_bytes;
diff --git a/src/vppinfra/unix-misc.c b/src/vppinfra/unix-misc.c
index 2928369d..361015b4 100644
--- a/src/vppinfra/unix-misc.c
+++ b/src/vppinfra/unix-misc.c
@@ -45,6 +45,8 @@
 #include <fcntl.h>
 #include <stdio.h>		/* for sprintf */
 
+__thread uword __os_thread_index = 0;
+
 clib_error_t *
 unix_file_n_bytes (char *file, uword * result)
 {
@@ -188,14 +190,14 @@ void os_puts (u8 * string, uword string_length, uword is_error)
 void
 os_puts (u8 * string, uword string_length, uword is_error)
 {
-  int cpu = os_get_cpu_number ();
-  int ncpus = os_get_ncpus ();
+  int cpu = os_get_thread_index ();
+  int nthreads = os_get_nthreads ();
   char buf[64];
   int fd = is_error ? 2 : 1;
   struct iovec iovs[2];
   int n_iovs = 0;
 
-  if (ncpus > 1)
+  if (nthreads > 1)
     {
       snprintf (buf, sizeof (buf), "%d: ", cpu);
 
@@ -219,16 +221,9 @@ os_out_of_memory (void)
   os_panic ();
 }
 
-uword os_get_cpu_number (void) __attribute__ ((weak));
-uword
-os_get_cpu_number (void)
-{
-  return 0;
-}
-
-uword os_get_ncpus (void) __attribute__ ((weak));
+uword os_get_nthreads (void) __attribute__ ((weak));
 uword
-os_get_ncpus (void)
+os_get_nthreads (void)
 {
   return 1;
 }
-- 
cgit 1.2.3-korg


From 93992a9048cb6e5dcd22de5091e72de778122627 Mon Sep 17 00:00:00 2001
From: Florin Coras <fcoras@cisco.com>
Date: Wed, 24 May 2017 18:03:56 -0700
Subject: Implement sack based tcp loss recovery (RFC 6675)

- refactor existing congestion control code (RFC 6582/5681). Handling of ack
  feedback now consists of: ack parsing, cc event detection, event handling,
  congestion control update
- extend sack scoreboard to support sack based retransmissions
- basic implementation of Eifel detection algorithm (RFC 3522) for
  detecting spurious retransmissions
- actually initialize the per-thread frame freelist hash tables
- increase worker stack size to 2 MB
- fix session queue node out-of-buffer handling
  - ensure that the local buffer cache vec_len matches reality
  - avoid 2x spurious event requeues when short of buffers
  - count out-of-buffer events
- make the builtin server thread-safe
- fix bihash template threading issue: need to paint -1 across uninitialized
  working_copy_length vector elements (via rebase from master)

Change-Id: I646cb9f1add9a67d08f4a87badbcb117980ebfc4
Signed-off-by: Florin Coras <fcoras@cisco.com>
Signed-off-by: Dave Barach <dbarach@cisco.com>
---
 src/svm/svm_fifo.c             |   5 +-
 src/vlib/node.c                |   1 +
 src/vlib/threads.c             |   2 +-
 src/vlib/threads.h             |   2 +-
 src/vnet/session/node.c        |  53 ++--
 src/vnet/session/session.c     |  11 +-
 src/vnet/session/session.h     |   6 +-
 src/vnet/session/session_cli.c |  26 +-
 src/vnet/tcp/builtin_client.c  |  40 ++-
 src/vnet/tcp/builtin_server.c  |  20 +-
 src/vnet/tcp/tcp.c             |  57 ++--
 src/vnet/tcp/tcp.h             | 112 +++++--
 src/vnet/tcp/tcp_debug.h       |  16 +-
 src/vnet/tcp/tcp_input.c       | 671 +++++++++++++++++++++++++++++------------
 src/vnet/tcp/tcp_newreno.c     |  20 +-
 src/vnet/tcp/tcp_output.c      | 287 ++++++++++++------
 src/vnet/tcp/tcp_test.c        |  53 ++--
 17 files changed, 973 insertions(+), 409 deletions(-)

(limited to 'src/vlib/threads.c')

diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c
index f13f6fea..5c8f244a 100644
--- a/src/svm/svm_fifo.c
+++ b/src/svm/svm_fifo.c
@@ -540,7 +540,7 @@ svm_fifo_peek (svm_fifo_t * f, u32 relative_offset, u32 max_bytes,
 
   /* read cursize, which can only increase while we're working */
   cursize = svm_fifo_max_dequeue (f);
-  if (PREDICT_FALSE (cursize == 0))
+  if (PREDICT_FALSE (cursize < relative_offset))
     return -2;			/* nothing in the fifo */
 
   nitems = f->nitems;
@@ -548,7 +548,8 @@ svm_fifo_peek (svm_fifo_t * f, u32 relative_offset, u32 max_bytes,
   real_head = real_head >= nitems ? real_head - nitems : real_head;
 
   /* Number of bytes we're going to copy */
-  total_copy_bytes = (cursize < max_bytes) ? cursize : max_bytes;
+  total_copy_bytes = (cursize - relative_offset < max_bytes) ?
+    cursize - relative_offset : max_bytes;
 
   if (PREDICT_TRUE (copy_here != 0))
     {
diff --git a/src/vlib/node.c b/src/vlib/node.c
index bbd3a42e..eecad274 100644
--- a/src/vlib/node.c
+++ b/src/vlib/node.c
@@ -502,6 +502,7 @@ vlib_node_main_init (vlib_main_t * vm)
   vlib_node_t *n;
   uword ni;
 
+  nm->frame_size_hash = hash_create (0, sizeof (uword));
   nm->flags |= VLIB_NODE_MAIN_RUNTIME_STARTED;
 
   /* Generate sibling relationships */
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index b7bc9e26..0c775e2d 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -670,7 +670,7 @@ start_workers (vlib_main_t * vm)
 
 	      /* zap the (per worker) frame freelists, etc */
 	      nm_clone->frame_sizes = 0;
-	      nm_clone->frame_size_hash = 0;
+	      nm_clone->frame_size_hash = hash_create (0, sizeof (uword));
 
 	      /* Packet trace buffers are guaranteed to be empty, nothing to do here */
 
diff --git a/src/vlib/threads.h b/src/vlib/threads.h
index 17d35a24..572ce77f 100644
--- a/src/vlib/threads.h
+++ b/src/vlib/threads.h
@@ -62,7 +62,7 @@ typedef struct vlib_thread_registration_
 #define VLIB_CPU_MASK (VLIB_MAX_CPUS - 1)	/* 0x3f, max */
 #define VLIB_OFFSET_MASK (~VLIB_CPU_MASK)
 
-#define VLIB_LOG2_THREAD_STACK_SIZE (20)
+#define VLIB_LOG2_THREAD_STACK_SIZE (21)
 #define VLIB_THREAD_STACK_SIZE (1<<VLIB_LOG2_THREAD_STACK_SIZE)
 
 typedef enum
diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c
index 3053ccc2..07eeae82 100644
--- a/src/vnet/session/node.c
+++ b/src/vnet/session/node.c
@@ -47,7 +47,8 @@ vlib_node_registration_t session_queue_node;
 
 #define foreach_session_queue_error		\
 _(TX, "Packets transmitted")                  	\
-_(TIMER, "Timer events")
+_(TIMER, "Timer events")			\
+_(NO_BUFFER, "Out of buffers")
 
 typedef enum
 {
@@ -141,6 +142,7 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
   u8 *data0;
   int i, n_bytes_read;
   u32 n_bytes_per_buf, deq_per_buf;
+  u32 buffers_allocated, buffers_allocated_this_call;
 
   next_index = next0 = session_type_to_next[s0->session_type];
 
@@ -167,9 +169,6 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
   /* Check how much we can pull. If buffering, subtract the offset */
   max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo) - rx_offset;
 
-  /* Allow enqueuing of a new event */
-  svm_fifo_unset_event (s0->server_tx_fifo);
-
   /* Nothing to read return */
   if (max_dequeue0 == 0)
     return 0;
@@ -187,8 +186,8 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
       max_len_to_snd0 = snd_space0;
     }
 
-  n_bytes_per_buf = vlib_buffer_free_list_buffer_size (vm,
-						       VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
+  n_bytes_per_buf = vlib_buffer_free_list_buffer_size
+    (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
   n_bytes_per_seg = MAX_HDRS_LEN + snd_mss0;
   n_bufs_per_seg = ceil ((double) n_bytes_per_seg / n_bytes_per_buf);
   n_bufs_per_evt = (ceil ((double) max_len_to_snd0 / n_bytes_per_seg))
@@ -205,24 +204,33 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
       if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE))
 	{
 	  vec_validate (smm->tx_buffers[thread_index],
-			n_bufs + VLIB_FRAME_SIZE - 1);
-	  n_bufs += vlib_buffer_alloc (vm,
-				       &smm->tx_buffers[thread_index][n_bufs],
-				       VLIB_FRAME_SIZE);
-
-	  /* buffer shortage
-	   * XXX 0.9 because when debugging we might not get a full frame */
-	  if (PREDICT_FALSE (n_bufs < 0.9 * VLIB_FRAME_SIZE))
+			n_bufs + 2 * VLIB_FRAME_SIZE - 1);
+
+	  buffers_allocated = 0;
+	  do
 	    {
-	      if (svm_fifo_set_event (s0->server_tx_fifo))
-		{
-		  vec_add1 (smm->pending_event_vector[thread_index], *e0);
-		}
-	      return -1;
+	      buffers_allocated_this_call =
+		vlib_buffer_alloc
+		(vm,
+		 &smm->tx_buffers[thread_index][n_bufs + buffers_allocated],
+		 2 * VLIB_FRAME_SIZE - buffers_allocated);
+	      buffers_allocated += buffers_allocated_this_call;
 	    }
+	  while (buffers_allocated_this_call > 0
+		 && ((buffers_allocated + n_bufs < VLIB_FRAME_SIZE)));
+
+	  n_bufs += buffers_allocated;
 
 	  _vec_len (smm->tx_buffers[thread_index]) = n_bufs;
+
+	  if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE))
+	    {
+	      vec_add1 (smm->pending_event_vector[thread_index], *e0);
+	      return -1;
+	    }
 	}
+      /* Allow enqueuing of a new event */
+      svm_fifo_unset_event (s0->server_tx_fifo);
 
       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
       while (left_to_snd0 && n_left_to_next >= n_bufs_per_seg)
@@ -232,7 +240,9 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node,
 	   */
 
 	  /* Get free buffer */
+	  ASSERT (n_bufs >= 1);
 	  bi0 = smm->tx_buffers[thread_index][--n_bufs];
+	  ASSERT (bi0);
 	  _vec_len (smm->tx_buffers[thread_index]) = n_bufs;
 
 	  b0 = vlib_get_buffer (vm, bi0);
@@ -545,9 +555,10 @@ skip_dequeue:
 							my_thread_index,
 							&n_tx_packets);
 	  /* Out of buffers */
-	  if (rv < 0)
+	  if (PREDICT_FALSE (rv < 0))
 	    {
-	      vec_add1 (smm->pending_event_vector[my_thread_index], *e0);
+	      vlib_node_increment_counter (vm, node->node_index,
+					   SESSION_QUEUE_ERROR_NO_BUFFER, 1);
 	      continue;
 	    }
 	  break;
diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c
index 02b0cced..534598d6 100644
--- a/src/vnet/session/session.c
+++ b/src/vnet/session/session.c
@@ -551,7 +551,7 @@ u8
 stream_session_no_space (transport_connection_t * tc, u32 thread_index,
 			 u16 data_len)
 {
-  stream_session_t *s = stream_session_get (tc->c_index, thread_index);
+  stream_session_t *s = stream_session_get (tc->s_index, thread_index);
 
   if (PREDICT_FALSE (s->session_state != SESSION_STATE_READY))
     return 1;
@@ -563,6 +563,15 @@ stream_session_no_space (transport_connection_t * tc, u32 thread_index,
 }
 
 u32
+stream_session_tx_fifo_max_dequeue (transport_connection_t * tc)
+{
+  stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index);
+  if (s->session_state != SESSION_STATE_READY)
+    return 0;
+  return svm_fifo_max_dequeue (s->server_tx_fifo);
+}
+
+int
 stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer,
 			   u32 offset, u32 max_bytes)
 {
diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h
index a8728649..d9c38bd1 100644
--- a/src/vnet/session/session.h
+++ b/src/vnet/session/session.h
@@ -352,16 +352,18 @@ stream_session_max_rx_enqueue (transport_connection_t * tc)
 }
 
 always_inline u32
-stream_session_fifo_size (transport_connection_t * tc)
+stream_session_rx_fifo_size (transport_connection_t * tc)
 {
   stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index);
   return s->server_rx_fifo->nitems;
 }
 
+u32 stream_session_tx_fifo_max_dequeue (transport_connection_t * tc);
+
 int
 stream_session_enqueue_data (transport_connection_t * tc, vlib_buffer_t * b,
 			     u32 offset, u8 queue_event, u8 is_in_order);
-u32
+int
 stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer,
 			   u32 offset, u32 max_bytes);
 u32 stream_session_dequeue_drop (transport_connection_t * tc, u32 max_bytes);
diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c
index 509eedbb..6b8341aa 100755
--- a/src/vnet/session/session_cli.c
+++ b/src/vnet/session/session_cli.c
@@ -15,6 +15,15 @@
 #include <vnet/session/application.h>
 #include <vnet/session/session.h>
 
+u8 *
+format_stream_session_fifos (u8 * s, va_list * args)
+{
+  stream_session_t *ss = va_arg (*args, stream_session_t *);
+  s = format (s, " Rx fifo: %U", format_svm_fifo, ss->server_rx_fifo, 1);
+  s = format (s, " Tx fifo: %U", format_svm_fifo, ss->server_tx_fifo, 1);
+  return s;
+}
+
 /**
  * Format stream session as per the following format
  *
@@ -44,6 +53,8 @@ format_stream_session (u8 * s, va_list * args)
 		  ss->thread_index, verbose);
       if (verbose == 1)
 	s = format (s, "%v", str);
+      if (verbose > 1)
+	s = format (s, "%U", format_stream_session_fifos, ss);
     }
   else if (ss->session_state == SESSION_STATE_LISTENING)
     {
@@ -57,8 +68,12 @@ format_stream_session (u8 * s, va_list * args)
     }
   else if (ss->session_state == SESSION_STATE_CLOSED)
     {
-      s = format (s, "[CL] %-40U%v", tp_vft->format_connection,
-		  ss->connection_index, ss->thread_index, verbose, str);
+      s = format (s, "[CL] %-40U", tp_vft->format_connection,
+		  ss->connection_index, ss->thread_index, verbose);
+      if (verbose == 1)
+	s = format (s, "%v", str);
+      if (verbose > 1)
+	s = format (s, "%U", format_stream_session_fifos, ss);
     }
   else
     {
@@ -124,13 +139,6 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input,
               ({
         	vec_reset_length (str);
                 str = format (str, "%U", format_stream_session, s, verbose);
-                if (verbose > 1)
-                  {
-                    str = format (str, " Rx fifo: %U", format_svm_fifo,
-				  s->server_rx_fifo, 1);
-                    str = format (str, " Tx fifo: %U", format_svm_fifo,
-				  s->server_tx_fifo, 1);
-                  }
                 vlib_cli_output (vm, "%v", str);
               }));
               /* *INDENT-ON* */
diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c
index 768f0c3c..7238cda3 100644
--- a/src/vnet/tcp/builtin_client.c
+++ b/src/vnet/tcp/builtin_client.c
@@ -115,8 +115,17 @@ receive_test_chunk (tclient_main_t * tm, session_t * s)
   /* Allow enqueuing of new event */
   // svm_fifo_unset_event (rx_fifo);
 
-  n_read = svm_fifo_dequeue_nowait (rx_fifo, vec_len (tm->rx_buf),
-				    tm->rx_buf);
+  if (test_bytes)
+    {
+      n_read = svm_fifo_dequeue_nowait (rx_fifo, vec_len (tm->rx_buf),
+					tm->rx_buf);
+    }
+  else
+    {
+      n_read = svm_fifo_max_dequeue (rx_fifo);
+      svm_fifo_dequeue_drop (rx_fifo, n_read);
+    }
+
   if (n_read > 0)
     {
       if (TCP_BUILTIN_CLIENT_DBG)
@@ -165,6 +174,8 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
   int i;
   int delete_session;
   u32 *connection_indices;
+  u32 tx_quota = 0;
+  u32 delta, prev_bytes_received_this_session;
 
   connection_indices = tm->connection_index_by_thread[my_thread_index];
 
@@ -177,14 +188,19 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
 
       sp = pool_elt_at_index (tm->sessions, connection_indices[i]);
 
-      if (sp->bytes_to_send > 0)
+      if (tx_quota < 60 && sp->bytes_to_send > 0)
 	{
 	  send_test_chunk (tm, sp);
 	  delete_session = 0;
+	  tx_quota++;
 	}
       if (sp->bytes_to_receive > 0)
 	{
+	  prev_bytes_received_this_session = sp->bytes_received;
 	  receive_test_chunk (tm, sp);
+	  delta = sp->bytes_received - prev_bytes_received_this_session;
+	  if (delta > 0 && tx_quota > 0)	/* u32 quota: never wrap below 0 */
+	    tx_quota--;
 	  delete_session = 0;
 	}
       if (PREDICT_FALSE (delete_session == 1))
@@ -195,11 +211,19 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node,
 	  dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION);
 	  dmp->client_index = tm->my_client_index;
 	  dmp->handle = sp->vpp_session_handle;
-	  vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp);
-	  vec_delete (connection_indices, 1, i);
-	  tm->connection_index_by_thread[my_thread_index] =
-	    connection_indices;
-	  __sync_fetch_and_add (&tm->ready_connections, -1);
+//        vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp);
+	  if (!unix_shared_memory_queue_add (tm->vl_input_queue, (u8 *) & dmp,
+					     1))
+	    {
+	      vec_delete (connection_indices, 1, i);
+	      tm->connection_index_by_thread[my_thread_index] =
+		connection_indices;
+	      __sync_fetch_and_add (&tm->ready_connections, -1);
+	    }
+	  else
+	    {
+	      vl_msg_api_free (dmp);
+	    }
 
 	  /* Kick the debug CLI process */
 	  if (tm->ready_connections == 0)
diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c
index 4f0e211c..8bd2f360 100644
--- a/src/vnet/tcp/builtin_server.c
+++ b/src/vnet/tcp/builtin_server.c
@@ -39,7 +39,8 @@
 
 typedef struct
 {
-  u8 *rx_buf;
+  /* Per-thread RX buffer */
+  u8 **rx_buf;
   unix_shared_memory_queue_t **vpp_queue;
   u64 byte_index;
 
@@ -117,13 +118,15 @@ void
 test_bytes (builtin_server_main_t * bsm, int actual_transfer)
 {
   int i;
+  u32 my_thread_id = vlib_get_thread_index ();
 
   for (i = 0; i < actual_transfer; i++)
     {
-      if (bsm->rx_buf[i] != ((bsm->byte_index + i) & 0xff))
+      if (bsm->rx_buf[my_thread_id][i] != ((bsm->byte_index + i) & 0xff))
 	{
 	  clib_warning ("at %lld expected %d got %d", bsm->byte_index + i,
-			(bsm->byte_index + i) & 0xff, bsm->rx_buf[i]);
+			(bsm->byte_index + i) & 0xff,
+			bsm->rx_buf[my_thread_id][i]);
 	}
     }
   bsm->byte_index += actual_transfer;
@@ -138,6 +141,7 @@ builtin_server_rx_callback (stream_session_t * s)
   builtin_server_main_t *bsm = &builtin_server_main;
   session_fifo_event_t evt;
   static int serial_number = 0;
+  u32 my_thread_id = vlib_get_thread_index ();
 
   tx_fifo = s->server_tx_fifo;
   rx_fifo = s->server_rx_fifo;
@@ -171,11 +175,12 @@ builtin_server_rx_callback (stream_session_t * s)
       return 0;
     }
 
-  vec_validate (bsm->rx_buf, max_transfer - 1);
-  _vec_len (bsm->rx_buf) = max_transfer;
+  vec_validate (bsm->rx_buf, my_thread_id);
+  vec_validate (bsm->rx_buf[my_thread_id], max_transfer - 1);
+  _vec_len (bsm->rx_buf[my_thread_id]) = max_transfer;
 
   actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, max_transfer,
-					     bsm->rx_buf);
+					     bsm->rx_buf[my_thread_id]);
   ASSERT (actual_transfer == max_transfer);
 
 //  test_bytes (bsm, actual_transfer);
@@ -184,7 +189,8 @@ builtin_server_rx_callback (stream_session_t * s)
    * Echo back
    */
 
-  n_written = svm_fifo_enqueue_nowait (tx_fifo, actual_transfer, bsm->rx_buf);
+  n_written = svm_fifo_enqueue_nowait (tx_fifo, actual_transfer,
+				       bsm->rx_buf[my_thread_id]);
 
   if (n_written != max_transfer)
     clib_warning ("short trout!");
diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c
index 9b7b2f65..e0b67a8e 100644
--- a/src/vnet/tcp/tcp.c
+++ b/src/vnet/tcp/tcp.c
@@ -195,8 +195,8 @@ tcp_connection_close (tcp_connection_t * tc)
   TCP_EVT_DBG (TCP_EVT_CLOSE, tc);
 
   /* Send FIN if needed */
-  if (tc->state == TCP_STATE_ESTABLISHED || tc->state == TCP_STATE_SYN_RCVD
-      || tc->state == TCP_STATE_CLOSE_WAIT)
+  if (tc->state == TCP_STATE_ESTABLISHED
+      || tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_CLOSE_WAIT)
     tcp_send_fin (tc);
 
   /* Switch state */
@@ -480,7 +480,7 @@ u8 *
 format_tcp_timers (u8 * s, va_list * args)
 {
   tcp_connection_t *tc = va_arg (*args, tcp_connection_t *);
-  int i, last = 0;
+  int i, last = -1;
 
   for (i = 0; i < TCP_N_TIMERS; i++)
     if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID)
@@ -493,7 +493,7 @@ format_tcp_timers (u8 * s, va_list * args)
 	s = format (s, "%s,", tcp_conn_timers[i]);
     }
 
-  if (last > 0)
+  if (last >= 0)
     s = format (s, "%s]", tcp_conn_timers[i]);
   else
     s = format (s, "]");
@@ -526,19 +526,19 @@ format_tcp_vars (u8 * s, va_list * args)
   s = format (s, " snd_wnd %u rcv_wnd %u snd_wl1 %u snd_wl2 %u\n",
 	      tc->snd_wnd, tc->rcv_wnd, tc->snd_wl1 - tc->irs,
 	      tc->snd_wl2 - tc->iss);
-  s = format (s, " flight size %u send space %u rcv_wnd available %d\n",
-	      tcp_flight_size (tc), tcp_snd_space (tc),
-	      tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las));
+  s = format (s, " flight size %u send space %u rcv_wnd_av %d\n",
+	      tcp_flight_size (tc), tcp_available_snd_space (tc),
+	      tcp_rcv_wnd_available (tc));
   s = format (s, " cong %U ", format_tcp_congestion_status, tc);
   s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n",
-	      tc->cwnd, tc->ssthresh, tc->rtx_bytes, tc->bytes_acked);
-  s = format (s, " prev_ssthresh %u snd_congestion %u\n", tc->prev_ssthresh,
-	      tc->snd_congestion - tc->iss);
+	      tc->cwnd, tc->ssthresh, tc->snd_rxt_bytes, tc->bytes_acked);
+  s = format (s, " prev_ssthresh %u snd_congestion %u dupack %u\n",
+	      tc->prev_ssthresh, tc->snd_congestion - tc->iss,
+	      tc->rcv_dupacks);
   s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %u ", tc->rto,
 	      tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts);
   s = format (s, "rtt_seq %u\n", tc->rtt_seq);
-  if (scoreboard_first_hole (&tc->sack_sb))
-    s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb);
+  s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb);
   if (vec_len (tc->snd_sacks))
     s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc);
 
@@ -595,9 +595,10 @@ format_tcp_session (u8 * s, va_list * args)
 
   tc = tcp_connection_get (tci, thread_index);
   if (tc)
-    return format (s, "%U", format_tcp_connection, tc, verbose);
+    s = format (s, "%U", format_tcp_connection, tc, verbose);
   else
-    return format (s, "empty");
+    s = format (s, "empty");
+  return s;
 }
 
 u8 *
@@ -643,13 +644,17 @@ format_tcp_scoreboard (u8 * s, va_list * args)
 {
   sack_scoreboard_t *sb = va_arg (*args, sack_scoreboard_t *);
   sack_scoreboard_hole_t *hole;
-  s = format (s, "head %u tail %u snd_una_adv %u\n", sb->head, sb->tail,
-	      sb->snd_una_adv);
-  s = format (s, "sacked_bytes %u last_sacked_bytes %u", sb->sacked_bytes,
-	      sb->last_sacked_bytes);
-  s = format (s, " max_byte_sacked %u\n", sb->max_byte_sacked);
-  s = format (s, "holes:\n");
+  s = format (s, "sacked_bytes %u last_sacked_bytes %u lost_bytes %u\n",
+	      sb->sacked_bytes, sb->last_sacked_bytes, sb->lost_bytes);
+  s = format (s, " last_bytes_delivered %u high_sacked %u snd_una_adv %u\n",
+	      sb->last_bytes_delivered, sb->high_sacked, sb->snd_una_adv);
+  s = format (s, " cur_rxt_hole %u high_rxt %u rescue_rxt %u",
+	      sb->cur_rxt_hole, sb->high_rxt, sb->rescue_rxt);
+
   hole = scoreboard_first_hole (sb);
+  if (hole)
+    s = format (s, "\n head %u tail %u holes:\n", sb->head, sb->tail);
+
   while (hole)
     {
       s = format (s, "%U", format_tcp_sack_hole, hole);
@@ -736,7 +741,7 @@ tcp_snd_space (tcp_connection_t * tc)
   if (tcp_in_recovery (tc))
     {
       tc->snd_nxt = tc->snd_una_max;
-      snd_space = tcp_available_wnd (tc) - tc->rtx_bytes
+      snd_space = tcp_available_wnd (tc) - tc->snd_rxt_bytes
 	- (tc->snd_una_max - tc->snd_congestion);
       if (snd_space <= 0 || (tc->snd_una_max - tc->snd_una) >= tc->snd_wnd)
 	return 0;
@@ -744,8 +749,8 @@ tcp_snd_space (tcp_connection_t * tc)
     }
 
   /* If in fast recovery, send 1 SMSS if wnd allows */
-  if (tcp_in_fastrecovery (tc) && tcp_available_snd_space (tc)
-      && tcp_fastrecovery_sent_1_smss (tc))
+  if (tcp_in_fastrecovery (tc)
+      && tcp_available_snd_space (tc) && !tcp_fastrecovery_sent_1_smss (tc))
     {
       tcp_fastrecovery_1_smss_on (tc);
       return tc->snd_mss;
@@ -761,6 +766,12 @@ tcp_session_send_space (transport_connection_t * trans_conn)
   return tcp_snd_space (tc);
 }
 
+i32
+tcp_rcv_wnd_available (tcp_connection_t * tc)
+{
+  return (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las);
+}
+
 u32
 tcp_session_tx_fifo_offset (transport_connection_t * trans_conn)
 {
diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h
index c3ebe22b..071f1ab1 100644
--- a/src/vnet/tcp/tcp.h
+++ b/src/vnet/tcp/tcp.h
@@ -34,6 +34,7 @@
 #define TCP_MAX_RX_FIFO_SIZE 	2 << 20
 #define TCP_IW_N_SEGMENTS 	10
 #define TCP_ALWAYS_ACK		0	/**< If on, we always ack */
+#define TCP_USE_SACKS		1	/**< Disable only for testing */
 
 /** TCP FSM state definitions as per RFC793. */
 #define foreach_tcp_fsm_state   \
@@ -94,7 +95,7 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler;
 #define TCP_DELACK_TIME         1	/* 0.1s */
 #define TCP_ESTABLISH_TIME      750	/* 75s */
 #define TCP_2MSL_TIME           300	/* 30s */
-#define TCP_CLOSEWAIT_TIME	1	/* 0.1s */
+#define TCP_CLOSEWAIT_TIME	20	/* 2s */
 #define TCP_CLEANUP_TIME	5	/* 0.5s Time to wait before cleanup */
 #define TCP_TIMER_PERSIST_MIN	2	/* 0.2s */
 
@@ -157,6 +158,7 @@ typedef struct _sack_scoreboard_hole
   u32 prev;		/**< Index for previous entry in linked list */
   u32 start;		/**< Start sequence number */
   u32 end;		/**< End sequence number */
+  u8 is_lost;		/**< Mark hole as lost */
 } sack_scoreboard_hole_t;
 
 typedef struct _sack_scoreboard
@@ -166,8 +168,13 @@ typedef struct _sack_scoreboard
   u32 tail;				/**< Index of last entry */
   u32 sacked_bytes;			/**< Number of bytes sacked in sb */
   u32 last_sacked_bytes;		/**< Number of bytes last sacked */
+  u32 last_bytes_delivered;		/**< Number of sack bytes delivered */
   u32 snd_una_adv;			/**< Bytes to add to snd_una */
-  u32 max_byte_sacked;			/**< Highest byte acked */
+  u32 high_sacked;			/**< Highest byte sacked (fack) */
+  u32 high_rxt;				/**< Highest retransmitted sequence */
+  u32 rescue_rxt;			/**< Rescue sequence number */
+  u32 lost_bytes;			/**< Bytes lost as per RFC6675 */
+  u32 cur_rxt_hole;			/**< Retransmitting from this hole */
 } sack_scoreboard_t;
 
 typedef enum _tcp_cc_algorithm_type
@@ -211,7 +218,7 @@ typedef struct _tcp_connection
   u32 irs;		/**< initial remote sequence */
 
   /* Options */
-  tcp_options_t opt;		/**< TCP connection options parsed */
+  tcp_options_t rcv_opts;	/**< Rx options for connection */
   tcp_options_t snd_opts;	/**< Tx options for connection */
   u8 snd_opts_len;		/**< Tx options len */
   u8 rcv_wscale;	/**< Window scale to advertise to peer */
@@ -229,8 +236,10 @@ typedef struct _tcp_connection
   u32 cwnd;		/**< Congestion window */
   u32 ssthresh;		/**< Slow-start threshold */
   u32 prev_ssthresh;	/**< ssthresh before congestion */
+  u32 prev_cwnd;	/**< cwnd before congestion */
   u32 bytes_acked;	/**< Bytes acknowledged by current segment */
-  u32 rtx_bytes;	/**< Retransmitted bytes */
+  u32 snd_rxt_bytes;	/**< Retransmitted bytes */
+  u32 snd_rxt_ts;	/**< Timestamp when first packet is retransmitted */
   u32 tsecr_last_ack;	/**< Timestamp echoed to us in last healthy ACK */
   u32 snd_congestion;	/**< snd_una_max when congestion is detected */
   tcp_cc_algorithm_t *cc_algo;	/**< Congestion control algorithm */
@@ -411,6 +420,7 @@ void tcp_send_syn (tcp_connection_t * tc);
 void tcp_send_fin (tcp_connection_t * tc);
 void tcp_init_mss (tcp_connection_t * tc);
 void tcp_update_snd_mss (tcp_connection_t * tc);
+void tcp_update_rto (tcp_connection_t * tc);
 
 always_inline u32
 tcp_end_seq (tcp_header_t * th, u32 len)
@@ -428,17 +438,39 @@ tcp_end_seq (tcp_header_t * th, u32 len)
 #define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0)
 #define timestamp_leq(_t1, _t2) ((i32)((_t1)-(_t2)) <= 0)
 
+/**
+ * Our estimate of the number of bytes that have left the network
+ */
+always_inline u32
+tcp_bytes_out (const tcp_connection_t * tc)
+{
+  if (tcp_opts_sack_permitted (&tc->rcv_opts))
+    return tc->sack_sb.sacked_bytes + tc->sack_sb.lost_bytes;
+  else
+    return tc->rcv_dupacks * tc->snd_mss;
+}
+
+/**
+ * Our estimate of the number of bytes in flight (pipe size)
+ */
 always_inline u32
 tcp_flight_size (const tcp_connection_t * tc)
 {
   int flight_size;
 
-  flight_size = (int) ((tc->snd_una_max - tc->snd_una) + tc->rtx_bytes)
-    - (tc->rcv_dupacks * tc->snd_mss) /* - tc->sack_sb.sacked_bytes */ ;
+  flight_size = (int) (tc->snd_una_max - tc->snd_una) - tcp_bytes_out (tc)
+    + tc->snd_rxt_bytes;
 
-  /* Happens if we don't clear sacked bytes */
   if (flight_size < 0)
-    return 0;
+    {
+      if (0)
+	clib_warning
+	  ("Negative: %u %u %u dupacks %u sacked bytes %u flags %d",
+	   tc->snd_una_max - tc->snd_una, tcp_bytes_out (tc),
+	   tc->snd_rxt_bytes, tc->rcv_dupacks, tc->sack_sb.sacked_bytes,
+	   tc->rcv_opts.flags);
+      return 0;
+    }
 
   return flight_size;
 }
@@ -481,14 +513,17 @@ tcp_available_snd_space (const tcp_connection_t * tc)
   return available_wnd - flight_size;
 }
 
-u32 tcp_rcv_wnd_available (tcp_connection_t * tc);
+i32 tcp_rcv_wnd_available (tcp_connection_t * tc);
 u32 tcp_snd_space (tcp_connection_t * tc);
 void tcp_update_rcv_wnd (tcp_connection_t * tc);
 
 void tcp_retransmit_first_unacked (tcp_connection_t * tc);
+void tcp_fast_retransmit_no_sack (tcp_connection_t * tc);
+void tcp_fast_retransmit_sack (tcp_connection_t * tc);
 void tcp_fast_retransmit (tcp_connection_t * tc);
-void tcp_cc_congestion (tcp_connection_t * tc);
-void tcp_cc_recover (tcp_connection_t * tc);
+void tcp_cc_init_congestion (tcp_connection_t * tc);
+int tcp_cc_recover (tcp_connection_t * tc);
+void tcp_cc_fastrecovery_exit (tcp_connection_t * tc);
 
 /* Made public for unit testing only */
 void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end);
@@ -563,16 +598,16 @@ tcp_retransmit_timer_set (tcp_connection_t * tc)
 }
 
 always_inline void
-tcp_retransmit_timer_update (tcp_connection_t * tc)
+tcp_retransmit_timer_reset (tcp_connection_t * tc)
 {
-  tcp_timer_update (tc, TCP_TIMER_RETRANSMIT,
-		    clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
+  tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT);
 }
 
 always_inline void
-tcp_retransmit_timer_reset (tcp_connection_t * tc)
+tcp_retransmit_timer_force_update (tcp_connection_t * tc)
 {
-  tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT);
+  tcp_timer_update (tc, TCP_TIMER_RETRANSMIT,
+		    clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
 }
 
 always_inline void
@@ -598,15 +633,43 @@ tcp_persist_timer_reset (tcp_connection_t * tc)
   tcp_timer_reset (tc, TCP_TIMER_PERSIST);
 }
 
+always_inline void
+tcp_retransmit_timer_update (tcp_connection_t * tc)
+{
+  if (tc->snd_una == tc->snd_una_max)
+    {
+      tcp_retransmit_timer_reset (tc);
+      if (tc->snd_wnd < tc->snd_mss)
+	tcp_persist_timer_set (tc);
+    }
+  else
+    tcp_timer_update (tc, TCP_TIMER_RETRANSMIT,
+		      clib_max (tc->rto * TCP_TO_TIMER_TICK, 1));
+}
+
 always_inline u8
 tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer)
 {
   return tc->timers[timer] != TCP_TIMER_HANDLE_INVALID;
 }
 
+#define tcp_validate_txf_size(_tc, _a) 					\
+  ASSERT((_tc)->state != TCP_STATE_ESTABLISHED 				\
+	 || stream_session_tx_fifo_max_dequeue (&(_tc)->connection) >= (_a))
+
 void
 scoreboard_remove_hole (sack_scoreboard_t * sb,
 			sack_scoreboard_hole_t * hole);
+void scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb);
+sack_scoreboard_hole_t *scoreboard_insert_hole (sack_scoreboard_t * sb,
+						u32 prev_index, u32 start,
+						u32 end);
+sack_scoreboard_hole_t *scoreboard_next_rxt_hole (sack_scoreboard_t * sb,
+						  sack_scoreboard_hole_t *
+						  start, u8 have_sent_1_smss,
+						  u8 * can_rescue,
+						  u8 * snd_limited);
+void scoreboard_init_high_rxt (sack_scoreboard_t * sb);
 
 always_inline sack_scoreboard_hole_t *
 scoreboard_get_hole (sack_scoreboard_t * sb, u32 index)
@@ -624,6 +687,14 @@ scoreboard_next_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
   return 0;
 }
 
+always_inline sack_scoreboard_hole_t *
+scoreboard_prev_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
+{
+  if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX)
+    return pool_elt_at_index (sb->holes, hole->prev);
+  return 0;
+}
+
 always_inline sack_scoreboard_hole_t *
 scoreboard_first_hole (sack_scoreboard_t * sb)
 {
@@ -643,15 +714,19 @@ scoreboard_last_hole (sack_scoreboard_t * sb)
 always_inline void
 scoreboard_clear (sack_scoreboard_t * sb)
 {
-  sack_scoreboard_hole_t *hole = scoreboard_first_hole (sb);
+  sack_scoreboard_hole_t *hole;
   while ((hole = scoreboard_first_hole (sb)))
     {
       scoreboard_remove_hole (sb, hole);
     }
   sb->sacked_bytes = 0;
   sb->last_sacked_bytes = 0;
+  sb->last_bytes_delivered = 0;
   sb->snd_una_adv = 0;
-  sb->max_byte_sacked = 0;
+  sb->high_sacked = 0;
+  sb->high_rxt = 0;
+  sb->lost_bytes = 0;
+  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
 }
 
 always_inline u32
@@ -671,6 +746,7 @@ scoreboard_init (sack_scoreboard_t * sb)
 {
   sb->head = TCP_INVALID_SACK_HOLE_INDEX;
   sb->tail = TCP_INVALID_SACK_HOLE_INDEX;
+  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
 }
 
 void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack);
diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h
index b4497a3b..3a16cf63 100755
--- a/src/vnet/tcp/tcp_debug.h
+++ b/src/vnet/tcp/tcp_debug.h
@@ -393,7 +393,7 @@ typedef enum _tcp_dbg_evt
   DECLARE_ETD(_tc, _e, 4);						\
   ed->data[0] = _seq - _tc->irs;					\
   ed->data[1] = _end - _tc->irs;					\
-  ed->data[2] = _tc->opt.tsval;						\
+  ed->data[2] = _tc->rcv_opts.tsval;					\
   ed->data[3] = _tc->tsval_recent;					\
 }
 
@@ -427,27 +427,27 @@ typedef enum _tcp_dbg_evt
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
-    .format = "rtx: snd_nxt %u offset %u snd %u rtx %u",		\
+    .format = "rxt: snd_nxt %u offset %u snd %u rxt %u",		\
     .format_args = "i4i4i4i4",						\
   };									\
   DECLARE_ETD(_tc, _e, 4);						\
   ed->data[0] = _tc->snd_nxt - _tc->iss;				\
   ed->data[1] = offset;							\
   ed->data[2] = n_bytes;						\
-  ed->data[3] = _tc->rtx_bytes;						\
+  ed->data[3] = _tc->snd_rxt_bytes;					\
 }
 
 #define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...)			\
 {									\
   ELOG_TYPE_DECLARE (_e) =						\
   {									\
-    .format = "cc: %s wnd %u snd_cong %u rtx_bytes %u",			\
+    .format = "cc: %s wnd %u snd_cong %u rxt_bytes %u",			\
     .format_args = "t4i4i4i4",						\
     .n_enum_strings = 5,						\
     .enum_strings = {                                           	\
-      "fast-rtx",	                                             	\
-      "rtx-timeout",                                                 	\
-      "first-rtx",                                                 	\
+      "fast-rxt",	                                             	\
+      "rxt-timeout",                                                 	\
+      "first-rxt",                                                 	\
       "recovered",							\
       "congestion",							\
     },  								\
@@ -456,7 +456,7 @@ typedef enum _tcp_dbg_evt
   ed->data[0] = _sub_evt;						\
   ed->data[1] = tcp_available_snd_space (_tc);				\
   ed->data[2] = _tc->snd_congestion - _tc->iss;				\
-  ed->data[3] = _tc->rtx_bytes;						\
+  ed->data[3] = _tc->snd_rxt_bytes;					\
 }
 
 #define TCP_EVT_CC_PACK_HANDLER(_tc, ...)				\
diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c
index 35bc9094..ff2229b3 100644
--- a/src/vnet/tcp/tcp_input.c
+++ b/src/vnet/tcp/tcp_input.c
@@ -231,8 +231,8 @@ tcp_options_parse (tcp_header_t * th, tcp_options_t * to)
 always_inline int
 tcp_segment_check_paws (tcp_connection_t * tc)
 {
-  return tcp_opts_tstamp (&tc->opt) && tc->tsval_recent
-    && timestamp_lt (tc->opt.tsval, tc->tsval_recent);
+  return tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent
+    && timestamp_lt (tc->rcv_opts.tsval, tc->tsval_recent);
 }
 
 /**
@@ -248,10 +248,10 @@ tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end)
    * then the TSval from the segment is copied to TS.Recent;
    * otherwise, the TSval is ignored.
    */
-  if (tcp_opts_tstamp (&tc->opt) && tc->tsval_recent
+  if (tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent
       && seq_leq (seq, tc->rcv_las) && seq_leq (tc->rcv_las, seq_end))
     {
-      tc->tsval_recent = tc->opt.tsval;
+      tc->tsval_recent = tc->rcv_opts.tsval;
       tc->tsval_recent_age = tcp_time_now ();
     }
 }
@@ -272,14 +272,21 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
   if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0)))
     return -1;
 
-  if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->opt)))
+  if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->rcv_opts)))
     {
       return -1;
     }
 
   if (tcp_segment_check_paws (tc0))
     {
-      clib_warning ("paws failed");
+      if (CLIB_DEBUG > 2)
+	{
+	  clib_warning ("paws failed\n%U", format_tcp_connection, tc0, 2);
+	  clib_warning ("seq %u seq_end %u ack %u",
+			vnet_buffer (b0)->tcp.seq_number - tc0->irs,
+			vnet_buffer (b0)->tcp.seq_end - tc0->irs,
+			vnet_buffer (b0)->tcp.ack_number - tc0->iss);
+	}
       TCP_EVT_DBG (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number,
 		   vnet_buffer (b0)->tcp.seq_end);
 
@@ -348,7 +355,6 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0,
   /* If segment in window, save timestamp */
   tcp_update_timestamp (tc0, vnet_buffer (b0)->tcp.seq_number,
 			vnet_buffer (b0)->tcp.seq_end);
-
   return 0;
 }
 
@@ -391,6 +397,12 @@ tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt)
     }
 }
 
+void
+tcp_update_rto (tcp_connection_t * tc)
+{
+  tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
+}
+
 /** Update RTT estimate and RTO timer
  *
  * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK
@@ -405,7 +417,7 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack)
   u32 mrtt = 0;
   u8 rtx_acked;
 
-  /* Determine if only rtx bytes are acked. TODO fast retransmit */
+  /* Determine if only rtx bytes are acked. TODO XXX fast retransmit */
   rtx_acked = tc->rto_boff && (tc->bytes_acked <= tc->snd_mss);
 
   /* Karn's rule, part 1. Don't use retransmitted segments to estimate
@@ -418,9 +430,10 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack)
    * snd_una, i.e., the left side of the send window:
    * seq_lt (tc->snd_una, ack). Note: last condition could be dropped, we don't
    * try to update rtt for dupacks */
-  else if (tcp_opts_tstamp (&tc->opt) && tc->opt.tsecr && tc->bytes_acked)
+  else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr
+	   && tc->bytes_acked)
     {
-      mrtt = tcp_time_now () - tc->opt.tsecr;
+      mrtt = tcp_time_now () - tc->rcv_opts.tsecr;
     }
 
   /* Allow measuring of a new RTT */
@@ -436,7 +449,7 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack)
     return 0;
 
   tcp_estimate_rtt (tc, mrtt);
-  tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
+  tcp_update_rto (tc);
 
   return 0;
 }
@@ -447,25 +460,46 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack)
 static void
 tcp_dequeue_acked (tcp_connection_t * tc, u32 ack)
 {
-  /* Dequeue the newly ACKed bytes */
-  stream_session_dequeue_drop (&tc->connection, tc->bytes_acked);
+  /* Dequeue the newly ACKed and SACKed bytes */
+  stream_session_dequeue_drop (&tc->connection,
+			       tc->bytes_acked + tc->sack_sb.snd_una_adv);
+
+  tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
 
   /* Update rtt and rto */
   tcp_update_rtt (tc, ack);
+
+  /* If everything has been acked, stop retransmit timer
+   * otherwise update. */
+  tcp_retransmit_timer_update (tc);
 }
 
 /**
- * Check if dupack as per RFC5681 Sec. 2
- *
- * This works only if called before updating snd_wnd.
- * */
-always_inline u8
-tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 new_snd_wnd)
+ * Check if duplicate ack as per RFC5681 Sec. 2
+ */
+static u8
+tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd,
+		   u32 prev_snd_una)
 {
-  return ((vnet_buffer (b)->tcp.ack_number == tc->snd_una)
+  return ((vnet_buffer (b)->tcp.ack_number == prev_snd_una)
 	  && seq_gt (tc->snd_una_max, tc->snd_una)
 	  && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number)
-	  && (new_snd_wnd == tc->snd_wnd));
+	  && (prev_snd_wnd == tc->snd_wnd));
+}
+
+/**
+ * Checks if ack is a congestion control event.
+ */
+static u8
+tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b,
+		     u32 prev_snd_wnd, u32 prev_snd_una, u8 * is_dack)
+{
+  /* Check if ack is duplicate. Per RFC 6675, ACKs that SACK new data are
+   * defined to be 'duplicate' */
+  *is_dack = tc->sack_sb.last_sacked_bytes
+    || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una);
+
+  return (*is_dack || tcp_in_cong_recovery (tc));
 }
 
 void
@@ -478,6 +512,10 @@ scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
       next = pool_elt_at_index (sb->holes, hole->next);
       next->prev = hole->prev;
     }
+  else
+    {
+      sb->tail = hole->prev;
+    }
 
   if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX)
     {
@@ -489,6 +527,9 @@ scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole)
       sb->head = hole->next;
     }
 
+  if (scoreboard_hole_index (sb, hole) == sb->cur_rxt_hole)
+    sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+
   pool_put (sb->holes, hole);
 }
 
@@ -527,26 +568,131 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index,
   return hole;
 }
 
+void
+scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb)
+{
+  sack_scoreboard_hole_t *hole, *prev;
+  u32 bytes = 0, blks = 0;
+  /* Recompute sb->lost_bytes and flag lost holes, scanning from the tail */
+  sb->lost_bytes = 0;
+  hole = scoreboard_last_hole (sb);
+  if (!hole)
+    return;
+  /* Bytes between the last hole's end and high_sacked count as one block */
+  if (seq_gt (sb->high_sacked, hole->end))
+    {
+      bytes = sb->high_sacked - hole->end;
+      blks = 1;
+    }
+  /* Step back hole by hole, adding each gap, while under both thresholds */
+  while ((prev = scoreboard_prev_hole (sb, hole))
+	 && (bytes < (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss
+	     && blks < TCP_DUPACK_THRESHOLD))
+    {
+      bytes += hole->start - prev->end;
+      blks++;
+      hole = prev;
+    }
+  /* Holes from prev backwards are flagged lost and added to lost_bytes */
+  hole = prev;
+  while (hole)
+    {
+      sb->lost_bytes += scoreboard_hole_bytes (hole);
+      hole->is_lost = 1;
+      hole = scoreboard_prev_hole (sb, hole);
+    }
+}
+
+/**
+ * Figure out the next hole to retransmit, following the logic proposed in
+ * RFC6675 Sec. 4, NextSeg(). The search begins at start, or at the first
+ * hole if start is 0. Returns the chosen hole, or 0 when there is none.
+ */
+sack_scoreboard_hole_t *
+scoreboard_next_rxt_hole (sack_scoreboard_t * sb,
+			  sack_scoreboard_hole_t * start,
+			  u8 have_sent_1_smss,
+			  u8 * can_rescue, u8 * snd_limited)
+{
+  sack_scoreboard_hole_t *hole = 0;
+  /* Skip lost holes that end at or before high_rxt */
+  hole = start ? start : scoreboard_first_hole (sb);
+  while (hole && seq_leq (hole->end, sb->high_rxt) && hole->is_lost)
+    hole = scoreboard_next_hole (sb, hole);
+
+  /* Nothing, return */
+  if (!hole)
+    {
+      sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+      return 0;
+    }
+
+  /* Rule (1): if higher than rxt, less than high_sacked and lost */
+  if (hole->is_lost && seq_lt (hole->start, sb->high_sacked))
+    {
+      sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
+    }
+  else
+    {
+      /* Rule (2): output takes care of transmitting new data */
+      if (!have_sent_1_smss)
+	{
+	  hole = 0;
+	  sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX;
+	}
+      /* Rule (3): if hole not lost */
+      else if (seq_lt (hole->start, sb->high_sacked))
+	{
+	  *snd_limited = 1;
+	  sb->cur_rxt_hole = scoreboard_hole_index (sb, hole);
+	}
+      /* Rule (4): if hole beyond high_sacked */
+      else
+	{
+	  ASSERT (seq_geq (hole->start, sb->high_sacked));
+	  *snd_limited = 1;
+	  *can_rescue = 1;
+	  /* HighRxt MUST NOT be updated */
+	  return 0;
+	}
+    }
+  /* Move high_rxt up to the start of the selected hole if it lies beyond */
+  if (hole && seq_lt (sb->high_rxt, hole->start))
+    sb->high_rxt = hole->start;
+
+  return hole;
+}
+
+void
+scoreboard_init_high_rxt (sack_scoreboard_t * sb)
+{
+  sack_scoreboard_hole_t *hole;	/* NOTE(review): used without a NULL check */
+  hole = scoreboard_first_hole (sb);
+  sb->high_rxt = hole->start;	/* rxt tracking starts at the first hole */
+  sb->cur_rxt_hole = sb->head;	/* head is the index of the first hole */
+}
+
 void
 tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
 {
   sack_scoreboard_t *sb = &tc->sack_sb;
   sack_block_t *blk, tmp;
   sack_scoreboard_hole_t *hole, *next_hole, *last_hole, *new_hole;
-  u32 blk_index = 0, old_sacked_bytes, delivered_bytes, hole_index;
+  u32 blk_index = 0, old_sacked_bytes, hole_index;
   int i, j;
 
   sb->last_sacked_bytes = 0;
   sb->snd_una_adv = 0;
   old_sacked_bytes = sb->sacked_bytes;
-  delivered_bytes = 0;
+  sb->last_bytes_delivered = 0;
 
-  if (!tcp_opts_sack (&tc->opt) && sb->head == TCP_INVALID_SACK_HOLE_INDEX)
+  if (!tcp_opts_sack (&tc->rcv_opts)
+      && sb->head == TCP_INVALID_SACK_HOLE_INDEX)
     return;
 
   /* Remove invalid blocks */
-  blk = tc->opt.sacks;
-  while (blk < vec_end (tc->opt.sacks))
+  blk = tc->rcv_opts.sacks;
+  while (blk < vec_end (tc->rcv_opts.sacks))
     {
       if (seq_lt (blk->start, blk->end)
 	  && seq_gt (blk->start, tc->snd_una)
@@ -555,7 +701,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
 	  blk++;
 	  continue;
 	}
-      vec_del1 (tc->opt.sacks, blk - tc->opt.sacks);
+      vec_del1 (tc->rcv_opts.sacks, blk - tc->rcv_opts.sacks);
     }
 
   /* Add block for cumulative ack */
@@ -563,20 +709,20 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
     {
       tmp.start = tc->snd_una;
       tmp.end = ack;
-      vec_add1 (tc->opt.sacks, tmp);
+      vec_add1 (tc->rcv_opts.sacks, tmp);
     }
 
-  if (vec_len (tc->opt.sacks) == 0)
+  if (vec_len (tc->rcv_opts.sacks) == 0)
     return;
 
   /* Make sure blocks are ordered */
-  for (i = 0; i < vec_len (tc->opt.sacks); i++)
-    for (j = i + 1; j < vec_len (tc->opt.sacks); j++)
-      if (seq_lt (tc->opt.sacks[j].start, tc->opt.sacks[i].start))
+  for (i = 0; i < vec_len (tc->rcv_opts.sacks); i++)
+    for (j = i + 1; j < vec_len (tc->rcv_opts.sacks); j++)
+      if (seq_lt (tc->rcv_opts.sacks[j].start, tc->rcv_opts.sacks[i].start))
 	{
-	  tmp = tc->opt.sacks[i];
-	  tc->opt.sacks[i] = tc->opt.sacks[j];
-	  tc->opt.sacks[j] = tmp;
+	  tmp = tc->rcv_opts.sacks[i];
+	  tc->rcv_opts.sacks[i] = tc->rcv_opts.sacks[j];
+	  tc->rcv_opts.sacks[j] = tmp;
 	}
 
   if (sb->head == TCP_INVALID_SACK_HOLE_INDEX)
@@ -585,25 +731,25 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
       last_hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX,
 					  tc->snd_una, tc->snd_una_max);
       sb->tail = scoreboard_hole_index (sb, last_hole);
-      tmp = tc->opt.sacks[vec_len (tc->opt.sacks) - 1];
-      sb->max_byte_sacked = tmp.end;
+      tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1];
+      sb->high_sacked = tmp.end;
     }
   else
     {
       /* If we have holes but snd_una_max is beyond the last hole, update
        * last hole end */
-      tmp = tc->opt.sacks[vec_len (tc->opt.sacks) - 1];
+      tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1];
       last_hole = scoreboard_last_hole (sb);
-      if (seq_gt (tc->snd_una_max, sb->max_byte_sacked)
+      if (seq_gt (tc->snd_una_max, sb->high_sacked)
 	  && seq_gt (tc->snd_una_max, last_hole->end))
 	last_hole->end = tc->snd_una_max;
     }
 
   /* Walk the holes with the SACK blocks */
   hole = pool_elt_at_index (sb->holes, sb->head);
-  while (hole && blk_index < vec_len (tc->opt.sacks))
+  while (hole && blk_index < vec_len (tc->rcv_opts.sacks))
     {
-      blk = &tc->opt.sacks[blk_index];
+      blk = &tc->rcv_opts.sacks[blk_index];
 
       if (seq_leq (blk->start, hole->start))
 	{
@@ -617,9 +763,9 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
 		{
 		  /* Bytes lost because snd_wnd left edge advances */
 		  if (next_hole && seq_leq (next_hole->start, ack))
-		    delivered_bytes += next_hole->start - hole->end;
+		    sb->last_bytes_delivered += next_hole->start - hole->end;
 		  else
-		    delivered_bytes += ack - hole->end;
+		    sb->last_bytes_delivered += ack - hole->end;
 		}
 	      else
 		{
@@ -633,8 +779,8 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
 		  last_hole = scoreboard_last_hole (sb);
 		  /* keep track of max byte sacked for when the last hole
 		   * is acked */
-		  if (seq_gt (hole->end, sb->max_byte_sacked))
-		    sb->max_byte_sacked = hole->end;
+		  if (seq_gt (hole->end, sb->high_sacked))
+		    sb->high_sacked = hole->end;
 		}
 
 	      /* snd_una needs to be advanced */
@@ -645,12 +791,12 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
 		      sb->snd_una_adv = next_hole->start - ack;
 
 		      /* all these can be delivered */
-		      delivered_bytes += sb->snd_una_adv;
+		      sb->last_bytes_delivered += sb->snd_una_adv;
 		    }
 		  else if (!next_hole)
 		    {
-		      sb->snd_una_adv = sb->max_byte_sacked - ack;
-		      delivered_bytes += sb->snd_una_adv;
+		      sb->snd_una_adv = sb->high_sacked - ack;
+		      sb->last_bytes_delivered += sb->snd_una_adv;
 		    }
 		}
 
@@ -691,28 +837,33 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack)
 		}
 
 	      blk_index++;
-	      hole = scoreboard_next_hole (sb, hole);
 	    }
-	  else
+	  else if (seq_leq (blk->start, hole->end))
 	    {
 	      sb->sacked_bytes += hole->end - blk->start;
 	      hole->end = blk->start;
-	      hole = scoreboard_next_hole (sb, hole);
 	    }
+
+	  hole = scoreboard_next_hole (sb, hole);
 	}
     }
 
   sb->last_sacked_bytes = sb->sacked_bytes - old_sacked_bytes;
-  sb->sacked_bytes -= delivered_bytes;
+  sb->sacked_bytes -= sb->last_bytes_delivered;
+  scoreboard_update_lost (tc, sb);
 }
 
-/** Update snd_wnd
+/**
+ * Try to update snd_wnd based on feedback received from peer.
  *
- * If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set
- * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */
+ * If successful, and new window is 'effectively' 0, activate persist
+ * timer.
+ */
 static void
 tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd)
 {
+  /* If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set
+   * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */
   if (seq_lt (tc->snd_wl1, seq)
       || (tc->snd_wl1 == seq && seq_leq (tc->snd_wl2, ack)))
     {
@@ -721,138 +872,269 @@ tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd)
       tc->snd_wl2 = ack;
       TCP_EVT_DBG (TCP_EVT_SND_WND, tc);
 
-      /* Set probe timer if we just got 0 wnd */
       if (tc->snd_wnd < tc->snd_mss)
 	{
-	  if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST))
+	  /* Wnd effectively 0: set persist timer unless it or rxt timer is set */
+	  if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST)
+	      && !tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT))
 	    tcp_persist_timer_set (tc);
 	}
       else
-	tcp_persist_timer_reset (tc);
+	{
+	  tcp_persist_timer_reset (tc);
+	  if (!tcp_in_recovery (tc) && tc->rto_boff > 0)
+	    {
+	      tc->rto_boff = 0;
+	      tcp_update_rto (tc);
+	    }
+	}
     }
 }
 
 void
-tcp_cc_congestion (tcp_connection_t * tc)
+tcp_cc_init_congestion (tcp_connection_t * tc)
 {
-  tc->snd_congestion = tc->snd_nxt;
+  tcp_fastrecovery_on (tc);
+  tc->snd_congestion = tc->snd_una_max;
   tc->cc_algo->congestion (tc);
   TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4);
 }
 
-void
-tcp_cc_recover (tcp_connection_t * tc)
+static void
+tcp_cc_recovery_exit (tcp_connection_t * tc)
 {
-  /* TODO: check if time to recover was small. It might be that RTO popped
-   * too soon.
-   */
+  /* Deflate rto */
+  tcp_update_rto (tc);
+  tc->rto_boff = 0;
+  tc->snd_rxt_ts = 0;
+  tcp_recovery_off (tc);
+}
 
+void
+tcp_cc_fastrecovery_exit (tcp_connection_t * tc)
+{
   tc->cc_algo->recovered (tc);
+  tc->snd_rxt_bytes = 0;
+  tc->rcv_dupacks = 0;
+  tcp_fastrecovery_off (tc);
+  tcp_fastrecovery_1_smss_off (tc);
+}
 
-  tc->rtx_bytes = 0;
+static void
+tcp_cc_congestion_undo (tcp_connection_t * tc)
+{
+  tc->cwnd = tc->prev_cwnd;
+  tc->ssthresh = tc->prev_ssthresh;
+  tc->snd_nxt = tc->snd_una_max;
   tc->rcv_dupacks = 0;
-  tc->snd_nxt = tc->snd_una;
+  if (tcp_in_recovery (tc))
+    tcp_cc_recovery_exit (tc);
+  ASSERT (tc->rto_boff == 0);
+  /* TODO extend for fastrecovery */
+}
 
-  tc->cc_algo->rcv_ack (tc);
-  tc->tsecr_last_ack = tc->opt.tsecr;
+static u8
+tcp_cc_is_spurious_retransmit (tcp_connection_t * tc)
+{
+  return (tc->snd_rxt_ts
+	  && tcp_opts_tstamp (&tc->rcv_opts)
+	  && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts));
+}
 
-  tcp_cong_recovery_off (tc);
+int
+tcp_cc_recover (tcp_connection_t * tc)
+{
+  ASSERT (tcp_in_cong_recovery (tc));
+  if (tcp_cc_is_spurious_retransmit (tc))
+    {
+      tcp_cc_congestion_undo (tc);
+      return 1;
+    }
+
+  if (tcp_in_recovery (tc))
+    tcp_cc_recovery_exit (tc);
+  else if (tcp_in_fastrecovery (tc))
+    tcp_cc_fastrecovery_exit (tc);
+
+  ASSERT (tc->rto_boff == 0);
+  ASSERT (!tcp_in_cong_recovery (tc));
 
   TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3);
+  return 0;
 }
 
 static void
-tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b)
+tcp_cc_update (tcp_connection_t * tc, vlib_buffer_t * b)
+{
+  ASSERT (!tcp_in_cong_recovery (tc));
+
+  /* Congestion avoidance */
+  tc->cc_algo->rcv_ack (tc);
+  tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+
+  /* If a cumulative ack, make sure dupacks is 0 */
+  tc->rcv_dupacks = 0;
+
+  /* When dupacks hits the threshold we only enter fast retransmit if
+   * cumulative ack covers more than snd_congestion. Should snd_una
+   * wrap this test may fail under otherwise valid circumstances.
+   * Therefore, proactively update snd_congestion when wrap detected. */
+  if (PREDICT_FALSE
+      (seq_leq (tc->snd_congestion, tc->snd_una - tc->bytes_acked)
+       && seq_gt (tc->snd_congestion, tc->snd_una)))
+    tc->snd_congestion = tc->snd_una - 1;
+}
+
+static u8
+tcp_should_fastrecover_sack (tcp_connection_t * tc)
 {
-  u8 partial_ack;
-  u32 bytes_advanced;
+  return (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss < tc->sack_sb.sacked_bytes;
+}
 
-  if (tcp_in_fastrecovery (tc))
+static u8
+tcp_should_fastrecover (tcp_connection_t * tc)
+{
+  return (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD
+	  || tcp_should_fastrecover_sack (tc));
+}
+
+static void
+tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack)
+{
+  /*
+   * Duplicate ACK. Check if we should enter fast recovery, or if already in
+   * it account for the bytes that left the network.
+   */
+  if (is_dack)
     {
-      partial_ack = seq_lt (tc->snd_una, tc->snd_congestion);
-      if (!partial_ack)
+      ASSERT (tc->snd_una != tc->snd_una_max
+	      || tc->sack_sb.last_sacked_bytes);
+      tc->rcv_dupacks++;
+
+      if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD && !tc->bytes_acked)
 	{
-	  /* Clear retransmitted bytes. */
-	  tcp_cc_recover (tc);
+	  ASSERT (tcp_in_fastrecovery (tc));
+	  /* Pure duplicate ack. If some data got acked, it's handled lower */
+	  tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
+	  return;
 	}
-      else
+      else if (tcp_should_fastrecover (tc))
 	{
-	  TCP_EVT_DBG (TCP_EVT_CC_PACK, tc);
+	  /* Things are already bad */
+	  if (tcp_in_cong_recovery (tc))
+	    {
+	      tc->rcv_dupacks = 0;
+	      goto partial_ack_test;
+	    }
 
-	  /* Clear retransmitted bytes. XXX should we clear all? */
-	  tc->rtx_bytes = 0;
+	  /* If one of the two conditions below holds, reset dupacks
+	   * 1) Cumulative ack does not cover more than congestion threshold
+	   * 2) RFC6582 heuristic to avoid multiple fast retransmits
+	   */
+	  if (seq_leq (tc->snd_una, tc->snd_congestion)
+	      || tc->rcv_opts.tsecr != tc->tsecr_last_ack)
+	    {
+	      tc->rcv_dupacks = 0;
+	      return;
+	    }
+
+	  tcp_cc_init_congestion (tc);
+	  tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
 
-	  tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK);
+	  /* The first segment MUST be retransmitted */
+	  tcp_retransmit_first_unacked (tc);
 
-	  /* In case snd_nxt is still in the past and output tries to
-	   * shove some new bytes */
-	  tc->snd_nxt = tc->snd_una_max;
+	  /* Post retransmit update cwnd to ssthresh and account for the
+	   * rcv_dupacks segments that have left the network and should've
+	   * been buffered at the receiver XXX */
+	  tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss;
 
-	  /* XXX need proper RFC6675 support */
-	  if (tc->sack_sb.last_sacked_bytes && !tcp_in_recovery (tc))
+	  /* If cwnd allows, send more data */
+	  if (tcp_opts_sack_permitted (&tc->rcv_opts)
+	      && scoreboard_first_hole (&tc->sack_sb))
 	    {
-	      tcp_fast_retransmit (tc);
+	      scoreboard_init_high_rxt (&tc->sack_sb);
+	      tcp_fast_retransmit_sack (tc);
 	    }
 	  else
 	    {
-	      /* Retransmit first unacked segment */
-	      tcp_retransmit_first_unacked (tc);
+	      tcp_fast_retransmit_no_sack (tc);
 	    }
+
+	  return;
 	}
-    }
-  else
-    {
-      tc->cc_algo->rcv_ack (tc);
-      tc->tsecr_last_ack = tc->opt.tsecr;
-      tc->rcv_dupacks = 0;
-      if (tcp_in_recovery (tc))
+      else if (!tc->bytes_acked
+	       || (tc->bytes_acked && !tcp_in_cong_recovery (tc)))
 	{
-	  bytes_advanced = tc->bytes_acked + tc->sack_sb.snd_una_adv;
-	  tc->rtx_bytes -= clib_min (bytes_advanced, tc->rtx_bytes);
-	  tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX);
-	  if (seq_geq (tc->snd_una, tc->snd_congestion))
-	    {
-	      tc->rtx_bytes = 0;
-	      tcp_recovery_off (tc);
-	    }
+	  tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
+	  return;
 	}
+      else
+	goto partial_ack;
     }
-}
 
-static void
-tcp_cc_rcv_dupack (tcp_connection_t * tc, u32 ack)
-{
-//  ASSERT (seq_geq(tc->snd_una, ack));
+partial_ack_test:
+
+  if (!tc->bytes_acked)
+    return;
+
+partial_ack:
+  /*
+   * Legitimate ACK. 1) See if we can exit recovery
+   */
+  /* XXX limit this only to first partial ack? */
+  tcp_retransmit_timer_update (tc);
 
-  tc->rcv_dupacks++;
-  if (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
+  if (seq_geq (tc->snd_una, tc->snd_congestion))
     {
-      /* RFC6582 NewReno heuristic to avoid multiple fast retransmits */
-      if (tc->opt.tsecr != tc->tsecr_last_ack)
-	{
-	  tc->rcv_dupacks = 0;
-	  return;
-	}
+      /* If spurious return, we've already updated everything */
+      if (tcp_cc_recover (tc))
+	return;
+
+      tc->snd_nxt = tc->snd_una_max;
 
-      tcp_fastrecovery_on (tc);
+      /* Treat as congestion avoidance ack */
+      tc->cc_algo->rcv_ack (tc);
+      tc->tsecr_last_ack = tc->rcv_opts.tsecr;
+      return;
+    }
+
+  /*
+   * Legitimate ACK. 2) If PARTIAL ACK try to retransmit
+   */
+  TCP_EVT_DBG (TCP_EVT_CC_PACK, tc);
+
+  /* RFC6675: If the incoming ACK is a cumulative acknowledgment,
+   * reset dupacks to 0 */
+  tc->rcv_dupacks = 0;
 
-      /* Handle congestion and dupack */
-      tcp_cc_congestion (tc);
-      tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
+  tcp_retransmit_first_unacked (tc);
 
-      tcp_fast_retransmit (tc);
+  /* Post RTO timeout don't try anything fancy */
+  if (tcp_in_recovery (tc))
+    return;
 
-      /* Post retransmit update cwnd to ssthresh and account for the
-       * three segments that have left the network and should've been
-       * buffered at the receiver */
-      tc->cwnd = tc->ssthresh + TCP_DUPACK_THRESHOLD * tc->snd_mss;
+  /* Remove retransmitted bytes that have been delivered */
+  if (tc->sack_sb.last_bytes_delivered
+      && seq_gt (tc->sack_sb.high_rxt, tc->snd_una))
+    {
+      /* If we have sacks and we haven't gotten an ack beyond high_rxt,
+       * remove sacked bytes delivered */
+      tc->snd_rxt_bytes -= tc->sack_sb.last_bytes_delivered;
     }
-  else if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD)
+  else
     {
-      ASSERT (tcp_in_fastrecovery (tc));
-
-      tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK);
+      /* Either all retransmitted holes have been acked, or we're
+       * "in the blind" and retransmitting segment by segment */
+      tc->snd_rxt_bytes = 0;
     }
+
+  tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK);
+
+  /*
+   * Since this was a partial ack, try to retransmit some more data
+   */
+  tcp_fast_retransmit (tc);
 }
 
 void
@@ -862,14 +1144,18 @@ tcp_cc_init (tcp_connection_t * tc)
   tc->cc_algo->init (tc);
 }
 
+/**
+ * Process incoming ACK
+ */
 static int
 tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
 	     tcp_header_t * th, u32 * next, u32 * error)
 {
-  u32 new_snd_wnd;
+  u32 prev_snd_wnd, prev_snd_una;
+  u8 is_dack;
 
   /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */
-  if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt))
+  if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)))
     {
       /* If we have outstanding data and this is within the window, accept it,
        * probably retransmit has timed out. Otherwise ACK segment and then
@@ -892,7 +1178,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
     }
 
   /* If old ACK, probably it's an old dupack */
-  if (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una))
+  if (PREDICT_FALSE (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)))
     {
       *error = TCP_ERROR_ACK_OLD;
       TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 1,
@@ -900,54 +1186,50 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b,
       if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD)
 	{
 	  TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc);
-	  tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number);
+	  tcp_cc_handle_event (tc, 1);
 	}
       /* Don't drop yet */
       return 0;
     }
 
-  if (tcp_opts_sack_permitted (&tc->opt))
-    tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
-
-  new_snd_wnd = clib_net_to_host_u16 (th->window) << tc->snd_wscale;
-
-  if (tcp_ack_is_dupack (tc, b, new_snd_wnd))
-    {
-      TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1);
-      tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number);
-      *error = TCP_ERROR_ACK_DUP;
-      return -1;
-    }
-
   /*
-   * Valid ACK
+   * Looks okay, process feedback
    */
 
-  tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una;
-  tc->snd_una = vnet_buffer (b)->tcp.ack_number + tc->sack_sb.snd_una_adv;
+  TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
+
+  if (tcp_opts_sack_permitted (&tc->rcv_opts))
+    tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number);
 
-  /* Dequeue ACKed data and update RTT */
-  tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number);
+  prev_snd_wnd = tc->snd_wnd;
+  prev_snd_una = tc->snd_una;
   tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number,
-		      vnet_buffer (b)->tcp.ack_number, new_snd_wnd);
+		      vnet_buffer (b)->tcp.ack_number,
+		      clib_net_to_host_u16 (th->window) << tc->snd_wscale);
+  tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una;
+  tc->snd_una = vnet_buffer (b)->tcp.ack_number + tc->sack_sb.snd_una_adv;
+  tcp_validate_txf_size (tc, tc->bytes_acked);
 
-  /* If some of our sent bytes have been acked, update cc and retransmit
-   * timer. */
   if (tc->bytes_acked)
-    {
-      TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc);
+    tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number);
 
-      /* Updates congestion control (slow start/congestion avoidance) */
-      tcp_cc_rcv_ack (tc, b);
+  /*
+   * Check if we have congestion event
+   */
 
-      /* If everything has been acked, stop retransmit timer
-       * otherwise update. */
-      if (tc->snd_una == tc->snd_una_max)
-	tcp_retransmit_timer_reset (tc);
-      else
-	tcp_retransmit_timer_update (tc);
+  if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack))
+    {
+      tcp_cc_handle_event (tc, is_dack);
+      *error = TCP_ERROR_ACK_DUP;
+      TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1);
+      return vnet_buffer (b)->tcp.data_len ? 0 : -1;
     }
 
+  /*
+   * Update congestion control (slow start/congestion avoidance)
+   */
+  tcp_cc_update (tc, b);
+
   return 0;
 }
 
@@ -1059,7 +1341,7 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b,
     }
 
   /* Update SACK list if need be */
-  if (tcp_opts_sack_permitted (&tc->opt))
+  if (tcp_opts_sack_permitted (&tc->rcv_opts))
     {
       /* Remove SACK blocks that have been delivered */
       tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt);
@@ -1097,7 +1379,7 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b,
   TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, data_len);
 
   /* Update SACK list if in use */
-  if (tcp_opts_sack_permitted (&tc->opt))
+  if (tcp_opts_sack_permitted (&tc->rcv_opts))
     {
       ooo_segment_t *newest;
       u32 start, end;
@@ -1294,7 +1576,6 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
       u32 n_left_to_next;
 
       vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
-
       while (n_left_from > 0 && n_left_to_next > 0)
 	{
 	  u32 bi0;
@@ -1321,7 +1602,6 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 	    }
 
 	  th0 = tcp_buffer_hdr (b0);
-
 	  is_fin = (th0->flags & TCP_FLAG_FIN) != 0;
 
 	  /* SYNs, FINs and data consume sequence numbers */
@@ -1387,7 +1667,6 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
   errors = session_manager_flush_enqueue_events (my_thread_index);
   tcp_established_inc_counter (vm, is_ip4, TCP_ERROR_EVENT_FIFO_FULL, errors);
-
   return from_frame->n_vectors;
 }
 
@@ -1582,17 +1861,17 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 	  new_tc0->irs = seq0;
 
 	  /* Parse options */
-	  if (tcp_options_parse (tcp0, &new_tc0->opt))
+	  if (tcp_options_parse (tcp0, &new_tc0->rcv_opts))
 	    goto drop;
 
-	  if (tcp_opts_tstamp (&new_tc0->opt))
+	  if (tcp_opts_tstamp (&new_tc0->rcv_opts))
 	    {
-	      new_tc0->tsval_recent = new_tc0->opt.tsval;
+	      new_tc0->tsval_recent = new_tc0->rcv_opts.tsval;
 	      new_tc0->tsval_recent_age = tcp_time_now ();
 	    }
 
-	  if (tcp_opts_wscale (&new_tc0->opt))
-	    new_tc0->snd_wscale = new_tc0->opt.wscale;
+	  if (tcp_opts_wscale (&new_tc0->rcv_opts))
+	    new_tc0->snd_wscale = new_tc0->rcv_opts.wscale;
 
 	  /* No scaling */
 	  new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window);
@@ -1845,7 +2124,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 	      /* Initialize session variables */
 	      tc0->snd_una = vnet_buffer (b0)->tcp.ack_number;
 	      tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window)
-		<< tc0->opt.wscale;
+		<< tc0->rcv_opts.wscale;
 	      tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number;
 	      tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number;
 
@@ -1903,13 +2182,21 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
 	      break;
 	    case TCP_STATE_LAST_ACK:
-	      /* The only thing that can arrive in this state is an
+	      /* The only thing that [should] arrive in this state is an
 	       * acknowledgment of our FIN. If our FIN is now acknowledged,
 	       * delete the TCB, enter the CLOSED state, and return. */
 
 	      if (!tcp_rcv_ack_is_acceptable (tc0, b0))
 		goto drop;
 
+	      /* Apparently our FIN was lost */
+	      if (tcp_fin (tcp0))
+		{
+		  /* Don't "make" fin since that increments snd_nxt */
+		  tcp_send_fin (tc0);
+		  goto drop;
+		}
+
 	      tc0->state = TCP_STATE_CLOSED;
 
 	      /* Don't delete the connection/session yet. Instead, wait a
@@ -1929,8 +2216,15 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 	       * retransmission of the remote FIN. Acknowledge it, and restart
 	       * the 2 MSL timeout. */
 
-	      /* TODO */
+	      if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0))
+		goto drop;
+
+	      tcp_make_ack (tc0, b0);
+	      tcp_timer_reset (tc0, TCP_TIMER_WAITCLOSE);
+	      tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME);
+
 	      goto drop;
+
 	      break;
 	    default:
 	      ASSERT (0);
@@ -2194,7 +2488,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 	      goto drop;
 	    }
 
-	  if (tcp_options_parse (th0, &child0->opt))
+	  if (tcp_options_parse (th0, &child0->rcv_opts))
 	    {
 	      goto drop;
 	    }
@@ -2205,14 +2499,14 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 
 	  /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK}
 	   * segments are used to initialize PAWS. */
-	  if (tcp_opts_tstamp (&child0->opt))
+	  if (tcp_opts_tstamp (&child0->rcv_opts))
 	    {
-	      child0->tsval_recent = child0->opt.tsval;
+	      child0->tsval_recent = child0->rcv_opts.tsval;
 	      child0->tsval_recent_age = tcp_time_now ();
 	    }
 
-	  if (tcp_opts_wscale (&child0->opt))
-	    child0->snd_wscale = child0->opt.wscale;
+	  if (tcp_opts_wscale (&child0->rcv_opts))
+	    child0->snd_wscale = child0->rcv_opts.wscale;
 
 	  /* No scaling */
 	  child0->snd_wnd = clib_net_to_host_u16 (th0->window);
@@ -2477,7 +2771,6 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node,
 		vlib_add_trace (vm, node, b0, sizeof (*t0));
 	      tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4);
 	    }
-
 	  vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next,
 					   n_left_to_next, bi0, next0);
 	}
@@ -2600,7 +2893,13 @@ do {                                                       	\
   _(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
     TCP_ERROR_NONE);
   _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+  _(LAST_ACK, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+  _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
+    TCP_ERROR_NONE);
   _(LAST_ACK, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+  _(TIME_WAIT, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE);
+  _(TIME_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS,
+    TCP_ERROR_NONE);
   _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED);
   _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED);
 #undef _
diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c
index 3525f4e5..c66250e4 100644
--- a/src/vnet/tcp/tcp_newreno.c
+++ b/src/vnet/tcp/tcp_newreno.c
@@ -51,9 +51,23 @@ newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type)
     }
   else if (ack_type == TCP_CC_PARTIALACK)
     {
-      tc->cwnd -= tc->bytes_acked;
-      if (tc->bytes_acked > tc->snd_mss)
-	tc->bytes_acked += tc->snd_mss;
+      /* RFC 6582 Sec. 3.2 */
+      if (!tcp_opts_sack_permitted (&tc->rcv_opts))
+	{
+	  /* Deflate the congestion window by the amount of new data
+	   * acknowledged by the Cumulative Acknowledgment field. If the
+	   * partial ACK acknowledges at least one SMSS of new data, add back
+	   * SMSS bytes to the congestion window, artificially inflating it
+	   * to reflect the additional segment that has left the network.
+	   * This "partial window deflation" attempts to ensure that, when
+	   * fast recovery ends, approximately ssthresh bytes are outstanding.
+	   * NOTE(review): the check below is bytes_acked > snd_mss, i.e.
+	   * strictly more than one SMSS, not "at least one" - confirm. */
+	  tc->cwnd = (tc->cwnd > tc->bytes_acked) ?
+	    tc->cwnd - tc->bytes_acked : 0;
+	  if (tc->bytes_acked > tc->snd_mss)
+	    tc->cwnd += tc->snd_mss;
+	}
     }
 }
 
diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c
index 49fd6bef..47c94e6d 100644
--- a/src/vnet/tcp/tcp_output.c
+++ b/src/vnet/tcp/tcp_output.c
@@ -136,10 +136,10 @@ tcp_update_rcv_wnd (tcp_connection_t * tc)
    * Figure out how much space we have available
    */
   available_space = stream_session_max_rx_enqueue (&tc->connection);
-  max_fifo = stream_session_fifo_size (&tc->connection);
+  max_fifo = stream_session_rx_fifo_size (&tc->connection);
 
-  ASSERT (tc->opt.mss < max_fifo);
-  if (available_space < tc->opt.mss && available_space < max_fifo >> 3)
+  ASSERT (tc->rcv_opts.mss < max_fifo);
+  if (available_space < tc->rcv_opts.mss && available_space < max_fifo >> 3)
     available_space = 0;
 
   /*
@@ -276,8 +276,11 @@ tcp_make_syn_options (tcp_options_t * opts, u8 wnd_scale)
   opts->tsecr = 0;
   len += TCP_OPTION_LEN_TIMESTAMP;
 
-  opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
-  len += TCP_OPTION_LEN_SACK_PERMITTED;
+  if (TCP_USE_SACKS)
+    {
+      opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
+      len += TCP_OPTION_LEN_SACK_PERMITTED;
+    }
 
   /* Align to needed boundary */
   len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN;
@@ -293,14 +296,14 @@ tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts)
   opts->mss = tc->mss;
   len += TCP_OPTION_LEN_MSS;
 
-  if (tcp_opts_wscale (&tc->opt))
+  if (tcp_opts_wscale (&tc->rcv_opts))
     {
       opts->flags |= TCP_OPTS_FLAG_WSCALE;
       opts->wscale = tc->rcv_wscale;
       len += TCP_OPTION_LEN_WINDOW_SCALE;
     }
 
-  if (tcp_opts_tstamp (&tc->opt))
+  if (tcp_opts_tstamp (&tc->rcv_opts))
     {
       opts->flags |= TCP_OPTS_FLAG_TSTAMP;
       opts->tsval = tcp_time_now ();
@@ -308,7 +311,7 @@ tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts)
       len += TCP_OPTION_LEN_TIMESTAMP;
     }
 
-  if (tcp_opts_sack_permitted (&tc->opt))
+  if (tcp_opts_sack_permitted (&tc->rcv_opts))
     {
       opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED;
       len += TCP_OPTION_LEN_SACK_PERMITTED;
@@ -326,14 +329,14 @@ tcp_make_established_options (tcp_connection_t * tc, tcp_options_t * opts)
 
   opts->flags = 0;
 
-  if (tcp_opts_tstamp (&tc->opt))
+  if (tcp_opts_tstamp (&tc->rcv_opts))
     {
       opts->flags |= TCP_OPTS_FLAG_TSTAMP;
       opts->tsval = tcp_time_now ();
       opts->tsecr = tc->tsval_recent;
       len += TCP_OPTION_LEN_TIMESTAMP;
     }
-  if (tcp_opts_sack_permitted (&tc->opt))
+  if (tcp_opts_sack_permitted (&tc->rcv_opts))
     {
       if (vec_len (tc->snd_sacks))
 	{
@@ -395,7 +398,7 @@ tcp_update_snd_mss (tcp_connection_t * tc)
     tcp_make_options (tc, &tc->snd_opts, TCP_STATE_ESTABLISHED);
 
   /* XXX check if MTU has been updated */
-  tc->snd_mss = clib_min (tc->mss, tc->opt.mss) - tc->snd_opts_len;
+  tc->snd_mss = clib_min (tc->mss, tc->rcv_opts.mss) - tc->snd_opts_len;
   ASSERT (tc->snd_mss > 0);
 }
 
@@ -406,21 +409,21 @@ tcp_init_mss (tcp_connection_t * tc)
   tcp_update_rcv_mss (tc);
 
   /* TODO cache mss and consider PMTU discovery */
-  tc->snd_mss = clib_min (tc->opt.mss, tc->mss);
+  tc->snd_mss = clib_min (tc->rcv_opts.mss, tc->mss);
 
   if (tc->snd_mss < 45)
     {
       clib_warning ("snd mss is 0");
       /* Assume that at least the min default mss works */
       tc->snd_mss = default_min_mss;
-      tc->opt.mss = default_min_mss;
+      tc->rcv_opts.mss = default_min_mss;
     }
 
   /* We should have enough space for 40 bytes of options */
   ASSERT (tc->snd_mss > 45);
 
   /* If we use timestamp option, account for it */
-  if (tcp_opts_tstamp (&tc->opt))
+  if (tcp_opts_tstamp (&tc->rcv_opts))
     tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP;
 }
 
@@ -879,6 +882,7 @@ tcp_send_fin (tcp_connection_t * tc)
   tcp_make_fin (tc, b);
   tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
   tc->flags |= TCP_CONN_FINSNT;
+  tcp_retransmit_timer_force_update (tc);
   TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc);
 }
 
@@ -919,10 +923,7 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b,
   if (compute_opts)
     tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);
 
-  /* Write pre-computed options */
   tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t);
-
-  /* Get rcv window to advertise */
   advertise_wnd = tcp_window_to_advertise (tc, next_state);
   flags = tcp_make_state_flags (next_state);
 
@@ -930,26 +931,25 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b,
   th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt,
 			     tc->rcv_nxt, tcp_hdr_opts_len, flags,
 			     advertise_wnd);
-
   opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts);
 
   ASSERT (opts_write_len == tc->snd_opts_len);
-
-  /* Tag the buffer with the connection index  */
   vnet_buffer (b)->tcp.connection_index = tc->c_c_index;
 
+  /*
+   * Update connection variables
+   */
+
   tc->snd_nxt += data_len;
   tc->rcv_las = tc->rcv_nxt;
 
   /* TODO this is updated in output as well ... */
-  if (tc->snd_nxt > tc->snd_una_max)
-    tc->snd_una_max = tc->snd_nxt;
-
-  if (tc->rtt_ts == 0)
+  if (seq_gt (tc->snd_nxt, tc->snd_una_max))
     {
-      tc->rtt_ts = tcp_time_now ();
-      tc->rtt_seq = tc->snd_nxt;
+      tc->snd_una_max = tc->snd_nxt;
+      tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
     }
+
   TCP_EVT_DBG (TCP_EVT_PKTIZE, tc);
 }
 
@@ -987,13 +987,14 @@ tcp_timer_delack_handler (u32 index)
  *
  * @return the number of bytes in the segment or 0 if there's nothing to
  *         retransmit
- * */
+ */
 u32
 tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b,
 				u32 offset, u32 max_bytes)
 {
   vlib_main_t *vm = vlib_get_main ();
-  u32 n_bytes = 0;
+  int n_bytes = 0;
+  u32 start;
 
   tcp_reuse_buffer (vm, b);
 
@@ -1001,15 +1002,16 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b,
   ASSERT (max_bytes != 0);
 
   max_bytes = clib_min (tc->snd_mss, max_bytes);
+  start = tc->snd_una + offset;
 
   /* Start is beyond snd_congestion */
-  if (seq_geq (tc->snd_una + offset, tc->snd_congestion))
+  if (seq_geq (start, tc->snd_congestion))
     goto done;
 
   /* Don't overshoot snd_congestion */
-  if (seq_gt (tc->snd_nxt + max_bytes, tc->snd_congestion))
+  if (seq_gt (start + max_bytes, tc->snd_congestion))
     {
-      max_bytes = tc->snd_congestion - tc->snd_nxt;
+      max_bytes = tc->snd_congestion - start;
       if (max_bytes == 0)
 	goto done;
     }
@@ -1021,15 +1023,12 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b,
   n_bytes = stream_session_peek_bytes (&tc->connection,
 				       vlib_buffer_get_current (b), offset,
 				       max_bytes);
-  ASSERT (n_bytes != 0);
+  ASSERT (n_bytes > 0);
   b->current_length = n_bytes;
   tcp_push_hdr_i (tc, b, tc->state, 0);
 
-  /* Don't count multiple retransmits of the same segment */
-  if (tc->rto_boff > 1)
-    goto done;
-
-  tc->rtx_bytes += n_bytes;
+  if (tcp_in_fastrecovery (tc))
+    tc->snd_rxt_bytes += n_bytes;
 
 done:
   TCP_EVT_DBG (TCP_EVT_CC_RTX, tc, offset, n_bytes);
@@ -1042,18 +1041,15 @@ done:
 static void
 tcp_rtx_timeout_cc (tcp_connection_t * tc)
 {
+  tc->prev_ssthresh = tc->ssthresh;
+  tc->prev_cwnd = tc->cwnd;
+
   /* Cleanly recover cc (also clears up fast retransmit) */
   if (tcp_in_fastrecovery (tc))
-    {
-      tcp_cc_recover (tc);
-    }
-  else
-    {
-      tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss);
-    }
+    tcp_cc_fastrecovery_exit (tc);
 
   /* Start again from the beginning */
-
+  tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss);
   tc->cwnd = tcp_loss_wnd (tc);
   tc->snd_congestion = tc->snd_una_max;
   tcp_recovery_on (tc);
@@ -1081,18 +1077,31 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
   /* Make sure timer handle is set to invalid */
   tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID;
 
+  if (!tcp_in_recovery (tc) && tc->rto_boff > 0
+      && tc->state >= TCP_STATE_ESTABLISHED)
+    {
+      tc->rto_boff = 0;
+      tcp_update_rto (tc);
+    }
+
   /* Increment RTO backoff (also equal to number of retries) */
   tc->rto_boff += 1;
 
   /* Go back to first un-acked byte */
   tc->snd_nxt = tc->snd_una;
 
-  /* Get buffer */
   tcp_get_free_buffer_index (tm, &bi);
   b = vlib_get_buffer (vm, bi);
 
   if (tc->state >= TCP_STATE_ESTABLISHED)
     {
+      /* Lost FIN, retransmit and return */
+      if (tc->flags & TCP_CONN_FINSNT)
+	{
+	  tcp_send_fin (tc);
+	  return;
+	}
+
       /* First retransmit timeout */
       if (tc->rto_boff == 1)
 	tcp_rtx_timeout_cc (tc);
@@ -1102,24 +1111,30 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
 
       TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1);
 
-      /* Send one segment. No fancy recovery for now! */
+      /* Send one segment */
       n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss);
+      /* TODO be less aggressive about this */
       scoreboard_clear (&tc->sack_sb);
 
       if (n_bytes == 0)
 	{
 	  clib_warning ("could not retransmit anything");
+	  clib_warning ("%U", format_tcp_connection, tc, 2);
+
 	  /* Try again eventually */
 	  tcp_retransmit_timer_set (tc);
+	  ASSERT (0 || (tc->rto_boff > 1
+			&& tc->snd_una == tc->snd_congestion));
 	  return;
 	}
+
+      /* For first retransmit, record timestamp (Eifel detection RFC3522) */
+      if (tc->rto_boff == 1)
+	tc->snd_rxt_ts = tcp_time_now ();
     }
-  else
+  /* Retransmit for SYN/SYNACK */
+  else if (tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_SYN_SENT)
     {
-      /* Retransmit for SYN/SYNACK */
-      ASSERT (tc->state == TCP_STATE_SYN_RCVD
-	      || tc->state == TCP_STATE_SYN_SENT);
-
       /* Try without increasing RTO a number of times. If this fails,
        * start growing RTO exponentially */
       if (tc->rto_boff > TCP_RTO_SYN_RETRIES)
@@ -1132,6 +1147,12 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn)
       /* Account for the SYN */
       tc->snd_nxt += 1;
     }
+  else
+    {
+      ASSERT (tc->state == TCP_STATE_CLOSED);
+      clib_warning ("connection closed ...");
+      return;
+    }
 
   if (!is_syn)
     {
@@ -1180,7 +1201,8 @@ tcp_timer_persist_handler (u32 index)
   u32 thread_index = vlib_get_thread_index ();
   tcp_connection_t *tc;
   vlib_buffer_t *b;
-  u32 bi, n_bytes;
+  u32 bi, old_snd_nxt;
+  int n_bytes = 0;
 
   tc = tcp_connection_get_if_valid (index, thread_index);
 
@@ -1202,13 +1224,15 @@ tcp_timer_persist_handler (u32 index)
   /* Try to force the first unsent segment  */
   tcp_get_free_buffer_index (tm, &bi);
   b = vlib_get_buffer (vm, bi);
+
+  tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una);
   tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state);
   n_bytes = stream_session_peek_bytes (&tc->connection,
 				       vlib_buffer_get_current (b),
 				       tc->snd_una_max - tc->snd_una,
 				       tc->snd_mss);
   /* Nothing to send */
-  if (n_bytes == 0)
+  if (n_bytes <= 0)
     {
       clib_warning ("persist found nothing to send");
       tcp_return_buffer (tm);
@@ -1216,7 +1240,13 @@ tcp_timer_persist_handler (u32 index)
     }
 
   b->current_length = n_bytes;
+  ASSERT (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1
+	  || tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT));
+
+  /* Allow updating of snd_una_max but don't update snd_nxt */
+  old_snd_nxt = tc->snd_nxt;
   tcp_push_hdr_i (tc, b, tc->state, 0);
+  tc->snd_nxt = old_snd_nxt;
   tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
 
   /* Re-enable persist timer */
@@ -1232,8 +1262,9 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc)
   tcp_main_t *tm = vnet_get_tcp_main ();
   vlib_main_t *vm = vlib_get_main ();
   vlib_buffer_t *b;
-  u32 bi, n_bytes;
+  u32 bi, n_bytes, old_snd_nxt;
 
+  old_snd_nxt = tc->snd_nxt;
   tc->snd_nxt = tc->snd_una;
 
   /* Get buffer */
@@ -1244,75 +1275,117 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc)
 
   n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss);
   if (n_bytes == 0)
-    goto done;
+    {
+      tcp_return_buffer (tm);
+      goto done;
+    }
 
   tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
 
 done:
-  tc->snd_nxt = tc->snd_una_max;
+  tc->snd_nxt = old_snd_nxt;
 }
 
-sack_scoreboard_hole_t *
-scoreboard_first_rtx_hole (sack_scoreboard_t * sb)
+/**
+ * Do fast retransmit with SACKs
+ */
+void
+tcp_fast_retransmit_sack (tcp_connection_t * tc)
 {
-  sack_scoreboard_hole_t *hole = 0;
-
-//  hole = scoreboard_first_hole (&tc->sack_sb);
-//  if (hole)
-//    {
-//
-//      offset = hole->start - tc->snd_una;
-//      hole_size = hole->end - hole->start;
-//
-//      ASSERT(hole_size);
-//
-//      if (hole_size < max_bytes)
-//      max_bytes = hole_size;
-//    }
-  return hole;
+  tcp_main_t *tm = vnet_get_tcp_main ();
+  vlib_main_t *vm = vlib_get_main ();
+  u32 n_written = 0, offset = 0, max_bytes;
+  vlib_buffer_t *b;
+  sack_scoreboard_hole_t *hole;
+  sack_scoreboard_t *sb;
+  u32 bi, old_snd_nxt;
+  int snd_space;
+  u8 snd_limited = 0, can_rescue = 0;
+
+  ASSERT (tcp_in_fastrecovery (tc));
+  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
+
+  old_snd_nxt = tc->snd_nxt;
+  sb = &tc->sack_sb;
+  snd_space = tcp_available_snd_space (tc);
+
+  hole = scoreboard_get_hole (sb, sb->cur_rxt_hole);
+  while (hole && snd_space > 0)
+    {
+      tcp_get_free_buffer_index (tm, &bi);
+      b = vlib_get_buffer (vm, bi);
+
+      hole = scoreboard_next_rxt_hole (sb, hole,
+				       tcp_fastrecovery_sent_1_smss (tc),
+				       &can_rescue, &snd_limited);
+      if (!hole)
+	{
+	  if (!can_rescue || !(seq_lt (sb->rescue_rxt, tc->snd_una)
+			       || seq_gt (sb->rescue_rxt,
+					  tc->snd_congestion)))
+	    break;
+
+	  /* If rescue rxt undefined or less than snd_una then one segment of
+	   * up to SMSS octets that MUST include the highest outstanding
+	   * unSACKed sequence number SHOULD be returned, and RescueRxt set to
+	   * RecoveryPoint. HighRxt MUST NOT be updated.
+	   */
+	  max_bytes = clib_min (tc->snd_mss, snd_space);
+	  offset = tc->snd_congestion - tc->snd_una - max_bytes;
+	  sb->rescue_rxt = tc->snd_congestion;
+	  tc->snd_nxt = tc->snd_una + offset;
+	  tcp_prepare_retransmit_segment (tc, b, offset, max_bytes);
+	  tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+	  break;
+	}
+
+      max_bytes = snd_limited ? tc->snd_mss : hole->end - sb->high_rxt;
+      offset = sb->high_rxt - tc->snd_una;
+      tc->snd_nxt = tc->snd_una + offset;
+      n_written = tcp_prepare_retransmit_segment (tc, b, offset, max_bytes);
+
+      /* Nothing left to retransmit */
+      if (n_written == 0)
+	{
+	  tcp_return_buffer (tm);
+	  break;
+	}
+
+      sb->high_rxt += n_written;
+      tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4);
+      snd_space -= n_written;
+    }
+
+  /* If window allows, send 1 SMSS of new data */
+  tc->snd_nxt = old_snd_nxt;
 }
 
 /**
- * Do fast retransmit.
+ * Fast retransmit without SACK info
  */
 void
-tcp_fast_retransmit (tcp_connection_t * tc)
+tcp_fast_retransmit_no_sack (tcp_connection_t * tc)
 {
   tcp_main_t *tm = vnet_get_tcp_main ();
   vlib_main_t *vm = vlib_get_main ();
-  u32 bi;
+  u32 n_written = 0, offset = 0, bi, old_snd_nxt;
   int snd_space;
-  u32 n_written = 0, offset = 0;
   vlib_buffer_t *b;
-  u8 use_sacks = 0;
 
   ASSERT (tcp_in_fastrecovery (tc));
+  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
 
   /* Start resending from first un-acked segment */
+  old_snd_nxt = tc->snd_nxt;
   tc->snd_nxt = tc->snd_una;
-
   snd_space = tcp_available_snd_space (tc);
-  TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0);
-
-  /* If we have SACKs use them */
-  if (tcp_opts_sack_permitted (&tc->opt)
-      && scoreboard_first_hole (&tc->sack_sb))
-    use_sacks = 0;
 
   while (snd_space > 0)
     {
       tcp_get_free_buffer_index (tm, &bi);
       b = vlib_get_buffer (vm, bi);
 
-      if (use_sacks)
-	{
-	  scoreboard_first_rtx_hole (&tc->sack_sb);
-	}
-      else
-	{
-	  offset += n_written;
-	}
-
+      offset += n_written;
       n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space);
 
       /* Nothing left to retransmit */
@@ -1326,9 +1399,21 @@ tcp_fast_retransmit (tcp_connection_t * tc)
       snd_space -= n_written;
     }
 
-  /* If window allows, send 1 SMSS of new data */
-  if (seq_lt (tc->snd_nxt, tc->snd_congestion))
-    tc->snd_nxt = tc->snd_congestion;
+  /* Restore snd_nxt. If window allows, send 1 SMSS of new data */
+  tc->snd_nxt = old_snd_nxt;
+}
+
+/**
+ * Do fast retransmit
+ */
+void
+tcp_fast_retransmit (tcp_connection_t * tc)
+{
+  if (tcp_opts_sack_permitted (&tc->rcv_opts)
+      && scoreboard_first_hole (&tc->sack_sb))
+    tcp_fast_retransmit_sack (tc);
+  else
+    tcp_fast_retransmit_no_sack (tc);
 }
 
 always_inline u32
@@ -1544,6 +1629,12 @@ tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b)
 
   tc = (tcp_connection_t *) tconn;
   tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0);
+
+  if (tc->rtt_ts == 0)
+    {
+      tc->rtt_ts = tcp_time_now ();
+      tc->rtt_seq = tc->snd_nxt;
+    }
   return 0;
 }
 
diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c
index 2af38484..3f8afa40 100644
--- a/src/vnet/tcp/tcp_test.c
+++ b/src/vnet/tcp/tcp_test.c
@@ -54,7 +54,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input)
   tc->snd_una = 0;
   tc->snd_una_max = 1000;
   tc->snd_nxt = 1000;
-  tc->opt.flags |= TCP_OPTS_FLAG_SACK;
+  tc->rcv_opts.flags |= TCP_OPTS_FLAG_SACK;
   scoreboard_init (&tc->sack_sb);
 
   for (i = 0; i < 1000 / 100; i++)
@@ -70,9 +70,9 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input)
 
   for (i = 0; i < 1000 / 200; i++)
     {
-      vec_add1 (tc->opt.sacks, sacks[i * 2]);
+      vec_add1 (tc->rcv_opts.sacks, sacks[i * 2]);
     }
-  tc->opt.n_sack_blocks = vec_len (tc->opt.sacks);
+  tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks);
   tcp_rcv_sacks (tc, 0);
 
   if (verbose)
@@ -93,18 +93,17 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input)
   TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv);
   TCP_TEST ((sb->last_sacked_bytes == 400),
 	    "last sacked bytes %d", sb->last_sacked_bytes);
-  TCP_TEST ((sb->max_byte_sacked == 900),
-	    "max byte sacked %u", sb->max_byte_sacked);
+  TCP_TEST ((sb->high_sacked == 900), "max byte sacked %u", sb->high_sacked);
   /*
    * Inject odd blocks
    */
 
-  vec_reset_length (tc->opt.sacks);
+  vec_reset_length (tc->rcv_opts.sacks);
   for (i = 0; i < 1000 / 200; i++)
     {
-      vec_add1 (tc->opt.sacks, sacks[i * 2 + 1]);
+      vec_add1 (tc->rcv_opts.sacks, sacks[i * 2 + 1]);
     }
-  tc->opt.n_sack_blocks = vec_len (tc->opt.sacks);
+  tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks);
   tcp_rcv_sacks (tc, 0);
 
   if (verbose)
@@ -118,8 +117,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input)
 	    "first hole start %u end %u", hole->start, hole->end);
   TCP_TEST ((sb->sacked_bytes == 900), "sacked bytes %d", sb->sacked_bytes);
   TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv);
-  TCP_TEST ((sb->max_byte_sacked == 1000),
-	    "max sacked byte %u", sb->max_byte_sacked);
+  TCP_TEST ((sb->high_sacked == 1000), "max sacked byte %u", sb->high_sacked);
   TCP_TEST ((sb->last_sacked_bytes == 500),
 	    "last sacked bytes %d", sb->last_sacked_bytes);
 
@@ -135,8 +133,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input)
 	    "scoreboard has %d elements", pool_elts (sb->holes));
   TCP_TEST ((sb->snd_una_adv == 900),
 	    "snd_una_adv after ack %u", sb->snd_una_adv);
-  TCP_TEST ((sb->max_byte_sacked == 1000),
-	    "max sacked byte %u", sb->max_byte_sacked);
+  TCP_TEST ((sb->high_sacked == 1000), "max sacked byte %u", sb->high_sacked);
   TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes);
   TCP_TEST ((sb->last_sacked_bytes == 0),
 	    "last sacked bytes %d", sb->last_sacked_bytes);
@@ -145,11 +142,11 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input)
    * Add new block
    */
 
-  vec_reset_length (tc->opt.sacks);
+  vec_reset_length (tc->rcv_opts.sacks);
 
   block.start = 1200;
   block.end = 1300;
-  vec_add1 (tc->opt.sacks, block);
+  vec_add1 (tc->rcv_opts.sacks, block);
 
   if (verbose)
     vlib_cli_output (vm, "add [1200, 1300]:\n%U", format_tcp_scoreboard, sb);
@@ -171,8 +168,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input)
 	    "first hole start %u end %u", hole->start, hole->end);
   TCP_TEST ((sb->snd_una_adv == 0),
 	    "snd_una_adv after ack %u", sb->snd_una_adv);
-  TCP_TEST ((sb->max_byte_sacked == 1300),
-	    "max sacked byte %u", sb->max_byte_sacked);
+  TCP_TEST ((sb->high_sacked == 1300), "max sacked byte %u", sb->high_sacked);
   hole = scoreboard_last_hole (sb);
   TCP_TEST ((hole->start == 1300 && hole->end == 1500),
 	    "last hole start %u end %u", hole->start, hole->end);
@@ -182,7 +178,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input)
    * Ack first hole
    */
 
-  vec_reset_length (tc->opt.sacks);
+  vec_reset_length (tc->rcv_opts.sacks);
   tcp_rcv_sacks (tc, 1200);
 
   if (verbose)
@@ -196,8 +192,16 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input)
 	    "scoreboard has %d elements", pool_elts (sb->holes));
 
   /*
-   * Remove all
+   * Add some more blocks and then remove all
    */
+  vec_reset_length (tc->rcv_opts.sacks);
+  for (i = 0; i < 5; i++)
+    {
+      block.start = i * 100 + 1200;
+      block.end = (i + 1) * 100 + 1200;
+      vec_add1 (tc->rcv_opts.sacks, block);
+    }
+  tcp_rcv_sacks (tc, 1900);
 
   scoreboard_clear (sb);
   if (verbose)
@@ -205,6 +209,9 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input)
 
   TCP_TEST ((pool_elts (sb->holes) == 0),
 	    "number of holes %d", pool_elts (sb->holes));
+  TCP_TEST ((sb->head == TCP_INVALID_SACK_HOLE_INDEX), "head %u", sb->head);
+  TCP_TEST ((sb->tail == TCP_INVALID_SACK_HOLE_INDEX), "tail %u", sb->tail);
+
   /*
    * Re-inject odd blocks and ack them all
    */
@@ -214,9 +221,9 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input)
   tc->snd_nxt = 1000;
   for (i = 0; i < 5; i++)
     {
-      vec_add1 (tc->opt.sacks, sacks[i * 2 + 1]);
+      vec_add1 (tc->rcv_opts.sacks, sacks[i * 2 + 1]);
     }
-  tc->opt.n_sack_blocks = vec_len (tc->opt.sacks);
+  tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks);
   tcp_rcv_sacks (tc, 0);
   if (verbose)
     vlib_cli_output (vm, "sb added odd blocks and ack [0, 950]:\n%U",
@@ -740,6 +747,10 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input)
       TCP_TEST (0, "[%d] peeked %u expected %u", j, data_buf[j], data[j]);
     }
 
+  /* Try to peek beyond the data */
+  rv = svm_fifo_peek (f, svm_fifo_max_dequeue (f), vec_len (data), data_buf);
+  TCP_TEST ((rv == 0), "peeked %u expected 0", rv);
+
   vec_free (data_buf);
   svm_fifo_free (f);
   vec_free (test_data);
@@ -1239,7 +1250,7 @@ tcp_test_session (vlib_main_t * vm, unformat_input_t * input)
       tc0->c_thread_index = 0;
       tc0->c_lcl_ip4.as_u32 = local.as_u32;
       tc0->c_rmt_ip4.as_u32 = remote.as_u32;
-      tc0->opt.mss = 1450;
+      tc0->rcv_opts.mss = 1450;
       tcp_connection_init_vars (tc0);
 
       TCP_EVT_DBG (TCP_EVT_OPEN, tc0);
-- 
cgit 1.2.3-korg


From 30af5da7522b38e9d4a4e9c906fd5f1c05088212 Mon Sep 17 00:00:00 2001
From: JingLiuZTE <liu.jing5@zte.com.cn>
Date: Mon, 24 Jul 2017 10:53:31 +0800
Subject: VPP-905: Wrong define used in function start_workers.

Change-Id: I6a5faebb63e9360cebfcfb1bc3f3c0eb6b15e937
Signed-off-by: JingLiuZTE <liu.jing5@zte.com.cn>
---
 src/vlib/threads.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'src/vlib/threads.c')

diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index 0c775e2d..0661d89a 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -642,7 +642,8 @@ start_workers (vlib_main_t * vm)
 		}
 	      nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] =
 		vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]);
-	      vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT])
+	      vec_foreach (rt,
+			   nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL])
 	      {
 		vlib_node_t *n = vlib_get_node (vm, rt->node_index);
 		rt->thread_index = vm_clone->thread_index;
-- 
cgit 1.2.3-korg


From 215961829c4ae5f738ffcd01a8d1afcab13bd0e2 Mon Sep 17 00:00:00 2001
From: Colin Tregenza Dancer <ctd@metaswitch.com>
Date: Mon, 4 Sep 2017 15:27:49 +0100
Subject: Refork worker thread data structures in parallel (VPP-970)

Change the rebuilding of worker thread clone data structures
to run in parallel on the workers, instead of serially
on the main thread.

Change-Id: Ib76bcfbef1e51f2399972090f4057be7aaa84e08
Signed-off-by: Colin Tregenza Dancer <ctd@metaswitch.com>
---
 src/vlib/main.h    |   6 +
 src/vlib/threads.c | 333 +++++++++++++++++++++++++++++++----------------------
 src/vlib/threads.h |  11 ++
 3 files changed, 214 insertions(+), 136 deletions(-)

(limited to 'src/vlib/threads.c')

diff --git a/src/vlib/main.h b/src/vlib/main.h
index bfa7ddbe..b63c63fa 100644
--- a/src/vlib/main.h
+++ b/src/vlib/main.h
@@ -181,6 +181,12 @@ typedef struct vlib_main_t
   /* Attempt to do a post-mortem elog dump */
   int elog_post_mortem_dump;
 
+  /*
+   * Need to call vlib_worker_thread_node_runtime_update before
+   * releasing worker thread barrier. Only valid in vlib_global_main.
+   */
+  int need_vlib_worker_thread_node_runtime_update;
+
 } vlib_main_t;
 
 /* Global main structure. */
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index 0661d89a..6cd325b3 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -547,10 +547,17 @@ start_workers (vlib_main_t * vm)
       vlib_worker_threads->workers_at_barrier =
 	clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES);
 
+      vlib_worker_threads->node_reforks_required =
+	clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES);
+
       /* Ask for an initial barrier sync */
       *vlib_worker_threads->workers_at_barrier = 0;
       *vlib_worker_threads->wait_at_barrier = 1;
 
+      /* Without update or refork */
+      *vlib_worker_threads->node_reforks_required = 0;
+      vm->need_vlib_worker_thread_node_runtime_update = 0;
+
       worker_thread_index = 1;
 
       for (i = 0; i < vec_len (tm->registrations); i++)
@@ -568,6 +575,8 @@ start_workers (vlib_main_t * vm)
 
 	  for (k = 0; k < tr->count; k++)
 	    {
+	      vlib_node_t *n;
+
 	      vec_add2 (vlib_worker_threads, w, 1);
 	      if (tr->mheap_size)
 		w->thread_mheap =
@@ -628,10 +637,12 @@ start_workers (vlib_main_t * vm)
 
 	      /* fork nodes */
 	      nm_clone->nodes = 0;
+
+	      /* Allocate all nodes in single block for speed */
+	      n = clib_mem_alloc_no_fail (vec_len (nm->nodes) * sizeof (*n));
+
 	      for (j = 0; j < vec_len (nm->nodes); j++)
 		{
-		  vlib_node_t *n;
-		  n = clib_mem_alloc_no_fail (sizeof (*n));
 		  clib_memcpy (n, nm->nodes[j], sizeof (*n));
 		  /* none of the copied nodes have enqueue rights given out */
 		  n->owner_node_index = VLIB_INVALID_NODE_INDEX;
@@ -639,6 +650,7 @@ start_workers (vlib_main_t * vm)
 		  memset (&n->stats_last_clear, 0,
 			  sizeof (n->stats_last_clear));
 		  vec_add1 (nm_clone->nodes, n);
+		  n++;
 		}
 	      nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] =
 		vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]);
@@ -778,17 +790,14 @@ start_workers (vlib_main_t * vm)
 
 VLIB_MAIN_LOOP_ENTER_FUNCTION (start_workers);
 
-void
-vlib_worker_thread_node_runtime_update (void)
+static inline void
+worker_thread_node_runtime_update_internal (void)
 {
   int i, j;
-  vlib_worker_thread_t *w;
   vlib_main_t *vm;
   vlib_node_main_t *nm, *nm_clone;
-  vlib_node_t **old_nodes_clone;
   vlib_main_t *vm_clone;
-  vlib_node_runtime_t *rt, *old_rt;
-  void *oldheap;
+  vlib_node_runtime_t *rt;
   never_inline void
     vlib_node_runtime_sync_stats (vlib_main_t * vm,
 				  vlib_node_runtime_t * r,
@@ -797,13 +806,9 @@ vlib_worker_thread_node_runtime_update (void)
 
   ASSERT (vlib_get_thread_index () == 0);
 
-  if (vec_len (vlib_mains) == 1)
-    return;
-
   vm = vlib_mains[0];
   nm = &vm->node_main;
 
-  ASSERT (vlib_get_thread_index () == 0);
   ASSERT (*vlib_worker_threads->wait_at_barrier == 1);
 
   /*
@@ -833,146 +838,170 @@ vlib_worker_thread_node_runtime_update (void)
 	}
     }
 
-  for (i = 1; i < vec_len (vlib_mains); i++)
-    {
-      vlib_node_runtime_t *rt;
-      w = vlib_worker_threads + i;
-      oldheap = clib_mem_set_heap (w->thread_mheap);
+  /* Per-worker clone rebuilds are now done on each thread */
+}
 
-      vm_clone = vlib_mains[i];
 
-      /* Re-clone error heap */
-      u64 *old_counters = vm_clone->error_main.counters;
-      u64 *old_counters_all_clear = vm_clone->error_main.counters_last_clear;
-      clib_memcpy (&vm_clone->error_main, &vm->error_main,
-		   sizeof (vm->error_main));
-      j = vec_len (vm->error_main.counters) - 1;
-      vec_validate_aligned (old_counters, j, CLIB_CACHE_LINE_BYTES);
-      vec_validate_aligned (old_counters_all_clear, j, CLIB_CACHE_LINE_BYTES);
-      vm_clone->error_main.counters = old_counters;
-      vm_clone->error_main.counters_last_clear = old_counters_all_clear;
+void
+vlib_worker_thread_node_refork (void)
+{
+  vlib_main_t *vm, *vm_clone;
+  vlib_node_main_t *nm, *nm_clone;
+  vlib_node_t **old_nodes_clone;
+  vlib_node_runtime_t *rt, *old_rt;
 
-      nm_clone = &vm_clone->node_main;
-      vec_free (nm_clone->next_frames);
-      nm_clone->next_frames = vec_dup (nm->next_frames);
+  vlib_node_t *new_n_clone;
 
-      for (j = 0; j < vec_len (nm_clone->next_frames); j++)
-	{
-	  vlib_next_frame_t *nf = &nm_clone->next_frames[j];
-	  u32 save_node_runtime_index;
-	  u32 save_flags;
-
-	  save_node_runtime_index = nf->node_runtime_index;
-	  save_flags = nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH;
-	  vlib_next_frame_init (nf);
-	  nf->node_runtime_index = save_node_runtime_index;
-	  nf->flags = save_flags;
-	}
+  int j;
 
-      old_nodes_clone = nm_clone->nodes;
-      nm_clone->nodes = 0;
+  vm = vlib_mains[0];
+  nm = &vm->node_main;
+  vm_clone = vlib_get_main ();
+  nm_clone = &vm_clone->node_main;
+
+  /* Re-clone error heap */
+  u64 *old_counters = vm_clone->error_main.counters;
+  u64 *old_counters_all_clear = vm_clone->error_main.counters_last_clear;
+
+  clib_memcpy (&vm_clone->error_main, &vm->error_main,
+	       sizeof (vm->error_main));
+  j = vec_len (vm->error_main.counters) - 1;
+  vec_validate_aligned (old_counters, j, CLIB_CACHE_LINE_BYTES);
+  vec_validate_aligned (old_counters_all_clear, j, CLIB_CACHE_LINE_BYTES);
+  vm_clone->error_main.counters = old_counters;
+  vm_clone->error_main.counters_last_clear = old_counters_all_clear;
+
+  nm_clone = &vm_clone->node_main;
+  vec_free (nm_clone->next_frames);
+  nm_clone->next_frames = vec_dup (nm->next_frames);
+
+  for (j = 0; j < vec_len (nm_clone->next_frames); j++)
+    {
+      vlib_next_frame_t *nf = &nm_clone->next_frames[j];
+      u32 save_node_runtime_index;
+      u32 save_flags;
+
+      save_node_runtime_index = nf->node_runtime_index;
+      save_flags = nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH;
+      vlib_next_frame_init (nf);
+      nf->node_runtime_index = save_node_runtime_index;
+      nf->flags = save_flags;
+    }
 
-      /* re-fork nodes */
-      for (j = 0; j < vec_len (nm->nodes); j++)
-	{
-	  vlib_node_t *old_n_clone;
-	  vlib_node_t *new_n, *new_n_clone;
+  old_nodes_clone = nm_clone->nodes;
+  nm_clone->nodes = 0;
 
-	  new_n = nm->nodes[j];
-	  old_n_clone = old_nodes_clone[j];
+  /* re-fork nodes */
 
-	  new_n_clone = clib_mem_alloc_no_fail (sizeof (*new_n_clone));
-	  clib_memcpy (new_n_clone, new_n, sizeof (*new_n));
-	  /* none of the copied nodes have enqueue rights given out */
-	  new_n_clone->owner_node_index = VLIB_INVALID_NODE_INDEX;
+  /* Allocate all nodes in single block for speed */
+  new_n_clone =
+    clib_mem_alloc_no_fail (vec_len (nm->nodes) * sizeof (*new_n_clone));
+  for (j = 0; j < vec_len (nm->nodes); j++)
+    {
+      vlib_node_t *old_n_clone;
+      vlib_node_t *new_n;
 
-	  if (j >= vec_len (old_nodes_clone))
-	    {
-	      /* new node, set to zero */
-	      memset (&new_n_clone->stats_total, 0,
-		      sizeof (new_n_clone->stats_total));
-	      memset (&new_n_clone->stats_last_clear, 0,
-		      sizeof (new_n_clone->stats_last_clear));
-	    }
-	  else
-	    {
-	      /* Copy stats if the old data is valid */
-	      clib_memcpy (&new_n_clone->stats_total,
-			   &old_n_clone->stats_total,
-			   sizeof (new_n_clone->stats_total));
-	      clib_memcpy (&new_n_clone->stats_last_clear,
-			   &old_n_clone->stats_last_clear,
-			   sizeof (new_n_clone->stats_last_clear));
-
-	      /* keep previous node state */
-	      new_n_clone->state = old_n_clone->state;
-	    }
-	  vec_add1 (nm_clone->nodes, new_n_clone);
-	}
-      /* Free the old node clone */
-      for (j = 0; j < vec_len (old_nodes_clone); j++)
-	clib_mem_free (old_nodes_clone[j]);
-      vec_free (old_nodes_clone);
-
-
-      /* re-clone internal nodes */
-      old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL];
-      nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] =
-	vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]);
-
-      vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL])
-      {
-	vlib_node_t *n = vlib_get_node (vm, rt->node_index);
-	rt->thread_index = vm_clone->thread_index;
-	/* copy runtime_data, will be overwritten later for existing rt */
-	if (n->runtime_data && n->runtime_data_bytes > 0)
-	  clib_memcpy (rt->runtime_data, n->runtime_data,
-		       clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
-				 n->runtime_data_bytes));
-      }
-
-      for (j = 0; j < vec_len (old_rt); j++)
+      new_n = nm->nodes[j];
+      old_n_clone = old_nodes_clone[j];
+
+      clib_memcpy (new_n_clone, new_n, sizeof (*new_n));
+      /* none of the copied nodes have enqueue rights given out */
+      new_n_clone->owner_node_index = VLIB_INVALID_NODE_INDEX;
+
+      if (j >= vec_len (old_nodes_clone))
 	{
-	  rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index);
-	  rt->state = old_rt[j].state;
-	  clib_memcpy (rt->runtime_data, old_rt[j].runtime_data,
-		       VLIB_NODE_RUNTIME_DATA_SIZE);
+	  /* new node, set to zero */
+	  memset (&new_n_clone->stats_total, 0,
+		  sizeof (new_n_clone->stats_total));
+	  memset (&new_n_clone->stats_last_clear, 0,
+		  sizeof (new_n_clone->stats_last_clear));
 	}
-
-      vec_free (old_rt);
-
-      /* re-clone input nodes */
-      old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT];
-      nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] =
-	vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]);
-
-      vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT])
-      {
-	vlib_node_t *n = vlib_get_node (vm, rt->node_index);
-	rt->thread_index = vm_clone->thread_index;
-	/* copy runtime_data, will be overwritten later for existing rt */
-	if (n->runtime_data && n->runtime_data_bytes > 0)
-	  clib_memcpy (rt->runtime_data, n->runtime_data,
-		       clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
-				 n->runtime_data_bytes));
-      }
-
-      for (j = 0; j < vec_len (old_rt); j++)
+      else
 	{
-	  rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index);
-	  rt->state = old_rt[j].state;
-	  clib_memcpy (rt->runtime_data, old_rt[j].runtime_data,
-		       VLIB_NODE_RUNTIME_DATA_SIZE);
+	  /* Copy stats if the old data is valid */
+	  clib_memcpy (&new_n_clone->stats_total,
+		       &old_n_clone->stats_total,
+		       sizeof (new_n_clone->stats_total));
+	  clib_memcpy (&new_n_clone->stats_last_clear,
+		       &old_n_clone->stats_last_clear,
+		       sizeof (new_n_clone->stats_last_clear));
+
+	  /* keep previous node state */
+	  new_n_clone->state = old_n_clone->state;
 	}
+      vec_add1 (nm_clone->nodes, new_n_clone);
+      new_n_clone++;
+    }
+  /* Free the old node clones */
+  clib_mem_free (old_nodes_clone[0]);
+
+  vec_free (old_nodes_clone);
 
-      vec_free (old_rt);
 
-      nm_clone->processes = vec_dup (nm->processes);
+  /* re-clone internal nodes */
+  old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL];
+  nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] =
+    vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]);
 
-      clib_mem_set_heap (oldheap);
+  vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL])
+  {
+    vlib_node_t *n = vlib_get_node (vm, rt->node_index);
+    rt->thread_index = vm_clone->thread_index;
+    /* copy runtime_data, will be overwritten later for existing rt */
+    if (n->runtime_data && n->runtime_data_bytes > 0)
+      clib_memcpy (rt->runtime_data, n->runtime_data,
+		   clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
+			     n->runtime_data_bytes));
+  }
 
-      // vnet_main_fork_fixup (i);
+  for (j = 0; j < vec_len (old_rt); j++)
+    {
+      rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index);
+      rt->state = old_rt[j].state;
+      clib_memcpy (rt->runtime_data, old_rt[j].runtime_data,
+		   VLIB_NODE_RUNTIME_DATA_SIZE);
     }
+
+  vec_free (old_rt);
+
+  /* re-clone input nodes */
+  old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT];
+  nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] =
+    vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]);
+
+  vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT])
+  {
+    vlib_node_t *n = vlib_get_node (vm, rt->node_index);
+    rt->thread_index = vm_clone->thread_index;
+    /* copy runtime_data, will be overwritten later for existing rt */
+    if (n->runtime_data && n->runtime_data_bytes > 0)
+      clib_memcpy (rt->runtime_data, n->runtime_data,
+		   clib_min (VLIB_NODE_RUNTIME_DATA_SIZE,
+			     n->runtime_data_bytes));
+  }
+
+  for (j = 0; j < vec_len (old_rt); j++)
+    {
+      rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index);
+      rt->state = old_rt[j].state;
+      clib_memcpy (rt->runtime_data, old_rt[j].runtime_data,
+		   VLIB_NODE_RUNTIME_DATA_SIZE);
+    }
+
+  vec_free (old_rt);
+
+  nm_clone->processes = vec_dup (nm->processes);
+}
+
+
+void
+vlib_worker_thread_node_runtime_update (void)
+{
+  /*
+   * Request only.  vlib_worker_thread_barrier_release () tests the flag on
+   * its vm, runs the update and clears it just before lifting the barrier.
+   */
+  vlib_global_main.need_vlib_worker_thread_node_runtime_update = 1;
 }
 
 u32
@@ -1172,6 +1201,8 @@ vlib_worker_thread_barrier_sync (vlib_main_t * vm)
   if (vec_len (vlib_mains) < 2)
     return;
 
+  ASSERT (vlib_get_thread_index () == 0);
+
   count = vec_len (vlib_mains) - 1;
 
   /* Tolerate recursive calls */
@@ -1180,8 +1211,6 @@ vlib_worker_thread_barrier_sync (vlib_main_t * vm)
 
   vlib_worker_threads[0].barrier_sync_count++;
 
-  ASSERT (vlib_get_thread_index () == 0);
-
   deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT;
 
   *vlib_worker_threads->wait_at_barrier = 1;
@@ -1199,13 +1228,29 @@ void
 vlib_worker_thread_barrier_release (vlib_main_t * vm)
 {
   f64 deadline;
+  int refork_needed = 0;
 
   if (vec_len (vlib_mains) < 2)
     return;
 
+  ASSERT (vlib_get_thread_index () == 0);
+
   if (--vlib_worker_threads[0].recursion_level > 0)
     return;
 
+  /* Update (all) node runtimes before releasing the barrier, if needed */
+  if (vm->need_vlib_worker_thread_node_runtime_update)
+    {
+      /* Do stats elements on main thread */
+      worker_thread_node_runtime_update_internal ();
+      vm->need_vlib_worker_thread_node_runtime_update = 0;
+
+      /* Do per thread rebuilds in parallel */
+      refork_needed = 1;
+      clib_smp_atomic_add (vlib_worker_threads->node_reforks_required,
+			   (vec_len (vlib_mains) - 1));
+    }
+
   deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT;
 
   *vlib_worker_threads->wait_at_barrier = 0;
@@ -1218,6 +1263,22 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm)
 	  os_panic ();
 	}
     }
+
+  /* Wait for reforks before continuing */
+  if (refork_needed)
+    {
+      deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT;
+
+      while (*vlib_worker_threads->node_reforks_required > 0)
+	{
+	  if (vlib_time_now (vm) > deadline)
+	    {
+	      fformat (stderr, "%s: worker thread refork deadlock\n",
+		       __FUNCTION__);
+	      os_panic ();
+	    }
+	}
+    }
 }
 
 /*
diff --git a/src/vlib/threads.h b/src/vlib/threads.h
index 572ce77f..c3f1cade 100644
--- a/src/vlib/threads.h
+++ b/src/vlib/threads.h
@@ -102,6 +102,7 @@ typedef struct
   vlib_thread_registration_t *registration;
   u8 *name;
   u64 barrier_sync_count;
+  volatile u32 *node_reforks_required;
 
   long lwp;
   int lcore_id;
@@ -180,6 +181,7 @@ u32 vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts);
 
 void vlib_worker_thread_barrier_sync (vlib_main_t * vm);
 void vlib_worker_thread_barrier_release (vlib_main_t * vm);
+void vlib_worker_thread_node_refork (void);
 
 static_always_inline uword
 vlib_get_thread_index (void)
@@ -369,6 +371,15 @@ vlib_worker_thread_barrier_check (void)
       if (CLIB_DEBUG > 0)
 	vm->parked_at_barrier = 0;
       clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, -1);
+
+      if (PREDICT_FALSE (*vlib_worker_threads->node_reforks_required))
+	{
+	  vlib_worker_thread_node_refork ();
+	  clib_smp_atomic_add (vlib_worker_threads->node_reforks_required,
+			       -1);
+	  while (*vlib_worker_threads->node_reforks_required)
+	    ;
+	}
     }
 }
 
-- 
cgit 1.2.3-korg


From eb1ac1732f15f9a99edbeffeb94c525b9ff25c1d Mon Sep 17 00:00:00 2001
From: Colin Tregenza Dancer <ctd@metaswitch.com>
Date: Wed, 6 Sep 2017 20:23:24 +0100
Subject: Recombine diags and minimum barrier open time changes (VPP-968)
 Support logging to both syslog and elog Also include DaveB is_mp_safe fix,
 which had been lost

Change-Id: If82f7969e2f43c63c3fed5b1a0c7434c90c1f380
Signed-off-by: Colin Tregenza Dancer <ctd@metaswitch.com>
---
 src/vlib/main.h              |   9 ++
 src/vlib/threads.c           | 313 +++++++++++++++++++++++++++++++++++++++++--
 src/vlib/threads.h           |  29 +++-
 src/vlibapi/api_common.h     |   6 +
 src/vlibapi/api_shared.c     |  10 +-
 src/vlibmemory/memory_vlib.c |   5 +
 src/vpp/vnet/main.c          |   8 ++
 7 files changed, 367 insertions(+), 13 deletions(-)

(limited to 'src/vlib/threads.c')

diff --git a/src/vlib/main.h b/src/vlib/main.h
index 4c0cde3f..fb67334e 100644
--- a/src/vlib/main.h
+++ b/src/vlib/main.h
@@ -199,6 +199,15 @@ typedef struct vlib_main_t
    */
   int need_vlib_worker_thread_node_runtime_update;
 
+  /*
+   * Barrier epoch - Set to current time, each time barrier_sync or
+   * barrier_release is called with zero recursion.
+   */
+  f64 barrier_epoch;
+
+  /* Earliest barrier can be closed again */
+  f64 barrier_no_close_before;
+
 } vlib_main_t;
 
 /* Global main structure. */
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index 6cd325b3..2d9ce84a 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -35,6 +35,222 @@ vl (void *p)
 vlib_worker_thread_t *vlib_worker_threads;
 vlib_thread_main_t vlib_thread_main;
 
+/*
+ * Barrier tracing can be enabled on a normal build to collect information
+ * on barrier use, including timings and call stacks.  Deliberately not
+ * keyed off CLIB_DEBUG, because that can add significant overhead which
+ * impacts observed timings.
+ */
+
+#ifdef BARRIER_TRACING
+ /*
+  * Output of barrier tracing can be to syslog or elog as suits
+  */
+#ifdef BARRIER_TRACING_ELOG
+static u32
+elog_id_for_msg_name (const char *msg_name)
+{
+  uword *p, r;
+  static uword *h;
+  u8 *name_copy;
+  /* Return the elog_string () result for msg_name, cached in a static hash */
+  if (!h)
+    h = hash_create_string (0, sizeof (uword));
+  /* Cache hit: reuse the value stored by an earlier call */
+  p = hash_get_mem (h, msg_name);
+  if (p)
+    return p[0];
+  r = elog_string (&vlib_global_main.elog_main, "%s", msg_name);
+  /* Key the hash on a private NUL-terminated copy, not the caller's pointer */
+  name_copy = format (0, "%s%c", msg_name, 0);
+
+  hash_set_mem (h, name_copy, r);
+
+  return r;
+}
+
+  /*
+   * elog Barrier trace functions, which are nulled out if BARRIER_TRACING isn't
+   * defined
+   */
+
+static inline void
+barrier_trace_sync (f64 t_entry, f64 t_open, f64 t_closed)
+{
+    /* *INDENT-OFF* */
+    ELOG_TYPE_DECLARE (e) =
+      {
+        .format = "barrier <%d#%s(O:%dus:%dus)(%dus)",
+        .format_args = "i4T4i4i4i4",
+      };
+    /* *INDENT-ON* */
+  struct
+  {
+    u32 count, caller, t_entry, t_open, t_closed;	/* one per format arg */
+  } *ed = 0;
+  /* Log sync count, caller name id, and the three times scaled by 1e6 */
+  ed = ELOG_DATA (&vlib_global_main.elog_main, e);
+  ed->count = (int) vlib_worker_threads[0].barrier_sync_count;
+  ed->caller = elog_id_for_msg_name (vlib_worker_threads[0].barrier_caller);
+  ed->t_entry = (int) (1000000.0 * t_entry);
+  ed->t_open = (int) (1000000.0 * t_open);
+  ed->t_closed = (int) (1000000.0 * t_closed);
+}
+
+static inline void
+barrier_trace_sync_rec (f64 t_entry)
+{
+    /* *INDENT-OFF* */
+    ELOG_TYPE_DECLARE (e) =
+      {
+        .format = "barrier    <%d(%dus)%s",
+        .format_args = "i4i4T4",
+      };
+    /* *INDENT-ON* */
+  struct
+  {
+    u32 depth, t_entry, caller;
+  } *ed = 0;
+  /* Nested sync: log recursion_level - 1, scaled entry time and caller id */
+  ed = ELOG_DATA (&vlib_global_main.elog_main, e);
+  ed->depth = (int) vlib_worker_threads[0].recursion_level - 1;
+  ed->t_entry = (int) (1000000.0 * t_entry);
+  ed->caller = elog_id_for_msg_name (vlib_worker_threads[0].barrier_caller);
+}
+
+static inline void
+barrier_trace_release_rec (f64 t_entry)
+{
+    /* *INDENT-OFF* */
+    ELOG_TYPE_DECLARE (e) =
+      {
+        .format = "barrier      (%dus)%d>",
+        .format_args = "i4i4",
+      };
+    /* *INDENT-ON* */
+  struct
+  {
+    u32 t_entry, depth;
+  } *ed = 0;
+  /* Nested release: log scaled entry time and the current recursion_level */
+  ed = ELOG_DATA (&vlib_global_main.elog_main, e);
+  ed->t_entry = (int) (1000000.0 * t_entry);
+  ed->depth = (int) vlib_worker_threads[0].recursion_level;
+}
+
+static inline void
+barrier_trace_release (f64 t_entry, f64 t_closed_total, f64 t_update_main)
+{
+    /* *INDENT-OFF* */
+    ELOG_TYPE_DECLARE (e) =
+      {
+        .format = "barrier   (%dus){%d}(C:%dus)#%d>",
+        .format_args = "i4i4i4i4",
+      };
+    /* *INDENT-ON* */
+  struct
+  {
+    u32 t_entry, t_update_main, t_closed_total, count;
+  } *ed = 0;
+  /* Fields follow the format string order, not the parameter order */
+  ed = ELOG_DATA (&vlib_global_main.elog_main, e);
+  ed->t_entry = (int) (1000000.0 * t_entry);
+  ed->t_update_main = (int) (1000000.0 * t_update_main);
+  ed->t_closed_total = (int) (1000000.0 * t_closed_total);
+  ed->count = (int) vlib_worker_threads[0].barrier_sync_count;
+
+  /* Reset context for next trace */
+  vlib_worker_threads[0].barrier_context = NULL;
+}
+#else
+char barrier_trace[65536];	/* text of the trace record being assembled */
+char *btp = barrier_trace;	/* next write position inside barrier_trace */
+
+  /*
+   * syslog Barrier trace functions, which are nulled out if BARRIER_TRACING
+   * isn't defined
+   */
+
+/* Outermost sync: start a record "<count#caller[context](O:..)(..):" */
+static inline void
+barrier_trace_sync (f64 t_entry, f64 t_open, f64 t_closed)
+{
+  btp += sprintf (btp, "<%u#%s",
+		  (unsigned int) vlib_worker_threads[0].barrier_sync_count,
+		  vlib_worker_threads[0].barrier_caller);
+  /* The [context] part is only written when a context has been set */
+  if (vlib_worker_threads[0].barrier_context)
+    {
+      btp += sprintf (btp, "[%s]", vlib_worker_threads[0].barrier_context);
+
+    }
+  /* Times are scaled by 1e6 and printed with a "us" suffix */
+  btp += sprintf (btp, "(O:%dus:%dus)(%dus):",
+		  (int) (1000000.0 * t_entry),
+		  (int) (1000000.0 * t_open), (int) (1000000.0 * t_closed));
+  /* NOTE(review): sprintf is unbounded; btp is never checked against the buffer end */
+}
+/* Nested sync: append "<depth(entry us)caller:" to the pending trace text */
+static inline void
+barrier_trace_sync_rec (f64 t_entry)
+{
+  btp += sprintf (btp, "<%u(%dus)%s:",
+		  (int) vlib_worker_threads[0].recursion_level - 1,
+		  (int) (1000000.0 * t_entry),
+		  vlib_worker_threads[0].barrier_caller);
+}
+/* Nested release: append ":(entry us)level>" to the pending trace text */
+static inline void
+barrier_trace_release_rec (f64 t_entry)
+{
+  btp += sprintf (btp, ":(%dus)%u>", (int) (1000000.0 * t_entry),
+		  (int) vlib_worker_threads[0].recursion_level);
+}
+
+static inline void
+barrier_trace_release (f64 t_entry, f64 t_closed_total, f64 t_update_main)
+{
+  /* Outermost release: finish the record, print it, rewind the buffer */
+  btp += sprintf (btp, ":(%dus)", (int) (1000000.0 * t_entry));
+  if (t_update_main > 0)
+    {
+      btp += sprintf (btp, "{%dus}", (int) (1000000.0 * t_update_main));
+    }
+  /* The {..} field above is left out unless t_update_main > 0 */
+  btp += sprintf (btp, "(C:%dus)#%u>",
+		  (int) (1000000.0 * t_closed_total),
+		  (int) vlib_worker_threads[0].barrier_sync_count);
+
+  /* Emit the finished record on stderr, and reset for next trace */
+  fformat (stderr, "BTRC %s\n", barrier_trace);
+  btp = barrier_trace;
+  vlib_worker_threads[0].barrier_context = NULL;
+}
+#endif
+#else
+
+  /* BARRIER_TRACING undefined: empty stubs, so call sites need no #ifdef */
+static inline void
+barrier_trace_sync (f64 t_entry, f64 t_open, f64 t_closed)
+{
+}
+
+static inline void
+barrier_trace_sync_rec (f64 t_entry)
+{
+}
+
+static inline void
+barrier_trace_release_rec (f64 t_entry)
+{
+}
+
+static inline void
+barrier_trace_release (f64 t_entry, f64 t_closed_total, f64 t_update_main)
+{
+}
+#endif /* BARRIER_TRACING */
+
 uword
 os_get_nthreads (void)
 {
@@ -558,6 +774,10 @@ start_workers (vlib_main_t * vm)
       *vlib_worker_threads->node_reforks_required = 0;
       vm->need_vlib_worker_thread_node_runtime_update = 0;
 
+      /* init timing */
+      vm->barrier_epoch = 0;
+      vm->barrier_no_close_before = 0;
+
       worker_thread_index = 1;
 
       for (i = 0; i < vec_len (tm->registrations); i++)
@@ -790,6 +1010,7 @@ start_workers (vlib_main_t * vm)
 
 VLIB_MAIN_LOOP_ENTER_FUNCTION (start_workers);
 
+
 static inline void
 worker_thread_node_runtime_update_internal (void)
 {
@@ -993,7 +1214,6 @@ vlib_worker_thread_node_refork (void)
   nm_clone->processes = vec_dup (nm->processes);
 }
 
-
 void
 vlib_worker_thread_node_runtime_update (void)
 {
@@ -1192,10 +1412,29 @@ vlib_worker_thread_fork_fixup (vlib_fork_fixup_t which)
   vlib_worker_thread_barrier_release (vm);
 }
 
+  /*
+   * Enforce minimum open time to minimize packet loss due to Rx overflow,
+   * based on a test-based heuristic that the barrier should be open for at
+   * least 3 times as long as it is closed (with an upper bound of 1ms because
+   * by that point it is probably too late to make a difference)
+   */
+
+#ifndef BARRIER_MINIMUM_OPEN_LIMIT
+#define BARRIER_MINIMUM_OPEN_LIMIT 0.001
+#endif
+
+#ifndef BARRIER_MINIMUM_OPEN_FACTOR
+#define BARRIER_MINIMUM_OPEN_FACTOR 3
+#endif
+
 void
-vlib_worker_thread_barrier_sync (vlib_main_t * vm)
+vlib_worker_thread_barrier_sync_int (vlib_main_t * vm)
 {
   f64 deadline;
+  f64 now;
+  f64 t_entry;
+  f64 t_open;
+  f64 t_closed;
   u32 count;
 
   if (vec_len (vlib_mains) < 2)
@@ -1205,29 +1444,55 @@ vlib_worker_thread_barrier_sync (vlib_main_t * vm)
 
   count = vec_len (vlib_mains) - 1;
 
+  /* Record entry relative to last close */
+  now = vlib_time_now (vm);
+  t_entry = now - vm->barrier_epoch;
+
   /* Tolerate recursive calls */
   if (++vlib_worker_threads[0].recursion_level > 1)
-    return;
+    {
+      barrier_trace_sync_rec (t_entry);
+      return;
+    }
 
   vlib_worker_threads[0].barrier_sync_count++;
 
-  deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT;
+  /* Enforce minimum barrier open time to minimize packet loss */
+  ASSERT (vm->barrier_no_close_before <= (now + BARRIER_MINIMUM_OPEN_LIMIT));
+  while ((now = vlib_time_now (vm)) < vm->barrier_no_close_before)
+    ;
+
+  /* Record time of closure */
+  t_open = now - vm->barrier_epoch;
+  vm->barrier_epoch = now;
+
+  deadline = now + BARRIER_SYNC_TIMEOUT;
 
   *vlib_worker_threads->wait_at_barrier = 1;
   while (*vlib_worker_threads->workers_at_barrier != count)
     {
-      if (vlib_time_now (vm) > deadline)
+      if ((now = vlib_time_now (vm)) > deadline)
 	{
 	  fformat (stderr, "%s: worker thread deadlock\n", __FUNCTION__);
 	  os_panic ();
 	}
     }
+
+  t_closed = now - vm->barrier_epoch;
+
+  barrier_trace_sync (t_entry, t_open, t_closed);
+
 }
 
 void
 vlib_worker_thread_barrier_release (vlib_main_t * vm)
 {
   f64 deadline;
+  f64 now;
+  f64 minimum_open;
+  f64 t_entry;
+  f64 t_closed_total;
+  f64 t_update_main = 0.0;
   int refork_needed = 0;
 
   if (vec_len (vlib_mains) < 2)
@@ -1235,8 +1500,15 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm)
 
   ASSERT (vlib_get_thread_index () == 0);
 
+
+  now = vlib_time_now (vm);
+  t_entry = now - vm->barrier_epoch;
+
   if (--vlib_worker_threads[0].recursion_level > 0)
-    return;
+    {
+      barrier_trace_release_rec (t_entry);
+      return;
+    }
 
   /* Update (all) node runtimes before releasing the barrier, if needed */
   if (vm->need_vlib_worker_thread_node_runtime_update)
@@ -1249,15 +1521,17 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm)
       refork_needed = 1;
       clib_smp_atomic_add (vlib_worker_threads->node_reforks_required,
 			   (vec_len (vlib_mains) - 1));
+      now = vlib_time_now (vm);
+      t_update_main = now - vm->barrier_epoch;
     }
 
-  deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT;
+  deadline = now + BARRIER_SYNC_TIMEOUT;
 
   *vlib_worker_threads->wait_at_barrier = 0;
 
   while (*vlib_worker_threads->workers_at_barrier > 0)
     {
-      if (vlib_time_now (vm) > deadline)
+      if ((now = vlib_time_now (vm)) > deadline)
 	{
 	  fformat (stderr, "%s: worker thread deadlock\n", __FUNCTION__);
 	  os_panic ();
@@ -1267,11 +1541,13 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm)
   /* Wait for reforks before continuing */
   if (refork_needed)
     {
-      deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT;
+      now = vlib_time_now (vm);
+
+      deadline = now + BARRIER_SYNC_TIMEOUT;
 
       while (*vlib_worker_threads->node_reforks_required > 0)
 	{
-	  if (vlib_time_now (vm) > deadline)
+	  if ((now = vlib_time_now (vm)) > deadline)
 	    {
 	      fformat (stderr, "%s: worker thread refork deadlock\n",
 		       __FUNCTION__);
@@ -1279,6 +1555,23 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm)
 	    }
 	}
     }
+
+  t_closed_total = now - vm->barrier_epoch;
+
+  minimum_open = t_closed_total * BARRIER_MINIMUM_OPEN_FACTOR;
+
+  if (minimum_open > BARRIER_MINIMUM_OPEN_LIMIT)
+    {
+      minimum_open = BARRIER_MINIMUM_OPEN_LIMIT;
+    }
+
+  vm->barrier_no_close_before = now + minimum_open;
+
+  /* Record barrier epoch (used to enforce minimum open time) */
+  vm->barrier_epoch = now;
+
+  barrier_trace_release (t_entry, t_closed_total, t_update_main);
+
 }
 
 /*
diff --git a/src/vlib/threads.h b/src/vlib/threads.h
index c3f1cade..72340ee1 100644
--- a/src/vlib/threads.h
+++ b/src/vlib/threads.h
@@ -18,6 +18,22 @@
 #include <vlib/main.h>
 #include <linux/sched.h>
 
+/*
+ * To enable detailed tracing of barrier usage, including call stacks and
+ * timings, define BARRIER_TRACING here or in relevant TAGS.  If also used
+ * with CLIB_DEBUG, timing will _not_ be representative of normal code
+ * execution.
+ *
+ */
+
+// #define BARRIER_TRACING 1
+
+/*
+ * Two options for barrier tracing output: syslog & elog.
+ */
+
+// #define BARRIER_TRACING_ELOG 1
+
 extern vlib_main_t **vlib_mains;
 
 void vlib_set_thread_name (char *name);
@@ -102,6 +118,10 @@ typedef struct
   vlib_thread_registration_t *registration;
   u8 *name;
   u64 barrier_sync_count;
+#ifdef BARRIER_TRACING
+  const char *barrier_caller;
+  const char *barrier_context;
+#endif
   volatile u32 *node_reforks_required;
 
   long lwp;
@@ -179,7 +199,14 @@ u32 vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts);
 #define BARRIER_SYNC_TIMEOUT (1.0)
 #endif
 
-void vlib_worker_thread_barrier_sync (vlib_main_t * vm);
+#ifdef BARRIER_TRACING
+#define vlib_worker_thread_barrier_sync(X) {vlib_worker_threads[0].barrier_caller=__FUNCTION__;vlib_worker_thread_barrier_sync_int(X);}
+#else
+#define vlib_worker_thread_barrier_sync(X) vlib_worker_thread_barrier_sync_int(X)
+#endif
+
+
+void vlib_worker_thread_barrier_sync_int (vlib_main_t * vm);
 void vlib_worker_thread_barrier_release (vlib_main_t * vm);
 void vlib_worker_thread_node_refork (void);
 
diff --git a/src/vlibapi/api_common.h b/src/vlibapi/api_common.h
index 651566ae..bbeccfc2 100644
--- a/src/vlibapi/api_common.h
+++ b/src/vlibapi/api_common.h
@@ -144,6 +144,12 @@ void vl_msg_api_queue_handler (unix_shared_memory_queue_t * q);
 
 void vl_msg_api_barrier_sync (void) __attribute__ ((weak));
 void vl_msg_api_barrier_release (void) __attribute__ ((weak));
+#ifdef BARRIER_TRACING		/* hook for naming the message behind a barrier */
+void vl_msg_api_barrier_trace_context (const char *context)
+  __attribute__ ((weak));
+#else				/* tracing off: calls expand to nothing */
+#define vl_msg_api_barrier_trace_context(X)
+#endif				/* BARRIER_TRACING */
 void vl_msg_api_free (void *);
 void vl_noop_handler (void *mp);
 void vl_msg_api_increment_missing_client_counter (void);
diff --git a/src/vlibapi/api_shared.c b/src/vlibapi/api_shared.c
index 5c1a9940..59dc2375 100644
--- a/src/vlibapi/api_shared.c
+++ b/src/vlibapi/api_shared.c
@@ -418,7 +418,10 @@ msg_handler_internal (api_main_t * am,
       if (do_it)
 	{
 	  if (!am->is_mp_safe[id])
-	    vl_msg_api_barrier_sync ();
+	    {
+	      vl_msg_api_barrier_trace_context (am->msg_names[id]);
+	      vl_msg_api_barrier_sync ();
+	    }
 	  (*am->msg_handlers[id]) (the_msg);
 	  if (!am->is_mp_safe[id])
 	    vl_msg_api_barrier_release ();
@@ -498,7 +501,10 @@ vl_msg_api_handler_with_vm_node (api_main_t * am,
 	vl_msg_api_trace (am, am->rx_trace, the_msg);
 
       if (!am->is_mp_safe[id])
-	vl_msg_api_barrier_sync ();
+	{
+	  vl_msg_api_barrier_trace_context (am->msg_names[id]);
+	  vl_msg_api_barrier_sync ();
+	}
       (*handler) (the_msg, vm, node);
       if (!am->is_mp_safe[id])
 	vl_msg_api_barrier_release ();
diff --git a/src/vlibmemory/memory_vlib.c b/src/vlibmemory/memory_vlib.c
index 688ce604..55a90d64 100644
--- a/src/vlibmemory/memory_vlib.c
+++ b/src/vlibmemory/memory_vlib.c
@@ -1462,6 +1462,7 @@ _(TRACE_PLUGIN_MSG_IDS,trace_plugin_msg_ids)
 static clib_error_t *
 rpc_api_hookup (vlib_main_t * vm)
 {
+  api_main_t *am = &api_main;
 #define _(N,n)                                                  \
     vl_msg_api_set_handlers(VL_API_##N, #n,                     \
                            vl_api_##n##_t_handler,              \
@@ -1481,6 +1482,10 @@ rpc_api_hookup (vlib_main_t * vm)
                            sizeof(vl_api_##n##_t), 1 /* do trace */);
   foreach_plugin_trace_msg;
 #undef _
+
+  /* No reason to halt the parade to create a trace record... */
+  am->is_mp_safe[VL_API_TRACE_PLUGIN_MSG_IDS] = 1;
+
   return 0;
 }
 
diff --git a/src/vpp/vnet/main.c b/src/vpp/vnet/main.c
index 76371dbe..b330f60f 100644
--- a/src/vpp/vnet/main.c
+++ b/src/vpp/vnet/main.c
@@ -294,6 +294,14 @@ os_exit (int code)
   exit (code);
 }
 
+#ifdef BARRIER_TRACING		/* definition of the weak hook in api_common.h */
+void
+vl_msg_api_barrier_trace_context (const char *context)
+{
+  vlib_worker_threads[0].barrier_context = context;	/* pointer kept, not copied */
+}
+#endif				/* BARRIER_TRACING */
+
 void
 vl_msg_api_barrier_sync (void)
 {
-- 
cgit 1.2.3-korg


From 01914ce45729833cec88c65689de9a0336cd40cc Mon Sep 17 00:00:00 2001
From: Damjan Marion <damarion@cisco.com>
Date: Thu, 14 Sep 2017 19:04:50 +0200
Subject: vppinfra: add clib_mem_vm_ext_alloc function

Change-Id: Iff33694fc42cc3bcc73cf1372339053a6365039c
Signed-off-by: Damjan Marion <damarion@cisco.com>
---
 src/plugins/dpdk/device/init.c         |   6 +-
 src/plugins/memif/memif.c              |  21 ++-
 src/vlib.am                            |   5 +-
 src/vlib/linux/pci.c                   |  25 ++--
 src/vlib/linux/physmem.c               | 192 ++++--------------------
 src/vlib/linux/syscall.h               |  58 --------
 src/vlib/linux/sysfs.c                 | 250 -------------------------------
 src/vlib/linux/sysfs.h                 |  44 ------
 src/vlib/threads.c                     |   6 +-
 src/vlib/threads_cli.c                 |   6 +-
 src/vnet/devices/af_packet/af_packet.c |   4 +-
 src/vppinfra.am                        |   5 +-
 src/vppinfra/linux/mem.c               | 260 +++++++++++++++++++++++++++++++++
 src/vppinfra/linux/syscall.h           |  56 +++++++
 src/vppinfra/linux/sysfs.c             | 250 +++++++++++++++++++++++++++++++
 src/vppinfra/linux/sysfs.h             |  46 ++++++
 src/vppinfra/mem.h                     |  94 ++++++++++--
 src/vppinfra/vm_linux_kernel.h         |  78 ----------
 src/vppinfra/vm_standalone.h           |  74 ----------
 src/vppinfra/vm_unix.h                 | 106 --------------
 20 files changed, 761 insertions(+), 825 deletions(-)
 delete mode 100644 src/vlib/linux/syscall.h
 delete mode 100644 src/vlib/linux/sysfs.c
 delete mode 100644 src/vlib/linux/sysfs.h
 create mode 100644 src/vppinfra/linux/mem.c
 create mode 100644 src/vppinfra/linux/syscall.h
 create mode 100644 src/vppinfra/linux/sysfs.c
 create mode 100644 src/vppinfra/linux/sysfs.h
 delete mode 100644 src/vppinfra/vm_linux_kernel.h
 delete mode 100644 src/vppinfra/vm_standalone.h
 delete mode 100644 src/vppinfra/vm_unix.h

(limited to 'src/vlib/threads.c')

diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c
index 95176fb8..ee61f94e 100755
--- a/src/plugins/dpdk/device/init.c
+++ b/src/plugins/dpdk/device/init.c
@@ -17,7 +17,7 @@
 #include <vppinfra/error.h>
 #include <vppinfra/format.h>
 #include <vppinfra/bitmap.h>
-#include <vlib/linux/sysfs.h>
+#include <vppinfra/linux/sysfs.h>
 #include <vlib/unix/unix.h>
 
 #include <vnet/ethernet/ethernet.h>
@@ -1040,7 +1040,7 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input)
 	  mem = mem_by_socket[c];
 
 	  page_size = 1024;
-	  e = vlib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail);
+	  e = clib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail);
 
 	  if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem)
 	    use_1g = 0;
@@ -1049,7 +1049,7 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input)
 	   clib_error_free (e);
 
 	  page_size = 2;
-	  e = vlib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail);
+	  e = clib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail);
 
 	  if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem)
 	    use_2m = 0;
diff --git a/src/plugins/memif/memif.c b/src/plugins/memif/memif.c
index 8fec409a..6a609a57 100644
--- a/src/plugins/memif/memif.c
+++ b/src/plugins/memif/memif.c
@@ -33,7 +33,7 @@
 
 #include <vlib/vlib.h>
 #include <vlib/unix/unix.h>
-#include <vlib/linux/syscall.h>
+#include <vppinfra/linux/syscall.h>
 #include <vnet/plugin/plugin.h>
 #include <vnet/ethernet/ethernet.h>
 #include <vpp/app/version.h>
@@ -267,6 +267,8 @@ memif_init_regions_and_queues (memif_if_t * mif)
   int i, j;
   u64 buffer_offset;
   memif_region_t *r;
+  clib_mem_vm_alloc_t alloc = { 0 };
+  clib_error_t *err;
 
   vec_validate_aligned (mif->regions, 0, CLIB_CACHE_LINE_BYTES);
   r = vec_elt_at_index (mif->regions, 0);
@@ -279,18 +281,15 @@ memif_init_regions_and_queues (memif_if_t * mif)
     mif->run.buffer_size * (1 << mif->run.log2_ring_size) *
     (mif->run.num_s2m_rings + mif->run.num_m2s_rings);
 
-  if ((r->fd = memfd_create ("memif region 0", MFD_ALLOW_SEALING)) == -1)
-    return clib_error_return_unix (0, "memfd_create");
-
-  if ((fcntl (r->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1)
-    return clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)");
+  alloc.name = "memif region";
+  alloc.size = r->region_size;
+  alloc.flags = CLIB_MEM_VM_F_SHARED;
 
-  if ((ftruncate (r->fd, r->region_size)) == -1)
-    return clib_error_return_unix (0, "ftruncate");
+  err = clib_mem_vm_ext_alloc (&alloc);
+  if (err)
+    return err;
 
-  if ((r->shm = mmap (NULL, r->region_size, PROT_READ | PROT_WRITE,
-		      MAP_SHARED, r->fd, 0)) == MAP_FAILED)
-    return clib_error_return_unix (0, "mmap");
+  r->fd = alloc.fd;
 
   for (i = 0; i < mif->run.num_s2m_rings; i++)
     {
diff --git a/src/vlib.am b/src/vlib.am
index 41d68690..067e4afc 100644
--- a/src/vlib.am
+++ b/src/vlib.am
@@ -13,7 +13,7 @@
 
 
 lib_LTLIBRARIES += libvlib.la
-libvlib_la_LIBADD = libvppinfra.la -ldl -lpthread -lnuma
+libvlib_la_LIBADD = libvppinfra.la -ldl -lpthread
 libvlib_la_DEPENDENCIES = libvppinfra.la
 
 BUILT_SOURCES += vlib/config.h
@@ -34,7 +34,6 @@ libvlib_la_SOURCES =				\
   vlib/init.c					\
   vlib/linux/pci.c				\
   vlib/linux/physmem.c				\
-  vlib/linux/sysfs.c				\
   vlib/main.c					\
   vlib/mc.c					\
   vlib/node.c					\
@@ -60,8 +59,6 @@ nobase_include_HEADERS +=			\
   vlib/global_funcs.h				\
   vlib/i2c.h					\
   vlib/init.h					\
-  vlib/linux/sysfs.h				\
-  vlib/linux/syscall.h				\
   vlib/main.h					\
   vlib/mc.h					\
   vlib/node_funcs.h				\
diff --git a/src/vlib/linux/pci.c b/src/vlib/linux/pci.c
index 4ce19190..790f168a 100644
--- a/src/vlib/linux/pci.c
+++ b/src/vlib/linux/pci.c
@@ -37,10 +37,11 @@
  *  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
+#include <vppinfra/linux/sysfs.h>
+
 #include <vlib/vlib.h>
 #include <vlib/pci/pci.h>
 #include <vlib/unix/unix.h>
-#include <vlib/linux/sysfs.h>
 
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -104,7 +105,7 @@ vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name)
 			     format_vlib_pci_addr, &d->bus_address);
 
   s = format (s, "%v/driver%c", dev_dir_name, 0);
-  driver_name = vlib_sysfs_link_to_name ((char *) s);
+  driver_name = clib_sysfs_link_to_name ((char *) s);
   vec_reset_length (s);
 
   if (driver_name &&
@@ -183,32 +184,32 @@ vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name)
   vec_reset_length (s);
 
   s = format (s, "%v/driver/unbind%c", dev_dir_name, 0);
-  vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address);
+  clib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address);
   vec_reset_length (s);
 
   s = format (s, "%v/driver_override%c", dev_dir_name, 0);
   if (access ((char *) s, F_OK) == 0)
     {
-      vlib_sysfs_write ((char *) s, "%s", uio_driver_name);
+      clib_sysfs_write ((char *) s, "%s", uio_driver_name);
       clear_driver_override = 1;
     }
   else
     {
       vec_reset_length (s);
       s = format (s, "/sys/bus/pci/drivers/%s/new_id%c", uio_driver_name, 0);
-      vlib_sysfs_write ((char *) s, "0x%04x 0x%04x", d->vendor_id,
+      clib_sysfs_write ((char *) s, "0x%04x 0x%04x", d->vendor_id,
 			d->device_id);
     }
   vec_reset_length (s);
 
   s = format (s, "/sys/bus/pci/drivers/%s/bind%c", uio_driver_name, 0);
-  vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address);
+  clib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address);
   vec_reset_length (s);
 
   if (clear_driver_override)
     {
       s = format (s, "%v/driver_override%c", dev_dir_name, 0);
-      vlib_sysfs_write ((char *) s, "%c", 0);
+      clib_sysfs_write ((char *) s, "%c", 0);
       vec_reset_length (s);
     }
 
@@ -602,28 +603,28 @@ scan_device (void *arg, u8 * dev_dir_name, u8 * ignored)
   dev->numa_node = -1;
   vec_reset_length (f);
   f = format (f, "%v/numa_node%c", dev_dir_name, 0);
-  vlib_sysfs_read ((char *) f, "%u", &dev->numa_node);
+  clib_sysfs_read ((char *) f, "%u", &dev->numa_node);
 
   vec_reset_length (f);
   f = format (f, "%v/class%c", dev_dir_name, 0);
-  vlib_sysfs_read ((char *) f, "0x%x", &tmp);
+  clib_sysfs_read ((char *) f, "0x%x", &tmp);
   dev->device_class = tmp >> 8;
 
   vec_reset_length (f);
   f = format (f, "%v/vendor%c", dev_dir_name, 0);
-  vlib_sysfs_read ((char *) f, "0x%x", &tmp);
+  clib_sysfs_read ((char *) f, "0x%x", &tmp);
   dev->vendor_id = tmp;
 
   vec_reset_length (f);
   f = format (f, "%v/device%c", dev_dir_name, 0);
-  vlib_sysfs_read ((char *) f, "0x%x", &tmp);
+  clib_sysfs_read ((char *) f, "0x%x", &tmp);
   dev->device_id = tmp;
 
   error = init_device (vm, dev, &pdev);
 
   vec_reset_length (f);
   f = format (f, "%v/driver%c", dev_dir_name, 0);
-  dev->driver_name = vlib_sysfs_link_to_name ((char *) f);
+  dev->driver_name = clib_sysfs_link_to_name ((char *) f);
 
 done:
   vec_free (f);
diff --git a/src/vlib/linux/physmem.c b/src/vlib/linux/physmem.c
index d8c5dc9b..3cc42a06 100644
--- a/src/vlib/linux/physmem.c
+++ b/src/vlib/linux/physmem.c
@@ -43,14 +43,12 @@
 #include <sys/mman.h>
 #include <sys/fcntl.h>
 #include <sys/stat.h>
-#include <numa.h>
-#include <numaif.h>
 
+#include <vppinfra/linux/syscall.h>
+#include <vppinfra/linux/sysfs.h>
 #include <vlib/vlib.h>
 #include <vlib/physmem.h>
 #include <vlib/unix/unix.h>
-#include <vlib/linux/syscall.h>
-#include <vlib/linux/sysfs.h>
 
 static void *
 unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx,
@@ -111,31 +109,6 @@ unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x)
   mheap_put (pr->heap, x - pr->heap);
 }
 
-static u64
-get_page_paddr (int fd, uword addr)
-{
-  int pagesize = sysconf (_SC_PAGESIZE);
-  u64 seek, pagemap = 0;
-
-  seek = ((u64) addr / pagesize) * sizeof (u64);
-  if (lseek (fd, seek, SEEK_SET) != seek)
-    {
-      clib_unix_warning ("lseek to 0x%llx", seek);
-      return 0;
-    }
-  if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap)))
-    {
-      clib_unix_warning ("read ptbits");
-      return 0;
-    }
-  if ((pagemap & (1ULL << 63)) == 0)
-    return 0;
-
-  pagemap &= pow2_mask (55);
-
-  return pagemap * pagesize;
-}
-
 static clib_error_t *
 unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size,
 			   u8 numa_node, u32 flags,
@@ -144,13 +117,8 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size,
   vlib_physmem_main_t *vpm = &vm->physmem_main;
   vlib_physmem_region_t *pr;
   clib_error_t *error = 0;
-  int pagemap_fd = -1;
-  u8 *mount_dir = 0;
-  u8 *filename = 0;
-  struct stat st;
-  int old_mpol;
-  int mmap_flags;
-  struct bitmask *old_mask = numa_allocate_nodemask ();
+  clib_mem_vm_alloc_t alloc = { 0 };
+
 
   if (geteuid () != 0 && (flags & VLIB_PHYSMEM_F_FAKE) == 0)
     return clib_error_return (0, "not allowed");
@@ -163,113 +131,32 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size,
       goto error;
     }
 
-  pr->index = pr - vpm->regions;
-  pr->fd = -1;
-  pr->flags = flags;
-
-  if (get_mempolicy (&old_mpol, old_mask->maskp, old_mask->size + 1, NULL, 0)
-      == -1)
-    {
-      if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
-	{
-	  error = clib_error_return_unix (0, "get_mempolicy");
-	  goto error;
-	}
-      else
-	old_mpol = -1;
-    }
+  alloc.name = name;
+  alloc.size = size;
+  alloc.numa_node = numa_node;
+  alloc.flags = CLIB_MEM_VM_F_SHARED;
 
   if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
     {
-      if ((pagemap_fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1)
-	{
-	  error = clib_error_return_unix (0, "open '/proc/self/pagemap'");
-	  goto error;
-	}
-
-      mount_dir = format (0, "%s/physmem_region%d%c",
-			  vlib_unix_get_runtime_dir (), pr->index, 0);
-      filename = format (0, "%s/mem%c", mount_dir, 0);
-
-      unlink ((char *) mount_dir);
-
-      error = vlib_unix_recursive_mkdir ((char *) mount_dir);
-      if (error)
-	goto error;
-
-      if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL))
-	{
-	  error = clib_error_return_unix (0, "mount hugetlb directory '%s'",
-					  mount_dir);
-	  goto error;
-	}
-
-      if ((pr->fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1)
-	{
-	  error = clib_error_return_unix (0, "open");
-	  goto error;
-	}
-
-      mmap_flags = MAP_SHARED | MAP_HUGETLB | MAP_LOCKED;
+      alloc.flags |= CLIB_MEM_VM_F_HUGETLB;
+      alloc.flags |= CLIB_MEM_VM_F_HUGETLB_PREALLOC;
+      alloc.flags |= CLIB_MEM_VM_F_NUMA_FORCE;
     }
   else
     {
-      if ((pr->fd = memfd_create (name, MFD_ALLOW_SEALING)) == -1)
-	return clib_error_return_unix (0, "memfd_create");
-
-      if ((fcntl (pr->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1)
-	{
-	  error =
-	    clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)");
-	  goto error;
-	}
-      mmap_flags = MAP_SHARED;
-    }
-
-  if (fstat (pr->fd, &st))
-    {
-      error = clib_error_return_unix (0, "fstat");
-      goto error;
-    }
-
-  pr->log2_page_size = min_log2 (st.st_blksize);
-  pr->n_pages = ((size - 1) >> pr->log2_page_size) + 1;
-  size = pr->n_pages * (1 << pr->log2_page_size);
-
-  if ((ftruncate (pr->fd, size)) == -1)
-    {
-      error = clib_error_return_unix (0, "ftruncate length: %d", size);
-      goto error;
-    }
-
-  if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
-    {
-      error = vlib_sysfs_prealloc_hugepages (numa_node,
-					     1 << (pr->log2_page_size - 10),
-					     pr->n_pages);
-      if (error)
-	goto error;
-    }
-
-  if (old_mpol != -1)
-    numa_set_preferred (numa_node);
-
-  pr->mem = mmap (0, size, (PROT_READ | PROT_WRITE), mmap_flags, pr->fd, 0);
-
-  if (pr->mem == MAP_FAILED)
-    {
-      pr->mem = 0;
-      error = clib_error_return_unix (0, "mmap");
-      goto error;
+      alloc.flags |= CLIB_MEM_VM_F_NUMA_PREFER;
     }
 
-  if (old_mpol != -1 &&
-      set_mempolicy (old_mpol, old_mask->maskp, old_mask->size + 1) == -1)
-    {
-      error = clib_error_return_unix (0, "set_mempolicy");
-      goto error;
-    }
+  error = clib_mem_vm_ext_alloc (&alloc);
+  if (error)
+    goto error;
 
+  pr->index = pr - vpm->regions;
+  pr->flags = flags;
+  pr->fd = alloc.fd;
+  pr->mem = alloc.addr;
+  pr->log2_page_size = alloc.log2_page_size;
+  pr->n_pages = alloc.n_pages;
   pr->size = pr->n_pages << pr->log2_page_size;
   pr->page_mask = (1 << pr->log2_page_size) - 1;
   pr->numa_node = numa_node;
@@ -285,13 +172,14 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size,
 	  move_pages (0, 1, &ptr, 0, &node, 0);
 	  if (numa_node != node)
 	    {
-	      clib_warning
-		("physmem page for region \'%s\' allocated on the wrong"
-		 " numa node (requested %u actual %u)", pr->name,
-		 pr->numa_node, node, i);
+	      clib_warning ("physmem page for region \'%s\' allocated on the"
+			    " wrong numa node (requested %u actual %u)",
+			    pr->name, pr->numa_node, node, i);
 	      break;
 	    }
 	}
+      pr->page_table = clib_mem_vm_get_paddr (pr->mem, pr->log2_page_size,
+					      pr->n_pages);
     }
 
   if (flags & VLIB_PHYSMEM_F_INIT_MHEAP)
@@ -309,41 +197,13 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size,
 
   *idx = pr->index;
 
-  if ((flags & VLIB_PHYSMEM_F_FAKE) == 0)
-    {
-      int i;
-      for (i = 0; i < pr->n_pages; i++)
-	{
-	  uword vaddr =
-	    pointer_to_uword (pr->mem) + (((u64) i) << pr->log2_page_size);
-	  u64 page_paddr = get_page_paddr (pagemap_fd, vaddr);
-	  vec_add1 (pr->page_table, page_paddr);
-	}
-    }
-
   goto done;
 
 error:
-  if (pr->fd > -1)
-    close (pr->fd);
-
-  if (pr->mem)
-    munmap (pr->mem, size);
-
   memset (pr, 0, sizeof (*pr));
   pool_put (vpm->regions, pr);
 
 done:
-  if (mount_dir)
-    {
-      umount2 ((char *) mount_dir, MNT_DETACH);
-      rmdir ((char *) mount_dir);
-      vec_free (mount_dir);
-    }
-  numa_free_cpumask (old_mask);
-  vec_free (filename);
-  if (pagemap_fd > -1)
-    close (pagemap_fd);
   return error;
 }
 
diff --git a/src/vlib/linux/syscall.h b/src/vlib/linux/syscall.h
deleted file mode 100644
index 9e37997e..00000000
--- a/src/vlib/linux/syscall.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2017 Cisco and/or its affiliates.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef included_linux_syscall_h
-#define included_linux_syscall_h
-
-#ifndef __NR_memfd_create
-#if defined __x86_64__
-#define __NR_memfd_create 319
-#elif defined __arm__
-#define __NR_memfd_create 385
-#elif defined __aarch64__
-#define __NR_memfd_create 279
-#else
-#error "__NR_memfd_create unknown for this architecture"
-#endif
-#endif
-
-static inline int
-memfd_create (const char *name, unsigned int flags)
-{
-  return syscall (__NR_memfd_create, name, flags);
-}
-
-#ifndef F_LINUX_SPECIFIC_BASE
-#define F_LINUX_SPECIFIC_BASE 1024
-#endif
-#define MFD_ALLOW_SEALING       0x0002U
-#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
-#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
-
-#define F_SEAL_SEAL     0x0001	/* prevent further seals from being set */
-#define F_SEAL_SHRINK   0x0002	/* prevent file from shrinking */
-#define F_SEAL_GROW     0x0004	/* prevent file from growing */
-#define F_SEAL_WRITE    0x0008	/* prevent writes */
-
-
-#endif /* included_linux_syscall_h */
-
-/*
- * fd.io coding-style-patch-verification: ON
- *
- * Local Variables:
- * eval: (c-set-style "gnu")
- * End:
- */
diff --git a/src/vlib/linux/sysfs.c b/src/vlib/linux/sysfs.c
deleted file mode 100644
index f92f9ef5..00000000
--- a/src/vlib/linux/sysfs.c
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Copyright (c) 2017 Cisco and/or its affiliates.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <vlib/vlib.h>
-#include <vlib/unix/unix.h>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <dirent.h>
-
-clib_error_t *
-vlib_sysfs_write (char *file_name, char *fmt, ...)
-{
-  u8 *s;
-  int fd;
-  clib_error_t *error = 0;
-
-  fd = open (file_name, O_WRONLY);
-  if (fd < 0)
-    return clib_error_return_unix (0, "open `%s'", file_name);
-
-  va_list va;
-  va_start (va, fmt);
-  s = va_format (0, fmt, &va);
-  va_end (va);
-
-  if (write (fd, s, vec_len (s)) < 0)
-    error = clib_error_return_unix (0, "write `%s'", file_name);
-
-  vec_free (s);
-  close (fd);
-  return error;
-}
-
-clib_error_t *
-vlib_sysfs_read (char *file_name, char *fmt, ...)
-{
-  unformat_input_t input;
-  u8 *s = 0;
-  int fd;
-  ssize_t sz;
-  uword result;
-
-  fd = open (file_name, O_RDONLY);
-  if (fd < 0)
-    return clib_error_return_unix (0, "open `%s'", file_name);
-
-  vec_validate (s, 4095);
-
-  sz = read (fd, s, vec_len (s));
-  if (sz < 0)
-    {
-      close (fd);
-      vec_free (s);
-      return clib_error_return_unix (0, "read `%s'", file_name);
-    }
-
-  _vec_len (s) = sz;
-  unformat_init_vector (&input, s);
-
-  va_list va;
-  va_start (va, fmt);
-  result = va_unformat (&input, fmt, &va);
-  va_end (va);
-
-  vec_free (s);
-  close (fd);
-
-  if (result == 0)
-    return clib_error_return (0, "unformat error");
-
-  return 0;
-}
-
-u8 *
-vlib_sysfs_link_to_name (char *link)
-{
-  char *p, buffer[64];
-  unformat_input_t in;
-  u8 *s = 0;
-  int r;
-
-  r = readlink (link, buffer, sizeof (buffer) - 1);
-
-  if (r < 0)
-    return 0;
-
-  buffer[r] = 0;
-  p = strrchr (buffer, '/');
-
-  if (!p)
-    return 0;
-
-  unformat_init_string (&in, p + 1, strlen (p + 1));
-  if (unformat (&in, "%s", &s) != 1)
-    clib_unix_warning ("no string?");
-  unformat_free (&in);
-
-  return s;
-}
-
-clib_error_t *
-vlib_sysfs_set_nr_hugepages (unsigned int numa_node, int page_size, int nr)
-{
-  clib_error_t *error = 0;
-  struct stat sb;
-  u8 *p = 0;
-
-  p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0);
-
-  if (stat ((char *) p, &sb) == 0)
-    {
-      if (S_ISDIR (sb.st_mode) == 0)
-	{
-	  error = clib_error_return (0, "'%s' is not directory", p);
-	  goto done;
-	}
-    }
-  else if (numa_node == 0)
-    {
-      vec_reset_length (p);
-      p = format (p, "/sys/kernel/mm%c", 0);
-      if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0)
-	{
-	  error = clib_error_return (0, "'%s' does not exist or it is not "
-				     "directory", p);
-	  goto done;
-	}
-    }
-  else
-    {
-      error = clib_error_return (0, "'%s' does not exist", p);
-      goto done;
-    }
-
-  _vec_len (p) -= 1;
-  p = format (p, "/hugepages/hugepages-%ukB/nr_hugepages%c", page_size, 0);
-  vlib_sysfs_write ((char *) p, "%d", nr);
-
-done:
-  vec_free (p);
-  return error;
-}
-
-
-static clib_error_t *
-vlib_sysfs_get_xxx_hugepages (char *type, unsigned int numa_node,
-			      int page_size, int *val)
-{
-  clib_error_t *error = 0;
-  struct stat sb;
-  u8 *p = 0;
-
-  p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0);
-
-  if (stat ((char *) p, &sb) == 0)
-    {
-      if (S_ISDIR (sb.st_mode) == 0)
-	{
-	  error = clib_error_return (0, "'%s' is not directory", p);
-	  goto done;
-	}
-    }
-  else if (numa_node == 0)
-    {
-      vec_reset_length (p);
-      p = format (p, "/sys/kernel/mm%c", 0);
-      if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0)
-	{
-	  error = clib_error_return (0, "'%s' does not exist or it is not "
-				     "directory", p);
-	  goto done;
-	}
-    }
-  else
-    {
-      error = clib_error_return (0, "'%s' does not exist", p);
-      goto done;
-    }
-
-  _vec_len (p) -= 1;
-  p = format (p, "/hugepages/hugepages-%ukB/%s_hugepages%c", page_size,
-	      type, 0);
-  error = vlib_sysfs_read ((char *) p, "%d", val);
-
-done:
-  vec_free (p);
-  return error;
-}
-
-clib_error_t *
-vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size, int *v)
-{
-  return vlib_sysfs_get_xxx_hugepages ("free", numa_node, page_size, v);
-}
-
-clib_error_t *
-vlib_sysfs_get_nr_hugepages (unsigned int numa_node, int page_size, int *v)
-{
-  return vlib_sysfs_get_xxx_hugepages ("nr", numa_node, page_size, v);
-}
-
-clib_error_t *
-vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, int page_size,
-				  int *v)
-{
-  return vlib_sysfs_get_xxx_hugepages ("surplus", numa_node, page_size, v);
-}
-
-clib_error_t *
-vlib_sysfs_prealloc_hugepages (unsigned int numa_node, int page_size, int nr)
-{
-  clib_error_t *error = 0;
-  int n, needed;
-  error = vlib_sysfs_get_free_hugepages (numa_node, page_size, &n);
-  if (error)
-    return error;
-  needed = nr - n;
-  if (needed <= 0)
-    return 0;
-
-  error = vlib_sysfs_get_nr_hugepages (numa_node, page_size, &n);
-  if (error)
-    return error;
-  clib_warning ("pre-allocating %u additional %uK hugepages on numa node %u",
-		needed, page_size, numa_node);
-  return vlib_sysfs_set_nr_hugepages (numa_node, page_size, n + needed);
-}
-
-
-/*
- * fd.io coding-style-patch-verification: ON
- *
- * Local Variables:
- * eval: (c-set-style "gnu")
- * End:
- */
diff --git a/src/vlib/linux/sysfs.h b/src/vlib/linux/sysfs.h
deleted file mode 100644
index 14b71317..00000000
--- a/src/vlib/linux/sysfs.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2017 Cisco and/or its affiliates.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef included_linux_sysfs_h
-#define included_linux_sysfs_h
-
-clib_error_t *vlib_sysfs_write (char *file_name, char *fmt, ...);
-
-clib_error_t *vlib_sysfs_read (char *file_name, char *fmt, ...);
-
-u8 *vlib_sysfs_link_to_name (char *link);
-
-clib_error_t *vlib_sysfs_set_nr_hugepages (unsigned int numa_node,
-					   int page_size, int nr);
-clib_error_t *vlib_sysfs_get_nr_hugepages (unsigned int numa_node,
-					   int page_size, int *v);
-clib_error_t *vlib_sysfs_get_free_hugepages (unsigned int numa_node,
-					     int page_size, int *v);
-clib_error_t *vlib_sysfs_get_surplus_hugepages (unsigned int numa_node,
-						int page_size, int *v);
-clib_error_t *vlib_sysfs_prealloc_hugepages (unsigned int numa_node,
-					     int page_size, int nr);
-
-#endif /* included_linux_sysfs_h */
-
-/*
- * fd.io coding-style-patch-verification: ON
- *
- * Local Variables:
- * eval: (c-set-style "gnu")
- * End:
- */
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index 2d9ce84a..f9c7043c 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -289,7 +289,7 @@ sort_registrations_by_no_clone (void *a0, void *a1)
 }
 
 static uword *
-vlib_sysfs_list_to_bitmap (char *filename)
+clib_sysfs_list_to_bitmap (char *filename)
 {
   FILE *fp;
   uword *r = 0;
@@ -331,9 +331,9 @@ vlib_thread_init (vlib_main_t * vm)
 
   /* get bitmaps of active cpu cores and sockets */
   tm->cpu_core_bitmap =
-    vlib_sysfs_list_to_bitmap ("/sys/devices/system/cpu/online");
+    clib_sysfs_list_to_bitmap ("/sys/devices/system/cpu/online");
   tm->cpu_socket_bitmap =
-    vlib_sysfs_list_to_bitmap ("/sys/devices/system/node/online");
+    clib_sysfs_list_to_bitmap ("/sys/devices/system/node/online");
 
   avail_cpu = clib_bitmap_dup (tm->cpu_core_bitmap);
 
diff --git a/src/vlib/threads_cli.c b/src/vlib/threads_cli.c
index f8d5d8f9..02bdea5c 100644
--- a/src/vlib/threads_cli.c
+++ b/src/vlib/threads_cli.c
@@ -15,10 +15,10 @@
 #define _GNU_SOURCE
 
 #include <vppinfra/format.h>
+#include <vppinfra/linux/sysfs.h>
 #include <vlib/vlib.h>
 
 #include <vlib/threads.h>
-#include <vlib/linux/sysfs.h>
 #include <vlib/unix/unix.h>
 
 static u8 *
@@ -98,14 +98,14 @@ show_threads_fn (vlib_main_t * vm,
 	  u8 *p = 0;
 
 	  p = format (p, "%s%u/topology/core_id%c", sys_cpu_path, lcore, 0);
-	  vlib_sysfs_read ((char *) p, "%d", &core_id);
+	  clib_sysfs_read ((char *) p, "%d", &core_id);
 
 	  vec_reset_length (p);
 	  p =
 	    format (p,
 		    "%s%u/topology/physical_package_id%c",
 		    sys_cpu_path, lcore, 0);
-	  vlib_sysfs_read ((char *) p, "%d", &socket_id);
+	  clib_sysfs_read ((char *) p, "%d", &socket_id);
 	  vec_free (p);
 
 	  line = format (line, "%-7u%-7u%-7u%", lcore, core_id, socket_id);
diff --git a/src/vnet/devices/af_packet/af_packet.c b/src/vnet/devices/af_packet/af_packet.c
index 62bb228f..32696014 100644
--- a/src/vnet/devices/af_packet/af_packet.c
+++ b/src/vnet/devices/af_packet/af_packet.c
@@ -24,9 +24,9 @@
 #include <sys/types.h>
 #include <fcntl.h>
 
+#include <vppinfra/linux/sysfs.h>
 #include <vlib/vlib.h>
 #include <vlib/unix/unix.h>
-#include <vlib/linux/sysfs.h>
 #include <vnet/ip/ip.h>
 #include <vnet/ethernet/ethernet.h>
 
@@ -75,7 +75,7 @@ af_packet_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi,
     {
       s = format (0, "/sys/class/net/%s/mtu%c", apif->host_if_name, 0);
 
-      error = vlib_sysfs_write ((char *) s, "%d", hi->max_packet_bytes);
+      error = clib_sysfs_write ((char *) s, "%d", hi->max_packet_bytes);
       vec_free (s);
 
       if (error)
diff --git a/src/vppinfra.am b/src/vppinfra.am
index a5769a0d..daca9954 100644
--- a/src/vppinfra.am
+++ b/src/vppinfra.am
@@ -188,6 +188,8 @@ nobase_include_HEADERS = \
   vppinfra/graph.h \
   vppinfra/hash.h \
   vppinfra/heap.h \
+  vppinfra/linux/sysfs.h \
+  vppinfra/linux/syscall.h \
   vppinfra/lock.h \
   vppinfra/longjmp.h \
   vppinfra/macros.h \
@@ -233,7 +235,6 @@ nobase_include_HEADERS = \
   vppinfra/vector_neon.h \
   vppinfra/vector_sse2.h \
   vppinfra/valgrind.h \
-  vppinfra/vm_unix.h \
   vppinfra/xxhash.h \
   vppinfra/xy.h \
   vppinfra/zvec.h
@@ -291,6 +292,8 @@ CLIB_CORE = \
 libvppinfra_la_SOURCES =			\
   $(CLIB_CORE)					\
   vppinfra/elf_clib.c				\
+  vppinfra/linux/mem.c				\
+  vppinfra/linux/sysfs.c			\
   vppinfra/socket.c				\
   vppinfra/timer.c				\
   vppinfra/unix-formats.c			\
diff --git a/src/vppinfra/linux/mem.c b/src/vppinfra/linux/mem.c
new file mode 100644
index 00000000..665ddf61
--- /dev/null
+++ b/src/vppinfra/linux/mem.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/mount.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <linux/mempolicy.h>
+#include <linux/memfd.h>
+
+#include <vppinfra/clib.h>
+#include <vppinfra/mem.h>
+#include <vppinfra/format.h>
+#include <vppinfra/clib_error.h>
+#include <vppinfra/linux/syscall.h>
+#include <vppinfra/linux/sysfs.h>
+
+#ifndef F_LINUX_SPECIFIC_BASE
+#define F_LINUX_SPECIFIC_BASE 1024
+#endif
+
+#ifndef F_ADD_SEALS
+#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
+#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
+
+#define F_SEAL_SEAL     0x0001	/* prevent further seals from being set */
+#define F_SEAL_SHRINK   0x0002	/* prevent file from shrinking */
+#define F_SEAL_GROW     0x0004	/* prevent file from growing */
+#define F_SEAL_WRITE    0x0008	/* prevent writes */
+#endif
+
+int
+clib_mem_vm_get_log2_page_size (int fd)
+{
+  struct stat st = { 0 };
+  if (fstat (fd, &st))
+    return 0;
+  return min_log2 (st.st_blksize);
+}
+
+clib_error_t *
+clib_mem_vm_ext_alloc (clib_mem_vm_alloc_t * a)
+{
+  int fd = -1;
+  clib_error_t *err = 0;
+  void *addr = 0;
+  u8 *filename = 0;
+  int mmap_flags = MAP_SHARED;
+  int log2_page_size;
+  int n_pages;
+  int old_mpol = -1;
+  u64 old_mask[16] = { 0 };
+
+  /* save old numa mem policy if needed */
+  if (a->flags & (CLIB_MEM_VM_F_NUMA_PREFER | CLIB_MEM_VM_F_NUMA_FORCE))
+    {
+      int rv;
+      rv =
+	get_mempolicy (&old_mpol, old_mask, sizeof (old_mask) * 8 + 1, 0, 0);
+
+      if (rv == -1)
+	{
+	  if ((a->flags & CLIB_MEM_VM_F_NUMA_FORCE) != 0)
+	    {
+	      err = clib_error_return_unix (0, "get_mempolicy");
+	      goto error;
+	    }
+	  else
+	    old_mpol = -1;
+	}
+    }
+
+  /* if we are creating shared segment, we need file descriptor */
+  if (a->flags & CLIB_MEM_VM_F_SHARED)
+    {
+      /* if hugepages are needed we need to create mount point */
+      if (a->flags & CLIB_MEM_VM_F_HUGETLB)
+	{
+	  char *mount_dir;
+	  char template[] = "/tmp/hugepage_mount.XXXXXX";
+
+	  mount_dir = mkdtemp (template);
+	  if (mount_dir == 0)
+	    return clib_error_return_unix (0, "mkdtemp \'%s\'", template);
+
+	  if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL))
+	    {
+	      err = clib_error_return_unix (0, "mount hugetlb directory '%s'",
+					    mount_dir);
+	      goto error;
+	    }
+
+	  filename = format (0, "%s/%s%c", mount_dir, a->name, 0);
+
+	  if ((fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1)
+	    {
+	      err = clib_error_return_unix (0, "open");
+	      goto error;
+	    }
+	  umount2 ((char *) mount_dir, MNT_DETACH);
+	  rmdir ((char *) mount_dir);
+	  mmap_flags |= MAP_LOCKED;
+	}
+      else
+	{
+	  if ((fd = memfd_create (a->name, MFD_ALLOW_SEALING)) == -1)
+	    {
+	      err = clib_error_return_unix (0, "memfd_create");
+	      goto error;
+	    }
+
+	  if ((fcntl (fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1)
+	    {
+	      err = clib_error_return_unix (0, "fcntl (F_ADD_SEALS)");
+	      goto error;
+	    }
+	}
+      log2_page_size = clib_mem_vm_get_log2_page_size (fd);
+    }
+  else				/* not CLIB_MEM_VM_F_SHARED */
+    {
+      if (a->flags & CLIB_MEM_VM_F_HUGETLB)
+	{
+	  mmap_flags |= MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS;
+	  log2_page_size = 21;
+	}
+      else
+	{
+	  mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;
+	  log2_page_size = min_log2 (sysconf (_SC_PAGESIZE));
+	}
+    }
+
+  n_pages = ((a->size - 1) >> log2_page_size) + 1;
+
+
+  if (a->flags & CLIB_MEM_VM_F_HUGETLB_PREALLOC)
+    {
+      err = clib_sysfs_prealloc_hugepages (a->numa_node,
+					   1 << (log2_page_size - 10),
+					   n_pages);
+      if (err)
+	goto error;
+
+    }
+
+  if (fd != -1)
+    if ((ftruncate (fd, a->size)) == -1)
+      {
+	err = clib_error_return_unix (0, "ftruncate");
+	goto error;
+      }
+
+  if (old_mpol != -1)
+    {
+      int rv;
+      u64 mask[16] = { 0 };
+      mask[0] = 1 << a->numa_node;
+      rv = set_mempolicy (MPOL_BIND, mask, sizeof (mask) * 8 + 1);
+      if (rv)
+	{
+	  err = clib_error_return_unix (0, "set_mempolicy");
+	  goto error;
+	}
+    }
+
+  addr = mmap (0, a->size, (PROT_READ | PROT_WRITE), mmap_flags, fd, 0);
+  if (addr == MAP_FAILED)
+    {
+      err = clib_error_return_unix (0, "mmap");
+      goto error;
+    }
+
+  /* re-apply old numa memory policy */
+  if (old_mpol != -1 &&
+      set_mempolicy (old_mpol, old_mask, sizeof (old_mask) * 8 + 1) == -1)
+    {
+      err = clib_error_return_unix (0, "set_mempolicy");
+      goto error;
+    }
+
+  a->log2_page_size = log2_page_size;
+  a->n_pages = n_pages;
+  a->addr = addr;
+  a->fd = fd;
+  goto done;
+
+error:
+  if (fd != -1)
+    close (fd);
+
+done:
+  vec_free (filename);
+  return err;
+}
+
+u64 *
+clib_mem_vm_get_paddr (void *mem, int log2_page_size, int n_pages)
+{
+  int pagesize = sysconf (_SC_PAGESIZE);
+  int fd;
+  int i;
+  u64 *r = 0;
+
+  if ((fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1)
+    return 0;
+
+  for (i = 0; i < n_pages; i++)
+    {
+      u64 seek, pagemap = 0;
+      uword vaddr = pointer_to_uword (mem) + (((u64) i) << log2_page_size);
+      seek = ((u64) vaddr / pagesize) * sizeof (u64);
+      if (lseek (fd, seek, SEEK_SET) != seek)
+	goto done;
+
+      if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap)))
+	goto done;
+
+      if ((pagemap & (1ULL << 63)) == 0)
+	goto done;
+
+      pagemap &= pow2_mask (55);
+      vec_add1 (r, pagemap * pagesize);
+    }
+
+done:
+  close (fd);
+  if (vec_len (r) != n_pages)
+    {
+      vec_free (r);
+      return 0;
+    }
+  return r;
+}
+
+
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vppinfra/linux/syscall.h b/src/vppinfra/linux/syscall.h
new file mode 100644
index 00000000..f8ec5919
--- /dev/null
+++ b/src/vppinfra/linux/syscall.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_linux_syscall_h
+#define included_linux_syscall_h
+
+#include <unistd.h>
+#include <sys/syscall.h>
+
+static inline long
+set_mempolicy (int mode, const unsigned long *nodemask, unsigned long maxnode)
+{
+  return syscall (__NR_set_mempolicy, mode, nodemask, maxnode);
+}
+
+static inline int
+get_mempolicy (int *mode, unsigned long *nodemask, unsigned long maxnode,
+	       void *addr, unsigned long flags)
+{
+  return syscall (__NR_get_mempolicy, mode, nodemask, maxnode, addr, flags);
+}
+
+static inline long
+move_pages (int pid, unsigned long count, void **pages, const int *nodes,
+	    int *status, int flags)
+{
+  return syscall (__NR_move_pages, pid, count, pages, nodes, status, flags);
+}
+
+static inline int
+memfd_create (const char *name, unsigned int flags)
+{
+  return syscall (__NR_memfd_create, name, flags);
+}
+
+#endif /* included_linux_syscall_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vppinfra/linux/sysfs.c b/src/vppinfra/linux/sysfs.c
new file mode 100644
index 00000000..5f611e6a
--- /dev/null
+++ b/src/vppinfra/linux/sysfs.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vppinfra/clib.h>
+#include <vppinfra/clib_error.h>
+#include <vppinfra/format.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <dirent.h>
+
+clib_error_t *
+clib_sysfs_write (char *file_name, char *fmt, ...)
+{
+  u8 *s;
+  int fd;
+  clib_error_t *error = 0;
+
+  fd = open (file_name, O_WRONLY);
+  if (fd < 0)
+    return clib_error_return_unix (0, "open `%s'", file_name);
+
+  va_list va;
+  va_start (va, fmt);
+  s = va_format (0, fmt, &va);
+  va_end (va);
+
+  if (write (fd, s, vec_len (s)) < 0)
+    error = clib_error_return_unix (0, "write `%s'", file_name);
+
+  vec_free (s);
+  close (fd);
+  return error;
+}
+
+clib_error_t *
+clib_sysfs_read (char *file_name, char *fmt, ...)
+{
+  unformat_input_t input;
+  u8 *s = 0;
+  int fd;
+  ssize_t sz;
+  uword result;
+
+  fd = open (file_name, O_RDONLY);
+  if (fd < 0)
+    return clib_error_return_unix (0, "open `%s'", file_name);
+
+  vec_validate (s, 4095);
+
+  sz = read (fd, s, vec_len (s));
+  if (sz < 0)
+    {
+      close (fd);
+      vec_free (s);
+      return clib_error_return_unix (0, "read `%s'", file_name);
+    }
+
+  _vec_len (s) = sz;
+  unformat_init_vector (&input, s);
+
+  va_list va;
+  va_start (va, fmt);
+  result = va_unformat (&input, fmt, &va);
+  va_end (va);
+
+  vec_free (s);
+  close (fd);
+
+  if (result == 0)
+    return clib_error_return (0, "unformat error");
+
+  return 0;
+}
+
+u8 *
+clib_sysfs_link_to_name (char *link)
+{
+  char *p, buffer[64];
+  unformat_input_t in;
+  u8 *s = 0;
+  int r;
+
+  r = readlink (link, buffer, sizeof (buffer) - 1);
+
+  if (r < 0)
+    return 0;
+
+  buffer[r] = 0;
+  p = strrchr (buffer, '/');
+
+  if (!p)
+    return 0;
+
+  unformat_init_string (&in, p + 1, strlen (p + 1));
+  if (unformat (&in, "%s", &s) != 1)
+    clib_unix_warning ("no string?");
+  unformat_free (&in);
+
+  return s;
+}
+
+clib_error_t *
+clib_sysfs_set_nr_hugepages (int numa_node, int page_size, int nr)
+{
+  clib_error_t *error = 0;
+  struct stat sb;
+  u8 *p = 0;
+  /* Write 'nr' to nr_hugepages for 'page_size' (kB) pages on 'numa_node'. */
+  p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0);
+  /* Without a per-node directory, node 0 falls back to /sys/kernel/mm. */
+  if (stat ((char *) p, &sb) == 0)
+    {
+      if (S_ISDIR (sb.st_mode) == 0)
+	{
+	  error = clib_error_return (0, "'%s' is not directory", p);
+	  goto done;
+	}
+    }
+  else if (numa_node == 0)
+    {
+      vec_reset_length (p);
+      p = format (p, "/sys/kernel/mm%c", 0);
+      if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0)
+	{
+	  error = clib_error_return (0, "'%s' does not exist or it is not "
+				     "directory", p);
+	  goto done;
+	}
+    }
+  else
+    {
+      error = clib_error_return (0, "'%s' does not exist", p);
+      goto done;
+    }
+  /* Strip the trailing NUL, append the leaf and report any write failure. */
+  _vec_len (p) -= 1;
+  p = format (p, "/hugepages/hugepages-%ukB/nr_hugepages%c", page_size, 0);
+  error = clib_sysfs_write ((char *) p, "%d", nr);
+
+done:
+  vec_free (p);
+  return error;
+}
+
+
+static clib_error_t *
+clib_sysfs_get_xxx_hugepages (char *type, int numa_node,
+			      int page_size, int *val)
+{
+  clib_error_t *error = 0;
+  struct stat sb;
+  u8 *p = 0;
+
+  p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0);
+
+  if (stat ((char *) p, &sb) == 0)
+    {
+      if (S_ISDIR (sb.st_mode) == 0)
+	{
+	  error = clib_error_return (0, "'%s' is not directory", p);
+	  goto done;
+	}
+    }
+  else if (numa_node == 0)
+    {
+      vec_reset_length (p);
+      p = format (p, "/sys/kernel/mm%c", 0);
+      if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0)
+	{
+	  error = clib_error_return (0, "'%s' does not exist or it is not "
+				     "directory", p);
+	  goto done;
+	}
+    }
+  else
+    {
+      error = clib_error_return (0, "'%s' does not exist", p);
+      goto done;
+    }
+
+  _vec_len (p) -= 1;
+  p = format (p, "/hugepages/hugepages-%ukB/%s_hugepages%c", page_size,
+	      type, 0);
+  error = clib_sysfs_read ((char *) p, "%d", val);
+
+done:
+  vec_free (p);
+  return error;
+}
+
+clib_error_t *
+clib_sysfs_get_free_hugepages (int numa_node, int page_size, int *v)
+{
+  return clib_sysfs_get_xxx_hugepages ("free", numa_node, page_size, v);
+}
+
+clib_error_t *
+clib_sysfs_get_nr_hugepages (int numa_node, int page_size, int *v)
+{
+  return clib_sysfs_get_xxx_hugepages ("nr", numa_node, page_size, v);
+}
+
+clib_error_t *
+clib_sysfs_get_surplus_hugepages (int numa_node, int page_size, int *v)
+{
+  return clib_sysfs_get_xxx_hugepages ("surplus", numa_node, page_size, v);
+}
+
+clib_error_t *
+clib_sysfs_prealloc_hugepages (int numa_node, int page_size, int nr)
+{
+  clib_error_t *error = 0;
+  int n, needed;
+  error = clib_sysfs_get_free_hugepages (numa_node, page_size, &n);
+  if (error)
+    return error;
+  needed = nr - n;
+  if (needed <= 0)
+    return 0;
+
+  error = clib_sysfs_get_nr_hugepages (numa_node, page_size, &n);
+  if (error)
+    return error;
+  clib_warning ("pre-allocating %u additional %uK hugepages on numa node %u",
+		needed, page_size, numa_node);
+  return clib_sysfs_set_nr_hugepages (numa_node, page_size, n + needed);
+}
+
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vppinfra/linux/sysfs.h b/src/vppinfra/linux/sysfs.h
new file mode 100644
index 00000000..6c80cf95
--- /dev/null
+++ b/src/vppinfra/linux/sysfs.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2017 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_linux_sysfs_h
+#define included_linux_sysfs_h
+
+#include <vppinfra/error.h>
+
+clib_error_t *clib_sysfs_write (char *file_name, char *fmt, ...);
+
+clib_error_t *clib_sysfs_read (char *file_name, char *fmt, ...);
+
+u8 *clib_sysfs_link_to_name (char *link);
+
+clib_error_t *clib_sysfs_set_nr_hugepages (int numa_node,
+					   int page_size, int nr);
+clib_error_t *clib_sysfs_get_nr_hugepages (int numa_node,
+					   int page_size, int *v);
+clib_error_t *clib_sysfs_get_free_hugepages (int numa_node,
+					     int page_size, int *v);
+clib_error_t *clib_sysfs_get_surplus_hugepages (int numa_node,
+						int page_size, int *v);
+clib_error_t *clib_sysfs_prealloc_hugepages (int numa_node,
+					     int page_size, int nr);
+
+#endif /* included_linux_sysfs_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vppinfra/mem.h b/src/vppinfra/mem.h
index 63c5ac16..69ab8803 100644
--- a/src/vppinfra/mem.h
+++ b/src/vppinfra/mem.h
@@ -39,8 +39,11 @@
 #define _included_clib_mem_h
 
 #include <stdarg.h>
+#include <unistd.h>
+#include <sys/mman.h>
 
 #include <vppinfra/clib.h>	/* uword, etc */
+#include <vppinfra/clib_error.h>
 #include <vppinfra/mheap_bootstrap.h>
 #include <vppinfra/os.h>
 #include <vppinfra/string.h>	/* memcpy, memset */
@@ -264,19 +267,90 @@ void clib_mem_usage (clib_mem_usage_t * usage);
 
 u8 *format_clib_mem_usage (u8 * s, va_list * args);
 
-/* Include appropriate VM functions depending on whether
-   we are compiling for linux kernel, for Unix or standalone. */
-#ifdef CLIB_LINUX_KERNEL
-#include <vppinfra/vm_linux_kernel.h>
-#endif
+/* Allocate virtual address space. */
+always_inline void *
+clib_mem_vm_alloc (uword size)
+{
+  void *mmap_addr;
+  uword flags = MAP_PRIVATE;
 
-#ifdef CLIB_UNIX
-#include <vppinfra/vm_unix.h>
+#ifdef MAP_ANONYMOUS
+  flags |= MAP_ANONYMOUS;
 #endif
 
-#ifdef CLIB_STANDALONE
-#include <vppinfra/vm_standalone.h>
-#endif
+  mmap_addr = mmap (0, size, PROT_READ | PROT_WRITE, flags, -1, 0);
+  if (mmap_addr == (void *) -1)
+    mmap_addr = 0;
+
+  return mmap_addr;
+}
+
+always_inline void
+clib_mem_vm_free (void *addr, uword size)
+{
+  munmap (addr, size);
+}
+
+always_inline void *
+clib_mem_vm_unmap (void *addr, uword size)
+{
+  void *mmap_addr;
+  uword flags = MAP_PRIVATE | MAP_FIXED;
+
+  /* To unmap we "map" with no protection.  If we actually called
+     munmap then other callers could steal the address space.  By
+     changing to PROT_NONE the kernel can free up the pages which is
+     really what we want "unmap" to mean. */
+  mmap_addr = mmap (addr, size, PROT_NONE, flags, -1, 0);
+  if (mmap_addr == (void *) -1)
+    mmap_addr = 0;
+
+  return mmap_addr;
+}
+
+always_inline void *
+clib_mem_vm_map (void *addr, uword size)
+{
+  void *mmap_addr;
+  uword flags = MAP_PRIVATE | MAP_FIXED;
+
+  mmap_addr = mmap (addr, size, (PROT_READ | PROT_WRITE), flags, -1, 0);
+  if (mmap_addr == (void *) -1)
+    mmap_addr = 0;
+
+  return mmap_addr;
+}
+
+typedef struct
+{
+#define CLIB_MEM_VM_F_SHARED (1 << 0)
+#define CLIB_MEM_VM_F_HUGETLB (1 << 1)
+#define CLIB_MEM_VM_F_NUMA_PREFER (1 << 2)
+#define CLIB_MEM_VM_F_NUMA_FORCE (1 << 3)
+#define CLIB_MEM_VM_F_HUGETLB_PREALLOC (1 << 4)
+  u32 flags; /**< vm allocation flags:
+                <br> CLIB_MEM_VM_F_SHARED: request shared memory, file
+		descriptor will be provided on successful allocation.
+                <br> CLIB_MEM_VM_F_HUGETLB: request hugepages.
+		<br> CLIB_MEM_VM_F_NUMA_PREFER: numa_node field contains valid
+		numa node preference.
+		<br> CLIB_MEM_VM_F_NUMA_FORCE: fail if setting numa policy fails.
+		<br> CLIB_MEM_VM_F_HUGETLB_PREALLOC: pre-allocate hugepages if
+		number of available pages is not sufficient.
+             */
+  char *name; /**< Name for memory allocation, set by caller. */
+  uword size; /**< Allocation size, set by caller. */
+  int numa_node; /**< numa node preference. Valid if CLIB_MEM_VM_F_NUMA_PREFER set. */
+  void *addr; /**< Pointer to allocated memory, set on successful allocation. */
+  int fd; /**< File descriptor, set on successful allocation if CLIB_MEM_VM_F_SHARED is set. */
+  int log2_page_size;		/* Page size in log2 format, set on successful allocation. */
+  int n_pages;			/* Number of pages. */
+} clib_mem_vm_alloc_t;
+
+clib_error_t *clib_mem_vm_ext_alloc (clib_mem_vm_alloc_t * a);
+int clib_mem_vm_get_log2_page_size (int fd);
+u64 *clib_mem_vm_get_paddr (void *mem, int log2_page_size, int n_pages);
+
 
 #include <vppinfra/error.h>	/* clib_panic */
 
diff --git a/src/vppinfra/vm_linux_kernel.h b/src/vppinfra/vm_linux_kernel.h
deleted file mode 100644
index fd9e6148..00000000
--- a/src/vppinfra/vm_linux_kernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2015 Cisco and/or its affiliates.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*
-  Copyright (c) 2001, 2002, 2003 Eliot Dresselhaus
-
-  Permission is hereby granted, free of charge, to any person obtaining
-  a copy of this software and associated documentation files (the
-  "Software"), to deal in the Software without restriction, including
-  without limitation the rights to use, copy, modify, merge, publish,
-  distribute, sublicense, and/or sell copies of the Software, and to
-  permit persons to whom the Software is furnished to do so, subject to
-  the following conditions:
-
-  The above copyright notice and this permission notice shall be
-  included in all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#ifndef included_vm_linux_kernel_h
-#define included_vm_linux_kernel_h
-
-#include <linux/vmalloc.h>
-#include <linux/gfp.h>		/* for GFP_* */
-#include <asm/pgtable.h>	/* for PAGE_KERNEL */
-
-/* Allocate virtual address space. */
-always_inline void *
-clib_mem_vm_alloc (uword size)
-{
-  return vmalloc (size);
-}
-
-always_inline void
-clib_mem_vm_free (void *addr, uword size)
-{
-  vfree (addr);
-}
-
-always_inline void *
-clib_mem_vm_unmap (void *addr, uword size)
-{
-  return 0;
-}
-
-always_inline void *
-clib_mem_vm_map (void *addr, uword size)
-{
-  return addr;
-}
-
-#endif /* included_vm_linux_kernel_h */
-
-/*
- * fd.io coding-style-patch-verification: ON
- *
- * Local Variables:
- * eval: (c-set-style "gnu")
- * End:
- */
diff --git a/src/vppinfra/vm_standalone.h b/src/vppinfra/vm_standalone.h
deleted file mode 100644
index 2cd431bc..00000000
--- a/src/vppinfra/vm_standalone.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2015 Cisco and/or its affiliates.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*
-  Copyright (c) 2001, 2002, 2003 Eliot Dresselhaus
-
-  Permission is hereby granted, free of charge, to any person obtaining
-  a copy of this software and associated documentation files (the
-  "Software"), to deal in the Software without restriction, including
-  without limitation the rights to use, copy, modify, merge, publish,
-  distribute, sublicense, and/or sell copies of the Software, and to
-  permit persons to whom the Software is furnished to do so, subject to
-  the following conditions:
-
-  The above copyright notice and this permission notice shall be
-  included in all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#ifndef included_vm_standalone_h
-#define included_vm_standalone_h
-
-/* Stubs for standalone "system" which has no VM support. */
-
-always_inline void *
-clib_mem_vm_alloc (uword size)
-{
-  return 0;
-}
-
-always_inline void
-clib_mem_vm_free (void *addr, uword size)
-{
-}
-
-always_inline void *
-clib_mem_vm_unmap (void *addr, uword size)
-{
-  return 0;
-}
-
-always_inline void *
-clib_mem_vm_map (void *addr, uword size)
-{
-  return addr;
-}
-
-#endif /* included_vm_standalone_h */
-
-/*
- * fd.io coding-style-patch-verification: ON
- *
- * Local Variables:
- * eval: (c-set-style "gnu")
- * End:
- */
diff --git a/src/vppinfra/vm_unix.h b/src/vppinfra/vm_unix.h
deleted file mode 100644
index 07e86516..00000000
--- a/src/vppinfra/vm_unix.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2015 Cisco and/or its affiliates.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/*
-  Copyright (c) 2001, 2002, 2003 Eliot Dresselhaus
-
-  Permission is hereby granted, free of charge, to any person obtaining
-  a copy of this software and associated documentation files (the
-  "Software"), to deal in the Software without restriction, including
-  without limitation the rights to use, copy, modify, merge, publish,
-  distribute, sublicense, and/or sell copies of the Software, and to
-  permit persons to whom the Software is furnished to do so, subject to
-  the following conditions:
-
-  The above copyright notice and this permission notice shall be
-  included in all copies or substantial portions of the Software.
-
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#ifndef included_vm_unix_h
-#define included_vm_unix_h
-
-#include <unistd.h>
-#include <sys/mman.h>
-
-/* Allocate virtual address space. */
-always_inline void *
-clib_mem_vm_alloc (uword size)
-{
-  void *mmap_addr;
-  uword flags = MAP_PRIVATE;
-
-#ifdef MAP_ANONYMOUS
-  flags |= MAP_ANONYMOUS;
-#endif
-
-  mmap_addr = mmap (0, size, PROT_READ | PROT_WRITE, flags, -1, 0);
-  if (mmap_addr == (void *) -1)
-    mmap_addr = 0;
-
-  return mmap_addr;
-}
-
-always_inline void
-clib_mem_vm_free (void *addr, uword size)
-{
-  munmap (addr, size);
-}
-
-always_inline void *
-clib_mem_vm_unmap (void *addr, uword size)
-{
-  void *mmap_addr;
-  uword flags = MAP_PRIVATE | MAP_FIXED;
-
-  /* To unmap we "map" with no protection.  If we actually called
-     munmap then other callers could steal the address space.  By
-     changing to PROT_NONE the kernel can free up the pages which is
-     really what we want "unmap" to mean. */
-  mmap_addr = mmap (addr, size, PROT_NONE, flags, -1, 0);
-  if (mmap_addr == (void *) -1)
-    mmap_addr = 0;
-
-  return mmap_addr;
-}
-
-always_inline void *
-clib_mem_vm_map (void *addr, uword size)
-{
-  void *mmap_addr;
-  uword flags = MAP_PRIVATE | MAP_FIXED;
-
-  mmap_addr = mmap (addr, size, (PROT_READ | PROT_WRITE), flags, -1, 0);
-  if (mmap_addr == (void *) -1)
-    mmap_addr = 0;
-
-  return mmap_addr;
-}
-
-#endif /* included_vm_unix_h */
-
-/*
- * fd.io coding-style-patch-verification: ON
- *
- * Local Variables:
- * eval: (c-set-style "gnu")
- * End:
- */
-- 
cgit 1.2.3-korg


From 69128d0209ba6108430dca9cc78ab36a9b1c793e Mon Sep 17 00:00:00 2001
From: Dave Barach <dbarach@cisco.com>
Date: Tue, 26 Sep 2017 10:54:34 -0400
Subject: Add thread-safe event signaller, use RPC where required

Update ping code to use the new function

Change-Id: Ieb753b23f8402cbe5667c22747896784c8ece937
Signed-off-by: Florin Coras <fcoras@cisco.com>
Signed-off-by: Dave Barach <dave@barachs.net>
---
 src/vlib/node_funcs.h        | 23 +++++++++++++++++++++++
 src/vlib/threads.c           | 24 +++++++++++++++++++++++-
 src/vlib/threads.h           | 14 +++++++++++++-
 src/vlibmemory/memory_vlib.c | 13 ++++++++++++-
 src/vnet/ip/ping.c           |  4 ++--
 5 files changed, 73 insertions(+), 5 deletions(-)

(limited to 'src/vlib/threads.c')

diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h
index 3ae4e541..0734476c 100644
--- a/src/vlib/node_funcs.h
+++ b/src/vlib/node_funcs.h
@@ -965,6 +965,29 @@ vlib_process_signal_event_pointer (vlib_main_t * vm,
   d[0] = data;
 }
 
+/**
+ * Signal event to process from any thread.
+ *
+ * When in doubt, use this.
+ */
+always_inline void
+vlib_process_signal_event_mt (vlib_main_t * vm,
+			      uword node_index, uword type_opaque, uword data)
+{
+  if (vlib_get_thread_index () != 0)
+    {
+      vlib_process_signal_event_mt_args_t args = {
+	.node_index = node_index,
+	.type_opaque = type_opaque,
+	.data = data,
+      };
+      vlib_rpc_call_main_thread (vlib_process_signal_event_mt_helper,
+				 (u8 *) & args, sizeof (args));
+    }
+  else
+    vlib_process_signal_event (vm, node_index, type_opaque, data);
+}
+
 always_inline void
 vlib_process_signal_one_time_event (vlib_main_t * vm,
 				    uword node_index,
diff --git a/src/vlib/threads.c b/src/vlib/threads.c
index f9c7043c..be8daa64 100644
--- a/src/vlib/threads.c
+++ b/src/vlib/threads.c
@@ -1767,7 +1767,6 @@ vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts)
   return (fqm - tm->frame_queue_mains);
 }
 
-
 int
 vlib_thread_cb_register (struct vlib_main_t *vm, vlib_thread_callbacks_t * cb)
 {
@@ -1781,6 +1780,29 @@ vlib_thread_cb_register (struct vlib_main_t *vm, vlib_thread_callbacks_t * cb)
   return 0;
 }
 
+void
+vlib_process_signal_event_mt_helper (vlib_process_signal_event_mt_args_t *
+				     args)
+{
+  ASSERT (vlib_get_thread_index () == 0);
+  vlib_process_signal_event (vlib_get_main (), args->node_index,
+			     args->type_opaque, args->data);
+}
+
+void *rpc_call_main_thread_cb_fn;
+
+void
+vlib_rpc_call_main_thread (void *callback, u8 * args, u32 arg_size)
+{
+  if (rpc_call_main_thread_cb_fn)
+    {
+      void (*fp) (void *, u8 *, u32) = rpc_call_main_thread_cb_fn;
+      (*fp) (callback, args, arg_size);
+    }
+  else
+    clib_warning ("BUG: rpc_call_main_thread_cb_fn NULL!");
+}
+
 clib_error_t *
 threads_init (vlib_main_t * vm)
 {
diff --git a/src/vlib/threads.h b/src/vlib/threads.h
index 72340ee1..8931584b 100644
--- a/src/vlib/threads.h
+++ b/src/vlib/threads.h
@@ -171,6 +171,13 @@ typedef struct
   frame_queue_nelt_counter_t *frame_queue_histogram;
 } vlib_frame_queue_main_t;
 
+typedef struct
+{
+  uword node_index;
+  uword type_opaque;
+  uword data;
+} vlib_process_signal_event_mt_args_t;
+
 /* Called early, in thread 0's context */
 clib_error_t *vlib_thread_init (vlib_main_t * vm);
 
@@ -510,9 +517,14 @@ vlib_get_worker_handoff_queue_elt (u32 frame_queue_index,
 }
 
 u8 *vlib_thread_stack_init (uword thread_index);
-
 int vlib_thread_cb_register (struct vlib_main_t *vm,
 			     vlib_thread_callbacks_t * cb);
+extern void *rpc_call_main_thread_cb_fn;
+
+void
+vlib_process_signal_event_mt_helper (vlib_process_signal_event_mt_args_t *
+				     args);
+void vlib_rpc_call_main_thread (void *function, u8 * args, u32 size);
 
 #endif /* included_vlib_threads_h */
 
diff --git a/src/vlibmemory/memory_vlib.c b/src/vlibmemory/memory_vlib.c
index 77959e6d..d305ea61 100644
--- a/src/vlibmemory/memory_vlib.c
+++ b/src/vlibmemory/memory_vlib.c
@@ -1573,6 +1573,17 @@ _(RPC_CALL_REPLY,rpc_call_reply)
 #define foreach_plugin_trace_msg		\
 _(TRACE_PLUGIN_MSG_IDS,trace_plugin_msg_ids)
 
+/*
+ * Set the rpc callback at our earliest possible convenience.
+ * This avoids ordering issues between thread_init() -> start_workers and
+ * an init function which we could define here. If we ever intend to use
+ * vlib all by itself, we can't create a link-time dependency on
+ * an init function here and a typical "call foo_init first"
+ * guitar lick.
+ */
+
+extern void *rpc_call_main_thread_cb_fn;
+
 static clib_error_t *
 rpc_api_hookup (vlib_main_t * vm)
 {
@@ -1599,7 +1610,7 @@ rpc_api_hookup (vlib_main_t * vm)
 
   /* No reason to halt the parade to create a trace record... */
   am->is_mp_safe[VL_API_TRACE_PLUGIN_MSG_IDS] = 1;
-
+  rpc_call_main_thread_cb_fn = vl_api_rpc_call_main_thread;
   return 0;
 }
 
diff --git a/src/vnet/ip/ping.c b/src/vnet/ip/ping.c
index c847e696..0fa537f6 100755
--- a/src/vnet/ip/ping.c
+++ b/src/vnet/ip/ping.c
@@ -97,7 +97,7 @@ signal_ip46_icmp_reply_event (u8 event_type, vlib_buffer_t * b0)
   clib_memcpy (vnet_buffer
 	       (vlib_get_buffer
 		(vm, bi0_copy))->unused, &nowts, sizeof (nowts));
-  vlib_process_signal_event (vm, pr->cli_process_id, event_type, bi0_copy);
+  vlib_process_signal_event_mt (vm, pr->cli_process_id, event_type, bi0_copy);
   return 1;
 }
 
@@ -646,7 +646,7 @@ run_ping_ip46_address (vlib_main_t * vm, u32 table_id, ip4_address_t * pa4,
 	      i = 1 + ping_repeat;
 	      break;
 	    }
-      vec_free(event_data);
+	  vec_free (event_data);
 	}
     }
   vlib_cli_output (vm, "\n");
-- 
cgit 1.2.3-korg