author     Dave Barach <dave@barachs.net>    2019-01-24 10:34:24 -0500
committer  Damjan Marion <dmarion@me.com>    2019-01-24 16:19:04 +0000
commit     ec595ef02639005b34334097af76b41ceef3dca5 (patch)
tree       b8b752d9c9371b9ea75a9a28bfa1a3a6e7494b18  /src/vlib/main.c
parent     22f23ae802f6dc654dbef27340c67773eb8be8c3 (diff)
perfmon plugin: 2-way parallel stat collection
As a FUD-reduction measure, this patch implements 2-way parallel
counter collection: the two component counters behind a synthetic stat
run at the same time. Running two counters (of any kind)
simultaneously cuts the aggregate collection time by roughly a factor
of two, depending on whether an even or odd number of stats has been
requested.
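
Concretely, "at the same time" means both programmed PMCs are read at
the same instants immediately before and after each node dispatch, so a
single dispatch yields a delta for each member of the pair. A condensed
sketch of that pattern (the two-counter read helper follows the diff
below; the wrapper function and the event names are illustrative, and
the real helper is static to src/vlib/main.c, so this is not callable
code as-is):

  #include <vlib/vlib.h>

  /* Illustrative only: sample both counters of the pair around one
   * node dispatch, so their deltas cover exactly the same work. */
  static void
  example_sample_pair (vlib_main_t * vm, u64 * delta0, u64 * delta1)
  {
    u64 pmc_before[2], pmc_after[2];

    vlib_node_runtime_perf_counter (vm, &pmc_before[0], &pmc_before[1]);
    /* ... run the node function here ... */
    vlib_node_runtime_perf_counter (vm, &pmc_after[0], &pmc_after[1]);

    *delta0 = pmc_after[0] - pmc_before[0];   /* e.g. instructions */
    *delta1 = pmc_after[1] - pmc_before[1];   /* e.g. core cycles */
  }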
I don't completely buy the argument that synthetic stats such as
instructions-per-clock will be inaccurate if their component counter
values are collected sequentially. Given a uniform traffic pattern, it
should make no difference.
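
For reference, a "synthetic stat" such as instructions-per-clock is
just the ratio of two component counter totals accumulated over the
same dispatches. A hypothetical helper over the per-node fields this
patch adds might look like the following (which hardware event each
counter tracks is whatever the perfmon plugin programmed, so the
instructions/cycles pairing here is an assumption):

  #include <vlib/vlib.h>

  /* Hypothetical: instructions-per-clock for one node, computed from
   * the two component totals introduced by this patch. */
  static f64
  example_ipc (vlib_node_t * n)
  {
    u64 ticks0 = n->stats_total.perf_counter0_ticks;  /* e.g. instructions */
    u64 ticks1 = n->stats_total.perf_counter1_ticks;  /* e.g. core cycles */

    return ticks1 ? (f64) ticks0 / (f64) ticks1 : 0.0;
  }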
As the collection interval increases, the difference between serial
and parallel component counter collection approaches zero; see also
the Central Limit Theorem.
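
Stated a little more formally (a sketch, assuming a roughly stationary
workload over the interval): each component total is a sum of
per-dispatch samples, and the synthetic stat is a ratio of those sums,

  \[
    \frac{T_0}{T_1}
      = \frac{\sum_{i=1}^{N} x_{0,i}}{\sum_{i=1}^{N} x_{1,i}}
      = \frac{\tfrac{1}{N}\sum_{i} x_{0,i}}{\tfrac{1}{N}\sum_{i} x_{1,i}}
      \;\longrightarrow\; \frac{\mathbb{E}[x_0]}{\mathbb{E}[x_1]}
      \quad \text{as } N \to \infty,
  \]

so as the number of samples N in the interval grows, it matters less
and less whether x_{0,i} and x_{1,i} were taken on the same dispatch or
on adjacent dispatches.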
Change-Id: I36ebdcf125e8882cca8a1929ec58f17fba1ad8f1
Signed-off-by: Dave Barach <dave@barachs.net>
Diffstat (limited to 'src/vlib/main.c')
-rw-r--r--  src/vlib/main.c  86
1 file changed, 48 insertions(+), 38 deletions(-)
diff --git a/src/vlib/main.c b/src/vlib/main.c
index 23c4e076e1f..0e480fabe2a 100644
--- a/src/vlib/main.c
+++ b/src/vlib/main.c
@@ -543,15 +543,17 @@ never_inline void
 vlib_node_runtime_sync_stats (vlib_main_t * vm,
                               vlib_node_runtime_t * r,
                               uword n_calls, uword n_vectors, uword n_clocks,
-                              uword n_ticks)
+                              uword n_ticks0, uword n_ticks1)
 {
   vlib_node_t *n = vlib_get_node (vm, r->node_index);

   n->stats_total.calls += n_calls + r->calls_since_last_overflow;
   n->stats_total.vectors += n_vectors + r->vectors_since_last_overflow;
   n->stats_total.clocks += n_clocks + r->clocks_since_last_overflow;
-  n->stats_total.perf_counter_ticks += n_ticks +
-    r->perf_counter_ticks_since_last_overflow;
+  n->stats_total.perf_counter0_ticks += n_ticks0 +
+    r->perf_counter0_ticks_since_last_overflow;
+  n->stats_total.perf_counter1_ticks += n_ticks1 +
+    r->perf_counter1_ticks_since_last_overflow;
   n->stats_total.perf_counter_vectors += n_vectors +
     r->perf_counter_vectors_since_last_overflow;
   n->stats_total.max_clock = r->max_clock;
@@ -560,7 +562,8 @@ vlib_node_runtime_sync_stats (vlib_main_t * vm,
   r->calls_since_last_overflow = 0;
   r->vectors_since_last_overflow = 0;
   r->clocks_since_last_overflow = 0;
-  r->perf_counter_ticks_since_last_overflow = 0ULL;
+  r->perf_counter0_ticks_since_last_overflow = 0ULL;
+  r->perf_counter1_ticks_since_last_overflow = 0ULL;
   r->perf_counter_vectors_since_last_overflow = 0ULL;
 }

@@ -568,12 +571,12 @@ always_inline void __attribute__ ((unused))
 vlib_process_sync_stats (vlib_main_t * vm,
                          vlib_process_t * p,
                          uword n_calls, uword n_vectors, uword n_clocks,
-                         uword n_ticks)
+                         uword n_ticks0, uword n_ticks1)
 {
   vlib_node_runtime_t *rt = &p->node_runtime;
   vlib_node_t *n = vlib_get_node (vm, rt->node_index);
   vlib_node_runtime_sync_stats (vm, rt, n_calls, n_vectors, n_clocks,
-                                n_ticks);
+                                n_ticks0, n_ticks1);
   n->stats_total.suspends += p->n_suspends;
   p->n_suspends = 0;
 }
@@ -599,7 +602,7 @@ vlib_node_sync_stats (vlib_main_t * vm, vlib_node_t * n)
     vec_elt_at_index (vm->node_main.nodes_by_type[n->type],
                       n->runtime_index);

-  vlib_node_runtime_sync_stats (vm, rt, 0, 0, 0, 0);
+  vlib_node_runtime_sync_stats (vm, rt, 0, 0, 0, 0, 0);

   /* Sync up runtime next frame vector counters with main node structure. */
   {
@@ -620,27 +623,30 @@ vlib_node_runtime_update_stats (vlib_main_t * vm,
                                 vlib_node_runtime_t * node,
                                 uword n_calls,
                                 uword n_vectors, uword n_clocks,
-                                uword n_ticks)
+                                uword n_ticks0, uword n_ticks1)
 {
   u32 ca0, ca1, v0, v1, cl0, cl1, r;
-  u32 ptick0, ptick1, pvec0, pvec1;
+  u32 ptick00, ptick01, ptick10, ptick11, pvec0, pvec1;

   cl0 = cl1 = node->clocks_since_last_overflow;
   ca0 = ca1 = node->calls_since_last_overflow;
   v0 = v1 = node->vectors_since_last_overflow;
-  ptick0 = ptick1 = node->perf_counter_ticks_since_last_overflow;
+  ptick00 = ptick01 = node->perf_counter0_ticks_since_last_overflow;
+  ptick10 = ptick11 = node->perf_counter1_ticks_since_last_overflow;
   pvec0 = pvec1 = node->perf_counter_vectors_since_last_overflow;

   ca1 = ca0 + n_calls;
   v1 = v0 + n_vectors;
   cl1 = cl0 + n_clocks;
-  ptick1 = ptick0 + n_ticks;
+  ptick01 = ptick00 + n_ticks0;
+  ptick11 = ptick10 + n_ticks1;
   pvec1 = pvec0 + n_vectors;

   node->calls_since_last_overflow = ca1;
   node->clocks_since_last_overflow = cl1;
   node->vectors_since_last_overflow = v1;
-  node->perf_counter_ticks_since_last_overflow = ptick1;
+  node->perf_counter0_ticks_since_last_overflow = ptick01;
+  node->perf_counter1_ticks_since_last_overflow = ptick11;
   node->perf_counter_vectors_since_last_overflow = pvec1;

   node->max_clock_n = node->max_clock > n_clocks ?
@@ -649,38 +655,39 @@ vlib_node_runtime_update_stats (vlib_main_t * vm,

   r = vlib_node_runtime_update_main_loop_vector_stats (vm, node, n_vectors);

-  if (PREDICT_FALSE (ca1 < ca0 || v1 < v0 || cl1 < cl0) || (ptick1 < ptick0)
-      || (pvec1 < pvec0))
+  if (PREDICT_FALSE (ca1 < ca0 || v1 < v0 || cl1 < cl0) || (ptick01 < ptick00)
+      || (ptick11 < ptick10) || (pvec1 < pvec0))
     {
       node->calls_since_last_overflow = ca0;
       node->clocks_since_last_overflow = cl0;
       node->vectors_since_last_overflow = v0;
-      node->perf_counter_ticks_since_last_overflow = ptick0;
+      node->perf_counter0_ticks_since_last_overflow = ptick00;
+      node->perf_counter1_ticks_since_last_overflow = ptick10;
       node->perf_counter_vectors_since_last_overflow = pvec0;

       vlib_node_runtime_sync_stats (vm, node, n_calls, n_vectors, n_clocks,
-                                    n_ticks);
+                                    n_ticks0, n_ticks1);
     }

   return r;
 }

-static inline u64
-vlib_node_runtime_perf_counter (vlib_main_t * vm)
+static inline void
+vlib_node_runtime_perf_counter (vlib_main_t * vm, u64 * pmc0, u64 * pmc1)
 {
+  *pmc0 = 0;
+  *pmc1 = 0;
   if (PREDICT_FALSE (vm->vlib_node_runtime_perf_counter_cb != 0))
-    return ((*vm->vlib_node_runtime_perf_counter_cb) (vm));
-  return 0ULL;
+    (*vm->vlib_node_runtime_perf_counter_cb) (vm, pmc0, pmc1);
 }

 always_inline void
 vlib_process_update_stats (vlib_main_t * vm,
                            vlib_process_t * p,
-                           uword n_calls, uword n_vectors, uword n_clocks,
-                           uword n_ticks)
+                           uword n_calls, uword n_vectors, uword n_clocks)
 {
   vlib_node_runtime_update_stats (vm, &p->node_runtime,
-                                  n_calls, n_vectors, n_clocks, n_ticks);
+                                  n_calls, n_vectors, n_clocks, 0ULL, 0ULL);
 }

 static clib_error_t *
@@ -1098,6 +1105,8 @@ dispatch_pcap_trace (vlib_main_t * vm,
        }
     }

+u64 oingo0, oingo1;
+
 static_always_inline u64
 dispatch_node (vlib_main_t * vm,
                vlib_node_runtime_t * node,
@@ -1146,18 +1155,14 @@ dispatch_node (vlib_main_t * vm,

   if (1 /* || vm->thread_index == node->thread_index */ )
     {
-      u64 pmc_before, pmc_delta;
+      u64 pmc_before[2], pmc_after[2], pmc_delta[2];

       vlib_elog_main_loop_event (vm, node->node_index,
                                  last_time_stamp,
                                  frame ? frame->n_vectors : 0,
                                  /* is_after */ 0);

-      /*
-       * To validate accounting: pmc_before = last_time_stamp
-       * perf ticks should equal clocks/pkt...
-       */
-      pmc_before = vlib_node_runtime_perf_counter (vm);
+      vlib_node_runtime_perf_counter (vm, &pmc_before[0], &pmc_before[1]);

       /*
        * Turn this on if you run into
@@ -1191,7 +1196,10 @@ dispatch_node (vlib_main_t * vm,
       * To validate accounting: pmc_delta = t - pmc_before;
       * perf ticks should equal clocks/pkt...
       */
-      pmc_delta = vlib_node_runtime_perf_counter (vm) - pmc_before;
+      vlib_node_runtime_perf_counter (vm, &pmc_after[0], &pmc_after[1]);
+
+      pmc_delta[0] = pmc_after[0] - pmc_before[0];
+      pmc_delta[1] = pmc_after[1] - pmc_before[1];

       vlib_elog_main_loop_event (vm, node->node_index, t, n, /* is_after */ 1);

@@ -1199,11 +1207,18 @@ dispatch_node (vlib_main_t * vm,
       vm->main_loop_vectors_processed += n;
       vm->main_loop_nodes_processed += n > 0;

+      if (pmc_delta[0] || pmc_delta[1])
+        {
+          oingo0 += pmc_delta[0];
+          oingo1 += pmc_delta[1];
+        }
+
       v = vlib_node_runtime_update_stats (vm, node,
                                           /* n_calls */ 1,
                                           /* n_vectors */ n,
                                           /* n_clocks */ t - last_time_stamp,
-                                          pmc_delta /* PMC ticks */ );
+                                          pmc_delta[0] /* PMC0 */ ,
+                                          pmc_delta[1] /* PMC1 */ );

       /* When in interrupt mode and vector rate crosses threshold switch to
          polling mode. */
@@ -1542,8 +1557,7 @@ dispatch_process (vlib_main_t * vm,
   vlib_process_update_stats (vm, p,
                              /* n_calls */ !is_suspend,
                              /* n_vectors */ n_vectors,
-                             /* n_clocks */ t - last_time_stamp,
-                             /* pmc_ticks */ 0ULL);
+                             /* n_clocks */ t - last_time_stamp);

   return t;
 }
@@ -1626,8 +1640,7 @@ dispatch_suspended_process (vlib_main_t * vm,
   vlib_process_update_stats (vm, p,
                              /* n_calls */ !is_suspend,
                              /* n_vectors */ n_vectors,
-                             /* n_clocks */ t - last_time_stamp,
-                             /* pmc_ticks */ 0ULL);
+                             /* n_clocks */ t - last_time_stamp);

   return t;
 }
@@ -1677,9 +1690,6 @@ vlib_main_or_worker_loop (vlib_main_t * vm, int is_main)
   if (!nm->interrupt_threshold_vector_length)
     nm->interrupt_threshold_vector_length = 5;

-  /* Make sure the performance monitor counter is disabled */
-  vm->perf_counter_id = ~0;
-
   /* Start all processes. */
   if (is_main)
     {
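
For completeness, the other side of the new contract: a collector that
wants per-node counter data now supplies a callback that fills in both
values, hung off the vm->vlib_node_runtime_perf_counter_cb pointer seen
above; dispatch_node () calls it before and after each node and folds
the deltas into the perf_counter0_ticks / perf_counter1_ticks totals. A
minimal sketch, assuming a hypothetical read_pmc () wrapper around
whatever mechanism reads a programmed counter (the actual perfmon
plugin's counter programming and read path may differ):

  #include <vlib/vlib.h>

  /* Hypothetical: returns the value of programmed counter 'index';
   * stands in for rdpmc or a perf_event read, not part of this patch. */
  extern u64 read_pmc (int index);

  static void
  example_perf_counter_cb (vlib_main_t * vm, u64 * pmc0, u64 * pmc1)
  {
    *pmc0 = read_pmc (0);   /* first counter of the configured pair */
    *pmc1 = read_pmc (1);   /* second counter of the configured pair */
  }

  static void
  example_enable (vlib_main_t * vm)
  {
    /* Registration point is illustrative; once set, every node dispatch
     * is bracketed by a pair of two-counter reads. */
    vm->vlib_node_runtime_perf_counter_cb = example_perf_counter_cb;
  }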