/*
 * Copyright (c) 2017-2019 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vnet/session/application_namespace.h>
#include <vnet/session/application.h>
#include <vnet/session/session_table.h>
#include <vnet/session/session.h>
#include <vnet/fib/fib_table.h>
#include <vppinfra/file.h>
#include <vppinfra/format_table.h>
#include <vlib/unix/unix.h>

/**
 * Hash table of application namespaces by app ns ids
 */
uword *app_namespace_lookup_table;

/**
 * Pool of application namespaces
 */
static app_namespace_t *app_namespace_pool;

static u8 app_sapi_enabled;

app_namespace_t *
app_namespace_get (u32 index)
{
  return pool_elt_at_index (app_namespace_pool, index);
}

app_namespace_t *
app_namespace_get_from_id (const u8 *ns_id)
{
  u32 index = app_namespace_index_from_id (ns_id);
  if (index == APP_NAMESPACE_INVALID_INDEX)
    return 0;
  return app_namespace_get (index);
}

u32
app_namespace_index (app_namespace_t * app_ns)
{
  return (app_ns - app_namespace_pool);
}

void
app_namespace_free (app_namespace_t *app_ns)
{
  hash_unset_mem (app_namespace_lookup_table, app_ns->ns_id);
  vec_free (app_ns->ns_id);

  pool_put (app_namespace_pool, app_ns);
}

app_namespace_t *
app_namespace_alloc (const u8 *ns_id)
{
  app_namespace_t *app_ns;

  pool_get (app_namespace_pool, app_ns);
  clib_memset (app_ns, 0, sizeof (*app_ns));

  app_ns->ns_id = vec_dup ((u8 *) ns_id);
  vec_terminate_c_string (app_ns->ns_id);

  hash_set_mem (app_namespace_lookup_table, app_ns->ns_id,
		app_ns - app_namespace_pool);

  return app_ns;
}

int
vnet_app_namespace_add_del (vnet_app_namespace_add_del_args_t * a)
{
  app_namespace_t *app_ns;
  session_table_t *st;
  u32 ns_index;
  int rv;

  if (a->is_add)
    {
      if (a->sw_if_index != APP_NAMESPACE_INVALID_INDEX
	  && !vnet_get_sw_interface_or_null (vnet_get_main (),
					     a->sw_if_index))
	return VNET_API_ERROR_INVALID_SW_IF_INDEX;


      if (a->sw_if_index != APP_NAMESPACE_INVALID_INDEX)
	{
	  a->ip4_fib_id =
	    fib_table_get_table_id_for_sw_if_index (FIB_PROTOCOL_IP4,
						    a->sw_if_index);
	  a->ip6_fib_id =
	    fib_table_get_table_id_for_sw_if_index (FIB_PROTOCOL_IP6,
						    a->sw_if_index);
	}
      if (a->sw_if_index == APP_NAMESPACE_INVALID_INDEX
	  && a->ip4_fib_id == APP_NAMESPACE_INVALID_INDEX)
	return VNET_API_ERROR_INVALID_VALUE;

      app_ns = app_namespace_get_from_id (a->ns_id);
      if (!app_ns)
	{
	  app_ns = app_namespace_alloc (a->ns_id);
	  st = session_table_alloc ();
	  session_table_init (st, FIB_PROTOCOL_MAX);
	  st->is_local = 1;
	  st->appns_index = app_namespace_index (app_ns);
	  app_ns->local_table_index = session_table_index (st);
	  if (a->netns)
	    {
	      app_ns->netns = vec_dup (a->netns);
	      vec_terminate_c_string (app_ns->netns);
	    }
	  if (a->sock_name)
	    {
	      app_ns->sock_name = vec_dup (a->sock_name);
	      vec_terminate_c_string (app_ns->sock_name);
	    }

	  /* Add socket for namespace,
	   * only at creation time */
	  if (app_sapi_enabled)
	    {
	      rv = appns_sapi_add_ns_socket (app_ns);
	      if (rv)
		return rv;
	    }
	}

      app_ns->ns_secret = a->secret;
      app_ns->sw_if_index = a->sw_if_index;
      app_ns->ip4_fib_index =
	fib_table_find (FIB_PROTOCOL_IP4, a->ip4_fib_id);
      app_ns->ip6_fib_index =
	fib_table_find (FIB_PROTOCOL_IP6, a->ip6_fib_id);
      session_lookup_set_tables_appns (app_ns);

    }
  else
    {
      ns_index = app_namespace_index_from_id (a->ns_id);
      if (ns_index == APP_NAMESPACE_INVALID_INDEX)
	return VNET_API_ERROR_INVALID_VALUE;

      app_ns = app_namespace_get (ns_index);
      if (!app_ns)
	return VNET_API_ERROR_INVALID_VALUE;

      application_namespace_cleanup (app_ns);

      if (app_sapi_enabled)
	appns_sapi_del_ns_socket (app_ns);

      st = session_table_get (app_ns->local_table_index);

      session_table_free (st, FIB_PROTOCOL_MAX);
      if (app_ns->netns)
	vec_free (app_ns->netns);
      if (app_ns->sock_name)
	vec_free (app_ns->sock_name);

      app_namespace_free (app_ns);
    }

  return 0;
}

const u8 *
app_namespace_id (app_namespace_t * app_ns)
{
  return app_ns->ns_id;
}

u32
app_namespace_index_from_id (const u8 * ns_id)
{
  uword *indexp;
  u8 *key;

  key = vec_dup ((u8 *) ns_id);
  vec_terminate_c_string (key);

  indexp = hash_get_mem (app_namespace_lookup_table, key);
  vec_free (key);
  if (!indexp)
    return APP_NAMESPACE_INVALID_INDEX;
  return *indexp;
}

const u8 *
app_namespace_id_from_index (u32 index)
{
  app_namespace_t *app_ns;

  app_ns = app_namespace_get (index);
  return app_namespace_id (app_ns);
}

u32
app_namespace_get_fib_index (app_namespace_t * app_ns, u8 fib_proto)
{
  return fib_proto == FIB_PROTOCOL_IP4 ?
    app_ns->ip4_fib_index : app_ns->ip6_fib_index;
}

session_table_t *
app_namespace_get_local_table (app_namespace_t * app_ns)
{
  return session_table_get (app_ns->local_table_index);
}

int
appns_sapi_enable_disable (int is_enable)
{
  /* This cannot be called with active sockets */
  if (pool_elts (app_namespace_pool))
    return -1;

  app_sapi_enabled = is_enable;
  return 0;
}

u8
appns_sapi_enabled (void)
{
  return app_sapi_enabled;
}

void
app_namespaces_init (void)
{
  u8 *ns_id = format (0, "default");

  if (!app_namespace_lookup_table)
    app_namespace_lookup_table =
      hash_create_vec (0, sizeof (u8), sizeof (uword));

  /*
   * Allocate default namespace
   */

  /* clang-format off */
  vnet_app_namespace_add_del_args_t a = {
    .ns_id = ns_id,
    .netns = 0,
    .sock_name = 0,
    .secret = 0,
    .sw_if_index = APP_NAMESPACE_INVALID_INDEX,
    .is_add = 1
  };
  /* clang-format on */

  vnet_app_namespace_add_del (&a);
  vec_free (ns_id);
}

static clib_error_t *
app_ns_fn (vlib_main_t * vm, unformat_input_t * input,
	   vlib_cli_command_t * cmd)
{
  u8 is_add = 0, *ns_id = 0, secret_set = 0, sw_if_index_set = 0;
  u8 *netns = 0, *sock_name = 0;
  unformat_input_t _line_input, *line_input = &_line_input;
  u32 sw_if_index, fib_id = APP_NAMESPACE_INVALID_INDEX;
  vnet_main_t *vnm = vnet_get_main ();
  u64 secret;
  clib_error_t *error = 0;
  int rv;

  session_cli_return_if_not_enabled ();

  if (!unformat_user (input, unformat_line_input, line_input))
    return 0;

  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (line_input, "add"))
	is_add = 1;
      else if (unformat (line_input, "del"))
	is_add = 0;
      else if (unformat (line_input, "id %_%v%_", &ns_id))
	;
      else if (unformat (line_input, "secret %lu", &secret))
	secret_set = 1;
      else if (unformat (line_input, "sw_if_index %u", &sw_if_index))
	sw_if_index_set = 1;
      else if (unformat (line_input, "if %U", unformat_vnet_sw_interface, vnm,
			 &sw_if_index))
	sw_if_index_set = 1;
      else if (unformat (line_input, "fib_id", &fib_id))
	;
      else if (unformat (line_input, "netns %_%v%_", &netns))
	;
      else if (unformat (line_input, "sock-name %_%v%_", &sock_name))
	;
      else
	{
	  error = clib_error_return (0, "unknown input `%U'",
				     format_unformat_error, line_input);
	  goto done;
	}
    }

  if (!ns_id)
    {
      vlib_cli_output (vm, "namespace-id must be provided");
      goto done;
    }

  if (is_add && (!secret_set || !sw_if_index_set))
    {
      vlib_cli_output (vm, "secret and interface must be provided");
      goto done;
    }

  /* clang-format off */
  vnet_app_namespace_add_del_args_t args = {
    .ns_id = ns_id,
    .netns = netns,
    .secret = secret,
    .sw_if_index = sw_if_index,
    .sock_name = sock_name,
    .ip4_fib_id = fib_id,
    .is_add = is_add,
  };
  /* clang-format on */

  if ((rv = vnet_app_namespace_add_del (&args)))
    error = clib_error_return (0, "app namespace add del returned %d", rv);

done:

  vec_free (ns_id);
  vec_free (netns);
  vec_free (sock_name);
  unformat_free (line_input);

  return error;
}

/* *INDENT-OFF* */
VLIB_CLI_COMMAND (app_ns_command, static) = {
  .path = "app ns",
  .short_help = "app ns [add|del] id <namespace-id> secret <secret> "
		"sw_if_index <sw_if_index> if <interface> [netns <ns>]",
  .function = app_ns_fn,
};
/* *INDENT-ON* */
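
/*
 * Example usage (illustrative only; the namespace id, secret and interface
 * values below are hypothetical):
 *
 *   app ns add id red secret 1234 sw_if_index 1
 *   app ns add id blue secret 5678 if GigabitEthernet0/8/0 netns blue-ns
 *   app ns del id red
 *
 * For "add", a secret and an interface (sw_if_index or if) must be given;
 * "del" only requires the namespace id.
 */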

u8 *
format_app_namespace (u8 * s, va_list * args)
{
  app_namespace_t *app_ns = va_arg (*args, app_namespace_t *);
  vnet_main_t *vnm = vnet_get_main ();

  s = format (s, "Application namespace [%u]\nid:        %s\nsecret:    %lu",
	      app_namespace_index (app_ns), app_ns->ns_id, app_ns->ns_secret);
  if (app_ns->sw_if_index != (u32) ~0)
    s = format (s, "\nInterface: %U", format_vnet_sw_if_index_name, vnm,
		app_ns->sw_if_index);
  if (app_ns->netns)
    s = format (s, "\nNetns:     %s", app_ns->netns);
  if (app_ns->sock_name)
    s = format (s, "\nSocket:    %s", app_ns->sock_name);

  return s;
}

static void
app_namespace_show_api (vlib_main_t * vm, app_namespace_t * app_ns)
{
  app_ns_api_handle_t *handle;
  app_worker_t *app_wrk;
  clib_socket_t *cs;
  clib_file_t *cf;

  if (!app_sapi_enabled)
    {
      vlib_cli_output (vm, "app socket api not enabled!");
      return;
    }

  vlib_cli_output (vm, "socket: %v\n", app_ns->sock_name);

  if (!pool_elts (app_ns->app_sockets))
    return;

  vlib_cli_output (vm, "%12s%12s%5s", "app index", "wrk index", "fd");


  /* *INDENT-OFF* */
  pool_foreach (cs, app_ns->app_sockets)  {
    handle = (app_ns_api_handle_t *) &cs->private_data;
    cf = clib_file_get (&file_main, handle->aah_file_index);
    if (handle->aah_app_wrk_index == APP_INVALID_INDEX)
      {
	vlib_cli_output (vm, "%12d%12d%5u", -1, -1, cf->file_descriptor);
	continue;
      }
    app_wrk = app_worker_get (handle->aah_app_wrk_index);
    vlib_cli_output (vm, "%12d%12d%5u", app_wrk->app_index,
                     app_wrk->wrk_map_index, cf->file_descriptor);
  }
  /* *INDENT-ON* */
}

static clib_error_t *
show_app_ns_fn (vlib_main_t * vm, unformat_input_t * main_input,
		vlib_cli_command_t * cmd)
{
  unformat_input_t _line_input, *line_input = &_line_input;
  u8 *ns_id = 0, do_table = 0, had_input = 1, do_api = 0;
  app_namespace_t *app_ns;
  vnet_main_t *vnm = vnet_get_main ();
  session_table_t *st;
  table_t table = {}, *t = &table;

  session_cli_return_if_not_enabled ();

  if (!unformat_user (main_input, unformat_line_input, line_input))
    {
      had_input = 0;
      goto do_ns_list;
    }

  while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
    {
      if (unformat (line_input, "id %_%v%_", &ns_id))
	do_table = 1;
      else if (unformat (line_input, "api-clients"))
	do_api = 1;
      else
	{
	  vlib_cli_output (vm, "unknown input [%U]", format_unformat_error,
			   line_input);
	  goto done;
	}
    }

  if (do_api)
    {
      if (!do_table)
	{
	  vlib_cli_output (vm, "must specify a table for api");
	  goto done;
	}
      app_ns = app_namespace_get_from_id (ns_id);
      app_namespace_show_api (vm, app_ns);
      goto done;
    }
  if (do_table)
    {
      app_ns = app_namespace_get_from_id (ns_id);
      if (!app_ns)
	{
	  vlib_cli_output (vm, "ns %v not found", ns_id);
	  goto done;
	}
      st = session_table_get (app_ns->local_table_index);
      if (!st)
	{
	  vlib_cli_output (vm, "table for ns %v could not be found", ns_id);
	  goto done;
	}
      vlib_cli_output (vm, "%U", format_app_namespace, app_ns);
      session_lookup_show_table_entries (vm, st, 0, 1);
      vec_free (ns_id);
      goto done;
    }

do_ns_list:
  table_add_header_col (t, 6, "Index", "Secret", "Interface", "Id", "Netns",
			"Socket");
  int i = 0;
  pool_foreach (app_ns, app_namespace_pool)
    {
      int j = 0;
      table_format_cell (t, i, j++, "%u", app_namespace_index (app_ns));
      table_format_cell (t, i, j++, "%lu", app_ns->ns_secret);
      table_format_cell (t, i, j++, "%U", format_vnet_sw_if_index_name, vnm,
			 app_ns->sw_if_index);
      table_format_cell (t, i, j++, "%s", app_ns->ns_id);
      table_format_cell (t, i, j++, "%s", app_ns->netns);
      table_format_cell (t, i++, j++, "%s", app_ns->sock_name);
    }

  t->default_body.align = TTAA_LEFT;
  t->default_header_col.align = TTAA_LEFT;
  t->default_header_col.fg_color = TTAC_YELLOW;
  t->default_header_col.flags = TTAF_FG_COLOR_SET;
  vlib_cli_output (vm, "%U", format_table, t);
  table_free (t);

done:
  if (had_input)
    unformat_free (line_input);
  return 0;
}

/* *INDENT-OFF* */
VLIB_CLI_COMMAND (show_app_ns_command, static) = {
  .path = "show app ns",
  .short_help = "show app ns [id <id> [api-clients]]",
  .function = show_app_ns_fn,
};
/* *INDENT-ON* */
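
/*
 * Example usage (illustrative; "default" is the namespace created by
 * app_namespaces_init):
 *
 *   show app ns                           list all application namespaces
 *   show app ns id default                show one namespace and its local
 *                                         session table entries
 *   show app ns id default api-clients    show socket-api clients attached
 *                                         to that namespace
 */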

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */
f='#n2484'>2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609
/*
 * mc.c: vlib reliable sequenced multicast distributed applications
 *
 * Copyright (c) 2010 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vlib/vlib.h>

/*
 * 1 to enable msg id training wheels, which are useful for tracking
 * down catchup and/or partitioned network problems
 */
#define MSG_ID_DEBUG 0

static format_function_t format_mc_stream_state;

static u32
elog_id_for_peer_id (mc_main_t * m, u64 peer_id)
{
  uword *p, r;
  mhash_t *h = &m->elog_id_by_peer_id;

  if (!m->elog_id_by_peer_id.hash)
    mhash_init (h, sizeof (uword), sizeof (mc_peer_id_t));

  p = mhash_get (h, &peer_id);
  if (p)
    return p[0];
  r = elog_string (m->elog_main, "%U", m->transport.format_peer_id, peer_id);
  mhash_set (h, &peer_id, r, /* old_value */ 0);
  return r;
}

static u32
elog_id_for_msg_name (mc_main_t * m, char *msg_name)
{
  uword *p, r;
  uword *h = m->elog_id_by_msg_name;
  u8 *name_copy;

  if (!h)
    h = m->elog_id_by_msg_name = hash_create_string (0, sizeof (uword));

  p = hash_get_mem (h, msg_name);
  if (p)
    return p[0];
  r = elog_string (m->elog_main, "%s", msg_name);

  name_copy = format (0, "%s%c", msg_name, 0);

  hash_set_mem (h, name_copy, r);
  m->elog_id_by_msg_name = h;

  return r;
}

static void
elog_tx_msg (mc_main_t * m, u32 stream_id, u32 local_sequence,
	     u32 retry_count)
{
  if (MC_EVENT_LOGGING > 0)
    {
      /* *INDENT-OFF* */
      ELOG_TYPE_DECLARE (e) =
        {
          .format = "tx-msg: stream %d local seq %d attempt %d",
          .format_args = "i4i4i4",
        };
      /* *INDENT-ON* */
      struct
      {
	u32 stream_id, local_sequence, retry_count;
      } *ed;
      ed = ELOG_DATA (m->elog_main, e);
      ed->stream_id = stream_id;
      ed->local_sequence = local_sequence;
      ed->retry_count = retry_count;
    }
}

/*
 * seq_cmp
 * correctly compare two unsigned sequence numbers.
 * This function works so long as x and y are within 2**(n-1) of each
 * other, where n = bits(x, y).
 *
 * Magic decoder ring:
 * seq_cmp == 0 => x and y are equal
 * seq_cmp < 0 => x is "in the past" with respect to y
 * seq_cmp > 0 => x is "in the future" with respect to y
 */
always_inline i32
mc_seq_cmp (u32 x, u32 y)
{
  return (i32) x - (i32) y;
}
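
/*
 * Worked examples (values chosen for illustration):
 *   mc_seq_cmp (5, 3)            =>  2   5 is "in the future" of 3
 *   mc_seq_cmp (3, 5)            => -2   3 is "in the past" of 5
 *   mc_seq_cmp (0x0, 0xffffffff) =>  1   still correct across 32-bit
 *                                        wraparound, since both values are
 *                                        compared as i32
 */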

void *
mc_get_vlib_buffer (vlib_main_t * vm, u32 n_bytes, u32 * bi_return)
{
  u32 n_alloc, bi = 0;
  vlib_buffer_t *b;

  n_alloc = vlib_buffer_alloc (vm, &bi, 1);
  ASSERT (n_alloc == 1);

  b = vlib_get_buffer (vm, bi);
  b->current_length = n_bytes;
  *bi_return = bi;
  return (void *) b->data;
}

static void
delete_peer_with_index (mc_main_t * mcm, mc_stream_t * s,
			uword index, int notify_application)
{
  mc_stream_peer_t *p = pool_elt_at_index (s->peers, index);
  ASSERT (p != 0);
  if (s->config.peer_died && notify_application)
    s->config.peer_died (mcm, s, p->id);

  s->all_peer_bitmap = clib_bitmap_andnoti (s->all_peer_bitmap, p - s->peers);

  if (MC_EVENT_LOGGING > 0)
    {
      /* *INDENT-OFF* */
      ELOG_TYPE_DECLARE (e) =
        {
          .format = "delete peer %s from all_peer_bitmap",
          .format_args = "T4",
        };
      /* *INDENT-ON* */
      struct
      {
	u32 peer;
      } *ed = 0;

      ed = ELOG_DATA (mcm->elog_main, e);
      ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64);
    }
  /* Do not delete the pool / hash table entries, or we lose sequence number state */
}

static mc_stream_peer_t *
get_or_create_peer_with_id (mc_main_t * mcm,
			    mc_stream_t * s, mc_peer_id_t id, int *created)
{
  uword *q = mhash_get (&s->peer_index_by_id, &id);
  mc_stream_peer_t *p;

  if (q)
    {
      p = pool_elt_at_index (s->peers, q[0]);
      goto done;
    }

  pool_get (s->peers, p);
  memset (p, 0, sizeof (p[0]));
  p->id = id;
  p->last_sequence_received = ~0;
  mhash_set (&s->peer_index_by_id, &id, p - s->peers, /* old_value */ 0);
  if (created)
    *created = 1;

done:
  if (MC_EVENT_LOGGING > 0)
    {
      /* *INDENT-OFF* */
      ELOG_TYPE_DECLARE (e) =
        {
          .format = "get_or_create %s peer %s stream %d seq %d",
          .format_args = "t4T4i4i4",
          .n_enum_strings = 2,
          .enum_strings = {
            "old", "new",
          },
        };
      /* *INDENT-ON* */
      struct
      {
	u32 is_new, peer, stream_index, rx_sequence;
      } *ed = 0;

      ed = ELOG_DATA (mcm->elog_main, e);
      ed->is_new = q ? 0 : 1;
      ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64);
      ed->stream_index = s->index;
      ed->rx_sequence = p->last_sequence_received;
    }
  /* $$$$ Enable or reenable this peer */
  s->all_peer_bitmap = clib_bitmap_ori (s->all_peer_bitmap, p - s->peers);
  return p;
}

static void
maybe_send_window_open_event (vlib_main_t * vm, mc_stream_t * stream)
{
  vlib_one_time_waiting_process_t *p;

  if (pool_elts (stream->retry_pool) >= stream->config.window_size)
    return;

  vec_foreach (p, stream->procs_waiting_for_open_window)
    vlib_signal_one_time_waiting_process (vm, p);

  if (stream->procs_waiting_for_open_window)
    _vec_len (stream->procs_waiting_for_open_window) = 0;
}

static void
mc_retry_free (mc_main_t * mcm, mc_stream_t * s, mc_retry_t * r)
{
  mc_retry_t record, *retp;

  if (r->unacked_by_peer_bitmap)
    _vec_len (r->unacked_by_peer_bitmap) = 0;

  if (clib_fifo_elts (s->retired_fifo) >= 2 * s->config.window_size)
    {
      clib_fifo_sub1 (s->retired_fifo, record);
      vlib_buffer_free_one (mcm->vlib_main, record.buffer_index);
    }

  clib_fifo_add2 (s->retired_fifo, retp);

  retp->buffer_index = r->buffer_index;
  retp->local_sequence = r->local_sequence;

  r->buffer_index = ~0;		/* poison buffer index in this retry */
}

static void
mc_resend_retired (mc_main_t * mcm, mc_stream_t * s, u32 local_sequence)
{
  mc_retry_t *retry;

  if (MC_EVENT_LOGGING > 0)
    {
      /* *INDENT-OFF* */
      ELOG_TYPE_DECLARE (e) =
        {
          .format = "resend-retired: search for local seq %d",
          .format_args = "i4",
        };
      /* *INDENT-ON* */
      struct
      {
	u32 local_sequence;
      } *ed;
      ed = ELOG_DATA (mcm->elog_main, e);
      ed->local_sequence = local_sequence;
    }

  /* *INDENT-OFF* */
  clib_fifo_foreach (retry, s->retired_fifo,
  ({
    if (retry->local_sequence == local_sequence)
      {
        elog_tx_msg (mcm, s->index, retry->local_sequence, -13);
        mcm->transport.tx_buffer (mcm->transport.opaque,
                                  MC_TRANSPORT_USER_REQUEST_TO_RELAY,
                                  retry->buffer_index);
        return;
      }
  }));
  /* *INDENT-ON* */

  if (MC_EVENT_LOGGING > 0)
    {
      /* *INDENT-OFF* */
      ELOG_TYPE_DECLARE (e) =
        {
          .format = "resend-retired: FAILED search for local seq %d",
          .format_args = "i4",
        };
      /* *INDENT-ON* */
      struct
      {
	u32 local_sequence;
      } *ed;
      ed = ELOG_DATA (mcm->elog_main, e);
      ed->local_sequence = local_sequence;
    }
}

static uword *
delete_retry_fifo_elt (mc_main_t * mcm,
		       mc_stream_t * stream,
		       mc_retry_t * r, uword * dead_peer_bitmap)
{
  mc_stream_peer_t *p;

  /* *INDENT-OFF* */
  pool_foreach (p, stream->peers, ({
    uword pi = p - stream->peers;
    uword is_alive = 0 == clib_bitmap_get (r->unacked_by_peer_bitmap, pi);

    if (! is_alive)
      dead_peer_bitmap = clib_bitmap_ori (dead_peer_bitmap, pi);

    if (MC_EVENT_LOGGING > 0)
      {
        ELOG_TYPE_DECLARE (e) = {
          .format = "delete_retry_fifo_elt: peer %s is %s",
          .format_args = "T4t4",
          .n_enum_strings = 2,
          .enum_strings = { "alive", "dead", },
        };
        struct { u32 peer, is_alive; } * ed;
        ed = ELOG_DATA (mcm->elog_main, e);
        ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64);
        ed->is_alive = is_alive;
      }
  }));
  /* *INDENT-ON* */

  hash_unset (stream->retry_index_by_local_sequence, r->local_sequence);
  mc_retry_free (mcm, stream, r);

  return dead_peer_bitmap;
}

always_inline mc_retry_t *
prev_retry (mc_stream_t * s, mc_retry_t * r)
{
  return (r->prev_index != ~0
	  ? pool_elt_at_index (s->retry_pool, r->prev_index) : 0);
}

always_inline mc_retry_t *
next_retry (mc_stream_t * s, mc_retry_t * r)
{
  return (r->next_index != ~0
	  ? pool_elt_at_index (s->retry_pool, r->next_index) : 0);
}

always_inline void
remove_retry_from_pool (mc_stream_t * s, mc_retry_t * r)
{
  mc_retry_t *p = prev_retry (s, r);
  mc_retry_t *n = next_retry (s, r);

  if (p)
    p->next_index = r->next_index;
  else
    s->retry_head_index = r->next_index;
  if (n)
    n->prev_index = r->prev_index;
  else
    s->retry_tail_index = r->prev_index;

  pool_put_index (s->retry_pool, r - s->retry_pool);
}

static void
check_retry (mc_main_t * mcm, mc_stream_t * s)
{
  mc_retry_t *r;
  vlib_main_t *vm = mcm->vlib_main;
  f64 now = vlib_time_now (vm);
  uword *dead_peer_bitmap = 0;
  u32 ri, ri_next;

  for (ri = s->retry_head_index; ri != ~0; ri = ri_next)
    {
      r = pool_elt_at_index (s->retry_pool, ri);
      ri_next = r->next_index;

      if (now < r->sent_at + s->config.retry_interval)
	continue;

      r->n_retries += 1;
      if (r->n_retries > s->config.retry_limit)
	{
	  dead_peer_bitmap =
	    delete_retry_fifo_elt (mcm, s, r, dead_peer_bitmap);
	  remove_retry_from_pool (s, r);
	}
      else
	{
	  if (MC_EVENT_LOGGING > 0)
	    {
	      mc_stream_peer_t *p;

              /* *INDENT-OFF* */
	      ELOG_TYPE_DECLARE (t) =
                {
                  .format = "resend local seq %d attempt %d",
                  .format_args = "i4i4",
                };
              /* *INDENT-ON* */

              /* *INDENT-OFF* */
	      pool_foreach (p, s->peers, ({
		if (clib_bitmap_get (r->unacked_by_peer_bitmap, p - s->peers))
		  {
		    ELOG_TYPE_DECLARE (ev) = {
		      .format = "resend: needed by peer %s local seq %d",
		      .format_args = "T4i4",
		    };
		    struct { u32 peer, rx_sequence; } * ed;
		    ed = ELOG_DATA (mcm->elog_main, ev);
		    ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64);
		    ed->rx_sequence = r->local_sequence;
		  }
	      }));
              /* *INDENT-ON* */

	      struct
	      {
		u32 sequence;
		u32 trail;
	      } *ed;
	      ed = ELOG_DATA (mcm->elog_main, t);
	      ed->sequence = r->local_sequence;
	      ed->trail = r->n_retries;
	    }

	  r->sent_at = vlib_time_now (vm);
	  s->stats.n_retries += 1;

	  elog_tx_msg (mcm, s->index, r->local_sequence, r->n_retries);

	  mcm->transport.tx_buffer
	    (mcm->transport.opaque,
	     MC_TRANSPORT_USER_REQUEST_TO_RELAY, r->buffer_index);
	}
    }

  maybe_send_window_open_event (mcm->vlib_main, s);

  /* Delete any dead peers we've found. */
  if (!clib_bitmap_is_zero (dead_peer_bitmap))
    {
      uword i;

      /* *INDENT-OFF* */
      clib_bitmap_foreach (i, dead_peer_bitmap, ({
	delete_peer_with_index (mcm, s, i, /* notify_application */ 1);

	/* Delete any references to just deleted peer in retry pool. */
	pool_foreach (r, s->retry_pool, ({
	  r->unacked_by_peer_bitmap =
	    clib_bitmap_andnoti (r->unacked_by_peer_bitmap, i);
	}));
      }));
/* *INDENT-ON* */
      clib_bitmap_free (dead_peer_bitmap);
    }
}

always_inline mc_main_t *
mc_node_get_main (vlib_node_runtime_t * node)
{
  mc_main_t **p = (void *) node->runtime_data;
  return p[0];
}

static uword
mc_retry_process (vlib_main_t * vm,
		  vlib_node_runtime_t * node, vlib_frame_t * f)
{
  mc_main_t *mcm = mc_node_get_main (node);
  mc_stream_t *s;

  while (1)
    {
      vlib_process_suspend (vm, 1.0);
      vec_foreach (s, mcm->stream_vector)
      {
	if (s->state != MC_STREAM_STATE_invalid)
	  check_retry (mcm, s);
      }
    }
  return 0;			/* not likely */
}

static void
send_join_or_leave_request (mc_main_t * mcm, u32 stream_index, u32 is_join)
{
  vlib_main_t *vm = mcm->vlib_main;
  mc_msg_join_or_leave_request_t *mp;
  u32 bi;

  mp = mc_get_vlib_buffer (vm, sizeof (mp[0]), &bi);
  memset (mp, 0, sizeof (*mp));
  mp->type = MC_MSG_TYPE_join_or_leave_request;
  mp->peer_id = mcm->transport.our_ack_peer_id;
  mp->stream_index = stream_index;
  mp->is_join = is_join;

  mc_byte_swap_msg_join_or_leave_request (mp);

  /*
   * These msgs are unnumbered, unordered so send on the from-relay
   * channel.
   */
  mcm->transport.tx_buffer (mcm->transport.opaque, MC_TRANSPORT_JOIN, bi);
}

static uword
mc_join_ager_process (vlib_main_t * vm,
		      vlib_node_runtime_t * node, vlib_frame_t * f)
{
  mc_main_t *mcm = mc_node_get_main (node);

  while (1)
    {
      if (mcm->joins_in_progress)
	{
	  mc_stream_t *s;
	  vlib_one_time_waiting_process_t *p;
	  f64 now = vlib_time_now (vm);

	  vec_foreach (s, mcm->stream_vector)
	  {
	    if (s->state != MC_STREAM_STATE_join_in_progress)
	      continue;

	    if (now > s->join_timeout)
	      {
		s->state = MC_STREAM_STATE_ready;

		if (MC_EVENT_LOGGING > 0)
		  {
                    /* *INDENT-OFF* */
		    ELOG_TYPE_DECLARE (e) =
                      {
                        .format = "stream %d join timeout",
                      };
                    /* *INDENT-ON* */
		    ELOG (mcm->elog_main, e, s->index);
		  }
		/* Make sure that this app instance exists as a stream peer,
		   or we may answer a catchup request with a NULL
		   all_peer_bitmap... */
		(void) get_or_create_peer_with_id
		  (mcm, s, mcm->transport.our_ack_peer_id, /* created */ 0);

		vec_foreach (p, s->procs_waiting_for_join_done)
		  vlib_signal_one_time_waiting_process (vm, p);
		if (s->procs_waiting_for_join_done)
		  _vec_len (s->procs_waiting_for_join_done) = 0;

		mcm->joins_in_progress--;
		ASSERT (mcm->joins_in_progress >= 0);
	      }
	    else
	      {
		/* Resend the join request, which may have been lost. */
		send_join_or_leave_request (mcm, s->index, 1 /* is_join */ );

		/* We're *not* alone, retry for as long as it takes */
		if (mcm->relay_state == MC_RELAY_STATE_SLAVE)
		  s->join_timeout = vlib_time_now (vm) + 2.0;


		if (MC_EVENT_LOGGING > 0)
		  {
                    /* *INDENT-OFF* */
		    ELOG_TYPE_DECLARE (e) =
                      {
                        .format = "stream %d resend join request",
                      };
                    /* *INDENT-ON* */
		    ELOG (mcm->elog_main, e, s->index);
		  }
	      }
	  }
	}

      vlib_process_suspend (vm, .5);
    }

  return 0;			/* not likely */
}

static void
serialize_mc_register_stream_name (serialize_main_t * m, va_list * va)
{
  char *name = va_arg (*va, char *);
  serialize_cstring (m, name);
}

static void
elog_stream_name (char *buf, int n_buf_bytes, char *v)
{
  clib_memcpy (buf, v, clib_min (n_buf_bytes - 1, vec_len (v)));
  buf[n_buf_bytes - 1] = 0;
}

static void
unserialize_mc_register_stream_name (serialize_main_t * m, va_list * va)
{
  mc_main_t *mcm = va_arg (*va, mc_main_t *);
  char *name;
  mc_stream_t *s;
  uword *p;

  unserialize_cstring (m, &name);

  if ((p = hash_get_mem (mcm->stream_index_by_name, name)))
    {
      if (MC_EVENT_LOGGING > 0)
	{
          /* *INDENT-OFF* */
	  ELOG_TYPE_DECLARE (e) =
            {
              .format = "stream index %d already named %s",
              .format_args = "i4s16",
            };
          /* *INDENT-ON* */
	  struct
	  {
	    u32 stream_index;
	    char name[16];
	  } *ed;
	  ed = ELOG_DATA (mcm->elog_main, e);
	  ed->stream_index = p[0];
	  elog_stream_name (ed->name, sizeof (ed->name), name);
	}

      vec_free (name);
      return;
    }

  vec_add2 (mcm->stream_vector, s, 1);
  mc_stream_init (s);
  s->state = MC_STREAM_STATE_name_known;
  s->index = s - mcm->stream_vector;
  s->config.name = name;

  if (MC_EVENT_LOGGING > 0)
    {
      /* *INDENT-OFF* */
      ELOG_TYPE_DECLARE (e) =
        {
          .format = "stream index %d named %s",
          .format_args = "i4s16",
        };
      /* *INDENT-ON* */
      struct
      {
	u32 stream_index;
	char name[16];
      } *ed;
      ed = ELOG_DATA (mcm->elog_main, e);
      ed->stream_index = s->index;
      elog_stream_name (ed->name, sizeof (ed->name), name);
    }

  hash_set_mem (mcm->stream_index_by_name, name, s->index);

  p = hash_get (mcm->procs_waiting_for_stream_name_by_name, name);
  if (p)
    {
      vlib_one_time_waiting_process_t *wp, **w;
      w = pool_elt_at_index (mcm->procs_waiting_for_stream_name_pool, p[0]);
      vec_foreach (wp, w[0])
	vlib_signal_one_time_waiting_process (mcm->vlib_main, wp);
      pool_put (mcm->procs_waiting_for_stream_name_pool, w);
      hash_unset_mem (mcm->procs_waiting_for_stream_name_by_name, name);
    }
}

/* *INDENT-OFF* */
MC_SERIALIZE_MSG (mc_register_stream_name_msg, static) =
{
  .name = "mc_register_stream_name",
  .serialize = serialize_mc_register_stream_name,
  .unserialize = unserialize_mc_register_stream_name,
};
/* *INDENT-ON* */

void
mc_rx_buffer_unserialize (mc_main_t * mcm,
			  mc_stream_t * stream,
			  mc_peer_id_t peer_id, u32 buffer_index)
{
  return mc_unserialize (mcm, stream, buffer_index);
}

static u8 *
mc_internal_catchup_snapshot (mc_main_t * mcm,
			      u8 * data_vector,
			      u32 last_global_sequence_processed)
{
  serialize_main_t m;

  /* Append serialized data to data vector. */
  serialize_open_vector (&m, data_vector);
  m.stream.current_buffer_index = vec_len (data_vector);

  serialize (&m, serialize_mc_main, mcm);
  return serialize_close_vector (&m);
}

static void
mc_internal_catchup (mc_main_t * mcm, u8 * data, u32 n_data_bytes)
{
  serialize_main_t s;

  unserialize_open_data (&s, data, n_data_bytes);

  unserialize (&s, unserialize_mc_main, mcm);
}

/* Overridden from the application layer, not actually used here */
void mc_stream_join_process_hold (void) __attribute__ ((weak));
void
mc_stream_join_process_hold (void)
{
}

static u32
mc_stream_join_helper (mc_main_t * mcm,
		       mc_stream_config_t * config, u32 is_internal)
{
  mc_stream_t *s;
  vlib_main_t *vm = mcm->vlib_main;

  s = 0;
  if (!is_internal)
    {
      uword *p;

      /* Already have a stream with given name? */
      if ((s = mc_stream_by_name (mcm, config->name)))
	{
	  /* Already joined and ready? */
	  if (s->state == MC_STREAM_STATE_ready)
	    return s->index;
	}

      /* First join MC internal stream. */
      if (!mcm->stream_vector
	  || (mcm->stream_vector[MC_STREAM_INDEX_INTERNAL].state
	      == MC_STREAM_STATE_invalid))
	{
	  static mc_stream_config_t c = {
	    .name = "mc-internal",
	    .rx_buffer = mc_rx_buffer_unserialize,
	    .catchup = mc_internal_catchup,
	    .catchup_snapshot = mc_internal_catchup_snapshot,
	  };

	  c.save_snapshot = config->save_snapshot;

	  mc_stream_join_helper (mcm, &c, /* is_internal */ 1);
	}

      /* If stream is still unknown register this name and wait for
         sequenced message to name stream.  This way all peers agree
         on stream name to index mappings. */
      s = mc_stream_by_name (mcm, config->name);
      if (!s)
	{
	  vlib_one_time_waiting_process_t *wp, **w;
	  u8 *name_copy = format (0, "%s", config->name);

	  mc_serialize_stream (mcm,
			       MC_STREAM_INDEX_INTERNAL,
			       &mc_register_stream_name_msg, config->name);

	  /* Wait for this stream to be named. */
	  p =
	    hash_get_mem (mcm->procs_waiting_for_stream_name_by_name,
			  name_copy);
	  if (p)
	    w =
	      pool_elt_at_index (mcm->procs_waiting_for_stream_name_pool,
				 p[0]);
	  else
	    {
	      pool_get (mcm->procs_waiting_for_stream_name_pool, w);
	      if (!mcm->procs_waiting_for_stream_name_by_name)
		mcm->procs_waiting_for_stream_name_by_name =
		  hash_create_string (/* elts */ 0,
				      /* value size */ sizeof (uword));
	      hash_set_mem (mcm->procs_waiting_for_stream_name_by_name,
			    name_copy,
			    w - mcm->procs_waiting_for_stream_name_pool);
	      w[0] = 0;
	    }

	  vec_add2 (w[0], wp, 1);
	  vlib_current_process_wait_for_one_time_event (vm, wp);
	  vec_free (name_copy);
	}

      /* Name should be known now. */
      s = mc_stream_by_name (mcm, config->name);
      ASSERT (s != 0);
      ASSERT (s->state == MC_STREAM_STATE_name_known);
    }

  if (!s)
    {
      vec_add2 (mcm->stream_vector, s, 1);
      mc_stream_init (s);
      s->index = s - mcm->stream_vector;
    }

  {
    /* Save name since we could have already used it as hash key. */
    char *name_save = s->config.name;

    s->config = config[0];

    if (name_save)
      s->config.name = name_save;
  }

  if (s->config.window_size == 0)
    s->config.window_size = 8;

  if (s->config.retry_interval == 0.0)
    s->config.retry_interval = 1.0;

  /* Sanity. */
  ASSERT (s->config.retry_interval < 30);

  if (s->config.retry_limit == 0)
    s->config.retry_limit = 7;

  s->state = MC_STREAM_STATE_join_in_progress;
  if (!s->peer_index_by_id.hash)
    mhash_init (&s->peer_index_by_id, sizeof (uword), sizeof (mc_peer_id_t));

  /* If we don't hear from someone in 5 seconds, we're alone */
  s->join_timeout = vlib_time_now (vm) + 5.0;
  mcm->joins_in_progress++;

  if (MC_EVENT_LOGGING > 0)
    {
      /* *INDENT-OFF* */
      ELOG_TYPE_DECLARE (e) =
      {
        .format = "stream index %d join request %s",
        .format_args = "i4s16",
      };
      /* *INDENT-ON* */
      struct
      {
	u32 stream_index;
	char name[16];
      } *ed;
      ed = ELOG_DATA (mcm->elog_main, e);
      ed->stream_index = s->index;
      elog_stream_name (ed->name, sizeof (ed->name), s->config.name);
    }

  send_join_or_leave_request (mcm, s->index, 1 /* join */ );

  vlib_current_process_wait_for_one_time_event_vector
    (vm, &s->procs_waiting_for_join_done);

  if (MC_EVENT_LOGGING)
    {
      ELOG_TYPE (e, "join complete stream %d");
      ELOG (mcm->elog_main, e, s->index);
    }

  return s->index;
}

u32
mc_stream_join (mc_main_t * mcm, mc_stream_config_t * config)
{
  return mc_stream_join_helper (mcm, config, /* is_internal */ 0);
}
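
/*
 * Minimal join sketch (illustrative; the stream name and the callbacks are
 * hypothetical application code, not part of this file):
 *
 *   static mc_stream_config_t my_config = {
 *     .name = "my-app-stream",
 *     .window_size = 8,
 *     .rx_buffer = my_rx_buffer,
 *     .catchup = my_catchup,
 *     .catchup_snapshot = my_catchup_snapshot,
 *   };
 *   u32 my_stream_index = mc_stream_join (mcm, &my_config);
 *
 * rx_buffer is called for each sequenced message, catchup_snapshot produces
 * the state sent to a joining peer, and catchup applies that state on the
 * receiving side. mc_stream_join blocks the calling vlib process until the
 * join completes or times out, so it must run in process context.
 */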

void
mc_stream_leave (mc_main_t * mcm, u32 stream_index)
{
  mc_stream_t *s = mc_stream_by_index (mcm, stream_index);

  if (!s)
    return;

  if (MC_EVENT_LOGGING)
    {
      /* *INDENT-OFF* */
      ELOG_TYPE_DECLARE (t) =
        {
          .format = "leave-stream: %d",.format_args = "i4",
        };
      /* *INDENT-ON* */
      struct
      {
	u32 index;
      } *ed;
      ed = ELOG_DATA (mcm->elog_main, t);
      ed->index = stream_index;
    }

  send_join_or_leave_request (mcm, stream_index, 0 /* is_join */ );
  mc_stream_free (s);
  s->state = MC_STREAM_STATE_name_known;
}

void
mc_msg_join_or_leave_request_handler (mc_main_t * mcm,
				      mc_msg_join_or_leave_request_t * req,
				      u32 buffer_index)
{
  mc_stream_t *s;
  mc_msg_join_reply_t *rep;
  u32 bi;

  mc_byte_swap_msg_join_or_leave_request (req);

  s = mc_stream_by_index (mcm, req->stream_index);
  if (!s || s->state != MC_STREAM_STATE_ready)
    return;

  /* If the peer is joining, create it */
  if (req->is_join)
    {
      mc_stream_t *this_s;

      /* We're not in a position to catch up a peer until all
         stream joins are complete. */
      if (0)
	{
	  /* XXX This is hard to test, so it is disabled for now. */
	  vec_foreach (this_s, mcm->stream_vector)
	  {
	    if (this_s->state != MC_STREAM_STATE_ready
		&& this_s->state != MC_STREAM_STATE_name_known)
	      return;
	  }
	}
      else if (mcm->joins_in_progress > 0)
	return;

      (void) get_or_create_peer_with_id (mcm, s, req->peer_id,
					 /* created */ 0);

      rep = mc_get_vlib_buffer (mcm->vlib_main, sizeof (rep[0]), &bi);
      memset (rep, 0, sizeof (rep[0]));
      rep->type = MC_MSG_TYPE_join_reply;
      rep->stream_index = req->stream_index;

      mc_byte_swap_msg_join_reply (rep);
      /* These two are already in network byte order... */
      rep->peer_id = mcm->transport.our_ack_peer_id;
      rep->catchup_peer_id = mcm->transport.our_catchup_peer_id;

      mcm->transport.tx_buffer (mcm->transport.opaque, MC_TRANSPORT_JOIN, bi);
    }
  else
    {
      if (s->config.peer_died)
	s->config.peer_died (mcm, s, req->peer_id);
    }
}

void
mc_msg_join_reply_handler (mc_main_t * mcm,
			   mc_msg_join_reply_t * mp, u32 buffer_index)
{
  mc_stream_t *s;

  mc_byte_swap_msg_join_reply (mp);

  s = mc_stream_by_index (mcm, mp->stream_index);

  if (!s || s->state != MC_STREAM_STATE_join_in_progress)
    return;

  /* Switch to catchup state; next join reply
     for this stream will be ignored. */
  s->state = MC_STREAM_STATE_catchup;

  mcm->joins_in_progress--;
  mcm->transport.catchup_request_fun (mcm->transport.opaque,
				      mp->stream_index, mp->catchup_peer_id);
}

void
mc_wait_for_stream_ready (mc_main_t * m, char *stream_name)
{
  mc_stream_t *s;

  while (1)
    {
      s = mc_stream_by_name (m, stream_name);
      if (s)
	break;
      vlib_process_suspend (m->vlib_main, .1);
    }

  /* It's OK to send a message in catchup and ready states. */
  if (s->state == MC_STREAM_STATE_catchup
      || s->state == MC_STREAM_STATE_ready)
    return;

  /* Otherwise we are waiting for a join to finish. */
  vlib_current_process_wait_for_one_time_event_vector
    (m->vlib_main, &s->procs_waiting_for_join_done);
}

u32
mc_stream_send (mc_main_t * mcm, u32 stream_index, u32 buffer_index)
{
  mc_stream_t *s = mc_stream_by_index (mcm, stream_index);
  vlib_main_t *vm = mcm->vlib_main;
  mc_retry_t *r;
  mc_msg_user_request_t *mp;
  vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index);
  u32 ri;

  if (!s)
    return 0;

  if (s->state != MC_STREAM_STATE_ready)
    vlib_current_process_wait_for_one_time_event_vector
      (vm, &s->procs_waiting_for_join_done);

  while (pool_elts (s->retry_pool) >= s->config.window_size)
    {
      vlib_current_process_wait_for_one_time_event_vector
	(vm, &s->procs_waiting_for_open_window);
    }

  pool_get (s->retry_pool, r);
  ri = r - s->retry_pool;

  r->prev_index = s->retry_tail_index;
  r->next_index = ~0;
  s->retry_tail_index = ri;

  if (r->prev_index == ~0)
    s->retry_head_index = ri;
  else
    {
      mc_retry_t *p = pool_elt_at_index (s->retry_pool, r->prev_index);
      p->next_index = ri;
    }

  vlib_buffer_advance (b, -sizeof (mp[0]));
  mp = vlib_buffer_get_current (b);

  mp->peer_id = mcm->transport.our_ack_peer_id;
  /* mp->global_sequence is set by the relay agent. */
  mp->global_sequence = 0xdeadbeef;
  mp->stream_index = s->index;
  mp->local_sequence = s->our_local_sequence++;
  mp->n_data_bytes =
    vlib_buffer_index_length_in_chain (vm, buffer_index) - sizeof (mp[0]);

  r->buffer_index = buffer_index;
  r->local_sequence = mp->local_sequence;
  r->sent_at = vlib_time_now (vm);
  r->n_retries = 0;

  /* Retry will be freed when all currently known peers have acked. */
  vec_validate (r->unacked_by_peer_bitmap, vec_len (s->all_peer_bitmap) - 1);
  vec_copy (r->unacked_by_peer_bitmap, s->all_peer_bitmap);

  hash_set (s->retry_index_by_local_sequence, r->local_sequence,
	    r - s->retry_pool);

  elog_tx_msg (mcm, s->index, mp->local_sequence, r->n_retries);

  mc_byte_swap_msg_user_request (mp);

  mcm->transport.tx_buffer (mcm->transport.opaque,
			    MC_TRANSPORT_USER_REQUEST_TO_RELAY, buffer_index);

  s->user_requests_sent++;

  /* return amount of window remaining */
  return s->config.window_size - pool_elts (s->retry_pool);
}
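
/*
 * Typical send sketch (illustrative; my_msg_t and my_stream_index are
 * hypothetical):
 *
 *   u32 bi;
 *   my_msg_t *mp = mc_get_vlib_buffer (vm, sizeof (mp[0]), &bi);
 *   ... fill in mp ...
 *   mc_stream_send (mcm, my_stream_index, bi);
 *
 * mc_stream_send prepends the mc_msg_user_request_t header itself and may
 * suspend the caller while the retry window is full, so it too must run in
 * process context.
 */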

void
mc_msg_user_request_handler (mc_main_t * mcm, mc_msg_user_request_t * mp,
			     u32 buffer_index)
{
  vlib_main_t *vm = mcm->vlib_main;
  mc_stream_t *s;
  mc_stream_peer_t *peer;
  i32 seq_cmp_result;
  static int once = 0;

  mc_byte_swap_msg_user_request (mp);

  s = mc_stream_by_index (mcm, mp->stream_index);

  /* Not signed up for this stream? Turf-o-matic */
  if (!s || s->state != MC_STREAM_STATE_ready)
    {
      vlib_buffer_free_one (vm, buffer_index);
      return;
    }

  /* Find peer, including ourselves. */
  peer = get_or_create_peer_with_id (mcm, s, mp->peer_id,
				     /* created */ 0);

  seq_cmp_result = mc_seq_cmp (mp->local_sequence,
			       peer->last_sequence_received + 1);

  if (MC_EVENT_LOGGING > 0)
    {
      /* *INDENT-OFF* */
      ELOG_TYPE_DECLARE (e) =
        {
          .format = "rx-msg: peer %s stream %d rx seq %d seq_cmp %d",
          .format_args = "T4i4i4i4",
        };
      /* *INDENT-ON* */
      struct
      {
	u32 peer, stream_index, rx_sequence;
	i32 seq_cmp_result;
      } *ed;
      ed = ELOG_DATA (mcm->elog_main, e);
      ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64);
      ed->stream_index = mp->stream_index;
      ed->rx_sequence = mp->local_sequence;
      ed->seq_cmp_result = seq_cmp_result;
    }

  if (0 && mp->stream_index == 1 && once == 0)
    {
      once = 1;
      ELOG_TYPE (e, "FAKE lost msg on stream 1");
      ELOG (mcm->elog_main, e, 0);
      return;
    }

  peer->last_sequence_received += seq_cmp_result == 0;
  s->user_requests_received++;

  if (seq_cmp_result > 0)
    peer->stats.n_msgs_from_future += 1;

  /* Send ack even if msg from future */
  if (1)
    {
      mc_msg_user_ack_t *rp;
      u32 bi;

      rp = mc_get_vlib_buffer (vm, sizeof (rp[0]), &bi);
      rp->peer_id = mcm->transport.our_ack_peer_id;
      rp->stream_index = s->index;
      rp->local_sequence = mp->local_sequence;
      rp->seq_cmp_result = seq_cmp_result;

      if (MC_EVENT_LOGGING > 0)
	{
          /* *INDENT-OFF* */
	  ELOG_TYPE_DECLARE (e) =
            {
              .format = "tx-ack: stream %d local seq %d",
              .format_args = "i4i4",
            };
          /* *INDENT-ON* */
	  struct
	  {
	    u32 stream_index;
	    u32 local_sequence;
	  } *ed;
	  ed = ELOG_DATA (mcm->elog_main, e);
	  ed->stream_index = rp->stream_index;
	  ed->local_sequence = rp->local_sequence;
	}

      mc_byte_swap_msg_user_ack (rp);

      mcm->transport.tx_ack (mcm->transport.opaque, mp->peer_id, bi);
      /* Msg from past? If so, free the buffer... */
      if (seq_cmp_result < 0)
	{
	  vlib_buffer_free_one (vm, buffer_index);
	  peer->stats.n_msgs_from_past += 1;
	}
    }

  if (seq_cmp_result == 0)
    {
      vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index);
      switch (s->state)
	{
	case MC_STREAM_STATE_ready:
	  vlib_buffer_advance (b, sizeof (mp[0]));
	  s->config.rx_buffer (mcm, s, mp->peer_id, buffer_index);

	  /* Stream vector can change address via rx callback for mc-internal
	     stream. */
	  s = mc_stream_by_index (mcm, mp->stream_index);
	  ASSERT (s != 0);
	  s->last_global_sequence_processed = mp->global_sequence;
	  break;

	case MC_STREAM_STATE_catchup:
	  clib_fifo_add1 (s->catchup_fifo, buffer_index);
	  break;

	default:
	  clib_warning ("stream in unknown state %U",
			format_mc_stream_state, s->state);
	  break;
	}
    }
}

void
mc_msg_user_ack_handler (mc_main_t * mcm, mc_msg_user_ack_t * mp,
			 u32 buffer_index)
{
  vlib_main_t *vm = mcm->vlib_main;
  uword *p;
  mc_stream_t *s;
  mc_stream_peer_t *peer;
  mc_retry_t *r;
  int peer_created = 0;

  mc_byte_swap_msg_user_ack (mp);

  s = mc_stream_by_index (mcm, mp->stream_index);

  if (MC_EVENT_LOGGING > 0)
    {
      /* *INDENT-OFF* */
      ELOG_TYPE_DECLARE (t) =
        {
          .format = "rx-ack: local seq %d peer %s seq_cmp_result %d",
          .format_args = "i4T4i4",
        };
      /* *INDENT-ON* */

      struct
      {
	u32 local_sequence;
	u32 peer;
	i32 seq_cmp_result;
      } *ed;
      ed = ELOG_DATA (mcm->elog_main, t);
      ed->local_sequence = mp->local_sequence;
      ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64);
      ed->seq_cmp_result = mp->seq_cmp_result;
    }

  /* Unknown stream? */
  if (!s)
    return;

  /* Find the peer which just ack'ed. */
  peer = get_or_create_peer_with_id (mcm, s, mp->peer_id,
				     /* created */ &peer_created);

  /*
   * Peer reports message from the future. If it's not in the retry
   * fifo, look for a retired message.
   */
  if (mp->seq_cmp_result > 0)
    {
      p = hash_get (s->retry_index_by_local_sequence, mp->local_sequence -
		    mp->seq_cmp_result);
      if (p == 0)
	mc_resend_retired (mcm, s, mp->local_sequence - mp->seq_cmp_result);

      /* Normal retry should fix it... */
      return;
    }

  /*
   * Pointer to the indicated retry fifo entry.
   * Worth hashing because we could use a window size of 100 or 1000.
   */
  p = hash_get (s->retry_index_by_local_sequence, mp->local_sequence);

  /*
   * Is this a duplicate ACK, received after we've retired the
   * fifo entry? This can happen when learning about new
   * peers.
   */
  if (p == 0)
    {
      if (MC_EVENT_LOGGING > 0)
	{
          /* *INDENT-OFF* */
	  ELOG_TYPE_DECLARE (t) =
            {
              .format = "ack: for seq %d from peer %s no fifo elt",
              .format_args = "i4T4",
            };
          /* *INDENT-ON* */

	  struct
	  {
	    u32 seq;
	    u32 peer;
	  } *ed;
	  ed = ELOG_DATA (mcm->elog_main, t);
	  ed->seq = mp->local_sequence;
	  ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64);
	}

      return;
    }

  r = pool_elt_at_index (s->retry_pool, p[0]);

  /* Make sure that this new peer ACKs our msgs from now on */
  if (peer_created)
    {
      mc_retry_t *later_retry = next_retry (s, r);

      while (later_retry)
	{
	  later_retry->unacked_by_peer_bitmap =
	    clib_bitmap_ori (later_retry->unacked_by_peer_bitmap,
			     peer - s->peers);
	  later_retry = next_retry (s, later_retry);
	}
    }

  ASSERT (mp->local_sequence == r->local_sequence);

  /* If we weren't expecting to hear from this peer */
  if (!peer_created &&
      !clib_bitmap_get (r->unacked_by_peer_bitmap, peer - s->peers))
    {
      if (MC_EVENT_LOGGING > 0)
	{
          /* *INDENT-OFF* */
	  ELOG_TYPE_DECLARE (t) =
            {
              .format = "dup-ack: for seq %d from peer %s",
              .format_args = "i4T4",
            };
          /* *INDENT-ON* */
	  struct
	  {
	    u32 seq;
	    u32 peer;
	  } *ed;
	  ed = ELOG_DATA (mcm->elog_main, t);
	  ed->seq = r->local_sequence;
	  ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64);
	}
      if (!clib_bitmap_is_zero (r->unacked_by_peer_bitmap))
	return;
    }

  if (MC_EVENT_LOGGING > 0)
    {
      /* *INDENT-OFF* */
      ELOG_TYPE_DECLARE (t) =
        {
          .format = "ack: for seq %d from peer %s",
          .format_args = "i4T4",
        };
      /* *INDENT-ON* */
      struct
      {
	u32 seq;
	u32 peer;
      } *ed;
      ed = ELOG_DATA (mcm->elog_main, t);
      ed->seq = mp->local_sequence;
      ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64);
    }

  r->unacked_by_peer_bitmap =
    clib_bitmap_andnoti (r->unacked_by_peer_bitmap, peer - s->peers);

  /* Not all clients have ack'ed */
  if (!clib_bitmap_is_zero (r->unacked_by_peer_bitmap))
    {
      return;
    }
  if (MC_EVENT_LOGGING > 0)
    {
      /* *INDENT-OFF* */
      ELOG_TYPE_DECLARE (t) =
        {
          .format = "ack: retire fifo elt loc seq %d after %d acks",
          .format_args = "i4i4",
        };
      /* *INDENT-ON* */
      struct
      {
	u32 seq;
	u32 npeers;
      } *ed;
      ed = ELOG_DATA (mcm->elog_main, t);
      ed->seq = r->local_sequence;
      ed->npeers = pool_elts (s->peers);
    }

  hash_unset (s->retry_index_by_local_sequence, mp->local_sequence);
  mc_retry_free (mcm, s, r);
  remove_retry_from_pool (s, r);
  maybe_send_window_open_event (vm, s);
}

#define EVENT_MC_SEND_CATCHUP_DATA 0

static uword
mc_catchup_process (vlib_main_t * vm,
		    vlib_node_runtime_t * node, vlib_frame_t * f)
{
  mc_main_t *mcm = mc_node_get_main (node);
  uword *event_data = 0;
  mc_catchup_process_arg_t *args;
  int i;

  while (1)
    {
      if (event_data)
	_vec_len (event_data) = 0;
      vlib_process_wait_for_event_with_type (vm, &event_data,
					     EVENT_MC_SEND_CATCHUP_DATA);

      for (i = 0; i < vec_len (event_data); i++)
	{
	  args = pool_elt_at_index (mcm->catchup_process_args, event_data[i]);

	  mcm->transport.catchup_send_fun (mcm->transport.opaque,
					   args->catchup_opaque,
					   args->catchup_snapshot);

	  /* Send function will free snapshot data vector. */
	  pool_put (mcm->catchup_process_args, args);
	}
    }

  return 0;			/* not likely */
}

static void
serialize_mc_stream (serialize_main_t * m, va_list * va)
{
  mc_stream_t *s = va_arg (*va, mc_stream_t *);
  mc_stream_peer_t *p;

  serialize_integer (m, pool_elts (s->peers), sizeof (u32));
  /* *INDENT-OFF* */
  pool_foreach (p, s->peers, ({
    u8 * x = serialize_get (m, sizeof (p->id));
    clib_memcpy (x, p->id.as_u8, sizeof (p->id));
    serialize_integer (m, p->last_sequence_received,
                       sizeof (p->last_sequence_received));
  }));
/* *INDENT-ON* */
  serialize_bitmap (m, s->all_peer_bitmap);
}
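
/*
 * Serialized stream layout produced above and consumed by
 * unserialize_mc_stream below:
 *   u32 number_of_peers
 *   per peer: the raw peer id bytes (sizeof (p->id)) followed by
 *             last_sequence_received (sizeof (p->last_sequence_received))
 *   the all_peer_bitmap
 */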

void
unserialize_mc_stream (serialize_main_t * m, va_list * va)
{
  mc_stream_t *s = va_arg (*va, mc_stream_t *);
  u32 i, n_peers;
  mc_stream_peer_t *p;

  unserialize_integer (m, &n_peers, sizeof (u32));
  mhash_init (&s->peer_index_by_id, sizeof (uword), sizeof (mc_peer_id_t));
  for (i = 0; i < n_peers; i++)
    {
      u8 *x;
      pool_get (s->peers, p);
      x = unserialize_get (m, sizeof (p->id));
      clib_memcpy (p->id.as_u8, x, sizeof (p->id));
      unserialize_integer (m, &p->last_sequence_received,
			   sizeof (p->last_sequence_received));
      mhash_set (&s->peer_index_by_id, &p->id, p - s->peers,	/* old_value */
		 0);
    }
  s->all_peer_bitmap = unserialize_bitmap (m);

  /* This should not happen; see the all_peer_bitmap note in
     mc_join_ager_process. */
  if (!s->all_peer_bitmap)
    clib_warning ("BUG: stream %s all_peer_bitmap NULL", s->config.name);
}

void
mc_msg_catchup_request_handler (mc_main_t * mcm,
				mc_msg_catchup_request_t * req,
				u32 catchup_opaque)
{
  vlib_main_t *vm = mcm->vlib_main;
  mc_stream_t *s;
  mc_catchup_process_arg_t *args;

  mc_byte_swap_msg_catchup_request (req);

  s = mc_stream_by_index (mcm, req->stream_index);
  if (!s || s->state != MC_STREAM_STATE_ready)
    return;

  if (MC_EVENT_LOGGING > 0)
    {
      /* *INDENT-OFF* */
      ELOG_TYPE_DECLARE (t) =
        {
          .format = "catchup-request: from %s stream %d",
          .format_args = "T4i4",
        };
      /* *INDENT-ON* */
      struct
      {
	u32 peer, stream;
      } *ed;
      ed = ELOG_DATA (mcm->elog_main, t);
      ed->peer = elog_id_for_peer_id (mcm, req->peer_id.as_u64);
      ed->stream = req->stream_index;
    }

  /*
   * The application has to snapshot its data structures right
   * here, right now. If we process any messages after
   * noting the last global sequence we've processed, the client
   * won't be able to accurately reconstruct our data structures.
   *
   * Once the data structures are e.g. vec_dup()'ed, we
   * send the resulting messages from a separate process, to
   * make sure that we don't cause a bunch of message retransmissions
   */
  pool_get (mcm->catchup_process_args, args);

  args->stream_index = s - mcm->stream_vector;
  args->catchup_opaque = catchup_opaque;
  args->catchup_snapshot = 0;

  /* Construct catchup reply and snapshot state for stream to send as
     catchup reply payload. */
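  /*
   * Resulting catchup reply payload layout, as consumed by perform_catchup:
   *   [ mc_msg_catchup_reply_t header ]
   *   [ serialized stream peer state (serialize_mc_stream) ]
   *   [ application catchup data appended by config.catchup_snapshot ]
   */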
  {
    mc_msg_catchup_reply_t *rep;
    serialize_main_t m;

    vec_resize (args->catchup_snapshot, sizeof (rep[0]));

    rep = (void *) args->catchup_snapshot;

    rep->peer_id = req->peer_id;
    rep->stream_index = req->stream_index;
    rep->last_global_sequence_included = s->last_global_sequence_processed;

    /* Setup for serialize to append to catchup snapshot. */
    serialize_open_vector (&m, args->catchup_snapshot);
    m.stream.current_buffer_index = vec_len (m.stream.buffer);

    serialize (&m, serialize_mc_stream, s);

    args->catchup_snapshot = serialize_close_vector (&m);

    /* Actually copy internal state */
    args->catchup_snapshot = s->config.catchup_snapshot
      (mcm, args->catchup_snapshot, rep->last_global_sequence_included);

    rep = (void *) args->catchup_snapshot;
    rep->n_data_bytes = vec_len (args->catchup_snapshot) - sizeof (rep[0]);

    mc_byte_swap_msg_catchup_reply (rep);
  }

  /* now go send it... */
  vlib_process_signal_event (vm, mcm->catchup_process,
			     EVENT_MC_SEND_CATCHUP_DATA,
			     args - mcm->catchup_process_args);
}

#define EVENT_MC_UNSERIALIZE_BUFFER 0
#define EVENT_MC_UNSERIALIZE_CATCHUP 1

void
mc_msg_catchup_reply_handler (mc_main_t * mcm, mc_msg_catchup_reply_t * mp,
			      u32 catchup_opaque)
{
  vlib_process_signal_event (mcm->vlib_main,
			     mcm->unserialize_process,
			     EVENT_MC_UNSERIALIZE_CATCHUP,
			     pointer_to_uword (mp));
}

static void
perform_catchup (mc_main_t * mcm, mc_msg_catchup_reply_t * mp)
{
  mc_stream_t *s;
  i32 seq_cmp_result;

  mc_byte_swap_msg_catchup_reply (mp);

  s = mc_stream_by_index (mcm, mp->stream_index);

  /* Never heard of this stream or already caught up. */
  if (!s || s->state == MC_STREAM_STATE_ready)
    return;

  {
    serialize_main_t m;
    mc_stream_peer_t *p;
    u32 n_stream_bytes;

    /* For offline sim replay: save the entire catchup snapshot... */
    if (s->config.save_snapshot)
      s->config.save_snapshot (mcm, /* is_catchup */ 1, mp->data,
			       mp->n_data_bytes);

    unserialize_open_data (&m, mp->data, mp->n_data_bytes);
    unserialize (&m, unserialize_mc_stream, s);

    /* Make sure we start numbering our messages as expected */
    /* *INDENT-OFF* */
    pool_foreach (p, s->peers, ({
      if (p->id.as_u64 == mcm->transport.our_ack_peer_id.as_u64)
        s->our_local_sequence = p->last_sequence_received + 1;
    }));
/* *INDENT-ON* */

    n_stream_bytes = m.stream.current_buffer_index;

    /* No need to unserialize close; nothing to free. */

    /* The user's catchup data follows the serialized stream state. */
    s->config.catchup (mcm, mp->data + n_stream_bytes,
		       mp->n_data_bytes - n_stream_bytes);
  }

  /* Vector could have been moved by catchup.
     This can only happen for mc-internal stream. */
  s = mc_stream_by_index (mcm, mp->stream_index);

  s->last_global_sequence_processed = mp->last_global_sequence_included;

  while (clib_fifo_elts (s->catchup_fifo))
    {
      mc_msg_user_request_t *gp;
      u32 bi;
      vlib_buffer_t *b;

      clib_fifo_sub1 (s->catchup_fifo, bi);

      b = vlib_get_buffer (mcm->vlib_main, bi);
      gp = vlib_buffer_get_current (b);

      /* Only replay messages newer than the snapshot. */
      seq_cmp_result = mc_seq_cmp (gp->global_sequence,
				   mp->last_global_sequence_included);

      if (seq_cmp_result > 0)
	{
	  vlib_buffer_advance (b, sizeof (gp[0]));
	  s->config.rx_buffer (mcm, s, gp->peer_id, bi);
	  s->last_global_sequence_processed = gp->global_sequence;

	  if (MC_EVENT_LOGGING)
	    {
              /* *INDENT-OFF* */
	      ELOG_TYPE_DECLARE (t) =
                {
                  .format = "catchup replay local sequence 0x%x",
                  .format_args = "i4",
                };
              /* *INDENT-ON* */
	      struct
	      {
		u32 local_sequence;
	      } *ed;
	      ed = ELOG_DATA (mcm->elog_main, t);
	      ed->local_sequence = gp->local_sequence;
	    }
	}
      else
	{
	  if (MC_EVENT_LOGGING)
	    {
              /* *INDENT-OFF* */
	      ELOG_TYPE_DECLARE (t) =
                {
                  .format = "catchup discard local sequence 0x%x",
                  .format_args = "i4",
                };
              /* *INDENT-ON* */
	      struct
	      {
		u32 local_sequence;
	      } *ed;
	      ed = ELOG_DATA (mcm->elog_main, t);
	      ed->local_sequence = gp->local_sequence;
	    }

	  vlib_buffer_free_one (mcm->vlib_main, bi);
	}
    }

  s->state = MC_STREAM_STATE_ready;

  /* Now that we are caught up, wake up any processes waiting for the
     join to complete. */
  {
    vlib_one_time_waiting_process_t *wp;
    vec_foreach (wp, s->procs_waiting_for_join_done)
      vlib_signal_one_time_waiting_process (mcm->vlib_main, wp);
    if (s->procs_waiting_for_join_done)
      _vec_len (s->procs_waiting_for_join_done) = 0;
  }
}

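/*
 * Mastership negotiation while we are (or may become) master: send a
 * master-assert roughly once per second, claim mastership after about
 * three quiet seconds, and step down if told to become a slave.
 */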
static void
this_node_maybe_master (mc_main_t * mcm)
{
  vlib_main_t *vm = mcm->vlib_main;
  mc_msg_master_assert_t *mp;
  uword event_type;
  int timeouts = 0;
  int is_master = mcm->relay_state == MC_RELAY_STATE_MASTER;
  clib_error_t *error;
  f64 now, time_last_master_assert = -1;
  u32 bi;

  while (1)
    {
      if (!mcm->we_can_be_relay_master)
	{
	  mcm->relay_state = MC_RELAY_STATE_SLAVE;
	  if (MC_EVENT_LOGGING)
	    {
	      ELOG_TYPE (e, "become slave (config)");
	      ELOG (mcm->elog_main, e, 0);
	    }
	  return;
	}

      now = vlib_time_now (vm);
      if (now >= time_last_master_assert + 1)
	{
	  time_last_master_assert = now;
	  mp = mc_get_vlib_buffer (mcm->vlib_main, sizeof (mp[0]), &bi);

	  mp->peer_id = mcm->transport.our_ack_peer_id;
	  mp->global_sequence = mcm->relay_global_sequence;

	  /*
	   * These messages clog the event log; set MC_EVENT_LOGGING higher
	   * if you want them.
	   */
	  if (MC_EVENT_LOGGING > 1)
	    {
              /* *INDENT-OFF* */
	      ELOG_TYPE_DECLARE (e) =
                {
                  .format = "tx-massert: peer %s global seq %u",
                  .format_args = "T4i4",
                };
              /* *INDENT-ON* */
	      struct
	      {
		u32 peer, global_sequence;
	      } *ed;
	      ed = ELOG_DATA (mcm->elog_main, e);
	      ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64);
	      ed->global_sequence = mp->global_sequence;
	    }

	  mc_byte_swap_msg_master_assert (mp);

	  error =
	    mcm->transport.tx_buffer (mcm->transport.opaque,
				      MC_TRANSPORT_MASTERSHIP, bi);
	  if (error)
	    clib_error_report (error);
	}

      vlib_process_wait_for_event_or_clock (vm, 1.0);
      event_type = vlib_process_get_events (vm, /* no event data */ 0);

      switch (event_type)
	{
	case ~0:
	  if (!is_master && timeouts++ > 2)
	    {
	      mcm->relay_state = MC_RELAY_STATE_MASTER;
	      mcm->relay_master_peer_id =
		mcm->transport.our_ack_peer_id.as_u64;
	      if (MC_EVENT_LOGGING)
		{
		  ELOG_TYPE (e, "become master (was maybe_master)");
		  ELOG (mcm->elog_main, e, 0);
		}
	      return;
	    }
	  break;

	case MC_RELAY_STATE_SLAVE:
	  /* Log only if this is an actual state change. */
	  if (MC_EVENT_LOGGING && mcm->relay_state != MC_RELAY_STATE_SLAVE)
	    {
	      ELOG_TYPE (e, "become slave (was maybe_master)");
	      ELOG (mcm->elog_main, e, 0);
	    }
	  mcm->relay_state = MC_RELAY_STATE_SLAVE;
	  return;
	}
    }
}

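/*
 * Slave side of mastership negotiation: stay a slave as long as the
 * master keeps asserting itself; after roughly three seconds of silence,
 * go back to negotiating mastership.
 */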
static void
this_node_slave (mc_main_t * mcm)
{
  vlib_main_t *vm = mcm->vlib_main;
  uword event_type;
  int timeouts = 0;

  if (MC_EVENT_LOGGING)
    {
      ELOG_TYPE (e, "become slave");
      ELOG (mcm->elog_main, e, 0);
    }

  while (1)
    {
      vlib_process_wait_for_event_or_clock (vm, 1.0);
      event_type = vlib_process_get_events (vm, /* no event data */ 0);

      switch (event_type)
	{
	case ~0:
	  if (timeouts++ > 2)
	    {
	      mcm->relay_state = MC_RELAY_STATE_NEGOTIATE;
	      mcm->relay_master_peer_id = ~0ULL;
	      if (MC_EVENT_LOGGING)
		{
		  ELOG_TYPE (e, "timeouts; negotiate mastership");
		  ELOG (mcm->elog_main, e, 0);
		}
	      return;
	    }
	  break;

	case MC_RELAY_STATE_SLAVE:
	  mcm->relay_state = MC_RELAY_STATE_SLAVE;
	  timeouts = 0;
	  break;
	}
    }
}

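/* Top-level mastership state machine; runs as a vlib process. */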
static uword
mc_mastership_process (vlib_main_t * vm,
		       vlib_node_runtime_t * node, vlib_frame_t * f)
{
  mc_main_t *mcm = mc_node_get_main (node);

  while (1)
    {
      switch (mcm->relay_state)
	{
	case MC_RELAY_STATE_NEGOTIATE:
	case MC_RELAY_STATE_MASTER:
	  this_node_maybe_master (mcm);
	  break;

	case MC_RELAY_STATE_SLAVE:
	  this_node_slave (mcm);
	  break;
	}
    }
  return 0;			/* not likely */
}

void
mc_enable_disable_mastership (mc_main_t * mcm, int we_can_be_master)
{
  if (we_can_be_master != mcm->we_can_be_relay_master)
    {
      mcm->we_can_be_relay_master = we_can_be_master;
      vlib_process_signal_event (mcm->vlib_main,
				 mcm->mastership_process,
				 MC_RELAY_STATE_NEGOTIATE, 0);
    }
}

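/*
 * Handle a peer's master-assert: possibly yield mastership, pick up a
 * newer global sequence, and record when we last heard from this peer.
 */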
void
mc_msg_master_assert_handler (mc_main_t * mcm, mc_msg_master_assert_t * mp,
			      u32 buffer_index)
{
  mc_peer_id_t his_peer_id, our_peer_id;
  i32 seq_cmp_result;
  u8 signal_slave = 0;
  u8 update_global_sequence = 0;

  mc_byte_swap_msg_master_assert (mp);

  his_peer_id = mp->peer_id;
  our_peer_id = mcm->transport.our_ack_peer_id;

  /* compare the incoming global sequence with ours */
  seq_cmp_result = mc_seq_cmp (mp->global_sequence,
			       mcm->relay_global_sequence);

  /* If the sender has a lower peer id and its sequence is at least as
     current as ours, defer to it and become a slave.  Otherwise keep our
     current mastership state. */
  if (mc_peer_id_compare (his_peer_id, our_peer_id) < 0
      && seq_cmp_result >= 0)
    {
      vlib_process_signal_event (mcm->vlib_main,
				 mcm->mastership_process,
				 MC_RELAY_STATE_SLAVE, 0);
      signal_slave = 1;
    }

  /* Update our global sequence. */
  if (seq_cmp_result > 0)
    {
      mcm->relay_global_sequence = mp->global_sequence;
      update_global_sequence = 1;
    }

  {
    uword *q = mhash_get (&mcm->mastership_peer_index_by_id, &his_peer_id);
    mc_mastership_peer_t *p;

    if (q)
      p = vec_elt_at_index (mcm->mastership_peers, q[0]);
    else
      {
	vec_add2 (mcm->mastership_peers, p, 1);
	p->peer_id = his_peer_id;
	mhash_set (&mcm->mastership_peer_index_by_id, &p->peer_id,
		   p - mcm->mastership_peers,
		   /* old_value */ 0);
      }
    p->time_last_master_assert_received = vlib_time_now (mcm->vlib_main);
  }

  /*
   * These messages clog the event log; set MC_EVENT_LOGGING higher
   * if you want them.
   */
  if (MC_EVENT_LOGGING > 1)
    {
      /* *INDENT-OFF* */
      ELOG_TYPE_DECLARE (e) =
        {
          .format = "rx-massert: peer %s global seq %u upd %d slave %d",
          .format_args = "T4i4i1i1",
        };
      /* *INDENT-ON* */

      struct
      {
	u32 peer;
	u32 global_sequence;
	u8 update_sequence;
	u8 slave;
      } *ed;
      ed = ELOG_DATA (mcm->elog_main, e);
      ed->peer = elog_id_for_peer_id (mcm, his_peer_id.as_u64);
      ed->global_sequence = mp->global_sequence;
      ed->update_sequence = update_global_sequence;
      ed->slave = signal_slave;
    }
}

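/*
 * Build the table of globally registered messages from the registration
 * chain on vm->mc_msg_registrations and assign each a global index.
 */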
static void
mc_serialize_init (mc_main_t * mcm)
{
  mc_serialize_msg_t *m;
  vlib_main_t *vm = vlib_get_main ();

  mcm->global_msg_index_by_name
    = hash_create_string ( /* elts */ 0, sizeof (uword));

  m = vm->mc_msg_registrations;

  while (m)
    {
      m->global_index = vec_len (mcm->global_msgs);
      hash_set_mem (mcm->global_msg_index_by_name, m->name, m->global_index);
      vec_add1 (mcm->global_msgs, m);
      m = m->next_registration;
    }
}

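/*
 * Serialize one message for transmission on a stream. Messages are
 * identified by a small stream-local index; the first time a message is
 * sent (or when MSG_ID_DEBUG is set) its name is serialized as well so
 * the receiver can bind the index. The buffer is flushed immediately
 * when message packing is not requested, when this is a first-time send,
 * or when the next message might not fit.
 */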
clib_error_t *
mc_serialize_va (mc_main_t * mc,
		 u32 stream_index,
		 u32 multiple_messages_per_vlib_buffer,
		 mc_serialize_msg_t * msg, va_list * va)
{
  mc_stream_t *s;
  clib_error_t *error;
  serialize_main_t *m = &mc->serialize_mains[VLIB_TX];
  vlib_serialize_buffer_main_t *sbm = &mc->serialize_buffer_mains[VLIB_TX];
  u32 bi, n_before, n_after, n_total, n_this_msg;
  u32 si, gi;

  if (!sbm->vlib_main)
    {
      sbm->tx.max_n_data_bytes_per_chain = 4096;
      sbm->tx.free_list_index = VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX;
    }

  if (sbm->first_buffer == 0)
    serialize_open_vlib_buffer (m, mc->vlib_main, sbm);

  n_before = serialize_vlib_buffer_n_bytes (m);

  s = mc_stream_by_index (mc, stream_index);
  gi = msg->global_index;
  ASSERT (msg == vec_elt (mc->global_msgs, gi));

  si = ~0;
  if (gi < vec_len (s->stream_msg_index_by_global_index))
    si = s->stream_msg_index_by_global_index[gi];

  serialize_likely_small_unsigned_integer (m, si);

  /* The first time a message is sent, also serialize its name so the
     receiver can identify it. */
  if (si == ~0 || MSG_ID_DEBUG)
    serialize_cstring (m, msg->name);

  if (MSG_ID_DEBUG && MC_EVENT_LOGGING > 0)
    {
      /* *INDENT-OFF* */
      ELOG_TYPE_DECLARE (e) =
        {
          .format = "serialize-msg: %s index %d",
          .format_args = "T4i4",
        };
      /* *INDENT-ON* */
      struct
      {
	u32 c[2];
      } *ed;
      ed = ELOG_DATA (mc->elog_main, e);
      ed->c[0] = elog_id_for_msg_name (mc, msg->name);
      ed->c[1] = si;
    }

  error = va_serialize (m, va);

  n_after = serialize_vlib_buffer_n_bytes (m);
  n_this_msg = n_after - n_before;
  n_total = n_after + sizeof (mc_msg_user_request_t);

  /* When tracking the maximum serialized size, ignore the first send,
     which also carries the message name. */
  if (si != ~0)
    msg->max_n_bytes_serialized =
      clib_max (msg->max_n_bytes_serialized, n_this_msg);

  if (!multiple_messages_per_vlib_buffer
      || si == ~0
      || n_total + msg->max_n_bytes_serialized >
      mc->transport.max_packet_size)
    {
      bi = serialize_close_vlib_buffer (m);
      sbm->first_buffer = 0;
      if (!error)
	mc_stream_send (mc, stream_index, bi);
      else if (bi != ~0)
	vlib_buffer_free_one (mc->vlib_main, bi);
    }

  return error;
}

clib_error_t *
mc_serialize_internal (mc_main_t * mc,
		       u32 stream_index,
		       u32 multiple_messages_per_vlib_buffer,
		       mc_serialize_msg_t * msg, ...)
{
  vlib_main_t *vm = mc->vlib_main;
  va_list va;
  clib_error_t *error;

  if (stream_index == ~0)
    {
      if (vm->mc_main && vm->mc_stream_index == ~0)
	vlib_current_process_wait_for_one_time_event_vector
	  (vm, &vm->procs_waiting_for_mc_stream_join);
      stream_index = vm->mc_stream_index;
    }

  va_start (va, msg);
  error = mc_serialize_va (mc, stream_index,
			   multiple_messages_per_vlib_buffer, msg, &va);
  va_end (va);
  return error;
}

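/*
 * Unserialize a single message. Messages received by name have their
 * global index looked up and bound to a stream-local index so that
 * subsequent copies can be decoded by index alone. Returns non-zero if
 * the message was recognized and decoded.
 */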
uword
mc_unserialize_message (mc_main_t * mcm,
			mc_stream_t * s, serialize_main_t * m)
{
  mc_serialize_stream_msg_t *sm;
  u32 gi, si;

  si = unserialize_likely_small_unsigned_integer (m);

  if (!(si == ~0 || MSG_ID_DEBUG))
    {
      sm = vec_elt_at_index (s->stream_msgs, si);
      gi = sm->global_index;
    }
  else
    {
      char *name;

      unserialize_cstring (m, &name);

      if (MSG_ID_DEBUG && MC_EVENT_LOGGING > 0)
	{
          /* *INDENT-OFF* */
	  ELOG_TYPE_DECLARE (e) =
            {
              .format = "unserialize-msg: %s rx index %d",
              .format_args = "T4i4",
            };
          /* *INDENT-ON* */
	  struct
	  {
	    u32 c[2];
	  } *ed;
	  ed = ELOG_DATA (mcm->elog_main, e);
	  ed->c[0] = elog_id_for_msg_name (mcm, name);
	  ed->c[1] = si;
	}

      {
	uword *p = hash_get_mem (mcm->global_msg_index_by_name, name);
	gi = p ? p[0] : ~0;
      }

      /* Unknown message? */
      if (gi == ~0)
	{
	  vec_free (name);
	  goto done;
	}

      vec_validate_init_empty (s->stream_msg_index_by_global_index, gi, ~0);
      si = s->stream_msg_index_by_global_index[gi];

      /* Stream local index unknown?  Create it. */
      if (si == ~0)
	{
	  vec_add2 (s->stream_msgs, sm, 1);

	  si = sm - s->stream_msgs;
	  sm->global_index = gi;
	  s->stream_msg_index_by_global_index[gi] = si;

	  if (MC_EVENT_LOGGING > 0)
	    {
              /* *INDENT-OFF* */
	      ELOG_TYPE_DECLARE (e) =
                {
                  .format = "msg-bind: stream %d %s to index %d",
                  .format_args = "i4T4i4",
                };
              /* *INDENT-ON* */
	      struct
	      {
		u32 c[3];
	      } *ed;
	      ed = ELOG_DATA (mcm->elog_main, e);
	      ed->c[0] = s->index;
	      ed->c[1] = elog_id_for_msg_name (mcm, name);
	      ed->c[2] = si;
	    }
	}
      else
	{
	  sm = vec_elt_at_index (s->stream_msgs, si);
	  if (gi != sm->global_index && MC_EVENT_LOGGING > 0)
	    {
              /* *INDENT-OFF* */
	      ELOG_TYPE_DECLARE (e) =
                {
                  .format = "msg-id-ERROR: %s index %d expected %d",
                  .format_args = "T4i4i4",
                };
              /* *INDENT-ON* */
	      struct
	      {
		u32 c[3];
	      } *ed;
	      ed = ELOG_DATA (mcm->elog_main, e);
	      ed->c[0] = elog_id_for_msg_name (mcm, name);
	      ed->c[1] = si;
	      ed->c[2] = ~0;
	      if (sm->global_index <
		  vec_len (s->stream_msg_index_by_global_index))
		ed->c[2] =
		  s->stream_msg_index_by_global_index[sm->global_index];
	    }
	}

      vec_free (name);
    }

  if (gi != ~0)
    {
      mc_serialize_msg_t *msg;
      msg = vec_elt (mcm->global_msgs, gi);
      unserialize (m, msg->unserialize, mcm);
    }

done:
  return gi != ~0;
}

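/*
 * Decode one received buffer in the context of the unserialize process:
 * optionally save its contents for offline replay, then unserialize
 * every message it contains.
 */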
void
mc_unserialize_internal (mc_main_t * mcm, u32 stream_and_buffer_index)
{
  vlib_main_t *vm = mcm->vlib_main;
  serialize_main_t *m = &mcm->serialize_mains[VLIB_RX];
  vlib_serialize_buffer_main_t *sbm = &mcm->serialize_buffer_mains[VLIB_RX];
  mc_stream_and_buffer_t *sb;
  mc_stream_t *stream;
  u32 buffer_index;

  sb =
    pool_elt_at_index (mcm->mc_unserialize_stream_and_buffers,
		       stream_and_buffer_index);
  buffer_index = sb->buffer_index;
  stream = vec_elt_at_index (mcm->stream_vector, sb->stream_index);
  pool_put (mcm->mc_unserialize_stream_and_buffers, sb);

  if (stream->config.save_snapshot)
    {
      u32 n_bytes = vlib_buffer_index_length_in_chain (vm, buffer_index);
      static u8 *contents;
      vec_reset_length (contents);
      vec_validate (contents, n_bytes - 1);
      vlib_buffer_contents (vm, buffer_index, contents);
      stream->config.save_snapshot (mcm, /* is_catchup */ 0, contents,
				    n_bytes);
    }

  ASSERT (vlib_in_process_context (vm));

  unserialize_open_vlib_buffer (m, vm, sbm);

  clib_fifo_add1 (sbm->rx.buffer_fifo, buffer_index);

  while (unserialize_vlib_buffer_n_bytes (m) > 0)
    mc_unserialize_message (mcm, stream, m);

  /* Frees buffer. */
  unserialize_close_vlib_buffer (m);
}

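/* Queue a received buffer for decoding by the unserialize process. */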
void
mc_unserialize (mc_main_t * mcm, mc_stream_t * s, u32 buffer_index)
{
  vlib_main_t *vm = mcm->vlib_main;
  mc_stream_and_buffer_t *sb;
  pool_get (mcm->mc_unserialize_stream_and_buffers, sb);
  sb->stream_index = s->index;
  sb->buffer_index = buffer_index;
  vlib_process_signal_event (vm, mcm->unserialize_process,
			     EVENT_MC_UNSERIALIZE_BUFFER,
			     sb - mcm->mc_unserialize_stream_and_buffers);
}

static uword
mc_unserialize_process (vlib_main_t * vm,
			vlib_node_runtime_t * node, vlib_frame_t * f)
{
  mc_main_t *mcm = mc_node_get_main (node);
  uword event_type, *event_data = 0;
  int i;

  while (1)
    {
      if (event_data)
	_vec_len (event_data) = 0;

      vlib_process_wait_for_event (vm);
      event_type = vlib_process_get_events (vm, &event_data);
      switch (event_type)
	{
	case EVENT_MC_UNSERIALIZE_BUFFER:
	  for (i = 0; i < vec_len (event_data); i++)
	    mc_unserialize_internal (mcm, event_data[i]);
	  break;

	case EVENT_MC_UNSERIALIZE_CATCHUP:
	  for (i = 0; i < vec_len (event_data); i++)
	    {
	      u8 *mp = uword_to_pointer (event_data[i], u8 *);
	      perform_catchup (mcm, (void *) mp);
	      vec_free (mp);
	    }
	  break;

	default:
	  break;
	}
    }

  return 0;			/* not likely */
}

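/*
 * Serialize the stream names and, per stream, the global names of the
 * messages bound to stream-local indices.
 */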
void
serialize_mc_main (serialize_main_t * m, va_list * va)
{
  mc_main_t *mcm = va_arg (*va, mc_main_t *);
  mc_stream_t *s;
  mc_serialize_stream_msg_t *sm;
  mc_serialize_msg_t *msg;

  serialize_integer (m, vec_len (mcm->stream_vector), sizeof (u32));
  vec_foreach (s, mcm->stream_vector)
  {
    /* Stream name. */
    serialize_cstring (m, s->config.name);

    /* Serialize global names for all sent messages. */
    serialize_integer (m, vec_len (s->stream_msgs), sizeof (u32));
    vec_foreach (sm, s->stream_msgs)
    {
      msg = vec_elt (mcm->global_msgs, sm->global_index);
      serialize_cstring (m, msg->name);
    }
  }
}

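/*
 * Rebuild streams and their message bindings from serialize_mc_main
 * output: create any streams we do not already know about, then
 * re-establish the stream-local to global message index mapping by name.
 */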
void
unserialize_mc_main (serialize_main_t * m, va_list * va)
{
  mc_main_t *mcm = va_arg (*va, mc_main_t *);
  u32 i, n_streams, n_stream_msgs;
  char *name;
  mc_stream_t *s;
  mc_serialize_stream_msg_t *sm;

  unserialize_integer (m, &n_streams, sizeof (u32));
  for (i = 0; i < n_streams; i++)
    {
      unserialize_cstring (m, &name);
      if (i != MC_STREAM_INDEX_INTERNAL && !mc_stream_by_name (mcm, name))
	{
	  vec_validate (mcm->stream_vector, i);
	  s = vec_elt_at_index (mcm->stream_vector, i);
	  mc_stream_init (s);
	  s->index = s - mcm->stream_vector;
	  s->config.name = name;
	  s->state = MC_STREAM_STATE_name_known;
	  hash_set_mem (mcm->stream_index_by_name, s->config.name, s->index);
	}
      else
	vec_free (name);

      s = vec_elt_at_index (mcm->stream_vector, i);

      vec_free (s->stream_msgs);
      vec_free (s->stream_msg_index_by_global_index);

      unserialize_integer (m, &n_stream_msgs, sizeof (u32));
      vec_resize (s->stream_msgs, n_stream_msgs);
      vec_foreach (sm, s->stream_msgs)
      {
	uword *p;
	u32 si, gi;

	unserialize_cstring (m, &name);
	p = hash_get (mcm->global_msg_index_by_name, name);
	gi = p ? p[0] : ~0;
	si = sm - s->stream_msgs;

	if (MC_EVENT_LOGGING > 0)
	  {
            /* *INDENT-OFF* */
	    ELOG_TYPE_DECLARE (e) =
              {
                .format = "catchup-bind: %s to %d global index %d stream %d",
                .format_args = "T4i4i4i4",
              };
            /* *INDENT-ON* */

	    struct
	    {
	      u32 c[4];
	    } *ed;
	    ed = ELOG_DATA (mcm->elog_main, e);
	    ed->c[0] = elog_id_for_msg_name (mcm, name);
	    ed->c[1] = si;
	    ed->c[2] = gi;
	    ed->c[3] = s->index;
	  }

	vec_free (name);

	sm->global_index = gi;
	if (gi != ~0)
	  {
	    vec_validate_init_empty (s->stream_msg_index_by_global_index,
				     gi, ~0);
	    s->stream_msg_index_by_global_index[gi] = si;
	  }
      }
    }
}

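/*
 * One-time initialization: register the mastership, join-ager, retry,
 * catchup and unserialize processes and set up the lookup tables they
 * share.
 */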
void
mc_main_init (mc_main_t * mcm, char *tag)
{
  vlib_main_t *vm = vlib_get_main ();

  mcm->vlib_main = vm;
  mcm->elog_main = &vm->elog_main;

  mcm->relay_master_peer_id = ~0ULL;
  mcm->relay_state = MC_RELAY_STATE_NEGOTIATE;

  mcm->stream_index_by_name
    = hash_create_string ( /* elts */ 0, /* value size */ sizeof (uword));

  {
    vlib_node_registration_t r;

    memset (&r, 0, sizeof (r));

    r.type = VLIB_NODE_TYPE_PROCESS;

    /* Each process node's runtime data holds a copy of the mc_main_t
       pointer. */
    r.runtime_data = &mcm;
    r.runtime_data_bytes = sizeof (mcm);

    r.name = (char *) format (0, "mc-mastership-%s", tag);
    r.function = mc_mastership_process;
    mcm->mastership_process = vlib_register_node (vm, &r);

    r.name = (char *) format (0, "mc-join-ager-%s", tag);
    r.function = mc_join_ager_process;
    mcm->join_ager_process = vlib_register_node (vm, &r);

    r.name = (char *) format (0, "mc-retry-%s", tag);
    r.function = mc_retry_process;
    mcm->retry_process = vlib_register_node (vm, &r);

    r.name = (char *) format (0, "mc-catchup-%s", tag);
    r.function = mc_catchup_process;
    mcm->catchup_process = vlib_register_node (vm, &r);

    r.name = (char *) format (0, "mc-unserialize-%s", tag);
    r.function = mc_unserialize_process;
    mcm->unserialize_process = vlib_register_node (vm, &r);
  }

  if (MC_EVENT_LOGGING > 0)
    mhash_init (&mcm->elog_id_by_peer_id, sizeof (uword),
		sizeof (mc_peer_id_t));

  mhash_init (&mcm->mastership_peer_index_by_id, sizeof (uword),
	      sizeof (mc_peer_id_t));
  mc_serialize_init (mcm);
}

static u8 *
format_mc_relay_state (u8 * s, va_list * args)
{
  mc_relay_state_t state = va_arg (*args, mc_relay_state_t);
  char *t = 0;
  switch (state)
    {
    case MC_RELAY_STATE_NEGOTIATE:
      t = "negotiate";
      break;
    case MC_RELAY_STATE_MASTER:
      t = "master";
      break;
    case MC_RELAY_STATE_SLAVE:
      t = "slave";
      break;
    default:
      return format (s, "unknown 0x%x", state);
    }

  return format (s, "%s", t);
}

static u8 *
format_mc_stream_state (u8 * s, va_list * args)
{
  mc_stream_state_t state = va_arg (*args, mc_stream_state_t);
  char *t = 0;
  switch (state)
    {
#define _(f) case MC_STREAM_STATE_##f: t = #f; break;
      foreach_mc_stream_state
#undef _
    default:
      return format (s, "unknown 0x%x", state);
    }

  return format (s, "%s", t);
}

static int
mc_peer_comp (void *a1, void *a2)
{
  mc_stream_peer_t *p1 = a1;
  mc_stream_peer_t *p2 = a2;

  return mc_peer_id_compare (p1->id, p2->id);
}

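/*
 * Human-readable summary of MC state: relay state, mastership peers,
 * and per-stream counters and peers.
 */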
u8 *
format_mc_main (u8 * s, va_list * args)
{
  mc_main_t *mcm = va_arg (*args, mc_main_t *);
  mc_stream_t *t;
  mc_stream_peer_t *p, *ps;
  u32 indent = format_get_indent (s);

  s = format (s, "MC state %U, %d streams joined, global sequence 0x%x",
	      format_mc_relay_state, mcm->relay_state,
	      vec_len (mcm->stream_vector), mcm->relay_global_sequence);

  {
    mc_mastership_peer_t *mp;
    f64 now = vlib_time_now (mcm->vlib_main);
    s = format (s, "\n%UMost recent mastership peers:",
		format_white_space, indent + 2);
    vec_foreach (mp, mcm->mastership_peers)
    {
      s = format (s, "\n%U%-30U%.4e",
		  format_white_space, indent + 4,
		  mcm->transport.format_peer_id, mp->peer_id,
		  now - mp->time_last_master_assert_received);
    }
  }

  vec_foreach (t, mcm->stream_vector)
  {
    s = format (s, "\n%Ustream `%s' index %d",
		format_white_space, indent + 2, t->config.name, t->index);

    s = format (s, "\n%Ustate %U",
		format_white_space, indent + 4,
		format_mc_stream_state, t->state);

    s =
      format (s,
	      "\n%Uretries: interval %.0f sec, limit %d, pool elts %d, %Ld sent",
	      format_white_space, indent + 4, t->config.retry_interval,
	      t->config.retry_limit, pool_elts (t->retry_pool),
	      t->stats.n_retries - t->stats_last_clear.n_retries);

    s = format (s, "\n%U%Ld/%Ld user requests sent/received",
		format_white_space, indent + 4,
		t->user_requests_sent, t->user_requests_received);

    s = format (s, "\n%U%d peers, local/global sequence 0x%x/0x%x",
		format_white_space, indent + 4,
		pool_elts (t->peers),
		t->our_local_sequence, t->last_global_sequence_processed);

    ps = 0;
    /* *INDENT-OFF* */
    pool_foreach (p, t->peers,
    ({
      if (clib_bitmap_get (t->all_peer_bitmap, p - t->peers))
        vec_add1 (ps, p[0]);
    }));
    /* *INDENT-ON* */
    vec_sort_with_function (ps, mc_peer_comp);
    s = format (s, "\n%U%=30s%10s%16s%16s",
		format_white_space, indent + 6,
		"Peer", "Last seq", "Retries", "Future");

    vec_foreach (p, ps)
    {
      s = format (s, "\n%U%-30U0x%08x%16Ld%16Ld%s",
		  format_white_space, indent + 6,
		  mcm->transport.format_peer_id, p->id.as_u64,
		  p->last_sequence_received,
		  p->stats.n_msgs_from_past -
		  p->stats_last_clear.n_msgs_from_past,
		  p->stats.n_msgs_from_future -
		  p->stats_last_clear.n_msgs_from_future,
		  (mcm->transport.our_ack_peer_id.as_u64 ==
		   p->id.as_u64 ? " (self)" : ""));
    }
    vec_free (ps);
  }

  return s;
}

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */