aboutsummaryrefslogtreecommitdiffstats
path: root/src/vlib/mc.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/vlib/mc.c')
-rw-r--r--src/vlib/mc.c2609
1 files changed, 2609 insertions, 0 deletions
diff --git a/src/vlib/mc.c b/src/vlib/mc.c
new file mode 100644
index 00000000000..8fde091389e
--- /dev/null
+++ b/src/vlib/mc.c
@@ -0,0 +1,2609 @@
+/*
+ * mc.c: vlib reliable sequenced multicast distributed applications
+ *
+ * Copyright (c) 2010 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+
+/*
+ * 1 to enable msg id training wheels, which are useful for tracking
+ * down catchup and/or partitioned network problems
+ */
+#define MSG_ID_DEBUG 0
+
+static format_function_t format_mc_stream_state;
+
+static u32
+elog_id_for_peer_id (mc_main_t * m, u64 peer_id)
+{
+ uword *p, r;
+ mhash_t *h = &m->elog_id_by_peer_id;
+
+ if (!m->elog_id_by_peer_id.hash)
+ mhash_init (h, sizeof (uword), sizeof (mc_peer_id_t));
+
+ p = mhash_get (h, &peer_id);
+ if (p)
+ return p[0];
+ r = elog_string (m->elog_main, "%U", m->transport.format_peer_id, peer_id);
+ mhash_set (h, &peer_id, r, /* old_value */ 0);
+ return r;
+}
+
+static u32
+elog_id_for_msg_name (mc_main_t * m, char *msg_name)
+{
+ uword *p, r;
+ uword *h = m->elog_id_by_msg_name;
+ u8 *name_copy;
+
+ if (!h)
+ h = m->elog_id_by_msg_name = hash_create_string (0, sizeof (uword));
+
+ p = hash_get_mem (h, msg_name);
+ if (p)
+ return p[0];
+ r = elog_string (m->elog_main, "%s", msg_name);
+
+ name_copy = format (0, "%s%c", msg_name, 0);
+
+ hash_set_mem (h, name_copy, r);
+ m->elog_id_by_msg_name = h;
+
+ return r;
+}
+
+static void
+elog_tx_msg (mc_main_t * m, u32 stream_id, u32 local_sequence,
+ u32 retry_count)
+{
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "tx-msg: stream %d local seq %d attempt %d",
+ .format_args = "i4i4i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 stream_id, local_sequence, retry_count;
+ } *ed;
+ ed = ELOG_DATA (m->elog_main, e);
+ ed->stream_id = stream_id;
+ ed->local_sequence = local_sequence;
+ ed->retry_count = retry_count;
+ }
+}
+
+/*
+ * seq_cmp
+ * correctly compare two unsigned sequence numbers.
+ * This function works so long as x and y are within 2**(n-1) of each
+ * other, where n = bits(x, y).
+ *
+ * Magic decoder ring:
+ * seq_cmp == 0 => x and y are equal
+ * seq_cmp < 0 => x is "in the past" with respect to y
+ * seq_cmp > 0 => x is "in the future" with respect to y
+ */
+always_inline i32
+mc_seq_cmp (u32 x, u32 y)
+{
+ return (i32) x - (i32) y;
+}
+
+void *
+mc_get_vlib_buffer (vlib_main_t * vm, u32 n_bytes, u32 * bi_return)
+{
+ u32 n_alloc, bi;
+ vlib_buffer_t *b;
+
+ n_alloc = vlib_buffer_alloc (vm, &bi, 1);
+ ASSERT (n_alloc == 1);
+
+ b = vlib_get_buffer (vm, bi);
+ b->current_length = n_bytes;
+ *bi_return = bi;
+ return (void *) b->data;
+}
+
+static void
+delete_peer_with_index (mc_main_t * mcm, mc_stream_t * s,
+ uword index, int notify_application)
+{
+ mc_stream_peer_t *p = pool_elt_at_index (s->peers, index);
+ ASSERT (p != 0);
+ if (s->config.peer_died && notify_application)
+ s->config.peer_died (mcm, s, p->id);
+
+ s->all_peer_bitmap = clib_bitmap_andnoti (s->all_peer_bitmap, p - s->peers);
+
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "delete peer %s from all_peer_bitmap",
+ .format_args = "T4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 peer;
+ } *ed = 0;
+
+ ed = ELOG_DATA (mcm->elog_main, e);
+ ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64);
+ }
+ /* Do not delete the pool / hash table entries, or we lose sequence number state */
+}
+
+static mc_stream_peer_t *
+get_or_create_peer_with_id (mc_main_t * mcm,
+ mc_stream_t * s, mc_peer_id_t id, int *created)
+{
+ uword *q = mhash_get (&s->peer_index_by_id, &id);
+ mc_stream_peer_t *p;
+
+ if (q)
+ {
+ p = pool_elt_at_index (s->peers, q[0]);
+ goto done;
+ }
+
+ pool_get (s->peers, p);
+ memset (p, 0, sizeof (p[0]));
+ p->id = id;
+ p->last_sequence_received = ~0;
+ mhash_set (&s->peer_index_by_id, &id, p - s->peers, /* old_value */ 0);
+ if (created)
+ *created = 1;
+
+done:
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "get_or_create %s peer %s stream %d seq %d",
+ .format_args = "t4T4i4i4",
+ .n_enum_strings = 2,
+ .enum_strings = {
+ "old", "new",
+ },
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 is_new, peer, stream_index, rx_sequence;
+ } *ed = 0;
+
+ ed = ELOG_DATA (mcm->elog_main, e);
+ ed->is_new = q ? 0 : 1;
+ ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64);
+ ed->stream_index = s->index;
+ ed->rx_sequence = p->last_sequence_received;
+ }
+ /* $$$$ Enable or reenable this peer */
+ s->all_peer_bitmap = clib_bitmap_ori (s->all_peer_bitmap, p - s->peers);
+ return p;
+}
+
+static void
+maybe_send_window_open_event (vlib_main_t * vm, mc_stream_t * stream)
+{
+ vlib_one_time_waiting_process_t *p;
+
+ if (pool_elts (stream->retry_pool) >= stream->config.window_size)
+ return;
+
+ vec_foreach (p, stream->procs_waiting_for_open_window)
+ vlib_signal_one_time_waiting_process (vm, p);
+
+ if (stream->procs_waiting_for_open_window)
+ _vec_len (stream->procs_waiting_for_open_window) = 0;
+}
+
+static void
+mc_retry_free (mc_main_t * mcm, mc_stream_t * s, mc_retry_t * r)
+{
+ mc_retry_t record, *retp;
+
+ if (r->unacked_by_peer_bitmap)
+ _vec_len (r->unacked_by_peer_bitmap) = 0;
+
+ if (clib_fifo_elts (s->retired_fifo) >= 2 * s->config.window_size)
+ {
+ clib_fifo_sub1 (s->retired_fifo, record);
+ vlib_buffer_free_one (mcm->vlib_main, record.buffer_index);
+ }
+
+ clib_fifo_add2 (s->retired_fifo, retp);
+
+ retp->buffer_index = r->buffer_index;
+ retp->local_sequence = r->local_sequence;
+
+ r->buffer_index = ~0; /* poison buffer index in this retry */
+}
+
+static void
+mc_resend_retired (mc_main_t * mcm, mc_stream_t * s, u32 local_sequence)
+{
+ mc_retry_t *retry;
+
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "resend-retired: search for local seq %d",
+ .format_args = "i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 local_sequence;
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, e);
+ ed->local_sequence = local_sequence;
+ }
+
+ /* *INDENT-OFF* */
+ clib_fifo_foreach (retry, s->retired_fifo,
+ ({
+ if (retry->local_sequence == local_sequence)
+ {
+ elog_tx_msg (mcm, s->index, retry-> local_sequence, -13);
+ mcm->transport.tx_buffer (mcm->transport.opaque,
+ MC_TRANSPORT_USER_REQUEST_TO_RELAY,
+ retry->buffer_index);
+ return;
+ }
+ }));
+ /* *INDENT-ON* */
+
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "resend-retired: FAILED search for local seq %d",
+ .format_args = "i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 local_sequence;
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, e);
+ ed->local_sequence = local_sequence;
+ }
+}
+
+static uword *
+delete_retry_fifo_elt (mc_main_t * mcm,
+ mc_stream_t * stream,
+ mc_retry_t * r, uword * dead_peer_bitmap)
+{
+ mc_stream_peer_t *p;
+
+ /* *INDENT-OFF* */
+ pool_foreach (p, stream->peers, ({
+ uword pi = p - stream->peers;
+ uword is_alive = 0 == clib_bitmap_get (r->unacked_by_peer_bitmap, pi);
+
+ if (! is_alive)
+ dead_peer_bitmap = clib_bitmap_ori (dead_peer_bitmap, pi);
+
+ if (MC_EVENT_LOGGING > 0)
+ {
+ ELOG_TYPE_DECLARE (e) = {
+ .format = "delete_retry_fifo_elt: peer %s is %s",
+ .format_args = "T4t4",
+ .n_enum_strings = 2,
+ .enum_strings = { "alive", "dead", },
+ };
+ struct { u32 peer, is_alive; } * ed;
+ ed = ELOG_DATA (mcm->elog_main, e);
+ ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64);
+ ed->is_alive = is_alive;
+ }
+ }));
+ /* *INDENT-ON* */
+
+ hash_unset (stream->retry_index_by_local_sequence, r->local_sequence);
+ mc_retry_free (mcm, stream, r);
+
+ return dead_peer_bitmap;
+}
+
+always_inline mc_retry_t *
+prev_retry (mc_stream_t * s, mc_retry_t * r)
+{
+ return (r->prev_index != ~0
+ ? pool_elt_at_index (s->retry_pool, r->prev_index) : 0);
+}
+
+always_inline mc_retry_t *
+next_retry (mc_stream_t * s, mc_retry_t * r)
+{
+ return (r->next_index != ~0
+ ? pool_elt_at_index (s->retry_pool, r->next_index) : 0);
+}
+
+always_inline void
+remove_retry_from_pool (mc_stream_t * s, mc_retry_t * r)
+{
+ mc_retry_t *p = prev_retry (s, r);
+ mc_retry_t *n = next_retry (s, r);
+
+ if (p)
+ p->next_index = r->next_index;
+ else
+ s->retry_head_index = r->next_index;
+ if (n)
+ n->prev_index = r->prev_index;
+ else
+ s->retry_tail_index = r->prev_index;
+
+ pool_put_index (s->retry_pool, r - s->retry_pool);
+}
+
+static void
+check_retry (mc_main_t * mcm, mc_stream_t * s)
+{
+ mc_retry_t *r;
+ vlib_main_t *vm = mcm->vlib_main;
+ f64 now = vlib_time_now (vm);
+ uword *dead_peer_bitmap = 0;
+ u32 ri, ri_next;
+
+ for (ri = s->retry_head_index; ri != ~0; ri = ri_next)
+ {
+ r = pool_elt_at_index (s->retry_pool, ri);
+ ri_next = r->next_index;
+
+ if (now < r->sent_at + s->config.retry_interval)
+ continue;
+
+ r->n_retries += 1;
+ if (r->n_retries > s->config.retry_limit)
+ {
+ dead_peer_bitmap =
+ delete_retry_fifo_elt (mcm, s, r, dead_peer_bitmap);
+ remove_retry_from_pool (s, r);
+ }
+ else
+ {
+ if (MC_EVENT_LOGGING > 0)
+ {
+ mc_stream_peer_t *p;
+
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (t) =
+ {
+ .format = "resend local seq %d attempt %d",
+ .format_args = "i4i4",
+ };
+ /* *INDENT-ON* */
+
+ /* *INDENT-OFF* */
+ pool_foreach (p, s->peers, ({
+ if (clib_bitmap_get (r->unacked_by_peer_bitmap, p - s->peers))
+ {
+ ELOG_TYPE_DECLARE (ev) = {
+ .format = "resend: needed by peer %s local seq %d",
+ .format_args = "T4i4",
+ };
+ struct { u32 peer, rx_sequence; } * ed;
+ ed = ELOG_DATA (mcm->elog_main, ev);
+ ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64);
+ ed->rx_sequence = r->local_sequence;
+ }
+ }));
+ /* *INDENT-ON* */
+
+ struct
+ {
+ u32 sequence;
+ u32 trail;
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, t);
+ ed->sequence = r->local_sequence;
+ ed->trail = r->n_retries;
+ }
+
+ r->sent_at = vlib_time_now (vm);
+ s->stats.n_retries += 1;
+
+ elog_tx_msg (mcm, s->index, r->local_sequence, r->n_retries);
+
+ mcm->transport.tx_buffer
+ (mcm->transport.opaque,
+ MC_TRANSPORT_USER_REQUEST_TO_RELAY, r->buffer_index);
+ }
+ }
+
+ maybe_send_window_open_event (mcm->vlib_main, s);
+
+ /* Delete any dead peers we've found. */
+ if (!clib_bitmap_is_zero (dead_peer_bitmap))
+ {
+ uword i;
+
+ /* *INDENT-OFF* */
+ clib_bitmap_foreach (i, dead_peer_bitmap, ({
+ delete_peer_with_index (mcm, s, i, /* notify_application */ 1);
+
+ /* Delete any references to just deleted peer in retry pool. */
+ pool_foreach (r, s->retry_pool, ({
+ r->unacked_by_peer_bitmap =
+ clib_bitmap_andnoti (r->unacked_by_peer_bitmap, i);
+ }));
+ }));
+/* *INDENT-ON* */
+ clib_bitmap_free (dead_peer_bitmap);
+ }
+}
+
+always_inline mc_main_t *
+mc_node_get_main (vlib_node_runtime_t * node)
+{
+ mc_main_t **p = (void *) node->runtime_data;
+ return p[0];
+}
+
+static uword
+mc_retry_process (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+ mc_main_t *mcm = mc_node_get_main (node);
+ mc_stream_t *s;
+
+ while (1)
+ {
+ vlib_process_suspend (vm, 1.0);
+ vec_foreach (s, mcm->stream_vector)
+ {
+ if (s->state != MC_STREAM_STATE_invalid)
+ check_retry (mcm, s);
+ }
+ }
+ return 0; /* not likely */
+}
+
+static void
+send_join_or_leave_request (mc_main_t * mcm, u32 stream_index, u32 is_join)
+{
+ vlib_main_t *vm = mcm->vlib_main;
+ mc_msg_join_or_leave_request_t *mp;
+ u32 bi;
+
+ mp = mc_get_vlib_buffer (vm, sizeof (mp[0]), &bi);
+ memset (mp, 0, sizeof (*mp));
+ mp->type = MC_MSG_TYPE_join_or_leave_request;
+ mp->peer_id = mcm->transport.our_ack_peer_id;
+ mp->stream_index = stream_index;
+ mp->is_join = is_join;
+
+ mc_byte_swap_msg_join_or_leave_request (mp);
+
+ /*
+ * These msgs are unnumbered, unordered so send on the from-relay
+ * channel.
+ */
+ mcm->transport.tx_buffer (mcm->transport.opaque, MC_TRANSPORT_JOIN, bi);
+}
+
+static uword
+mc_join_ager_process (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+ mc_main_t *mcm = mc_node_get_main (node);
+
+ while (1)
+ {
+ if (mcm->joins_in_progress)
+ {
+ mc_stream_t *s;
+ vlib_one_time_waiting_process_t *p;
+ f64 now = vlib_time_now (vm);
+
+ vec_foreach (s, mcm->stream_vector)
+ {
+ if (s->state != MC_STREAM_STATE_join_in_progress)
+ continue;
+
+ if (now > s->join_timeout)
+ {
+ s->state = MC_STREAM_STATE_ready;
+
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "stream %d join timeout",
+ };
+ /* *INDENT-ON* */
+ ELOG (mcm->elog_main, e, s->index);
+ }
+ /* Make sure that this app instance exists as a stream peer,
+ or we may answer a catchup request with a NULL
+ all_peer_bitmap... */
+ (void) get_or_create_peer_with_id
+ (mcm, s, mcm->transport.our_ack_peer_id, /* created */ 0);
+
+ vec_foreach (p, s->procs_waiting_for_join_done)
+ vlib_signal_one_time_waiting_process (vm, p);
+ if (s->procs_waiting_for_join_done)
+ _vec_len (s->procs_waiting_for_join_done) = 0;
+
+ mcm->joins_in_progress--;
+ ASSERT (mcm->joins_in_progress >= 0);
+ }
+ else
+ {
+ /* Resent join request which may have been lost. */
+ send_join_or_leave_request (mcm, s->index, 1 /* is_join */ );
+
+ /* We're *not* alone, retry for as long as it takes */
+ if (mcm->relay_state == MC_RELAY_STATE_SLAVE)
+ s->join_timeout = vlib_time_now (vm) + 2.0;
+
+
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "stream %d resend join request",
+ };
+ /* *INDENT-ON* */
+ ELOG (mcm->elog_main, e, s->index);
+ }
+ }
+ }
+ }
+
+ vlib_process_suspend (vm, .5);
+ }
+
+ return 0; /* not likely */
+}
+
+static void
+serialize_mc_register_stream_name (serialize_main_t * m, va_list * va)
+{
+ char *name = va_arg (*va, char *);
+ serialize_cstring (m, name);
+}
+
+static void
+elog_stream_name (char *buf, int n_buf_bytes, char *v)
+{
+ clib_memcpy (buf, v, clib_min (n_buf_bytes - 1, vec_len (v)));
+ buf[n_buf_bytes - 1] = 0;
+}
+
+static void
+unserialize_mc_register_stream_name (serialize_main_t * m, va_list * va)
+{
+ mc_main_t *mcm = va_arg (*va, mc_main_t *);
+ char *name;
+ mc_stream_t *s;
+ uword *p;
+
+ unserialize_cstring (m, &name);
+
+ if ((p = hash_get_mem (mcm->stream_index_by_name, name)))
+ {
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "stream index %d already named %s",
+ .format_args = "i4s16",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 stream_index;
+ char name[16];
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, e);
+ ed->stream_index = p[0];
+ elog_stream_name (ed->name, sizeof (ed->name), name);
+ }
+
+ vec_free (name);
+ return;
+ }
+
+ vec_add2 (mcm->stream_vector, s, 1);
+ mc_stream_init (s);
+ s->state = MC_STREAM_STATE_name_known;
+ s->index = s - mcm->stream_vector;
+ s->config.name = name;
+
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "stream index %d named %s",
+ .format_args = "i4s16",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 stream_index;
+ char name[16];
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, e);
+ ed->stream_index = s->index;
+ elog_stream_name (ed->name, sizeof (ed->name), name);
+ }
+
+ hash_set_mem (mcm->stream_index_by_name, name, s->index);
+
+ p = hash_get (mcm->procs_waiting_for_stream_name_by_name, name);
+ if (p)
+ {
+ vlib_one_time_waiting_process_t *wp, **w;
+ w = pool_elt_at_index (mcm->procs_waiting_for_stream_name_pool, p[0]);
+ vec_foreach (wp, w[0])
+ vlib_signal_one_time_waiting_process (mcm->vlib_main, wp);
+ pool_put (mcm->procs_waiting_for_stream_name_pool, w);
+ hash_unset_mem (mcm->procs_waiting_for_stream_name_by_name, name);
+ }
+}
+
+/* *INDENT-OFF* */
+MC_SERIALIZE_MSG (mc_register_stream_name_msg, static) =
+{
+ .name = "mc_register_stream_name",
+ .serialize = serialize_mc_register_stream_name,
+ .unserialize = unserialize_mc_register_stream_name,
+};
+/* *INDENT-ON* */
+
+void
+mc_rx_buffer_unserialize (mc_main_t * mcm,
+ mc_stream_t * stream,
+ mc_peer_id_t peer_id, u32 buffer_index)
+{
+ return mc_unserialize (mcm, stream, buffer_index);
+}
+
+static u8 *
+mc_internal_catchup_snapshot (mc_main_t * mcm,
+ u8 * data_vector,
+ u32 last_global_sequence_processed)
+{
+ serialize_main_t m;
+
+ /* Append serialized data to data vector. */
+ serialize_open_vector (&m, data_vector);
+ m.stream.current_buffer_index = vec_len (data_vector);
+
+ serialize (&m, serialize_mc_main, mcm);
+ return serialize_close_vector (&m);
+}
+
+static void
+mc_internal_catchup (mc_main_t * mcm, u8 * data, u32 n_data_bytes)
+{
+ serialize_main_t s;
+
+ unserialize_open_data (&s, data, n_data_bytes);
+
+ unserialize (&s, unserialize_mc_main, mcm);
+}
+
+/* Overridden from the application layer, not actually used here */
+void mc_stream_join_process_hold (void) __attribute__ ((weak));
+void
+mc_stream_join_process_hold (void)
+{
+}
+
+static u32
+mc_stream_join_helper (mc_main_t * mcm,
+ mc_stream_config_t * config, u32 is_internal)
+{
+ mc_stream_t *s;
+ vlib_main_t *vm = mcm->vlib_main;
+
+ s = 0;
+ if (!is_internal)
+ {
+ uword *p;
+
+ /* Already have a stream with given name? */
+ if ((s = mc_stream_by_name (mcm, config->name)))
+ {
+ /* Already joined and ready? */
+ if (s->state == MC_STREAM_STATE_ready)
+ return s->index;
+ }
+
+ /* First join MC internal stream. */
+ if (!mcm->stream_vector
+ || (mcm->stream_vector[MC_STREAM_INDEX_INTERNAL].state
+ == MC_STREAM_STATE_invalid))
+ {
+ static mc_stream_config_t c = {
+ .name = "mc-internal",
+ .rx_buffer = mc_rx_buffer_unserialize,
+ .catchup = mc_internal_catchup,
+ .catchup_snapshot = mc_internal_catchup_snapshot,
+ };
+
+ c.save_snapshot = config->save_snapshot;
+
+ mc_stream_join_helper (mcm, &c, /* is_internal */ 1);
+ }
+
+ /* If stream is still unknown register this name and wait for
+ sequenced message to name stream. This way all peers agree
+ on stream name to index mappings. */
+ s = mc_stream_by_name (mcm, config->name);
+ if (!s)
+ {
+ vlib_one_time_waiting_process_t *wp, **w;
+ u8 *name_copy = format (0, "%s", config->name);
+
+ mc_serialize_stream (mcm,
+ MC_STREAM_INDEX_INTERNAL,
+ &mc_register_stream_name_msg, config->name);
+
+ /* Wait for this stream to be named. */
+ p =
+ hash_get_mem (mcm->procs_waiting_for_stream_name_by_name,
+ name_copy);
+ if (p)
+ w =
+ pool_elt_at_index (mcm->procs_waiting_for_stream_name_pool,
+ p[0]);
+ else
+ {
+ pool_get (mcm->procs_waiting_for_stream_name_pool, w);
+ if (!mcm->procs_waiting_for_stream_name_by_name)
+ mcm->procs_waiting_for_stream_name_by_name = hash_create_string ( /* elts */ 0, /* value size */
+ sizeof
+ (uword));
+ hash_set_mem (mcm->procs_waiting_for_stream_name_by_name,
+ name_copy,
+ w - mcm->procs_waiting_for_stream_name_pool);
+ w[0] = 0;
+ }
+
+ vec_add2 (w[0], wp, 1);
+ vlib_current_process_wait_for_one_time_event (vm, wp);
+ vec_free (name_copy);
+ }
+
+ /* Name should be known now. */
+ s = mc_stream_by_name (mcm, config->name);
+ ASSERT (s != 0);
+ ASSERT (s->state == MC_STREAM_STATE_name_known);
+ }
+
+ if (!s)
+ {
+ vec_add2 (mcm->stream_vector, s, 1);
+ mc_stream_init (s);
+ s->index = s - mcm->stream_vector;
+ }
+
+ {
+ /* Save name since we could have already used it as hash key. */
+ char *name_save = s->config.name;
+
+ s->config = config[0];
+
+ if (name_save)
+ s->config.name = name_save;
+ }
+
+ if (s->config.window_size == 0)
+ s->config.window_size = 8;
+
+ if (s->config.retry_interval == 0.0)
+ s->config.retry_interval = 1.0;
+
+ /* Sanity. */
+ ASSERT (s->config.retry_interval < 30);
+
+ if (s->config.retry_limit == 0)
+ s->config.retry_limit = 7;
+
+ s->state = MC_STREAM_STATE_join_in_progress;
+ if (!s->peer_index_by_id.hash)
+ mhash_init (&s->peer_index_by_id, sizeof (uword), sizeof (mc_peer_id_t));
+
+ /* If we don't hear from someone in 5 seconds, we're alone */
+ s->join_timeout = vlib_time_now (vm) + 5.0;
+ mcm->joins_in_progress++;
+
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "stream index %d join request %s",
+ .format_args = "i4s16",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 stream_index;
+ char name[16];
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, e);
+ ed->stream_index = s->index;
+ elog_stream_name (ed->name, sizeof (ed->name), s->config.name);
+ }
+
+ send_join_or_leave_request (mcm, s->index, 1 /* join */ );
+
+ vlib_current_process_wait_for_one_time_event_vector
+ (vm, &s->procs_waiting_for_join_done);
+
+ if (MC_EVENT_LOGGING)
+ {
+ ELOG_TYPE (e, "join complete stream %d");
+ ELOG (mcm->elog_main, e, s->index);
+ }
+
+ return s->index;
+}
+
+u32
+mc_stream_join (mc_main_t * mcm, mc_stream_config_t * config)
+{
+ return mc_stream_join_helper (mcm, config, /* is_internal */ 0);
+}
+
+void
+mc_stream_leave (mc_main_t * mcm, u32 stream_index)
+{
+ mc_stream_t *s = mc_stream_by_index (mcm, stream_index);
+
+ if (!s)
+ return;
+
+ if (MC_EVENT_LOGGING)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (t) =
+ {
+ .format = "leave-stream: %d",.format_args = "i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 index;
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, t);
+ ed->index = stream_index;
+ }
+
+ send_join_or_leave_request (mcm, stream_index, 0 /* is_join */ );
+ mc_stream_free (s);
+ s->state = MC_STREAM_STATE_name_known;
+}
+
+void
+mc_msg_join_or_leave_request_handler (mc_main_t * mcm,
+ mc_msg_join_or_leave_request_t * req,
+ u32 buffer_index)
+{
+ mc_stream_t *s;
+ mc_msg_join_reply_t *rep;
+ u32 bi;
+
+ mc_byte_swap_msg_join_or_leave_request (req);
+
+ s = mc_stream_by_index (mcm, req->stream_index);
+ if (!s || s->state != MC_STREAM_STATE_ready)
+ return;
+
+ /* If the peer is joining, create it */
+ if (req->is_join)
+ {
+ mc_stream_t *this_s;
+
+ /* We're not in a position to catch up a peer until all
+ stream joins are complete. */
+ if (0)
+ {
+ /* XXX This is hard to test so we've. */
+ vec_foreach (this_s, mcm->stream_vector)
+ {
+ if (this_s->state != MC_STREAM_STATE_ready
+ && this_s->state != MC_STREAM_STATE_name_known)
+ return;
+ }
+ }
+ else if (mcm->joins_in_progress > 0)
+ return;
+
+ (void) get_or_create_peer_with_id (mcm, s, req->peer_id,
+ /* created */ 0);
+
+ rep = mc_get_vlib_buffer (mcm->vlib_main, sizeof (rep[0]), &bi);
+ memset (rep, 0, sizeof (rep[0]));
+ rep->type = MC_MSG_TYPE_join_reply;
+ rep->stream_index = req->stream_index;
+
+ mc_byte_swap_msg_join_reply (rep);
+ /* These two are already in network byte order... */
+ rep->peer_id = mcm->transport.our_ack_peer_id;
+ rep->catchup_peer_id = mcm->transport.our_catchup_peer_id;
+
+ mcm->transport.tx_buffer (mcm->transport.opaque, MC_TRANSPORT_JOIN, bi);
+ }
+ else
+ {
+ if (s->config.peer_died)
+ s->config.peer_died (mcm, s, req->peer_id);
+ }
+}
+
+void
+mc_msg_join_reply_handler (mc_main_t * mcm,
+ mc_msg_join_reply_t * mp, u32 buffer_index)
+{
+ mc_stream_t *s;
+
+ mc_byte_swap_msg_join_reply (mp);
+
+ s = mc_stream_by_index (mcm, mp->stream_index);
+
+ if (!s || s->state != MC_STREAM_STATE_join_in_progress)
+ return;
+
+ /* Switch to catchup state; next join reply
+ for this stream will be ignored. */
+ s->state = MC_STREAM_STATE_catchup;
+
+ mcm->joins_in_progress--;
+ mcm->transport.catchup_request_fun (mcm->transport.opaque,
+ mp->stream_index, mp->catchup_peer_id);
+}
+
+void
+mc_wait_for_stream_ready (mc_main_t * m, char *stream_name)
+{
+ mc_stream_t *s;
+
+ while (1)
+ {
+ s = mc_stream_by_name (m, stream_name);
+ if (s)
+ break;
+ vlib_process_suspend (m->vlib_main, .1);
+ }
+
+ /* It's OK to send a message in catchup and ready states. */
+ if (s->state == MC_STREAM_STATE_catchup
+ || s->state == MC_STREAM_STATE_ready)
+ return;
+
+ /* Otherwise we are waiting for a join to finish. */
+ vlib_current_process_wait_for_one_time_event_vector
+ (m->vlib_main, &s->procs_waiting_for_join_done);
+}
+
+u32
+mc_stream_send (mc_main_t * mcm, u32 stream_index, u32 buffer_index)
+{
+ mc_stream_t *s = mc_stream_by_index (mcm, stream_index);
+ vlib_main_t *vm = mcm->vlib_main;
+ mc_retry_t *r;
+ mc_msg_user_request_t *mp;
+ vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index);
+ u32 ri;
+
+ if (!s)
+ return 0;
+
+ if (s->state != MC_STREAM_STATE_ready)
+ vlib_current_process_wait_for_one_time_event_vector
+ (vm, &s->procs_waiting_for_join_done);
+
+ while (pool_elts (s->retry_pool) >= s->config.window_size)
+ {
+ vlib_current_process_wait_for_one_time_event_vector
+ (vm, &s->procs_waiting_for_open_window);
+ }
+
+ pool_get (s->retry_pool, r);
+ ri = r - s->retry_pool;
+
+ r->prev_index = s->retry_tail_index;
+ r->next_index = ~0;
+ s->retry_tail_index = ri;
+
+ if (r->prev_index == ~0)
+ s->retry_head_index = ri;
+ else
+ {
+ mc_retry_t *p = pool_elt_at_index (s->retry_pool, r->prev_index);
+ p->next_index = ri;
+ }
+
+ vlib_buffer_advance (b, -sizeof (mp[0]));
+ mp = vlib_buffer_get_current (b);
+
+ mp->peer_id = mcm->transport.our_ack_peer_id;
+ /* mp->transport.global_sequence set by relay agent. */
+ mp->global_sequence = 0xdeadbeef;
+ mp->stream_index = s->index;
+ mp->local_sequence = s->our_local_sequence++;
+ mp->n_data_bytes =
+ vlib_buffer_index_length_in_chain (vm, buffer_index) - sizeof (mp[0]);
+
+ r->buffer_index = buffer_index;
+ r->local_sequence = mp->local_sequence;
+ r->sent_at = vlib_time_now (vm);
+ r->n_retries = 0;
+
+ /* Retry will be freed when all currently known peers have acked. */
+ vec_validate (r->unacked_by_peer_bitmap, vec_len (s->all_peer_bitmap) - 1);
+ vec_copy (r->unacked_by_peer_bitmap, s->all_peer_bitmap);
+
+ hash_set (s->retry_index_by_local_sequence, r->local_sequence,
+ r - s->retry_pool);
+
+ elog_tx_msg (mcm, s->index, mp->local_sequence, r->n_retries);
+
+ mc_byte_swap_msg_user_request (mp);
+
+ mcm->transport.tx_buffer (mcm->transport.opaque,
+ MC_TRANSPORT_USER_REQUEST_TO_RELAY, buffer_index);
+
+ s->user_requests_sent++;
+
+ /* return amount of window remaining */
+ return s->config.window_size - pool_elts (s->retry_pool);
+}
+
+void
+mc_msg_user_request_handler (mc_main_t * mcm, mc_msg_user_request_t * mp,
+ u32 buffer_index)
+{
+ vlib_main_t *vm = mcm->vlib_main;
+ mc_stream_t *s;
+ mc_stream_peer_t *peer;
+ i32 seq_cmp_result;
+ static int once = 0;
+
+ mc_byte_swap_msg_user_request (mp);
+
+ s = mc_stream_by_index (mcm, mp->stream_index);
+
+ /* Not signed up for this stream? Turf-o-matic */
+ if (!s || s->state != MC_STREAM_STATE_ready)
+ {
+ vlib_buffer_free_one (vm, buffer_index);
+ return;
+ }
+
+ /* Find peer, including ourselves. */
+ peer = get_or_create_peer_with_id (mcm, s, mp->peer_id,
+ /* created */ 0);
+
+ seq_cmp_result = mc_seq_cmp (mp->local_sequence,
+ peer->last_sequence_received + 1);
+
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "rx-msg: peer %s stream %d rx seq %d seq_cmp %d",
+ .format_args = "T4i4i4i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 peer, stream_index, rx_sequence;
+ i32 seq_cmp_result;
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, e);
+ ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64);
+ ed->stream_index = mp->stream_index;
+ ed->rx_sequence = mp->local_sequence;
+ ed->seq_cmp_result = seq_cmp_result;
+ }
+
+ if (0 && mp->stream_index == 1 && once == 0)
+ {
+ once = 1;
+ ELOG_TYPE (e, "FAKE lost msg on stream 1");
+ ELOG (mcm->elog_main, e, 0);
+ return;
+ }
+
+ peer->last_sequence_received += seq_cmp_result == 0;
+ s->user_requests_received++;
+
+ if (seq_cmp_result > 0)
+ peer->stats.n_msgs_from_future += 1;
+
+ /* Send ack even if msg from future */
+ if (1)
+ {
+ mc_msg_user_ack_t *rp;
+ u32 bi;
+
+ rp = mc_get_vlib_buffer (vm, sizeof (rp[0]), &bi);
+ rp->peer_id = mcm->transport.our_ack_peer_id;
+ rp->stream_index = s->index;
+ rp->local_sequence = mp->local_sequence;
+ rp->seq_cmp_result = seq_cmp_result;
+
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "tx-ack: stream %d local seq %d",
+ .format_args = "i4i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 stream_index;
+ u32 local_sequence;
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, e);
+ ed->stream_index = rp->stream_index;
+ ed->local_sequence = rp->local_sequence;
+ }
+
+ mc_byte_swap_msg_user_ack (rp);
+
+ mcm->transport.tx_ack (mcm->transport.opaque, mp->peer_id, bi);
+ /* Msg from past? If so, free the buffer... */
+ if (seq_cmp_result < 0)
+ {
+ vlib_buffer_free_one (vm, buffer_index);
+ peer->stats.n_msgs_from_past += 1;
+ }
+ }
+
+ if (seq_cmp_result == 0)
+ {
+ vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index);
+ switch (s->state)
+ {
+ case MC_STREAM_STATE_ready:
+ vlib_buffer_advance (b, sizeof (mp[0]));
+ s->config.rx_buffer (mcm, s, mp->peer_id, buffer_index);
+
+ /* Stream vector can change address via rx callback for mc-internal
+ stream. */
+ s = mc_stream_by_index (mcm, mp->stream_index);
+ ASSERT (s != 0);
+ s->last_global_sequence_processed = mp->global_sequence;
+ break;
+
+ case MC_STREAM_STATE_catchup:
+ clib_fifo_add1 (s->catchup_fifo, buffer_index);
+ break;
+
+ default:
+ clib_warning ("stream in unknown state %U",
+ format_mc_stream_state, s->state);
+ break;
+ }
+ }
+}
+
+void
+mc_msg_user_ack_handler (mc_main_t * mcm, mc_msg_user_ack_t * mp,
+ u32 buffer_index)
+{
+ vlib_main_t *vm = mcm->vlib_main;
+ uword *p;
+ mc_stream_t *s;
+ mc_stream_peer_t *peer;
+ mc_retry_t *r;
+ int peer_created = 0;
+
+ mc_byte_swap_msg_user_ack (mp);
+
+ s = mc_stream_by_index (mcm, mp->stream_index);
+
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (t) =
+ {
+ .format = "rx-ack: local seq %d peer %s seq_cmp_result %d",
+ .format_args = "i4T4i4",
+ };
+ /* *INDENT-ON* */
+
+ struct
+ {
+ u32 local_sequence;
+ u32 peer;
+ i32 seq_cmp_result;
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, t);
+ ed->local_sequence = mp->local_sequence;
+ ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64);
+ ed->seq_cmp_result = mp->seq_cmp_result;
+ }
+
+ /* Unknown stream? */
+ if (!s)
+ return;
+
+ /* Find the peer which just ack'ed. */
+ peer = get_or_create_peer_with_id (mcm, s, mp->peer_id,
+ /* created */ &peer_created);
+
+ /*
+ * Peer reports message from the future. If it's not in the retry
+ * fifo, look for a retired message.
+ */
+ if (mp->seq_cmp_result > 0)
+ {
+ p = hash_get (s->retry_index_by_local_sequence, mp->local_sequence -
+ mp->seq_cmp_result);
+ if (p == 0)
+ mc_resend_retired (mcm, s, mp->local_sequence - mp->seq_cmp_result);
+
+ /* Normal retry should fix it... */
+ return;
+ }
+
+ /*
+ * Pointer to the indicated retry fifo entry.
+ * Worth hashing because we could use a window size of 100 or 1000.
+ */
+ p = hash_get (s->retry_index_by_local_sequence, mp->local_sequence);
+
+ /*
+ * Is this a duplicate ACK, received after we've retired the
+ * fifo entry. This can happen when learning about new
+ * peers.
+ */
+ if (p == 0)
+ {
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (t) =
+ {
+ .format = "ack: for seq %d from peer %s no fifo elt",
+ .format_args = "i4T4",
+ };
+ /* *INDENT-ON* */
+
+ struct
+ {
+ u32 seq;
+ u32 peer;
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, t);
+ ed->seq = mp->local_sequence;
+ ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64);
+ }
+
+ return;
+ }
+
+ r = pool_elt_at_index (s->retry_pool, p[0]);
+
+ /* Make sure that this new peer ACKs our msgs from now on */
+ if (peer_created)
+ {
+ mc_retry_t *later_retry = next_retry (s, r);
+
+ while (later_retry)
+ {
+ later_retry->unacked_by_peer_bitmap =
+ clib_bitmap_ori (later_retry->unacked_by_peer_bitmap,
+ peer - s->peers);
+ later_retry = next_retry (s, later_retry);
+ }
+ }
+
+ ASSERT (mp->local_sequence == r->local_sequence);
+
+ /* If we weren't expecting to hear from this peer */
+ if (!peer_created &&
+ !clib_bitmap_get (r->unacked_by_peer_bitmap, peer - s->peers))
+ {
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (t) =
+ {
+ .format = "dup-ack: for seq %d from peer %s",
+ .format_args = "i4T4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 seq;
+ u32 peer;
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, t);
+ ed->seq = r->local_sequence;
+ ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64);
+ }
+ if (!clib_bitmap_is_zero (r->unacked_by_peer_bitmap))
+ return;
+ }
+
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (t) =
+ {
+ .format = "ack: for seq %d from peer %s",
+ .format_args = "i4T4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 seq;
+ u32 peer;
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, t);
+ ed->seq = mp->local_sequence;
+ ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64);
+ }
+
+ r->unacked_by_peer_bitmap =
+ clib_bitmap_andnoti (r->unacked_by_peer_bitmap, peer - s->peers);
+
+ /* Not all clients have ack'ed */
+ if (!clib_bitmap_is_zero (r->unacked_by_peer_bitmap))
+ {
+ return;
+ }
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (t) =
+ {
+ .format = "ack: retire fifo elt loc seq %d after %d acks",
+ .format_args = "i4i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 seq;
+ u32 npeers;
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, t);
+ ed->seq = r->local_sequence;
+ ed->npeers = pool_elts (s->peers);
+ }
+
+ hash_unset (s->retry_index_by_local_sequence, mp->local_sequence);
+ mc_retry_free (mcm, s, r);
+ remove_retry_from_pool (s, r);
+ maybe_send_window_open_event (vm, s);
+}
+
+#define EVENT_MC_SEND_CATCHUP_DATA 0
+
+static uword
+mc_catchup_process (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+ mc_main_t *mcm = mc_node_get_main (node);
+ uword *event_data = 0;
+ mc_catchup_process_arg_t *args;
+ int i;
+
+ while (1)
+ {
+ if (event_data)
+ _vec_len (event_data) = 0;
+ vlib_process_wait_for_event_with_type (vm, &event_data,
+ EVENT_MC_SEND_CATCHUP_DATA);
+
+ for (i = 0; i < vec_len (event_data); i++)
+ {
+ args = pool_elt_at_index (mcm->catchup_process_args, event_data[i]);
+
+ mcm->transport.catchup_send_fun (mcm->transport.opaque,
+ args->catchup_opaque,
+ args->catchup_snapshot);
+
+ /* Send function will free snapshot data vector. */
+ pool_put (mcm->catchup_process_args, args);
+ }
+ }
+
+ return 0; /* not likely */
+}
+
+static void
+serialize_mc_stream (serialize_main_t * m, va_list * va)
+{
+ mc_stream_t *s = va_arg (*va, mc_stream_t *);
+ mc_stream_peer_t *p;
+
+ serialize_integer (m, pool_elts (s->peers), sizeof (u32));
+ /* *INDENT-OFF* */
+ pool_foreach (p, s->peers, ({
+ u8 * x = serialize_get (m, sizeof (p->id));
+ clib_memcpy (x, p->id.as_u8, sizeof (p->id));
+ serialize_integer (m, p->last_sequence_received,
+ sizeof (p->last_sequence_received));
+ }));
+/* *INDENT-ON* */
+ serialize_bitmap (m, s->all_peer_bitmap);
+}
+
+void
+unserialize_mc_stream (serialize_main_t * m, va_list * va)
+{
+ mc_stream_t *s = va_arg (*va, mc_stream_t *);
+ u32 i, n_peers;
+ mc_stream_peer_t *p;
+
+ unserialize_integer (m, &n_peers, sizeof (u32));
+ mhash_init (&s->peer_index_by_id, sizeof (uword), sizeof (mc_peer_id_t));
+ for (i = 0; i < n_peers; i++)
+ {
+ u8 *x;
+ pool_get (s->peers, p);
+ x = unserialize_get (m, sizeof (p->id));
+ clib_memcpy (p->id.as_u8, x, sizeof (p->id));
+ unserialize_integer (m, &p->last_sequence_received,
+ sizeof (p->last_sequence_received));
+ mhash_set (&s->peer_index_by_id, &p->id, p - s->peers, /* old_value */
+ 0);
+ }
+ s->all_peer_bitmap = unserialize_bitmap (m);
+
+ /* This is really bad. */
+ if (!s->all_peer_bitmap)
+ clib_warning ("BUG: stream %s all_peer_bitmap NULL", s->config.name);
+}
+
+void
+mc_msg_catchup_request_handler (mc_main_t * mcm,
+ mc_msg_catchup_request_t * req,
+ u32 catchup_opaque)
+{
+ vlib_main_t *vm = mcm->vlib_main;
+ mc_stream_t *s;
+ mc_catchup_process_arg_t *args;
+
+ mc_byte_swap_msg_catchup_request (req);
+
+ s = mc_stream_by_index (mcm, req->stream_index);
+ if (!s || s->state != MC_STREAM_STATE_ready)
+ return;
+
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (t) =
+ {
+ .format = "catchup-request: from %s stream %d",
+ .format_args = "T4i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 peer, stream;
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, t);
+ ed->peer = elog_id_for_peer_id (mcm, req->peer_id.as_u64);
+ ed->stream = req->stream_index;
+ }
+
+ /*
+ * The application has to snapshoot its data structures right
+ * here, right now. If we process any messages after
+ * noting the last global sequence we've processed, the client
+ * won't be able to accurately reconstruct our data structures.
+ *
+ * Once the data structures are e.g. vec_dup()'ed, we
+ * send the resulting messages from a separate process, to
+ * make sure that we don't cause a bunch of message retransmissions
+ */
+ pool_get (mcm->catchup_process_args, args);
+
+ args->stream_index = s - mcm->stream_vector;
+ args->catchup_opaque = catchup_opaque;
+ args->catchup_snapshot = 0;
+
+ /* Construct catchup reply and snapshot state for stream to send as
+ catchup reply payload. */
+ {
+ mc_msg_catchup_reply_t *rep;
+ serialize_main_t m;
+
+ vec_resize (args->catchup_snapshot, sizeof (rep[0]));
+
+ rep = (void *) args->catchup_snapshot;
+
+ rep->peer_id = req->peer_id;
+ rep->stream_index = req->stream_index;
+ rep->last_global_sequence_included = s->last_global_sequence_processed;
+
+ /* Setup for serialize to append to catchup snapshot. */
+ serialize_open_vector (&m, args->catchup_snapshot);
+ m.stream.current_buffer_index = vec_len (m.stream.buffer);
+
+ serialize (&m, serialize_mc_stream, s);
+
+ args->catchup_snapshot = serialize_close_vector (&m);
+
+ /* Actually copy internal state */
+ args->catchup_snapshot = s->config.catchup_snapshot
+ (mcm, args->catchup_snapshot, rep->last_global_sequence_included);
+
+ rep = (void *) args->catchup_snapshot;
+ rep->n_data_bytes = vec_len (args->catchup_snapshot) - sizeof (rep[0]);
+
+ mc_byte_swap_msg_catchup_reply (rep);
+ }
+
+ /* now go send it... */
+ vlib_process_signal_event (vm, mcm->catchup_process,
+ EVENT_MC_SEND_CATCHUP_DATA,
+ args - mcm->catchup_process_args);
+}
+
+#define EVENT_MC_UNSERIALIZE_BUFFER 0
+#define EVENT_MC_UNSERIALIZE_CATCHUP 1
+
+void
+mc_msg_catchup_reply_handler (mc_main_t * mcm, mc_msg_catchup_reply_t * mp,
+ u32 catchup_opaque)
+{
+ vlib_process_signal_event (mcm->vlib_main,
+ mcm->unserialize_process,
+ EVENT_MC_UNSERIALIZE_CATCHUP,
+ pointer_to_uword (mp));
+}
+
+static void
+perform_catchup (mc_main_t * mcm, mc_msg_catchup_reply_t * mp)
+{
+ mc_stream_t *s;
+ i32 seq_cmp_result;
+
+ mc_byte_swap_msg_catchup_reply (mp);
+
+ s = mc_stream_by_index (mcm, mp->stream_index);
+
+ /* Never heard of this stream or already caught up. */
+ if (!s || s->state == MC_STREAM_STATE_ready)
+ return;
+
+ {
+ serialize_main_t m;
+ mc_stream_peer_t *p;
+ u32 n_stream_bytes;
+
+ /* For offline sim replay: save the entire catchup snapshot... */
+ if (s->config.save_snapshot)
+ s->config.save_snapshot (mcm, /* is_catchup */ 1, mp->data,
+ mp->n_data_bytes);
+
+ unserialize_open_data (&m, mp->data, mp->n_data_bytes);
+ unserialize (&m, unserialize_mc_stream, s);
+
+ /* Make sure we start numbering our messages as expected */
+ /* *INDENT-OFF* */
+ pool_foreach (p, s->peers, ({
+ if (p->id.as_u64 == mcm->transport.our_ack_peer_id.as_u64)
+ s->our_local_sequence = p->last_sequence_received + 1;
+ }));
+/* *INDENT-ON* */
+
+ n_stream_bytes = m.stream.current_buffer_index;
+
+ /* No need to unserialize close; nothing to free. */
+
+ /* After serialized stream is user's catchup data. */
+ s->config.catchup (mcm, mp->data + n_stream_bytes,
+ mp->n_data_bytes - n_stream_bytes);
+ }
+
+ /* Vector could have been moved by catchup.
+ This can only happen for mc-internal stream. */
+ s = mc_stream_by_index (mcm, mp->stream_index);
+
+ s->last_global_sequence_processed = mp->last_global_sequence_included;
+
+ while (clib_fifo_elts (s->catchup_fifo))
+ {
+ mc_msg_user_request_t *gp;
+ u32 bi;
+ vlib_buffer_t *b;
+
+ clib_fifo_sub1 (s->catchup_fifo, bi);
+
+ b = vlib_get_buffer (mcm->vlib_main, bi);
+ gp = vlib_buffer_get_current (b);
+
+ /* Make sure we're replaying "new" news */
+ seq_cmp_result = mc_seq_cmp (gp->global_sequence,
+ mp->last_global_sequence_included);
+
+ if (seq_cmp_result > 0)
+ {
+ vlib_buffer_advance (b, sizeof (gp[0]));
+ s->config.rx_buffer (mcm, s, gp->peer_id, bi);
+ s->last_global_sequence_processed = gp->global_sequence;
+
+ if (MC_EVENT_LOGGING)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (t) =
+ {
+ .format = "catchup replay local sequence 0x%x",
+ .format_args = "i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 local_sequence;
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, t);
+ ed->local_sequence = gp->local_sequence;
+ }
+ }
+ else
+ {
+ if (MC_EVENT_LOGGING)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (t) =
+ {
+ .format = "catchup discard local sequence 0x%x",
+ .format_args = "i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 local_sequence;
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, t);
+ ed->local_sequence = gp->local_sequence;
+ }
+
+ vlib_buffer_free_one (mcm->vlib_main, bi);
+ }
+ }
+
+ s->state = MC_STREAM_STATE_ready;
+
+ /* Now that we are caught up wake up joining process. */
+ {
+ vlib_one_time_waiting_process_t *wp;
+ vec_foreach (wp, s->procs_waiting_for_join_done)
+ vlib_signal_one_time_waiting_process (mcm->vlib_main, wp);
+ if (s->procs_waiting_for_join_done)
+ _vec_len (s->procs_waiting_for_join_done) = 0;
+ }
+}
+
+static void
+this_node_maybe_master (mc_main_t * mcm)
+{
+ vlib_main_t *vm = mcm->vlib_main;
+ mc_msg_master_assert_t *mp;
+ uword event_type;
+ int timeouts = 0;
+ int is_master = mcm->relay_state == MC_RELAY_STATE_MASTER;
+ clib_error_t *error;
+ f64 now, time_last_master_assert = -1;
+ u32 bi;
+
+ while (1)
+ {
+ if (!mcm->we_can_be_relay_master)
+ {
+ mcm->relay_state = MC_RELAY_STATE_SLAVE;
+ if (MC_EVENT_LOGGING)
+ {
+ ELOG_TYPE (e, "become slave (config)");
+ ELOG (mcm->elog_main, e, 0);
+ }
+ return;
+ }
+
+ now = vlib_time_now (vm);
+ if (now >= time_last_master_assert + 1)
+ {
+ time_last_master_assert = now;
+ mp = mc_get_vlib_buffer (mcm->vlib_main, sizeof (mp[0]), &bi);
+
+ mp->peer_id = mcm->transport.our_ack_peer_id;
+ mp->global_sequence = mcm->relay_global_sequence;
+
+ /*
+ * these messages clog the event log, set MC_EVENT_LOGGING higher
+ * if you want them
+ */
+ if (MC_EVENT_LOGGING > 1)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "tx-massert: peer %s global seq %u",
+ .format_args = "T4i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 peer, global_sequence;
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, e);
+ ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64);
+ ed->global_sequence = mp->global_sequence;
+ }
+
+ mc_byte_swap_msg_master_assert (mp);
+
+ error =
+ mcm->transport.tx_buffer (mcm->transport.opaque,
+ MC_TRANSPORT_MASTERSHIP, bi);
+ if (error)
+ clib_error_report (error);
+ }
+
+ vlib_process_wait_for_event_or_clock (vm, 1.0);
+ event_type = vlib_process_get_events (vm, /* no event data */ 0);
+
+ switch (event_type)
+ {
+ case ~0:
+ if (!is_master && timeouts++ > 2)
+ {
+ mcm->relay_state = MC_RELAY_STATE_MASTER;
+ mcm->relay_master_peer_id =
+ mcm->transport.our_ack_peer_id.as_u64;
+ if (MC_EVENT_LOGGING)
+ {
+ ELOG_TYPE (e, "become master (was maybe_master)");
+ ELOG (mcm->elog_main, e, 0);
+ }
+ return;
+ }
+ break;
+
+ case MC_RELAY_STATE_SLAVE:
+ mcm->relay_state = MC_RELAY_STATE_SLAVE;
+ if (MC_EVENT_LOGGING && mcm->relay_state != MC_RELAY_STATE_SLAVE)
+ {
+ ELOG_TYPE (e, "become slave (was maybe_master)");
+ ELOG (mcm->elog_main, e, 0);
+ }
+ return;
+ }
+ }
+}
+
+static void
+this_node_slave (mc_main_t * mcm)
+{
+ vlib_main_t *vm = mcm->vlib_main;
+ uword event_type;
+ int timeouts = 0;
+
+ if (MC_EVENT_LOGGING)
+ {
+ ELOG_TYPE (e, "become slave");
+ ELOG (mcm->elog_main, e, 0);
+ }
+
+ while (1)
+ {
+ vlib_process_wait_for_event_or_clock (vm, 1.0);
+ event_type = vlib_process_get_events (vm, /* no event data */ 0);
+
+ switch (event_type)
+ {
+ case ~0:
+ if (timeouts++ > 2)
+ {
+ mcm->relay_state = MC_RELAY_STATE_NEGOTIATE;
+ mcm->relay_master_peer_id = ~0ULL;
+ if (MC_EVENT_LOGGING)
+ {
+ ELOG_TYPE (e, "timeouts; negoitate mastership");
+ ELOG (mcm->elog_main, e, 0);
+ }
+ return;
+ }
+ break;
+
+ case MC_RELAY_STATE_SLAVE:
+ mcm->relay_state = MC_RELAY_STATE_SLAVE;
+ timeouts = 0;
+ break;
+ }
+ }
+}
+
+static uword
+mc_mastership_process (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+ mc_main_t *mcm = mc_node_get_main (node);
+
+ while (1)
+ {
+ switch (mcm->relay_state)
+ {
+ case MC_RELAY_STATE_NEGOTIATE:
+ case MC_RELAY_STATE_MASTER:
+ this_node_maybe_master (mcm);
+ break;
+
+ case MC_RELAY_STATE_SLAVE:
+ this_node_slave (mcm);
+ break;
+ }
+ }
+ return 0; /* not likely */
+}
+
+void
+mc_enable_disable_mastership (mc_main_t * mcm, int we_can_be_master)
+{
+ if (we_can_be_master != mcm->we_can_be_relay_master)
+ {
+ mcm->we_can_be_relay_master = we_can_be_master;
+ vlib_process_signal_event (mcm->vlib_main,
+ mcm->mastership_process,
+ MC_RELAY_STATE_NEGOTIATE, 0);
+ }
+}
+
+void
+mc_msg_master_assert_handler (mc_main_t * mcm, mc_msg_master_assert_t * mp,
+ u32 buffer_index)
+{
+ mc_peer_id_t his_peer_id, our_peer_id;
+ i32 seq_cmp_result;
+ u8 signal_slave = 0;
+ u8 update_global_sequence = 0;
+
+ mc_byte_swap_msg_master_assert (mp);
+
+ his_peer_id = mp->peer_id;
+ our_peer_id = mcm->transport.our_ack_peer_id;
+
+ /* compare the incoming global sequence with ours */
+ seq_cmp_result = mc_seq_cmp (mp->global_sequence,
+ mcm->relay_global_sequence);
+
+ /* If the sender has a lower peer id and the sender's sequence >=
+ our global sequence, we become a slave. Otherwise we are master. */
+ if (mc_peer_id_compare (his_peer_id, our_peer_id) < 0
+ && seq_cmp_result >= 0)
+ {
+ vlib_process_signal_event (mcm->vlib_main,
+ mcm->mastership_process,
+ MC_RELAY_STATE_SLAVE, 0);
+ signal_slave = 1;
+ }
+
+ /* Update our global sequence. */
+ if (seq_cmp_result > 0)
+ {
+ mcm->relay_global_sequence = mp->global_sequence;
+ update_global_sequence = 1;
+ }
+
+ {
+ uword *q = mhash_get (&mcm->mastership_peer_index_by_id, &his_peer_id);
+ mc_mastership_peer_t *p;
+
+ if (q)
+ p = vec_elt_at_index (mcm->mastership_peers, q[0]);
+ else
+ {
+ vec_add2 (mcm->mastership_peers, p, 1);
+ p->peer_id = his_peer_id;
+ mhash_set (&mcm->mastership_peer_index_by_id, &p->peer_id,
+ p - mcm->mastership_peers,
+ /* old_value */ 0);
+ }
+ p->time_last_master_assert_received = vlib_time_now (mcm->vlib_main);
+ }
+
+ /*
+ * these messages clog the event log, set MC_EVENT_LOGGING higher
+ * if you want them.
+ */
+ if (MC_EVENT_LOGGING > 1)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "rx-massert: peer %s global seq %u upd %d slave %d",
+ .format_args = "T4i4i1i1",
+ };
+ /* *INDENT-ON* */
+
+ struct
+ {
+ u32 peer;
+ u32 global_sequence;
+ u8 update_sequence;
+ u8 slave;
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, e);
+ ed->peer = elog_id_for_peer_id (mcm, his_peer_id.as_u64);
+ ed->global_sequence = mp->global_sequence;
+ ed->update_sequence = update_global_sequence;
+ ed->slave = signal_slave;
+ }
+}
+
+static void
+mc_serialize_init (mc_main_t * mcm)
+{
+ mc_serialize_msg_t *m;
+ vlib_main_t *vm = vlib_get_main ();
+
+ mcm->global_msg_index_by_name
+ = hash_create_string ( /* elts */ 0, sizeof (uword));
+
+ m = vm->mc_msg_registrations;
+
+ while (m)
+ {
+ m->global_index = vec_len (mcm->global_msgs);
+ hash_set_mem (mcm->global_msg_index_by_name, m->name, m->global_index);
+ vec_add1 (mcm->global_msgs, m);
+ m = m->next_registration;
+ }
+}
+
+clib_error_t *
+mc_serialize_va (mc_main_t * mc,
+ u32 stream_index,
+ u32 multiple_messages_per_vlib_buffer,
+ mc_serialize_msg_t * msg, va_list * va)
+{
+ mc_stream_t *s;
+ clib_error_t *error;
+ serialize_main_t *m = &mc->serialize_mains[VLIB_TX];
+ vlib_serialize_buffer_main_t *sbm = &mc->serialize_buffer_mains[VLIB_TX];
+ u32 bi, n_before, n_after, n_total, n_this_msg;
+ u32 si, gi;
+
+ if (!sbm->vlib_main)
+ {
+ sbm->tx.max_n_data_bytes_per_chain = 4096;
+ sbm->tx.free_list_index = VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX;
+ }
+
+ if (sbm->first_buffer == 0)
+ serialize_open_vlib_buffer (m, mc->vlib_main, sbm);
+
+ n_before = serialize_vlib_buffer_n_bytes (m);
+
+ s = mc_stream_by_index (mc, stream_index);
+ gi = msg->global_index;
+ ASSERT (msg == vec_elt (mc->global_msgs, gi));
+
+ si = ~0;
+ if (gi < vec_len (s->stream_msg_index_by_global_index))
+ si = s->stream_msg_index_by_global_index[gi];
+
+ serialize_likely_small_unsigned_integer (m, si);
+
+ /* For first time message is sent, use name to identify message. */
+ if (si == ~0 || MSG_ID_DEBUG)
+ serialize_cstring (m, msg->name);
+
+ if (MSG_ID_DEBUG && MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "serialize-msg: %s index %d",
+ .format_args = "T4i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 c[2];
+ } *ed;
+ ed = ELOG_DATA (mc->elog_main, e);
+ ed->c[0] = elog_id_for_msg_name (mc, msg->name);
+ ed->c[1] = si;
+ }
+
+ error = va_serialize (m, va);
+
+ n_after = serialize_vlib_buffer_n_bytes (m);
+ n_this_msg = n_after - n_before;
+ n_total = n_after + sizeof (mc_msg_user_request_t);
+
+ /* For max message size ignore first message where string name is sent. */
+ if (si != ~0)
+ msg->max_n_bytes_serialized =
+ clib_max (msg->max_n_bytes_serialized, n_this_msg);
+
+ if (!multiple_messages_per_vlib_buffer
+ || si == ~0
+ || n_total + msg->max_n_bytes_serialized >
+ mc->transport.max_packet_size)
+ {
+ bi = serialize_close_vlib_buffer (m);
+ sbm->first_buffer = 0;
+ if (!error)
+ mc_stream_send (mc, stream_index, bi);
+ else if (bi != ~0)
+ vlib_buffer_free_one (mc->vlib_main, bi);
+ }
+
+ return error;
+}
+
+clib_error_t *
+mc_serialize_internal (mc_main_t * mc,
+ u32 stream_index,
+ u32 multiple_messages_per_vlib_buffer,
+ mc_serialize_msg_t * msg, ...)
+{
+ vlib_main_t *vm = mc->vlib_main;
+ va_list va;
+ clib_error_t *error;
+
+ if (stream_index == ~0)
+ {
+ if (vm->mc_main && vm->mc_stream_index == ~0)
+ vlib_current_process_wait_for_one_time_event_vector
+ (vm, &vm->procs_waiting_for_mc_stream_join);
+ stream_index = vm->mc_stream_index;
+ }
+
+ va_start (va, msg);
+ error = mc_serialize_va (mc, stream_index,
+ multiple_messages_per_vlib_buffer, msg, &va);
+ va_end (va);
+ return error;
+}
+
+uword
+mc_unserialize_message (mc_main_t * mcm,
+ mc_stream_t * s, serialize_main_t * m)
+{
+ mc_serialize_stream_msg_t *sm;
+ u32 gi, si;
+
+ si = unserialize_likely_small_unsigned_integer (m);
+
+ if (!(si == ~0 || MSG_ID_DEBUG))
+ {
+ sm = vec_elt_at_index (s->stream_msgs, si);
+ gi = sm->global_index;
+ }
+ else
+ {
+ char *name;
+
+ unserialize_cstring (m, &name);
+
+ if (MSG_ID_DEBUG && MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "unserialize-msg: %s rx index %d",
+ .format_args = "T4i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 c[2];
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, e);
+ ed->c[0] = elog_id_for_msg_name (mcm, name);
+ ed->c[1] = si;
+ }
+
+ {
+ uword *p = hash_get_mem (mcm->global_msg_index_by_name, name);
+ gi = p ? p[0] : ~0;
+ }
+
+ /* Unknown message? */
+ if (gi == ~0)
+ {
+ vec_free (name);
+ goto done;
+ }
+
+ vec_validate_init_empty (s->stream_msg_index_by_global_index, gi, ~0);
+ si = s->stream_msg_index_by_global_index[gi];
+
+ /* Stream local index unknown? Create it. */
+ if (si == ~0)
+ {
+ vec_add2 (s->stream_msgs, sm, 1);
+
+ si = sm - s->stream_msgs;
+ sm->global_index = gi;
+ s->stream_msg_index_by_global_index[gi] = si;
+
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "msg-bind: stream %d %s to index %d",
+ .format_args = "i4T4i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 c[3];
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, e);
+ ed->c[0] = s->index;
+ ed->c[1] = elog_id_for_msg_name (mcm, name);
+ ed->c[2] = si;
+ }
+ }
+ else
+ {
+ sm = vec_elt_at_index (s->stream_msgs, si);
+ if (gi != sm->global_index && MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "msg-id-ERROR: %s index %d expected %d",
+ .format_args = "T4i4i4",
+ };
+ /* *INDENT-ON* */
+ struct
+ {
+ u32 c[3];
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, e);
+ ed->c[0] = elog_id_for_msg_name (mcm, name);
+ ed->c[1] = si;
+ ed->c[2] = ~0;
+ if (sm->global_index <
+ vec_len (s->stream_msg_index_by_global_index))
+ ed->c[2] =
+ s->stream_msg_index_by_global_index[sm->global_index];
+ }
+ }
+
+ vec_free (name);
+ }
+
+ if (gi != ~0)
+ {
+ mc_serialize_msg_t *msg;
+ msg = vec_elt (mcm->global_msgs, gi);
+ unserialize (m, msg->unserialize, mcm);
+ }
+
+done:
+ return gi != ~0;
+}
+
+void
+mc_unserialize_internal (mc_main_t * mcm, u32 stream_and_buffer_index)
+{
+ vlib_main_t *vm = mcm->vlib_main;
+ serialize_main_t *m = &mcm->serialize_mains[VLIB_RX];
+ vlib_serialize_buffer_main_t *sbm = &mcm->serialize_buffer_mains[VLIB_RX];
+ mc_stream_and_buffer_t *sb;
+ mc_stream_t *stream;
+ u32 buffer_index;
+
+ sb =
+ pool_elt_at_index (mcm->mc_unserialize_stream_and_buffers,
+ stream_and_buffer_index);
+ buffer_index = sb->buffer_index;
+ stream = vec_elt_at_index (mcm->stream_vector, sb->stream_index);
+ pool_put (mcm->mc_unserialize_stream_and_buffers, sb);
+
+ if (stream->config.save_snapshot)
+ {
+ u32 n_bytes = vlib_buffer_index_length_in_chain (vm, buffer_index);
+ static u8 *contents;
+ vec_reset_length (contents);
+ vec_validate (contents, n_bytes - 1);
+ vlib_buffer_contents (vm, buffer_index, contents);
+ stream->config.save_snapshot (mcm, /* is_catchup */ 0, contents,
+ n_bytes);
+ }
+
+ ASSERT (vlib_in_process_context (vm));
+
+ unserialize_open_vlib_buffer (m, vm, sbm);
+
+ clib_fifo_add1 (sbm->rx.buffer_fifo, buffer_index);
+
+ while (unserialize_vlib_buffer_n_bytes (m) > 0)
+ mc_unserialize_message (mcm, stream, m);
+
+ /* Frees buffer. */
+ unserialize_close_vlib_buffer (m);
+}
+
+void
+mc_unserialize (mc_main_t * mcm, mc_stream_t * s, u32 buffer_index)
+{
+ vlib_main_t *vm = mcm->vlib_main;
+ mc_stream_and_buffer_t *sb;
+ pool_get (mcm->mc_unserialize_stream_and_buffers, sb);
+ sb->stream_index = s->index;
+ sb->buffer_index = buffer_index;
+ vlib_process_signal_event (vm, mcm->unserialize_process,
+ EVENT_MC_UNSERIALIZE_BUFFER,
+ sb - mcm->mc_unserialize_stream_and_buffers);
+}
+
+static uword
+mc_unserialize_process (vlib_main_t * vm,
+ vlib_node_runtime_t * node, vlib_frame_t * f)
+{
+ mc_main_t *mcm = mc_node_get_main (node);
+ uword event_type, *event_data = 0;
+ int i;
+
+ while (1)
+ {
+ if (event_data)
+ _vec_len (event_data) = 0;
+
+ vlib_process_wait_for_event (vm);
+ event_type = vlib_process_get_events (vm, &event_data);
+ switch (event_type)
+ {
+ case EVENT_MC_UNSERIALIZE_BUFFER:
+ for (i = 0; i < vec_len (event_data); i++)
+ mc_unserialize_internal (mcm, event_data[i]);
+ break;
+
+ case EVENT_MC_UNSERIALIZE_CATCHUP:
+ for (i = 0; i < vec_len (event_data); i++)
+ {
+ u8 *mp = uword_to_pointer (event_data[i], u8 *);
+ perform_catchup (mcm, (void *) mp);
+ vec_free (mp);
+ }
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ return 0; /* not likely */
+}
+
+void
+serialize_mc_main (serialize_main_t * m, va_list * va)
+{
+ mc_main_t *mcm = va_arg (*va, mc_main_t *);
+ mc_stream_t *s;
+ mc_serialize_stream_msg_t *sm;
+ mc_serialize_msg_t *msg;
+
+ serialize_integer (m, vec_len (mcm->stream_vector), sizeof (u32));
+ vec_foreach (s, mcm->stream_vector)
+ {
+ /* Stream name. */
+ serialize_cstring (m, s->config.name);
+
+ /* Serialize global names for all sent messages. */
+ serialize_integer (m, vec_len (s->stream_msgs), sizeof (u32));
+ vec_foreach (sm, s->stream_msgs)
+ {
+ msg = vec_elt (mcm->global_msgs, sm->global_index);
+ serialize_cstring (m, msg->name);
+ }
+ }
+}
+
+void
+unserialize_mc_main (serialize_main_t * m, va_list * va)
+{
+ mc_main_t *mcm = va_arg (*va, mc_main_t *);
+ u32 i, n_streams, n_stream_msgs;
+ char *name;
+ mc_stream_t *s;
+ mc_serialize_stream_msg_t *sm;
+
+ unserialize_integer (m, &n_streams, sizeof (u32));
+ for (i = 0; i < n_streams; i++)
+ {
+ unserialize_cstring (m, &name);
+ if (i != MC_STREAM_INDEX_INTERNAL && !mc_stream_by_name (mcm, name))
+ {
+ vec_validate (mcm->stream_vector, i);
+ s = vec_elt_at_index (mcm->stream_vector, i);
+ mc_stream_init (s);
+ s->index = s - mcm->stream_vector;
+ s->config.name = name;
+ s->state = MC_STREAM_STATE_name_known;
+ hash_set_mem (mcm->stream_index_by_name, s->config.name, s->index);
+ }
+ else
+ vec_free (name);
+
+ s = vec_elt_at_index (mcm->stream_vector, i);
+
+ vec_free (s->stream_msgs);
+ vec_free (s->stream_msg_index_by_global_index);
+
+ unserialize_integer (m, &n_stream_msgs, sizeof (u32));
+ vec_resize (s->stream_msgs, n_stream_msgs);
+ vec_foreach (sm, s->stream_msgs)
+ {
+ uword *p;
+ u32 si, gi;
+
+ unserialize_cstring (m, &name);
+ p = hash_get (mcm->global_msg_index_by_name, name);
+ gi = p ? p[0] : ~0;
+ si = sm - s->stream_msgs;
+
+ if (MC_EVENT_LOGGING > 0)
+ {
+ /* *INDENT-OFF* */
+ ELOG_TYPE_DECLARE (e) =
+ {
+ .format = "catchup-bind: %s to %d global index %d stream %d",
+ .format_args = "T4i4i4i4",
+ };
+ /* *INDENT-ON* */
+
+ struct
+ {
+ u32 c[4];
+ } *ed;
+ ed = ELOG_DATA (mcm->elog_main, e);
+ ed->c[0] = elog_id_for_msg_name (mcm, name);
+ ed->c[1] = si;
+ ed->c[2] = gi;
+ ed->c[3] = s->index;
+ }
+
+ vec_free (name);
+
+ sm->global_index = gi;
+ if (gi != ~0)
+ {
+ vec_validate_init_empty (s->stream_msg_index_by_global_index,
+ gi, ~0);
+ s->stream_msg_index_by_global_index[gi] = si;
+ }
+ }
+ }
+}
+
+void
+mc_main_init (mc_main_t * mcm, char *tag)
+{
+ vlib_main_t *vm = vlib_get_main ();
+
+ mcm->vlib_main = vm;
+ mcm->elog_main = &vm->elog_main;
+
+ mcm->relay_master_peer_id = ~0ULL;
+ mcm->relay_state = MC_RELAY_STATE_NEGOTIATE;
+
+ mcm->stream_index_by_name
+ = hash_create_string ( /* elts */ 0, /* value size */ sizeof (uword));
+
+ {
+ vlib_node_registration_t r;
+
+ memset (&r, 0, sizeof (r));
+
+ r.type = VLIB_NODE_TYPE_PROCESS;
+
+ /* Point runtime data to main instance. */
+ r.runtime_data = &mcm;
+ r.runtime_data_bytes = sizeof (&mcm);
+
+ r.name = (char *) format (0, "mc-mastership-%s", tag);
+ r.function = mc_mastership_process;
+ mcm->mastership_process = vlib_register_node (vm, &r);
+
+ r.name = (char *) format (0, "mc-join-ager-%s", tag);
+ r.function = mc_join_ager_process;
+ mcm->join_ager_process = vlib_register_node (vm, &r);
+
+ r.name = (char *) format (0, "mc-retry-%s", tag);
+ r.function = mc_retry_process;
+ mcm->retry_process = vlib_register_node (vm, &r);
+
+ r.name = (char *) format (0, "mc-catchup-%s", tag);
+ r.function = mc_catchup_process;
+ mcm->catchup_process = vlib_register_node (vm, &r);
+
+ r.name = (char *) format (0, "mc-unserialize-%s", tag);
+ r.function = mc_unserialize_process;
+ mcm->unserialize_process = vlib_register_node (vm, &r);
+ }
+
+ if (MC_EVENT_LOGGING > 0)
+ mhash_init (&mcm->elog_id_by_peer_id, sizeof (uword),
+ sizeof (mc_peer_id_t));
+
+ mhash_init (&mcm->mastership_peer_index_by_id, sizeof (uword),
+ sizeof (mc_peer_id_t));
+ mc_serialize_init (mcm);
+}
+
+static u8 *
+format_mc_relay_state (u8 * s, va_list * args)
+{
+ mc_relay_state_t state = va_arg (*args, mc_relay_state_t);
+ char *t = 0;
+ switch (state)
+ {
+ case MC_RELAY_STATE_NEGOTIATE:
+ t = "negotiate";
+ break;
+ case MC_RELAY_STATE_MASTER:
+ t = "master";
+ break;
+ case MC_RELAY_STATE_SLAVE:
+ t = "slave";
+ break;
+ default:
+ return format (s, "unknown 0x%x", state);
+ }
+
+ return format (s, "%s", t);
+}
+
+static u8 *
+format_mc_stream_state (u8 * s, va_list * args)
+{
+ mc_stream_state_t state = va_arg (*args, mc_stream_state_t);
+ char *t = 0;
+ switch (state)
+ {
+#define _(f) case MC_STREAM_STATE_##f: t = #f; break;
+ foreach_mc_stream_state
+#undef _
+ default:
+ return format (s, "unknown 0x%x", state);
+ }
+
+ return format (s, "%s", t);
+}
+
+static int
+mc_peer_comp (void *a1, void *a2)
+{
+ mc_stream_peer_t *p1 = a1;
+ mc_stream_peer_t *p2 = a2;
+
+ return mc_peer_id_compare (p1->id, p2->id);
+}
+
+u8 *
+format_mc_main (u8 * s, va_list * args)
+{
+ mc_main_t *mcm = va_arg (*args, mc_main_t *);
+ mc_stream_t *t;
+ mc_stream_peer_t *p, *ps;
+ uword indent = format_get_indent (s);
+
+ s = format (s, "MC state %U, %d streams joined, global sequence 0x%x",
+ format_mc_relay_state, mcm->relay_state,
+ vec_len (mcm->stream_vector), mcm->relay_global_sequence);
+
+ {
+ mc_mastership_peer_t *mp;
+ f64 now = vlib_time_now (mcm->vlib_main);
+ s = format (s, "\n%UMost recent mastership peers:",
+ format_white_space, indent + 2);
+ vec_foreach (mp, mcm->mastership_peers)
+ {
+ s = format (s, "\n%U%-30U%.4e",
+ format_white_space, indent + 4,
+ mcm->transport.format_peer_id, mp->peer_id,
+ now - mp->time_last_master_assert_received);
+ }
+ }
+
+ vec_foreach (t, mcm->stream_vector)
+ {
+ s = format (s, "\n%Ustream `%s' index %d",
+ format_white_space, indent + 2, t->config.name, t->index);
+
+ s = format (s, "\n%Ustate %U",
+ format_white_space, indent + 4,
+ format_mc_stream_state, t->state);
+
+ s =
+ format (s,
+ "\n%Uretries: interval %.0f sec, limit %d, pool elts %d, %Ld sent",
+ format_white_space, indent + 4, t->config.retry_interval,
+ t->config.retry_limit, pool_elts (t->retry_pool),
+ t->stats.n_retries - t->stats_last_clear.n_retries);
+
+ s = format (s, "\n%U%Ld/%Ld user requests sent/received",
+ format_white_space, indent + 4,
+ t->user_requests_sent, t->user_requests_received);
+
+ s = format (s, "\n%U%d peers, local/global sequence 0x%x/0x%x",
+ format_white_space, indent + 4,
+ pool_elts (t->peers),
+ t->our_local_sequence, t->last_global_sequence_processed);
+
+ ps = 0;
+ /* *INDENT-OFF* */
+ pool_foreach (p, t->peers,
+ ({
+ if (clib_bitmap_get (t->all_peer_bitmap, p - t->peers))
+ vec_add1 (ps, p[0]);
+ }));
+ /* *INDENT-ON* */
+ vec_sort_with_function (ps, mc_peer_comp);
+ s = format (s, "\n%U%=30s%10s%16s%16s",
+ format_white_space, indent + 6,
+ "Peer", "Last seq", "Retries", "Future");
+
+ vec_foreach (p, ps)
+ {
+ s = format (s, "\n%U%-30U0x%08x%16Ld%16Ld%s",
+ format_white_space, indent + 6,
+ mcm->transport.format_peer_id, p->id.as_u64,
+ p->last_sequence_received,
+ p->stats.n_msgs_from_past -
+ p->stats_last_clear.n_msgs_from_past,
+ p->stats.n_msgs_from_future -
+ p->stats_last_clear.n_msgs_from_future,
+ (mcm->transport.our_ack_peer_id.as_u64 ==
+ p->id.as_u64 ? " (self)" : ""));
+ }
+ vec_free (ps);
+ }
+
+ return s;
+}
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */