diff options
Diffstat (limited to 'src/vlib')
63 files changed, 29819 insertions, 0 deletions
diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c new file mode 100644 index 00000000000..4bf6d125b21 --- /dev/null +++ b/src/vlib/buffer.c @@ -0,0 +1,1987 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * buffer.c: allocate/free network buffers. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +/** + * @cond (!DPDK) + * @file + * + * Allocate/free network buffers. + */ + +#if DPDK > 0 +#include <rte_config.h> + +#include <rte_common.h> +#include <rte_log.h> +#include <rte_memory.h> +#include <rte_memzone.h> +#include <rte_tailq.h> +#include <rte_eal.h> +#include <rte_per_lcore.h> +#include <rte_launch.h> +#include <rte_atomic.h> +#include <rte_cycles.h> +#include <rte_prefetch.h> +#include <rte_lcore.h> +#include <rte_per_lcore.h> +#include <rte_branch_prediction.h> +#include <rte_interrupts.h> +#include <rte_pci.h> +#include <rte_random.h> +#include <rte_debug.h> +#include <rte_ether.h> +#include <rte_ethdev.h> +#include <rte_ring.h> +#include <rte_mempool.h> +#include <rte_mbuf.h> +#include <rte_version.h> +#endif + +#include <vlib/vlib.h> + +#if DPDK > 0 +#pragma weak rte_mem_virt2phy +#pragma weak rte_eal_has_hugepages +#pragma weak rte_socket_id +#pragma weak rte_pktmbuf_pool_create +#endif +/* Sum current_length over every buffer chained after b_first, cache that sum in b_first->total_length_not_including_first_buffer, mark it valid with VLIB_BUFFER_TOTAL_LENGTH_VALID, and return the total including b_first itself. */ +uword +vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, + vlib_buffer_t * b_first) +{ + vlib_buffer_t *b = b_first; + uword l_first = b_first->current_length; + uword l = 0; + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + { + b = vlib_get_buffer (vm, b->next_buffer); + l += b->current_length; + } + b_first->total_length_not_including_first_buffer = l; + b_first->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + return l + l_first; +} +/* Format function: reads a vlib_buffer_t * from args and appends a summary of it to s. With DPDK every chained segment is listed on its own indented line; otherwise only the index of the next buffer is printed. */ +u8 * +format_vlib_buffer (u8 * s, va_list * args) +{ + vlib_buffer_t *b = va_arg (*args, vlib_buffer_t *); +#if DPDK > 0 + uword indent = format_get_indent (s); + + s = format (s, "current data %d, length %d, free-list %d", + b->current_data, b->current_length, b->free_list_index); + + if (b->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID) + s = format (s, ", totlen-nifb %d", + b->total_length_not_including_first_buffer); + + if (b->flags & VLIB_BUFFER_IS_TRACED) + s = format (s, ", trace 0x%x", b->trace_index); + + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + { + vlib_main_t *vm = vlib_get_main (); + u32 next_buffer = 
b->next_buffer; + b = vlib_get_buffer (vm, next_buffer); + + s = format (s, "\n%Unext-buffer 0x%x, segment length %d", + format_white_space, indent, next_buffer, b->current_length); + } + +#else + + s = format (s, "current data %d, length %d, free-list %d", + b->current_data, b->current_length, b->free_list_index); + + if (b->flags & VLIB_BUFFER_IS_TRACED) + s = format (s, ", trace 0x%x", b->trace_index); + + if (b->flags & VLIB_BUFFER_NEXT_PRESENT) + s = format (s, ", next-buffer 0x%x", b->next_buffer); +#endif + + return s; +} +/* Format function: the format_vlib_buffer summary followed by format_hex_bytes over 64 bytes starting at the current data pointer. NOTE(review): the byte count is fixed at 64 and is not derived from current_length - confirm this is intended. */ +u8 * +format_vlib_buffer_and_data (u8 * s, va_list * args) +{ + vlib_buffer_t *b = va_arg (*args, vlib_buffer_t *); + + s = format (s, "%U, %U", + format_vlib_buffer, b, + format_hex_bytes, vlib_buffer_get_current (b), 64); + + return s; +} +/* Format function: print a vlib_buffer_known_state_t as unknown, known-allocated or known-free; any other value prints as invalid. */ +#if DPDK == 0 +static u8 * +format_vlib_buffer_known_state (u8 * s, va_list * args) +{ + vlib_buffer_known_state_t state = va_arg (*args, vlib_buffer_known_state_t); + char *t; + + switch (state) + { + case VLIB_BUFFER_UNKNOWN: + t = "unknown"; + break; + + case VLIB_BUFFER_KNOWN_ALLOCATED: + t = "known-allocated"; + break; + + case VLIB_BUFFER_KNOWN_FREE: + t = "known-free"; + break; + + default: + t = "invalid"; + break; + } + + return format (s, "%s", t); +} +#endif +/* Format function taking (vlib_main_t *, vlib_buffer_t *): append the current data bytes of every buffer in the chain to s. */ +u8 * +format_vlib_buffer_contents (u8 * s, va_list * va) +{ + vlib_main_t *vm = va_arg (*va, vlib_main_t *); + vlib_buffer_t *b = va_arg (*va, vlib_buffer_t *); + + while (1) + { + vec_add (s, vlib_buffer_get_current (b), b->current_length); + if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) + break; + b = vlib_get_buffer (vm, b->next_buffer); + } + + return s; +} +/* Check one buffer: its free list index must be in use, current_data must not lie before the pre-data area, and current_data + current_length must not exceed n_data_bytes of its free list. With follow_buffer_next the next buffer must be known-allocated and is validated as well. Returns 0 when the buffer is valid, otherwise a newly formatted message vector that the caller frees. */ +#if DPDK == 0 +static u8 * +vlib_validate_buffer_helper (vlib_main_t * vm, + u32 bi, + uword follow_buffer_next, uword ** unique_hash) +{ + vlib_buffer_t *b = vlib_get_buffer (vm, bi); + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *fl; + + if (pool_is_free_index (bm->buffer_free_list_pool, b->free_list_index)) + return format (0, "unknown free list 
0x%x", b->free_list_index); + + fl = pool_elt_at_index (bm->buffer_free_list_pool, b->free_list_index); + + if ((signed) b->current_data < (signed) -VLIB_BUFFER_PRE_DATA_SIZE) + return format (0, "current data %d before pre-data", b->current_data); +#if DPDK == 0 + if (b->current_data + b->current_length > fl->n_data_bytes) + return format (0, "%d-%d beyond end of buffer %d", + b->current_data, b->current_length, fl->n_data_bytes); +#endif + + if (follow_buffer_next && (b->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + vlib_buffer_known_state_t k; + u8 *msg, *result; + + k = vlib_buffer_is_known (vm, b->next_buffer); + if (k != VLIB_BUFFER_KNOWN_ALLOCATED) + return format (0, "next 0x%x: %U", + b->next_buffer, format_vlib_buffer_known_state, k); + + if (unique_hash) + { + if (hash_get (*unique_hash, b->next_buffer)) + return format (0, "duplicate buffer 0x%x", b->next_buffer); + + hash_set1 (*unique_hash, b->next_buffer); + } + /* NOTE(review): this recursion passes no unique_hash, so only the immediate next buffer is checked for duplicates - confirm whether the whole chain should be. */ + msg = vlib_validate_buffer (vm, b->next_buffer, follow_buffer_next); + if (msg) + { + result = format (0, "next 0x%x: %v", b->next_buffer, msg); + vec_free (msg); + return result; + } + } + + return 0; +} +/* Validate a single buffer index without duplicate detection. Returns 0 when valid, otherwise a newly formatted message vector. */ +u8 * +vlib_validate_buffer (vlib_main_t * vm, u32 bi, uword follow_buffer_next) +{ + return vlib_validate_buffer_helper (vm, bi, follow_buffer_next, + /* unique_hash */ 0); +} +/* Validate n_buffers indices read from buffers every next_buffer_stride entries. Each index must not repeat, must be in known_state, and must pass vlib_validate_buffer_helper. Returns 0 on success, otherwise a new message vector prefixed with the offending buffer index. */ +u8 * +vlib_validate_buffers (vlib_main_t * vm, + u32 * buffers, + uword next_buffer_stride, + uword n_buffers, + vlib_buffer_known_state_t known_state, + uword follow_buffer_next) +{ + uword i, *hash; + u32 bi, *b = buffers; + vlib_buffer_known_state_t k; + u8 *msg = 0, *result = 0; + + hash = hash_create (0, 0); + for (i = 0; i < n_buffers; i++) + { + bi = b[0]; + b += next_buffer_stride; + + /* Buffer is not unique. 
*/ + if (hash_get (hash, bi)) + { + msg = format (0, "not unique"); + goto done; + } + + k = vlib_buffer_is_known (vm, bi); + if (k != known_state) + { + msg = format (0, "is %U; expected %U", + format_vlib_buffer_known_state, k, + format_vlib_buffer_known_state, known_state); + goto done; + } + + msg = vlib_validate_buffer_helper (vm, bi, follow_buffer_next, &hash); + if (msg) + goto done; + + hash_set1 (hash, bi); + } + +done: + if (msg) + { + result = format (0, "0x%x: %v", bi, msg); + vec_free (msg); + } + hash_free (hash); + return result; +} +#endif + +vlib_main_t **vlib_mains; + +#if DPDK == 0 +/* When debugging, validate that the given buffers are either known allocated + or known free, then flip each one to the opposite known state. Does nothing when CLIB_DEBUG is 0. */ +static void +vlib_buffer_validate_alloc_free (vlib_main_t * vm, + u32 * buffers, + uword n_buffers, + vlib_buffer_known_state_t expected_state) +{ + u32 *b; + uword i, bi, is_free; + + if (CLIB_DEBUG == 0) + return; + + ASSERT (os_get_cpu_number () == 0); + + /* smp disaster check */ + if (vlib_mains) + ASSERT (vm == vlib_mains[0]); + /* Buffers expected to be allocated are the ones being freed; otherwise they are being allocated. */ + is_free = expected_state == VLIB_BUFFER_KNOWN_ALLOCATED; + b = buffers; + for (i = 0; i < n_buffers; i++) + { + vlib_buffer_known_state_t known; + + bi = b[0]; + b += 1; + known = vlib_buffer_is_known (vm, bi); + if (known != expected_state) + { + ASSERT (0); + vlib_panic_with_msg + (vm, "%s %U buffer 0x%x", + is_free ? "freeing" : "allocating", + format_vlib_buffer_known_state, known, bi); + } + + vlib_buffer_set_known_state + (vm, bi, + is_free ? VLIB_BUFFER_KNOWN_FREE : VLIB_BUFFER_KNOWN_ALLOCATED); + } +} +#endif +/* Number of u32 buffer indices that fit in one vlib_copy_unit_t. */ +#define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32)) + +/* Make sure we have at least given number of unaligned buffers. */ +static void +fill_unaligned (vlib_main_t * vm, + vlib_buffer_free_list_t * free_list, + uword n_unaligned_buffers) +{ + word la = vec_len (free_list->aligned_buffers); + word lu = vec_len (free_list->unaligned_buffers); + + /* Aligned come in aligned copy-sized chunks. 
*/ + ASSERT (la % BUFFERS_PER_COPY == 0); + + ASSERT (la >= n_unaligned_buffers); + + while (lu < n_unaligned_buffers) + { + /* Copy BUFFERS_PER_COPY buffers from end of aligned vector to unaligned vector. */ + vec_add (free_list->unaligned_buffers, + free_list->aligned_buffers + la - BUFFERS_PER_COPY, + BUFFERS_PER_COPY); + la -= BUFFERS_PER_COPY; + lu += BUFFERS_PER_COPY; + } + _vec_len (free_list->aligned_buffers) = la; +} + +/* After free aligned buffers may not contain even sized chunks. This moves all unaligned entries into the aligned vector and then moves the remainder that does not fill a whole copy unit back to the unaligned vector. */ +static void +trim_aligned (vlib_buffer_free_list_t * f) +{ + uword l, n_trim; + + /* Add unaligned to aligned before trim. */ + l = vec_len (f->unaligned_buffers); + if (l > 0) + { + vec_add_aligned (f->aligned_buffers, f->unaligned_buffers, l, + /* align */ sizeof (vlib_copy_unit_t)); + + _vec_len (f->unaligned_buffers) = 0; + } + + /* Remove unaligned buffers from end of aligned vector and save for next trim. */ + l = vec_len (f->aligned_buffers); + n_trim = l % BUFFERS_PER_COPY; + if (n_trim) + { + /* Trim aligned -> unaligned. */ + vec_add (f->unaligned_buffers, f->aligned_buffers + l - n_trim, n_trim); + + /* Remove from aligned. 
*/ + _vec_len (f->aligned_buffers) = l - n_trim; + } +} +/* Move every free buffer index from src to dst: both lists are trimmed first, then the aligned and unaligned vectors of src are appended to those of dst and freed. */ +static void +merge_free_lists (vlib_buffer_free_list_t * dst, + vlib_buffer_free_list_t * src) +{ + uword l; + u32 *d; + + trim_aligned (src); + trim_aligned (dst); + + l = vec_len (src->aligned_buffers); + if (l > 0) + { + vec_add2_aligned (dst->aligned_buffers, d, l, + /* align */ sizeof (vlib_copy_unit_t)); + clib_memcpy (d, src->aligned_buffers, l * sizeof (d[0])); + vec_free (src->aligned_buffers); + } + + l = vec_len (src->unaligned_buffers); + if (l > 0) + { + vec_add (dst->unaligned_buffers, src->unaligned_buffers, l); + vec_free (src->unaligned_buffers); + } +} +/* Return the free list index registered in free_list_by_size for the rounded size, or ~0 when there is none. */ +always_inline u32 +vlib_buffer_get_free_list_with_size (vlib_main_t * vm, u32 size) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + size = vlib_buffer_round_size (size); + uword *p = hash_get (bm->free_list_by_size, size); + return p ? p[0] : ~0; +} + +/* Add buffer free list. The default free list is created first when the pool is still empty. A public list is registered in free_list_by_size unless that size already has an entry. Returns the index of the list. */ +static u32 +vlib_buffer_create_free_list_helper (vlib_main_t * vm, + u32 n_data_bytes, + u32 is_public, u32 is_default, u8 * name) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; +#if DPDK > 0 + int i; + + ASSERT (os_get_cpu_number () == 0); + + if (!is_default && pool_elts (bm->buffer_free_list_pool) == 0) + { + u32 default_free_free_list_index; + + /* *INDENT-OFF* */ + default_free_free_list_index = + vlib_buffer_create_free_list_helper + (vm, + /* default buffer size */ VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, + /* is_public */ 1, + /* is_default */ 1, + (u8 *) "default"); + /* *INDENT-ON* */ + ASSERT (default_free_free_list_index == + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + + if (n_data_bytes == VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES && is_public) + return default_free_free_list_index; + } + + pool_get_aligned (bm->buffer_free_list_pool, f, CLIB_CACHE_LINE_BYTES); + + memset (f, 0, sizeof (f[0])); + f->index = f - bm->buffer_free_list_pool; + f->n_data_bytes = vlib_buffer_round_size (n_data_bytes); + 
f->min_n_buffers_each_physmem_alloc = 16; + f->name = clib_mem_is_heap_object (name) ? name : format (0, "%s", name); + + /* Setup free buffer template. */ + f->buffer_init_template.free_list_index = f->index; + + if (is_public) + { + uword *p = hash_get (bm->free_list_by_size, f->n_data_bytes); + if (!p) + hash_set (bm->free_list_by_size, f->n_data_bytes, f->index); + } + /* Mirror the new list at the same pool index in every other vlib_main, starting with empty buffer vectors. */ + for (i = 1; i < vec_len (vlib_mains); i++) + { + vlib_buffer_main_t *wbm = vlib_mains[i]->buffer_main; + vlib_buffer_free_list_t *wf; + pool_get_aligned (wbm->buffer_free_list_pool, + wf, CLIB_CACHE_LINE_BYTES); + ASSERT (f - bm->buffer_free_list_pool == + wf - wbm->buffer_free_list_pool); + wf[0] = f[0]; + wf->aligned_buffers = 0; + wf->unaligned_buffers = 0; + wf->n_alloc = 0; + } +#else + + if (!is_default && pool_elts (bm->buffer_free_list_pool) == 0) + { + u32 default_free_free_list_index; + + default_free_free_list_index = vlib_buffer_create_free_list_helper (vm, + /* default buffer size */ + VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, + /* is_public */ + 1, + /* is_default */ + 1, + (u8 + *) + "default"); + ASSERT (default_free_free_list_index == + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + + if (n_data_bytes == VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES && is_public) + return default_free_free_list_index; + } + + pool_get_aligned (bm->buffer_free_list_pool, f, CLIB_CACHE_LINE_BYTES); + + memset (f, 0, sizeof (f[0])); + f->index = f - bm->buffer_free_list_pool; + f->n_data_bytes = vlib_buffer_round_size (n_data_bytes); + f->min_n_buffers_each_physmem_alloc = 256; + f->name = clib_mem_is_heap_object (name) ? name : format (0, "%s", name); + + /* Setup free buffer template. 
*/ + f->buffer_init_template.free_list_index = f->index; + + if (is_public) + { + uword *p = hash_get (bm->free_list_by_size, f->n_data_bytes); + if (!p) + hash_set (bm->free_list_by_size, f->n_data_bytes, f->index); + } +#endif + + return f->index; +} +/* Create a non-public free list for n_data_bytes whose name is formatted from fmt and the variadic arguments. Returns the index of the list. */ +u32 +vlib_buffer_create_free_list (vlib_main_t * vm, u32 n_data_bytes, + char *fmt, ...) +{ + va_list va; + u8 *name; + + va_start (va, fmt); + name = va_format (0, fmt, &va); + va_end (va); + + return vlib_buffer_create_free_list_helper (vm, n_data_bytes, + /* is_public */ 0, + /* is_default */ 0, + name); +} +/* Return the index of the free list already registered for this size, or create a public one named from fmt when none is registered. */ +u32 +vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes, + char *fmt, ...) +{ + u32 i = vlib_buffer_get_free_list_with_size (vm, n_data_bytes); + + if (i == ~0) + { + va_list va; + u8 *name; + + va_start (va, fmt); + name = va_format (0, fmt, &va); + va_end (va); + + i = vlib_buffer_create_free_list_helper (vm, n_data_bytes, + /* is_public */ 1, + /* is_default */ 0, + name); + } + + return i; +} +/* Release what a free list owns. With DPDK every buffer still on the list is handed to rte_pktmbuf_free; otherwise every recorded physmem chunk is freed. The name and both index vectors are freed in either case. */ +static void +del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) +{ + u32 i; +#if DPDK > 0 + struct rte_mbuf *mb; + vlib_buffer_t *b; + + for (i = 0; i < vec_len (f->unaligned_buffers); i++) + { + b = vlib_get_buffer (vm, f->unaligned_buffers[i]); + mb = rte_mbuf_from_vlib_buffer (b); + ASSERT (rte_mbuf_refcnt_read (mb) == 1); + rte_pktmbuf_free (mb); + } + for (i = 0; i < vec_len (f->aligned_buffers); i++) + { + b = vlib_get_buffer (vm, f->aligned_buffers[i]); + mb = rte_mbuf_from_vlib_buffer (b); + ASSERT (rte_mbuf_refcnt_read (mb) == 1); + rte_pktmbuf_free (mb); + } + vec_free (f->name); +#else + + for (i = 0; i < vec_len (f->buffer_memory_allocated); i++) + vm->os_physmem_free (f->buffer_memory_allocated[i]); + vec_free (f->name); + vec_free (f->buffer_memory_allocated); +#endif + vec_free (f->unaligned_buffers); + vec_free (f->aligned_buffers); +} + +/* Delete buffer free list. 
*/ /* Removes the free list at free_list_index. Its free buffers are first merged into a different list registered for the same size when one exists; the entry is then torn down, poisoned with 0xab and returned to the pool. With DPDK the matching pool entry of every other vlib_main is poisoned and returned as well. */ +void +vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; + u32 merge_index; +#if DPDK > 0 + int i; + + ASSERT (os_get_cpu_number () == 0); + + f = vlib_buffer_get_free_list (vm, free_list_index); + + merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); + if (merge_index != ~0 && merge_index != free_list_index) + { + merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool, + merge_index), f); + } + + del_free_list (vm, f); + + /* Poison it. */ + memset (f, 0xab, sizeof (f[0])); + + pool_put (bm->buffer_free_list_pool, f); + + for (i = 1; i < vec_len (vlib_mains); i++) + { + bm = vlib_mains[i]->buffer_main; + f = vlib_buffer_get_free_list (vlib_mains[i], free_list_index);; + memset (f, 0xab, sizeof (f[0])); + pool_put (bm->buffer_free_list_pool, f); + } +#else + + f = vlib_buffer_get_free_list (vm, free_list_index); + + ASSERT (vec_len (f->unaligned_buffers) + vec_len (f->aligned_buffers) == + f->n_alloc); + merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); + if (merge_index != ~0 && merge_index != free_list_index) + { + merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool, + merge_index), f); + } + + del_free_list (vm, f); + + /* Poison it. */ + memset (f, 0xab, sizeof (f[0])); + + pool_put (bm->buffer_free_list_pool, f); +#endif +} + +/* Make sure free list has at least given number of free buffers. Returns min_free_buffers when enough aligned buffers are already free, otherwise the number of buffers newly added by this call (0 when the DPDK path has no mempool yet or the bulk get fails). */ +static uword +fill_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * fl, uword min_free_buffers) +{ +#if DPDK > 0 + vlib_buffer_t *b; + int n, i; + u32 bi; + u32 n_remaining = 0, n_alloc = 0; + unsigned socket_id = rte_socket_id ? rte_socket_id () : 0; + struct rte_mempool *rmp = vm->buffer_main->pktmbuf_pools[socket_id]; + struct rte_mbuf *mb; + + /* Too early? */ + if (PREDICT_FALSE (rmp == 0)) + return 0; + + trim_aligned (fl); + + /* Already have enough free buffers on free list? 
*/ + n = min_free_buffers - vec_len (fl->aligned_buffers); + if (n <= 0) + return min_free_buffers; + + /* Always allocate round number of buffers. */ + n = round_pow2 (n, BUFFERS_PER_COPY); + + /* Always allocate new buffers in reasonably large sized chunks. */ + n = clib_max (n, fl->min_n_buffers_each_physmem_alloc); + + vec_validate (vm->mbuf_alloc_list, n - 1); + + if (rte_mempool_get_bulk (rmp, vm->mbuf_alloc_list, n) < 0) + return 0; + + _vec_len (vm->mbuf_alloc_list) = n; + + for (i = 0; i < n; i++) + { + mb = vm->mbuf_alloc_list[i]; + + ASSERT (rte_mbuf_refcnt_read (mb) == 0); + rte_mbuf_refcnt_set (mb, 1); + + b = vlib_buffer_from_rte_mbuf (mb); + bi = vlib_get_buffer_index (vm, b); + + vec_add1_aligned (fl->aligned_buffers, bi, sizeof (vlib_copy_unit_t)); + n_alloc++; + n_remaining--; + /* NOTE(review): n_alloc and n_remaining are updated above but never read in this DPDK path. */ + vlib_buffer_init_for_free_list (b, fl); + + if (fl->buffer_init_function) + fl->buffer_init_function (vm, fl, &bi, 1); + } + + fl->n_alloc += n; + + return n; +#else + vlib_buffer_t *buffers, *b; + int n, n_bytes, i; + u32 *bi; + u32 n_remaining, n_alloc, n_this_chunk; + + trim_aligned (fl); + + /* Already have enough free buffers on free list? */ + n = min_free_buffers - vec_len (fl->aligned_buffers); + if (n <= 0) + return min_free_buffers; + + /* Always allocate round number of buffers. */ + n = round_pow2 (n, BUFFERS_PER_COPY); + + /* Always allocate new buffers in reasonably large sized chunks. */ + n = clib_max (n, fl->min_n_buffers_each_physmem_alloc); + + n_remaining = n; + n_alloc = 0; + while (n_remaining > 0) + { + n_this_chunk = clib_min (n_remaining, 16); + + n_bytes = n_this_chunk * (sizeof (b[0]) + fl->n_data_bytes); + + /* drb: removed power-of-2 ASSERT */ + buffers = vm->os_physmem_alloc_aligned (&vm->physmem_main, + n_bytes, + sizeof (vlib_buffer_t)); + if (!buffers) + return n_alloc; + + /* Record chunk as being allocated so we can free it later. 
*/ + vec_add1 (fl->buffer_memory_allocated, buffers); + + fl->n_alloc += n_this_chunk; + n_alloc += n_this_chunk; + n_remaining -= n_this_chunk; + + b = buffers; + vec_add2_aligned (fl->aligned_buffers, bi, n_this_chunk, + sizeof (vlib_copy_unit_t)); + for (i = 0; i < n_this_chunk; i++) + { + bi[i] = vlib_get_buffer_index (vm, b); + + if (CLIB_DEBUG > 0) + vlib_buffer_set_known_state (vm, bi[i], VLIB_BUFFER_KNOWN_FREE); + b = vlib_buffer_next_contiguous (b, fl->n_data_bytes); + } + + memset (buffers, 0, n_bytes); + + /* Initialize all new buffers. */ + b = buffers; + for (i = 0; i < n_this_chunk; i++) + { + vlib_buffer_init_for_free_list (b, fl); + b = vlib_buffer_next_contiguous (b, fl->n_data_bytes); + } + + if (fl->buffer_init_function) + fl->buffer_init_function (vm, fl, bi, n_this_chunk); + } + return n_alloc; +#endif +} +/* Position of pointer x, counted in u32 elements, within a group of BUFFERS_PER_COPY elements; for a u32-aligned x, 0 means x sits on a vlib_copy_unit_t boundary. */ +always_inline uword +copy_alignment (u32 * x) +{ + return (pointer_to_uword (x) / sizeof (x[0])) % BUFFERS_PER_COPY; +} +/* Take up to n_alloc_buffers indices from free_list into alloc_buffers, topping the list up through fill_free_list first. Destination entries before the first and after the last copy-unit boundary are taken one by one from the end of the unaligned vector; the middle is copied in vlib_copy_unit_t chunks from the end of the aligned vector. Returns the number of indices written, or 0 when fill_free_list returns 0. */ +static u32 +alloc_from_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * free_list, + u32 * alloc_buffers, u32 n_alloc_buffers) +{ + u32 *dst, *u_src; + uword u_len, n_left; + uword n_unaligned_start, n_unaligned_end, n_filled; + +#if DPDK == 0 + ASSERT (os_get_cpu_number () == 0); + +#endif + n_left = n_alloc_buffers; + dst = alloc_buffers; + n_unaligned_start = ((BUFFERS_PER_COPY - copy_alignment (dst)) + & (BUFFERS_PER_COPY - 1)); + + n_filled = fill_free_list (vm, free_list, n_alloc_buffers); + if (n_filled == 0) + return 0; + + n_left = n_filled < n_left ? 
n_filled : n_left; + n_alloc_buffers = n_left; + + if (n_unaligned_start >= n_left) + { + n_unaligned_start = n_left; + n_unaligned_end = 0; + } + else + n_unaligned_end = copy_alignment (dst + n_alloc_buffers); + + fill_unaligned (vm, free_list, n_unaligned_start + n_unaligned_end); + + u_len = vec_len (free_list->unaligned_buffers); + u_src = free_list->unaligned_buffers + u_len - 1; + + if (n_unaligned_start) + { + uword n_copy = n_unaligned_start; + if (n_copy > n_left) + n_copy = n_left; + n_left -= n_copy; + + while (n_copy > 0) + { + *dst++ = *u_src--; + n_copy--; + u_len--; + } + + /* Now dst should be aligned. */ + if (n_left > 0) + ASSERT (pointer_to_uword (dst) % sizeof (vlib_copy_unit_t) == 0); + } + + /* Aligned copy. */ + { + vlib_copy_unit_t *d, *s; + uword n_copy; + + if (vec_len (free_list->aligned_buffers) < + ((n_left / BUFFERS_PER_COPY) * BUFFERS_PER_COPY)) + abort (); + + n_copy = n_left / BUFFERS_PER_COPY; + n_left = n_left % BUFFERS_PER_COPY; + + /* Remove buffers from aligned free list. The source pointer below is the new end of the vector, i.e. the first of the entries just removed. */ + _vec_len (free_list->aligned_buffers) -= n_copy * BUFFERS_PER_COPY; + + s = (vlib_copy_unit_t *) vec_end (free_list->aligned_buffers); + d = (vlib_copy_unit_t *) dst; + + /* Fast path loop. */ + while (n_copy >= 4) + { + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + n_copy -= 4; + s += 4; + d += 4; + } + + while (n_copy >= 1) + { + d[0] = s[0]; + n_copy -= 1; + s += 1; + d += 1; + } + + dst = (void *) d; + } + + /* Unaligned copy. */ + ASSERT (n_unaligned_end == n_left); + while (n_left > 0) + { + *dst++ = *u_src--; + n_left--; + u_len--; + } + + if (!free_list->unaligned_buffers) + ASSERT (u_len == 0); + else + _vec_len (free_list->unaligned_buffers) = u_len; + +#if DPDK == 0 + /* Verify that buffers are known free. */ + vlib_buffer_validate_alloc_free (vm, alloc_buffers, + n_alloc_buffers, VLIB_BUFFER_KNOWN_FREE); +#endif + + return n_alloc_buffers; +} + +/* Allocate a given number of buffers into given array. 
+ Returns number actually allocated which will be either zero or + number requested. */ +u32 +vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_main_t *bm = vm->buffer_main; +#if DPDK == 0 + ASSERT (os_get_cpu_number () == 0); +#endif + /* Buffers always come from the default free list. */ + return alloc_from_free_list + (vm, + pool_elt_at_index (bm->buffer_free_list_pool, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX), + buffers, n_buffers); +} +/* Allocate n_buffers indices into buffers from the free list at free_list_index. Returns the value returned by alloc_from_free_list. */ +u32 +vlib_buffer_alloc_from_free_list (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; + f = pool_elt_at_index (bm->buffer_free_list_pool, free_list_index); + return alloc_from_free_list (vm, f, buffers, n_buffers); +} +/* Append buffer_index to the aligned vector of f; when do_init is set the buffer is first passed to vlib_buffer_init_for_free_list. */ +always_inline void +add_buffer_to_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * f, + u32 buffer_index, u8 do_init) +{ + vlib_buffer_t *b; + b = vlib_get_buffer (vm, buffer_index); + if (PREDICT_TRUE (do_init)) + vlib_buffer_init_for_free_list (b, f); + vec_add1_aligned (f->aligned_buffers, buffer_index, + sizeof (vlib_copy_unit_t)); +} +/* Return the free list named by b->free_list_index and store that index in *index. */ +always_inline vlib_buffer_free_list_t * +buffer_get_free_list (vlib_main_t * vm, vlib_buffer_t * b, u32 * index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + u32 i; + + *index = i = b->free_list_index; + return pool_elt_at_index (bm->buffer_free_list_pool, i); +} +/* Install fp as the buffer free callback and return the callback that was installed before. */ +void * +vlib_set_buffer_free_callback (vlib_main_t * vm, void *fp) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + void *rv = bm->buffer_free_callback; + + bm->buffer_free_callback = fp; + return rv; +} + +#if DPDK == 0 +/* Weak, empty default definition. NOTE(review): presumably replaced by a strong definition elsewhere when one is linked in - confirm. */ void vnet_buffer_free_dpdk_mb (vlib_buffer_t * b) __attribute__ ((weak)); +void +vnet_buffer_free_dpdk_mb (vlib_buffer_t * b) +{ +} +/* Free n_buffers buffer indices. The optional buffer_free_callback runs first and its return value replaces n_buffers. DPDK build: a buffer whose free list has a buffers_added_to_freelist_function is put back on that list and the list is queued on bm->announce_list; any other buffer is handed to rte_pktmbuf_free unless VLIB_BUFFER_RECYCLE is set. Non-DPDK build: indices are appended speculatively to the aligned vector of the current free list and the append is undone on the slow path when a buffer belongs to another list; with follow_buffer_next the next_buffer of each freed buffer is collected in next_to_free and handled in a further pass. In both builds every queued list has its buffers_added_to_freelist_function called once at the end. The non-DPDK path keeps state in function-level statics and asserts that it runs on cpu 0. */ +#endif +static_always_inline void +vlib_buffer_free_inline (vlib_main_t * vm, + u32 * buffers, u32 n_buffers, u32 follow_buffer_next) +{ +#if DPDK > 0 + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *fl; + u32 fi; + int i; + u32 (*cb) (vlib_main_t 
* vm, u32 * buffers, u32 n_buffers, + u32 follow_buffer_next); + + cb = bm->buffer_free_callback; + + if (PREDICT_FALSE (cb != 0)) + n_buffers = (*cb) (vm, buffers, n_buffers, follow_buffer_next); + + if (!n_buffers) + return; + + for (i = 0; i < n_buffers; i++) + { + vlib_buffer_t *b; + struct rte_mbuf *mb; + + b = vlib_get_buffer (vm, buffers[i]); + + fl = buffer_get_free_list (vm, b, &fi); + + /* The only current use of this callback: multicast recycle */ + if (PREDICT_FALSE (fl->buffers_added_to_freelist_function != 0)) + { + int j; + + add_buffer_to_free_list + (vm, fl, buffers[i], (b->flags & VLIB_BUFFER_RECYCLE) == 0); + + for (j = 0; j < vec_len (bm->announce_list); j++) + { + if (fl == bm->announce_list[j]) + goto already_announced; + } + vec_add1 (bm->announce_list, fl); + already_announced: + ; + } + else + { + if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_RECYCLE) == 0)) + { + mb = rte_mbuf_from_vlib_buffer (b); + ASSERT (rte_mbuf_refcnt_read (mb) == 1); + rte_pktmbuf_free (mb); + } + } + } + if (vec_len (bm->announce_list)) + { + vlib_buffer_free_list_t *fl; + for (i = 0; i < vec_len (bm->announce_list); i++) + { + fl = bm->announce_list[i]; + fl->buffers_added_to_freelist_function (vm, fl); + } + _vec_len (bm->announce_list) = 0; + } +#else + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *fl; + static u32 *next_to_free[2]; /* smp bad */ + u32 i_next_to_free, *b, *n, *f, fi; + uword n_left; + int i; + static vlib_buffer_free_list_t **announce_list; + vlib_buffer_free_list_t *fl0 = 0, *fl1 = 0; + u32 bi0 = (u32) ~ 0, bi1 = (u32) ~ 0, fi0, fi1 = (u32) ~ 0; + u8 free0, free1 = 0, free_next0, free_next1; + u32 (*cb) (vlib_main_t * vm, u32 * buffers, u32 n_buffers, + u32 follow_buffer_next); + + ASSERT (os_get_cpu_number () == 0); + + cb = bm->buffer_free_callback; + + if (PREDICT_FALSE (cb != 0)) + n_buffers = (*cb) (vm, buffers, n_buffers, follow_buffer_next); + + if (!n_buffers) + return; + + /* Use first buffer to get default free 
list. */ + { + u32 bi0 = buffers[0]; + vlib_buffer_t *b0; + + b0 = vlib_get_buffer (vm, bi0); + fl = buffer_get_free_list (vm, b0, &fi); + if (fl->buffers_added_to_freelist_function) + vec_add1 (announce_list, fl); + } + + vec_validate (next_to_free[0], n_buffers - 1); + vec_validate (next_to_free[1], n_buffers - 1); + + i_next_to_free = 0; + n_left = n_buffers; + b = buffers; + +again: + /* Verify that buffers are known allocated. */ + vlib_buffer_validate_alloc_free (vm, b, + n_left, VLIB_BUFFER_KNOWN_ALLOCATED); + /* Reserve room for every remaining index up front; the vector length is corrected from f once the pass ends or the slow path is taken. */ + vec_add2_aligned (fl->aligned_buffers, f, n_left, + /* align */ sizeof (vlib_copy_unit_t)); + + n = next_to_free[i_next_to_free]; + while (n_left >= 4) + { + vlib_buffer_t *b0, *b1, *binit0, *binit1, dummy_buffers[2]; + + bi0 = b[0]; + bi1 = b[1]; + + f[0] = bi0; + f[1] = bi1; + f += 2; + b += 2; + n_left -= 2; + + /* Prefetch buffers for next iteration. */ + vlib_prefetch_buffer_with_index (vm, b[0], WRITE); + vlib_prefetch_buffer_with_index (vm, b[1], WRITE); + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + free0 = (b0->flags & VLIB_BUFFER_RECYCLE) == 0; + free1 = (b1->flags & VLIB_BUFFER_RECYCLE) == 0; + + /* Must be before init which will over-write buffer flags. */ + if (follow_buffer_next) + { + n[0] = b0->next_buffer; + free_next0 = free0 && (b0->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; + n += free_next0; + + n[0] = b1->next_buffer; + free_next1 = free1 && (b1->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; + n += free_next1; + } + else + free_next0 = free_next1 = 0; + + /* Must be before init which will over-write buffer free list. */ + fi0 = b0->free_list_index; + fi1 = b1->free_list_index; + + if (PREDICT_FALSE (fi0 != fi || fi1 != fi)) + goto slow_path_x2; + /* Buffers flagged VLIB_BUFFER_RECYCLE are left untouched: a scratch buffer is initialised in their place. */ + binit0 = free0 ? b0 : &dummy_buffers[0]; + binit1 = free1 ? b1 : &dummy_buffers[1]; + + vlib_buffer_init_two_for_free_list (binit0, binit1, fl); + continue; + + slow_path_x2: + /* Backup speculation. 
*/ + f -= 2; + n -= free_next0 + free_next1; + + _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + + fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); + fl1 = pool_elt_at_index (bm->buffer_free_list_pool, fi1); + + add_buffer_to_free_list (vm, fl0, bi0, free0); + if (PREDICT_FALSE (fl0->buffers_added_to_freelist_function != 0)) + { + int i; + for (i = 0; i < vec_len (announce_list); i++) + if (fl0 == announce_list[i]) + goto no_fl0; + vec_add1 (announce_list, fl0); + } + no_fl0: + if (PREDICT_FALSE (fl1->buffers_added_to_freelist_function != 0)) + { + int i; + for (i = 0; i < vec_len (announce_list); i++) + if (fl1 == announce_list[i]) + goto no_fl1; + vec_add1 (announce_list, fl1); + } + + no_fl1: + add_buffer_to_free_list (vm, fl1, bi1, free1); + + /* Possibly change current free list. */ + if (fi0 != fi && fi1 != fi) + { + fi = fi1; + fl = pool_elt_at_index (bm->buffer_free_list_pool, fi); + } + + vec_add2_aligned (fl->aligned_buffers, f, n_left, + /* align */ sizeof (vlib_copy_unit_t)); + } + /* Handle the remaining buffers one at a time. */ + while (n_left >= 1) + { + vlib_buffer_t *b0, *binit0, dummy_buffers[1]; + + bi0 = b[0]; + f[0] = bi0; + f += 1; + b += 1; + n_left -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + free0 = (b0->flags & VLIB_BUFFER_RECYCLE) == 0; + + /* Must be before init which will over-write buffer flags. */ + if (follow_buffer_next) + { + n[0] = b0->next_buffer; + free_next0 = free0 && (b0->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; + n += free_next0; + } + else + free_next0 = 0; + + /* Must be before init which will over-write buffer free list. */ + fi0 = b0->free_list_index; + + if (PREDICT_FALSE (fi0 != fi)) + goto slow_path_x1; + + binit0 = free0 ? b0 : &dummy_buffers[0]; + + vlib_buffer_init_for_free_list (binit0, fl); + continue; + + slow_path_x1: + /* Backup speculation. 
*/ + f -= 1; + n -= free_next0; + + _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + + fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); + + add_buffer_to_free_list (vm, fl0, bi0, free0); + if (PREDICT_FALSE (fl0->buffers_added_to_freelist_function != 0)) + { + int i; + for (i = 0; i < vec_len (announce_list); i++) + if (fl0 == announce_list[i]) + goto no_fl00; + vec_add1 (announce_list, fl0); + } + + no_fl00: + fi = fi0; + fl = pool_elt_at_index (bm->buffer_free_list_pool, fi); + + vec_add2_aligned (fl->aligned_buffers, f, n_left, + /* align */ sizeof (vlib_copy_unit_t)); + } + /* Next buffers collected during this pass become the input of the next pass; the two next_to_free vectors are used alternately. */ + if (follow_buffer_next && ((n_left = n - next_to_free[i_next_to_free]) > 0)) + { + b = next_to_free[i_next_to_free]; + i_next_to_free ^= 1; + goto again; + } + + _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + + if (vec_len (announce_list)) + { + vlib_buffer_free_list_t *fl; + for (i = 0; i < vec_len (announce_list); i++) + { + fl = announce_list[i]; + fl->buffers_added_to_freelist_function (vm, fl); + } + _vec_len (announce_list) = 0; + } +#endif +} +/* Free the given buffers with follow_buffer_next set to 1. */ +void +vlib_buffer_free (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ + 1); +} +/* Free the given buffers with follow_buffer_next set to 0. */ +void +vlib_buffer_free_no_next (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ + 0); +} + +#if DPDK == 0 +/* Copy template packet data into buffers as they are allocated. 
*/ +static void +vlib_packet_template_buffer_init (vlib_main_t * vm, + vlib_buffer_free_list_t * fl, + u32 * buffers, u32 n_buffers) +{ + vlib_packet_template_t *t = + uword_to_pointer (fl->buffer_init_function_opaque, + vlib_packet_template_t *); + uword i; + + for (i = 0; i < n_buffers; i++) + { + vlib_buffer_t *b = vlib_get_buffer (vm, buffers[i]); + ASSERT (b->current_length == vec_len (t->packet_data)); + clib_memcpy (vlib_buffer_get_current (b), t->packet_data, + b->current_length); + } +} +#endif + +void +vlib_packet_template_init (vlib_main_t * vm, + vlib_packet_template_t * t, + void *packet_data, + uword n_packet_data_bytes, + uword min_n_buffers_each_physmem_alloc, + char *fmt, ...) +{ +#if DPDK > 0 + va_list va; + __attribute__ ((unused)) u8 *name; + + va_start (va, fmt); + name = va_format (0, fmt, &va); + va_end (va); + + vlib_worker_thread_barrier_sync (vm); + memset (t, 0, sizeof (t[0])); + + vec_add (t->packet_data, packet_data, n_packet_data_bytes); + + vlib_worker_thread_barrier_release (vm); +#else + vlib_buffer_free_list_t *fl; + va_list va; + u8 *name; + + va_start (va, fmt); + name = va_format (0, fmt, &va); + va_end (va); + + memset (t, 0, sizeof (t[0])); + + vec_add (t->packet_data, packet_data, n_packet_data_bytes); + t->min_n_buffers_each_physmem_alloc = min_n_buffers_each_physmem_alloc; + + t->free_list_index = vlib_buffer_create_free_list_helper + (vm, n_packet_data_bytes, + /* is_public */ 1, + /* is_default */ 0, + name); + + ASSERT (t->free_list_index != 0); + fl = vlib_buffer_get_free_list (vm, t->free_list_index); + fl->min_n_buffers_each_physmem_alloc = t->min_n_buffers_each_physmem_alloc; + + fl->buffer_init_function = vlib_packet_template_buffer_init; + fl->buffer_init_function_opaque = pointer_to_uword (t); + + fl->buffer_init_template.current_data = 0; + fl->buffer_init_template.current_length = n_packet_data_bytes; + fl->buffer_init_template.flags = 0; +#endif +} + +void * +vlib_packet_template_get_packet (vlib_main_t * vm, + 
vlib_packet_template_t * t, u32 * bi_result) +{ + u32 bi; + vlib_buffer_t *b; + + if (vlib_buffer_alloc (vm, &bi, 1) != 1) + return 0; + + *bi_result = bi; + + b = vlib_get_buffer (vm, bi); + clib_memcpy (vlib_buffer_get_current (b), + t->packet_data, vec_len (t->packet_data)); + b->current_length = vec_len (t->packet_data); + + return b->data; +} + +#if DPDK == 0 +void +vlib_packet_template_get_packet_helper (vlib_main_t * vm, + vlib_packet_template_t * t) +{ + word n = t->min_n_buffers_each_physmem_alloc; + word l = vec_len (t->packet_data); + word n_alloc; + + ASSERT (l > 0); + ASSERT (vec_len (t->free_buffers) == 0); + + vec_validate (t->free_buffers, n - 1); + n_alloc = vlib_buffer_alloc_from_free_list (vm, t->free_buffers, + n, t->free_list_index); + _vec_len (t->free_buffers) = n_alloc; +} + +#endif +/* Append given data to end of buffer, possibly allocating new buffers. */ +u32 +vlib_buffer_add_data (vlib_main_t * vm, + u32 free_list_index, + u32 buffer_index, void *data, u32 n_data_bytes) +{ + u32 n_buffer_bytes, n_left, n_left_this_buffer, bi; + vlib_buffer_t *b; + void *d; + + bi = buffer_index; + if (bi == 0 + && 1 != vlib_buffer_alloc_from_free_list (vm, &bi, 1, free_list_index)) + goto out_of_buffers; + + d = data; + n_left = n_data_bytes; + n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, free_list_index); + + b = vlib_get_buffer (vm, bi); + b->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID; + + /* Get to the end of the chain before we try to append data... 
*/ + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + b = vlib_get_buffer (vm, b->next_buffer); + + while (1) + { + u32 n; + + ASSERT (n_buffer_bytes >= b->current_length); + n_left_this_buffer = + n_buffer_bytes - (b->current_data + b->current_length); + n = clib_min (n_left_this_buffer, n_left); + clib_memcpy (vlib_buffer_get_current (b) + b->current_length, d, n); + b->current_length += n; + n_left -= n; + if (n_left == 0) + break; + + d += n; + if (1 != + vlib_buffer_alloc_from_free_list (vm, &b->next_buffer, 1, + free_list_index)) + goto out_of_buffers; + + b->flags |= VLIB_BUFFER_NEXT_PRESENT; + + b = vlib_get_buffer (vm, b->next_buffer); + } + + return bi; + +out_of_buffers: + clib_error ("out of buffers"); + return bi; +} + +u16 +vlib_buffer_chain_append_data_with_alloc (vlib_main_t * vm, + u32 free_list_index, + vlib_buffer_t * first, + vlib_buffer_t ** last, + void *data, u16 data_len) +{ + vlib_buffer_t *l = *last; + u32 n_buffer_bytes = + vlib_buffer_free_list_buffer_size (vm, free_list_index); + u16 copied = 0; + ASSERT (n_buffer_bytes >= l->current_length + l->current_data); + while (data_len) + { + u16 max = n_buffer_bytes - l->current_length - l->current_data; + if (max == 0) + { + if (1 != + vlib_buffer_alloc_from_free_list (vm, &l->next_buffer, 1, + free_list_index)) + return copied; + *last = l = vlib_buffer_chain_buffer (vm, first, l, l->next_buffer); + max = n_buffer_bytes - l->current_length - l->current_data; + } + + u16 len = (data_len > max) ? 
max : data_len; + clib_memcpy (vlib_buffer_get_current (l) + l->current_length, + data + copied, len); + vlib_buffer_chain_increase_length (first, l, len); + data_len -= len; + copied += len; + } + return copied; +} + +#if DPDK > 0 +clib_error_t * +vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, + unsigned socket_id) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_physmem_main_t *vpm = &vm->physmem_main; + struct rte_mempool *rmp; + int i; + + if (!rte_pktmbuf_pool_create) + return clib_error_return (0, "not linked with DPDK"); + + vec_validate_aligned (bm->pktmbuf_pools, socket_id, CLIB_CACHE_LINE_BYTES); + + /* pool already exists, nothing to do */ + if (bm->pktmbuf_pools[socket_id]) + return 0; + + u8 *pool_name = format (0, "mbuf_pool_socket%u%c", socket_id, 0); + + rmp = rte_pktmbuf_pool_create ((char *) pool_name, /* pool name */ + num_mbufs, /* number of mbufs */ + 512, /* cache size */ + VLIB_BUFFER_HDR_SIZE, /* priv size */ + VLIB_BUFFER_PRE_DATA_SIZE + VLIB_BUFFER_DATA_SIZE, /* dataroom size */ + socket_id); /* cpu socket */ + + if (rmp) + { + { + uword this_pool_end; + uword this_pool_start; + uword this_pool_size; + uword save_vpm_start, save_vpm_end, save_vpm_size; + struct rte_mempool_memhdr *memhdr; + + this_pool_start = ~0ULL; + this_pool_end = 0LL; + + STAILQ_FOREACH (memhdr, &rmp->mem_list, next) + { + if (((uword) (memhdr->addr + memhdr->len)) > this_pool_end) + this_pool_end = (uword) (memhdr->addr + memhdr->len); + if (((uword) memhdr->addr) < this_pool_start) + this_pool_start = (uword) (memhdr->addr); + } + ASSERT (this_pool_start < ~0ULL && this_pool_end > 0); + this_pool_size = this_pool_end - this_pool_start; + + if (CLIB_DEBUG > 1) + { + clib_warning ("%s: pool start %llx pool end %llx pool size %lld", + pool_name, this_pool_start, this_pool_end, + this_pool_size); + clib_warning + ("before: virtual.start %llx virtual.end %llx virtual.size %lld", + vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); + } + + 
save_vpm_start = vpm->virtual.start; + save_vpm_end = vpm->virtual.end; + save_vpm_size = vpm->virtual.size; + + if ((this_pool_start < vpm->virtual.start) || vpm->virtual.start == 0) + vpm->virtual.start = this_pool_start; + if (this_pool_end > vpm->virtual.end) + vpm->virtual.end = this_pool_end; + + vpm->virtual.size = vpm->virtual.end - vpm->virtual.start; + + if (CLIB_DEBUG > 1) + { + clib_warning + ("after: virtual.start %llx virtual.end %llx virtual.size %lld", + vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); + } + + /* check if fits into buffer index range */ + if ((u64) vpm->virtual.size > + ((u64) 1 << (32 + CLIB_LOG2_CACHE_LINE_BYTES))) + { + clib_warning ("physmem: virtual size out of range!"); + vpm->virtual.start = save_vpm_start; + vpm->virtual.end = save_vpm_end; + vpm->virtual.size = save_vpm_size; + rmp = 0; + } + } + if (rmp) + { + bm->pktmbuf_pools[socket_id] = rmp; + vec_free (pool_name); + return 0; + } + } + + vec_free (pool_name); + + /* no usable pool for this socket, try to use pool from another one */ + for (i = 0; i < vec_len (bm->pktmbuf_pools); i++) + { + if (bm->pktmbuf_pools[i]) + { + clib_warning + ("WARNING: Failed to allocate mempool for CPU socket %u. 
" + "Threads running on socket %u will use socket %u mempool.", + socket_id, socket_id, i); + bm->pktmbuf_pools[socket_id] = bm->pktmbuf_pools[i]; + return 0; + } + } + + return clib_error_return (0, "failed to allocate mempool on socket %u", + socket_id); +} +#endif + +static void +vlib_serialize_tx (serialize_main_header_t * m, serialize_stream_t * s) +{ + vlib_main_t *vm; + vlib_serialize_buffer_main_t *sm; + uword n, n_bytes_to_write; + vlib_buffer_t *last; + + n_bytes_to_write = s->current_buffer_index; + sm = + uword_to_pointer (s->data_function_opaque, + vlib_serialize_buffer_main_t *); + vm = sm->vlib_main; + + ASSERT (sm->tx.max_n_data_bytes_per_chain > 0); + if (serialize_stream_is_end_of_stream (s) + || sm->tx.n_total_data_bytes + n_bytes_to_write > + sm->tx.max_n_data_bytes_per_chain) + { + vlib_process_t *p = vlib_get_current_process (vm); + + last = vlib_get_buffer (vm, sm->last_buffer); + last->current_length = n_bytes_to_write; + + vlib_set_next_frame_buffer (vm, &p->node_runtime, sm->tx.next_index, + sm->first_buffer); + + sm->first_buffer = sm->last_buffer = ~0; + sm->tx.n_total_data_bytes = 0; + } + + else if (n_bytes_to_write == 0 && s->n_buffer_bytes == 0) + { + ASSERT (sm->first_buffer == ~0); + ASSERT (sm->last_buffer == ~0); + n = + vlib_buffer_alloc_from_free_list (vm, &sm->first_buffer, 1, + sm->tx.free_list_index); + if (n != 1) + serialize_error (m, + clib_error_create + ("vlib_buffer_alloc_from_free_list fails")); + sm->last_buffer = sm->first_buffer; + s->n_buffer_bytes = + vlib_buffer_free_list_buffer_size (vm, sm->tx.free_list_index); + } + + if (n_bytes_to_write > 0) + { + vlib_buffer_t *prev = vlib_get_buffer (vm, sm->last_buffer); + n = + vlib_buffer_alloc_from_free_list (vm, &sm->last_buffer, 1, + sm->tx.free_list_index); + if (n != 1) + serialize_error (m, + clib_error_create + ("vlib_buffer_alloc_from_free_list fails")); + sm->tx.n_total_data_bytes += n_bytes_to_write; + prev->current_length = n_bytes_to_write; + 
prev->next_buffer = sm->last_buffer; + prev->flags |= VLIB_BUFFER_NEXT_PRESENT; + } + + if (sm->last_buffer != ~0) + { + last = vlib_get_buffer (vm, sm->last_buffer); + s->buffer = vlib_buffer_get_current (last); + s->current_buffer_index = 0; + ASSERT (last->current_data == s->current_buffer_index); + } +} + +static void +vlib_serialize_rx (serialize_main_header_t * m, serialize_stream_t * s) +{ + vlib_main_t *vm; + vlib_serialize_buffer_main_t *sm; + vlib_buffer_t *last; + + sm = + uword_to_pointer (s->data_function_opaque, + vlib_serialize_buffer_main_t *); + vm = sm->vlib_main; + + if (serialize_stream_is_end_of_stream (s)) + return; + + if (sm->last_buffer != ~0) + { + last = vlib_get_buffer (vm, sm->last_buffer); + + if (last->flags & VLIB_BUFFER_NEXT_PRESENT) + sm->last_buffer = last->next_buffer; + else + { + vlib_buffer_free (vm, &sm->first_buffer, /* count */ 1); + sm->first_buffer = sm->last_buffer = ~0; + } + } + + if (sm->last_buffer == ~0) + { + while (clib_fifo_elts (sm->rx.buffer_fifo) == 0) + { + sm->rx.ready_one_time_event = + vlib_process_create_one_time_event (vm, vlib_current_process (vm), + ~0); + vlib_process_wait_for_one_time_event (vm, /* no event data */ 0, + sm->rx.ready_one_time_event); + } + + clib_fifo_sub1 (sm->rx.buffer_fifo, sm->first_buffer); + sm->last_buffer = sm->first_buffer; + } + + ASSERT (sm->last_buffer != ~0); + + last = vlib_get_buffer (vm, sm->last_buffer); + s->current_buffer_index = 0; + s->buffer = vlib_buffer_get_current (last); + s->n_buffer_bytes = last->current_length; +} + +static void +serialize_open_vlib_helper (serialize_main_t * m, + vlib_main_t * vm, + vlib_serialize_buffer_main_t * sm, uword is_read) +{ + /* Initialize serialize main but save overflow buffer for re-use between calls. 
*/ + { + u8 *save = m->stream.overflow_buffer; + memset (m, 0, sizeof (m[0])); + m->stream.overflow_buffer = save; + if (save) + _vec_len (save) = 0; + } + + sm->first_buffer = sm->last_buffer = ~0; + if (is_read) + clib_fifo_reset (sm->rx.buffer_fifo); + else + sm->tx.n_total_data_bytes = 0; + sm->vlib_main = vm; + m->header.data_function = is_read ? vlib_serialize_rx : vlib_serialize_tx; + m->stream.data_function_opaque = pointer_to_uword (sm); +} + +void +serialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, + vlib_serialize_buffer_main_t * sm) +{ + serialize_open_vlib_helper (m, vm, sm, /* is_read */ 0); +} + +void +unserialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, + vlib_serialize_buffer_main_t * sm) +{ + serialize_open_vlib_helper (m, vm, sm, /* is_read */ 1); +} + +u32 +serialize_close_vlib_buffer (serialize_main_t * m) +{ + vlib_serialize_buffer_main_t *sm + = uword_to_pointer (m->stream.data_function_opaque, + vlib_serialize_buffer_main_t *); + vlib_buffer_t *last; + serialize_stream_t *s = &m->stream; + + last = vlib_get_buffer (sm->vlib_main, sm->last_buffer); + last->current_length = s->current_buffer_index; + + if (vec_len (s->overflow_buffer) > 0) + { + sm->last_buffer + = vlib_buffer_add_data (sm->vlib_main, sm->tx.free_list_index, + sm->last_buffer == ~0 ? 
0 : sm->last_buffer, + s->overflow_buffer, + vec_len (s->overflow_buffer)); + _vec_len (s->overflow_buffer) = 0; + } + + return sm->first_buffer; +} + +void +unserialize_close_vlib_buffer (serialize_main_t * m) +{ + vlib_serialize_buffer_main_t *sm + = uword_to_pointer (m->stream.data_function_opaque, + vlib_serialize_buffer_main_t *); + if (sm->first_buffer != ~0) + vlib_buffer_free_one (sm->vlib_main, sm->first_buffer); + clib_fifo_reset (sm->rx.buffer_fifo); + if (m->stream.overflow_buffer) + _vec_len (m->stream.overflow_buffer) = 0; +} + +static u8 * +format_vlib_buffer_free_list (u8 * s, va_list * va) +{ + vlib_buffer_free_list_t *f = va_arg (*va, vlib_buffer_free_list_t *); +#if DPDK > 0 + u32 threadnum = va_arg (*va, u32); + uword bytes_alloc, bytes_free, n_free, size; + + if (!f) + return format (s, "%=7s%=30s%=12s%=12s%=12s%=12s%=12s%=12s", + "Thread", "Name", "Index", "Size", "Alloc", "Free", + "#Alloc", "#Free"); + + size = sizeof (vlib_buffer_t) + f->n_data_bytes; + n_free = vec_len (f->aligned_buffers) + vec_len (f->unaligned_buffers); + bytes_alloc = size * f->n_alloc; + bytes_free = size * n_free; + + s = format (s, "%7d%30s%12d%12d%=12U%=12U%=12d%=12d", threadnum, +#else + uword bytes_alloc, bytes_free, n_free, size; + + if (!f) + return format (s, "%=30s%=12s%=12s%=12s%=12s%=12s%=12s", + "Name", "Index", "Size", "Alloc", "Free", "#Alloc", + "#Free"); + + size = sizeof (vlib_buffer_t) + f->n_data_bytes; + n_free = vec_len (f->aligned_buffers) + vec_len (f->unaligned_buffers); + bytes_alloc = size * f->n_alloc; + bytes_free = size * n_free; + + s = format (s, "%30s%12d%12d%=12U%=12U%=12d%=12d", +#endif + f->name, f->index, f->n_data_bytes, + format_memory_size, bytes_alloc, + format_memory_size, bytes_free, f->n_alloc, n_free); + + return s; +} + +static clib_error_t * +show_buffers (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ +#if DPDK > 0 + vlib_buffer_main_t *bm; + vlib_buffer_free_list_t *f; + vlib_main_t *curr_vm; 
+ u32 vm_index = 0; + + vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, 0, 0); + + do + { + curr_vm = vec_len (vlib_mains) ? vlib_mains[vm_index] : vm; + bm = curr_vm->buffer_main; + + /* *INDENT-OFF* */ + pool_foreach (f, bm->buffer_free_list_pool, ({ + vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, f, vm_index); + })); + /* *INDENT-ON* */ + + vm_index++; + } + while (vm_index < vec_len (vlib_mains)); + +#else + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; + + vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, 0); + /* *INDENT-OFF* */ + pool_foreach (f, bm->buffer_free_list_pool, ({ + vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, f); + })); +/* *INDENT-ON* */ + +#endif + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_buffers_command, static) = { + .path = "show buffers", + .short_help = "Show packet buffer allocation", + .function = show_buffers, +}; +/* *INDENT-ON* */ + +#if DPDK > 0 +#if CLIB_DEBUG > 0 + +u32 *vlib_buffer_state_validation_lock; +uword *vlib_buffer_state_validation_hash; +void *vlib_buffer_state_heap; + +static clib_error_t * +buffer_state_validation_init (vlib_main_t * vm) +{ + void *oldheap; + + vlib_buffer_state_heap = mheap_alloc (0, 10 << 20); + + oldheap = clib_mem_set_heap (vlib_buffer_state_heap); + + vlib_buffer_state_validation_hash = hash_create (0, sizeof (uword)); + vec_validate_aligned (vlib_buffer_state_validation_lock, 0, + CLIB_CACHE_LINE_BYTES); + clib_mem_set_heap (oldheap); + return 0; +} + +VLIB_INIT_FUNCTION (buffer_state_validation_init); +#endif +#endif + + +/** @endcond */ +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h new file mode 100644 index 00000000000..5f1e62f08c9 --- /dev/null +++ b/src/vlib/buffer.h @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * buffer.h: VLIB buffers + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef included_vlib_buffer_h +#define included_vlib_buffer_h + +#include <vppinfra/types.h> +#include <vppinfra/cache.h> +#include <vppinfra/serialize.h> +#include <vppinfra/vector.h> +#include <vlib/error.h> /* for vlib_error_t */ + +#if DPDK > 0 +#include <rte_config.h> +#define VLIB_BUFFER_DATA_SIZE (2048) +#define VLIB_BUFFER_PRE_DATA_SIZE RTE_PKTMBUF_HEADROOM +#else +#include <vlib/config.h> /* for __PRE_DATA_SIZE */ +#define VLIB_BUFFER_DATA_SIZE (512) +#define VLIB_BUFFER_PRE_DATA_SIZE __PRE_DATA_SIZE +#endif + +#if defined (CLIB_HAVE_VEC128) || defined (__aarch64__) +typedef u8x16 vlib_copy_unit_t; +#else +typedef u64 vlib_copy_unit_t; +#endif + +/** \file + vlib buffer structure definition and a few select + access methods. This structure and the buffer allocation + mechanism should perhaps live in vnet, but it would take a lot + of typing to make it so. +*/ + +/* VLIB buffer representation. */ +typedef struct +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + /* Offset within data[] that we are currently processing. + If negative current header points into predata area. */ + i16 current_data; /**< signed offset in data[], pre_data[] + that we are currently processing. + If negative current header points into predata area. + */ + u16 current_length; /**< Nbytes between current data and + the end of this buffer. + */ + u32 flags; /**< buffer flags: + <br> VLIB_BUFFER_IS_TRACED: trace this buffer. + <br> VLIB_BUFFER_NEXT_PRESENT: this is a multi-chunk buffer. 
+ <br> VLIB_BUFFER_TOTAL_LENGTH_VALID: as it says + <br> VLIB_BUFFER_REPL_FAIL: packet replication failure + <br> VLIB_BUFFER_RECYCLE: as it says + <br> VLIB_BUFFER_FLOW_REPORT: buffer is a flow report, + set to avoid adding it to a flow report + <br> VLIB_BUFFER_FLAG_USER(n): user-defined bit N + */ +#define VLIB_BUFFER_IS_TRACED (1 << 0) +#define VLIB_BUFFER_LOG2_NEXT_PRESENT (1) +#define VLIB_BUFFER_NEXT_PRESENT (1 << VLIB_BUFFER_LOG2_NEXT_PRESENT) +#define VLIB_BUFFER_IS_RECYCLED (1 << 2) +#define VLIB_BUFFER_TOTAL_LENGTH_VALID (1 << 3) +#define VLIB_BUFFER_REPL_FAIL (1 << 4) +#define VLIB_BUFFER_RECYCLE (1 << 5) +#define VLIB_BUFFER_FLOW_REPORT (1 << 6) + + /* User defined buffer flags. */ +#define LOG2_VLIB_BUFFER_FLAG_USER(n) (32 - (n)) +#define VLIB_BUFFER_FLAG_USER(n) (1 << LOG2_VLIB_BUFFER_FLAG_USER(n)) + + u32 free_list_index; /**< Buffer free list that this buffer was + allocated from and will be freed to. + */ + + u32 total_length_not_including_first_buffer; + /**< Only valid for first buffer in chain. Current length plus + total length given here give total number of bytes in buffer chain. + */ + + u32 next_buffer; /**< Next buffer for this linked-list of buffers. + Only valid if VLIB_BUFFER_NEXT_PRESENT flag is set. + */ + + vlib_error_t error; /**< Error code for buffers to be enqueued + to error handler. + */ + u32 current_config_index; /**< Used by feature subgraph arcs to + visit enabled feature nodes + */ + + u8 feature_arc_index; /**< Used to identify feature arcs by intermediate + feature node + */ + + u8 dont_waste_me[3]; /**< Available space in the (precious) + first 32 octets of buffer metadata + Before allocating any of it, discussion required! + */ + + u32 opaque[8]; /**< Opaque data used by sub-graphs for their own purposes. + See .../vnet/vnet/buffer.h + */ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline1); + + u32 trace_index; /**< Specifies index into trace buffer + if VLIB_PACKET_IS_TRACED flag is set. 
+ */ + u32 recycle_count; /**< Used by L2 path recycle code */ + u32 opaque2[14]; /**< More opaque data, currently unused */ + + /***** end of second cache line */ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline2); + u8 pre_data[VLIB_BUFFER_PRE_DATA_SIZE]; /**< Space for inserting data + before buffer start. + Packet rewrite string will be + rewritten backwards and may extend + back before buffer->data[0]. + Must come directly before packet data. + */ + + u8 data[0]; /**< Packet data. Hardware DMA here */ +} vlib_buffer_t; /* Must be a multiple of 64B. */ + +#define VLIB_BUFFER_HDR_SIZE (sizeof(vlib_buffer_t) - VLIB_BUFFER_PRE_DATA_SIZE) + +/** \brief Prefetch buffer metadata. + The first 64 bytes of buffer contains most header information + + @param b - (vlib_buffer_t *) pointer to the buffer + @param type - LOAD, STORE. In most cases, STORE is the right answer +*/ + +#define vlib_prefetch_buffer_header(b,type) CLIB_PREFETCH (b, 64, type) + +always_inline vlib_buffer_t * +vlib_buffer_next_contiguous (vlib_buffer_t * b, u32 buffer_bytes) +{ + return (void *) (b + 1) + buffer_bytes; +} + +always_inline void +vlib_buffer_struct_is_sane (vlib_buffer_t * b) +{ + ASSERT (sizeof (b[0]) % 64 == 0); + + /* Rewrite data must be before and contiguous with packet data. */ + ASSERT (b->pre_data + VLIB_BUFFER_PRE_DATA_SIZE == b->data); +} + +/** \brief Get pointer to current data to process + + @param b - (vlib_buffer_t *) pointer to the buffer + @return - (void *) (b->data + b->current_data) +*/ + +always_inline void * +vlib_buffer_get_current (vlib_buffer_t * b) +{ + /* Check bounds. */ + ASSERT ((signed) b->current_data >= (signed) -VLIB_BUFFER_PRE_DATA_SIZE); + return b->data + b->current_data; +} + +/** \brief Advance current data pointer by the supplied (signed!) 
amount + + @param b - (vlib_buffer_t *) pointer to the buffer + @param l - (word) signed increment +*/ +always_inline void +vlib_buffer_advance (vlib_buffer_t * b, word l) +{ + ASSERT (b->current_length >= l); + b->current_data += l; + b->current_length -= l; +} + +/** \brief Reset current header & length to state they were in when + packet was received. + + @param b - (vlib_buffer_t *) pointer to the buffer +*/ + +always_inline void +vlib_buffer_reset (vlib_buffer_t * b) +{ + b->current_length += clib_max (b->current_data, 0); + b->current_data = 0; +} + +/** \brief Get pointer to buffer's opaque data array + + @param b - (vlib_buffer_t *) pointer to the buffer + @return - (void *) b->opaque +*/ +always_inline void * +vlib_get_buffer_opaque (vlib_buffer_t * b) +{ + return (void *) b->opaque; +} + +/** \brief Get pointer to buffer's opaque2 data array + + @param b - (vlib_buffer_t *) pointer to the buffer + @return - (void *) b->opaque2 +*/ +always_inline void * +vlib_get_buffer_opaque2 (vlib_buffer_t * b) +{ + return (void *) b->opaque2; +} + +/* Forward declaration. */ +struct vlib_main_t; + +typedef struct vlib_buffer_free_list_t +{ + /* Template buffer used to initialize first 16 bytes of buffers + allocated on this free list. */ + vlib_buffer_t buffer_init_template; + + /* Our index into vlib_main_t's buffer_free_list_pool. */ + u32 index; + + /* Number of data bytes for buffers in this free list. */ + u32 n_data_bytes; + + /* Number of buffers to allocate when we need to allocate new buffers + from physmem heap. */ + u32 min_n_buffers_each_physmem_alloc; + + /* Total number of buffers allocated from this free list. */ + u32 n_alloc; + + /* Vector of free buffers. Each element is a byte offset into I/O heap. + Aligned vectors always has naturally aligned vlib_copy_unit_t sized chunks + of buffer indices. Unaligned vector has any left over. This is meant to + speed up copy routines. 
*/ + u32 *aligned_buffers, *unaligned_buffers; + + /* Memory chunks allocated for this free list + recorded here so they can be freed when free list + is deleted. */ + void **buffer_memory_allocated; + + /* Free list name. */ + u8 *name; + + /* Callback functions to initialize newly allocated buffers. + If null buffers are zeroed. */ + void (*buffer_init_function) (struct vlib_main_t * vm, + struct vlib_buffer_free_list_t * fl, + u32 * buffers, u32 n_buffers); + + /* Callback function to announce that buffers have been + added to the freelist */ + void (*buffers_added_to_freelist_function) + (struct vlib_main_t * vm, struct vlib_buffer_free_list_t * fl); + + uword buffer_init_function_opaque; +} __attribute__ ((aligned (16))) vlib_buffer_free_list_t; + +typedef struct +{ + /* Buffer free callback, for subversive activities */ + u32 (*buffer_free_callback) (struct vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, u32 follow_buffer_next); + /* Pool of buffer free lists. + Multiple free lists exist for packet generator which uses + separate free lists for each packet stream --- so as to avoid + initializing static data for each packet generated. */ + vlib_buffer_free_list_t *buffer_free_list_pool; +#define VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX (0) +#define VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES VLIB_BUFFER_DATA_SIZE + + /* Hash table mapping buffer size (rounded to next unit of + sizeof (vlib_buffer_t)) to free list index. */ + uword *free_list_by_size; + + /* Hash table mapping buffer index into number + 0 => allocated but free, 1 => allocated and not-free. + If buffer index is not in hash table then this buffer + has never been allocated. 
*/ + uword *buffer_known_hash; + + /* List of free-lists needing Blue Light Special announcements */ + vlib_buffer_free_list_t **announce_list; + + /* Vector of rte_mempools per socket */ +#if DPDK == 1 + struct rte_mempool **pktmbuf_pools; +#endif +} vlib_buffer_main_t; + +typedef struct +{ + struct vlib_main_t *vlib_main; + + u32 first_buffer, last_buffer; + + union + { + struct + { + /* Total accumulated bytes in chain starting with first_buffer. */ + u32 n_total_data_bytes; + + /* Max number of bytes to accumulate in chain starting with first_buffer. + As this limit is reached buffers are enqueued to next node. */ + u32 max_n_data_bytes_per_chain; + + /* Next node to enqueue buffers to relative to current process node. */ + u32 next_index; + + /* Free list to use to allocate new buffers. */ + u32 free_list_index; + } tx; + + struct + { + /* CLIB fifo of buffer indices waiting to be unserialized. */ + u32 *buffer_fifo; + + /* Event type used to signal that RX buffers have been added to fifo. 
*/ + uword ready_one_time_event; + } rx; + }; +} vlib_serialize_buffer_main_t; + +void serialize_open_vlib_buffer (serialize_main_t * m, struct vlib_main_t *vm, + vlib_serialize_buffer_main_t * sm); +void unserialize_open_vlib_buffer (serialize_main_t * m, + struct vlib_main_t *vm, + vlib_serialize_buffer_main_t * sm); + +u32 serialize_close_vlib_buffer (serialize_main_t * m); +void unserialize_close_vlib_buffer (serialize_main_t * m); +void *vlib_set_buffer_free_callback (struct vlib_main_t *vm, void *fp); + +always_inline u32 +serialize_vlib_buffer_n_bytes (serialize_main_t * m) +{ + serialize_stream_t *s = &m->stream; + vlib_serialize_buffer_main_t *sm + = uword_to_pointer (m->stream.data_function_opaque, + vlib_serialize_buffer_main_t *); + return sm->tx.n_total_data_bytes + s->current_buffer_index + + vec_len (s->overflow_buffer); +} + +#if DPDK > 0 +#define rte_mbuf_from_vlib_buffer(x) (((struct rte_mbuf *)x) - 1) +#define vlib_buffer_from_rte_mbuf(x) ((vlib_buffer_t *)(x+1)) +#endif + +/* + */ + +/** \brief Compile time buffer trajectory tracing option + Turn this on if you run into "bad monkey" contexts, + and you want to know exactly which nodes they've visited... + See vlib/main.c... +*/ +#define VLIB_BUFFER_TRACE_TRAJECTORY 0 + +#if VLIB_BUFFER_TRACE_TRAJECTORY > 0 +#define VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b) (b)->pre_data[0]=0 +#else +#define VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b) +#endif /* VLIB_BUFFER_TRACE_TRAJECTORY */ + +#endif /* included_vlib_buffer_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h new file mode 100644 index 00000000000..75716eca7f6 --- /dev/null +++ b/src/vlib/buffer_funcs.h @@ -0,0 +1,755 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * buffer_funcs.h: VLIB buffer related functions/inlines + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_buffer_funcs_h +#define included_vlib_buffer_funcs_h + +#include <vppinfra/hash.h> + +/** \file + vlib buffer access methods. 
+*/ + + +/** \brief Translate buffer index into buffer pointer + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffer_index - (u32) buffer index + @return - (vlib_buffer_t *) buffer pointer +*/ +always_inline vlib_buffer_t * +vlib_get_buffer (vlib_main_t * vm, u32 buffer_index) +{ + return vlib_physmem_at_offset (&vm->physmem_main, ((uword) buffer_index) + << CLIB_LOG2_CACHE_LINE_BYTES); +} + +/** \brief Translate buffer pointer into buffer index + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param p - (void *) buffer pointer + @return - (u32) buffer index +*/ +always_inline u32 +vlib_get_buffer_index (vlib_main_t * vm, void *p) +{ + uword offset = vlib_physmem_offset_of (&vm->physmem_main, p); + ASSERT ((offset % (1 << CLIB_LOG2_CACHE_LINE_BYTES)) == 0); + return offset >> CLIB_LOG2_CACHE_LINE_BYTES; +} + +/** \brief Get next buffer in buffer linklist, or zero for end of list. + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param b - (void *) buffer pointer + @return - (vlib_buffer_t *) next buffer, or NULL +*/ +always_inline vlib_buffer_t * +vlib_get_next_buffer (vlib_main_t * vm, vlib_buffer_t * b) +{ + return (b->flags & VLIB_BUFFER_NEXT_PRESENT + ? 
vlib_get_buffer (vm, b->next_buffer) : 0); +} + +uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, + vlib_buffer_t * b_first); + +/** \brief Get length in bytes of the buffer chain + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param b - (void *) buffer pointer + @return - (uword) length of buffer chain +*/ +always_inline uword +vlib_buffer_length_in_chain (vlib_main_t * vm, vlib_buffer_t * b) +{ + uword l = b->current_length + b->total_length_not_including_first_buffer; + if (PREDICT_FALSE ((b->flags & (VLIB_BUFFER_NEXT_PRESENT + | VLIB_BUFFER_TOTAL_LENGTH_VALID)) + == VLIB_BUFFER_NEXT_PRESENT)) + return vlib_buffer_length_in_chain_slow_path (vm, b); + return l; +} + +/** \brief Get length in bytes of the buffer index buffer chain + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param bi - (u32) buffer index + @return - (uword) length of buffer chain +*/ +always_inline uword +vlib_buffer_index_length_in_chain (vlib_main_t * vm, u32 bi) +{ + vlib_buffer_t *b = vlib_get_buffer (vm, bi); + return vlib_buffer_length_in_chain (vm, b); +} + +/** \brief Copy buffer contents to memory + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffer_index - (u32) buffer index + @param contents - (u8 *) memory, <strong>must be large enough</strong> + @return - (uword) length of buffer chain +*/ +always_inline uword +vlib_buffer_contents (vlib_main_t * vm, u32 buffer_index, u8 * contents) +{ + uword content_len = 0; + uword l; + vlib_buffer_t *b; + + while (1) + { + b = vlib_get_buffer (vm, buffer_index); + l = b->current_length; + clib_memcpy (contents + content_len, b->data + b->current_data, l); + content_len += l; + if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) + break; + buffer_index = b->next_buffer; + } + + return content_len; +} + +/* Return physical address of buffer->data start. 
*/ +always_inline u64 +vlib_get_buffer_data_physical_address (vlib_main_t * vm, u32 buffer_index) +{ + return vlib_physmem_offset_to_physical (&vm->physmem_main, + (((uword) buffer_index) << + CLIB_LOG2_CACHE_LINE_BYTES) + + STRUCT_OFFSET_OF (vlib_buffer_t, + data)); +} + +/** \brief Prefetch buffer metadata by buffer index + The first 64 bytes of buffer contains most header information + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param bi - (u32) buffer index + @param type - LOAD, STORE. In most cases, STORE is the right answer +*/ +/* Prefetch buffer header given index. */ +#define vlib_prefetch_buffer_with_index(vm,bi,type) \ + do { \ + vlib_buffer_t * _b = vlib_get_buffer (vm, bi); \ + vlib_prefetch_buffer_header (_b, type); \ + } while (0) + +#if 0 +/* Iterate over known allocated vlib bufs. You probably do not want + * to do this! + @param vm the vlib_main_t + @param bi found allocated buffer index + @param body operation to perform on buffer index + function executes body for each allocated buffer index + */ +#define vlib_buffer_foreach_allocated(vm,bi,body) \ +do { \ + vlib_main_t * _vmain = (vm); \ + vlib_buffer_main_t * _bmain = &_vmain->buffer_main; \ + hash_pair_t * _vbpair; \ + hash_foreach_pair(_vbpair, _bmain->buffer_known_hash, ({ \ + if (VLIB_BUFFER_KNOWN_ALLOCATED == _vbpair->value[0]) { \ + (bi) = _vbpair->key; \ + body; \ + } \ + })); \ +} while (0) +#endif + +#if DPDK == 0 + +typedef enum +{ + /* Index is unknown. */ + VLIB_BUFFER_UNKNOWN, + + /* Index is known and free/allocated. */ + VLIB_BUFFER_KNOWN_FREE, + VLIB_BUFFER_KNOWN_ALLOCATED, +} vlib_buffer_known_state_t; + +always_inline vlib_buffer_known_state_t +vlib_buffer_is_known (vlib_main_t * vm, u32 buffer_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + ASSERT (os_get_cpu_number () == 0); + + uword *p = hash_get (bm->buffer_known_hash, buffer_index); + return p ? 
p[0] : VLIB_BUFFER_UNKNOWN; +} + +always_inline void +vlib_buffer_set_known_state (vlib_main_t * vm, + u32 buffer_index, + vlib_buffer_known_state_t state) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + ASSERT (os_get_cpu_number () == 0); + hash_set (bm->buffer_known_hash, buffer_index, state); +} + +/* Validates sanity of a single buffer. + Returns format'ed vector with error message if any. */ +u8 *vlib_validate_buffer (vlib_main_t * vm, u32 buffer_index, + uword follow_chain); + +#endif /* DPDK == 0 */ + +clib_error_t *vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, + unsigned socket_id); + +/** \brief Allocate buffers into supplied array + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffers - (u32 * ) buffer index array + @param n_buffers - (u32) number of buffers requested + @return - (u32) number of buffers actually allocated, may be + less than the number requested or zero +*/ +u32 vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers); + +always_inline u32 +vlib_buffer_round_size (u32 size) +{ + return round_pow2 (size, sizeof (vlib_buffer_t)); +} + +/** \brief Allocate buffers from specific freelist into supplied array + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffers - (u32 * ) buffer index array + @param n_buffers - (u32) number of buffers requested + @return - (u32) number of buffers actually allocated, may be + less than the number requested or zero +*/ +u32 vlib_buffer_alloc_from_free_list (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, u32 free_list_index); + +/** \brief Free buffers + Frees the entire buffer chain for each buffer + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffers - (u32 * ) buffer index array + @param n_buffers - (u32) number of buffers to free + +*/ +void vlib_buffer_free (vlib_main_t * vm, + /* pointer to first buffer */ + u32 * buffers, + /* number of buffers to free */ + u32 n_buffers); + +/** \brief Free 
buffers, does not free the buffer chain for each buffer + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffers - (u32 * ) buffer index array + @param n_buffers - (u32) number of buffers to free + +*/ +void vlib_buffer_free_no_next (vlib_main_t * vm, + /* pointer to first buffer */ + u32 * buffers, + /* number of buffers to free */ + u32 n_buffers); + +/** \brief Free one buffer + Shorthand to free a single buffer chain. + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffer_index - (u32) buffer index to free +*/ +always_inline void +vlib_buffer_free_one (vlib_main_t * vm, u32 buffer_index) +{ + vlib_buffer_free (vm, &buffer_index, /* n_buffers */ 1); +} + +/* Add/delete buffer free lists. */ +u32 vlib_buffer_create_free_list (vlib_main_t * vm, u32 n_data_bytes, + char *fmt, ...); +void vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index); + +/* Find already existing public free list with given size or create one. */ +u32 vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes, + char *fmt, ...); + +always_inline vlib_buffer_free_list_t * +vlib_buffer_get_free_list (vlib_main_t * vm, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; + + f = pool_elt_at_index (bm->buffer_free_list_pool, free_list_index); + + /* Sanity: indices must match. */ + ASSERT (f->index == free_list_index); + + return f; +} + +always_inline u32 +vlib_buffer_free_list_buffer_size (vlib_main_t * vm, u32 free_list_index) +{ + vlib_buffer_free_list_t *f = + vlib_buffer_get_free_list (vm, free_list_index); + return f->n_data_bytes; +} + +void vlib_aligned_memcpy (void *_dst, void *_src, int n_bytes); + +/* Reasonably fast buffer copy routine. 
*/ +always_inline void +vlib_copy_buffers (u32 * dst, u32 * src, u32 n) +{ + while (n >= 4) + { + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + dst += 4; + src += 4; + n -= 4; + } + while (n > 0) + { + dst[0] = src[0]; + dst += 1; + src += 1; + n -= 1; + } +} + +always_inline void * +vlib_physmem_alloc_aligned (vlib_main_t * vm, clib_error_t ** error, + uword n_bytes, uword alignment) +{ + void *r = + vm->os_physmem_alloc_aligned (&vm->physmem_main, n_bytes, alignment); + if (!r) + *error = + clib_error_return (0, "failed to allocate %wd bytes of I/O memory", + n_bytes); + else + *error = 0; + return r; +} + +/* By default allocate I/O memory with cache line alignment. */ +always_inline void * +vlib_physmem_alloc (vlib_main_t * vm, clib_error_t ** error, uword n_bytes) +{ + return vlib_physmem_alloc_aligned (vm, error, n_bytes, + CLIB_CACHE_LINE_BYTES); +} + +always_inline void +vlib_physmem_free (vlib_main_t * vm, void *mem) +{ + return vm->os_physmem_free (mem); +} + +always_inline u64 +vlib_physmem_virtual_to_physical (vlib_main_t * vm, void *mem) +{ + vlib_physmem_main_t *pm = &vm->physmem_main; + uword o = pointer_to_uword (mem) - pm->virtual.start; + return vlib_physmem_offset_to_physical (pm, o); +} + +/* Append given data to end of buffer, possibly allocating new buffers. 
*/ +u32 vlib_buffer_add_data (vlib_main_t * vm, + u32 free_list_index, + u32 buffer_index, void *data, u32 n_data_bytes); + +/* duplicate all buffers in chain */ +always_inline vlib_buffer_t * +vlib_buffer_copy (vlib_main_t * vm, vlib_buffer_t * b) +{ + vlib_buffer_t *s, *d, *fd; + uword n_alloc, n_buffers = 1; + u32 *new_buffers = 0; + u32 flag_mask = VLIB_BUFFER_NEXT_PRESENT | VLIB_BUFFER_TOTAL_LENGTH_VALID; + int i; + + s = b; + while (s->flags & VLIB_BUFFER_NEXT_PRESENT) + { + n_buffers++; + s = vlib_get_buffer (vm, s->next_buffer); + } + + vec_validate (new_buffers, n_buffers - 1); + n_alloc = vlib_buffer_alloc (vm, new_buffers, n_buffers); + ASSERT (n_alloc == n_buffers); + + /* 1st segment */ + s = b; + fd = d = vlib_get_buffer (vm, new_buffers[0]); + d->current_data = s->current_data; + d->current_length = s->current_length; + d->flags = s->flags & flag_mask; + d->total_length_not_including_first_buffer = + s->total_length_not_including_first_buffer; + clib_memcpy (d->opaque, s->opaque, sizeof (s->opaque)); + clib_memcpy (vlib_buffer_get_current (d), + vlib_buffer_get_current (s), s->current_length); + + /* next segments */ + for (i = 1; i < n_buffers; i++) + { + /* previous */ + d->next_buffer = new_buffers[i]; + /* current */ + s = vlib_get_buffer (vm, s->next_buffer); + d = vlib_get_buffer (vm, new_buffers[i]); + d->current_data = s->current_data; + d->current_length = s->current_length; + clib_memcpy (vlib_buffer_get_current (d), + vlib_buffer_get_current (s), s->current_length); + d->flags = s->flags & flag_mask; + } + + return fd; +} + +/* + * vlib_buffer_chain_* functions provide a way to create long buffers. + * When DPDK is enabled, the 'hidden' DPDK header is taken care of transparently. + */ + +/* Initializes the buffer as an empty packet with no chained buffers. 
*/ +always_inline void +vlib_buffer_chain_init (vlib_buffer_t * first) +{ + first->total_length_not_including_first_buffer = 0; + first->current_length = 0; + first->flags &= ~VLIB_BUFFER_NEXT_PRESENT; + first->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; +} + +/* The provided next_bi buffer index is appended to the end of the packet. */ +always_inline vlib_buffer_t * +vlib_buffer_chain_buffer (vlib_main_t * vm, + vlib_buffer_t * first, + vlib_buffer_t * last, u32 next_bi) +{ + vlib_buffer_t *next_buffer = vlib_get_buffer (vm, next_bi); + last->next_buffer = next_bi; + last->flags |= VLIB_BUFFER_NEXT_PRESENT; + next_buffer->current_length = 0; + next_buffer->flags &= ~VLIB_BUFFER_NEXT_PRESENT; + return next_buffer; +} + +/* Increases or decreases the packet length. + * It does not allocate or deallocate new buffers. + * Therefore, the added length must be compatible + * with the last buffer. */ +always_inline void +vlib_buffer_chain_increase_length (vlib_buffer_t * first, + vlib_buffer_t * last, i32 len) +{ + last->current_length += len; + if (first != last) + first->total_length_not_including_first_buffer += len; +} + +/* Copy data to the end of the packet and increases its length. + * It does not allocate new buffers. + * Returns the number of copied bytes. */ +always_inline u16 +vlib_buffer_chain_append_data (vlib_main_t * vm, + u32 free_list_index, + vlib_buffer_t * first, + vlib_buffer_t * last, void *data, u16 data_len) +{ + u32 n_buffer_bytes = + vlib_buffer_free_list_buffer_size (vm, free_list_index); + ASSERT (n_buffer_bytes >= last->current_length + last->current_data); + u16 len = clib_min (data_len, + n_buffer_bytes - last->current_length - + last->current_data); + clib_memcpy (vlib_buffer_get_current (last) + last->current_length, data, + len); + vlib_buffer_chain_increase_length (first, last, len); + return len; +} + +/* Copy data to the end of the packet and increases its length. + * Allocates additional buffers from the free list if necessary. 
+ * Returns the number of copied bytes. + * 'last' value is modified whenever new buffers are allocated and + * chained and points to the last buffer in the chain. */ +u16 +vlib_buffer_chain_append_data_with_alloc (vlib_main_t * vm, + u32 free_list_index, + vlib_buffer_t * first, + vlib_buffer_t ** last, + void *data, u16 data_len); +void vlib_buffer_chain_validate (vlib_main_t * vm, vlib_buffer_t * first); + +format_function_t format_vlib_buffer, format_vlib_buffer_and_data, + format_vlib_buffer_contents; + +typedef struct +{ + /* Vector of packet data. */ + u8 *packet_data; + + /* Note: the next three fields are unused if DPDK == 1 */ + + /* Number of buffers to allocate in each call to physmem + allocator. */ + u32 min_n_buffers_each_physmem_alloc; + + /* Buffer free list for this template. */ + u32 free_list_index; + + u32 *free_buffers; +} vlib_packet_template_t; + +void vlib_packet_template_get_packet_helper (vlib_main_t * vm, + vlib_packet_template_t * t); + +void vlib_packet_template_init (vlib_main_t * vm, + vlib_packet_template_t * t, + void *packet_data, + uword n_packet_data_bytes, + uword min_n_buffers_each_physmem_alloc, + char *fmt, ...); + +void *vlib_packet_template_get_packet (vlib_main_t * vm, + vlib_packet_template_t * t, + u32 * bi_result); + +always_inline void +vlib_packet_template_free (vlib_main_t * vm, vlib_packet_template_t * t) +{ + vec_free (t->packet_data); +} + +always_inline u32 +unserialize_vlib_buffer_n_bytes (serialize_main_t * m) +{ + serialize_stream_t *s = &m->stream; + vlib_serialize_buffer_main_t *sm + = uword_to_pointer (m->stream.data_function_opaque, + vlib_serialize_buffer_main_t *); + vlib_main_t *vm = sm->vlib_main; + u32 n, *f; + + n = s->n_buffer_bytes - s->current_buffer_index; + if (sm->last_buffer != ~0) + { + vlib_buffer_t *b = vlib_get_buffer (vm, sm->last_buffer); + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + { + b = vlib_get_buffer (vm, b->next_buffer); + n += b->current_length; + } + } + + /* *INDENT-OFF* */ 
+ clib_fifo_foreach (f, sm->rx.buffer_fifo, ({ + n += vlib_buffer_index_length_in_chain (vm, f[0]); + })); +/* *INDENT-ON* */ + + return n; +} + +typedef union +{ + vlib_buffer_t b; + vlib_copy_unit_t i[sizeof (vlib_buffer_t) / sizeof (vlib_copy_unit_t)]; +} +vlib_buffer_union_t; + +/* Set a buffer quickly into "uninitialized" state. We want this to + be extremely cheap and arrange for all fields that need to be + initialized to be in the first 128 bits of the buffer. */ +always_inline void +vlib_buffer_init_for_free_list (vlib_buffer_t * _dst, + vlib_buffer_free_list_t * fl) +{ + vlib_buffer_union_t *dst = (vlib_buffer_union_t *) _dst; + vlib_buffer_union_t *src = + (vlib_buffer_union_t *) & fl->buffer_init_template; + + /* Make sure vlib_buffer_t is cacheline aligned and sized */ + ASSERT (STRUCT_OFFSET_OF (vlib_buffer_t, cacheline0) == 0); + ASSERT (STRUCT_OFFSET_OF (vlib_buffer_t, cacheline1) == + CLIB_CACHE_LINE_BYTES); + ASSERT (STRUCT_OFFSET_OF (vlib_buffer_t, cacheline2) == + CLIB_CACHE_LINE_BYTES * 2); + + /* Make sure buffer template is sane. */ + ASSERT (fl->index == fl->buffer_init_template.free_list_index); + + /* Copy template from src->current_data thru src->free_list_index */ + dst->i[0] = src->i[0]; + if (1 * sizeof (dst->i[0]) < 16) + dst->i[1] = src->i[1]; + if (2 * sizeof (dst->i[0]) < 16) + dst->i[2] = src->i[2]; + + /* Make sure it really worked. */ +#define _(f) ASSERT (dst->b.f == src->b.f) + _(current_data); + _(current_length); + _(flags); + _(free_list_index); +#undef _ + ASSERT (dst->b.total_length_not_including_first_buffer == 0); +} + +always_inline void +vlib_buffer_init_two_for_free_list (vlib_buffer_t * _dst0, + vlib_buffer_t * _dst1, + vlib_buffer_free_list_t * fl) +{ + vlib_buffer_union_t *dst0 = (vlib_buffer_union_t *) _dst0; + vlib_buffer_union_t *dst1 = (vlib_buffer_union_t *) _dst1; + vlib_buffer_union_t *src = + (vlib_buffer_union_t *) & fl->buffer_init_template; + + /* Make sure buffer template is sane. 
*/ + ASSERT (fl->index == fl->buffer_init_template.free_list_index); + + /* Copy template from src->current_data thru src->free_list_index */ + dst0->i[0] = dst1->i[0] = src->i[0]; + if (1 * sizeof (dst0->i[0]) < 16) + dst0->i[1] = dst1->i[1] = src->i[1]; + if (2 * sizeof (dst0->i[0]) < 16) + dst0->i[2] = dst1->i[2] = src->i[2]; + + /* Make sure it really worked. */ +#define _(f) ASSERT (dst0->b.f == src->b.f && dst1->b.f == src->b.f) + _(current_data); + _(current_length); + _(flags); + _(free_list_index); +#undef _ + ASSERT (dst0->b.total_length_not_including_first_buffer == 0); + ASSERT (dst1->b.total_length_not_including_first_buffer == 0); +} + +#if CLIB_DEBUG > 0 +extern u32 *vlib_buffer_state_validation_lock; +extern uword *vlib_buffer_state_validation_hash; +extern void *vlib_buffer_state_heap; +#endif + +static inline void +vlib_validate_buffer_in_use (vlib_buffer_t * b, u32 expected) +{ +#if CLIB_DEBUG > 0 + uword *p; + void *oldheap; + + oldheap = clib_mem_set_heap (vlib_buffer_state_heap); + + while (__sync_lock_test_and_set (vlib_buffer_state_validation_lock, 1)) + ; + + p = hash_get (vlib_buffer_state_validation_hash, b); + + /* If we don't know about b, declare it to be in the expected state */ + if (!p) + { + hash_set (vlib_buffer_state_validation_hash, b, expected); + goto out; + } + + if (p[0] != expected) + { + void cj_stop (void); + u32 bi; + vlib_main_t *vm = &vlib_global_main; + + cj_stop (); + + bi = vlib_get_buffer_index (vm, b); + + clib_mem_set_heap (oldheap); + clib_warning ("%.6f buffer %llx (%d): %s, not %s", + vlib_time_now (vm), bi, + p[0] ? "busy" : "free", expected ? 
"busy" : "free"); + os_panic (); + } +out: + CLIB_MEMORY_BARRIER (); + *vlib_buffer_state_validation_lock = 0; + clib_mem_set_heap (oldheap); +#endif +} + +static inline void +vlib_validate_buffer_set_in_use (vlib_buffer_t * b, u32 expected) +{ +#if CLIB_DEBUG > 0 + void *oldheap; + + oldheap = clib_mem_set_heap (vlib_buffer_state_heap); + + while (__sync_lock_test_and_set (vlib_buffer_state_validation_lock, 1)) + ; + + hash_set (vlib_buffer_state_validation_hash, b, expected); + + CLIB_MEMORY_BARRIER (); + *vlib_buffer_state_validation_lock = 0; + clib_mem_set_heap (oldheap); +#endif +} + +#endif /* included_vlib_buffer_funcs_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/buffer_node.h b/src/vlib/buffer_node.h new file mode 100644 index 00000000000..8a779049625 --- /dev/null +++ b/src/vlib/buffer_node.h @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * buffer_node.h: VLIB buffer handling node helper macros/inlines + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_buffer_node_h +#define included_vlib_buffer_node_h + +/** \file + vlib buffer/node functions +*/ + +/** \brief Finish enqueueing two buffers forward in the graph. + Standard dual loop boilerplate element. This is a MACRO, + with MULTIPLE SIDE EFFECTS. In the ideal case, + <code>next_index == next0 == next1</code>, + which means that the speculative enqueue at the top of the dual loop + has correctly dealt with both packets. In that case, the macro does + nothing at all. 
/** \brief Finish enqueueing two buffers forward in the graph.
 Standard dual loop boilerplate element. This is a MACRO,
 with MULTIPLE SIDE EFFECTS. In the ideal case,
 <code>next_index == next0 == next1</code>,
 which means that the speculative enqueue at the top of the dual loop
 has correctly dealt with both packets. In that case, the macro does
 nothing at all.

 The next0/next1/bi0/bi1 arguments are parenthesized wherever they are
 compared or assigned, so arbitrary expressions may be passed.
 next_index, to_next and n_left_to_next must be modifiable lvalues.

 @param vm vlib_main_t pointer, varies by thread
 @param node current node vlib_node_runtime_t pointer
 @param next_index speculated next index used for both packets
 @param to_next speculated vector pointer used for both packets
 @param n_left_to_next number of slots left in speculated vector
 @param bi0 first buffer index
 @param bi1 second buffer index
 @param next0 actual next index to be used for the first packet
 @param next1 actual next index to be used for the second packet

 @return @c next_index -- speculative next index to be used for future packets
 @return @c to_next -- speculative frame to be used for future packets
 @return @c n_left_to_next -- number of slots left in speculative frame
*/

#define vlib_validate_buffer_enqueue_x2(vm,node,next_index,to_next,n_left_to_next,bi0,bi1,next0,next1) \
do {                                                                    \
  /* bit 0: packet 0 mis-speculated, bit 1: packet 1 mis-speculated */  \
  int enqueue_code = ((next0) != (next_index))                          \
    + 2*((next1) != (next_index));                                      \
                                                                        \
  if (PREDICT_FALSE (enqueue_code != 0))                                \
    {                                                                   \
      switch (enqueue_code)                                             \
        {                                                               \
        case 1:                                                         \
          /* A B A */                                                   \
          to_next[-2] = (bi1);                                          \
          to_next -= 1;                                                 \
          n_left_to_next += 1;                                          \
          vlib_set_next_frame_buffer (vm, node, (next0), (bi0));        \
          break;                                                        \
                                                                        \
        case 2:                                                         \
          /* A A B */                                                   \
          to_next -= 1;                                                 \
          n_left_to_next += 1;                                          \
          vlib_set_next_frame_buffer (vm, node, (next1), (bi1));        \
          break;                                                        \
                                                                        \
        case 3:                                                         \
          /* A B B or A B C */                                          \
          to_next -= 2;                                                 \
          n_left_to_next += 2;                                          \
          vlib_set_next_frame_buffer (vm, node, (next0), (bi0));        \
          vlib_set_next_frame_buffer (vm, node, (next1), (bi1));        \
          if ((next0) == (next1))                                       \
            {                                                           \
              vlib_put_next_frame (vm, node, next_index,                \
                                   n_left_to_next);                     \
              next_index = (next1);                                     \
              vlib_get_next_frame (vm, node, next_index, to_next,       \
                                   n_left_to_next);                     \
            }                                                           \
          break;                                                        \
        }                                                               \
    }                                                                   \
} while (0)
/** \brief Finish enqueueing four buffers forward in the graph.
 Standard quad loop boilerplate element. This is a MACRO,
 with MULTIPLE SIDE EFFECTS. In the ideal case,
 <code>next_index == next0 == next1 == next2 == next3</code>,
 which means that the speculative enqueue at the top of the quad loop
 has correctly dealt with all four packets. In that case, the macro does
 nothing at all.

 The expansion is a single do { } while (0) statement without a trailing
 semicolon: write the semicolon at the call site, which also makes the
 macro usable as the body of an if / else.
 next_index, to_next and n_left_to_next must be modifiable lvalues.

 @param vm vlib_main_t pointer, varies by thread
 @param node current node vlib_node_runtime_t pointer
 @param next_index speculated next index used for all four packets
 @param to_next speculated vector pointer used for all four packets
 @param n_left_to_next number of slots left in speculated vector
 @param bi0 first buffer index
 @param bi1 second buffer index
 @param bi2 third buffer index
 @param bi3 fourth buffer index
 @param next0 actual next index to be used for the first packet
 @param next1 actual next index to be used for the second packet
 @param next2 actual next index to be used for the third packet
 @param next3 actual next index to be used for the fourth packet

 @return @c next_index -- speculative next index to be used for future packets
 @return @c to_next -- speculative frame to be used for future packets
 @return @c n_left_to_next -- number of slots left in speculative frame
*/

#define vlib_validate_buffer_enqueue_x4(vm,node,next_index,to_next,n_left_to_next,bi0,bi1,bi2,bi3,next0,next1,next2,next3) \
do {                                                                    \
  /* After the fact: check the [speculative] enqueue to "next" */       \
  u32 fix_speculation = (next_index) != (next0)                         \
    || (next_index) != (next1)                                          \
    || (next_index) != (next2)                                          \
    || (next_index) != (next3);                                         \
  if (PREDICT_FALSE(fix_speculation))                                   \
    {                                                                   \
      /* rewind... */                                                   \
      to_next -= 4;                                                     \
      n_left_to_next += 4;                                              \
                                                                        \
      /* If bi0 belongs to "next", send it there */                     \
      if ((next_index) == (next0))                                      \
        {                                                               \
          to_next[0] = (bi0);                                           \
          to_next++;                                                    \
          n_left_to_next --;                                            \
        }                                                               \
      else              /* send it where it needs to go */              \
        vlib_set_next_frame_buffer (vm, node, (next0), (bi0));          \
                                                                        \
      if ((next_index) == (next1))                                      \
        {                                                               \
          to_next[0] = (bi1);                                           \
          to_next++;                                                    \
          n_left_to_next --;                                            \
        }                                                               \
      else                                                              \
        vlib_set_next_frame_buffer (vm, node, (next1), (bi1));          \
                                                                        \
      if ((next_index) == (next2))                                      \
        {                                                               \
          to_next[0] = (bi2);                                           \
          to_next++;                                                    \
          n_left_to_next --;                                            \
        }                                                               \
      else                                                              \
        vlib_set_next_frame_buffer (vm, node, (next2), (bi2));          \
                                                                        \
      if ((next_index) == (next3))                                      \
        {                                                               \
          to_next[0] = (bi3);                                           \
          to_next++;                                                    \
          n_left_to_next --;                                            \
        }                                                               \
      else                                                              \
        vlib_set_next_frame_buffer (vm, node, (next3), (bi3));          \
                                                                        \
      /* Change speculation: last 2 packets went to the same node */    \
      if ((next2) == (next3))                                           \
        {                                                               \
          vlib_put_next_frame (vm, node, next_index, n_left_to_next);   \
          next_index = (next3);                                         \
          vlib_get_next_frame (vm, node, next_index, to_next,           \
                               n_left_to_next);                         \
        }                                                               \
    }                                                                   \
} while (0)
/** \brief Finish enqueueing one buffer forward in the graph.
 Standard single loop boilerplate element. This is a MACRO,
 with MULTIPLE SIDE EFFECTS. In the ideal case,
 <code>next_index == next0</code>,
 which means that the speculative enqueue at the top of the single loop
 has correctly dealt with the packet in hand. In that case, the macro does
 nothing at all.

 Otherwise the current frame is put back with the speculated slot
 returned (n_left_to_next + 1), the speculation switches to next0 and
 the buffer is enqueued on the newly fetched frame.
 next_index, to_next and n_left_to_next must be modifiable lvalues.

 @param vm vlib_main_t pointer, varies by thread
 @param node current node vlib_node_runtime_t pointer
 @param next_index speculated next index used for the packet
 @param to_next speculated vector pointer used for the packet
 @param n_left_to_next number of slots left in speculated vector
 @param bi0 first buffer index
 @param next0 actual next index to be used for the first packet

 @return @c next_index -- speculative next index to be used for future packets
 @return @c to_next -- speculative frame to be used for future packets
 @return @c n_left_to_next -- number of slots left in speculative frame
*/
#define vlib_validate_buffer_enqueue_x1(vm,node,next_index,to_next,n_left_to_next,bi0,next0) \
do {                                                                    \
  if (PREDICT_FALSE ((next0) != (next_index)))                          \
    {                                                                   \
      vlib_put_next_frame (vm, node, next_index, n_left_to_next + 1);   \
      next_index = (next0);                                             \
      vlib_get_next_frame (vm, node, next_index, to_next,               \
                           n_left_to_next);                             \
                                                                        \
      to_next[0] = (bi0);                                               \
      to_next += 1;                                                     \
      n_left_to_next -= 1;                                              \
    }                                                                   \
} while (0)
/* Generic dual/single loop node dispatch helper.

   Walks the buffer indices of the incoming frame, speculatively enqueues
   each buffer to the cached next index, calls the supplied callback to
   obtain the real next index and lets the vlib_validate_buffer_enqueue_*
   macros repair any mis-speculation.

   vm, node, frame - usual node function arguments
   sizeof_trace    - passed to vlib_trace_frame_buffers_only when the node
                     has VLIB_NODE_FLAG_TRACE set
   opaque1/opaque2 - handed unchanged to both callbacks
   two_buffers     - callback processing two buffers, stores both next
                     indices through next0/next1
   one_buffer      - callback processing one buffer, stores its next index
                     through next0

   Returns frame->n_vectors. */
always_inline uword
generic_buffer_node_inline (vlib_main_t * vm,
                            vlib_node_runtime_t * node,
                            vlib_frame_t * frame,
                            uword sizeof_trace,
                            void *opaque1,
                            uword opaque2,
                            void (*two_buffers) (vlib_main_t * vm,
                                                 void *opaque1,
                                                 uword opaque2,
                                                 vlib_buffer_t * b0,
                                                 vlib_buffer_t * b1,
                                                 u32 * next0, u32 * next1),
                            void (*one_buffer) (vlib_main_t * vm,
                                                void *opaque1, uword opaque2,
                                                vlib_buffer_t * b0,
                                                u32 * next0))
{
  u32 n_left_from, *from, *to_next;
  u32 next_index;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors;
  /* Start by speculating on the node's cached next index. */
  next_index = node->cached_next_index;

  if (node->flags & VLIB_NODE_FLAG_TRACE)
    vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
                                   /* stride */ 1, sizeof_trace);

  while (n_left_from > 0)
    {
      u32 n_left_to_next;

      vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

      /* Dual loop: needs four input buffers because from[2] and from[3]
         are prefetched while from[0] and from[1] are processed. */
      while (n_left_from >= 4 && n_left_to_next >= 2)
        {
          vlib_buffer_t *p0, *p1;
          u32 pi0, next0;
          u32 pi1, next1;

          /* Prefetch next iteration. */
          {
            vlib_buffer_t *p2, *p3;

            p2 = vlib_get_buffer (vm, from[2]);
            p3 = vlib_get_buffer (vm, from[3]);

            vlib_prefetch_buffer_header (p2, LOAD);
            vlib_prefetch_buffer_header (p3, LOAD);

            CLIB_PREFETCH (p2->data, 64, LOAD);
            CLIB_PREFETCH (p3->data, 64, LOAD);
          }

          /* Speculative enqueue of both buffers to next_index. */
          pi0 = to_next[0] = from[0];
          pi1 = to_next[1] = from[1];
          from += 2;
          to_next += 2;
          n_left_from -= 2;
          n_left_to_next -= 2;

          p0 = vlib_get_buffer (vm, pi0);
          p1 = vlib_get_buffer (vm, pi1);

          two_buffers (vm, opaque1, opaque2, p0, p1, &next0, &next1);

          /* Fix up the enqueue if either real next index differs. */
          vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
                                           to_next, n_left_to_next,
                                           pi0, pi1, next0, next1);
        }

      /* Single loop: handles the remaining buffers one at a time. */
      while (n_left_from > 0 && n_left_to_next > 0)
        {
          vlib_buffer_t *p0;
          u32 pi0, next0;

          /* Speculative enqueue to next_index. */
          pi0 = from[0];
          to_next[0] = pi0;
          from += 1;
          to_next += 1;
          n_left_from -= 1;
          n_left_to_next -= 1;

          p0 = vlib_get_buffer (vm, pi0);

          one_buffer (vm, opaque1, opaque2, p0, &next0);

          vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                           to_next, n_left_to_next,
                                           pi0, next0);
        }

      /* Hand the frame back with the number of unused slots. */
      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
    }

  return frame->n_vectors;
}
/* Root of all show commands.  Only the path and the short help text are
   set here. */
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (vlib_cli_show_command, static) = {
  .path = "show",
  .short_help = "Show commands",
};
/* *INDENT-ON* */

/* Root of all clear commands.  Only the path and the short help text are
   set here. */
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (vlib_cli_clear_command, static) = {
  .path = "clear",
  .short_help = "Clear commands",
};
/* *INDENT-ON* */

/* Root of all set commands.  Only the path and the short help text are
   set here. */
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (vlib_cli_set_command, static) = {
  .path = "set",
  .short_help = "Set commands",
};
/* *INDENT-ON* */

/* Root of all test commands.  Only the path and the short help text are
   set here. */
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (vlib_cli_test_command, static) = {
  .path = "test",
  .short_help = "Test commands",
};
/* *INDENT-ON* */
*/ + if (i < vec_len (c->sub_command_positions) + && clib_bitmap_count_set_bits (match) > 1) + { + p = vec_elt_at_index (c->sub_command_positions, i); + for (n = 0; n < vec_len (p->bitmaps); n++) + match = clib_bitmap_andnot (match, p->bitmaps[n]); + } + goto done; + + default: + unformat_put_input (input); + goto done; + } + + if (i >= vec_len (c->sub_command_positions)) + { + no_match: + clib_bitmap_free (match); + return 0; + } + + p = vec_elt_at_index (c->sub_command_positions, i); + if (vec_len (p->bitmaps) == 0) + goto no_match; + + n = k - p->min_char; + if (n < 0 || n >= vec_len (p->bitmaps)) + goto no_match; + + if (i == 0) + match = clib_bitmap_dup (p->bitmaps[n]); + else + match = clib_bitmap_and (match, p->bitmaps[n]); + + if (clib_bitmap_is_zero (match)) + goto no_match; + } + +done: + return match; +} + +/* Looks for string based sub-input formatted { SUB-INPUT }. */ +uword +unformat_vlib_cli_sub_input (unformat_input_t * i, va_list * args) +{ + unformat_input_t *sub_input = va_arg (*args, unformat_input_t *); + u8 *s; + uword c; + + while (1) + { + c = unformat_get_input (i); + switch (c) + { + case ' ': + case '\t': + case '\n': + case '\r': + case '\f': + break; + + case '{': + default: + /* Put back paren. 
*/ + if (c != UNFORMAT_END_OF_INPUT) + unformat_put_input (i); + + if (c == '{' && unformat (i, "%v", &s)) + { + unformat_init_vector (sub_input, s); + return 1; + } + return 0; + } + } + return 0; +} + +static vlib_cli_command_t * +get_sub_command (vlib_cli_main_t * cm, vlib_cli_command_t * parent, u32 si) +{ + vlib_cli_sub_command_t *s = vec_elt_at_index (parent->sub_commands, si); + return vec_elt_at_index (cm->commands, s->index); +} + +static uword +unformat_vlib_cli_sub_command (unformat_input_t * i, va_list * args) +{ + vlib_main_t *vm = va_arg (*args, vlib_main_t *); + vlib_cli_command_t *c = va_arg (*args, vlib_cli_command_t *); + vlib_cli_command_t **result = va_arg (*args, vlib_cli_command_t **); + vlib_cli_main_t *cm = &vm->cli_main; + uword *match_bitmap, is_unique, index; + + { + vlib_cli_sub_rule_t *sr; + vlib_cli_parse_rule_t *r; + vec_foreach (sr, c->sub_rules) + { + void **d; + r = vec_elt_at_index (cm->parse_rules, sr->rule_index); + vec_add2 (cm->parse_rule_data, d, 1); + vec_reset_length (d[0]); + if (r->data_size) + d[0] = _vec_resize (d[0], + /* length increment */ 1, + r->data_size, + /* header_bytes */ 0, + /* data align */ sizeof (uword)); + if (unformat_user (i, r->unformat_function, vm, d[0])) + { + *result = vec_elt_at_index (cm->commands, sr->command_index); + return 1; + } + } + } + + match_bitmap = vlib_cli_sub_command_match (c, i); + is_unique = clib_bitmap_count_set_bits (match_bitmap) == 1; + index = ~0; + if (is_unique) + { + index = clib_bitmap_first_set (match_bitmap); + *result = get_sub_command (cm, c, index); + } + clib_bitmap_free (match_bitmap); + + return is_unique; +} + +static u8 * +format_vlib_cli_command_help (u8 * s, va_list * args) +{ + vlib_cli_command_t *c = va_arg (*args, vlib_cli_command_t *); + int is_long = va_arg (*args, int); + if (is_long && c->long_help) + s = format (s, "%s", c->long_help); + else if (c->short_help) + s = format (s, "%s", c->short_help); + else + s = format (s, "%v commands", c->path); + 
return s; +} + +static u8 * +format_vlib_cli_parse_rule_name (u8 * s, va_list * args) +{ + vlib_cli_parse_rule_t *r = va_arg (*args, vlib_cli_parse_rule_t *); + return format (s, "<%U>", format_c_identifier, r->name); +} + +static u8 * +format_vlib_cli_path (u8 * s, va_list * args) +{ + u8 *path = va_arg (*args, u8 *); + int i, in_rule; + in_rule = 0; + for (i = 0; i < vec_len (path); i++) + { + switch (path[i]) + { + case '%': + in_rule = 1; + vec_add1 (s, '<'); /* start of <RULE> */ + break; + + case '_': + /* _ -> space in rules. */ + vec_add1 (s, in_rule ? ' ' : '_'); + break; + + case ' ': + if (in_rule) + { + vec_add1 (s, '>'); /* end of <RULE> */ + in_rule = 0; + } + vec_add1 (s, ' '); + break; + + default: + vec_add1 (s, path[i]); + break; + } + } + + if (in_rule) + vec_add1 (s, '>'); /* terminate <RULE> */ + + return s; +} + +static vlib_cli_command_t * +all_subs (vlib_cli_main_t * cm, vlib_cli_command_t * subs, u32 command_index) +{ + vlib_cli_command_t *c = vec_elt_at_index (cm->commands, command_index); + vlib_cli_sub_command_t *sc; + vlib_cli_sub_rule_t *sr; + + if (c->function) + vec_add1 (subs, c[0]); + + vec_foreach (sr, c->sub_rules) + subs = all_subs (cm, subs, sr->command_index); + vec_foreach (sc, c->sub_commands) subs = all_subs (cm, subs, sc->index); + + return subs; +} + +static int +vlib_cli_cmp_rule (void *a1, void *a2) +{ + vlib_cli_sub_rule_t *r1 = a1; + vlib_cli_sub_rule_t *r2 = a2; + + return vec_cmp (r1->name, r2->name); +} + +static int +vlib_cli_cmp_command (void *a1, void *a2) +{ + vlib_cli_command_t *c1 = a1; + vlib_cli_command_t *c2 = a2; + + return vec_cmp (c1->path, c2->path); +} + +static clib_error_t * +vlib_cli_dispatch_sub_commands (vlib_main_t * vm, + vlib_cli_main_t * cm, + unformat_input_t * input, + uword parent_command_index) +{ + vlib_cli_command_t *parent, *c; + clib_error_t *error = 0; + unformat_input_t sub_input; + u8 *string; + uword is_main_dispatch = cm == &vm->cli_main; + + parent = vec_elt_at_index 
(cm->commands, parent_command_index); + if (is_main_dispatch && unformat (input, "help")) + { + uword help_at_end_of_line, i; + + help_at_end_of_line = + unformat_check_input (input) == UNFORMAT_END_OF_INPUT; + while (1) + { + c = parent; + if (unformat_user + (input, unformat_vlib_cli_sub_command, vm, c, &parent)) + ; + + else if (!(unformat_check_input (input) == UNFORMAT_END_OF_INPUT)) + goto unknown; + + else + break; + } + + /* help SUB-COMMAND => long format help. + "help" at end of line: show all commands. */ + if (!help_at_end_of_line) + vlib_cli_output (vm, "%U", format_vlib_cli_command_help, c, + /* is_long */ 1); + + else if (vec_len (c->sub_commands) + vec_len (c->sub_rules) == 0) + vlib_cli_output (vm, "%v: no sub-commands", c->path); + + else + { + vlib_cli_sub_command_t *sc; + vlib_cli_sub_rule_t *sr, *subs; + + subs = vec_dup (c->sub_rules); + + /* Add in rules if any. */ + vec_foreach (sc, c->sub_commands) + { + vec_add2 (subs, sr, 1); + sr->name = sc->name; + sr->command_index = sc->index; + sr->rule_index = ~0; + } + + vec_sort_with_function (subs, vlib_cli_cmp_rule); + + for (i = 0; i < vec_len (subs); i++) + { + vlib_cli_command_t *d; + vlib_cli_parse_rule_t *r; + + d = vec_elt_at_index (cm->commands, subs[i].command_index); + r = + subs[i].rule_index != ~0 ? 
vec_elt_at_index (cm->parse_rules, + subs + [i].rule_index) : + 0; + + if (r) + vlib_cli_output + (vm, " %-30U %U", + format_vlib_cli_parse_rule_name, r, + format_vlib_cli_command_help, d, /* is_long */ 0); + else + vlib_cli_output + (vm, " %-30v %U", + subs[i].name, + format_vlib_cli_command_help, d, /* is_long */ 0); + } + + vec_free (subs); + } + } + + else if (is_main_dispatch + && (unformat (input, "choices") || unformat (input, "?"))) + { + vlib_cli_command_t *sub, *subs; + + subs = all_subs (cm, 0, parent_command_index); + vec_sort_with_function (subs, vlib_cli_cmp_command); + vec_foreach (sub, subs) + vlib_cli_output (vm, " %-40U %U", + format_vlib_cli_path, sub->path, + format_vlib_cli_command_help, sub, /* is_long */ 0); + vec_free (subs); + } + + else if (unformat (input, "comment %v", &string)) + { + vec_free (string); + } + + else if (unformat (input, "uncomment %U", + unformat_vlib_cli_sub_input, &sub_input)) + { + error = + vlib_cli_dispatch_sub_commands (vm, cm, &sub_input, + parent_command_index); + unformat_free (&sub_input); + } + + else + if (unformat_user (input, unformat_vlib_cli_sub_command, vm, parent, &c)) + { + unformat_input_t *si; + uword has_sub_commands = + vec_len (c->sub_commands) + vec_len (c->sub_rules) > 0; + + si = input; + if (unformat_user (input, unformat_vlib_cli_sub_input, &sub_input)) + si = &sub_input; + + if (has_sub_commands) + error = vlib_cli_dispatch_sub_commands (vm, cm, si, c - cm->commands); + + if (has_sub_commands && !error) + /* Found valid sub-command. */ ; + + else if (c->function) + { + clib_error_t *c_error; + + /* Skip white space for benefit of called function. 
*/ + unformat_skip_white_space (si); + + if (unformat (si, "?")) + { + vlib_cli_output (vm, " %-40U %U", format_vlib_cli_path, c->path, format_vlib_cli_command_help, c, /* is_long */ + 0); + } + else + { + if (!c->is_mp_safe) + vlib_worker_thread_barrier_sync (vm); + + c_error = c->function (vm, si, c); + + if (!c->is_mp_safe) + vlib_worker_thread_barrier_release (vm); + + if (c_error) + { + error = + clib_error_return (0, "%v: %v", c->path, c_error->what); + clib_error_free (c_error); + /* Free sub input. */ + if (si != input) + unformat_free (si); + + return error; + } + } + + /* Free any previous error. */ + clib_error_free (error); + } + + else if (!error) + error = clib_error_return (0, "%v: no sub-commands", c->path); + + /* Free sub input. */ + if (si != input) + unformat_free (si); + } + + else + goto unknown; + + return error; + +unknown: + if (parent->path) + return clib_error_return (0, "%v: unknown input `%U'", parent->path, + format_unformat_error, input); + else + return clib_error_return (0, "unknown input `%U'", format_unformat_error, + input); +} + + +void vlib_unix_error_report (vlib_main_t *, clib_error_t *) + __attribute__ ((weak)); + +void +vlib_unix_error_report (vlib_main_t * vm, clib_error_t * error) +{ +} + +/* Process CLI input. 
*/ +void +vlib_cli_input (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_output_function_t * function, uword function_arg) +{ + vlib_process_t *cp = vlib_get_current_process (vm); + vlib_cli_main_t *cm = &vm->cli_main; + clib_error_t *error; + vlib_cli_output_function_t *save_function; + uword save_function_arg; + + save_function = cp->output_function; + save_function_arg = cp->output_function_arg; + + cp->output_function = function; + cp->output_function_arg = function_arg; + + do + { + vec_reset_length (cm->parse_rule_data); + error = vlib_cli_dispatch_sub_commands (vm, &vm->cli_main, input, /* parent */ + 0); + } + while (!error && !unformat (input, "%U", unformat_eof)); + + if (error) + { + vlib_cli_output (vm, "%v", error->what); + vlib_unix_error_report (vm, error); + clib_error_free (error); + } + + cp->output_function = save_function; + cp->output_function_arg = save_function_arg; +} + +/* Output to current CLI connection. */ +void +vlib_cli_output (vlib_main_t * vm, char *fmt, ...) +{ + vlib_process_t *cp = vlib_get_current_process (vm); + va_list va; + u8 *s; + + va_start (va, fmt); + s = va_format (0, fmt, &va); + va_end (va); + + /* Terminate with \n if not present. 
*/ + if (vec_len (s) > 0 && s[vec_len (s) - 1] != '\n') + vec_add1 (s, '\n'); + + if ((!cp) || (!cp->output_function)) + fformat (stdout, "%v", s); + else + cp->output_function (cp->output_function_arg, s, vec_len (s)); + + vec_free (s); +} + +static clib_error_t * +show_memory_usage (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + int verbose = 0; + clib_error_t *error; + u32 index = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose")) + verbose = 1; + else + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + return error; + } + } + + /* *INDENT-OFF* */ + foreach_vlib_main ( + ({ + vlib_cli_output (vm, "Thread %d %v\n", index, vlib_worker_threads[index].name); + vlib_cli_output (vm, "%U\n", format_mheap, clib_per_cpu_mheaps[index], verbose); + index++; + })); + /* *INDENT-ON* */ + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_memory_usage_command, static) = { + .path = "show memory", + .short_help = "Show current memory usage", + .function = show_memory_usage, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_cpu (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ +#define _(a,b,c) vlib_cli_output (vm, "%-25s " b, a ":", c); + _("Model name", "%U", format_cpu_model_name); + _("Microarchitecture", "%U", format_cpu_uarch); + _("Flags", "%U", format_cpu_flags); + _("Base frequency", "%.2f GHz", + ((f64) vm->clib_time.clocks_per_second) * 1e-9); +#undef _ + return 0; +} + +/*? + * Displays various information about the CPU. 
+ * + * @cliexpar + * @cliexstart{show cpu} + * Model name: Intel(R) Xeon(R) CPU E5-2667 v4 @ 3.20GHz + * Microarchitecture: Broadwell (Broadwell-EP/EX) + * Flags: sse3 ssse3 sse41 sse42 avx avx2 aes + * Base Frequency: 3.20 GHz + * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_cpu_command, static) = { + .path = "show cpu", + .short_help = "Show cpu information", + .function = show_cpu, +}; + +/* *INDENT-ON* */ +static clib_error_t * +enable_disable_memory_trace (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + clib_error_t *error = 0; + int enable; + + if (!unformat_user (input, unformat_vlib_enable_disable, &enable)) + { + error = clib_error_return (0, "expecting enable/on or disable/off"); + goto done; + } + + clib_mem_trace (enable); + +done: + return error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (enable_disable_memory_trace_command, static) = { + .path = "memory-trace", + .short_help = "Enable/disable memory allocation trace", + .function = enable_disable_memory_trace, +}; +/* *INDENT-ON* */ + + +static clib_error_t * +test_heap_validate (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + clib_error_t *error = 0; + void *heap; + mheap_t *mheap; + + if (unformat (input, "on")) + { + /* *INDENT-OFF* */ + foreach_vlib_main({ + heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + mheap = mheap_header(heap); + mheap->flags |= MHEAP_FLAG_VALIDATE; + // Turn off small object cache because it delays detection of errors + mheap->flags &= ~MHEAP_FLAG_SMALL_OBJECT_CACHE; + }); + /* *INDENT-ON* */ + + } + else if (unformat (input, "off")) + { + /* *INDENT-OFF* */ + foreach_vlib_main({ + heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + mheap = mheap_header(heap); + mheap->flags &= ~MHEAP_FLAG_VALIDATE; + mheap->flags |= MHEAP_FLAG_SMALL_OBJECT_CACHE; + }); + /* *INDENT-ON* */ + } + else if (unformat (input, "now")) + { + /* *INDENT-OFF* */ + foreach_vlib_main({ + heap = 
clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + mheap = mheap_header(heap); + mheap_validate(heap); + }); + /* *INDENT-ON* */ + vlib_cli_output (vm, "heap validation complete"); + + } + else + { + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + return error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_test_heap_validate,static) = { + .path = "test heap-validate", + .short_help = "<on/off/now> validate heap on future allocs/frees or right now", + .function = test_heap_validate, +}; +/* *INDENT-ON* */ + +#ifdef TEST_CODE +/* + * A trivial test harness to verify the per-process output_function + * is working correcty. + */ + +static clib_error_t * +sleep_ten_seconds (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + u16 i; + u16 my_id = rand (); + + vlib_cli_output (vm, "Starting 10 seconds sleep with id %u\n", my_id); + + for (i = 0; i < 10; i++) + { + vlib_process_wait_for_event_or_clock (vm, 1.0); + vlib_cli_output (vm, "Iteration number %u, my id: %u\n", i, my_id); + } + vlib_cli_output (vm, "Done with sleep with id %u\n", my_id); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (ping_command, static) = { + .path = "test sleep", + .function = sleep_ten_seconds, + .short_help = "Sleep for 10 seconds", +}; +/* *INDENT-ON* */ +#endif /* ifdef TEST_CODE */ + +static uword +vlib_cli_normalize_path (char *input, char **result) +{ + char *i = input; + char *s = 0; + uword l = 0; + uword index_of_last_space = ~0; + + while (*i != 0) + { + u8 c = *i++; + /* Multiple white space -> single space. */ + switch (c) + { + case ' ': + case '\t': + case '\n': + case '\r': + if (l > 0 && s[l - 1] != ' ') + { + vec_add1 (s, ' '); + l++; + } + break; + + default: + if (l > 0 && s[l - 1] == ' ') + index_of_last_space = vec_len (s); + vec_add1 (s, c); + l++; + break; + } + } + + /* Remove any extra space at end. 
*/ + if (l > 0 && s[l - 1] == ' ') + _vec_len (s) -= 1; + + *result = s; + return index_of_last_space; +} + +always_inline uword +parent_path_len (char *path) +{ + word i; + for (i = vec_len (path) - 1; i >= 0; i--) + { + if (path[i] == ' ') + return i; + } + return ~0; +} + +static void +add_sub_command (vlib_cli_main_t * cm, uword parent_index, uword child_index) +{ + vlib_cli_command_t *p, *c; + vlib_cli_sub_command_t *sub_c; + u8 *sub_name; + word i, l; + + p = vec_elt_at_index (cm->commands, parent_index); + c = vec_elt_at_index (cm->commands, child_index); + + l = parent_path_len (c->path); + if (l == ~0) + sub_name = vec_dup ((u8 *) c->path); + else + { + ASSERT (l + 1 < vec_len (c->path)); + sub_name = 0; + vec_add (sub_name, c->path + l + 1, vec_len (c->path) - (l + 1)); + } + + if (sub_name[0] == '%') + { + uword *q; + vlib_cli_sub_rule_t *sr; + + /* Remove %. */ + vec_delete (sub_name, 1, 0); + + if (!p->sub_rule_index_by_name) + p->sub_rule_index_by_name = hash_create_vec ( /* initial length */ 32, + sizeof (sub_name[0]), + sizeof (uword)); + q = hash_get_mem (p->sub_rule_index_by_name, sub_name); + if (q) + { + sr = vec_elt_at_index (p->sub_rules, q[0]); + ASSERT (sr->command_index == child_index); + return; + } + + q = hash_get_mem (cm->parse_rule_index_by_name, sub_name); + if (!q) + { + clib_error ("reference to unknown rule `%%%v' in path `%v'", + sub_name, c->path); + return; + } + + hash_set_mem (p->sub_rule_index_by_name, sub_name, + vec_len (p->sub_rules)); + vec_add2 (p->sub_rules, sr, 1); + sr->name = sub_name; + sr->rule_index = q[0]; + sr->command_index = child_index; + return; + } + + if (!p->sub_command_index_by_name) + p->sub_command_index_by_name = hash_create_vec ( /* initial length */ 32, + sizeof (c->path[0]), + sizeof (uword)); + + /* Check if sub-command has already been created. 
*/ + if (hash_get_mem (p->sub_command_index_by_name, sub_name)) + { + vec_free (sub_name); + return; + } + + vec_add2 (p->sub_commands, sub_c, 1); + sub_c->index = child_index; + sub_c->name = sub_name; + hash_set_mem (p->sub_command_index_by_name, sub_c->name, + sub_c - p->sub_commands); + + vec_validate (p->sub_command_positions, vec_len (sub_c->name) - 1); + for (i = 0; i < vec_len (sub_c->name); i++) + { + int n; + vlib_cli_parse_position_t *pos; + + pos = vec_elt_at_index (p->sub_command_positions, i); + + if (!pos->bitmaps) + pos->min_char = sub_c->name[i]; + + n = sub_c->name[i] - pos->min_char; + if (n < 0) + { + pos->min_char = sub_c->name[i]; + vec_insert (pos->bitmaps, -n, 0); + n = 0; + } + + vec_validate (pos->bitmaps, n); + pos->bitmaps[n] = + clib_bitmap_ori (pos->bitmaps[n], sub_c - p->sub_commands); + } +} + +static void +vlib_cli_make_parent (vlib_cli_main_t * cm, uword ci) +{ + uword p_len, pi, *p; + char *p_path; + vlib_cli_command_t *c, *parent; + + /* Root command (index 0) should have already been added. */ + ASSERT (vec_len (cm->commands) > 0); + + c = vec_elt_at_index (cm->commands, ci); + p_len = parent_path_len (c->path); + + /* No space? Parent is root command. */ + if (p_len == ~0) + { + add_sub_command (cm, 0, ci); + return; + } + + p_path = 0; + vec_add (p_path, c->path, p_len); + + p = hash_get_mem (cm->command_index_by_path, p_path); + + /* Parent exists? */ + if (!p) + { + /* Parent does not exist; create it. */ + vec_add2 (cm->commands, parent, 1); + parent->path = p_path; + hash_set_mem (cm->command_index_by_path, parent->path, + parent - cm->commands); + pi = parent - cm->commands; + } + else + { + pi = p[0]; + vec_free (p_path); + } + + add_sub_command (cm, pi, ci); + + /* Create parent's parent. 
*/ + if (!p) + vlib_cli_make_parent (cm, pi); +} + +always_inline uword +vlib_cli_command_is_empty (vlib_cli_command_t * c) +{ + return (c->long_help == 0 && c->short_help == 0 && c->function == 0); +} + +clib_error_t * +vlib_cli_register (vlib_main_t * vm, vlib_cli_command_t * c) +{ + vlib_cli_main_t *cm = &vm->cli_main; + clib_error_t *error = 0; + uword ci, *p; + char *normalized_path; + + if ((error = vlib_call_init_function (vm, vlib_cli_init))) + return error; + + (void) vlib_cli_normalize_path (c->path, &normalized_path); + + if (!cm->command_index_by_path) + cm->command_index_by_path = hash_create_vec ( /* initial length */ 32, + sizeof (c->path[0]), + sizeof (uword)); + + /* See if command already exists with given path. */ + p = hash_get_mem (cm->command_index_by_path, normalized_path); + if (p) + { + vlib_cli_command_t *d; + + ci = p[0]; + d = vec_elt_at_index (cm->commands, ci); + + /* If existing command was created via vlib_cli_make_parent + replaced it with callers data. */ + if (vlib_cli_command_is_empty (d)) + { + vlib_cli_command_t save = d[0]; + + ASSERT (!vlib_cli_command_is_empty (c)); + + /* Copy callers fields. */ + d[0] = c[0]; + + /* Save internal fields. */ + d->path = save.path; + d->sub_commands = save.sub_commands; + d->sub_command_index_by_name = save.sub_command_index_by_name; + d->sub_command_positions = save.sub_command_positions; + d->sub_rules = save.sub_rules; + } + else + error = + clib_error_return (0, "duplicate command name with path %v", + normalized_path); + + vec_free (normalized_path); + if (error) + return error; + } + else + { + /* Command does not exist: create it. */ + + /* Add root command (index 0). */ + if (vec_len (cm->commands) == 0) + { + /* Create command with index 0; path is empty string. 
*/ + vec_resize (cm->commands, 1); + } + + ci = vec_len (cm->commands); + hash_set_mem (cm->command_index_by_path, normalized_path, ci); + vec_add1 (cm->commands, c[0]); + + c = vec_elt_at_index (cm->commands, ci); + c->path = normalized_path; + + /* Don't inherit from registration. */ + c->sub_commands = 0; + c->sub_command_index_by_name = 0; + c->sub_command_positions = 0; + } + + vlib_cli_make_parent (cm, ci); + return 0; +} + +clib_error_t * +vlib_cli_register_parse_rule (vlib_main_t * vm, vlib_cli_parse_rule_t * r_reg) +{ + vlib_cli_main_t *cm = &vm->cli_main; + vlib_cli_parse_rule_t *r; + clib_error_t *error = 0; + u8 *r_name; + uword *p; + + if (!cm->parse_rule_index_by_name) + cm->parse_rule_index_by_name = hash_create_vec ( /* initial length */ 32, + sizeof (r->name[0]), + sizeof (uword)); + + /* Make vector copy of name. */ + r_name = format (0, "%s", r_reg->name); + + if ((p = hash_get_mem (cm->parse_rule_index_by_name, r_name))) + { + vec_free (r_name); + return clib_error_return (0, "duplicate parse rule name `%s'", + r_reg->name); + } + + vec_add2 (cm->parse_rules, r, 1); + r[0] = r_reg[0]; + r->name = (char *) r_name; + hash_set_mem (cm->parse_rule_index_by_name, r->name, r - cm->parse_rules); + + return error; +} + +#if 0 +/* $$$ turn back on again someday, maybe */ +static clib_error_t *vlib_cli_register_parse_rules (vlib_main_t * vm, + vlib_cli_parse_rule_t * + lo, + vlib_cli_parse_rule_t * + hi) + __attribute__ ((unused)) +{ + clib_error_t *error = 0; + vlib_cli_parse_rule_t *r; + + for (r = lo; r < hi; r = clib_elf_section_data_next (r, 0)) + { + if (!r->name || strlen (r->name) == 0) + { + error = clib_error_return (0, "parse rule with no name"); + goto done; + } + + error = vlib_cli_register_parse_rule (vm, r); + if (error) + goto done; + } + +done: + return error; +} +#endif + +static clib_error_t * +vlib_cli_init (vlib_main_t * vm) +{ + vlib_cli_main_t *cm = &vm->cli_main; + clib_error_t *error = 0; + vlib_cli_command_t *cmd; + + cmd = 
cm->cli_command_registrations; + + while (cmd) + { + error = vlib_cli_register (vm, cmd); + if (error) + return error; + cmd = cmd->next_cli_command; + } + return error; +} + +VLIB_INIT_FUNCTION (vlib_cli_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/cli.h b/src/vlib/cli.h new file mode 100644 index 00000000000..009c7e82cf7 --- /dev/null +++ b/src/vlib/cli.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * cli.h: command line interface + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_cli_h +#define included_vlib_cli_h + +#include <vppinfra/format.h> + +struct vlib_cli_command_t; + +typedef struct +{ + u32 min_char; + + /* Indexed by name[position] - min_char. */ + uword **bitmaps; +} vlib_cli_parse_position_t; + +typedef struct +{ + u8 *name; + + u32 index; +} vlib_cli_sub_command_t; + +typedef struct +{ + u8 *name; + + u32 rule_index; + + u32 command_index; +} vlib_cli_sub_rule_t; + +typedef struct +{ + char *name; + char *short_help; + char *long_help; + + /* Number of bytes in parsed data. Zero for vector. */ + uword data_size; + + unformat_function_t *unformat_function; + + /* Opaque for unformat function. */ + uword unformat_function_arg[2]; +} vlib_cli_parse_rule_t; + +/* CLI command callback function. */ +typedef clib_error_t *(vlib_cli_command_function_t) + (struct vlib_main_t * vm, + unformat_input_t * input, struct vlib_cli_command_t * cmd); + +typedef struct vlib_cli_command_t +{ + /* Command path (e.g. "show something"). + Spaces delimit elements of path. */ + char *path; + + /* Short/long help strings. */ + char *short_help; + char *long_help; + + /* Callback function. */ + vlib_cli_command_function_t *function; + + /* Opaque. */ + uword function_arg; + + /* Known MP-safe? */ + uword is_mp_safe; + + /* Sub commands for this command. */ + vlib_cli_sub_command_t *sub_commands; + + /* Hash table mapping name (e.g. last path element) to sub command index. 
*/ + uword *sub_command_index_by_name; + + /* bitmap[p][c][i] says whether sub-command i has character + c in position p. */ + vlib_cli_parse_position_t *sub_command_positions; + + /* Hash table mapping name (e.g. last path element) to sub rule index. */ + uword *sub_rule_index_by_name; + + /* Vector of possible parse rules for this path. */ + vlib_cli_sub_rule_t *sub_rules; + + /* List of CLI commands, built by constructors */ + struct vlib_cli_command_t *next_cli_command; + +} vlib_cli_command_t; + +typedef void (vlib_cli_output_function_t) (uword arg, + u8 * buffer, uword buffer_bytes); +typedef struct +{ + /* Vector of all known commands. */ + vlib_cli_command_t *commands; + + /* Hash table mapping normalized path to index into all_commands. */ + uword *command_index_by_path; + + /* Vector of all known parse rules. */ + vlib_cli_parse_rule_t *parse_rules; + + /* Hash table mapping parse rule name to index into parse_rule vector. */ + uword *parse_rule_index_by_name; + + /* Data parsed for rules. */ + void **parse_rule_data; + + /* registration list added by constructors */ + vlib_cli_command_t *cli_command_registrations; +} vlib_cli_main_t; + +#define VLIB_CLI_COMMAND(x,...) \ + __VA_ARGS__ vlib_cli_command_t x; \ +static void __vlib_cli_command_registration_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_cli_command_registration_##x (void) \ +{ \ + vlib_main_t * vm = vlib_get_main(); \ + vlib_cli_main_t *cm = &vm->cli_main; \ + x.next_cli_command = cm->cli_command_registrations; \ + cm->cli_command_registrations = &x; \ +} \ +__VA_ARGS__ vlib_cli_command_t x +#define VLIB_CLI_PARSE_RULE(x) \ + vlib_cli_parse_rule_t x +/* Output to current CLI connection. */ +void vlib_cli_output (struct vlib_main_t *vm, char *fmt, ...); + +/* Process CLI input. 
*/ +void vlib_cli_input (struct vlib_main_t *vm, + unformat_input_t * input, + vlib_cli_output_function_t * function, + uword function_arg); + +clib_error_t *vlib_cli_register (struct vlib_main_t *vm, + vlib_cli_command_t * c); +clib_error_t *vlib_cli_register_parse_rule (struct vlib_main_t *vm, + vlib_cli_parse_rule_t * c); + +uword unformat_vlib_cli_sub_input (unformat_input_t * i, va_list * args); + +#endif /* included_vlib_cli_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/cli_funcs.h b/src/vlib/cli_funcs.h new file mode 100644 index 00000000000..78aef73ba2d --- /dev/null +++ b/src/vlib/cli_funcs.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * cli_funcs.h: VLIB CLI related functions/inlines + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_cli_funcs_h +#define included_vlib_cli_funcs_h + +always_inline void * +vlib_cli_get_parse_rule_result (vlib_main_t * vm, uword index) +{ + vlib_cli_main_t *cm = &vm->cli_main; + return vec_elt (cm->parse_rule_data, index); +} + +#endif /* included_vlib_cli_funcs_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/counter.c b/src/vlib/counter.c new file mode 100644 index 00000000000..9f66e04d88e --- /dev/null +++ b/src/vlib/counter.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * counter.c: simple and packet/byte counters + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vlib/vlib.h> + +void +vlib_clear_simple_counters (vlib_simple_counter_main_t * cm) +{ + uword i, j; + u16 *my_minis; + + for (i = 0; i < vec_len (cm->minis); i++) + { + my_minis = cm->minis[i]; + + for (j = 0; j < vec_len (my_minis); j++) + { + cm->maxi[j] += my_minis[j]; + my_minis[j] = 0; + } + } + + j = vec_len (cm->maxi); + if (j > 0) + vec_validate (cm->value_at_last_clear, j - 1); + for (i = 0; i < j; i++) + cm->value_at_last_clear[i] = cm->maxi[i]; +} + +void +vlib_clear_combined_counters (vlib_combined_counter_main_t * cm) +{ + uword i, j; + vlib_mini_counter_t *my_minis; + + for (i = 0; i < vec_len (cm->minis); i++) + { + my_minis = cm->minis[i]; + + for (j = 0; j < vec_len (my_minis); j++) + { + cm->maxi[j].packets += my_minis[j].packets; + cm->maxi[j].bytes += my_minis[j].bytes; + my_minis[j].packets = 0; + my_minis[j].bytes = 0; + } + } + + j = vec_len (cm->maxi); + if (j > 0) + vec_validate (cm->value_at_last_clear, j - 1); + + for (i = 0; i < j; i++) + { + vlib_counter_t *c = vec_elt_at_index (cm->value_at_last_clear, i); + + c[0] = cm->maxi[i]; + } +} + +void +vlib_validate_simple_counter (vlib_simple_counter_main_t * cm, u32 index) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + int i; + + vec_validate (cm->minis, tm->n_vlib_mains - 1); + for (i = 0; i < tm->n_vlib_mains; i++) + vec_validate_aligned (cm->minis[i], index, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (cm->maxi, index, CLIB_CACHE_LINE_BYTES); +} + +void +vlib_validate_combined_counter (vlib_combined_counter_main_t * cm, u32 index) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + int i; + + vec_validate (cm->minis, tm->n_vlib_mains - 1); + for (i = 0; i < tm->n_vlib_mains; i++) + vec_validate_aligned (cm->minis[i], index, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (cm->maxi, index, CLIB_CACHE_LINE_BYTES); +} + +void +serialize_vlib_simple_counter_main (serialize_main_t * m, va_list * va) +{ + clib_warning ("unimplemented"); +} + +void 
+unserialize_vlib_simple_counter_main (serialize_main_t * m, va_list * va) +{ + clib_warning ("unimplemented"); +} + +void +serialize_vlib_combined_counter_main (serialize_main_t * m, va_list * va) +{ + clib_warning ("unimplemented"); +} + +void +unserialize_vlib_combined_counter_main (serialize_main_t * m, va_list * va) +{ + clib_warning ("unimplemented"); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/counter.h b/src/vlib/counter.h new file mode 100644 index 00000000000..a79032065d9 --- /dev/null +++ b/src/vlib/counter.h @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * counter.h: simple and packet/byte counters + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_counter_h +#define included_vlib_counter_h + +/** \file + + Optimized thread-safe counters. + + Each vlib_[simple|combined]_counter_main_t consists of a single + vector of thread-safe / atomically-updated u64 counters [the + "maxi" vector], and a (u16 **) per-thread vector [the "minis" + vector] of narrow, per-thread counters. + + The idea is to drastically reduce the number of atomic operations. + In the case of packet counts, we divide the number of atomic ops + by 2**16, etc. +*/ + +/** A collection of simple counters */ + +typedef struct +{ + u16 **minis; /**< Per-thread u16 non-atomic counters */ + u64 *maxi; /**< Shared wide counters */ + u64 *value_at_last_clear; /**< Counter values as of last clear. */ + u64 *value_at_last_serialize; /**< Values as of last serialize. */ + u32 last_incremental_serialize_index; /**< Last counter index + serialized incrementally. */ + + char *name; /**< The counter collection's name. 
*/ +} vlib_simple_counter_main_t; + +/** Increment a simple counter + The increment is added to the calling thread's u16 mini counter; when the + sum no longer fits in the u16, the whole sum is atomically added to the + shared maxi counter and the mini counter is reset to zero. + @param cm - (vlib_simple_counter_main_t *) simple counter main pointer + @param cpu_index - (u32) the current cpu index + @param index - (u32) index of the counter to increment + @param increment - (u32) quantity to add to the counter +*/ +always_inline void +vlib_increment_simple_counter (vlib_simple_counter_main_t * cm, + u32 cpu_index, u32 index, u32 increment) +{ + u16 *my_minis; + u16 *mini; + u32 old, new; + + my_minis = cm->minis[cpu_index]; + mini = vec_elt_at_index (my_minis, index); + old = mini[0]; + new = old + increment; + mini[0] = new; + + if (PREDICT_FALSE (mini[0] != new)) + { + __sync_fetch_and_add (&cm->maxi[index], new); + my_minis[index] = 0; + } +} + +/** Get the value of a simple counter + Scrapes the entire set of mini counters. Inaccurate unless + worker threads which might increment the counter are + barrier-synchronized + + @param cm - (vlib_simple_counter_main_t *) simple counter main pointer + @param index - (u32) index of the counter to fetch + @returns - (u64) current counter value, minus value_at_last_clear[index] + when that entry exists +*/ +always_inline u64 +vlib_get_simple_counter (vlib_simple_counter_main_t * cm, u32 index) +{ + u16 *my_minis, *mini; + u64 v; + int i; + + ASSERT (index < vec_len (cm->maxi)); + + v = 0; + + for (i = 0; i < vec_len (cm->minis); i++) + { + my_minis = cm->minis[i]; + mini = vec_elt_at_index (my_minis, index); + v += mini[0]; + } + + v += cm->maxi[index]; + + if (index < vec_len (cm->value_at_last_clear)) + { + ASSERT (v >= cm->value_at_last_clear[index]); + v -= cm->value_at_last_clear[index]; + } + + return v; +} + +/** Clear a simple counter + Clears the set of per-thread u16 counters, the u64 counter, and the + counter's value_at_last_clear entry when it exists + + @param cm - (vlib_simple_counter_main_t *) simple counter main pointer + @param index - (u32) index of the counter to clear +*/ +always_inline void +vlib_zero_simple_counter (vlib_simple_counter_main_t * cm, u32 index) +{ + u16 *my_minis; + int i; + + ASSERT (index < vec_len
(cm->maxi)); + + for (i = 0; i < vec_len (cm->minis); i++) + { + my_minis = cm->minis[i]; + my_minis[index] = 0; + } + + cm->maxi[index] = 0; + + if (index < vec_len (cm->value_at_last_clear)) + cm->value_at_last_clear[index] = 0; +} + +/** Combined counter to hold both packets and byte differences. + */ +typedef struct +{ + u64 packets; /**< packet counter */ + u64 bytes; /**< byte counter */ +} vlib_counter_t; + +/** Add two combined counters; the result is stored in the first counter + @param [in,out] a - (vlib_counter_t *) dst counter + @param b - (vlib_counter_t *) src counter +*/ + +always_inline void +vlib_counter_add (vlib_counter_t * a, vlib_counter_t * b) +{ + a->packets += b->packets; + a->bytes += b->bytes; +} + +/** Subtract combined counter b from a; the result is stored in a. + Asserts that neither field of a is smaller than the same field of b. + @param [in,out] a - (vlib_counter_t *) dst counter + @param b - (vlib_counter_t *) src counter +*/ +always_inline void +vlib_counter_sub (vlib_counter_t * a, vlib_counter_t * b) +{ + ASSERT (a->packets >= b->packets); + ASSERT (a->bytes >= b->bytes); + a->packets -= b->packets; + a->bytes -= b->bytes; +} + +/** Clear a combined counter + @param a - (vlib_counter_t *) counter to clear +*/ +always_inline void +vlib_counter_zero (vlib_counter_t * a) +{ + a->packets = a->bytes = 0; +} + +/** Mini combined counter */ +typedef struct +{ + u16 packets; /**< Packet count */ + i16 bytes; /**< Byte count (signed 16 bit) */ +} vlib_mini_counter_t; + +/** A collection of combined counters */ +typedef struct +{ + vlib_mini_counter_t **minis; /**< Per-thread narrow (u16 packets / i16 bytes) non-atomic counter pairs */ + vlib_counter_t *maxi; /**< Shared wide counter pairs */ + vlib_counter_t *value_at_last_clear; /**< Counter values as of last clear. */ + vlib_counter_t *value_at_last_serialize; /**< Counter values as of last serialize. */ + u32 last_incremental_serialize_index; /**< Last counter index serialized incrementally. */ + char *name; /**< The counter collection's name.
*/ +} vlib_combined_counter_main_t; + +/** Clear a collection of simple counters + @param cm - (vlib_simple_counter_main_t *) collection to clear +*/ +void vlib_clear_simple_counters (vlib_simple_counter_main_t * cm); + +/** Clear a collection of combined counters + @param cm - (vlib_combined_counter_main_t *) collection to clear +*/ +void vlib_clear_combined_counters (vlib_combined_counter_main_t * cm); + +/** Increment a combined counter + Both increments go to the calling thread's mini counter pair. Only the + byte mini counter is checked for overflow; when it overflows, both sums + are atomically added to the shared maxi pair and both minis are reset. + @param cm - (vlib_combined_counter_main_t *) combined counter main pointer + @param cpu_index - (u32) the current cpu index + @param index - (u32) index of the counter to increment + @param packet_increment - (u32) number of packets to add to the counter + @param byte_increment - (u32) number of bytes to add to the counter +*/ + +always_inline void +vlib_increment_combined_counter (vlib_combined_counter_main_t * cm, + u32 cpu_index, + u32 index, + u32 packet_increment, u32 byte_increment) +{ + vlib_mini_counter_t *my_minis, *mini; + u32 old_packets, new_packets; + i32 old_bytes, new_bytes; + + /* Use this CPU's mini counter array */ + my_minis = cm->minis[cpu_index]; + + mini = vec_elt_at_index (my_minis, index); + old_packets = mini->packets; + old_bytes = mini->bytes; + + new_packets = old_packets + packet_increment; + new_bytes = old_bytes + byte_increment; + + mini->packets = new_packets; + mini->bytes = new_bytes; + + /* Bytes always overflow before packets.. */ + if (PREDICT_FALSE (mini->bytes != new_bytes)) + { + vlib_counter_t *maxi = vec_elt_at_index (cm->maxi, index); + + __sync_fetch_and_add (&maxi->packets, new_packets); + __sync_fetch_and_add (&maxi->bytes, new_bytes); + + mini->packets = 0; + mini->bytes = 0; + } +} + +/** Get the value of a combined counter, never called in the speed path + Scrapes the entire set of mini counters.
Inaccurate unless + worker threads which might increment the counter are + barrier-synchronized + + @param cm - (vlib_combined_counter_main_t *) combined counter main pointer + @param index - (u32) index of the combined counter to fetch + @param result [out] - (vlib_counter_t *) result stored here +*/ + +static inline void +vlib_get_combined_counter (vlib_combined_counter_main_t * cm, + u32 index, vlib_counter_t * result) +{ + vlib_mini_counter_t *my_minis, *mini; + vlib_counter_t *maxi; + int i; + + result->packets = 0; + result->bytes = 0; + + for (i = 0; i < vec_len (cm->minis); i++) + { + my_minis = cm->minis[i]; + + mini = vec_elt_at_index (my_minis, index); + result->packets += mini->packets; + result->bytes += mini->bytes; + } + + maxi = vec_elt_at_index (cm->maxi, index); + result->packets += maxi->packets; + result->bytes += maxi->bytes; + + if (index < vec_len (cm->value_at_last_clear)) + vlib_counter_sub (result, &cm->value_at_last_clear[index]); +} + +/** Clear a combined counter + Clears the set of per-thread u16 counters, and the shared vlib_counter_t + + @param cm - (vlib_combined_counter_main_t *) combined counter main pointer + @param index - (u32) index of the counter to clear +*/ +always_inline void +vlib_zero_combined_counter (vlib_combined_counter_main_t * cm, u32 index) +{ + vlib_mini_counter_t *mini, *my_minis; + int i; + + for (i = 0; i < vec_len (cm->minis); i++) + { + my_minis = cm->minis[i]; + + mini = vec_elt_at_index (my_minis, index); + mini->packets = 0; + mini->bytes = 0; + } + + vlib_counter_zero (&cm->maxi[index]); + if (index < vec_len (cm->value_at_last_clear)) + vlib_counter_zero (&cm->value_at_last_clear[index]); +} + +/** validate a simple counter + @param cm - (vlib_simple_counter_main_t *) pointer to the counter collection + @param index - (u32) index of the counter to validate +*/ + +void vlib_validate_simple_counter (vlib_simple_counter_main_t * cm, + u32 index); +/** validate a combined counter + @param cm -
(vlib_combined_counter_main_t *) pointer to the counter + collection + @param index - (u32) index of the counter to validate +*/ + +void vlib_validate_combined_counter (vlib_combined_counter_main_t * cm, + u32 index); + +/** Obtain the number of simple or combined counters allocated. + A macro which reduces to vec_len(cm->maxi), the answer in either + case. + + @param cm - (vlib_simple_counter_main_t) or + (vlib_combined_counter_main_t) the counter collection to interrogate + @returns vec_len(cm->maxi) +*/ +#define vlib_counter_len(cm) vec_len((cm)->maxi) + +serialize_function_t serialize_vlib_simple_counter_main, + unserialize_vlib_simple_counter_main; +serialize_function_t serialize_vlib_combined_counter_main, + unserialize_vlib_combined_counter_main; + +#endif /* included_vlib_counter_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/defs.h b/src/vlib/defs.h new file mode 100644 index 00000000000..ad58bc04681 --- /dev/null +++ b/src/vlib/defs.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +/* + * defs.h: VLIB generic C definitions + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_defs_h +#define included_vlib_defs_h + +/* Receive or transmit. */ +typedef enum +{ + VLIB_RX, + VLIB_TX, + VLIB_N_RX_TX = 2, /* Used to size arrays. */ +} vlib_rx_or_tx_t; + +#define vlib_foreach_rx_tx(v) for (v = 0; v < VLIB_N_RX_TX; v++) + +/* Read/write. */ +typedef enum +{ + VLIB_READ, + VLIB_WRITE, +} vlib_read_or_write_t; + +/* Up/down. */ +typedef enum +{ + VLIB_DOWN = 0, + VLIB_UP = 1, +} vlib_up_or_down_t; + +/* Enable/disable. 
*/ +typedef enum +{ + VLIB_DISABLE = 0, + VLIB_ENABLE = 1, +} vlib_enable_or_disable_t; + +#endif /* included_vlib_defs_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/dir.dox b/src/vlib/dir.dox new file mode 100644 index 00000000000..4806e7a91c6 --- /dev/null +++ b/src/vlib/dir.dox @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Copyright (c) 2016 Comcast Cable Communications Management, LLC. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* Doxygen directory documentation */ +/** +@dir +@brief VLIB application library source. +*/ +/*? %%clicmd:group_label VLIB application library%% ?*/ + diff --git a/src/vlib/elog_samples.c b/src/vlib/elog_samples.c new file mode 100644 index 00000000000..a8c800df959 --- /dev/null +++ b/src/vlib/elog_samples.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vlib/vlib.h> +#include <vppinfra/elog.h> + +static inline void +elog_four_int_sample (u32 * data) +{ + ELOG_TYPE_DECLARE (e) = + { + .format = "four int: first %d second %d third %d fourth %d",.format_args = + "i4i4i4i4",}; + struct + { + u32 data[4]; + } *ed; + ed = ELOG_DATA (&vlib_global_main.elog_main, e); + ed->data[0] = data[0]; + ed->data[1] = data[1]; + ed->data[2] = data[2]; + ed->data[3] = data[3]; +} + +static inline void +elog_four_int_track_sample (u32 * data) +{ + ELOG_TYPE_DECLARE (e) = + { + .format = + "four_int_track: first %d second %d third %d fourth %d",.format_args = + "i4i4i4i4",}; + struct + { + u32 data[4]; + } *ed; + ELOG_TRACK (sample_track); + ed = ELOG_TRACK_DATA (&vlib_global_main.elog_main, e, sample_track); + ed->data[0] = data[0]; + ed->data[1] = data[1]; + ed->data[2] = data[2]; + ed->data[3] = data[3]; +} + +static inline void +elog_enum_sample (u8 which) +{ + ELOG_TYPE_DECLARE (e) = + { + .format = "my enum: %s",.format_args = "t1",.n_enum_strings = + 2,.enum_strings = + { + "string 1", "string 2",},}; + struct + { + u8 which; + } *ed; + ed = ELOG_DATA (&vlib_global_main.elog_main, e); + ed->which = which; +} + +static inline void +elog_one_datum_sample (u32 data) +{ + ELOG_TYPE_DECLARE (e) = + { + .format = "one datum: %d",.format_args = "i4",}; + + elog (&vlib_global_main.elog_main, &e, data); +} + +static clib_error_t * +test_elog_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + int i; + u32 samples[4]; + + for (i = 0; i < 10; i++) + { + samples[0] = i; + samples[1] = i + 1; + samples[2] = i + 2; + samples[3] = i + 3; + + elog_four_int_sample (samples); + elog_four_int_track_sample (samples); + elog_enum_sample (0); + elog_enum_sample (1); + elog_one_datum_sample (i); + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (test_elog_command, static) = { + 
.path = "test elog sample", + .short_help = "test elog sample", + .function = test_elog_command_fn, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/error.c b/src/vlib/error.c new file mode 100644 index 00000000000..a2c2317686b --- /dev/null +++ b/src/vlib/error.c @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * error.c: VLIB error handler + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> +#include <vppinfra/heap.h> + +/* Copy n_buffers buffer indices from buffers into frames for next_index, + * setting each buffer's error field to the error built from + * (drop_error_node, drop_error_code). Returns n_buffers. + * + * next_buffer_stride is never read in this function: the input vector is + * always walked one index at a time (four per iteration in the fast loop). + */ +uword +vlib_error_drop_buffers (vlib_main_t * vm, + vlib_node_runtime_t * node, + u32 * buffers, + u32 next_buffer_stride, + u32 n_buffers, + u32 next_index, + u32 drop_error_node, u32 drop_error_code) +{ + u32 n_left_this_frame, n_buffers_left, *args, n_args_left; + vlib_error_t drop_error; + + drop_error = vlib_error_set (drop_error_node, drop_error_code); + + n_buffers_left = n_buffers; + while (n_buffers_left > 0) + { + vlib_get_next_frame (vm, node, next_index, args, n_args_left); + + /* Handle only as many buffers as fit in the frame just obtained. */ + n_left_this_frame = clib_min (n_buffers_left, n_args_left); + n_buffers_left -= n_left_this_frame; + n_args_left -= n_left_this_frame; + + while (n_left_this_frame >= 4) + { + u32 bi0, bi1, bi2, bi3; + vlib_buffer_t *b0, *b1, *b2, *b3; + + args[0] = bi0 = buffers[0]; + args[1] = bi1 = buffers[1]; + args[2] = bi2 = buffers[2]; + args[3] = bi3 = buffers[3]; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + b2 = vlib_get_buffer (vm, bi2); + b3 = vlib_get_buffer (vm, bi3); + + b0->error = drop_error; + b1->error = drop_error; + b2->error = drop_error; + b3->error = drop_error; + + buffers += 4; + args += 4; + n_left_this_frame -= 4; + } + + while (n_left_this_frame >= 1) + { + u32 bi0; + vlib_buffer_t *b0; + + args[0] = bi0 = buffers[0]; + + b0 = vlib_get_buffer (vm, bi0); + b0->error = drop_error; + + buffers += 1; + args += 1; + n_left_this_frame -= 1; + } + + vlib_put_next_frame (vm, node, next_index, n_args_left); + } + + return n_buffers; +} + +/* Convenience node to drop a vector of buffers with a "misc error".
*/ +static uword +misc_drop_buffers (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return vlib_error_drop_buffers (vm, node, vlib_frame_args (frame), + /* buffer stride */ 1, + frame->n_vectors, + /* next */ 0, + node->node_index, + /* error */ 0); +} + +static char *misc_drop_buffers_error_strings[] = { + [0] = "misc. errors", +}; + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (misc_drop_buffers_node,static) = { + .function = misc_drop_buffers, + .name = "misc-drop-buffers", + .vector_size = sizeof (u32), + .n_errors = 1, + .n_next_nodes = 1, + .next_nodes = { + "error-drop", + }, + .error_strings = misc_drop_buffers_error_strings, +}; +/* *INDENT-ON* */ + +/* Reserves given number of error codes for given node. */ +void +vlib_register_errors (vlib_main_t * vm, + u32 node_index, u32 n_errors, char *error_strings[]) +{ + vlib_error_main_t *em = &vm->error_main; + vlib_node_t *n = vlib_get_node (vm, node_index); + uword l; + + ASSERT (os_get_cpu_number () == 0); + + /* Free up any previous error strings. */ + if (n->n_errors > 0) + heap_dealloc (em->error_strings_heap, n->error_heap_handle); + + n->n_errors = n_errors; + n->error_strings = error_strings; + + if (n_errors == 0) + return; + + n->error_heap_index = + heap_alloc (em->error_strings_heap, n_errors, n->error_heap_handle); + + l = vec_len (em->error_strings_heap); + + clib_memcpy (vec_elt_at_index (em->error_strings_heap, n->error_heap_index), + error_strings, n_errors * sizeof (error_strings[0])); + + /* Allocate a counter/elog type for each error. */ + vec_validate (em->counters, l - 1); + vec_validate (vm->error_elog_event_types, l - 1); + + /* Zero counters for re-registrations of errors. 
*/ + if (n->error_heap_index + n_errors <= vec_len (em->counters_last_clear)) + clib_memcpy (em->counters + n->error_heap_index, + em->counters_last_clear + n->error_heap_index, + n_errors * sizeof (em->counters[0])); + else + memset (em->counters + n->error_heap_index, + 0, n_errors * sizeof (em->counters[0])); + + { + elog_event_type_t t; + uword i; + + memset (&t, 0, sizeof (t)); + for (i = 0; i < n_errors; i++) + { + t.format = (char *) format (0, "%v %s: %%d", + n->name, error_strings[i]); + vm->error_elog_event_types[n->error_heap_index + i] = t; + } + } +} + +static clib_error_t * +show_errors (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_error_main_t *em = &vm->error_main; + vlib_node_t *n; + u32 code, i, ni; + u64 c; + int index = 0; + int verbose = 0; + u64 *sums = 0; + + if (unformat (input, "verbose %d", &verbose)) + ; + else if (unformat (input, "verbose")) + verbose = 1; + + vec_validate (sums, vec_len (em->counters)); + + if (verbose) + vlib_cli_output (vm, "%=10s%=40s%=20s%=6s", "Count", "Node", "Reason", + "Index"); + else + vlib_cli_output (vm, "%=10s%=40s%=6s", "Count", "Node", "Reason"); + + + /* *INDENT-OFF* */ + foreach_vlib_main(({ + em = &this_vlib_main->error_main; + + if (verbose) + vlib_cli_output(vm, "Thread %u (%v):", index, + vlib_worker_threads[index].name); + + for (ni = 0; ni < vec_len (this_vlib_main->node_main.nodes); ni++) + { + n = vlib_get_node (this_vlib_main, ni); + for (code = 0; code < n->n_errors; code++) + { + i = n->error_heap_index + code; + c = em->counters[i]; + if (i < vec_len (em->counters_last_clear)) + c -= em->counters_last_clear[i]; + sums[i] += c; + + if (c == 0 && verbose < 2) + continue; + + if (verbose) + vlib_cli_output (vm, "%10Ld%=40v%=20s%=6d", c, n->name, + em->error_strings_heap[i], i); + else + vlib_cli_output (vm, "%10d%=40v%s", c, n->name, + em->error_strings_heap[i]); + } + } + index++; + })); + /* *INDENT-ON* */ + + if (verbose) + vlib_cli_output (vm, 
"Total:"); + + for (ni = 0; ni < vec_len (vm->node_main.nodes); ni++) + { + n = vlib_get_node (vm, ni); + for (code = 0; code < n->n_errors; code++) + { + i = n->error_heap_index + code; + if (sums[i]) + { + if (verbose) + vlib_cli_output (vm, "%10Ld%=40v%=20s%=10d", sums[i], n->name, + em->error_strings_heap[i], i); + } + } + } + + vec_free (sums); + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_show_errors, static) = { + .path = "show errors", + .short_help = "Show error counts", + .function = show_errors, +}; +/* *INDENT-ON* */ + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_show_node_counters, static) = { + .path = "show node counters", + .short_help = "Show node counters", + .function = show_errors, +}; +/* *INDENT-ON* */ + +static clib_error_t * +clear_error_counters (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_error_main_t *em; + u32 i; + + /* *INDENT-OFF* */ + foreach_vlib_main(({ + em = &this_vlib_main->error_main; + vec_validate (em->counters_last_clear, vec_len (em->counters) - 1); + for (i = 0; i < vec_len (em->counters); i++) + em->counters_last_clear[i] = em->counters[i]; + })); + /* *INDENT-ON* */ + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_clear_error_counters, static) = { + .path = "clear errors", + .short_help = "Clear error counters", + .function = clear_error_counters, +}; +/* *INDENT-ON* */ + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_clear_node_counters, static) = { + .path = "clear node counters", + .short_help = "Clear node counters", + .function = clear_error_counters, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/error.h b/src/vlib/error.h new file mode 100644 index 00000000000..df2075c306d --- /dev/null +++ b/src/vlib/error.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * error.h: drop/punt error packets + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_error_h +#define included_vlib_error_h + +/* Combined 20 bit node index & 12 bit error code as a 32 bit number.
*/ +typedef u32 vlib_error_t; + +always_inline u32 +vlib_error_get_node (vlib_error_t e) +{ + return e >> 12; +} + +always_inline u32 +vlib_error_get_code (vlib_error_t e) +{ + return e & 0xfff; +} + +always_inline vlib_error_t +vlib_error_set (u32 node_index, u32 code) +{ + ASSERT (node_index < (1 << 20)); + ASSERT (code < (1 << 12)); + return (node_index << 12) | code; +} + +always_inline vlib_error_t +vlib_error_set_code (vlib_error_t e, u32 code) +{ + ASSERT (vlib_error_get_code (e) == 0); + ASSERT (code < (1 << 12)); + e |= code; + return e; +} + +typedef struct +{ + /* Error counters. */ + u64 *counters; + + /* Counter values as of last counter clear. */ + u64 *counters_last_clear; + + /* Error name strings in heap. Heap index + indexes counter vector. */ + char **error_strings_heap; +} vlib_error_main_t; + +/* Per node error registration. */ +void vlib_register_errors (struct vlib_main_t *vm, + u32 node_index, + u32 n_errors, char *error_strings[]); + +#endif /* included_vlib_error_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/error_funcs.h b/src/vlib/error_funcs.h new file mode 100644 index 00000000000..1a3602e92c6 --- /dev/null +++ b/src/vlib/error_funcs.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * error_funcs.h: VLIB error handling + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */
+
+#ifndef included_vlib_error_funcs_h
+#define included_vlib_error_funcs_h
+
+#include <vlib/node_funcs.h>
+
+/* Log an event for an error counter bump into vm->elog_main.
+   counter indexes vm->error_elog_event_types; nothing is logged unless
+   VLIB_ELOG_MAIN_LOOP > 0 and increment is non-zero. */
+always_inline void
+vlib_error_elog_count (vlib_main_t * vm, uword counter, uword increment)
+{
+  elog_main_t *em = &vm->elog_main;
+  if (VLIB_ELOG_MAIN_LOOP > 0 && increment > 0)
+    elog (em, vec_elt_at_index (vm->error_elog_event_types, counter),
+          increment);
+}
+
+/* Add increment to one error counter of node node_index.
+   On entry counter is the node-relative error code (asserted to be below
+   n->n_errors); it is rebased by the node's error_heap_index to index
+   vm->error_main.counters, and the rebased index is also what gets
+   passed on to the event logger. */
+always_inline void
+vlib_error_count (vlib_main_t * vm, uword node_index,
+                  uword counter, uword increment)
+{
+  vlib_node_t *n = vlib_get_node (vm, node_index);
+  vlib_error_main_t *em = &vm->error_main;
+
+  ASSERT (counter < n->n_errors);
+  counter += n->error_heap_index;
+
+  ASSERT (counter < vec_len (em->counters));
+  em->counters[counter] += increment;
+
+  vlib_error_elog_count (vm, counter, increment);
+}
+
+/* Drop all buffers in frame with given error code. */
+uword
+vlib_error_drop_buffers (vlib_main_t * vm,
+                         vlib_node_runtime_t * node,
+                         u32 * buffers,
+                         u32 next_buffer_stride,
+                         u32 n_buffers,
+                         u32 error_next_index,
+                         u32 error_node, u32 error_code);
+
+#endif /* included_vlib_error_funcs_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/format.c b/src/vlib/format.c
new file mode 100644
index 00000000000..79a4d6866db
--- /dev/null
+++ b/src/vlib/format.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +/* + * format.c: generic network formatting/unformating + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> + +u8 * +format_vlib_rx_tx (u8 * s, va_list * args) +{ + vlib_rx_or_tx_t r = va_arg (*args, vlib_rx_or_tx_t); + char *t; + + switch (r) + { + case VLIB_RX: + t = "rx"; + break; + case VLIB_TX: + t = "tx"; + break; + default: + t = "INVALID"; + break; + } + + vec_add (s, t, strlen (t)); + return s; +} + +u8 * +format_vlib_read_write (u8 * s, va_list * args) +{ + vlib_rx_or_tx_t r = va_arg (*args, vlib_rx_or_tx_t); + char *t; + + switch (r) + { + case VLIB_READ: + t = "read"; + break; + case VLIB_WRITE: + t = "write"; + break; + default: + t = "INVALID"; + break; + } + + vec_add (s, t, strlen (t)); + return s; +} + +/* Formats buffer data as printable ascii or as hex. 
*/ +u8 * +format_vlib_buffer_data (u8 * s, va_list * args) +{ + u8 *data = va_arg (*args, u8 *); + u32 n_data_bytes = va_arg (*args, u32); + u32 i, is_printable; + + is_printable = 1; + for (i = 0; i < n_data_bytes && is_printable; i++) + { + u8 c = data[i]; + if (c < 0x20) + is_printable = 0; + else if (c >= 0x7f) + is_printable = 0; + } + + if (is_printable) + vec_add (s, data, n_data_bytes); + else + s = format (s, "%U", format_hex_bytes, data, n_data_bytes); + + return s; +} + +/* Enable/on => 1; disable/off => 0. */ +uword +unformat_vlib_enable_disable (unformat_input_t * input, va_list * args) +{ + int *result = va_arg (*args, int *); + int enable; + + if (unformat (input, "enable") || unformat (input, "on")) + enable = 1; + else if (unformat (input, "disable") || unformat (input, "off")) + enable = 0; + else + return 0; + + *result = enable; + return 1; +} + +/* rx/tx => VLIB_RX/VLIB_TX. */ +uword +unformat_vlib_rx_tx (unformat_input_t * input, va_list * args) +{ + int *result = va_arg (*args, int *); + if (unformat (input, "rx")) + *result = VLIB_RX; + else if (unformat (input, "tx")) + *result = VLIB_TX; + else + return 0; + return 1; +} + +/* Parse an int either %d or 0x%x. */ +uword +unformat_vlib_number (unformat_input_t * input, va_list * args) +{ + int *result = va_arg (*args, int *); + + return (unformat (input, "0x%x", result) || unformat (input, "%d", result)); +} + +/* Parse a-zA-Z0-9_ token and hash to value. */ +uword +unformat_vlib_number_by_name (unformat_input_t * input, va_list * args) +{ + uword *hash = va_arg (*args, uword *); + int *result = va_arg (*args, int *); + uword *p; + u8 *token; + int i; + + if (!unformat_user (input, unformat_token, "a-zA-Z0-9_", &token)) + return 0; + + /* Null terminate. */ + if (vec_len (token) > 0 && token[vec_len (token) - 1] != 0) + vec_add1 (token, 0); + + /* Check for exact match. */ + p = hash_get_mem (hash, token); + if (p) + goto done; + + /* Convert to upper case & try match. 
*/ + for (i = 0; i < vec_len (token); i++) + if (token[i] >= 'a' && token[i] <= 'z') + token[i] = 'A' + token[i] - 'a'; + p = hash_get_mem (hash, token); + +done: + vec_free (token); + if (p) + *result = p[0]; + return p != 0; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/format_funcs.h b/src/vlib/format_funcs.h new file mode 100644 index 00000000000..f60b8940d14 --- /dev/null +++ b/src/vlib/format_funcs.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * format_funcs.h: VLIB formatting/unformating + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_format_h +#define included_vlib_format_h + +/* Format vlib_rx_or_tx_t/vlib_read_or_write_t enum as string. */ +u8 *format_vlib_rx_tx (u8 * s, va_list * args); +u8 *format_vlib_read_write (u8 * s, va_list * args); + +/* Formats buffer data as printable ascii or as hex. */ +u8 *format_vlib_buffer_data (u8 * s, va_list * args); + +/* Enable/on => 1; disable/off => 0. */ +uword unformat_vlib_enable_disable (unformat_input_t * input, va_list * args); + +/* rx/tx => VLIB_RX/VLIB_TX. */ +uword unformat_vlib_rx_tx (unformat_input_t * input, va_list * args); + +/* Parse a-zA-Z0-9_ token and hash to value. */ +uword unformat_vlib_number_by_name (unformat_input_t * input, va_list * args); + +/* Parse an int either %d or 0x%x. */ +uword unformat_vlib_number (unformat_input_t * input, va_list * args); + +/* Flag to format_vlib_*_header functions to tell them not to recurse + into the next layer's header. For example, tells format_vlib_ethernet_header + not to format ip header. */ +#define FORMAT_VLIB_HEADER_NO_RECURSION (~0) + +#endif /* included_vlib_format_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/global_funcs.h b/src/vlib/global_funcs.h new file mode 100644 index 00000000000..bbdbdef50b2 --- /dev/null +++ b/src/vlib/global_funcs.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * global_funcs.h: global data structure access functions + */ + +#ifndef included_vlib_global_funcs_h_ +#define included_vlib_global_funcs_h_ + +always_inline vlib_main_t * +vlib_get_main (void) +{ + vlib_main_t *vm; + vm = vlib_mains ? vlib_mains[os_get_cpu_number ()] : &vlib_global_main; + ASSERT (vm); + return vm; +} + +always_inline vlib_thread_main_t * +vlib_get_thread_main () +{ + return &vlib_thread_main; +} + +#endif /* included_vlib_global_funcs_h_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/i2c.c b/src/vlib/i2c.c new file mode 100644 index 00000000000..97f5bb21cc7 --- /dev/null +++ b/src/vlib/i2c.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vlib/vlib.h> +#include <vlib/i2c.h> + +static inline void +i2c_delay (i2c_bus_t * b, f64 timeout) +{ + vlib_main_t *vm = vlib_get_main (); + vlib_time_wait (vm, timeout); +} + +static void +i2c_wait_for_scl (i2c_bus_t * b) +{ + f64 t = 0; + + while (t < b->hold_time) + { + int sda, scl; + i2c_delay (b, b->rise_fall_time); + b->get_bits (b, &scl, &sda); + + if (scl) + return; + + t += b->rise_fall_time; + } + b->timeout = 1; +} + +static void +i2c_start (i2c_bus_t * b) +{ + b->timeout = 0; + + b->put_bits (b, 1, 1); + i2c_wait_for_scl (b); + + if (vlib_i2c_bus_timed_out (b)) + return; + + b->put_bits (b, 1, 0); + i2c_delay (b, b->hold_time); + b->put_bits (b, 0, 0); + i2c_delay (b, b->hold_time); +} + +static void +i2c_stop (i2c_bus_t * b) +{ + b->put_bits (b, 0, 0); + i2c_delay (b, b->rise_fall_time); + + b->put_bits (b, 1, 0); + i2c_delay (b, b->hold_time); + + b->put_bits (b, 1, 1); + i2c_delay (b, b->hold_time); +} + +static void +i2c_write_bit (i2c_bus_t * b, int sda) +{ + b->put_bits (b, 0, sda); + i2c_delay (b, b->rise_fall_time); + + b->put_bits (b, 1, sda); + i2c_wait_for_scl (b); + i2c_delay (b, b->hold_time); + + b->put_bits (b, 0, sda); + i2c_delay (b, b->rise_fall_time); +} + +static void +i2c_read_bit (i2c_bus_t * b, int *sda) +{ + int scl; + + b->put_bits (b, 1, 1); + i2c_wait_for_scl (b); + i2c_delay (b, b->hold_time); + + b->get_bits (b, &scl, sda); + + b->put_bits (b, 0, 1); + i2c_delay (b, b->rise_fall_time); +} + +static void +i2c_write_byte (i2c_bus_t * b, u8 data) +{ + int i, sda; + + for (i = 7; i >= 0; i--) + { + i2c_write_bit (b, (data >> i) & 1); + if (b->timeout) + return; + } + + b->put_bits (b, 0, 1); + i2c_delay (b, b->rise_fall_time); + + i2c_read_bit (b, &sda); + + if (sda) + b->timeout = 1; +} + + +static void +i2c_read_byte (i2c_bus_t * b, u8 * data, int ack) +{ + int i, sda; + + *data = 0; + + b->put_bits (b, 0, 1); + i2c_delay (b, b->rise_fall_time); + + for (i = 7; i >= 0; i--) + { + i2c_read_bit (b, &sda); + if 
(b->timeout) + return; + + *data |= (sda != 0) << i; + } + + i2c_write_bit (b, ack == 0); +} + + +void +vlib_i2c_init (i2c_bus_t * b) +{ + f64 tick; + if (!b->clock) + b->clock = 400000; + + tick = 1.0 / b->clock; + + /* Spend 40% of time in low and high states */ + if (!b->hold_time) + b->hold_time = 0.4 * tick; + + /* Spend 10% of time waiting for rise and fall */ + if (!b->rise_fall_time) + b->rise_fall_time = 0.1 * tick; +} + +void +vlib_i2c_xfer (i2c_bus_t * bus, i2c_msg_t * msgs) +{ + i2c_msg_t *msg; + int i; + + vec_foreach (msg, msgs) + { + i2c_start (bus); + i2c_write_byte (bus, + (msg->addr << 1) + (msg->flags == I2C_MSG_FLAG_READ)); + + if (msg->flags & I2C_MSG_FLAG_READ) + for (i = 0; i < msg->len; i++) + { + i2c_read_byte (bus, &msg->buffer[i], /* ack */ i + 1 != msg->len); + if (bus->timeout) + goto done; + } + + else + for (i = 0; i < msg->len; i++) + { + i2c_write_byte (bus, msg->buffer[i]); + if (bus->timeout) + goto done; + } + } + +done: + i2c_stop (bus); +} + +void +vlib_i2c_read_eeprom (i2c_bus_t * bus, u8 i2c_addr, u16 start_addr, + u16 length, u8 * data) +{ + i2c_msg_t *msg = 0; + u8 start_address[1]; + + vec_validate (msg, 1); + + start_address[0] = start_addr; + msg[0].addr = i2c_addr; + msg[0].flags = I2C_MSG_FLAG_WRITE; + msg[0].buffer = (u8 *) & start_address; + msg[0].len = 1; + + msg[1].addr = i2c_addr; + msg[1].flags = I2C_MSG_FLAG_READ; + msg[1].buffer = data; + msg[1].len = length; + + vlib_i2c_xfer (bus, msg); + + vec_free (msg); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/i2c.h b/src/vlib/i2c.h new file mode 100644 index 00000000000..b79bdc75b81 --- /dev/null +++ b/src/vlib/i2c.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef included_vlib_i2c_h
+#define included_vlib_i2c_h
+
+#include <vppinfra/types.h>
+
+
+/* Direction of a message, stored in i2c_msg_t.flags. */
+#define I2C_MSG_FLAG_WRITE 0
+#define I2C_MSG_FLAG_READ 1
+
+/* One message of a transfer. */
+typedef struct
+{
+  /* Device address; i2c.c shifts it left by one and adds the read bit
+     to form the address byte. */
+  u8 addr;
+  /* I2C_MSG_FLAG_WRITE or I2C_MSG_FLAG_READ. */
+  u8 flags;
+  /* Number of bytes to read into / write from buffer. */
+  u16 len;
+  u8 *buffer;
+} i2c_msg_t;
+
+/* A bit-banged bus: the owner supplies the two line accessors, i2c.c
+   drives the protocol through them. */
+typedef struct i2c_bus_t
+{
+  /* Drive / sample the SCL and SDA lines. */
+  void (*put_bits) (struct i2c_bus_t * b, int scl, int sda);
+  void (*get_bits) (struct i2c_bus_t * b, int *scl, int *sda);
+
+  /* Set to 1 by i2c.c when SCL stays low too long or a byte is not
+     acknowledged; cleared at each START. */
+  int timeout;
+  /* Bus clock; vlib_i2c_init defaults it to 400000 when zero
+     (presumably Hz -- confirm). */
+  u32 clock;
+  /* Delays handed to vlib_time_wait; vlib_i2c_init defaults them to
+     0.4 and 0.1 of one clock period when zero. */
+  f64 hold_time;
+  f64 rise_fall_time;
+
+  /* Private data */
+  uword private_data;
+
+} i2c_bus_t;
+
+void vlib_i2c_init (i2c_bus_t * bus);
+void vlib_i2c_xfer (i2c_bus_t * bus, i2c_msg_t * msgs);
+void vlib_i2c_read_eeprom (i2c_bus_t * bus, u8 i2c_addr, u16 start_addr,
+                           u16 length, u8 * data);
+
+/* Non-zero when the bus timeout flag is set. */
+static inline int
+vlib_i2c_bus_timed_out (i2c_bus_t * bus)
+{
+  return bus->timeout;
+}
+
+#endif /* included_vlib_i2c_h */
+
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/init.c b/src/vlib/init.c
new file mode 100644
index 00000000000..8d4784513ab
--- /dev/null
+++ b/src/vlib/init.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * init.c: mechanism for functions to be called at init/exit. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vlib/vlib.h> + +clib_error_t * +vlib_call_init_exit_functions (vlib_main_t * vm, + _vlib_init_function_list_elt_t * head, + int call_once) +{ + clib_error_t *error = 0; + _vlib_init_function_list_elt_t *i; + + i = head; + while (i) + { + if (call_once && !hash_get (vm->init_functions_called, i->f)) + { + if (call_once) + hash_set1 (vm->init_functions_called, i->f); + error = i->f (vm); + if (error) + return error; + } + i = i->next_init_function; + } + return error; +} + +clib_error_t * +vlib_call_all_init_functions (vlib_main_t * vm) +{ + /* Call dummy functions to make sure purely static modules are + linked in. */ +#define _(f) vlib_##f##_reference (); + foreach_vlib_module_reference; +#undef _ + + return vlib_call_init_exit_functions + (vm, vm->init_function_registrations, 1 /* call_once */ ); +} + +clib_error_t * +vlib_call_all_main_loop_enter_functions (vlib_main_t * vm) +{ + return vlib_call_init_exit_functions + (vm, vm->main_loop_enter_function_registrations, 1 /* call_once */ ); +} + +clib_error_t * +vlib_call_all_main_loop_exit_functions (vlib_main_t * vm) +{ + return vlib_call_init_exit_functions + (vm, vm->main_loop_exit_function_registrations, 1 /* call_once */ ); +} + +clib_error_t * +vlib_call_all_config_functions (vlib_main_t * vm, + unformat_input_t * input, int is_early) +{ + clib_error_t *error = 0; + vlib_config_function_runtime_t *c, **all; + uword *hash = 0, *p; + uword i; + + hash = hash_create_string (0, sizeof (uword)); + all = 0; + + c = vm->config_function_registrations; + + while (c) + { + hash_set_mem (hash, c->name, vec_len (all)); + vec_add1 (all, c); + unformat_init (&c->input, 0, 0); + c = c->next_registration; + } + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + u8 *s, *v; + + if (!unformat (input, "%s %v", &s, &v) || !(p = hash_get_mem (hash, s))) + { + error = clib_error_create ("unknown input `%s %v'", s, v); + goto done; + } + + c = all[p[0]]; + if (vec_len (c->input.buffer) > 0) + 
vec_add1 (c->input.buffer, ' '); + vec_add (c->input.buffer, v, vec_len (v)); + vec_free (v); + vec_free (s); + } + + for (i = 0; i < vec_len (all); i++) + { + c = all[i]; + + /* Is this an early config? Are we doing early configs? */ + if (is_early ^ c->is_early) + continue; + + /* Already called? */ + if (hash_get (vm->init_functions_called, c->function)) + continue; + hash_set1 (vm->init_functions_called, c->function); + + error = c->function (vm, &c->input); + if (error) + goto done; + } + +done: + for (i = 0; i < vec_len (all); i++) + { + c = all[i]; + unformat_free (&c->input); + } + vec_free (all); + hash_free (hash); + return error; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/init.h b/src/vlib/init.h new file mode 100644 index 00000000000..4fa5b304590 --- /dev/null +++ b/src/vlib/init.h @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * init.h: mechanism for functions to be called at init/exit. 
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef included_vlib_init_h
+#define included_vlib_init_h
+
+#include <vppinfra/error.h>
+#include <vppinfra/format.h>
+#include <vppinfra/hash.h>
+
+/* Init/exit functions: called at start/end of main routine.  Init
+   functions are typically used to register and setup packet
+   processing nodes. */
+
+typedef clib_error_t *(vlib_init_function_t) (struct vlib_main_t * vm);
+
+/* One element of a singly linked list of registered init-style
+   functions (init, main_loop_enter, main_loop_exit). */
+typedef struct _vlib_init_function_list_elt
+{
+  struct _vlib_init_function_list_elt *next_init_function;
+  vlib_init_function_t *f;
+} _vlib_init_function_list_elt_t;
+
+/* Configuration functions: called with configuration input just before
+   main polling loop starts. */
+typedef clib_error_t *(vlib_config_function_t) (struct vlib_main_t * vm,
+                                                unformat_input_t * input);
+
+typedef struct vlib_config_function_runtime_t
+{
+  /* Function to call.  Whether it has already been called is tracked in
+     vm->init_functions_called, keyed by this pointer (see
+     vlib_call_config_function below); nothing in this header sets the
+     pointer to null. */
+  vlib_config_function_t *function;
+
+  /* Input for function. */
+  unformat_input_t input;
+
+  /* next config function registration */
+  struct vlib_config_function_runtime_t *next_registration;
+
+  /* To be invoked as soon as the clib heap is available */
+  u8 is_early;
+
+  /* Name used to distinguish input on command line. */
+  char name[32];
+} vlib_config_function_runtime_t;
+
+/* Name of the global symbol generated for function x of the given
+   registration type: _vlib_<type>_function_<x>. */
+#define _VLIB_INIT_FUNCTION_SYMBOL(x, type) \
+  _vlib_##type##_function_##x
+
+#define VLIB_INIT_FUNCTION_SYMBOL(x) \
+  _VLIB_INIT_FUNCTION_SYMBOL(x, init)
+#define VLIB_MAIN_LOOP_ENTER_FUNCTION_SYMBOL(x) \
+  _VLIB_INIT_FUNCTION_SYMBOL(x, main_loop_enter)
+#define VLIB_MAIN_LOOP_EXIT_FUNCTION_SYMBOL(x) \
+  _VLIB_INIT_FUNCTION_SYMBOL(x, main_loop_exit)
+#define VLIB_CONFIG_FUNCTION_SYMBOL(x) \
+  _VLIB_INIT_FUNCTION_SYMBOL(x, config)
+
+/* Declaration is global (i.e. not static) so that init functions can
+   be called from other modules to resolve init function dependencies.
+
+   Besides the global function pointer, the macro emits a constructor
+   that pushes a static list element for x onto the head of
+   vm-><tag>_function_registrations. */
+
+#define VLIB_DECLARE_INIT_FUNCTION(x, tag) \
+vlib_init_function_t * _VLIB_INIT_FUNCTION_SYMBOL (x, tag) = x; \
+static void __vlib_add_##tag##_function_##x (void) \
+  __attribute__((__constructor__)) ; \
+static void __vlib_add_##tag##_function_##x (void) \
+{ \
+  vlib_main_t * vm = vlib_get_main(); \
+  static _vlib_init_function_list_elt_t _vlib_init_function; \
+  _vlib_init_function.next_init_function \
+    = vm->tag##_function_registrations; \
+  vm->tag##_function_registrations = &_vlib_init_function; \
+  _vlib_init_function.f = &x; \
+}
+
+#define VLIB_INIT_FUNCTION(x) VLIB_DECLARE_INIT_FUNCTION(x,init)
+
+#define VLIB_MAIN_LOOP_ENTER_FUNCTION(x) \
+  VLIB_DECLARE_INIT_FUNCTION(x,main_loop_enter)
+#define VLIB_MAIN_LOOP_EXIT_FUNCTION(x) \
+VLIB_DECLARE_INIT_FUNCTION(x,main_loop_exit)
+
+/* Register config function x under the command line name n.  Emits the
+   runtime structure for x (is_early = 0) and a constructor that pushes
+   it onto the head of vm->config_function_registrations.
+   NOTE(review): the optional trailing arguments are applied to the
+   forward declaration only, not to the initialized definition at the
+   end of the macro; passing `static' would therefore declare the same
+   object with two different linkages.  Confirm the intended use before
+   relying on it. */
+#define VLIB_CONFIG_FUNCTION(x,n,...) \
+  __VA_ARGS__ vlib_config_function_runtime_t \
+  VLIB_CONFIG_FUNCTION_SYMBOL(x); \
+static void __vlib_add_config_function_##x (void) \
+  __attribute__((__constructor__)) ; \
+static void __vlib_add_config_function_##x (void) \
+{ \
+  vlib_main_t * vm = vlib_get_main(); \
+  VLIB_CONFIG_FUNCTION_SYMBOL(x).next_registration \
+    = vm->config_function_registrations; \
+  vm->config_function_registrations \
+    = &VLIB_CONFIG_FUNCTION_SYMBOL(x); \
+} \
+  vlib_config_function_runtime_t \
+  VLIB_CONFIG_FUNCTION_SYMBOL (x) \
+  = { \
+    .name = n, \
+    .function = x, \
+    .is_early = 0, \
+  }
+
+/* Same as VLIB_CONFIG_FUNCTION, except that the registration is marked
+   is_early = 1. */
+#define VLIB_EARLY_CONFIG_FUNCTION(x,n,...) \
+  __VA_ARGS__ vlib_config_function_runtime_t \
+  VLIB_CONFIG_FUNCTION_SYMBOL(x); \
+static void __vlib_add_config_function_##x (void) \
+  __attribute__((__constructor__)) ; \
+static void __vlib_add_config_function_##x (void) \
+{ \
+  vlib_main_t * vm = vlib_get_main(); \
+  VLIB_CONFIG_FUNCTION_SYMBOL(x).next_registration \
+    = vm->config_function_registrations; \
+  vm->config_function_registrations \
+    = &VLIB_CONFIG_FUNCTION_SYMBOL(x); \
+} \
+  vlib_config_function_runtime_t \
+  VLIB_CONFIG_FUNCTION_SYMBOL (x) \
+  = { \
+    .name = n, \
+    .function = x, \
+    .is_early = 1, \
+  }
+
+/* Call given init function: used for init function dependencies.
+   The function is called at most once: it is recorded in
+   vm->init_functions_called before the call, and skipped (yielding a
+   null error) when already recorded. */
+#define vlib_call_init_function(vm, x) \
+  ({ \
+    extern vlib_init_function_t * VLIB_INIT_FUNCTION_SYMBOL (x); \
+    vlib_init_function_t * _f = VLIB_INIT_FUNCTION_SYMBOL (x); \
+    clib_error_t * _error = 0; \
+    if (! hash_get (vm->init_functions_called, _f)) \
+      { \
+        hash_set1 (vm->init_functions_called, _f); \
+        _error = _f (vm); \
+      } \
+    _error; \
+  })
+
+/* Don't call given init function: used to suppress parts of the netstack */
+#define vlib_mark_init_function_complete(vm, x) \
+  ({ \
+    extern vlib_init_function_t * VLIB_INIT_FUNCTION_SYMBOL (x); \
+    vlib_init_function_t * _f = VLIB_INIT_FUNCTION_SYMBOL (x); \
+    hash_set1 (vm->init_functions_called, _f); \
+  })
+
+/* NOTE(review): VLIB_POST_GRAPH_INIT_FUNCTION_SYMBOL is not defined
+   anywhere in this header, so this macro only expands successfully if
+   another header provides it -- confirm, or remove the macro. */
+#define vlib_call_post_graph_init_function(vm, x) \
+  ({ \
+    extern vlib_init_function_t * VLIB_POST_GRAPH_INIT_FUNCTION_SYMBOL (x); \
+    vlib_init_function_t * _f = VLIB_POST_GRAPH_INIT_FUNCTION_SYMBOL (x); \
+    clib_error_t * _error = 0; \
+    if (! hash_get (vm->init_functions_called, _f)) \
+      { \
+        hash_set1 (vm->init_functions_called, _f); \
+        _error = _f (vm); \
+      } \
+    _error; \
+  })
+
+/* Call given config function with its own accumulated input, at most
+   once (tracked in vm->init_functions_called like init functions). */
+#define vlib_call_config_function(vm, x) \
+  ({ \
+    vlib_config_function_runtime_t * _r; \
+    clib_error_t * _error = 0; \
+    extern vlib_config_function_runtime_t \
+      VLIB_CONFIG_FUNCTION_SYMBOL (x); \
+ \
+    _r = &VLIB_CONFIG_FUNCTION_SYMBOL (x); \
+    if (! hash_get (vm->init_functions_called, _r->function)) \
+      { \
+        hash_set1 (vm->init_functions_called, _r->function); \
+        _error = _r->function (vm, &_r->input); \
+      } \
+    _error; \
+  })
+
+/* External functions. */
+clib_error_t *vlib_call_all_init_functions (struct vlib_main_t *vm);
+clib_error_t *vlib_call_all_config_functions (struct vlib_main_t *vm,
+                                              unformat_input_t * input,
+                                              int is_early);
+clib_error_t *vlib_call_all_main_loop_enter_functions (struct vlib_main_t
+                                                       *vm);
+clib_error_t *vlib_call_all_main_loop_exit_functions (struct vlib_main_t *vm);
+clib_error_t *vlib_call_init_exit_functions (struct vlib_main_t *vm,
+                                             _vlib_init_function_list_elt_t *
+                                             head, int call_once);
+
+/* Modules that get a dummy vlib_<module>_reference () function. */
+#define foreach_vlib_module_reference \
+  _ (node_cli) \
+  _ (trace_cli)
+
+/* Dummy function to get node_cli.c linked in. */
+#define _(x) void vlib_##x##_reference (void);
+foreach_vlib_module_reference
+#undef _
+#endif /* included_vlib_init_h */
+/*
+ * fd.io coding-style-patch-verification: ON
+ *
+ * Local Variables:
+ * eval: (c-set-style "gnu")
+ * End:
+ */
diff --git a/src/vlib/lex.c b/src/vlib/lex.c
new file mode 100644
index 00000000000..1cc8f1678d2
--- /dev/null
+++ b/src/vlib/lex.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#include <vlib/vlib.h> +#include <vlib/lex.h> + +vlib_lex_main_t vlib_lex_main; + +#define LEX_DEBUG 0 + +u8 * +format_vlib_lex_token (u8 * s, va_list * args) +{ + vlib_lex_main_t *lm = va_arg (*args, vlib_lex_main_t *); + vlib_lex_token_t *t = va_arg (*args, vlib_lex_token_t *); + + if (t->token == VLIB_LEX_word) + s = format (s, "%s", t->value.as_pointer); + else + s = format (s, "%s", lm->lex_token_names[t->token]); + return s; +} + +void +vlib_lex_get_token (vlib_lex_main_t * lm, vlib_lex_token_t * rv) +{ + u8 c; + vlib_lex_table_t *t; + vlib_lex_table_entry_t *e; + uword tv; + + if (PREDICT_FALSE (lm->pushback_sp >= 0)) + { + rv[0] = lm->pushback_vector[lm->pushback_sp--]; + return; + } + + rv->value.as_uword = ~0; + + while (1) + { + if (PREDICT_FALSE (lm->current_index >= vec_len (lm->input_vector))) + { + rv->token = VLIB_LEX_eof; + return; + } + + t = vec_elt_at_index (lm->lex_tables, lm->current_table_index); + c = (lm->input_vector[lm->current_index++]) & 0x7f; + e = &t->entries[c]; + lm->current_table_index = e->next_table_index; + + switch (e->action) + { + case VLIB_LEX_IGNORE: + continue; + + case VLIB_LEX_START_NUMBER: + lm->current_token_value = 0; + /* fallthru */ + + case VLIB_LEX_ADD_TO_NUMBER: + lm->current_number_base = e->token; + lm->current_token_value *= lm->current_number_base; + tv = c - '0'; + if (tv >= lm->current_number_base) + { + tv = 10 + c - 'A'; + if (tv >= lm->current_number_base) + tv = 10 + c - 'a'; + } + lm->current_token_value += tv; + continue; + + case VLIB_LEX_ADD_TO_TOKEN: + vec_add1 (lm->token_buffer, c); + continue; + + case VLIB_LEX_KEYWORD_CHECK: + { + uword *p; + + vec_add1 (lm->token_buffer, 0); + + /* It's either a keyword or just a word. 
*/ + p = hash_get_mem (lm->lex_keywords, lm->token_buffer); + if (p) + { + rv->token = p[0]; + if (LEX_DEBUG > 0) + clib_warning ("keyword '%s' token %s", + lm->token_buffer, + lm->lex_token_names[rv->token]); + } + else + { + /* it's a WORD */ + rv->token = VLIB_LEX_word; + rv->value.as_pointer = vec_dup (lm->token_buffer); + if (LEX_DEBUG > 0) + clib_warning ("%s, value '%s'", + lm->lex_token_names[VLIB_LEX_word], + rv->value.as_pointer); + } + _vec_len (lm->token_buffer) = 0; + + /* Rescan the character which terminated the keyword/word. */ + lm->current_index--; + return; + } + + case VLIB_LEX_RETURN_AND_RESCAN: + ASSERT (lm->current_index); + lm->current_index--; + /* note flow-through */ + + case VLIB_LEX_RETURN: + rv->token = e->token; + rv->value.as_uword = lm->current_token_value; + lm->current_token_value = ~0; + if (LEX_DEBUG > 0) + { + clib_warning + ("table %s char '%c'(0x%02x) next table %s return %s", + t->name, c, c, lm->lex_tables[e->next_table_index].name, + lm->lex_token_names[e->token]); + if (rv->token == VLIB_LEX_number) + clib_warning (" numeric value 0x%x (%d)", rv->value, + rv->value); + } + return; + } + } +} + +u16 +vlib_lex_add_token (vlib_lex_main_t * lm, char *token_name) +{ + uword *p; + u16 rv; + + p = hash_get_mem (lm->lex_tokens_by_name, token_name); + + if (p) + return p[0]; + + rv = vec_len (lm->lex_token_names); + hash_set_mem (lm->lex_tokens_by_name, token_name, rv); + vec_add1 (lm->lex_token_names, token_name); + + return rv; +} + +static u16 +add_keyword (vlib_lex_main_t * lm, char *keyword, char *token_name) +{ + uword *p; + u16 token; + + p = hash_get_mem (lm->lex_keywords, keyword); + + ASSERT (p == 0); + + token = vlib_lex_add_token (lm, token_name); + + hash_set_mem (lm->lex_keywords, keyword, token); + return token; +} + +u16 +vlib_lex_find_or_add_keyword (vlib_lex_main_t * lm, char *keyword, + char *token_name) +{ + uword *p = hash_get_mem (lm->lex_keywords, keyword); + return p ? 
p[0] : add_keyword (lm, keyword, token_name); +} + +void +vlib_lex_set_action_range (u32 table_index, u8 lo, u8 hi, u16 action, + u16 token, u32 next_table_index) +{ + int i; + vlib_lex_main_t *lm = &vlib_lex_main; + vlib_lex_table_t *t = pool_elt_at_index (lm->lex_tables, table_index); + + for (i = lo; i <= hi; i++) + { + ASSERT (i < ARRAY_LEN (t->entries)); + t->entries[i].action = action; + t->entries[i].token = token; + t->entries[i].next_table_index = next_table_index; + } +} + +u16 +vlib_lex_add_table (char *name) +{ + vlib_lex_main_t *lm = &vlib_lex_main; + vlib_lex_table_t *t; + uword *p; + + p = hash_get_mem (lm->lex_tables_by_name, name); + + ASSERT (p == 0); + + pool_get_aligned (lm->lex_tables, t, CLIB_CACHE_LINE_BYTES); + + t->name = name; + + hash_set_mem (lm->lex_tables_by_name, name, t - lm->lex_tables); + + vlib_lex_set_action_range (t - lm->lex_tables, 1, 0x7F, VLIB_LEX_IGNORE, ~0, + t - lm->lex_tables); + + vlib_lex_set_action_range (t - lm->lex_tables, 0, 0, VLIB_LEX_RETURN, + VLIB_LEX_eof, t - lm->lex_tables); + + return t - lm->lex_tables; +} + +void +vlib_lex_reset (vlib_lex_main_t * lm, u8 * input_vector) +{ + if (lm->pushback_vector) + _vec_len (lm->pushback_vector) = 0; + lm->pushback_sp = -1; + + lm->input_vector = input_vector; + lm->current_index = 0; +} + +static clib_error_t * +lex_onetime_init (vlib_main_t * vm) +{ + vlib_lex_main_t *lm = &vlib_lex_main; + + lm->lex_tables_by_name = hash_create_string (0, sizeof (uword)); + lm->lex_tokens_by_name = hash_create_string (0, sizeof (uword)); + lm->lex_keywords = hash_create_string (0, sizeof (uword)); + lm->pushback_sp = -1; + +#define _(f) { u16 tmp = vlib_lex_add_token (lm, #f); ASSERT (tmp == VLIB_LEX_##f); } + foreach_vlib_lex_global_token; +#undef _ + + vec_validate (lm->token_buffer, 127); + _vec_len (lm->token_buffer) = 0; + + return 0; +} + +VLIB_INIT_FUNCTION (lex_onetime_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style 
"gnu") + * End: + */ diff --git a/src/vlib/lex.h b/src/vlib/lex.h new file mode 100644 index 00000000000..4ae58f468c1 --- /dev/null +++ b/src/vlib/lex.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_vlib_lex_h +#define included_vlib_lex_h + +#include <vppinfra/hash.h> +#include <vppinfra/bitmap.h> +#include <vppinfra/error.h> +#include <vppinfra/pool.h> + +#define foreach_vlib_lex_global_token \ + _ (invalid) \ + _ (eof) \ + _ (word) \ + _ (number) \ + _ (lt) \ + _ (gt) \ + _ (dot) \ + _ (slash) \ + _ (qmark) \ + _ (equals) \ + _ (plus) \ + _ (minus) \ + _ (star) \ + _ (lpar) \ + _ (rpar) + +typedef enum +{ +#define _(f) VLIB_LEX_##f, + foreach_vlib_lex_global_token +#undef _ +} vlib_lex_global_token_t; + +typedef enum +{ + VLIB_LEX_IGNORE, + VLIB_LEX_ADD_TO_TOKEN, + VLIB_LEX_RETURN, + VLIB_LEX_RETURN_AND_RESCAN, + VLIB_LEX_KEYWORD_CHECK, + VLIB_LEX_START_NUMBER, + VLIB_LEX_ADD_TO_NUMBER, +} vlib_lex_action_t; + +typedef struct +{ + u16 action; + u16 next_table_index; + u16 token; +} vlib_lex_table_entry_t; + +typedef struct +{ + char *name; + vlib_lex_table_entry_t entries[128]; +} vlib_lex_table_t; + +typedef struct +{ + u32 token; + + union + { + uword as_uword; + void *as_pointer; + char *as_string; + } value; +} vlib_lex_token_t; + +typedef struct +{ + vlib_lex_table_t *lex_tables; + uword *lex_tables_by_name; + + /* Vector of token strings. 
*/ + char **lex_token_names; + + /* Hash mapping c string name to token index. */ + uword *lex_tokens_by_name; + + /* Hash mapping c string keyword name to token index. */ + uword *lex_keywords; + + vlib_lex_token_t *pushback_vector; + + i32 pushback_sp; + + u32 current_table_index; + + uword current_token_value; + + uword current_number_base; + + /* Input string we are lex-ing. */ + u8 *input_vector; + + /* Current index into input vector. */ + u32 current_index; + + /* Re-used vector for forming token strings and hashing them. */ + u8 *token_buffer; +} vlib_lex_main_t; + +vlib_lex_main_t vlib_lex_main; + +always_inline void +vlib_lex_cleanup_token (vlib_lex_token_t * t) +{ + if (t->token == VLIB_LEX_word) + { + u8 *tv = t->value.as_pointer; + vec_free (tv); + } +} + +u16 vlib_lex_add_table (char *name); +void vlib_lex_get_token (vlib_lex_main_t * lm, vlib_lex_token_t * result); +u16 vlib_lex_add_token (vlib_lex_main_t * lm, char *token_name); +void vlib_lex_set_action_range (u32 table_index, u8 lo, u8 hi, u16 action, + u16 token, u32 next_table_index); +void vlib_lex_reset (vlib_lex_main_t * lm, u8 * input_vector); +format_function_t format_vlib_lex_token; + +#endif /* included_vlib_lex_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/main.c b/src/vlib/main.c new file mode 100644 index 00000000000..6c6cad98bba --- /dev/null +++ b/src/vlib/main.c @@ -0,0 +1,1703 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * main.c: main vector processing loop + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <math.h> +#include <vppinfra/format.h> +#include <vlib/vlib.h> +#include <vlib/threads.h> + +#include <vlib/unix/cj.h> + +CJ_GLOBAL_LOG_PROTOTYPE; + +/* Actually allocate a few extra slots of vector data to support + speculative vector enqueues which overflow vector data in next frame. */ +#define VLIB_FRAME_SIZE_ALLOC (VLIB_FRAME_SIZE + 4) + +u32 wraps; + +always_inline u32 +vlib_frame_bytes (u32 n_scalar_bytes, u32 n_vector_bytes) +{ + u32 n_bytes; + + /* Make room for vlib_frame_t plus scalar arguments. */ + n_bytes = vlib_frame_vector_byte_offset (n_scalar_bytes); + + /* Make room for vector arguments. 
+ Allocate a few extra slots of vector data to support + speculative vector enqueues which overflow vector data in next frame. */ +#define VLIB_FRAME_SIZE_EXTRA 4 + n_bytes += (VLIB_FRAME_SIZE + VLIB_FRAME_SIZE_EXTRA) * n_vector_bytes; + + /* Magic number is first 32bit number after vector data. + Used to make sure that vector data is never overrun. */ +#define VLIB_FRAME_MAGIC (0xabadc0ed) + n_bytes += sizeof (u32); + + /* Pad to cache line. */ + n_bytes = round_pow2 (n_bytes, CLIB_CACHE_LINE_BYTES); + + return n_bytes; +} + +always_inline u32 * +vlib_frame_find_magic (vlib_frame_t * f, vlib_node_t * node) +{ + void *p = f; + + p += vlib_frame_vector_byte_offset (node->scalar_size); + + p += (VLIB_FRAME_SIZE + VLIB_FRAME_SIZE_EXTRA) * node->vector_size; + + return p; +} + +static vlib_frame_size_t * +get_frame_size_info (vlib_node_main_t * nm, + u32 n_scalar_bytes, u32 n_vector_bytes) +{ + uword key = (n_scalar_bytes << 16) | n_vector_bytes; + uword *p, i; + + p = hash_get (nm->frame_size_hash, key); + if (p) + i = p[0]; + else + { + i = vec_len (nm->frame_sizes); + vec_validate (nm->frame_sizes, i); + hash_set (nm->frame_size_hash, key, i); + } + + return vec_elt_at_index (nm->frame_sizes, i); +} + +static u32 +vlib_frame_alloc_to_node (vlib_main_t * vm, u32 to_node_index, + u32 frame_flags) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_frame_size_t *fs; + vlib_node_t *to_node; + vlib_frame_t *f; + u32 fi, l, n, scalar_size, vector_size; + + to_node = vlib_get_node (vm, to_node_index); + + scalar_size = to_node->scalar_size; + vector_size = to_node->vector_size; + + fs = get_frame_size_info (nm, scalar_size, vector_size); + n = vlib_frame_bytes (scalar_size, vector_size); + if ((l = vec_len (fs->free_frame_indices)) > 0) + { + /* Allocate from end of free list. 
*/ + fi = fs->free_frame_indices[l - 1]; + f = vlib_get_frame_no_check (vm, fi); + _vec_len (fs->free_frame_indices) = l - 1; + } + else + { + f = clib_mem_alloc_aligned_no_fail (n, VLIB_FRAME_ALIGN); + f->cpu_index = vm->cpu_index; + fi = vlib_frame_index_no_check (vm, f); + } + + /* Poison frame when debugging. */ + if (CLIB_DEBUG > 0) + { + u32 save_cpu_index = f->cpu_index; + + memset (f, 0xfe, n); + + f->cpu_index = save_cpu_index; + } + + /* Insert magic number. */ + { + u32 *magic; + + magic = vlib_frame_find_magic (f, to_node); + *magic = VLIB_FRAME_MAGIC; + } + + f->flags = VLIB_FRAME_IS_ALLOCATED | frame_flags; + f->n_vectors = 0; + f->scalar_size = scalar_size; + f->vector_size = vector_size; + + fs->n_alloc_frames += 1; + + return fi; +} + +/* Allocate a frame for from FROM_NODE to TO_NODE via TO_NEXT_INDEX. + Returns frame index. */ +static u32 +vlib_frame_alloc (vlib_main_t * vm, vlib_node_runtime_t * from_node_runtime, + u32 to_next_index) +{ + vlib_node_t *from_node; + + from_node = vlib_get_node (vm, from_node_runtime->node_index); + ASSERT (to_next_index < vec_len (from_node->next_nodes)); + + return vlib_frame_alloc_to_node (vm, from_node->next_nodes[to_next_index], + /* frame_flags */ 0); +} + +vlib_frame_t * +vlib_get_frame_to_node (vlib_main_t * vm, u32 to_node_index) +{ + u32 fi = vlib_frame_alloc_to_node (vm, to_node_index, + /* frame_flags */ + VLIB_FRAME_FREE_AFTER_DISPATCH); + return vlib_get_frame (vm, fi); +} + +void +vlib_put_frame_to_node (vlib_main_t * vm, u32 to_node_index, vlib_frame_t * f) +{ + vlib_pending_frame_t *p; + vlib_node_t *to_node; + + if (f->n_vectors == 0) + return; + + to_node = vlib_get_node (vm, to_node_index); + + vec_add2 (vm->node_main.pending_frames, p, 1); + + f->flags |= VLIB_FRAME_PENDING; + p->frame_index = vlib_frame_index (vm, f); + p->node_runtime_index = to_node->runtime_index; + p->next_frame_index = VLIB_PENDING_FRAME_NO_NEXT_FRAME; +} + +/* Free given frame. 
*/ +void +vlib_frame_free (vlib_main_t * vm, vlib_node_runtime_t * r, vlib_frame_t * f) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *node; + vlib_frame_size_t *fs; + u32 frame_index; + + ASSERT (f->flags & VLIB_FRAME_IS_ALLOCATED); + + node = vlib_get_node (vm, r->node_index); + fs = get_frame_size_info (nm, node->scalar_size, node->vector_size); + + frame_index = vlib_frame_index (vm, f); + + ASSERT (f->flags & VLIB_FRAME_IS_ALLOCATED); + + /* No next frames may point to freed frame. */ + if (CLIB_DEBUG > 0) + { + vlib_next_frame_t *nf; + vec_foreach (nf, vm->node_main.next_frames) + ASSERT (nf->frame_index != frame_index); + } + + f->flags &= ~VLIB_FRAME_IS_ALLOCATED; + + vec_add1 (fs->free_frame_indices, frame_index); + ASSERT (fs->n_alloc_frames > 0); + fs->n_alloc_frames -= 1; +} + +static clib_error_t * +show_frame_stats (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_frame_size_t *fs; + + vlib_cli_output (vm, "%=6s%=12s%=12s", "Size", "# Alloc", "# Free"); + vec_foreach (fs, nm->frame_sizes) + { + u32 n_alloc = fs->n_alloc_frames; + u32 n_free = vec_len (fs->free_frame_indices); + + if (n_alloc + n_free > 0) + vlib_cli_output (vm, "%=6d%=12d%=12d", + fs - nm->frame_sizes, n_alloc, n_free); + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_frame_stats_cli, static) = { + .path = "show vlib frame-allocation", + .short_help = "Show node dispatch frame statistics", + .function = show_frame_stats, +}; +/* *INDENT-ON* */ + +/* Change ownership of enqueue rights to given next node. */ +static void +vlib_next_frame_change_ownership (vlib_main_t * vm, + vlib_node_runtime_t * node_runtime, + u32 next_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_next_frame_t *next_frame; + vlib_node_t *node, *next_node; + + node = vec_elt (nm->nodes, node_runtime->node_index); + + /* Only internal & input nodes are allowed to call other nodes. 
*/ + ASSERT (node->type == VLIB_NODE_TYPE_INTERNAL + || node->type == VLIB_NODE_TYPE_INPUT + || node->type == VLIB_NODE_TYPE_PROCESS); + + ASSERT (vec_len (node->next_nodes) == node_runtime->n_next_nodes); + + next_frame = + vlib_node_runtime_get_next_frame (vm, node_runtime, next_index); + next_node = vec_elt (nm->nodes, node->next_nodes[next_index]); + + if (next_node->owner_node_index != VLIB_INVALID_NODE_INDEX) + { + /* Get frame from previous owner. */ + vlib_next_frame_t *owner_next_frame; + vlib_next_frame_t tmp; + + owner_next_frame = + vlib_node_get_next_frame (vm, + next_node->owner_node_index, + next_node->owner_next_index); + + /* Swap target next frame with owner's. */ + tmp = owner_next_frame[0]; + owner_next_frame[0] = next_frame[0]; + next_frame[0] = tmp; + + /* + * If next_frame is already pending, we have to track down + * all pending frames and fix their next_frame_index fields. + */ + if (next_frame->flags & VLIB_FRAME_PENDING) + { + vlib_pending_frame_t *p; + if (next_frame->frame_index != ~0) + { + vec_foreach (p, nm->pending_frames) + { + if (p->frame_index == next_frame->frame_index) + { + p->next_frame_index = + next_frame - vm->node_main.next_frames; + } + } + } + } + } + else + { + /* No previous owner. Take ownership. */ + next_frame->flags |= VLIB_FRAME_OWNER; + } + + /* Record new owner. */ + next_node->owner_node_index = node->index; + next_node->owner_next_index = next_index; + + /* Now we should be owner. */ + ASSERT (next_frame->flags & VLIB_FRAME_OWNER); +} + +/* Make sure that magic number is still there. + Otherwise, it is likely that caller has overrun frame arguments. 
*/ +always_inline void +validate_frame_magic (vlib_main_t * vm, + vlib_frame_t * f, vlib_node_t * n, uword next_index) +{ + vlib_node_t *next_node = vlib_get_node (vm, n->next_nodes[next_index]); + u32 *magic = vlib_frame_find_magic (f, next_node); + ASSERT (VLIB_FRAME_MAGIC == magic[0]); +} + +vlib_frame_t * +vlib_get_next_frame_internal (vlib_main_t * vm, + vlib_node_runtime_t * node, + u32 next_index, u32 allocate_new_next_frame) +{ + vlib_frame_t *f; + vlib_next_frame_t *nf; + u32 n_used; + + nf = vlib_node_runtime_get_next_frame (vm, node, next_index); + + /* Make sure this next frame owns right to enqueue to destination frame. */ + if (PREDICT_FALSE (!(nf->flags & VLIB_FRAME_OWNER))) + vlib_next_frame_change_ownership (vm, node, next_index); + + /* ??? Don't need valid flag: can use frame_index == ~0 */ + if (PREDICT_FALSE (!(nf->flags & VLIB_FRAME_IS_ALLOCATED))) + { + nf->frame_index = vlib_frame_alloc (vm, node, next_index); + nf->flags |= VLIB_FRAME_IS_ALLOCATED; + } + + f = vlib_get_frame (vm, nf->frame_index); + + /* Has frame been removed from pending vector (e.g. finished dispatching)? + If so we can reuse frame. */ + if ((nf->flags & VLIB_FRAME_PENDING) && !(f->flags & VLIB_FRAME_PENDING)) + { + nf->flags &= ~VLIB_FRAME_PENDING; + f->n_vectors = 0; + } + + /* Allocate new frame if current one is already full. */ + n_used = f->n_vectors; + if (n_used >= VLIB_FRAME_SIZE || (allocate_new_next_frame && n_used > 0)) + { + /* Old frame may need to be freed after dispatch, since we'll have + two redundant frames from node -> next node. */ + if (!(nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH)) + { + vlib_frame_t *f_old = vlib_get_frame (vm, nf->frame_index); + f_old->flags |= VLIB_FRAME_FREE_AFTER_DISPATCH; + } + + /* Allocate new frame to replace full one. */ + nf->frame_index = vlib_frame_alloc (vm, node, next_index); + f = vlib_get_frame (vm, nf->frame_index); + n_used = f->n_vectors; + } + + /* Should have free vectors in frame now. 
*/ + ASSERT (n_used < VLIB_FRAME_SIZE); + + if (CLIB_DEBUG > 0) + { + validate_frame_magic (vm, f, + vlib_get_node (vm, node->node_index), next_index); + } + + return f; +} + +static void +vlib_put_next_frame_validate (vlib_main_t * vm, + vlib_node_runtime_t * rt, + u32 next_index, u32 n_vectors_left) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_next_frame_t *nf; + vlib_frame_t *f; + vlib_node_runtime_t *next_rt; + vlib_node_t *next_node; + u32 n_before, n_after; + + nf = vlib_node_runtime_get_next_frame (vm, rt, next_index); + f = vlib_get_frame (vm, nf->frame_index); + + ASSERT (n_vectors_left <= VLIB_FRAME_SIZE); + n_after = VLIB_FRAME_SIZE - n_vectors_left; + n_before = f->n_vectors; + + ASSERT (n_after >= n_before); + + next_rt = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL], + nf->node_runtime_index); + next_node = vlib_get_node (vm, next_rt->node_index); + if (n_after > 0 && next_node->validate_frame) + { + u8 *msg = next_node->validate_frame (vm, rt, f); + if (msg) + { + clib_warning ("%v", msg); + ASSERT (0); + } + vec_free (msg); + } +} + +void +vlib_put_next_frame (vlib_main_t * vm, + vlib_node_runtime_t * r, + u32 next_index, u32 n_vectors_left) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_next_frame_t *nf; + vlib_frame_t *f; + u32 n_vectors_in_frame; + + if (DPDK == 0 && CLIB_DEBUG > 0) + vlib_put_next_frame_validate (vm, r, next_index, n_vectors_left); + + nf = vlib_node_runtime_get_next_frame (vm, r, next_index); + f = vlib_get_frame (vm, nf->frame_index); + + /* Make sure that magic number is still there. Otherwise, caller + has overrun frame meta data. */ + if (CLIB_DEBUG > 0) + { + vlib_node_t *node = vlib_get_node (vm, r->node_index); + validate_frame_magic (vm, f, node, next_index); + } + + /* Convert # of vectors left -> number of vectors there. 
*/ + ASSERT (n_vectors_left <= VLIB_FRAME_SIZE); + n_vectors_in_frame = VLIB_FRAME_SIZE - n_vectors_left; + + f->n_vectors = n_vectors_in_frame; + + /* If vectors were added to frame, add to pending vector. */ + if (PREDICT_TRUE (n_vectors_in_frame > 0)) + { + vlib_pending_frame_t *p; + u32 v0, v1; + + r->cached_next_index = next_index; + + if (!(f->flags & VLIB_FRAME_PENDING)) + { + __attribute__ ((unused)) vlib_node_t *node; + vlib_node_t *next_node; + vlib_node_runtime_t *next_runtime; + + node = vlib_get_node (vm, r->node_index); + next_node = vlib_get_next_node (vm, r->node_index, next_index); + next_runtime = vlib_node_get_runtime (vm, next_node->index); + + vec_add2 (nm->pending_frames, p, 1); + + p->frame_index = nf->frame_index; + p->node_runtime_index = nf->node_runtime_index; + p->next_frame_index = nf - nm->next_frames; + nf->flags |= VLIB_FRAME_PENDING; + f->flags |= VLIB_FRAME_PENDING; + + /* + * If we're going to dispatch this frame on another thread, + * force allocation of a new frame. Otherwise, we create + * a dangling frame reference. Each thread has its own copy of + * the next_frames vector. + */ + if (0 && r->cpu_index != next_runtime->cpu_index) + { + nf->frame_index = ~0; + nf->flags &= ~(VLIB_FRAME_PENDING | VLIB_FRAME_IS_ALLOCATED); + } + } + + /* Copy trace flag from next_frame and from runtime. */ + nf->flags |= + (nf->flags & VLIB_NODE_FLAG_TRACE) | (r-> + flags & VLIB_NODE_FLAG_TRACE); + + v0 = nf->vectors_since_last_overflow; + v1 = v0 + n_vectors_in_frame; + nf->vectors_since_last_overflow = v1; + if (PREDICT_FALSE (v1 < v0)) + { + vlib_node_t *node = vlib_get_node (vm, r->node_index); + vec_elt (node->n_vectors_by_next_node, next_index) += v0; + } + } +} + +/* Sync up runtime (32 bit counters) and main node stats (64 bit counters). 
*/ +never_inline void +vlib_node_runtime_sync_stats (vlib_main_t * vm, + vlib_node_runtime_t * r, + uword n_calls, uword n_vectors, uword n_clocks) +{ + vlib_node_t *n = vlib_get_node (vm, r->node_index); + + n->stats_total.calls += n_calls + r->calls_since_last_overflow; + n->stats_total.vectors += n_vectors + r->vectors_since_last_overflow; + n->stats_total.clocks += n_clocks + r->clocks_since_last_overflow; + n->stats_total.max_clock = r->max_clock; + n->stats_total.max_clock_n = r->max_clock_n; + + r->calls_since_last_overflow = 0; + r->vectors_since_last_overflow = 0; + r->clocks_since_last_overflow = 0; +} + +always_inline void __attribute__ ((unused)) +vlib_process_sync_stats (vlib_main_t * vm, + vlib_process_t * p, + uword n_calls, uword n_vectors, uword n_clocks) +{ + vlib_node_runtime_t *rt = &p->node_runtime; + vlib_node_t *n = vlib_get_node (vm, rt->node_index); + vlib_node_runtime_sync_stats (vm, rt, n_calls, n_vectors, n_clocks); + n->stats_total.suspends += p->n_suspends; + p->n_suspends = 0; +} + +void +vlib_node_sync_stats (vlib_main_t * vm, vlib_node_t * n) +{ + vlib_node_runtime_t *rt; + + if (n->type == VLIB_NODE_TYPE_PROCESS) + { + /* Nothing to do for PROCESS nodes except in main thread */ + if (vm != &vlib_global_main) + return; + + vlib_process_t *p = vlib_get_process_from_node (vm, n); + n->stats_total.suspends += p->n_suspends; + p->n_suspends = 0; + rt = &p->node_runtime; + } + else + rt = + vec_elt_at_index (vm->node_main.nodes_by_type[n->type], + n->runtime_index); + + vlib_node_runtime_sync_stats (vm, rt, 0, 0, 0); + + /* Sync up runtime next frame vector counters with main node structure. 
*/ + { + vlib_next_frame_t *nf; + uword i; + for (i = 0; i < rt->n_next_nodes; i++) + { + nf = vlib_node_runtime_get_next_frame (vm, rt, i); + vec_elt (n->n_vectors_by_next_node, i) += + nf->vectors_since_last_overflow; + nf->vectors_since_last_overflow = 0; + } + } +} + +always_inline u32 +vlib_node_runtime_update_stats (vlib_main_t * vm, + vlib_node_runtime_t * node, + uword n_calls, + uword n_vectors, uword n_clocks) +{ + u32 ca0, ca1, v0, v1, cl0, cl1, r; + + cl0 = cl1 = node->clocks_since_last_overflow; + ca0 = ca1 = node->calls_since_last_overflow; + v0 = v1 = node->vectors_since_last_overflow; + + ca1 = ca0 + n_calls; + v1 = v0 + n_vectors; + cl1 = cl0 + n_clocks; + + node->calls_since_last_overflow = ca1; + node->clocks_since_last_overflow = cl1; + node->vectors_since_last_overflow = v1; + node->max_clock_n = node->max_clock > n_clocks ? + node->max_clock_n : n_vectors; + node->max_clock = node->max_clock > n_clocks ? node->max_clock : n_clocks; + + r = vlib_node_runtime_update_main_loop_vector_stats (vm, node, n_vectors); + + if (PREDICT_FALSE (ca1 < ca0 || v1 < v0 || cl1 < cl0)) + { + node->calls_since_last_overflow = ca0; + node->clocks_since_last_overflow = cl0; + node->vectors_since_last_overflow = v0; + vlib_node_runtime_sync_stats (vm, node, n_calls, n_vectors, n_clocks); + } + + return r; +} + +always_inline void +vlib_process_update_stats (vlib_main_t * vm, + vlib_process_t * p, + uword n_calls, uword n_vectors, uword n_clocks) +{ + vlib_node_runtime_update_stats (vm, &p->node_runtime, + n_calls, n_vectors, n_clocks); +} + +static clib_error_t * +vlib_cli_elog_clear (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + elog_reset_buffer (&vm->elog_main); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (elog_clear_cli, static) = { + .path = "event-logger clear", + .short_help = "Clear the event log", + .function = vlib_cli_elog_clear, +}; +/* *INDENT-ON* */ + +#ifdef CLIB_UNIX +static clib_error_t * +elog_save_buffer 
(vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + elog_main_t *em = &vm->elog_main; + char *file, *chroot_file; + clib_error_t *error = 0; + + if (!unformat (input, "%s", &file)) + { + vlib_cli_output (vm, "expected file name, got `%U'", + format_unformat_error, input); + return 0; + } + + /* It's fairly hard to get "../oopsie" through unformat; just in case */ + if (strstr (file, "..") || index (file, '/')) + { + vlib_cli_output (vm, "illegal characters in filename '%s'", file); + return 0; + } + + chroot_file = (char *) format (0, "/tmp/%s%c", file, 0); + + vec_free (file); + + vlib_cli_output (vm, "Saving %wd of %wd events to %s", + elog_n_events_in_buffer (em), + elog_buffer_capacity (em), chroot_file); + + vlib_worker_thread_barrier_sync (vm); + error = elog_write_file (em, chroot_file); + vlib_worker_thread_barrier_release (vm); + vec_free (chroot_file); + return error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (elog_save_cli, static) = { + .path = "event-logger save", + .short_help = "event-logger save <filename> (saves log in /tmp/<filename>)", + .function = elog_save_buffer, +}; +/* *INDENT-ON* */ + +static clib_error_t * +elog_stop (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + elog_main_t *em = &vm->elog_main; + + em->n_total_events_disable_limit = em->n_total_events; + + vlib_cli_output (vm, "Stopped the event logger..."); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (elog_stop_cli, static) = { + .path = "event-logger stop", + .short_help = "Stop the event-logger", + .function = elog_stop, +}; +/* *INDENT-ON* */ + +static clib_error_t * +elog_restart (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + elog_main_t *em = &vm->elog_main; + + em->n_total_events_disable_limit = ~0; + + vlib_cli_output (vm, "Restarted the event logger..."); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (elog_restart_cli, static) = { + .path = "event-logger restart", + 
.short_help = "Restart the event-logger", + .function = elog_restart, +}; +/* *INDENT-ON* */ + +static clib_error_t * +elog_resize (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + elog_main_t *em = &vm->elog_main; + u32 tmp; + + /* Stop the parade */ + elog_reset_buffer (&vm->elog_main); + + if (unformat (input, "%d", &tmp)) + { + elog_alloc (em, tmp); + em->n_total_events_disable_limit = ~0; + } + else + return clib_error_return (0, "Must specify how many events in the ring"); + + vlib_cli_output (vm, "Resized ring and restarted the event logger..."); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (elog_resize_cli, static) = { + .path = "event-logger resize", + .short_help = "event-logger resize <nnn>", + .function = elog_resize, +}; +/* *INDENT-ON* */ + +#endif /* CLIB_UNIX */ + +static void +elog_show_buffer_internal (vlib_main_t * vm, u32 n_events_to_show) +{ + elog_main_t *em = &vm->elog_main; + elog_event_t *e, *es; + f64 dt; + + /* Show events in VLIB time since log clock starts after VLIB clock. */ + dt = (em->init_time.cpu - vm->clib_time.init_cpu_time) + * vm->clib_time.seconds_per_clock; + + es = elog_peek_events (em); + vlib_cli_output (vm, "%d of %d events in buffer, logger %s", vec_len (es), + em->event_ring_size, + em->n_total_events < em->n_total_events_disable_limit ? 
+ "running" : "stopped"); + vec_foreach (e, es) + { + vlib_cli_output (vm, "%18.9f: %U", + e->time + dt, format_elog_event, em, e); + n_events_to_show--; + if (n_events_to_show == 0) + break; + } + vec_free (es); + +} + +static clib_error_t * +elog_show_buffer (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + u32 n_events_to_show; + clib_error_t *error = 0; + + n_events_to_show = 250; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%d", &n_events_to_show)) + ; + else if (unformat (input, "all")) + n_events_to_show = ~0; + else + return unformat_parse_error (input); + } + elog_show_buffer_internal (vm, n_events_to_show); + return error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (elog_show_cli, static) = { + .path = "show event-logger", + .short_help = "Show event logger info", + .function = elog_show_buffer, +}; +/* *INDENT-ON* */ + +void +vlib_gdb_show_event_log (void) +{ + elog_show_buffer_internal (vlib_get_main (), (u32) ~ 0); +} + +static inline void +vlib_elog_main_loop_event (vlib_main_t * vm, + u32 node_index, + u64 time, u32 n_vectors, u32 is_return) +{ + vlib_main_t *evm = &vlib_global_main; + elog_main_t *em = &evm->elog_main; + + if (VLIB_ELOG_MAIN_LOOP && n_vectors) + elog_track (em, + /* event type */ + vec_elt_at_index (is_return + ? evm->node_return_elog_event_types + : evm->node_call_elog_event_types, + node_index), + /* track */ + (vm->cpu_index ? &vlib_worker_threads[vm->cpu_index]. 
+ elog_track : &em->default_track), + /* data to log */ n_vectors); +} + +void +vlib_dump_context_trace (vlib_main_t * vm, u32 bi) +{ + vlib_node_main_t *vnm = &vm->node_main; + vlib_buffer_t *b; + u8 i, n; + + if (VLIB_BUFFER_TRACE_TRAJECTORY) + { + b = vlib_get_buffer (vm, bi); + n = b->pre_data[0]; + + fformat (stderr, "Context trace for bi %d b 0x%llx, visited %d\n", + bi, b, n); + + if (n == 0 || n > 20) + { + fformat (stderr, "n is unreasonable\n"); + return; + } + + + for (i = 0; i < n; i++) + { + u32 node_index; + + node_index = b->pre_data[i + 1]; + + if (node_index > vec_len (vnm->nodes)) + { + fformat (stderr, "Skip bogus node index %d\n", node_index); + continue; + } + + fformat (stderr, "%v (%d)\n", vnm->nodes[node_index]->name, + node_index); + } + } + else + { + fformat (stderr, + "in vlib/buffers.h, #define VLIB_BUFFER_TRACE_TRAJECTORY 1\n"); + } +} + + +/* static_always_inline */ u64 +dispatch_node (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_node_type_t type, + vlib_node_state_t dispatch_state, + vlib_frame_t * frame, u64 last_time_stamp) +{ + uword n, v; + u64 t; + vlib_node_main_t *nm = &vm->node_main; + vlib_next_frame_t *nf; + + if (CLIB_DEBUG > 0) + { + vlib_node_t *n = vlib_get_node (vm, node->node_index); + ASSERT (n->type == type); + } + + /* Only non-internal nodes may be disabled. */ + if (type != VLIB_NODE_TYPE_INTERNAL && node->state != dispatch_state) + { + ASSERT (type != VLIB_NODE_TYPE_INTERNAL); + return last_time_stamp; + } + + if ((type == VLIB_NODE_TYPE_PRE_INPUT || type == VLIB_NODE_TYPE_INPUT) + && dispatch_state != VLIB_NODE_STATE_INTERRUPT) + { + u32 c = node->input_main_loops_per_call; + /* Only call node when count reaches zero. */ + if (c) + { + node->input_main_loops_per_call = c - 1; + return last_time_stamp; + } + } + + /* Speculatively prefetch next frames. 
*/ + if (node->n_next_nodes > 0) + { + nf = vec_elt_at_index (nm->next_frames, node->next_frame_index); + CLIB_PREFETCH (nf, 4 * sizeof (nf[0]), WRITE); + } + + vm->cpu_time_last_node_dispatch = last_time_stamp; + + if (1 /* || vm->cpu_index == node->cpu_index */ ) + { + vlib_main_t *stat_vm; + + stat_vm = /* vlib_mains ? vlib_mains[0] : */ vm; + + vlib_elog_main_loop_event (vm, node->node_index, + last_time_stamp, + frame ? frame->n_vectors : 0, + /* is_after */ 0); + + /* + * Turn this on if you run into + * "bad monkey" contexts, and you want to know exactly + * which nodes they've visited... See ixge.c... + */ + if (VLIB_BUFFER_TRACE_TRAJECTORY && frame) + { + int i; + int log_index; + u32 *from; + from = vlib_frame_vector_args (frame); + for (i = 0; i < frame->n_vectors; i++) + { + vlib_buffer_t *b = vlib_get_buffer (vm, from[i]); + ASSERT (b->pre_data[0] < 32); + log_index = b->pre_data[0]++ + 1; + b->pre_data[log_index] = node->node_index; + } + n = node->function (vm, node, frame); + } + else + n = node->function (vm, node, frame); + + t = clib_cpu_time_now (); + + vlib_elog_main_loop_event (vm, node->node_index, t, n, /* is_after */ + 1); + + vm->main_loop_vectors_processed += n; + vm->main_loop_nodes_processed += n > 0; + + v = vlib_node_runtime_update_stats (stat_vm, node, + /* n_calls */ 1, + /* n_vectors */ n, + /* n_clocks */ t - last_time_stamp); + + /* When in interrupt mode and vector rate crosses threshold switch to + polling mode. 
*/ + if ((DPDK == 0 && dispatch_state == VLIB_NODE_STATE_INTERRUPT) + || (DPDK == 0 && dispatch_state == VLIB_NODE_STATE_POLLING + && (node->flags + & VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE))) + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "%s vector length %d, switching to %s",.format_args = + "T4i4t4",.n_enum_strings = 2,.enum_strings = + { + "interrupt", "polling",},}; + struct + { + u32 node_name, vector_length, is_polling; + } *ed; + + if (dispatch_state == VLIB_NODE_STATE_INTERRUPT + && v >= nm->polling_threshold_vector_length) + { + vlib_node_t *n = vlib_get_node (vm, node->node_index); + n->state = VLIB_NODE_STATE_POLLING; + node->state = VLIB_NODE_STATE_POLLING; + ASSERT (! + (node->flags & + VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE)); + node->flags &= + ~VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE; + node->flags |= + VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE; + nm->input_node_counts_by_state[VLIB_NODE_STATE_INTERRUPT] -= 1; + nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] += 1; + + ed = ELOG_DATA (&vm->elog_main, e); + ed->node_name = n->name_elog_string; + ed->vector_length = v; + ed->is_polling = 1; + } + else if (dispatch_state == VLIB_NODE_STATE_POLLING + && v <= nm->interrupt_threshold_vector_length) + { + vlib_node_t *n = vlib_get_node (vm, node->node_index); + if (node->flags & + VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE) + { + /* Switch to interrupt mode after dispatch in polling one more time. + This allows driver to re-enable interrupts. 
*/ + n->state = VLIB_NODE_STATE_INTERRUPT; + node->state = VLIB_NODE_STATE_INTERRUPT; + node->flags &= + ~VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE; + nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] -= + 1; + nm->input_node_counts_by_state[VLIB_NODE_STATE_INTERRUPT] += + 1; + + } + else + { + node->flags |= + VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE; + ed = ELOG_DATA (&vm->elog_main, e); + ed->node_name = n->name_elog_string; + ed->vector_length = v; + ed->is_polling = 0; + } + } + } + } + + return t; +} + +/* static */ u64 +dispatch_pending_node (vlib_main_t * vm, + vlib_pending_frame_t * p, u64 last_time_stamp) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_frame_t *f; + vlib_next_frame_t *nf, nf_dummy; + vlib_node_runtime_t *n; + u32 restore_frame_index; + + n = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL], + p->node_runtime_index); + + f = vlib_get_frame (vm, p->frame_index); + if (p->next_frame_index == VLIB_PENDING_FRAME_NO_NEXT_FRAME) + { + /* No next frame: so use dummy on stack. */ + nf = &nf_dummy; + nf->flags = f->flags & VLIB_NODE_FLAG_TRACE; + nf->frame_index = ~p->frame_index; + } + else + nf = vec_elt_at_index (nm->next_frames, p->next_frame_index); + + ASSERT (f->flags & VLIB_FRAME_IS_ALLOCATED); + + /* Force allocation of new frame while current frame is being + dispatched. */ + restore_frame_index = ~0; + if (nf->frame_index == p->frame_index) + { + nf->frame_index = ~0; + nf->flags &= ~VLIB_FRAME_IS_ALLOCATED; + if (!(n->flags & VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH)) + restore_frame_index = p->frame_index; + } + + /* Frame must be pending. */ + ASSERT (f->flags & VLIB_FRAME_PENDING); + ASSERT (f->n_vectors > 0); + + /* Copy trace flag from next frame to node. + Trace flag indicates that at least one vector in the dispatched + frame is traced. */ + n->flags &= ~VLIB_NODE_FLAG_TRACE; + n->flags |= (nf->flags & VLIB_FRAME_TRACE) ? 
VLIB_NODE_FLAG_TRACE : 0; + nf->flags &= ~VLIB_FRAME_TRACE; + + last_time_stamp = dispatch_node (vm, n, + VLIB_NODE_TYPE_INTERNAL, + VLIB_NODE_STATE_POLLING, + f, last_time_stamp); + + f->flags &= ~VLIB_FRAME_PENDING; + + /* Frame is ready to be used again, so restore it. */ + if (restore_frame_index != ~0) + { + /* we musn't restore a frame that is flagged to be freed. This shouldn't + happen since frames to be freed post dispatch are those used + when the to-node frame becomes full i.e. they form a sort of queue of + frames to a single node. If we get here then the to-node frame and the + pending frame *were* the same, and so we removed the to-node frame. + Therefore this frame is no longer part of the queue for that node + and hence it cannot be it's overspill. + */ + ASSERT (!(f->flags & VLIB_FRAME_FREE_AFTER_DISPATCH)); + + /* p->next_frame_index can change during node dispatch if node + function decides to change graph hook up. */ + nf = vec_elt_at_index (nm->next_frames, p->next_frame_index); + nf->flags |= VLIB_FRAME_IS_ALLOCATED; + + if (~0 == nf->frame_index) + { + /* no new frame has been assigned to this node, use the saved one */ + nf->frame_index = restore_frame_index; + f->n_vectors = 0; + } + else + { + /* The node has gained a frame, implying packets from the current frame + were re-queued to this same node. we don't need the saved one + anymore */ + vlib_frame_free (vm, n, f); + } + } + else + { + if (f->flags & VLIB_FRAME_FREE_AFTER_DISPATCH) + { + ASSERT (!(n->flags & VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH)); + vlib_frame_free (vm, n, f); + } + } + + return last_time_stamp; +} + +always_inline uword +vlib_process_stack_is_valid (vlib_process_t * p) +{ + return p->stack[0] == VLIB_PROCESS_STACK_MAGIC; +} + +typedef struct +{ + vlib_main_t *vm; + vlib_process_t *process; + vlib_frame_t *frame; +} vlib_process_bootstrap_args_t; + +/* Called in process stack. 
*/ +static uword +vlib_process_bootstrap (uword _a) +{ + vlib_process_bootstrap_args_t *a; + vlib_main_t *vm; + vlib_node_runtime_t *node; + vlib_frame_t *f; + vlib_process_t *p; + uword n; + /* _a is the bootstrap-args pointer that vlib_process_startup packed into a uword. */ + a = uword_to_pointer (_a, vlib_process_bootstrap_args_t *); + + vm = a->vm; + p = a->process; + f = a->frame; + node = &p->node_runtime; + /* Call the process's node function with the supplied frame. */ + n = node->function (vm, node, f); + /* The magic word at stack[0] must still be intact once the function returns. */ + ASSERT (vlib_process_stack_is_valid (p)); + /* Deliver the result through return_longjmp. NOTE(review): the return below is presumably never reached -- confirm. */ + clib_longjmp (&p->return_longjmp, n); + + return n; +} + +/* Called in main stack. Arms p->return_longjmp and, on the initial pass, runs vlib_process_bootstrap via clib_calljmp with the stack address p->stack + 2^log2_n_stack_bytes (the end of the process stack area). Returns clib_calljmp's result, or the value handed to a later longjmp on return_longjmp. */ +static_always_inline uword +vlib_process_startup (vlib_main_t * vm, vlib_process_t * p, vlib_frame_t * f) +{ + vlib_process_bootstrap_args_t a; + uword r; + + a.vm = vm; + a.process = p; + a.frame = f; + + r = clib_setjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_RETURN); + if (r == VLIB_PROCESS_RETURN_LONGJMP_RETURN) + r = clib_calljmp (vlib_process_bootstrap, pointer_to_uword (&a), + (void *) p->stack + (1 << p->log2_n_stack_bytes)); + + return r; +} + /* Resume a process: clear its waiting-for-clock, waiting-for-event and resume-pending flags, arm return_longjmp, then jump to the saved resume_longjmp context. Returns the value later delivered through return_longjmp. */ +static_always_inline uword +vlib_process_resume (vlib_process_t * p) +{ + uword r; + p->flags &= ~(VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK + | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT + | VLIB_PROCESS_RESUME_PENDING); + r = clib_setjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_RETURN); + if (r == VLIB_PROCESS_RETURN_LONGJMP_RETURN) + clib_longjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_RESUME); + return r; +} + +static u64 +dispatch_process (vlib_main_t * vm, + vlib_process_t * p, vlib_frame_t * f, u64 last_time_stamp) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_runtime_t *node_runtime = &p->node_runtime; + vlib_node_t *node = vlib_get_node (vm, node_runtime->node_index); + u64 t; + uword n_vectors, is_suspend; + + if (node->state != VLIB_NODE_STATE_POLLING + || (p->flags & (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK + | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT))) + return last_time_stamp; + + p->flags |= VLIB_PROCESS_IS_RUNNING; + + t = last_time_stamp; + 
vlib_elog_main_loop_event (vm, node_runtime->node_index, t, + f ? f->n_vectors : 0, /* is_after */ 0); + + /* Save away current process for suspend. */ + nm->current_process_index = node->runtime_index; + + n_vectors = vlib_process_startup (vm, p, f); + + nm->current_process_index = ~0; + + ASSERT (n_vectors != VLIB_PROCESS_RETURN_LONGJMP_RETURN); + is_suspend = n_vectors == VLIB_PROCESS_RETURN_LONGJMP_SUSPEND; + if (is_suspend) + { + vlib_pending_frame_t *pf; + + n_vectors = 0; + pool_get (nm->suspended_process_frames, pf); + pf->node_runtime_index = node->runtime_index; + pf->frame_index = f ? vlib_frame_index (vm, f) : ~0; + pf->next_frame_index = ~0; + + p->n_suspends += 1; + p->suspended_process_frame_index = pf - nm->suspended_process_frames; + + if (p->flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK) + timing_wheel_insert (&nm->timing_wheel, p->resume_cpu_time, + vlib_timing_wheel_data_set_suspended_process + (node->runtime_index)); + } + else + p->flags &= ~VLIB_PROCESS_IS_RUNNING; + + t = clib_cpu_time_now (); + + vlib_elog_main_loop_event (vm, node_runtime->node_index, t, is_suspend, + /* is_after */ 1); + + vlib_process_update_stats (vm, p, + /* n_calls */ !is_suspend, + /* n_vectors */ n_vectors, + /* n_clocks */ t - last_time_stamp); + + return t; +} + +void +vlib_start_process (vlib_main_t * vm, uword process_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p = vec_elt (nm->processes, process_index); + dispatch_process (vm, p, /* frame */ 0, /* cpu_time_now */ 0); +} + +static u64 +dispatch_suspended_process (vlib_main_t * vm, + uword process_index, u64 last_time_stamp) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_runtime_t *node_runtime; + vlib_node_t *node; + vlib_frame_t *f; + vlib_process_t *p; + vlib_pending_frame_t *pf; + u64 t, n_vectors, is_suspend; + + t = last_time_stamp; + + p = vec_elt (nm->processes, process_index); + if (PREDICT_FALSE (!(p->flags & VLIB_PROCESS_IS_RUNNING))) + return last_time_stamp; + 
+ ASSERT (p->flags & (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK + | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT)); + + pf = + pool_elt_at_index (nm->suspended_process_frames, + p->suspended_process_frame_index); + + node_runtime = &p->node_runtime; + node = vlib_get_node (vm, node_runtime->node_index); + f = pf->frame_index != ~0 ? vlib_get_frame (vm, pf->frame_index) : 0; + + vlib_elog_main_loop_event (vm, node_runtime->node_index, t, + f ? f->n_vectors : 0, /* is_after */ 0); + + /* Save away current process for suspend. */ + nm->current_process_index = node->runtime_index; + + n_vectors = vlib_process_resume (p); + t = clib_cpu_time_now (); + + nm->current_process_index = ~0; + + is_suspend = n_vectors == VLIB_PROCESS_RETURN_LONGJMP_SUSPEND; + if (is_suspend) + { + /* Suspend it again. */ + n_vectors = 0; + p->n_suspends += 1; + if (p->flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK) + timing_wheel_insert (&nm->timing_wheel, p->resume_cpu_time, + vlib_timing_wheel_data_set_suspended_process + (node->runtime_index)); + } + else + { + p->flags &= ~VLIB_PROCESS_IS_RUNNING; + p->suspended_process_frame_index = ~0; + pool_put (nm->suspended_process_frames, pf); + } + + t = clib_cpu_time_now (); + vlib_elog_main_loop_event (vm, node_runtime->node_index, t, !is_suspend, + /* is_after */ 1); + + vlib_process_update_stats (vm, p, + /* n_calls */ !is_suspend, + /* n_vectors */ n_vectors, + /* n_clocks */ t - last_time_stamp); + + return t; +} + +static void +vlib_main_loop (vlib_main_t * vm) +{ + vlib_node_main_t *nm = &vm->node_main; + uword i; + u64 cpu_time_now; + + /* Initialize pending node vector. */ + vec_resize (nm->pending_frames, 32); + _vec_len (nm->pending_frames) = 0; + + /* Mark time of main loop start. */ + cpu_time_now = vm->clib_time.last_cpu_time; + vm->cpu_time_main_loop_start = cpu_time_now; + + /* Arrange for first level of timing wheel to cover times we care + most about. 
*/ + nm->timing_wheel.min_sched_time = 10e-6; + nm->timing_wheel.max_sched_time = 10e-3; + timing_wheel_init (&nm->timing_wheel, + cpu_time_now, vm->clib_time.clocks_per_second); + + /* Pre-allocate expired nodes. */ + vec_alloc (nm->data_from_advancing_timing_wheel, 32); + vec_alloc (nm->pending_interrupt_node_runtime_indices, 32); + + if (!nm->polling_threshold_vector_length) + nm->polling_threshold_vector_length = 10; + if (!nm->interrupt_threshold_vector_length) + nm->interrupt_threshold_vector_length = 5; + + nm->current_process_index = ~0; + + /* Start all processes. */ + { + uword i; + for (i = 0; i < vec_len (nm->processes); i++) + cpu_time_now = + dispatch_process (vm, nm->processes[i], /* frame */ 0, cpu_time_now); + } + + while (1) + { + vlib_node_runtime_t *n; + + /* Process pre-input nodes. */ + vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_PRE_INPUT]) + cpu_time_now = dispatch_node (vm, n, + VLIB_NODE_TYPE_PRE_INPUT, + VLIB_NODE_STATE_POLLING, + /* frame */ 0, + cpu_time_now); + + /* Next process input nodes. */ + vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]) + cpu_time_now = dispatch_node (vm, n, + VLIB_NODE_TYPE_INPUT, + VLIB_NODE_STATE_POLLING, + /* frame */ 0, + cpu_time_now); + + if (PREDICT_TRUE (vm->queue_signal_pending == 0)) + vm->queue_signal_callback (vm); + + /* Next handle interrupts. */ + { + uword l = _vec_len (nm->pending_interrupt_node_runtime_indices); + uword i; + if (l > 0) + { + _vec_len (nm->pending_interrupt_node_runtime_indices) = 0; + for (i = 0; i < l; i++) + { + n = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT], + nm-> + pending_interrupt_node_runtime_indices + [i]); + cpu_time_now = + dispatch_node (vm, n, VLIB_NODE_TYPE_INPUT, + VLIB_NODE_STATE_INTERRUPT, + /* frame */ 0, + cpu_time_now); + } + } + } + + /* Check if process nodes have expired from timing wheel. 
*/ + nm->data_from_advancing_timing_wheel + = timing_wheel_advance (&nm->timing_wheel, cpu_time_now, + nm->data_from_advancing_timing_wheel, + &nm->cpu_time_next_process_ready); + + ASSERT (nm->data_from_advancing_timing_wheel != 0); + if (PREDICT_FALSE (_vec_len (nm->data_from_advancing_timing_wheel) > 0)) + { + uword i; + + processes_timing_wheel_data: + for (i = 0; i < _vec_len (nm->data_from_advancing_timing_wheel); + i++) + { + u32 d = nm->data_from_advancing_timing_wheel[i]; + u32 di = vlib_timing_wheel_data_get_index (d); + + if (vlib_timing_wheel_data_is_timed_event (d)) + { + vlib_signal_timed_event_data_t *te = + pool_elt_at_index (nm->signal_timed_event_data_pool, di); + vlib_node_t *n = vlib_get_node (vm, te->process_node_index); + vlib_process_t *p = + vec_elt (nm->processes, n->runtime_index); + void *data; + data = + vlib_process_signal_event_helper (nm, n, p, + te->event_type_index, + te->n_data_elts, + te->n_data_elt_bytes); + if (te->n_data_bytes < sizeof (te->inline_event_data)) + clib_memcpy (data, te->inline_event_data, + te->n_data_bytes); + else + { + clib_memcpy (data, te->event_data_as_vector, + te->n_data_bytes); + vec_free (te->event_data_as_vector); + } + pool_put (nm->signal_timed_event_data_pool, te); + } + else + { + cpu_time_now = clib_cpu_time_now (); + cpu_time_now = + dispatch_suspended_process (vm, di, cpu_time_now); + } + } + + /* Reset vector. */ + _vec_len (nm->data_from_advancing_timing_wheel) = 0; + } + + /* Input nodes may have added work to the pending vector. + Process pending vector until there is nothing left. + All pending vectors will be processed from input -> output. */ + for (i = 0; i < _vec_len (nm->pending_frames); i++) + cpu_time_now = dispatch_pending_node (vm, nm->pending_frames + i, + cpu_time_now); + /* Reset pending vector for next iteration. */ + _vec_len (nm->pending_frames) = 0; + + /* Pending internal nodes may resume processes. 
*/ + if (_vec_len (nm->data_from_advancing_timing_wheel) > 0) + goto processes_timing_wheel_data; + + vlib_increment_main_loop_counter (vm); + + /* Record time stamp in case there are no enabled nodes and above + calls do not update time stamp. */ + cpu_time_now = clib_cpu_time_now (); + } +} + +vlib_main_t vlib_global_main; + +static clib_error_t * +vlib_main_configure (vlib_main_t * vm, unformat_input_t * input) +{ + int turn_on_mem_trace = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "memory-trace")) + turn_on_mem_trace = 1; + + else if (unformat (input, "elog-events %d", + &vm->elog_main.event_ring_size)) + ; + else + return unformat_parse_error (input); + } + + unformat_free (input); + + /* Enable memory trace as early as possible. */ + if (turn_on_mem_trace) + clib_mem_trace (1); + + return 0; +} + +VLIB_EARLY_CONFIG_FUNCTION (vlib_main_configure, "vlib"); + +static void +dummy_queue_signal_callback (vlib_main_t * vm) +{ +} + +/* Main function. */ +int +vlib_main (vlib_main_t * volatile vm, unformat_input_t * input) +{ + clib_error_t *volatile error; + + vm->queue_signal_callback = dummy_queue_signal_callback; + + clib_time_init (&vm->clib_time); + + /* Turn on event log. */ + if (!vm->elog_main.event_ring_size) + vm->elog_main.event_ring_size = 128 << 10; + elog_init (&vm->elog_main, vm->elog_main.event_ring_size); + elog_enable_disable (&vm->elog_main, 1); + + /* Default name. */ + if (!vm->name) + vm->name = "VLIB"; + + vec_validate (vm->buffer_main, 0); + + if ((error = vlib_thread_init (vm))) + { + clib_error_report (error); + goto done; + } + + /* Register static nodes so that init functions may use them. */ + vlib_register_all_static_nodes (vm); + + /* Set seed for random number generator. + Allow user to specify seed to make random sequence deterministic. 
*/ + if (!unformat (input, "seed %wd", &vm->random_seed)) + vm->random_seed = clib_cpu_time_now (); + clib_random_buffer_init (&vm->random_buffer, vm->random_seed); + + /* Initialize node graph. */ + if ((error = vlib_node_main_init (vm))) + { + /* Arrange for graph hook up error to not be fatal when debugging. */ + if (CLIB_DEBUG > 0) + clib_error_report (error); + else + goto done; + } + + /* See unix/main.c; most likely already set up */ + if (vm->init_functions_called == 0) + vm->init_functions_called = hash_create (0, /* value bytes */ 0); + if ((error = vlib_call_all_init_functions (vm))) + goto done; + + /* Create default buffer free list. */ + vlib_buffer_get_or_create_free_list (vm, + VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, + "default"); + + switch (clib_setjmp (&vm->main_loop_exit, VLIB_MAIN_LOOP_EXIT_NONE)) + { + case VLIB_MAIN_LOOP_EXIT_NONE: + vm->main_loop_exit_set = 1; + break; + + case VLIB_MAIN_LOOP_EXIT_CLI: + goto done; + + default: + error = vm->main_loop_error; + goto done; + } + + if ((error = vlib_call_all_config_functions (vm, input, 0 /* is_early */ ))) + goto done; + + /* Call all main loop enter functions. */ + { + clib_error_t *sub_error; + sub_error = vlib_call_all_main_loop_enter_functions (vm); + if (sub_error) + clib_error_report (sub_error); + } + + vlib_main_loop (vm); + +done: + /* Call all exit functions. */ + { + clib_error_t *sub_error; + sub_error = vlib_call_all_main_loop_exit_functions (vm); + if (sub_error) + clib_error_report (sub_error); + } + + if (error) + clib_error_report (error); + + return 0; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/main.h b/src/vlib/main.h new file mode 100644 index 00000000000..d9ac1445ddd --- /dev/null +++ b/src/vlib/main.h @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * main.h: VLIB main data structure + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef included_vlib_main_h +#define included_vlib_main_h + +#include <vppinfra/elog.h> +#include <vppinfra/format.h> +#include <vppinfra/longjmp.h> +#include <vppinfra/pool.h> +#include <vppinfra/random_buffer.h> +#include <vppinfra/time.h> + +#include <pthread.h> + + +/* By default turn off node/error event logging. + Override with -DVLIB_ELOG_MAIN_LOOP */ +#ifndef VLIB_ELOG_MAIN_LOOP +#define VLIB_ELOG_MAIN_LOOP 0 +#endif + +typedef struct vlib_main_t +{ + /* Instruction level timing state. */ + clib_time_t clib_time; + + /* Time stamp of last node dispatch. */ + u64 cpu_time_last_node_dispatch; + + /* Time stamp when main loop was entered (time 0). */ + u64 cpu_time_main_loop_start; + + /* Incremented once for each main loop. */ + u32 main_loop_count; + + /* Count of vectors processed this main loop. */ + u32 main_loop_vectors_processed; + u32 main_loop_nodes_processed; + + /* Circular buffer of input node vector counts. + Indexed by low bits of + (main_loop_count >> VLIB_LOG2_INPUT_VECTORS_PER_MAIN_LOOP). */ + u32 vector_counts_per_main_loop[2]; + u32 node_counts_per_main_loop[2]; + + /* Every so often we switch to the next counter. */ +#define VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE 7 + + /* Jump target to exit main loop with given code. */ + u32 main_loop_exit_set; + clib_longjmp_t main_loop_exit; +#define VLIB_MAIN_LOOP_EXIT_NONE 0 +#define VLIB_MAIN_LOOP_EXIT_PANIC 1 + /* Exit via CLI. */ +#define VLIB_MAIN_LOOP_EXIT_CLI 2 + + /* Error marker to use when exiting main loop. */ + clib_error_t *main_loop_error; + + /* Name for e.g. syslog. */ + char *name; + + /* Start and size of CLIB heap. */ + void *heap_base; + uword heap_size; + + vlib_buffer_main_t *buffer_main; + + vlib_physmem_main_t physmem_main; + + /* Allocate/free buffer memory for DMA transfers, descriptor rings, etc. + buffer memory is guaranteed to be cache-aligned. 
*/ + void *(*os_physmem_alloc_aligned) (vlib_physmem_main_t * pm, + uword n_bytes, uword alignment); + void (*os_physmem_free) (void *x); + + /* Node graph main structure. */ + vlib_node_main_t node_main; + + /* Command line interface. */ + vlib_cli_main_t cli_main; + + /* Packet trace buffer. */ + vlib_trace_main_t trace_main; + + /* Error handling. */ + vlib_error_main_t error_main; + + /* Punt packets to underlying operating system for when fast switching + code does not know what to do. */ + void (*os_punt_frame) (struct vlib_main_t * vm, + struct vlib_node_runtime_t * node, + vlib_frame_t * frame); + + /* Multicast distribution. Set to zero for MC disabled. */ + mc_main_t *mc_main; + + /* Stream index to use for distribution when MC is enabled. */ + u32 mc_stream_index; + + vlib_one_time_waiting_process_t *procs_waiting_for_mc_stream_join; + + /* Event logger. */ + elog_main_t elog_main; + + /* Node call and return event types. */ + elog_event_type_t *node_call_elog_event_types; + elog_event_type_t *node_return_elog_event_types; + + elog_event_type_t *error_elog_event_types; + + /* Seed for random number generator. */ + uword random_seed; + + /* Buffer of random data for various uses. */ + clib_random_buffer_t random_buffer; + + /* Hash table to record which init functions have been called. */ + uword *init_functions_called; + + /* to compare with node runtime */ + u32 cpu_index; + + void **mbuf_alloc_list; + + /* List of init functions to call, setup by constructors */ + _vlib_init_function_list_elt_t *init_function_registrations; + _vlib_init_function_list_elt_t *main_loop_enter_function_registrations; + _vlib_init_function_list_elt_t *main_loop_exit_function_registrations; + _vlib_init_function_list_elt_t *api_init_function_registrations; + vlib_config_function_runtime_t *config_function_registrations; + mc_serialize_msg_t *mc_msg_registrations; /* mc_main is a pointer... 
*/ + + /* control-plane API queue signal pending, length indication */ + volatile u32 queue_signal_pending; + volatile u32 api_queue_nonempty; + void (*queue_signal_callback) (struct vlib_main_t *); + u8 **argv; +} vlib_main_t; + +/* Global main structure. */ +extern vlib_main_t vlib_global_main; + +always_inline f64 +vlib_time_now (vlib_main_t * vm) +{ + return clib_time_now (&vm->clib_time); +} + +always_inline f64 +vlib_time_now_ticks (vlib_main_t * vm, u64 n) +{ + return clib_time_now_internal (&vm->clib_time, n); +} + +/* Busy wait for specified time. */ +always_inline void +vlib_time_wait (vlib_main_t * vm, f64 wait) +{ + f64 t = vlib_time_now (vm); + f64 limit = t + wait; + while (t < limit) + t = vlib_time_now (vm); +} + +/* Time a piece of code. */ +#define vlib_time_code(vm,body) \ +do { \ + f64 _t[2]; \ + _t[0] = vlib_time_now (vm); \ + do { body; } while (0); \ + _t[1] = vlib_time_now (vm); \ + clib_warning ("%.7e", _t[1] - _t[0]); \ +} while (0) + +#define vlib_wait_with_timeout(vm,suspend_time,timeout_time,test) \ +({ \ + uword __vlib_wait_with_timeout = 0; \ + f64 __vlib_wait_time = 0; \ + while (! (__vlib_wait_with_timeout = (test)) \ + && __vlib_wait_time < (timeout_time)) \ + { \ + vlib_process_suspend (vm, suspend_time); \ + __vlib_wait_time += suspend_time; \ + } \ + __vlib_wait_with_timeout; \ +}) + +always_inline void +vlib_panic_with_error (vlib_main_t * vm, clib_error_t * error) +{ + vm->main_loop_error = error; + clib_longjmp (&vm->main_loop_exit, VLIB_MAIN_LOOP_EXIT_PANIC); +} + +#define vlib_panic_with_msg(vm,args...) 
\ + vlib_panic_with_error (vm, clib_error_return (0, args)) + +always_inline void +vlib_panic (vlib_main_t * vm) +{ + vlib_panic_with_error (vm, 0); +} + +always_inline u32 +vlib_vector_input_stats_index (vlib_main_t * vm, word delta) +{ + u32 i; + i = vm->main_loop_count >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE; + ASSERT (is_pow2 (ARRAY_LEN (vm->vector_counts_per_main_loop))); + return (i + delta) & (ARRAY_LEN (vm->vector_counts_per_main_loop) - 1); +} + +/* Estimate input rate based on previous + 2^VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE + samples. */ +always_inline u32 +vlib_last_vectors_per_main_loop (vlib_main_t * vm) +{ + u32 i = vlib_vector_input_stats_index (vm, -1); + u32 n = vm->vector_counts_per_main_loop[i]; + return n >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE; +} + +/* Total ave vector count per iteration of main loop. */ +always_inline f64 +vlib_last_vectors_per_main_loop_as_f64 (vlib_main_t * vm) +{ + u32 i = vlib_vector_input_stats_index (vm, -1); + u32 v = vm->vector_counts_per_main_loop[i]; + return (f64) v / (f64) (1 << VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE); +} + +/* Total ave vectors/node count per iteration of main loop. */ +always_inline f64 +vlib_last_vector_length_per_node (vlib_main_t * vm) +{ + u32 i = vlib_vector_input_stats_index (vm, -1); + u32 v = vm->vector_counts_per_main_loop[i]; + u32 n = vm->node_counts_per_main_loop[i]; + return n == 0 ? 0 : (f64) v / (f64) n; +} + +extern u32 wraps; + +always_inline void +vlib_increment_main_loop_counter (vlib_main_t * vm) +{ + u32 i, c, n, v, is_wrap; + + c = vm->main_loop_count++; + + is_wrap = (c & pow2_mask (VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE)) == 0; + + if (is_wrap) + wraps++; + + i = vlib_vector_input_stats_index (vm, /* delta */ is_wrap); + + v = is_wrap ? 0 : vm->vector_counts_per_main_loop[i]; + n = is_wrap ? 
0 : vm->node_counts_per_main_loop[i]; + + v += vm->main_loop_vectors_processed; + n += vm->main_loop_nodes_processed; + vm->main_loop_vectors_processed = 0; + vm->main_loop_nodes_processed = 0; + vm->vector_counts_per_main_loop[i] = v; + vm->node_counts_per_main_loop[i] = n; +} + +always_inline void vlib_set_queue_signal_callback + (vlib_main_t * vm, void (*fp) (vlib_main_t *)) +{ + vm->queue_signal_callback = fp; +} + +/* Main routine. */ +int vlib_main (vlib_main_t * vm, unformat_input_t * input); + +/* Thread stacks, for os_get_cpu_number */ +extern u8 **vlib_thread_stacks; + +/* Number of thread stacks that the application needs */ +u32 vlib_app_num_thread_stacks_needed (void) __attribute__ ((weak)); + +extern void vlib_node_sync_stats (vlib_main_t * vm, vlib_node_t * n); + +#endif /* included_vlib_main_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/mc.c b/src/vlib/mc.c new file mode 100644 index 00000000000..8fde091389e --- /dev/null +++ b/src/vlib/mc.c @@ -0,0 +1,2609 @@ +/* + * mc.c: vlib reliable sequenced multicast distributed applications + * + * Copyright (c) 2010 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vlib/vlib.h> + +/* + * 1 to enable msg id training wheels, which are useful for tracking + * down catchup and/or partitioned network problems + */ +#define MSG_ID_DEBUG 0 + +static format_function_t format_mc_stream_state; + +static u32 +elog_id_for_peer_id (mc_main_t * m, u64 peer_id) +{ + uword *p, r; + mhash_t *h = &m->elog_id_by_peer_id; + + if (!m->elog_id_by_peer_id.hash) + mhash_init (h, sizeof (uword), sizeof (mc_peer_id_t)); + + p = mhash_get (h, &peer_id); + if (p) + return p[0]; + r = elog_string (m->elog_main, "%U", m->transport.format_peer_id, peer_id); + mhash_set (h, &peer_id, r, /* old_value */ 0); + return r; +} + +static u32 +elog_id_for_msg_name (mc_main_t * m, char *msg_name) +{ + uword *p, r; + uword *h = m->elog_id_by_msg_name; + u8 *name_copy; + + if (!h) + h = m->elog_id_by_msg_name = hash_create_string (0, sizeof (uword)); + + p = hash_get_mem (h, msg_name); + if (p) + return p[0]; + r = elog_string (m->elog_main, "%s", msg_name); + + name_copy = format (0, "%s%c", msg_name, 0); + + hash_set_mem (h, name_copy, r); + m->elog_id_by_msg_name = h; + + return r; +} + +static void +elog_tx_msg (mc_main_t * m, u32 stream_id, u32 local_sequence, + u32 retry_count) +{ + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "tx-msg: stream %d local seq %d attempt %d", + .format_args = "i4i4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 stream_id, local_sequence, retry_count; + } *ed; + ed = ELOG_DATA (m->elog_main, e); + ed->stream_id = stream_id; + ed->local_sequence = local_sequence; + ed->retry_count = retry_count; + } +} + +/* + * seq_cmp + * correctly compare two unsigned sequence numbers. + * This function works so long as x and y are within 2**(n-1) of each + * other, where n = bits(x, y). 
+ * + * Magic decoder ring: + * seq_cmp == 0 => x and y are equal + * seq_cmp < 0 => x is "in the past" with respect to y + * seq_cmp > 0 => x is "in the future" with respect to y + */ +always_inline i32 +mc_seq_cmp (u32 x, u32 y) +{ + return (i32) x - (i32) y; +} + +void * +mc_get_vlib_buffer (vlib_main_t * vm, u32 n_bytes, u32 * bi_return) +{ + u32 n_alloc, bi; + vlib_buffer_t *b; + + n_alloc = vlib_buffer_alloc (vm, &bi, 1); + ASSERT (n_alloc == 1); + + b = vlib_get_buffer (vm, bi); + b->current_length = n_bytes; + *bi_return = bi; + return (void *) b->data; +} + +static void +delete_peer_with_index (mc_main_t * mcm, mc_stream_t * s, + uword index, int notify_application) +{ + mc_stream_peer_t *p = pool_elt_at_index (s->peers, index); + ASSERT (p != 0); + if (s->config.peer_died && notify_application) + s->config.peer_died (mcm, s, p->id); + + s->all_peer_bitmap = clib_bitmap_andnoti (s->all_peer_bitmap, p - s->peers); + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "delete peer %s from all_peer_bitmap", + .format_args = "T4", + }; + /* *INDENT-ON* */ + struct + { + u32 peer; + } *ed = 0; + + ed = ELOG_DATA (mcm->elog_main, e); + ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64); + } + /* Do not delete the pool / hash table entries, or we lose sequence number state */ +} + +static mc_stream_peer_t * +get_or_create_peer_with_id (mc_main_t * mcm, + mc_stream_t * s, mc_peer_id_t id, int *created) +{ + uword *q = mhash_get (&s->peer_index_by_id, &id); + mc_stream_peer_t *p; + + if (q) + { + p = pool_elt_at_index (s->peers, q[0]); + goto done; + } + + pool_get (s->peers, p); + memset (p, 0, sizeof (p[0])); + p->id = id; + p->last_sequence_received = ~0; + mhash_set (&s->peer_index_by_id, &id, p - s->peers, /* old_value */ 0); + if (created) + *created = 1; + +done: + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "get_or_create %s peer %s stream %d seq %d", + .format_args 
= "t4T4i4i4", + .n_enum_strings = 2, + .enum_strings = { + "old", "new", + }, + }; + /* *INDENT-ON* */ + struct + { + u32 is_new, peer, stream_index, rx_sequence; + } *ed = 0; + + ed = ELOG_DATA (mcm->elog_main, e); + ed->is_new = q ? 0 : 1; + ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64); + ed->stream_index = s->index; + ed->rx_sequence = p->last_sequence_received; + } + /* $$$$ Enable or reenable this peer */ + s->all_peer_bitmap = clib_bitmap_ori (s->all_peer_bitmap, p - s->peers); + return p; +} + +static void +maybe_send_window_open_event (vlib_main_t * vm, mc_stream_t * stream) +{ + vlib_one_time_waiting_process_t *p; + + if (pool_elts (stream->retry_pool) >= stream->config.window_size) + return; + + vec_foreach (p, stream->procs_waiting_for_open_window) + vlib_signal_one_time_waiting_process (vm, p); + + if (stream->procs_waiting_for_open_window) + _vec_len (stream->procs_waiting_for_open_window) = 0; +} + +static void +mc_retry_free (mc_main_t * mcm, mc_stream_t * s, mc_retry_t * r) +{ + mc_retry_t record, *retp; + + if (r->unacked_by_peer_bitmap) + _vec_len (r->unacked_by_peer_bitmap) = 0; + + if (clib_fifo_elts (s->retired_fifo) >= 2 * s->config.window_size) + { + clib_fifo_sub1 (s->retired_fifo, record); + vlib_buffer_free_one (mcm->vlib_main, record.buffer_index); + } + + clib_fifo_add2 (s->retired_fifo, retp); + + retp->buffer_index = r->buffer_index; + retp->local_sequence = r->local_sequence; + + r->buffer_index = ~0; /* poison buffer index in this retry */ +} + +static void +mc_resend_retired (mc_main_t * mcm, mc_stream_t * s, u32 local_sequence) +{ + mc_retry_t *retry; + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "resend-retired: search for local seq %d", + .format_args = "i4", + }; + /* *INDENT-ON* */ + struct + { + u32 local_sequence; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->local_sequence = local_sequence; + } + + /* *INDENT-OFF* */ + clib_fifo_foreach (retry, s->retired_fifo, 
+ ({ + if (retry->local_sequence == local_sequence) + { + elog_tx_msg (mcm, s->index, retry-> local_sequence, -13); + mcm->transport.tx_buffer (mcm->transport.opaque, + MC_TRANSPORT_USER_REQUEST_TO_RELAY, + retry->buffer_index); + return; + } + })); + /* *INDENT-ON* */ + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "resend-retired: FAILED search for local seq %d", + .format_args = "i4", + }; + /* *INDENT-ON* */ + struct + { + u32 local_sequence; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->local_sequence = local_sequence; + } +} + +static uword * +delete_retry_fifo_elt (mc_main_t * mcm, + mc_stream_t * stream, + mc_retry_t * r, uword * dead_peer_bitmap) +{ + mc_stream_peer_t *p; + + /* *INDENT-OFF* */ + pool_foreach (p, stream->peers, ({ + uword pi = p - stream->peers; + uword is_alive = 0 == clib_bitmap_get (r->unacked_by_peer_bitmap, pi); + + if (! is_alive) + dead_peer_bitmap = clib_bitmap_ori (dead_peer_bitmap, pi); + + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "delete_retry_fifo_elt: peer %s is %s", + .format_args = "T4t4", + .n_enum_strings = 2, + .enum_strings = { "alive", "dead", }, + }; + struct { u32 peer, is_alive; } * ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64); + ed->is_alive = is_alive; + } + })); + /* *INDENT-ON* */ + + hash_unset (stream->retry_index_by_local_sequence, r->local_sequence); + mc_retry_free (mcm, stream, r); + + return dead_peer_bitmap; +} + +always_inline mc_retry_t * +prev_retry (mc_stream_t * s, mc_retry_t * r) +{ + return (r->prev_index != ~0 + ? pool_elt_at_index (s->retry_pool, r->prev_index) : 0); +} + +always_inline mc_retry_t * +next_retry (mc_stream_t * s, mc_retry_t * r) +{ + return (r->next_index != ~0 + ? 
pool_elt_at_index (s->retry_pool, r->next_index) : 0); +} + +always_inline void +remove_retry_from_pool (mc_stream_t * s, mc_retry_t * r) +{ + mc_retry_t *p = prev_retry (s, r); + mc_retry_t *n = next_retry (s, r); + + if (p) + p->next_index = r->next_index; + else + s->retry_head_index = r->next_index; + if (n) + n->prev_index = r->prev_index; + else + s->retry_tail_index = r->prev_index; + + pool_put_index (s->retry_pool, r - s->retry_pool); +} + +static void +check_retry (mc_main_t * mcm, mc_stream_t * s) +{ + mc_retry_t *r; + vlib_main_t *vm = mcm->vlib_main; + f64 now = vlib_time_now (vm); + uword *dead_peer_bitmap = 0; + u32 ri, ri_next; + + for (ri = s->retry_head_index; ri != ~0; ri = ri_next) + { + r = pool_elt_at_index (s->retry_pool, ri); + ri_next = r->next_index; + + if (now < r->sent_at + s->config.retry_interval) + continue; + + r->n_retries += 1; + if (r->n_retries > s->config.retry_limit) + { + dead_peer_bitmap = + delete_retry_fifo_elt (mcm, s, r, dead_peer_bitmap); + remove_retry_from_pool (s, r); + } + else + { + if (MC_EVENT_LOGGING > 0) + { + mc_stream_peer_t *p; + + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "resend local seq %d attempt %d", + .format_args = "i4i4", + }; + /* *INDENT-ON* */ + + /* *INDENT-OFF* */ + pool_foreach (p, s->peers, ({ + if (clib_bitmap_get (r->unacked_by_peer_bitmap, p - s->peers)) + { + ELOG_TYPE_DECLARE (ev) = { + .format = "resend: needed by peer %s local seq %d", + .format_args = "T4i4", + }; + struct { u32 peer, rx_sequence; } * ed; + ed = ELOG_DATA (mcm->elog_main, ev); + ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64); + ed->rx_sequence = r->local_sequence; + } + })); + /* *INDENT-ON* */ + + struct + { + u32 sequence; + u32 trail; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->sequence = r->local_sequence; + ed->trail = r->n_retries; + } + + r->sent_at = vlib_time_now (vm); + s->stats.n_retries += 1; + + elog_tx_msg (mcm, s->index, r->local_sequence, r->n_retries); + + 
mcm->transport.tx_buffer + (mcm->transport.opaque, + MC_TRANSPORT_USER_REQUEST_TO_RELAY, r->buffer_index); + } + } + + maybe_send_window_open_event (mcm->vlib_main, s); + + /* Delete any dead peers we've found. */ + if (!clib_bitmap_is_zero (dead_peer_bitmap)) + { + uword i; + + /* *INDENT-OFF* */ + clib_bitmap_foreach (i, dead_peer_bitmap, ({ + delete_peer_with_index (mcm, s, i, /* notify_application */ 1); + + /* Delete any references to just deleted peer in retry pool. */ + pool_foreach (r, s->retry_pool, ({ + r->unacked_by_peer_bitmap = + clib_bitmap_andnoti (r->unacked_by_peer_bitmap, i); + })); + })); +/* *INDENT-ON* */ + clib_bitmap_free (dead_peer_bitmap); + } +} + +always_inline mc_main_t * +mc_node_get_main (vlib_node_runtime_t * node) +{ + mc_main_t **p = (void *) node->runtime_data; + return p[0]; +} + +static uword +mc_retry_process (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * f) +{ + mc_main_t *mcm = mc_node_get_main (node); + mc_stream_t *s; + + while (1) + { + vlib_process_suspend (vm, 1.0); + vec_foreach (s, mcm->stream_vector) + { + if (s->state != MC_STREAM_STATE_invalid) + check_retry (mcm, s); + } + } + return 0; /* not likely */ +} + +static void +send_join_or_leave_request (mc_main_t * mcm, u32 stream_index, u32 is_join) +{ + vlib_main_t *vm = mcm->vlib_main; + mc_msg_join_or_leave_request_t *mp; + u32 bi; + + mp = mc_get_vlib_buffer (vm, sizeof (mp[0]), &bi); + memset (mp, 0, sizeof (*mp)); + mp->type = MC_MSG_TYPE_join_or_leave_request; + mp->peer_id = mcm->transport.our_ack_peer_id; + mp->stream_index = stream_index; + mp->is_join = is_join; + + mc_byte_swap_msg_join_or_leave_request (mp); + + /* + * These msgs are unnumbered, unordered so send on the from-relay + * channel. 
+ */ + mcm->transport.tx_buffer (mcm->transport.opaque, MC_TRANSPORT_JOIN, bi); +} + +static uword +mc_join_ager_process (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * f) +{ + mc_main_t *mcm = mc_node_get_main (node); + + while (1) + { + if (mcm->joins_in_progress) + { + mc_stream_t *s; + vlib_one_time_waiting_process_t *p; + f64 now = vlib_time_now (vm); + + vec_foreach (s, mcm->stream_vector) + { + if (s->state != MC_STREAM_STATE_join_in_progress) + continue; + + if (now > s->join_timeout) + { + s->state = MC_STREAM_STATE_ready; + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "stream %d join timeout", + }; + /* *INDENT-ON* */ + ELOG (mcm->elog_main, e, s->index); + } + /* Make sure that this app instance exists as a stream peer, + or we may answer a catchup request with a NULL + all_peer_bitmap... */ + (void) get_or_create_peer_with_id + (mcm, s, mcm->transport.our_ack_peer_id, /* created */ 0); + + vec_foreach (p, s->procs_waiting_for_join_done) + vlib_signal_one_time_waiting_process (vm, p); + if (s->procs_waiting_for_join_done) + _vec_len (s->procs_waiting_for_join_done) = 0; + + mcm->joins_in_progress--; + ASSERT (mcm->joins_in_progress >= 0); + } + else + { + /* Resent join request which may have been lost. 
*/ + send_join_or_leave_request (mcm, s->index, 1 /* is_join */ ); + + /* We're *not* alone, retry for as long as it takes */ + if (mcm->relay_state == MC_RELAY_STATE_SLAVE) + s->join_timeout = vlib_time_now (vm) + 2.0; + + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "stream %d resend join request", + }; + /* *INDENT-ON* */ + ELOG (mcm->elog_main, e, s->index); + } + } + } + } + + vlib_process_suspend (vm, .5); + } + + return 0; /* not likely */ +} + +static void +serialize_mc_register_stream_name (serialize_main_t * m, va_list * va) +{ + char *name = va_arg (*va, char *); + serialize_cstring (m, name); +} + +static void +elog_stream_name (char *buf, int n_buf_bytes, char *v) +{ + clib_memcpy (buf, v, clib_min (n_buf_bytes - 1, vec_len (v))); + buf[n_buf_bytes - 1] = 0; +} + +static void +unserialize_mc_register_stream_name (serialize_main_t * m, va_list * va) +{ + mc_main_t *mcm = va_arg (*va, mc_main_t *); + char *name; + mc_stream_t *s; + uword *p; + + unserialize_cstring (m, &name); + + if ((p = hash_get_mem (mcm->stream_index_by_name, name))) + { + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "stream index %d already named %s", + .format_args = "i4s16", + }; + /* *INDENT-ON* */ + struct + { + u32 stream_index; + char name[16]; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->stream_index = p[0]; + elog_stream_name (ed->name, sizeof (ed->name), name); + } + + vec_free (name); + return; + } + + vec_add2 (mcm->stream_vector, s, 1); + mc_stream_init (s); + s->state = MC_STREAM_STATE_name_known; + s->index = s - mcm->stream_vector; + s->config.name = name; + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "stream index %d named %s", + .format_args = "i4s16", + }; + /* *INDENT-ON* */ + struct + { + u32 stream_index; + char name[16]; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->stream_index = s->index; + 
elog_stream_name (ed->name, sizeof (ed->name), name); + } + + hash_set_mem (mcm->stream_index_by_name, name, s->index); + + p = hash_get (mcm->procs_waiting_for_stream_name_by_name, name); + if (p) + { + vlib_one_time_waiting_process_t *wp, **w; + w = pool_elt_at_index (mcm->procs_waiting_for_stream_name_pool, p[0]); + vec_foreach (wp, w[0]) + vlib_signal_one_time_waiting_process (mcm->vlib_main, wp); + pool_put (mcm->procs_waiting_for_stream_name_pool, w); + hash_unset_mem (mcm->procs_waiting_for_stream_name_by_name, name); + } +} + +/* *INDENT-OFF* */ +MC_SERIALIZE_MSG (mc_register_stream_name_msg, static) = +{ + .name = "mc_register_stream_name", + .serialize = serialize_mc_register_stream_name, + .unserialize = unserialize_mc_register_stream_name, +}; +/* *INDENT-ON* */ + +void +mc_rx_buffer_unserialize (mc_main_t * mcm, + mc_stream_t * stream, + mc_peer_id_t peer_id, u32 buffer_index) +{ + return mc_unserialize (mcm, stream, buffer_index); +} + +static u8 * +mc_internal_catchup_snapshot (mc_main_t * mcm, + u8 * data_vector, + u32 last_global_sequence_processed) +{ + serialize_main_t m; + + /* Append serialized data to data vector. */ + serialize_open_vector (&m, data_vector); + m.stream.current_buffer_index = vec_len (data_vector); + + serialize (&m, serialize_mc_main, mcm); + return serialize_close_vector (&m); +} + +static void +mc_internal_catchup (mc_main_t * mcm, u8 * data, u32 n_data_bytes) +{ + serialize_main_t s; + + unserialize_open_data (&s, data, n_data_bytes); + + unserialize (&s, unserialize_mc_main, mcm); +} + +/* Overridden from the application layer, not actually used here */ +void mc_stream_join_process_hold (void) __attribute__ ((weak)); +void +mc_stream_join_process_hold (void) +{ +} + +static u32 +mc_stream_join_helper (mc_main_t * mcm, + mc_stream_config_t * config, u32 is_internal) +{ + mc_stream_t *s; + vlib_main_t *vm = mcm->vlib_main; + + s = 0; + if (!is_internal) + { + uword *p; + + /* Already have a stream with given name? 
*/ + if ((s = mc_stream_by_name (mcm, config->name))) + { + /* Already joined and ready? */ + if (s->state == MC_STREAM_STATE_ready) + return s->index; + } + + /* First join MC internal stream. */ + if (!mcm->stream_vector + || (mcm->stream_vector[MC_STREAM_INDEX_INTERNAL].state + == MC_STREAM_STATE_invalid)) + { + static mc_stream_config_t c = { + .name = "mc-internal", + .rx_buffer = mc_rx_buffer_unserialize, + .catchup = mc_internal_catchup, + .catchup_snapshot = mc_internal_catchup_snapshot, + }; + + c.save_snapshot = config->save_snapshot; + + mc_stream_join_helper (mcm, &c, /* is_internal */ 1); + } + + /* If stream is still unknown register this name and wait for + sequenced message to name stream. This way all peers agree + on stream name to index mappings. */ + s = mc_stream_by_name (mcm, config->name); + if (!s) + { + vlib_one_time_waiting_process_t *wp, **w; + u8 *name_copy = format (0, "%s", config->name); + + mc_serialize_stream (mcm, + MC_STREAM_INDEX_INTERNAL, + &mc_register_stream_name_msg, config->name); + + /* Wait for this stream to be named. */ + p = + hash_get_mem (mcm->procs_waiting_for_stream_name_by_name, + name_copy); + if (p) + w = + pool_elt_at_index (mcm->procs_waiting_for_stream_name_pool, + p[0]); + else + { + pool_get (mcm->procs_waiting_for_stream_name_pool, w); + if (!mcm->procs_waiting_for_stream_name_by_name) + mcm->procs_waiting_for_stream_name_by_name = hash_create_string ( /* elts */ 0, /* value size */ + sizeof + (uword)); + hash_set_mem (mcm->procs_waiting_for_stream_name_by_name, + name_copy, + w - mcm->procs_waiting_for_stream_name_pool); + w[0] = 0; + } + + vec_add2 (w[0], wp, 1); + vlib_current_process_wait_for_one_time_event (vm, wp); + vec_free (name_copy); + } + + /* Name should be known now. 
*/ + s = mc_stream_by_name (mcm, config->name); + ASSERT (s != 0); + ASSERT (s->state == MC_STREAM_STATE_name_known); + } + + if (!s) + { + vec_add2 (mcm->stream_vector, s, 1); + mc_stream_init (s); + s->index = s - mcm->stream_vector; + } + + { + /* Save name since we could have already used it as hash key. */ + char *name_save = s->config.name; + + s->config = config[0]; + + if (name_save) + s->config.name = name_save; + } + + if (s->config.window_size == 0) + s->config.window_size = 8; + + if (s->config.retry_interval == 0.0) + s->config.retry_interval = 1.0; + + /* Sanity. */ + ASSERT (s->config.retry_interval < 30); + + if (s->config.retry_limit == 0) + s->config.retry_limit = 7; + + s->state = MC_STREAM_STATE_join_in_progress; + if (!s->peer_index_by_id.hash) + mhash_init (&s->peer_index_by_id, sizeof (uword), sizeof (mc_peer_id_t)); + + /* If we don't hear from someone in 5 seconds, we're alone */ + s->join_timeout = vlib_time_now (vm) + 5.0; + mcm->joins_in_progress++; + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "stream index %d join request %s", + .format_args = "i4s16", + }; + /* *INDENT-ON* */ + struct + { + u32 stream_index; + char name[16]; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->stream_index = s->index; + elog_stream_name (ed->name, sizeof (ed->name), s->config.name); + } + + send_join_or_leave_request (mcm, s->index, 1 /* join */ ); + + vlib_current_process_wait_for_one_time_event_vector + (vm, &s->procs_waiting_for_join_done); + + if (MC_EVENT_LOGGING) + { + ELOG_TYPE (e, "join complete stream %d"); + ELOG (mcm->elog_main, e, s->index); + } + + return s->index; +} + +u32 +mc_stream_join (mc_main_t * mcm, mc_stream_config_t * config) +{ + return mc_stream_join_helper (mcm, config, /* is_internal */ 0); +} + +void +mc_stream_leave (mc_main_t * mcm, u32 stream_index) +{ + mc_stream_t *s = mc_stream_by_index (mcm, stream_index); + + if (!s) + return; + + if (MC_EVENT_LOGGING) + { + /* 
*INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "leave-stream: %d",.format_args = "i4", + }; + /* *INDENT-ON* */ + struct + { + u32 index; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->index = stream_index; + } + + send_join_or_leave_request (mcm, stream_index, 0 /* is_join */ ); + mc_stream_free (s); + s->state = MC_STREAM_STATE_name_known; +} + +void +mc_msg_join_or_leave_request_handler (mc_main_t * mcm, + mc_msg_join_or_leave_request_t * req, + u32 buffer_index) +{ + mc_stream_t *s; + mc_msg_join_reply_t *rep; + u32 bi; + + mc_byte_swap_msg_join_or_leave_request (req); + + s = mc_stream_by_index (mcm, req->stream_index); + if (!s || s->state != MC_STREAM_STATE_ready) + return; + + /* If the peer is joining, create it */ + if (req->is_join) + { + mc_stream_t *this_s; + + /* We're not in a position to catch up a peer until all + stream joins are complete. */ + if (0) + { + /* XXX This is hard to test so we've. */ + vec_foreach (this_s, mcm->stream_vector) + { + if (this_s->state != MC_STREAM_STATE_ready + && this_s->state != MC_STREAM_STATE_name_known) + return; + } + } + else if (mcm->joins_in_progress > 0) + return; + + (void) get_or_create_peer_with_id (mcm, s, req->peer_id, + /* created */ 0); + + rep = mc_get_vlib_buffer (mcm->vlib_main, sizeof (rep[0]), &bi); + memset (rep, 0, sizeof (rep[0])); + rep->type = MC_MSG_TYPE_join_reply; + rep->stream_index = req->stream_index; + + mc_byte_swap_msg_join_reply (rep); + /* These two are already in network byte order... 
*/ + rep->peer_id = mcm->transport.our_ack_peer_id; + rep->catchup_peer_id = mcm->transport.our_catchup_peer_id; + + mcm->transport.tx_buffer (mcm->transport.opaque, MC_TRANSPORT_JOIN, bi); + } + else + { + if (s->config.peer_died) + s->config.peer_died (mcm, s, req->peer_id); + } +} + +void +mc_msg_join_reply_handler (mc_main_t * mcm, + mc_msg_join_reply_t * mp, u32 buffer_index) +{ + mc_stream_t *s; + + mc_byte_swap_msg_join_reply (mp); + + s = mc_stream_by_index (mcm, mp->stream_index); + + if (!s || s->state != MC_STREAM_STATE_join_in_progress) + return; + + /* Switch to catchup state; next join reply + for this stream will be ignored. */ + s->state = MC_STREAM_STATE_catchup; + + mcm->joins_in_progress--; + mcm->transport.catchup_request_fun (mcm->transport.opaque, + mp->stream_index, mp->catchup_peer_id); +} + +void +mc_wait_for_stream_ready (mc_main_t * m, char *stream_name) +{ + mc_stream_t *s; + + while (1) + { + s = mc_stream_by_name (m, stream_name); + if (s) + break; + vlib_process_suspend (m->vlib_main, .1); + } + + /* It's OK to send a message in catchup and ready states. */ + if (s->state == MC_STREAM_STATE_catchup + || s->state == MC_STREAM_STATE_ready) + return; + + /* Otherwise we are waiting for a join to finish. 
*/ + vlib_current_process_wait_for_one_time_event_vector + (m->vlib_main, &s->procs_waiting_for_join_done); +} + +u32 +mc_stream_send (mc_main_t * mcm, u32 stream_index, u32 buffer_index) +{ + mc_stream_t *s = mc_stream_by_index (mcm, stream_index); + vlib_main_t *vm = mcm->vlib_main; + mc_retry_t *r; + mc_msg_user_request_t *mp; + vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index); + u32 ri; + + if (!s) + return 0; + + if (s->state != MC_STREAM_STATE_ready) + vlib_current_process_wait_for_one_time_event_vector + (vm, &s->procs_waiting_for_join_done); + + while (pool_elts (s->retry_pool) >= s->config.window_size) + { + vlib_current_process_wait_for_one_time_event_vector + (vm, &s->procs_waiting_for_open_window); + } + + pool_get (s->retry_pool, r); + ri = r - s->retry_pool; + + r->prev_index = s->retry_tail_index; + r->next_index = ~0; + s->retry_tail_index = ri; + + if (r->prev_index == ~0) + s->retry_head_index = ri; + else + { + mc_retry_t *p = pool_elt_at_index (s->retry_pool, r->prev_index); + p->next_index = ri; + } + + vlib_buffer_advance (b, -sizeof (mp[0])); + mp = vlib_buffer_get_current (b); + + mp->peer_id = mcm->transport.our_ack_peer_id; + /* mp->transport.global_sequence set by relay agent. */ + mp->global_sequence = 0xdeadbeef; + mp->stream_index = s->index; + mp->local_sequence = s->our_local_sequence++; + mp->n_data_bytes = + vlib_buffer_index_length_in_chain (vm, buffer_index) - sizeof (mp[0]); + + r->buffer_index = buffer_index; + r->local_sequence = mp->local_sequence; + r->sent_at = vlib_time_now (vm); + r->n_retries = 0; + + /* Retry will be freed when all currently known peers have acked. 
*/ + vec_validate (r->unacked_by_peer_bitmap, vec_len (s->all_peer_bitmap) - 1); + vec_copy (r->unacked_by_peer_bitmap, s->all_peer_bitmap); + + hash_set (s->retry_index_by_local_sequence, r->local_sequence, + r - s->retry_pool); + + elog_tx_msg (mcm, s->index, mp->local_sequence, r->n_retries); + + mc_byte_swap_msg_user_request (mp); + + mcm->transport.tx_buffer (mcm->transport.opaque, + MC_TRANSPORT_USER_REQUEST_TO_RELAY, buffer_index); + + s->user_requests_sent++; + + /* return amount of window remaining */ + return s->config.window_size - pool_elts (s->retry_pool); +} + +void +mc_msg_user_request_handler (mc_main_t * mcm, mc_msg_user_request_t * mp, + u32 buffer_index) +{ + vlib_main_t *vm = mcm->vlib_main; + mc_stream_t *s; + mc_stream_peer_t *peer; + i32 seq_cmp_result; + static int once = 0; + + mc_byte_swap_msg_user_request (mp); + + s = mc_stream_by_index (mcm, mp->stream_index); + + /* Not signed up for this stream? Turf-o-matic */ + if (!s || s->state != MC_STREAM_STATE_ready) + { + vlib_buffer_free_one (vm, buffer_index); + return; + } + + /* Find peer, including ourselves. 
*/ + peer = get_or_create_peer_with_id (mcm, s, mp->peer_id, + /* created */ 0); + + seq_cmp_result = mc_seq_cmp (mp->local_sequence, + peer->last_sequence_received + 1); + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "rx-msg: peer %s stream %d rx seq %d seq_cmp %d", + .format_args = "T4i4i4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 peer, stream_index, rx_sequence; + i32 seq_cmp_result; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64); + ed->stream_index = mp->stream_index; + ed->rx_sequence = mp->local_sequence; + ed->seq_cmp_result = seq_cmp_result; + } + + if (0 && mp->stream_index == 1 && once == 0) + { + once = 1; + ELOG_TYPE (e, "FAKE lost msg on stream 1"); + ELOG (mcm->elog_main, e, 0); + return; + } + + peer->last_sequence_received += seq_cmp_result == 0; + s->user_requests_received++; + + if (seq_cmp_result > 0) + peer->stats.n_msgs_from_future += 1; + + /* Send ack even if msg from future */ + if (1) + { + mc_msg_user_ack_t *rp; + u32 bi; + + rp = mc_get_vlib_buffer (vm, sizeof (rp[0]), &bi); + rp->peer_id = mcm->transport.our_ack_peer_id; + rp->stream_index = s->index; + rp->local_sequence = mp->local_sequence; + rp->seq_cmp_result = seq_cmp_result; + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "tx-ack: stream %d local seq %d", + .format_args = "i4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 stream_index; + u32 local_sequence; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->stream_index = rp->stream_index; + ed->local_sequence = rp->local_sequence; + } + + mc_byte_swap_msg_user_ack (rp); + + mcm->transport.tx_ack (mcm->transport.opaque, mp->peer_id, bi); + /* Msg from past? If so, free the buffer... 
*/ + if (seq_cmp_result < 0) + { + vlib_buffer_free_one (vm, buffer_index); + peer->stats.n_msgs_from_past += 1; + } + } + + if (seq_cmp_result == 0) + { + vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index); + switch (s->state) + { + case MC_STREAM_STATE_ready: + vlib_buffer_advance (b, sizeof (mp[0])); + s->config.rx_buffer (mcm, s, mp->peer_id, buffer_index); + + /* Stream vector can change address via rx callback for mc-internal + stream. */ + s = mc_stream_by_index (mcm, mp->stream_index); + ASSERT (s != 0); + s->last_global_sequence_processed = mp->global_sequence; + break; + + case MC_STREAM_STATE_catchup: + clib_fifo_add1 (s->catchup_fifo, buffer_index); + break; + + default: + clib_warning ("stream in unknown state %U", + format_mc_stream_state, s->state); + break; + } + } +} + +void +mc_msg_user_ack_handler (mc_main_t * mcm, mc_msg_user_ack_t * mp, + u32 buffer_index) +{ + vlib_main_t *vm = mcm->vlib_main; + uword *p; + mc_stream_t *s; + mc_stream_peer_t *peer; + mc_retry_t *r; + int peer_created = 0; + + mc_byte_swap_msg_user_ack (mp); + + s = mc_stream_by_index (mcm, mp->stream_index); + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "rx-ack: local seq %d peer %s seq_cmp_result %d", + .format_args = "i4T4i4", + }; + /* *INDENT-ON* */ + + struct + { + u32 local_sequence; + u32 peer; + i32 seq_cmp_result; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->local_sequence = mp->local_sequence; + ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64); + ed->seq_cmp_result = mp->seq_cmp_result; + } + + /* Unknown stream? */ + if (!s) + return; + + /* Find the peer which just ack'ed. */ + peer = get_or_create_peer_with_id (mcm, s, mp->peer_id, + /* created */ &peer_created); + + /* + * Peer reports message from the future. If it's not in the retry + * fifo, look for a retired message. 
+ */ + if (mp->seq_cmp_result > 0) + { + p = hash_get (s->retry_index_by_local_sequence, mp->local_sequence - + mp->seq_cmp_result); + if (p == 0) + mc_resend_retired (mcm, s, mp->local_sequence - mp->seq_cmp_result); + + /* Normal retry should fix it... */ + return; + } + + /* + * Pointer to the indicated retry fifo entry. + * Worth hashing because we could use a window size of 100 or 1000. + */ + p = hash_get (s->retry_index_by_local_sequence, mp->local_sequence); + + /* + * Is this a duplicate ACK, received after we've retired the + * fifo entry. This can happen when learning about new + * peers. + */ + if (p == 0) + { + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "ack: for seq %d from peer %s no fifo elt", + .format_args = "i4T4", + }; + /* *INDENT-ON* */ + + struct + { + u32 seq; + u32 peer; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->seq = mp->local_sequence; + ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64); + } + + return; + } + + r = pool_elt_at_index (s->retry_pool, p[0]); + + /* Make sure that this new peer ACKs our msgs from now on */ + if (peer_created) + { + mc_retry_t *later_retry = next_retry (s, r); + + while (later_retry) + { + later_retry->unacked_by_peer_bitmap = + clib_bitmap_ori (later_retry->unacked_by_peer_bitmap, + peer - s->peers); + later_retry = next_retry (s, later_retry); + } + } + + ASSERT (mp->local_sequence == r->local_sequence); + + /* If we weren't expecting to hear from this peer */ + if (!peer_created && + !clib_bitmap_get (r->unacked_by_peer_bitmap, peer - s->peers)) + { + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "dup-ack: for seq %d from peer %s", + .format_args = "i4T4", + }; + /* *INDENT-ON* */ + struct + { + u32 seq; + u32 peer; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->seq = r->local_sequence; + ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64); + } + if (!clib_bitmap_is_zero 
(r->unacked_by_peer_bitmap)) + return; + } + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "ack: for seq %d from peer %s", + .format_args = "i4T4", + }; + /* *INDENT-ON* */ + struct + { + u32 seq; + u32 peer; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->seq = mp->local_sequence; + ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64); + } + + r->unacked_by_peer_bitmap = + clib_bitmap_andnoti (r->unacked_by_peer_bitmap, peer - s->peers); + + /* Not all clients have ack'ed */ + if (!clib_bitmap_is_zero (r->unacked_by_peer_bitmap)) + { + return; + } + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "ack: retire fifo elt loc seq %d after %d acks", + .format_args = "i4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 seq; + u32 npeers; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->seq = r->local_sequence; + ed->npeers = pool_elts (s->peers); + } + + hash_unset (s->retry_index_by_local_sequence, mp->local_sequence); + mc_retry_free (mcm, s, r); + remove_retry_from_pool (s, r); + maybe_send_window_open_event (vm, s); +} + +#define EVENT_MC_SEND_CATCHUP_DATA 0 + +static uword +mc_catchup_process (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * f) +{ + mc_main_t *mcm = mc_node_get_main (node); + uword *event_data = 0; + mc_catchup_process_arg_t *args; + int i; + + while (1) + { + if (event_data) + _vec_len (event_data) = 0; + vlib_process_wait_for_event_with_type (vm, &event_data, + EVENT_MC_SEND_CATCHUP_DATA); + + for (i = 0; i < vec_len (event_data); i++) + { + args = pool_elt_at_index (mcm->catchup_process_args, event_data[i]); + + mcm->transport.catchup_send_fun (mcm->transport.opaque, + args->catchup_opaque, + args->catchup_snapshot); + + /* Send function will free snapshot data vector. 
*/ + pool_put (mcm->catchup_process_args, args); + } + } + + return 0; /* not likely */ +} + +/* Serialize a stream's peer set: peer count, then each peer's id and last_sequence_received, then all_peer_bitmap. */ +static void +serialize_mc_stream (serialize_main_t * m, va_list * va) +{ + mc_stream_t *s = va_arg (*va, mc_stream_t *); + mc_stream_peer_t *p; + + serialize_integer (m, pool_elts (s->peers), sizeof (u32)); + /* *INDENT-OFF* */ + pool_foreach (p, s->peers, ({ + u8 * x = serialize_get (m, sizeof (p->id)); + clib_memcpy (x, p->id.as_u8, sizeof (p->id)); + serialize_integer (m, p->last_sequence_received, + sizeof (p->last_sequence_received)); + })); +/* *INDENT-ON* */ + serialize_bitmap (m, s->all_peer_bitmap); +} + +/* Inverse of serialize_mc_stream: re-create the peer pool and the peer_index_by_id hash, then read back all_peer_bitmap. */ +void +unserialize_mc_stream (serialize_main_t * m, va_list * va) +{ + mc_stream_t *s = va_arg (*va, mc_stream_t *); + u32 i, n_peers; + mc_stream_peer_t *p; + + unserialize_integer (m, &n_peers, sizeof (u32)); + mhash_init (&s->peer_index_by_id, sizeof (uword), sizeof (mc_peer_id_t)); + for (i = 0; i < n_peers; i++) + { + u8 *x; + pool_get (s->peers, p); + /* pool_get does not clear the element; zero it so that fields which are not restored from the stream (e.g. the per-peer stats printed by format_mc_main) start out as 0 instead of stale memory. */ + memset (p, 0, sizeof (p[0])); + x = unserialize_get (m, sizeof (p->id)); + clib_memcpy (p->id.as_u8, x, sizeof (p->id)); + unserialize_integer (m, &p->last_sequence_received, + sizeof (p->last_sequence_received)); + mhash_set (&s->peer_index_by_id, &p->id, p - s->peers, /* old_value */ + 0); + } + s->all_peer_bitmap = unserialize_bitmap (m); + + /* This is really bad. 
*/ + if (!s->all_peer_bitmap) + clib_warning ("BUG: stream %s all_peer_bitmap NULL", s->config.name); +} +/* Handle a catchup request from a peer: ignore it unless the stream exists and is in the ready state, otherwise build a catchup reply (header + serialized stream + application snapshot) in a catchup_process_args pool entry. */ +void +mc_msg_catchup_request_handler (mc_main_t * mcm, + mc_msg_catchup_request_t * req, + u32 catchup_opaque) +{ + vlib_main_t *vm = mcm->vlib_main; + mc_stream_t *s; + mc_catchup_process_arg_t *args; + + mc_byte_swap_msg_catchup_request (req); + + s = mc_stream_by_index (mcm, req->stream_index); + if (!s || s->state != MC_STREAM_STATE_ready) + return; + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "catchup-request: from %s stream %d", + .format_args = "T4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 peer, stream; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->peer = elog_id_for_peer_id (mcm, req->peer_id.as_u64); + ed->stream = req->stream_index; + } + + /* + * The application has to snapshot its data structures right + * here, right now. If we process any messages after + * noting the last global sequence we've processed, the client + * won't be able to accurately reconstruct our data structures. + * + * Once the data structures are e.g. vec_dup()'ed, we + * send the resulting messages from a separate process, to + * make sure that we don't cause a bunch of message retransmissions. + */ + pool_get (mcm->catchup_process_args, args); + + args->stream_index = s - mcm->stream_vector; + args->catchup_opaque = catchup_opaque; + args->catchup_snapshot = 0; + + /* Construct catchup reply and snapshot state for stream to send as + catchup reply payload. */ + { + mc_msg_catchup_reply_t *rep; + serialize_main_t m; + + /* Reserve room for the reply header at the front of the snapshot vector. */ vec_resize (args->catchup_snapshot, sizeof (rep[0])); + + rep = (void *) args->catchup_snapshot; + + rep->peer_id = req->peer_id; + rep->stream_index = req->stream_index; + rep->last_global_sequence_included = s->last_global_sequence_processed; + + /* Setup for serialize to append to catchup snapshot. 
*/ + serialize_open_vector (&m, args->catchup_snapshot); + m.stream.current_buffer_index = vec_len (m.stream.buffer); + + serialize (&m, serialize_mc_stream, s); + + args->catchup_snapshot = serialize_close_vector (&m); + + /* Actually copy internal state */ + args->catchup_snapshot = s->config.catchup_snapshot + (mcm, args->catchup_snapshot, rep->last_global_sequence_included); + + rep = (void *) args->catchup_snapshot; + rep->n_data_bytes = vec_len (args->catchup_snapshot) - sizeof (rep[0]); + + mc_byte_swap_msg_catchup_reply (rep); + } + + /* now go send it... */ + vlib_process_signal_event (vm, mcm->catchup_process, + EVENT_MC_SEND_CATCHUP_DATA, + args - mcm->catchup_process_args); +} + +#define EVENT_MC_UNSERIALIZE_BUFFER 0 +#define EVENT_MC_UNSERIALIZE_CATCHUP 1 + +void +mc_msg_catchup_reply_handler (mc_main_t * mcm, mc_msg_catchup_reply_t * mp, + u32 catchup_opaque) +{ + vlib_process_signal_event (mcm->vlib_main, + mcm->unserialize_process, + EVENT_MC_UNSERIALIZE_CATCHUP, + pointer_to_uword (mp)); +} + +static void +perform_catchup (mc_main_t * mcm, mc_msg_catchup_reply_t * mp) +{ + mc_stream_t *s; + i32 seq_cmp_result; + + mc_byte_swap_msg_catchup_reply (mp); + + s = mc_stream_by_index (mcm, mp->stream_index); + + /* Never heard of this stream or already caught up. */ + if (!s || s->state == MC_STREAM_STATE_ready) + return; + + { + serialize_main_t m; + mc_stream_peer_t *p; + u32 n_stream_bytes; + + /* For offline sim replay: save the entire catchup snapshot... 
*/ + if (s->config.save_snapshot) + s->config.save_snapshot (mcm, /* is_catchup */ 1, mp->data, + mp->n_data_bytes); + + unserialize_open_data (&m, mp->data, mp->n_data_bytes); + unserialize (&m, unserialize_mc_stream, s); + + /* Make sure we start numbering our messages as expected */ + /* *INDENT-OFF* */ + pool_foreach (p, s->peers, ({ + if (p->id.as_u64 == mcm->transport.our_ack_peer_id.as_u64) + s->our_local_sequence = p->last_sequence_received + 1; + })); +/* *INDENT-ON* */ + + n_stream_bytes = m.stream.current_buffer_index; + + /* No need to unserialize close; nothing to free. */ + + /* After serialized stream is user's catchup data. */ + s->config.catchup (mcm, mp->data + n_stream_bytes, + mp->n_data_bytes - n_stream_bytes); + } + + /* Vector could have been moved by catchup. + This can only happen for mc-internal stream. */ + s = mc_stream_by_index (mcm, mp->stream_index); + + s->last_global_sequence_processed = mp->last_global_sequence_included; + + /* Replay the requests that were queued while catching up; anything not newer than the snapshot is dropped. */ + while (clib_fifo_elts (s->catchup_fifo)) + { + mc_msg_user_request_t *gp; + u32 bi; + vlib_buffer_t *b; + + clib_fifo_sub1 (s->catchup_fifo, bi); + + b = vlib_get_buffer (mcm->vlib_main, bi); + gp = vlib_buffer_get_current (b); + + /* Make sure we're replaying "new" news */ + seq_cmp_result = mc_seq_cmp (gp->global_sequence, + mp->last_global_sequence_included); + + if (seq_cmp_result > 0) + { + vlib_buffer_advance (b, sizeof (gp[0])); + s->config.rx_buffer (mcm, s, gp->peer_id, bi); + + /* Stream vector can change address via rx callback for mc-internal + stream, so look the stream up again before touching it (same as the ready-state receive path). */ + s = mc_stream_by_index (mcm, mp->stream_index); + ASSERT (s != 0); + s->last_global_sequence_processed = gp->global_sequence; + + if (MC_EVENT_LOGGING) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "catchup replay local sequence 0x%x", + .format_args = "i4", + }; + /* *INDENT-ON* */ + struct + { + u32 local_sequence; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->local_sequence = gp->local_sequence; + } + } + else + { + if (MC_EVENT_LOGGING) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "catchup discard local sequence 0x%x", + .format_args = "i4", + 
}; + /* *INDENT-ON* */ + struct + { + u32 local_sequence; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->local_sequence = gp->local_sequence; + } + + vlib_buffer_free_one (mcm->vlib_main, bi); + } + } + + s->state = MC_STREAM_STATE_ready; + + /* Now that we are caught up wake up joining process. */ + { + vlib_one_time_waiting_process_t *wp; + vec_foreach (wp, s->procs_waiting_for_join_done) + vlib_signal_one_time_waiting_process (mcm->vlib_main, wp); + if (s->procs_waiting_for_join_done) + _vec_len (s->procs_waiting_for_join_done) = 0; + } +} + +static void +this_node_maybe_master (mc_main_t * mcm) +{ + vlib_main_t *vm = mcm->vlib_main; + mc_msg_master_assert_t *mp; + uword event_type; + int timeouts = 0; + int is_master = mcm->relay_state == MC_RELAY_STATE_MASTER; + clib_error_t *error; + f64 now, time_last_master_assert = -1; + u32 bi; + + while (1) + { + if (!mcm->we_can_be_relay_master) + { + mcm->relay_state = MC_RELAY_STATE_SLAVE; + if (MC_EVENT_LOGGING) + { + ELOG_TYPE (e, "become slave (config)"); + ELOG (mcm->elog_main, e, 0); + } + return; + } + + now = vlib_time_now (vm); + if (now >= time_last_master_assert + 1) + { + time_last_master_assert = now; + mp = mc_get_vlib_buffer (mcm->vlib_main, sizeof (mp[0]), &bi); + + mp->peer_id = mcm->transport.our_ack_peer_id; + mp->global_sequence = mcm->relay_global_sequence; + + /* + * these messages clog the event log, set MC_EVENT_LOGGING higher + * if you want them + */ + if (MC_EVENT_LOGGING > 1) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "tx-massert: peer %s global seq %u", + .format_args = "T4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 peer, global_sequence; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64); + ed->global_sequence = mp->global_sequence; + } + + mc_byte_swap_msg_master_assert (mp); + + error = + mcm->transport.tx_buffer (mcm->transport.opaque, + MC_TRANSPORT_MASTERSHIP, bi); + if (error) + 
clib_error_report (error); + } + + vlib_process_wait_for_event_or_clock (vm, 1.0); + event_type = vlib_process_get_events (vm, /* no event data */ 0); + + switch (event_type) + { + case ~0: + if (!is_master && timeouts++ > 2) + { + mcm->relay_state = MC_RELAY_STATE_MASTER; + mcm->relay_master_peer_id = + mcm->transport.our_ack_peer_id.as_u64; + if (MC_EVENT_LOGGING) + { + ELOG_TYPE (e, "become master (was maybe_master)"); + ELOG (mcm->elog_main, e, 0); + } + return; + } + break; + + case MC_RELAY_STATE_SLAVE: + mcm->relay_state = MC_RELAY_STATE_SLAVE; + if (MC_EVENT_LOGGING && mcm->relay_state != MC_RELAY_STATE_SLAVE) + { + ELOG_TYPE (e, "become slave (was maybe_master)"); + ELOG (mcm->elog_main, e, 0); + } + return; + } + } +} + +static void +this_node_slave (mc_main_t * mcm) +{ + vlib_main_t *vm = mcm->vlib_main; + uword event_type; + int timeouts = 0; + + if (MC_EVENT_LOGGING) + { + ELOG_TYPE (e, "become slave"); + ELOG (mcm->elog_main, e, 0); + } + + while (1) + { + vlib_process_wait_for_event_or_clock (vm, 1.0); + event_type = vlib_process_get_events (vm, /* no event data */ 0); + + switch (event_type) + { + case ~0: + if (timeouts++ > 2) + { + mcm->relay_state = MC_RELAY_STATE_NEGOTIATE; + mcm->relay_master_peer_id = ~0ULL; + if (MC_EVENT_LOGGING) + { + ELOG_TYPE (e, "timeouts; negoitate mastership"); + ELOG (mcm->elog_main, e, 0); + } + return; + } + break; + + case MC_RELAY_STATE_SLAVE: + mcm->relay_state = MC_RELAY_STATE_SLAVE; + timeouts = 0; + break; + } + } +} + +static uword +mc_mastership_process (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * f) +{ + mc_main_t *mcm = mc_node_get_main (node); + + while (1) + { + switch (mcm->relay_state) + { + case MC_RELAY_STATE_NEGOTIATE: + case MC_RELAY_STATE_MASTER: + this_node_maybe_master (mcm); + break; + + case MC_RELAY_STATE_SLAVE: + this_node_slave (mcm); + break; + } + } + return 0; /* not likely */ +} + +void +mc_enable_disable_mastership (mc_main_t * mcm, int we_can_be_master) +{ + if 
(we_can_be_master != mcm->we_can_be_relay_master) + { + mcm->we_can_be_relay_master = we_can_be_master; + vlib_process_signal_event (mcm->vlib_main, + mcm->mastership_process, + MC_RELAY_STATE_NEGOTIATE, 0); + } +} + +void +mc_msg_master_assert_handler (mc_main_t * mcm, mc_msg_master_assert_t * mp, + u32 buffer_index) +{ + mc_peer_id_t his_peer_id, our_peer_id; + i32 seq_cmp_result; + u8 signal_slave = 0; + u8 update_global_sequence = 0; + + mc_byte_swap_msg_master_assert (mp); + + his_peer_id = mp->peer_id; + our_peer_id = mcm->transport.our_ack_peer_id; + + /* compare the incoming global sequence with ours */ + seq_cmp_result = mc_seq_cmp (mp->global_sequence, + mcm->relay_global_sequence); + + /* If the sender has a lower peer id and the sender's sequence >= + our global sequence, we become a slave. Otherwise we are master. */ + if (mc_peer_id_compare (his_peer_id, our_peer_id) < 0 + && seq_cmp_result >= 0) + { + vlib_process_signal_event (mcm->vlib_main, + mcm->mastership_process, + MC_RELAY_STATE_SLAVE, 0); + signal_slave = 1; + } + + /* Update our global sequence. */ + if (seq_cmp_result > 0) + { + mcm->relay_global_sequence = mp->global_sequence; + update_global_sequence = 1; + } + + { + uword *q = mhash_get (&mcm->mastership_peer_index_by_id, &his_peer_id); + mc_mastership_peer_t *p; + + if (q) + p = vec_elt_at_index (mcm->mastership_peers, q[0]); + else + { + vec_add2 (mcm->mastership_peers, p, 1); + p->peer_id = his_peer_id; + mhash_set (&mcm->mastership_peer_index_by_id, &p->peer_id, + p - mcm->mastership_peers, + /* old_value */ 0); + } + p->time_last_master_assert_received = vlib_time_now (mcm->vlib_main); + } + + /* + * these messages clog the event log, set MC_EVENT_LOGGING higher + * if you want them. 
+ */ + if (MC_EVENT_LOGGING > 1) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "rx-massert: peer %s global seq %u upd %d slave %d", + .format_args = "T4i4i1i1", + }; + /* *INDENT-ON* */ + + struct + { + u32 peer; + u32 global_sequence; + u8 update_sequence; + u8 slave; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->peer = elog_id_for_peer_id (mcm, his_peer_id.as_u64); + ed->global_sequence = mp->global_sequence; + ed->update_sequence = update_global_sequence; + ed->slave = signal_slave; + } +} + +static void +mc_serialize_init (mc_main_t * mcm) +{ + mc_serialize_msg_t *m; + vlib_main_t *vm = vlib_get_main (); + + mcm->global_msg_index_by_name + = hash_create_string ( /* elts */ 0, sizeof (uword)); + + m = vm->mc_msg_registrations; + + while (m) + { + m->global_index = vec_len (mcm->global_msgs); + hash_set_mem (mcm->global_msg_index_by_name, m->name, m->global_index); + vec_add1 (mcm->global_msgs, m); + m = m->next_registration; + } +} + +clib_error_t * +mc_serialize_va (mc_main_t * mc, + u32 stream_index, + u32 multiple_messages_per_vlib_buffer, + mc_serialize_msg_t * msg, va_list * va) +{ + mc_stream_t *s; + clib_error_t *error; + serialize_main_t *m = &mc->serialize_mains[VLIB_TX]; + vlib_serialize_buffer_main_t *sbm = &mc->serialize_buffer_mains[VLIB_TX]; + u32 bi, n_before, n_after, n_total, n_this_msg; + u32 si, gi; + + if (!sbm->vlib_main) + { + sbm->tx.max_n_data_bytes_per_chain = 4096; + sbm->tx.free_list_index = VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX; + } + + if (sbm->first_buffer == 0) + serialize_open_vlib_buffer (m, mc->vlib_main, sbm); + + n_before = serialize_vlib_buffer_n_bytes (m); + + s = mc_stream_by_index (mc, stream_index); + gi = msg->global_index; + ASSERT (msg == vec_elt (mc->global_msgs, gi)); + + si = ~0; + if (gi < vec_len (s->stream_msg_index_by_global_index)) + si = s->stream_msg_index_by_global_index[gi]; + + serialize_likely_small_unsigned_integer (m, si); + + /* For first time message is sent, use name to 
identify message. */ + if (si == ~0 || MSG_ID_DEBUG) + serialize_cstring (m, msg->name); + + if (MSG_ID_DEBUG && MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "serialize-msg: %s index %d", + .format_args = "T4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 c[2]; + } *ed; + ed = ELOG_DATA (mc->elog_main, e); + ed->c[0] = elog_id_for_msg_name (mc, msg->name); + ed->c[1] = si; + } + + error = va_serialize (m, va); + + n_after = serialize_vlib_buffer_n_bytes (m); + n_this_msg = n_after - n_before; + n_total = n_after + sizeof (mc_msg_user_request_t); + + /* For max message size ignore first message where string name is sent. */ + if (si != ~0) + msg->max_n_bytes_serialized = + clib_max (msg->max_n_bytes_serialized, n_this_msg); + + if (!multiple_messages_per_vlib_buffer + || si == ~0 + || n_total + msg->max_n_bytes_serialized > + mc->transport.max_packet_size) + { + bi = serialize_close_vlib_buffer (m); + sbm->first_buffer = 0; + if (!error) + mc_stream_send (mc, stream_index, bi); + else if (bi != ~0) + vlib_buffer_free_one (mc->vlib_main, bi); + } + + return error; +} + +clib_error_t * +mc_serialize_internal (mc_main_t * mc, + u32 stream_index, + u32 multiple_messages_per_vlib_buffer, + mc_serialize_msg_t * msg, ...) 
+{ + vlib_main_t *vm = mc->vlib_main; + va_list va; + clib_error_t *error; + + if (stream_index == ~0) + { + if (vm->mc_main && vm->mc_stream_index == ~0) + vlib_current_process_wait_for_one_time_event_vector + (vm, &vm->procs_waiting_for_mc_stream_join); + stream_index = vm->mc_stream_index; + } + + va_start (va, msg); + error = mc_serialize_va (mc, stream_index, + multiple_messages_per_vlib_buffer, msg, &va); + va_end (va); + return error; +} + +uword +mc_unserialize_message (mc_main_t * mcm, + mc_stream_t * s, serialize_main_t * m) +{ + mc_serialize_stream_msg_t *sm; + u32 gi, si; + + si = unserialize_likely_small_unsigned_integer (m); + + if (!(si == ~0 || MSG_ID_DEBUG)) + { + sm = vec_elt_at_index (s->stream_msgs, si); + gi = sm->global_index; + } + else + { + char *name; + + unserialize_cstring (m, &name); + + if (MSG_ID_DEBUG && MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "unserialize-msg: %s rx index %d", + .format_args = "T4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 c[2]; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->c[0] = elog_id_for_msg_name (mcm, name); + ed->c[1] = si; + } + + { + uword *p = hash_get_mem (mcm->global_msg_index_by_name, name); + gi = p ? p[0] : ~0; + } + + /* Unknown message? */ + if (gi == ~0) + { + vec_free (name); + goto done; + } + + vec_validate_init_empty (s->stream_msg_index_by_global_index, gi, ~0); + si = s->stream_msg_index_by_global_index[gi]; + + /* Stream local index unknown? Create it. 
*/ + if (si == ~0) + { + vec_add2 (s->stream_msgs, sm, 1); + + si = sm - s->stream_msgs; + sm->global_index = gi; + s->stream_msg_index_by_global_index[gi] = si; + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "msg-bind: stream %d %s to index %d", + .format_args = "i4T4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 c[3]; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->c[0] = s->index; + ed->c[1] = elog_id_for_msg_name (mcm, name); + ed->c[2] = si; + } + } + else + { + sm = vec_elt_at_index (s->stream_msgs, si); + if (gi != sm->global_index && MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "msg-id-ERROR: %s index %d expected %d", + .format_args = "T4i4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 c[3]; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->c[0] = elog_id_for_msg_name (mcm, name); + ed->c[1] = si; + ed->c[2] = ~0; + if (sm->global_index < + vec_len (s->stream_msg_index_by_global_index)) + ed->c[2] = + s->stream_msg_index_by_global_index[sm->global_index]; + } + } + + vec_free (name); + } + + if (gi != ~0) + { + mc_serialize_msg_t *msg; + msg = vec_elt (mcm->global_msgs, gi); + unserialize (m, msg->unserialize, mcm); + } + +done: + return gi != ~0; +} + +void +mc_unserialize_internal (mc_main_t * mcm, u32 stream_and_buffer_index) +{ + vlib_main_t *vm = mcm->vlib_main; + serialize_main_t *m = &mcm->serialize_mains[VLIB_RX]; + vlib_serialize_buffer_main_t *sbm = &mcm->serialize_buffer_mains[VLIB_RX]; + mc_stream_and_buffer_t *sb; + mc_stream_t *stream; + u32 buffer_index; + + sb = + pool_elt_at_index (mcm->mc_unserialize_stream_and_buffers, + stream_and_buffer_index); + buffer_index = sb->buffer_index; + stream = vec_elt_at_index (mcm->stream_vector, sb->stream_index); + pool_put (mcm->mc_unserialize_stream_and_buffers, sb); + + if (stream->config.save_snapshot) + { + u32 n_bytes = vlib_buffer_index_length_in_chain (vm, buffer_index); + static u8 
*contents; + vec_reset_length (contents); + vec_validate (contents, n_bytes - 1); + vlib_buffer_contents (vm, buffer_index, contents); + stream->config.save_snapshot (mcm, /* is_catchup */ 0, contents, + n_bytes); + } + + ASSERT (vlib_in_process_context (vm)); + + unserialize_open_vlib_buffer (m, vm, sbm); + + clib_fifo_add1 (sbm->rx.buffer_fifo, buffer_index); + + while (unserialize_vlib_buffer_n_bytes (m) > 0) + mc_unserialize_message (mcm, stream, m); + + /* Frees buffer. */ + unserialize_close_vlib_buffer (m); +} + +void +mc_unserialize (mc_main_t * mcm, mc_stream_t * s, u32 buffer_index) +{ + vlib_main_t *vm = mcm->vlib_main; + mc_stream_and_buffer_t *sb; + pool_get (mcm->mc_unserialize_stream_and_buffers, sb); + sb->stream_index = s->index; + sb->buffer_index = buffer_index; + vlib_process_signal_event (vm, mcm->unserialize_process, + EVENT_MC_UNSERIALIZE_BUFFER, + sb - mcm->mc_unserialize_stream_and_buffers); +} + +static uword +mc_unserialize_process (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * f) +{ + mc_main_t *mcm = mc_node_get_main (node); + uword event_type, *event_data = 0; + int i; + + while (1) + { + if (event_data) + _vec_len (event_data) = 0; + + vlib_process_wait_for_event (vm); + event_type = vlib_process_get_events (vm, &event_data); + switch (event_type) + { + case EVENT_MC_UNSERIALIZE_BUFFER: + for (i = 0; i < vec_len (event_data); i++) + mc_unserialize_internal (mcm, event_data[i]); + break; + + case EVENT_MC_UNSERIALIZE_CATCHUP: + for (i = 0; i < vec_len (event_data); i++) + { + u8 *mp = uword_to_pointer (event_data[i], u8 *); + perform_catchup (mcm, (void *) mp); + vec_free (mp); + } + break; + + default: + break; + } + } + + return 0; /* not likely */ +} + +void +serialize_mc_main (serialize_main_t * m, va_list * va) +{ + mc_main_t *mcm = va_arg (*va, mc_main_t *); + mc_stream_t *s; + mc_serialize_stream_msg_t *sm; + mc_serialize_msg_t *msg; + + serialize_integer (m, vec_len (mcm->stream_vector), sizeof (u32)); + 
vec_foreach (s, mcm->stream_vector) + { + /* Stream name. */ + serialize_cstring (m, s->config.name); + + /* Serialize global names for all sent messages. */ + serialize_integer (m, vec_len (s->stream_msgs), sizeof (u32)); + vec_foreach (sm, s->stream_msgs) + { + msg = vec_elt (mcm->global_msgs, sm->global_index); + serialize_cstring (m, msg->name); + } + } +} + +/* Inverse of serialize_mc_main: for each serialized stream, create it as name_known if it is not the internal stream and is not already known by name, then rebuild its stream-local <-> global message index tables from the serialized message names. */ +void +unserialize_mc_main (serialize_main_t * m, va_list * va) +{ + mc_main_t *mcm = va_arg (*va, mc_main_t *); + u32 i, n_streams, n_stream_msgs; + char *name; + mc_stream_t *s; + mc_serialize_stream_msg_t *sm; + + unserialize_integer (m, &n_streams, sizeof (u32)); + for (i = 0; i < n_streams; i++) + { + unserialize_cstring (m, &name); + if (i != MC_STREAM_INDEX_INTERNAL && !mc_stream_by_name (mcm, name)) + { + vec_validate (mcm->stream_vector, i); + s = vec_elt_at_index (mcm->stream_vector, i); + mc_stream_init (s); + s->index = s - mcm->stream_vector; + /* The stream takes ownership of the name vector here. */ + s->config.name = name; + s->state = MC_STREAM_STATE_name_known; + hash_set_mem (mcm->stream_index_by_name, s->config.name, s->index); + } + else + vec_free (name); + + s = vec_elt_at_index (mcm->stream_vector, i); + + vec_free (s->stream_msgs); + vec_free (s->stream_msg_index_by_global_index); + + unserialize_integer (m, &n_stream_msgs, sizeof (u32)); + vec_resize (s->stream_msgs, n_stream_msgs); + vec_foreach (sm, s->stream_msgs) + { + uword *p; + u32 si, gi; + + unserialize_cstring (m, &name); + /* global_msg_index_by_name is a string hash filled with hash_set_mem; use the matching pointer-key lookup, as mc_unserialize_message does. */ + p = hash_get_mem (mcm->global_msg_index_by_name, name); + gi = p ? 
p[0] : ~0; + si = sm - s->stream_msgs; + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "catchup-bind: %s to %d global index %d stream %d", + .format_args = "T4i4i4i4", + }; + /* *INDENT-ON* */ + + struct + { + u32 c[4]; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->c[0] = elog_id_for_msg_name (mcm, name); + ed->c[1] = si; + ed->c[2] = gi; + ed->c[3] = s->index; + } + + vec_free (name); + + sm->global_index = gi; + if (gi != ~0) + { + vec_validate_init_empty (s->stream_msg_index_by_global_index, + gi, ~0); + s->stream_msg_index_by_global_index[gi] = si; + } + } + } +} + +void +mc_main_init (mc_main_t * mcm, char *tag) +{ + vlib_main_t *vm = vlib_get_main (); + + mcm->vlib_main = vm; + mcm->elog_main = &vm->elog_main; + + mcm->relay_master_peer_id = ~0ULL; + mcm->relay_state = MC_RELAY_STATE_NEGOTIATE; + + mcm->stream_index_by_name + = hash_create_string ( /* elts */ 0, /* value size */ sizeof (uword)); + + { + vlib_node_registration_t r; + + memset (&r, 0, sizeof (r)); + + r.type = VLIB_NODE_TYPE_PROCESS; + + /* Point runtime data to main instance. 
*/ + r.runtime_data = &mcm; + r.runtime_data_bytes = sizeof (&mcm); + + r.name = (char *) format (0, "mc-mastership-%s", tag); + r.function = mc_mastership_process; + mcm->mastership_process = vlib_register_node (vm, &r); + + r.name = (char *) format (0, "mc-join-ager-%s", tag); + r.function = mc_join_ager_process; + mcm->join_ager_process = vlib_register_node (vm, &r); + + r.name = (char *) format (0, "mc-retry-%s", tag); + r.function = mc_retry_process; + mcm->retry_process = vlib_register_node (vm, &r); + + r.name = (char *) format (0, "mc-catchup-%s", tag); + r.function = mc_catchup_process; + mcm->catchup_process = vlib_register_node (vm, &r); + + r.name = (char *) format (0, "mc-unserialize-%s", tag); + r.function = mc_unserialize_process; + mcm->unserialize_process = vlib_register_node (vm, &r); + } + + if (MC_EVENT_LOGGING > 0) + mhash_init (&mcm->elog_id_by_peer_id, sizeof (uword), + sizeof (mc_peer_id_t)); + + mhash_init (&mcm->mastership_peer_index_by_id, sizeof (uword), + sizeof (mc_peer_id_t)); + mc_serialize_init (mcm); +} + +static u8 * +format_mc_relay_state (u8 * s, va_list * args) +{ + mc_relay_state_t state = va_arg (*args, mc_relay_state_t); + char *t = 0; + switch (state) + { + case MC_RELAY_STATE_NEGOTIATE: + t = "negotiate"; + break; + case MC_RELAY_STATE_MASTER: + t = "master"; + break; + case MC_RELAY_STATE_SLAVE: + t = "slave"; + break; + default: + return format (s, "unknown 0x%x", state); + } + + return format (s, "%s", t); +} + +static u8 * +format_mc_stream_state (u8 * s, va_list * args) +{ + mc_stream_state_t state = va_arg (*args, mc_stream_state_t); + char *t = 0; + switch (state) + { +#define _(f) case MC_STREAM_STATE_##f: t = #f; break; + foreach_mc_stream_state +#undef _ + default: + return format (s, "unknown 0x%x", state); + } + + return format (s, "%s", t); +} + +static int +mc_peer_comp (void *a1, void *a2) +{ + mc_stream_peer_t *p1 = a1; + mc_stream_peer_t *p2 = a2; + + return mc_peer_id_compare (p1->id, p2->id); +} + +u8 * 
+format_mc_main (u8 * s, va_list * args) +{ + mc_main_t *mcm = va_arg (*args, mc_main_t *); + mc_stream_t *t; + mc_stream_peer_t *p, *ps; + uword indent = format_get_indent (s); + + s = format (s, "MC state %U, %d streams joined, global sequence 0x%x", + format_mc_relay_state, mcm->relay_state, + vec_len (mcm->stream_vector), mcm->relay_global_sequence); + + { + mc_mastership_peer_t *mp; + f64 now = vlib_time_now (mcm->vlib_main); + s = format (s, "\n%UMost recent mastership peers:", + format_white_space, indent + 2); + vec_foreach (mp, mcm->mastership_peers) + { + s = format (s, "\n%U%-30U%.4e", + format_white_space, indent + 4, + mcm->transport.format_peer_id, mp->peer_id, + now - mp->time_last_master_assert_received); + } + } + + vec_foreach (t, mcm->stream_vector) + { + s = format (s, "\n%Ustream `%s' index %d", + format_white_space, indent + 2, t->config.name, t->index); + + s = format (s, "\n%Ustate %U", + format_white_space, indent + 4, + format_mc_stream_state, t->state); + + s = + format (s, + "\n%Uretries: interval %.0f sec, limit %d, pool elts %d, %Ld sent", + format_white_space, indent + 4, t->config.retry_interval, + t->config.retry_limit, pool_elts (t->retry_pool), + t->stats.n_retries - t->stats_last_clear.n_retries); + + s = format (s, "\n%U%Ld/%Ld user requests sent/received", + format_white_space, indent + 4, + t->user_requests_sent, t->user_requests_received); + + s = format (s, "\n%U%d peers, local/global sequence 0x%x/0x%x", + format_white_space, indent + 4, + pool_elts (t->peers), + t->our_local_sequence, t->last_global_sequence_processed); + + ps = 0; + /* *INDENT-OFF* */ + pool_foreach (p, t->peers, + ({ + if (clib_bitmap_get (t->all_peer_bitmap, p - t->peers)) + vec_add1 (ps, p[0]); + })); + /* *INDENT-ON* */ + vec_sort_with_function (ps, mc_peer_comp); + s = format (s, "\n%U%=30s%10s%16s%16s", + format_white_space, indent + 6, + "Peer", "Last seq", "Retries", "Future"); + + vec_foreach (p, ps) + { + s = format (s, 
"\n%U%-30U0x%08x%16Ld%16Ld%s", + format_white_space, indent + 6, + mcm->transport.format_peer_id, p->id.as_u64, + p->last_sequence_received, + p->stats.n_msgs_from_past - + p->stats_last_clear.n_msgs_from_past, + p->stats.n_msgs_from_future - + p->stats_last_clear.n_msgs_from_future, + (mcm->transport.our_ack_peer_id.as_u64 == + p->id.as_u64 ? " (self)" : "")); + } + vec_free (ps); + } + + return s; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/mc.h b/src/vlib/mc.h new file mode 100644 index 00000000000..dc95b0e9074 --- /dev/null +++ b/src/vlib/mc.h @@ -0,0 +1,687 @@ +/* + * mc.h: vlib reliable sequenced multicast distributed applications + * + * Copyright (c) 2010 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_vlib_mc_h +#define included_vlib_mc_h + +#include <vppinfra/elog.h> +#include <vppinfra/fifo.h> +#include <vppinfra/mhash.h> +#include <vlib/node.h> + +#ifndef MC_EVENT_LOGGING +#define MC_EVENT_LOGGING 1 +#endif + +always_inline uword +mc_need_byte_swap (void) +{ + return CLIB_ARCH_IS_LITTLE_ENDIAN; +} + +/* + * Used to uniquely identify hosts. + * For IP4 this would be ip4_address plus tcp/udp port. + */ +typedef union +{ + u8 as_u8[8]; + u64 as_u64; +} mc_peer_id_t; + +always_inline mc_peer_id_t +mc_byte_swap_peer_id (mc_peer_id_t i) +{ + /* Peer id is already in network byte order. 
*/ + return i; +} +/* Order two peer ids by byte-wise memcmp of their 8-byte representation. */ +always_inline int +mc_peer_id_compare (mc_peer_id_t a, mc_peer_id_t b) +{ + return memcmp (a.as_u8, b.as_u8, sizeof (a.as_u8)); +} + +/* Assert mastership. Lowest peer_id among all peers wins mastership. + Only sent/received over mastership channel (MC_TRANSPORT_MASTERSHIP). + So, we don't need a message opcode. */ +typedef CLIB_PACKED (struct + { + /* Peer id asserting mastership. */ + mc_peer_id_t peer_id; + /* Global sequence number asserted. */ + u32 global_sequence;}) mc_msg_master_assert_t; +/* Byte-swap the message in place when mc_need_byte_swap () is true. */ +always_inline void +mc_byte_swap_msg_master_assert (mc_msg_master_assert_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->global_sequence = clib_byte_swap_u32 (r->global_sequence); + } +} + +#define foreach_mc_msg_type \ + _ (master_assert) \ + _ (join_or_leave_request) \ + _ (join_reply) \ + _ (user_request) \ + _ (user_ack) \ + _ (catchup_request) \ + _ (catchup_reply) +/* One MC_MSG_TYPE_<name> enumerator per entry of foreach_mc_msg_type, in list order. */ +typedef enum +{ +#define _(f) MC_MSG_TYPE_##f, + foreach_mc_msg_type +#undef _ +} mc_relay_msg_type_t; + +/* Request to join a given stream. Multicast over MC_TRANSPORT_JOIN. */ +typedef CLIB_PACKED (struct + { +mc_peer_id_t peer_id; mc_relay_msg_type_t type:32; + /* MC_MSG_TYPE_join_or_leave_request */ + /* Stream to join or leave. */ + u32 stream_index; + /* join = 1, leave = 0 */ + u8 is_join;}) mc_msg_join_or_leave_request_t; +/* Byte-swap the multi-byte fields in place when needed; is_join is a single byte and is left alone. */ +always_inline void +mc_byte_swap_msg_join_or_leave_request (mc_msg_join_or_leave_request_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->type = clib_byte_swap_u32 (r->type); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + } +} + +/* Join reply. Multicast over MC_TRANSPORT_JOIN. */ +typedef CLIB_PACKED (struct + { +mc_peer_id_t peer_id; mc_relay_msg_type_t type:32; + /* MC_MSG_TYPE_join_reply */ + u32 stream_index; + /* Peer ID to contact to catchup with this stream. 
*/ + mc_peer_id_t catchup_peer_id;}) mc_msg_join_reply_t; + +always_inline void +mc_byte_swap_msg_join_reply (mc_msg_join_reply_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->type = clib_byte_swap_u32 (r->type); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + r->catchup_peer_id = mc_byte_swap_peer_id (r->catchup_peer_id); + } +} + +/* Generic (application) request. Multicast over MC_TRANSPORT_USER_REQUEST_TO_RELAY and then + relayed by relay master after filling in global sequence number. */ +typedef CLIB_PACKED (struct + { + mc_peer_id_t peer_id; u32 stream_index; + /* Global sequence number as filled in by relay master. */ + u32 global_sequence; + /* Local sequence number as filled in by peer sending message. */ + u32 local_sequence; + /* Size of request data. */ + u32 n_data_bytes; + /* Opaque request data. */ + u8 data[0];}) mc_msg_user_request_t; + +always_inline void +mc_byte_swap_msg_user_request (mc_msg_user_request_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + r->global_sequence = clib_byte_swap_u32 (r->global_sequence); + r->local_sequence = clib_byte_swap_u32 (r->local_sequence); + r->n_data_bytes = clib_byte_swap_u32 (r->n_data_bytes); + } +} + +/* Sent unicast over ACK channel. 
*/ +typedef CLIB_PACKED (struct + { + mc_peer_id_t peer_id; + u32 global_sequence; u32 stream_index; + u32 local_sequence; + i32 seq_cmp_result;}) mc_msg_user_ack_t; + +always_inline void +mc_byte_swap_msg_user_ack (mc_msg_user_ack_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + r->global_sequence = clib_byte_swap_u32 (r->global_sequence); + r->local_sequence = clib_byte_swap_u32 (r->local_sequence); + r->seq_cmp_result = clib_byte_swap_i32 (r->seq_cmp_result); + } +} + +/* Sent/received unicast over catchup channel (e.g. using TCP). */ +typedef CLIB_PACKED (struct + { + mc_peer_id_t peer_id; + u32 stream_index;}) mc_msg_catchup_request_t; + +always_inline void +mc_byte_swap_msg_catchup_request (mc_msg_catchup_request_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + } +} + +/* Sent/received unicast over catchup channel. */ +typedef CLIB_PACKED (struct + { + mc_peer_id_t peer_id; u32 stream_index; + /* Last global sequence number included in catchup data. */ + u32 last_global_sequence_included; + /* Size of catchup data. */ + u32 n_data_bytes; + /* Catchup data. */ + u8 data[0];}) mc_msg_catchup_reply_t; + +always_inline void +mc_byte_swap_msg_catchup_reply (mc_msg_catchup_reply_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + r->last_global_sequence_included = + clib_byte_swap_u32 (r->last_global_sequence_included); + r->n_data_bytes = clib_byte_swap_u32 (r->n_data_bytes); + } +} + +typedef struct _mc_serialize_msg +{ + /* Name for this type. */ + char *name; + + /* Functions to serialize/unserialize data. */ + serialize_function_t *serialize; + serialize_function_t *unserialize; + + /* Maximum message size in bytes when serialized. 
+ If zero then this will be set to the largest sent message. */ + u32 max_n_bytes_serialized; + + /* Opaque to use for first argument to serialize/unserialize function. */ + u32 opaque; + + /* Index in global message vector. */ + u32 global_index; + + /* Registration list */ + struct _mc_serialize_msg *next_registration; +} mc_serialize_msg_t; + +typedef struct +{ + /* Index into global message vector. */ + u32 global_index; +} mc_serialize_stream_msg_t; + +#define MC_SERIALIZE_MSG(x,...) \ + __VA_ARGS__ mc_serialize_msg_t x; \ +static void __mc_serialize_msg_registration_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __mc_serialize_msg_registration_##x (void) \ +{ \ + vlib_main_t * vm = vlib_get_main(); \ + x.next_registration = vm->mc_msg_registrations; \ + vm->mc_msg_registrations = &x; \ +} \ +__VA_ARGS__ mc_serialize_msg_t x + +typedef enum +{ + MC_TRANSPORT_MASTERSHIP, + MC_TRANSPORT_JOIN, + MC_TRANSPORT_USER_REQUEST_TO_RELAY, + MC_TRANSPORT_USER_REQUEST_FROM_RELAY, + MC_N_TRANSPORT_TYPE, +} mc_transport_type_t; + +typedef struct +{ + clib_error_t *(*tx_buffer) (void *opaque, mc_transport_type_t type, + u32 buffer_index); + + clib_error_t *(*tx_ack) (void *opaque, mc_peer_id_t peer_id, + u32 buffer_index); + + /* Returns catchup opaque. */ + uword (*catchup_request_fun) (void *opaque, u32 stream_index, + mc_peer_id_t catchup_peer_id); + + void (*catchup_send_fun) (void *opaque, uword catchup_opaque, + u8 * data_vector); + + /* Opaque passed to callbacks. */ + void *opaque; + + mc_peer_id_t our_ack_peer_id; + mc_peer_id_t our_catchup_peer_id; + + /* Max packet size (MTU) for this transport. + For IP this is interface MTU less IP + UDP header size. */ + u32 max_packet_size; + + format_function_t *format_peer_id; +} mc_transport_t; + +typedef struct +{ + /* Count of messages received from this peer from the past/future + (with seq_cmp != 0). 
*/ + u64 n_msgs_from_past; + u64 n_msgs_from_future; +} mc_stream_peer_stats_t; + +typedef struct +{ + /* ID of this peer. */ + mc_peer_id_t id; + + /* The last sequence we received from this peer. */ + u32 last_sequence_received; + + mc_stream_peer_stats_t stats, stats_last_clear; +} mc_stream_peer_t; + +typedef struct +{ + u32 buffer_index; + + /* Cached copy of local sequence number from buffer. */ + u32 local_sequence; + + /* Number of times this buffer has been sent (retried). */ + u32 n_retries; + + /* Previous/next retries in doubly-linked list. */ + u32 prev_index, next_index; + + /* Bitmap of all peers which have not yet acked this msg */ + uword *unacked_by_peer_bitmap; + + /* Message send or resend time */ + f64 sent_at; +} mc_retry_t; + +typedef struct +{ + /* Number of retries sent for this stream. */ + u64 n_retries; +} mc_stream_stats_t; + +struct mc_main_t; +struct mc_stream_t; + +typedef struct +{ + /* Stream name. */ + char *name; + + /* Number of outstanding messages. */ + u32 window_size; + + /* Retry interval, in seconds */ + f64 retry_interval; + + /* Retry limit */ + u32 retry_limit; + + /* User rx buffer callback */ + void (*rx_buffer) (struct mc_main_t * mc_main, + struct mc_stream_t * stream, + mc_peer_id_t peer_id, u32 buffer_index); + + /* User callback to create a snapshot */ + u8 *(*catchup_snapshot) (struct mc_main_t * mc_main, + u8 * snapshot_vector, + u32 last_global_sequence_included); + + /* User callback to replay a snapshot */ + void (*catchup) (struct mc_main_t * mc_main, + u8 * snapshot_data, u32 n_snapshot_data_bytes); + + /* Callback to save a snapshot for offline replay */ + void (*save_snapshot) (struct mc_main_t * mc_main, + u32 is_catchup, + u8 * snapshot_data, u32 n_snapshot_data_bytes); + + /* Called when a peer dies */ + void (*peer_died) (struct mc_main_t * mc_main, + struct mc_stream_t * stream, mc_peer_id_t peer_id); +} mc_stream_config_t; + +#define foreach_mc_stream_state \ + _ (invalid) \ + _ (name_known) \ + _ 
(join_in_progress) \ + _ (catchup) \ + _ (ready) + +typedef enum +{ +#define _(f) MC_STREAM_STATE_##f, + foreach_mc_stream_state +#undef _ +} mc_stream_state_t; + +typedef struct mc_stream_t +{ + mc_stream_config_t config; + + mc_stream_state_t state; + + /* Index in stream pool. */ + u32 index; + + /* Stream index 0 is always for MC internal use. */ +#define MC_STREAM_INDEX_INTERNAL 0 + + mc_retry_t *retry_pool; + + /* Head and tail index of retry pool. */ + u32 retry_head_index, retry_tail_index; + + /* + * Country club for recently retired messages + * If the set of peers is expanding and a new peer + * misses a message, we can easily retire the FIFO + * element before we even know about the new peer + */ + mc_retry_t *retired_fifo; + + /* Hash mapping local sequence to retry pool index. */ + uword *retry_index_by_local_sequence; + + /* catch-up fifo of VLIB buffer indices. + start recording when catching up. */ + u32 *catchup_fifo; + + mc_stream_stats_t stats, stats_last_clear; + + /* Peer pool. */ + mc_stream_peer_t *peers; + + /* Bitmap with ones for all peers in peer pool. */ + uword *all_peer_bitmap; + + /* Map of 64 bit peer id to index in peer pool. */ + mhash_t peer_index_by_id; + + /* Timeout, in case we're alone in the world */ + f64 join_timeout; + + vlib_one_time_waiting_process_t *procs_waiting_for_join_done; + + vlib_one_time_waiting_process_t *procs_waiting_for_open_window; + + /* Next sequence number to use */ + u32 our_local_sequence; + + /* + * Last global sequence we processed. + * When supplying catchup data, we need to tell + * the client precisely where to start replaying + */ + u32 last_global_sequence_processed; + + /* Vector of unique messages we've sent on this stream. */ + mc_serialize_stream_msg_t *stream_msgs; + + /* Vector global message index into per stream message index. */ + u32 *stream_msg_index_by_global_index; + + /* Hashed by message name. 
*/ + uword *stream_msg_index_by_name; + + u64 user_requests_sent; + u64 user_requests_received; +} mc_stream_t; + +always_inline void +mc_stream_free (mc_stream_t * s) +{ + pool_free (s->retry_pool); + hash_free (s->retry_index_by_local_sequence); + clib_fifo_free (s->catchup_fifo); + pool_free (s->peers); + mhash_free (&s->peer_index_by_id); + vec_free (s->procs_waiting_for_join_done); + vec_free (s->procs_waiting_for_open_window); +} + +always_inline void +mc_stream_init (mc_stream_t * s) +{ + memset (s, 0, sizeof (s[0])); + s->retry_head_index = s->retry_tail_index = ~0; +} + +typedef struct +{ + u32 stream_index; + u32 catchup_opaque; + u8 *catchup_snapshot; +} mc_catchup_process_arg_t; + +typedef enum +{ + MC_RELAY_STATE_NEGOTIATE, + MC_RELAY_STATE_MASTER, + MC_RELAY_STATE_SLAVE, +} mc_relay_state_t; + +typedef struct +{ + mc_peer_id_t peer_id; + + f64 time_last_master_assert_received; +} mc_mastership_peer_t; + +typedef struct +{ + u32 stream_index; + u32 buffer_index; +} mc_stream_and_buffer_t; + +typedef struct mc_main_t +{ + mc_relay_state_t relay_state; + + /* Mastership */ + u32 we_can_be_relay_master; + + u64 relay_master_peer_id; + + mc_mastership_peer_t *mastership_peers; + + /* Map of 64 bit peer id to index in mastership_peers. */ + mhash_t mastership_peer_index_by_id; + + /* The transport we're using. */ + mc_transport_t transport; + + /* Last-used global sequence number. */ + u32 relay_global_sequence; + + /* Vector of streams. */ + mc_stream_t *stream_vector; + + /* Hash table mapping stream name to pool index. */ + uword *stream_index_by_name; + + uword *procs_waiting_for_stream_name_by_name; + + vlib_one_time_waiting_process_t **procs_waiting_for_stream_name_pool; + + int joins_in_progress; + + mc_catchup_process_arg_t *catchup_process_args; + + /* Node indices for mastership, join ager, + retry and catchup processes. 
*/ + u32 mastership_process; + u32 join_ager_process; + u32 retry_process; + u32 catchup_process; + u32 unserialize_process; + + /* Global vector of messages. */ + mc_serialize_msg_t **global_msgs; + + /* Hash table mapping message name to index. */ + uword *global_msg_index_by_name; + + /* Shared serialize/unserialize main. */ + serialize_main_t serialize_mains[VLIB_N_RX_TX]; + + vlib_serialize_buffer_main_t serialize_buffer_mains[VLIB_N_RX_TX]; + + /* Convenience variables */ + struct vlib_main_t *vlib_main; + elog_main_t *elog_main; + + /* Maps 64 bit peer id to elog string table offset for this formatted peer id. */ + mhash_t elog_id_by_peer_id; + + uword *elog_id_by_msg_name; + + /* For mc_unserialize. */ + mc_stream_and_buffer_t *mc_unserialize_stream_and_buffers; +} mc_main_t; + +always_inline mc_stream_t * +mc_stream_by_name (mc_main_t * m, char *name) +{ + uword *p = hash_get (m->stream_index_by_name, name); + return p ? vec_elt_at_index (m->stream_vector, p[0]) : 0; +} + +always_inline mc_stream_t * +mc_stream_by_index (mc_main_t * m, u32 i) +{ + return i < vec_len (m->stream_vector) ? m->stream_vector + i : 0; +} + +always_inline void +mc_clear_stream_stats (mc_main_t * m) +{ + mc_stream_t *s; + mc_stream_peer_t *p; + vec_foreach (s, m->stream_vector) + { + s->stats_last_clear = s->stats; + /* *INDENT-OFF* */ + pool_foreach (p, s->peers, ({ + p->stats_last_clear = p->stats; + })); + /* *INDENT-ON* */ + } +} + +/* Declare all message handlers. 
*/ +#define _(f) void mc_msg_##f##_handler (mc_main_t * mcm, mc_msg_##f##_t * msg, u32 buffer_index); +foreach_mc_msg_type +#undef _ + u32 mc_stream_join (mc_main_t * mcm, mc_stream_config_t *); + +void mc_stream_leave (mc_main_t * mcm, u32 stream_index); + +void mc_wait_for_stream_ready (mc_main_t * m, char *stream_name); + +u32 mc_stream_send (mc_main_t * mcm, u32 stream_index, u32 buffer_index); + +void mc_main_init (mc_main_t * mcm, char *tag); + +void mc_enable_disable_mastership (mc_main_t * mcm, int we_can_be_master); + +void *mc_get_vlib_buffer (struct vlib_main_t *vm, u32 n_bytes, + u32 * bi_return); + +format_function_t format_mc_main; + +clib_error_t *mc_serialize_internal (mc_main_t * mc, + u32 stream_index, + u32 multiple_messages_per_vlib_buffer, + mc_serialize_msg_t * msg, ...); + +clib_error_t *mc_serialize_va (mc_main_t * mc, + u32 stream_index, + u32 multiple_messages_per_vlib_buffer, + mc_serialize_msg_t * msg, va_list * va); + +#define mc_serialize_stream(mc,si,msg,args...) \ + mc_serialize_internal((mc),(si),(0),(msg),(msg)->serialize,args) + +#define mc_serialize(mc,msg,args...) \ + mc_serialize_internal((mc),(~0),(0),(msg),(msg)->serialize,args) + +#define mc_serialize2(mc,add,msg,args...) 
\ + mc_serialize_internal((mc),(~0),(add),(msg),(msg)->serialize,args) + +void mc_unserialize (mc_main_t * mcm, mc_stream_t * s, u32 buffer_index); +uword mc_unserialize_message (mc_main_t * mcm, mc_stream_t * s, + serialize_main_t * m); + +serialize_function_t serialize_mc_main, unserialize_mc_main; + +always_inline uword +mc_max_message_size_in_bytes (mc_main_t * mcm) +{ + return mcm->transport.max_packet_size - sizeof (mc_msg_user_request_t); +} + +always_inline word +mc_serialize_n_bytes_left (mc_main_t * mcm, serialize_main_t * m) +{ + return mc_max_message_size_in_bytes (mcm) - + serialize_vlib_buffer_n_bytes (m); +} + +void unserialize_mc_stream (serialize_main_t * m, va_list * va); +void mc_stream_join_process_hold (void); + +#endif /* included_vlib_mc_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/node.c b/src/vlib/node.c new file mode 100644 index 00000000000..c419a13a487 --- /dev/null +++ b/src/vlib/node.c @@ -0,0 +1,631 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * node.c: VLIB processing nodes + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> +#include <vlib/threads.h> + +/* Query node given name. */ +vlib_node_t * +vlib_get_node_by_name (vlib_main_t * vm, u8 * name) +{ + vlib_node_main_t *nm = &vm->node_main; + uword *p; + u8 *key = name; + if (!clib_mem_is_heap_object (key)) + key = format (0, "%s", key); + p = hash_get (nm->node_by_name, key); + if (key != name) + vec_free (key); + return p ? 
vec_elt (nm->nodes, p[0]) : 0; +} + +static void +node_set_elog_name (vlib_main_t * vm, uword node_index) +{ + vlib_node_t *n = vlib_get_node (vm, node_index); + elog_event_type_t *t; + + t = vec_elt_at_index (vm->node_call_elog_event_types, node_index); + vec_free (t->format); + t->format = (char *) format (0, "%v-call: %%d%c", n->name, 0); + + t = vec_elt_at_index (vm->node_return_elog_event_types, node_index); + vec_free (t->format); + t->format = (char *) format (0, "%v-return: %%d%c", n->name, 0); + + n->name_elog_string = elog_string (&vm->elog_main, "%v%c", n->name, 0); +} + +void +vlib_node_rename (vlib_main_t * vm, u32 node_index, char *fmt, ...) +{ + va_list va; + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n = vlib_get_node (vm, node_index); + + va_start (va, fmt); + hash_unset (nm->node_by_name, n->name); + vec_free (n->name); + n->name = va_format (0, fmt, &va); + va_end (va); + hash_set (nm->node_by_name, n->name, n->index); + + node_set_elog_name (vm, node_index); +} + +static void +vlib_node_runtime_update (vlib_main_t * vm, u32 node_index, u32 next_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_runtime_t *r, *s; + vlib_node_t *node, *next_node; + vlib_next_frame_t *nf; + vlib_pending_frame_t *pf; + i32 i, j, n_insert; + + ASSERT (os_get_cpu_number () == 0); + + vlib_worker_thread_barrier_sync (vm); + + node = vec_elt (nm->nodes, node_index); + r = vlib_node_get_runtime (vm, node_index); + + n_insert = vec_len (node->next_nodes) - r->n_next_nodes; + if (n_insert > 0) + { + i = r->next_frame_index + r->n_next_nodes; + vec_insert (nm->next_frames, n_insert, i); + + /* Initialize newly inserted next frames. */ + for (j = 0; j < n_insert; j++) + vlib_next_frame_init (nm->next_frames + i + j); + + /* Relocate other next frames at higher indices. 
*/ + for (j = 0; j < vec_len (nm->nodes); j++) + { + s = vlib_node_get_runtime (vm, j); + if (j != node_index && s->next_frame_index >= i) + s->next_frame_index += n_insert; + } + + /* Pending frames may need to be relocated also. */ + vec_foreach (pf, nm->pending_frames) + { + if (pf->next_frame_index != VLIB_PENDING_FRAME_NO_NEXT_FRAME + && pf->next_frame_index >= i) + pf->next_frame_index += n_insert; + } + /* *INDENT-OFF* */ + pool_foreach (pf, nm->suspended_process_frames, ({ + if (pf->next_frame_index != ~0 && pf->next_frame_index >= i) + pf->next_frame_index += n_insert; + })); + /* *INDENT-ON* */ + + r->n_next_nodes = vec_len (node->next_nodes); + } + + /* Set frame's node runtime index. */ + next_node = vlib_get_node (vm, node->next_nodes[next_index]); + nf = nm->next_frames + r->next_frame_index + next_index; + nf->node_runtime_index = next_node->runtime_index; + + vlib_worker_thread_node_runtime_update (); + + vlib_worker_thread_barrier_release (vm); +} + +/* Add next node to given node in given slot. */ +uword +vlib_node_add_next_with_slot (vlib_main_t * vm, + uword node_index, + uword next_node_index, uword slot) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *node, *next; + uword *p; + + node = vec_elt (nm->nodes, node_index); + next = vec_elt (nm->nodes, next_node_index); + + /* Runtime has to be initialized. */ + ASSERT (nm->flags & VLIB_NODE_MAIN_RUNTIME_STARTED); + + if ((p = hash_get (node->next_slot_by_node, next_node_index))) + { + /* Next already exists: slot must match. 
*/ + if (slot != ~0) + ASSERT (slot == p[0]); + return p[0]; + } + + if (slot == ~0) + slot = vec_len (node->next_nodes); + + vec_validate_init_empty (node->next_nodes, slot, ~0); + vec_validate (node->n_vectors_by_next_node, slot); + + node->next_nodes[slot] = next_node_index; + hash_set (node->next_slot_by_node, next_node_index, slot); + + vlib_node_runtime_update (vm, node_index, slot); + + next->prev_node_bitmap = clib_bitmap_ori (next->prev_node_bitmap, + node_index); + + /* Siblings all get same node structure. */ + { + uword sib_node_index, sib_slot; + vlib_node_t *sib_node; + /* *INDENT-OFF* */ + clib_bitmap_foreach (sib_node_index, node->sibling_bitmap, ({ + sib_node = vec_elt (nm->nodes, sib_node_index); + if (sib_node != node) + { + sib_slot = vlib_node_add_next_with_slot (vm, sib_node_index, next_node_index, slot); + ASSERT (sib_slot == slot); + } + })); + /* *INDENT-ON* */ + } + + return slot; +} + +/* Add named next node to given node in given slot. */ +uword +vlib_node_add_named_next_with_slot (vlib_main_t * vm, + uword node, char *name, uword slot) +{ + vlib_node_main_t *nm; + vlib_node_t *n, *n_next; + + nm = &vm->node_main; + n = vlib_get_node (vm, node); + + n_next = vlib_get_node_by_name (vm, (u8 *) name); + if (!n_next) + { + if (nm->flags & VLIB_NODE_MAIN_RUNTIME_STARTED) + return ~0; + + if (slot == ~0) + slot = clib_max (vec_len (n->next_node_names), + vec_len (n->next_nodes)); + vec_validate (n->next_node_names, slot); + n->next_node_names[slot] = name; + return slot; + } + + return vlib_node_add_next_with_slot (vm, node, n_next->index, slot); +} + +static void +node_elog_init (vlib_main_t * vm, uword ni) +{ + elog_event_type_t t; + + memset (&t, 0, sizeof (t)); + + /* 2 event types for this node: one when node function is called. + One when it returns. 
*/ + vec_validate (vm->node_call_elog_event_types, ni); + vm->node_call_elog_event_types[ni] = t; + + vec_validate (vm->node_return_elog_event_types, ni); + vm->node_return_elog_event_types[ni] = t; + + node_set_elog_name (vm, ni); +} + +#ifdef CLIB_UNIX +#define STACK_ALIGN (clib_mem_get_page_size()) +#else +#define STACK_ALIGN CLIB_CACHE_LINE_BYTES +#endif + +static void +register_node (vlib_main_t * vm, vlib_node_registration_t * r) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n; + u32 page_size = clib_mem_get_page_size (); + int i; + + if (CLIB_DEBUG > 0) + { + /* Default (0) type should match INTERNAL. */ + vlib_node_t zero = { 0 }; + ASSERT (VLIB_NODE_TYPE_INTERNAL == zero.type); + } + + ASSERT (r->function != 0); + + n = clib_mem_alloc_no_fail (sizeof (n[0])); + memset (n, 0, sizeof (n[0])); + n->index = vec_len (nm->nodes); + + vec_add1 (nm->nodes, n); + + /* Name is always a vector so it can be formatted with %v. */ + if (clib_mem_is_heap_object (vec_header (r->name, 0))) + n->name = vec_dup ((u8 *) r->name); + else + n->name = format (0, "%s", r->name); + + if (!nm->node_by_name) + nm->node_by_name = hash_create_vec ( /* size */ 32, + sizeof (n->name[0]), sizeof (uword)); + + /* Node names must be unique. */ + { + vlib_node_t *o = vlib_get_node_by_name (vm, n->name); + if (o) + clib_error ("more than one node named `%v'", n->name); + } + + hash_set (nm->node_by_name, n->name, n->index); + + r->index = n->index; /* save index in registration */ + n->function = r->function; + + /* Node index of next sibling will be filled in by vlib_node_main_init. 
*/ + n->sibling_of = r->sibling_of; + if (r->sibling_of && r->n_next_nodes > 0) + clib_error ("sibling node should not have any next nodes `%v'", n->name); + + if (r->type == VLIB_NODE_TYPE_INTERNAL) + ASSERT (r->vector_size > 0); + +#define _(f) n->f = r->f + + _(type); + _(flags); + _(state); + _(scalar_size); + _(vector_size); + _(format_buffer); + _(unformat_buffer); + _(format_trace); + _(validate_frame); + + /* Register error counters. */ + vlib_register_errors (vm, n->index, r->n_errors, r->error_strings); + node_elog_init (vm, n->index); + + _(runtime_data_bytes); + if (r->runtime_data_bytes > 0) + { + vec_resize (n->runtime_data, r->runtime_data_bytes); + if (r->runtime_data) + clib_memcpy (n->runtime_data, r->runtime_data, r->runtime_data_bytes); + } + + vec_resize (n->next_node_names, r->n_next_nodes); + for (i = 0; i < r->n_next_nodes; i++) + n->next_node_names[i] = r->next_nodes[i]; + + vec_validate_init_empty (n->next_nodes, r->n_next_nodes - 1, ~0); + vec_validate (n->n_vectors_by_next_node, r->n_next_nodes - 1); + + n->owner_node_index = n->owner_next_index = ~0; + + /* Initialize node runtime. */ + { + vlib_node_runtime_t *rt; + u32 i; + + if (n->type == VLIB_NODE_TYPE_PROCESS) + { + vlib_process_t *p; + uword log2_n_stack_bytes; + + log2_n_stack_bytes = clib_max (r->process_log2_n_stack_bytes, 15); + +#ifdef CLIB_UNIX + /* + * Bump the stack size if running over a kernel with a large page size, + * and the stack isn't any too big to begin with. Otherwise, we'll + * trip over the stack guard page for sure. 
+ */ + if ((page_size > (4 << 10)) && log2_n_stack_bytes < 19) + { + if ((1 << log2_n_stack_bytes) <= page_size) + log2_n_stack_bytes = min_log2 (page_size) + 1; + else + log2_n_stack_bytes++; + } +#endif + + p = clib_mem_alloc_aligned_at_offset + (sizeof (p[0]) + (1 << log2_n_stack_bytes), + STACK_ALIGN, STRUCT_OFFSET_OF (vlib_process_t, stack), + 0 /* no, don't call os_out_of_memory */ ); + if (p == 0) + clib_panic ("failed to allocate process stack (%d bytes)", + 1 << log2_n_stack_bytes); + + memset (p, 0, sizeof (p[0])); + p->log2_n_stack_bytes = log2_n_stack_bytes; + + /* Process node's runtime index is really index into process + pointer vector. */ + n->runtime_index = vec_len (nm->processes); + + vec_add1 (nm->processes, p); + + /* Paint first stack word with magic number so we can at least + detect process stack overruns. */ + p->stack[0] = VLIB_PROCESS_STACK_MAGIC; + + /* Node runtime is stored inside of process. */ + rt = &p->node_runtime; + +#ifdef CLIB_UNIX + /* + * Disallow writes to the bottom page of the stack, to + * catch stack overflows. 
+ */ + if (mprotect (p->stack, page_size, PROT_READ) < 0) + clib_unix_warning ("process stack"); +#endif + + } + else + { + vec_add2_aligned (nm->nodes_by_type[n->type], rt, 1, + /* align */ CLIB_CACHE_LINE_BYTES); + n->runtime_index = rt - nm->nodes_by_type[n->type]; + } + + if (n->type == VLIB_NODE_TYPE_INPUT) + nm->input_node_counts_by_state[n->state] += 1; + + rt->function = n->function; + rt->flags = n->flags; + rt->state = n->state; + rt->node_index = n->index; + + rt->n_next_nodes = r->n_next_nodes; + rt->next_frame_index = vec_len (nm->next_frames); + + vec_resize (nm->next_frames, rt->n_next_nodes); + for (i = 0; i < rt->n_next_nodes; i++) + vlib_next_frame_init (nm->next_frames + rt->next_frame_index + i); + + vec_resize (rt->errors, r->n_errors); + for (i = 0; i < vec_len (rt->errors); i++) + rt->errors[i] = vlib_error_set (n->index, i); + + STATIC_ASSERT_SIZEOF (vlib_node_runtime_t, 128); + ASSERT (vec_len (n->runtime_data) <= + sizeof (vlib_node_runtime_t) - + STRUCT_OFFSET_OF (vlib_node_runtime_t, runtime_data)); + + if (vec_len (n->runtime_data) > 0) + clib_memcpy (rt->runtime_data, n->runtime_data, + vec_len (n->runtime_data)); + + vec_free (n->runtime_data); + } +} + +/* Register new packet processing node. 
*/ +u32 +vlib_register_node (vlib_main_t * vm, vlib_node_registration_t * r) +{ + register_node (vm, r); + return r->index; +} + +static uword +null_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + u16 n_vectors = frame->n_vectors; + + vlib_node_increment_counter (vm, node->node_index, 0, n_vectors); + vlib_buffer_free (vm, vlib_frame_args (frame), n_vectors); + vlib_frame_free (vm, node, frame); + + return n_vectors; +} + +void +vlib_register_all_static_nodes (vlib_main_t * vm) +{ + vlib_node_registration_t *r; + + static char *null_node_error_strings[] = { + "blackholed packets", + }; + + static vlib_node_registration_t null_node_reg = { + .function = null_node_fn, + .vector_size = sizeof (u32), + .name = "null-node", + .n_errors = 1, + .error_strings = null_node_error_strings, + }; + + /* make sure that node index 0 is not used by + real node */ + register_node (vm, &null_node_reg); + + r = vm->node_main.node_registrations; + while (r) + { + register_node (vm, r); + r = r->next_registration; + } +} + +clib_error_t * +vlib_node_main_init (vlib_main_t * vm) +{ + vlib_node_main_t *nm = &vm->node_main; + clib_error_t *error = 0; + vlib_node_t *n; + uword ni; + + nm->flags |= VLIB_NODE_MAIN_RUNTIME_STARTED; + + /* Generate sibling relationships */ + { + vlib_node_t *n, *sib; + uword si; + + for (ni = 0; ni < vec_len (nm->nodes); ni++) + { + n = vec_elt (nm->nodes, ni); + + if (!n->sibling_of) + continue; + + sib = vlib_get_node_by_name (vm, (u8 *) n->sibling_of); + if (!sib) + { + error = clib_error_create ("sibling `%s' not found for node `%v'", + n->sibling_of, n->name); + goto done; + } + + /* *INDENT-OFF* */ + clib_bitmap_foreach (si, sib->sibling_bitmap, ({ + vlib_node_t * m = vec_elt (nm->nodes, si); + + /* Connect all of sibling's siblings to us. */ + m->sibling_bitmap = clib_bitmap_ori (m->sibling_bitmap, n->index); + + /* Connect us to all of sibling's siblings. 
*/ + n->sibling_bitmap = clib_bitmap_ori (n->sibling_bitmap, si); + })); + /* *INDENT-ON* */ + + /* Connect sibling to us. */ + sib->sibling_bitmap = clib_bitmap_ori (sib->sibling_bitmap, n->index); + + /* Connect us to sibling. */ + n->sibling_bitmap = clib_bitmap_ori (n->sibling_bitmap, sib->index); + } + } + + /* Resolve next names into next indices. */ + for (ni = 0; ni < vec_len (nm->nodes); ni++) + { + uword i; + + n = vec_elt (nm->nodes, ni); + + for (i = 0; i < vec_len (n->next_node_names); i++) + { + char *a = n->next_node_names[i]; + + if (!a) + continue; + + if (~0 == vlib_node_add_named_next_with_slot (vm, n->index, a, i)) + { + error = clib_error_create + ("node `%v' refers to unknown node `%s'", n->name, a); + goto done; + } + } + + vec_free (n->next_node_names); + } + + /* Set previous node pointers. */ + for (ni = 0; ni < vec_len (nm->nodes); ni++) + { + vlib_node_t *n_next; + uword i; + + n = vec_elt (nm->nodes, ni); + + for (i = 0; i < vec_len (n->next_nodes); i++) + { + if (n->next_nodes[i] >= vec_len (nm->nodes)) + continue; + + n_next = vec_elt (nm->nodes, n->next_nodes[i]); + n_next->prev_node_bitmap = + clib_bitmap_ori (n_next->prev_node_bitmap, n->index); + } + } + + { + vlib_next_frame_t *nf; + vlib_node_runtime_t *r; + vlib_node_t *next; + uword i; + + vec_foreach (r, nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]) + { + if (r->n_next_nodes == 0) + continue; + + n = vlib_get_node (vm, r->node_index); + nf = vec_elt_at_index (nm->next_frames, r->next_frame_index); + + for (i = 0; i < vec_len (n->next_nodes); i++) + { + next = vlib_get_node (vm, n->next_nodes[i]); + + /* Validate node runtime indices are correctly initialized. 
*/ + ASSERT (nf[i].node_runtime_index == next->runtime_index); + + nf[i].flags = 0; + if (next->flags & VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH) + nf[i].flags |= VLIB_FRAME_NO_FREE_AFTER_DISPATCH; + } + } + } + +done: + return error; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/node.h b/src/vlib/node.h new file mode 100644 index 00000000000..b624e9d636d --- /dev/null +++ b/src/vlib/node.h @@ -0,0 +1,725 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * node.h: VLIB processing nodes + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_node_h +#define included_vlib_node_h + +#include <vppinfra/cpu.h> +#include <vppinfra/longjmp.h> +#include <vppinfra/timing_wheel.h> +#include <vlib/trace.h> /* for vlib_trace_filter_t */ + +/* Forward declaration. */ +struct vlib_node_runtime_t; +struct vlib_frame_t; + +/* Internal nodes (including output nodes) move data from node to + node (or out of the graph for output nodes). */ +typedef uword (vlib_node_function_t) (struct vlib_main_t * vm, + struct vlib_node_runtime_t * node, + struct vlib_frame_t * frame); + +typedef enum +{ + /* An internal node on the call graph (could be output). */ + VLIB_NODE_TYPE_INTERNAL, + + /* Nodes which input data into the processing graph. + Input nodes are called for each iteration of main loop. */ + VLIB_NODE_TYPE_INPUT, + + /* Nodes to be called before all input nodes. + Used, for example, to clean out driver TX rings before + processing input. */ + VLIB_NODE_TYPE_PRE_INPUT, + + /* "Process" nodes which can be suspended and later resumed. */ + VLIB_NODE_TYPE_PROCESS, + + VLIB_N_NODE_TYPE, +} vlib_node_type_t; + +typedef struct _vlib_node_registration +{ + /* Vector processing function for this node. */ + vlib_node_function_t *function; + + /* Node name. */ + char *name; + + /* Name of sibling (if applicable). */ + char *sibling_of; + + /* Node index filled in by registration. */ + u32 index; + + /* Type of this node. */ + vlib_node_type_t type; + + /* Error strings indexed by error code for this node. 
*/ + char **error_strings; + + /* Buffer format/unformat for this node. */ + format_function_t *format_buffer; + unformat_function_t *unformat_buffer; + + /* Trace format/unformat for this node. */ + format_function_t *format_trace; + unformat_function_t *unformat_trace; + + /* Function to validate incoming frames. */ + u8 *(*validate_frame) (struct vlib_main_t * vm, + struct vlib_node_runtime_t *, + struct vlib_frame_t * f); + + /* Per-node runtime data. */ + void *runtime_data; + + /* Process stack size. */ + u16 process_log2_n_stack_bytes; + + /* Number of bytes of per-node run time data. */ + u8 runtime_data_bytes; + + /* State for input nodes. */ + u8 state; + + /* Node flags. */ + u16 flags; + + /* Size of scalar and vector arguments in bytes. */ + u16 scalar_size, vector_size; + + /* Number of error codes used by this node. */ + u16 n_errors; + + /* Number of next node names that follow. */ + u16 n_next_nodes; + + /* Constructor link-list, don't ask... */ + struct _vlib_node_registration *next_registration; + + /* Names of next nodes which this node feeds into. */ + char *next_nodes[]; + +} vlib_node_registration_t; + +#define VLIB_REGISTER_NODE(x,...) 
\ + __VA_ARGS__ vlib_node_registration_t x; \ +static void __vlib_add_node_registration_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_add_node_registration_##x (void) \ +{ \ + vlib_main_t * vm = vlib_get_main(); \ + x.next_registration = vm->node_main.node_registrations; \ + vm->node_main.node_registrations = &x; \ +} \ +__VA_ARGS__ vlib_node_registration_t x + +#if CLIB_DEBUG > 0 +#define VLIB_NODE_FUNCTION_CLONE_TEMPLATE(arch, fn) +#define VLIB_NODE_FUNCTION_MULTIARCH_CLONE(fn) +#define VLIB_NODE_FUNCTION_MULTIARCH(node, fn) +#else +#define VLIB_NODE_FUNCTION_CLONE_TEMPLATE(arch, fn, tgt) \ + uword \ + __attribute__ ((flatten)) \ + __attribute__ ((target (tgt))) \ + CLIB_CPU_OPTIMIZED \ + fn ## _ ## arch ( struct vlib_main_t * vm, \ + struct vlib_node_runtime_t * node, \ + struct vlib_frame_t * frame) \ + { return fn (vm, node, frame); } + +#define VLIB_NODE_FUNCTION_MULTIARCH_CLONE(fn) \ + foreach_march_variant(VLIB_NODE_FUNCTION_CLONE_TEMPLATE, fn) + +#define VLIB_NODE_FUNCTION_MULTIARCH(node, fn) \ + VLIB_NODE_FUNCTION_MULTIARCH_CLONE(fn) \ + CLIB_MULTIARCH_SELECT_FN(fn, static inline) \ + static void __attribute__((__constructor__)) \ + __vlib_node_function_multiarch_select_##node (void) \ + { node.function = fn ## _multiarch_select(); } +#endif + +always_inline vlib_node_registration_t * +vlib_node_next_registered (vlib_node_registration_t * c) +{ + c = + clib_elf_section_data_next (c, + c->n_next_nodes * sizeof (c->next_nodes[0])); + return c; +} + +typedef struct +{ + /* Total calls, clock ticks and vector elements processed for this node. */ + u64 calls, vectors, clocks, suspends; + u64 max_clock; + u64 max_clock_n; +} vlib_node_stats_t; + +#define foreach_vlib_node_state \ + /* Input node is called each iteration of main loop. \ + This is the default (zero). */ \ + _ (POLLING) \ + /* Input node is called when device signals an interrupt. */ \ + _ (INTERRUPT) \ + /* Input node is never called. 
*/ \ + _ (DISABLED) + +typedef enum +{ +#define _(f) VLIB_NODE_STATE_##f, + foreach_vlib_node_state +#undef _ + VLIB_N_NODE_STATE, +} vlib_node_state_t; + +typedef struct vlib_node_t +{ + /* Vector processing function for this node. */ + vlib_node_function_t *function; + + /* Node name. */ + u8 *name; + + /* Node name index in elog string table. */ + u32 name_elog_string; + + /* Total statistics for this node. */ + vlib_node_stats_t stats_total; + + /* Saved values as of last clear (or zero if never cleared). + Current values are always stats_total - stats_last_clear. */ + vlib_node_stats_t stats_last_clear; + + /* Type of this node. */ + vlib_node_type_t type; + + /* Node index. */ + u32 index; + + /* Index of corresponding node runtime. */ + u32 runtime_index; + + /* Runtime data for this node. */ + void *runtime_data; + + /* Node flags. */ + u16 flags; + + /* Processing function keeps frame. Tells node dispatching code not + to free frame after dispatch is done. */ +#define VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH (1 << 0) + + /* Node counts as output/drop/punt node for stats purposes. */ +#define VLIB_NODE_FLAG_IS_OUTPUT (1 << 1) +#define VLIB_NODE_FLAG_IS_DROP (1 << 2) +#define VLIB_NODE_FLAG_IS_PUNT (1 << 3) +#define VLIB_NODE_FLAG_IS_HANDOFF (1 << 4) + + /* Set if current node runtime has traced vectors. */ +#define VLIB_NODE_FLAG_TRACE (1 << 5) + +#define VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE (1 << 6) +#define VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE (1 << 7) + + /* State for input nodes. */ + u8 state; + + /* Number of bytes of run time data. */ + u8 runtime_data_bytes; + + /* Number of error codes used by this node. */ + u16 n_errors; + + /* Size of scalar and vector arguments in bytes. */ + u16 scalar_size, vector_size; + + /* Handle/index in error heap for this node. */ + u32 error_heap_handle; + u32 error_heap_index; + + /* Error strings indexed by error code for this node. 
*/ + char **error_strings; + + /* Vector of next node names. + Only used before next_nodes array is initialized. */ + char **next_node_names; + + /* Next node indices for this node. */ + u32 *next_nodes; + + /* Name of node that we are sibling of. */ + char *sibling_of; + + /* Bitmap of all of this node's siblings. */ + uword *sibling_bitmap; + + /* Total number of vectors sent to each next node. */ + u64 *n_vectors_by_next_node; + + /* Hash table mapping next node index into slot in + next_nodes vector. Quickly determines whether this node + is connected to given next node and, if so, with which slot. */ + uword *next_slot_by_node; + + /* Bitmap of node indices which feed this node. */ + uword *prev_node_bitmap; + + /* Node/next-index which own enqueue rights with to this node. */ + u32 owner_node_index, owner_next_index; + + /* Buffer format/unformat for this node. */ + format_function_t *format_buffer; + unformat_function_t *unformat_buffer; + + /* Trace buffer format/unformat for this node. */ + format_function_t *format_trace; + + /* Function to validate incoming frames. */ + u8 *(*validate_frame) (struct vlib_main_t * vm, + struct vlib_node_runtime_t *, + struct vlib_frame_t * f); + /* for pretty-printing, not typically valid */ + u8 *state_string; +} vlib_node_t; + +#define VLIB_INVALID_NODE_INDEX ((u32) ~0) + +/* Max number of vector elements to process at once per node. */ +#define VLIB_FRAME_SIZE 256 +#define VLIB_FRAME_ALIGN VLIB_MAX_CPUS + +/* Calling frame (think stack frame) for a node. */ +typedef struct vlib_frame_t +{ + /* Frame flags. */ + u16 flags; + + /* Number of scalar bytes in arguments. */ + u8 scalar_size; + + /* Number of bytes per vector argument. */ + u8 vector_size; + + /* Number of vector elements currently in frame. */ + u16 n_vectors; + + /* Owner cpuid / heap id */ + u16 cpu_index; + + /* Scalar and vector arguments to next node. */ + u8 arguments[0]; +} vlib_frame_t; + +typedef struct +{ + /* Frame index. 
*/ + u32 frame_index; + + /* Node runtime for this next. */ + u32 node_runtime_index; + + /* Next frame flags. */ + u32 flags; + + /* Reflects node frame-used flag for this next. */ +#define VLIB_FRAME_NO_FREE_AFTER_DISPATCH \ + VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH + + /* This next frame owns enqueue to node + corresponding to node_runtime_index. */ +#define VLIB_FRAME_OWNER (1 << 15) + + /* Set when frame has been allocated for this next. */ +#define VLIB_FRAME_IS_ALLOCATED VLIB_NODE_FLAG_IS_OUTPUT + + /* Set when frame has been added to pending vector. */ +#define VLIB_FRAME_PENDING VLIB_NODE_FLAG_IS_DROP + + /* Set when frame is to be freed after dispatch. */ +#define VLIB_FRAME_FREE_AFTER_DISPATCH VLIB_NODE_FLAG_IS_PUNT + + /* Set when frame has traced packets. */ +#define VLIB_FRAME_TRACE VLIB_NODE_FLAG_TRACE + + /* Number of vectors enqueue to this next since last overflow. */ + u32 vectors_since_last_overflow; +} vlib_next_frame_t; + +always_inline void +vlib_next_frame_init (vlib_next_frame_t * nf) +{ + memset (nf, 0, sizeof (nf[0])); + nf->frame_index = ~0; + nf->node_runtime_index = ~0; +} + +/* A frame pending dispatch by main loop. */ +typedef struct +{ + /* Node and runtime for this frame. */ + u32 node_runtime_index; + + /* Frame index (in the heap). */ + u32 frame_index; + + /* Start of next frames for this node. */ + u32 next_frame_index; + + /* Special value for next_frame_index when there is no next frame. */ +#define VLIB_PENDING_FRAME_NO_NEXT_FRAME ((u32) ~0) +} vlib_pending_frame_t; + +typedef struct vlib_node_runtime_t +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + /* Node function to call. */ + vlib_node_function_t *function; + + /* Vector of errors for this node. */ + vlib_error_t *errors; + + /* Number of clock cycles. */ + u32 clocks_since_last_overflow; + + /* Maximum clock cycle for an invocation. */ + u32 max_clock; + + /* Number of vectors in the recorded max_clock. */ + u32 max_clock_n; + + /* Number of calls. 
*/ + u32 calls_since_last_overflow; + + /* Number of vector elements processed by this node. */ + u32 vectors_since_last_overflow; + + /* Start of next frames for this node. */ + u32 next_frame_index; + + /* Node index. */ + u32 node_index; + + /* For input nodes: decremented on each main loop interation until it reaches zero + and function is called. Allows some input nodes to be called + more than others. */ + u32 input_main_loops_per_call; + + /* Saved main loop counter of last dispatch of this node. */ + u32 main_loop_count_last_dispatch; + + u32 main_loop_vector_stats[2]; + + /* Copy of main node flags. */ + u16 flags; + + /* Input node state. */ + u16 state; + + u16 n_next_nodes; + + /* Next frame index that vector arguments were last enqueued to + last time this node ran. Set to zero before first run + of this node. */ + u16 cached_next_index; + + /* CPU this node runs on */ + u16 cpu_index; + + /* Function dependent node-runtime. */ + u8 runtime_data[0]; +} +vlib_node_runtime_t; + +typedef struct +{ + /* Number of allocated frames for this scalar/vector size. */ + u32 n_alloc_frames; + + /* Vector of free frame indices for this scalar/vector size. */ + u32 *free_frame_indices; +} vlib_frame_size_t; + +typedef struct +{ + /* Users opaque value for event type. */ + uword opaque; +} vlib_process_event_type_t; + +typedef struct +{ + /* Node runtime for this process. */ + vlib_node_runtime_t node_runtime; + + /* Where to longjmp when process is done. */ + clib_longjmp_t return_longjmp; + +#define VLIB_PROCESS_RETURN_LONGJMP_RETURN ((uword) ~0 - 0) +#define VLIB_PROCESS_RETURN_LONGJMP_SUSPEND ((uword) ~0 - 1) + + /* Where to longjmp to resume node after suspend. 
*/ + clib_longjmp_t resume_longjmp; +#define VLIB_PROCESS_RESUME_LONGJMP_SUSPEND 0 +#define VLIB_PROCESS_RESUME_LONGJMP_RESUME 1 + + u16 flags; +#define VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK (1 << 0) +#define VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT (1 << 1) + /* Set to indicate that this process has been added to resume vector. */ +#define VLIB_PROCESS_RESUME_PENDING (1 << 2) + + /* Process function is currently running. */ +#define VLIB_PROCESS_IS_RUNNING (1 << 3) + + /* Size of process stack. */ + u16 log2_n_stack_bytes; + + u32 suspended_process_frame_index; + + /* Number of times this process was suspended. */ + u32 n_suspends; + + /* Vectors of pending event data indexed by event type index. */ + void **pending_event_data_by_type_index; + + /* Bitmap of event type-indices with non-empty vectors. */ + uword *non_empty_event_type_bitmap; + + /* Bitmap of event type-indices which are one time events. */ + uword *one_time_event_type_bitmap; + + /* Type is opaque pointer -- typically a pointer to an event handler + function. Hash table to map opaque to a type index. */ + uword *event_type_index_by_type_opaque; + + /* Pool of currently valid event types. */ + vlib_process_event_type_t *event_type_pool; + + /* When suspending saves cpu cycle counter when process is to be resumed. */ + u64 resume_cpu_time; + + /* Default output function and its argument for any CLI outputs + within the process. */ + vlib_cli_output_function_t *output_function; + uword output_function_arg; + +#ifdef CLIB_UNIX + /* Pad to a multiple of the page size so we can mprotect process stacks */ +#define PAGE_SIZE_MULTIPLE 0x1000 +#define ALIGN_ON_MULTIPLE_PAGE_BOUNDARY_FOR_MPROTECT __attribute__ ((aligned (PAGE_SIZE_MULTIPLE))) +#else +#define ALIGN_ON_MULTIPLE_PAGE_BOUNDARY_FOR_MPROTECT +#endif + + /* Process stack. Starts here and extends 2^log2_n_stack_bytes + bytes. 
*/ + +#define VLIB_PROCESS_STACK_MAGIC (0xdead7ead) + u32 stack[0] ALIGN_ON_MULTIPLE_PAGE_BOUNDARY_FOR_MPROTECT; +} vlib_process_t __attribute__ ((aligned (CLIB_CACHE_LINE_BYTES))); + +#ifdef CLIB_UNIX + /* Ensure that the stack is aligned on the multiple of the page size */ +typedef char + assert_process_stack_must_be_aligned_exactly_to_page_size_multiple[(sizeof + (vlib_process_t) + - + PAGE_SIZE_MULTIPLE) + == + 0 ? 0 : + -1]; +#endif + +typedef struct +{ + u32 node_index; + + u32 one_time_event; +} vlib_one_time_waiting_process_t; + +typedef struct +{ + u16 n_data_elts; + + u16 n_data_elt_bytes; + + /* n_data_elts * n_data_elt_bytes */ + u32 n_data_bytes; + + /* Process node & event type to be used to signal event. */ + u32 process_node_index; + + u32 event_type_index; + + union + { + u8 inline_event_data[64 - 3 * sizeof (u32) - 2 * sizeof (u16)]; + + /* Vector of event data used only when data does not fit inline. */ + u8 *event_data_as_vector; + }; +} +vlib_signal_timed_event_data_t; + +always_inline uword +vlib_timing_wheel_data_is_timed_event (u32 d) +{ + return d & 1; +} + +always_inline u32 +vlib_timing_wheel_data_set_suspended_process (u32 i) +{ + return 0 + 2 * i; +} + +always_inline u32 +vlib_timing_wheel_data_set_timed_event (u32 i) +{ + return 1 + 2 * i; +} + +always_inline uword +vlib_timing_wheel_data_get_index (u32 d) +{ + return d / 2; +} + +typedef struct +{ + /* Public nodes. */ + vlib_node_t **nodes; + + /* Node index hashed by node name. */ + uword *node_by_name; + + u32 flags; +#define VLIB_NODE_MAIN_RUNTIME_STARTED (1 << 0) + + /* Nodes segregated by type for cache locality. + Does not apply to nodes of type VLIB_NODE_TYPE_INTERNAL. */ + vlib_node_runtime_t *nodes_by_type[VLIB_N_NODE_TYPE]; + + /* Node runtime indices for input nodes with pending interrupts. 
*/ + u32 *pending_interrupt_node_runtime_indices; + + /* Input nodes are switched from/to interrupt to/from polling mode + when average vector length goes above/below polling/interrupt + thresholds. */ + u32 polling_threshold_vector_length; + u32 interrupt_threshold_vector_length; + + /* Vector of next frames. */ + vlib_next_frame_t *next_frames; + + /* Vector of internal node's frames waiting to be called. */ + vlib_pending_frame_t *pending_frames; + + /* Timing wheel for scheduling time-based node dispatch. */ + timing_wheel_t timing_wheel; + + vlib_signal_timed_event_data_t *signal_timed_event_data_pool; + + /* Opaque data vector added via timing_wheel_advance. */ + u32 *data_from_advancing_timing_wheel; + + /* CPU time of next process to be ready on timing wheel. */ + u64 cpu_time_next_process_ready; + + /* Vector of process nodes. + One for each node of type VLIB_NODE_TYPE_PROCESS. */ + vlib_process_t **processes; + + /* Current running process or ~0 if no process running. */ + u32 current_process_index; + + /* Pool of pending process frames. */ + vlib_pending_frame_t *suspended_process_frames; + + /* Vector of event data vectors pending recycle. */ + void **recycled_event_data_vectors; + + /* Current counts of nodes in each state. */ + u32 input_node_counts_by_state[VLIB_N_NODE_STATE]; + + /* Hash of (scalar_size,vector_size) to frame_sizes index. */ + uword *frame_size_hash; + + /* Per-size frame allocation information. */ + vlib_frame_size_t *frame_sizes; + + /* Time of last node runtime stats clear. 
*/ + f64 time_last_runtime_stats_clear; + + /* Node registrations added by constructors */ + vlib_node_registration_t *node_registrations; +} vlib_node_main_t; + + +#define FRAME_QUEUE_MAX_NELTS 32 +typedef struct +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + u64 head; + u64 head_hint; + u64 tail; + u32 n_in_use; + u32 nelts; + u32 written; + u32 threshold; + i32 n_vectors[FRAME_QUEUE_MAX_NELTS]; +} frame_queue_trace_t; + +typedef struct +{ + u64 count[FRAME_QUEUE_MAX_NELTS]; +} frame_queue_nelt_counter_t; + +#endif /* included_vlib_node_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/node_cli.c b/src/vlib/node_cli.c new file mode 100644 index 00000000000..05d0f0b5a95 --- /dev/null +++ b/src/vlib/node_cli.c @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * node_cli.c: node CLI + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vlib/vlib.h> +#include <vlib/threads.h> + +static int +node_cmp (void *a1, void *a2) +{ + vlib_node_t **n1 = a1; + vlib_node_t **n2 = a2; + + return vec_cmp (n1[0]->name, n2[0]->name); +} + +static clib_error_t * +show_node_graph (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n; + u32 node_index; + + vlib_cli_output (vm, "%U\n", format_vlib_node_graph, nm, 0); + + if (unformat (input, "%U", unformat_vlib_node, vm, &node_index)) + { + n = vlib_get_node (vm, node_index); + vlib_cli_output (vm, "%U\n", format_vlib_node_graph, nm, n); + } + else + { + vlib_node_t **nodes = vec_dup (nm->nodes); + uword i; + + vec_sort_with_function (nodes, node_cmp); + + for (i = 0; i < vec_len (nodes); i++) + vlib_cli_output (vm, "%U\n\n", format_vlib_node_graph, nm, nodes[i]); + + vec_free (nodes); + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_node_graph_command, static) = { + .path = "show vlib graph", + .short_help = "Show packet processing node graph", + .function = show_node_graph, +}; +/* *INDENT-ON* */ + +static u8 * +format_vlib_node_stats (u8 * s, va_list * va) +{ + vlib_main_t *vm = va_arg (*va, vlib_main_t *); + vlib_node_t *n = va_arg (*va, vlib_node_t *); + int max = va_arg (*va, int); + f64 v; + char *state; + u8 *ns; + u8 *misc_info = 0; + u64 c, p, l, d; + f64 x; + f64 maxc, maxcn; + u32 maxn; + uword indent; + + if (!n) + { + if (max) + return format (s, + "%=30s%=17s%=16s%=16s%=16s%=16s", + "Name", "Max Node Clocks", "Vectors at Max", + "Max Clocks", "Avg Clocks", "Avg Vectors/Call"); + else + return format (s, + "%=30s%=12s%=16s%=16s%=16s%=16s%=16s", + "Name", "State", "Calls", "Vectors", "Suspends", + "Clocks", "Vectors/Call"); + } + + indent = format_get_indent (s); + + l = n->stats_total.clocks - n->stats_last_clear.clocks; + c = n->stats_total.calls - n->stats_last_clear.calls; + p = n->stats_total.vectors - n->stats_last_clear.vectors; + d 
= n->stats_total.suspends - n->stats_last_clear.suspends; + maxc = (f64) n->stats_total.max_clock; + maxn = n->stats_total.max_clock_n; + if (n->stats_total.max_clock_n) + maxcn = (f64) n->stats_total.max_clock / (f64) maxn; + else + maxcn = 0.0; + + /* Clocks per packet, per call or per suspend. */ + x = 0; + if (p > 0) + x = (f64) l / (f64) p; + else if (c > 0) + x = (f64) l / (f64) c; + else if (d > 0) + x = (f64) l / (f64) d; + + if (c > 0) + v = (double) p / (double) c; + else + v = 0; + + state = "active"; + if (n->type == VLIB_NODE_TYPE_PROCESS) + { + vlib_process_t *p = vlib_get_process_from_node (vm, n); + + /* Show processes with events pending. This helps spot bugs where events are not + being handled. */ + if (!clib_bitmap_is_zero (p->non_empty_event_type_bitmap)) + misc_info = format (misc_info, "events pending, "); + + switch (p->flags & (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK + | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT)) + { + default: + if (!(p->flags & VLIB_PROCESS_IS_RUNNING)) + state = "done"; + break; + + case VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK: + state = "time wait"; + break; + + case VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT: + state = "event wait"; + break; + + case (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK): + state = + "any wait"; + break; + } + } + else if (n->type != VLIB_NODE_TYPE_INTERNAL) + { + state = "polling"; + if (n->state == VLIB_NODE_STATE_DISABLED) + state = "disabled"; + else if (n->state == VLIB_NODE_STATE_INTERRUPT) + state = "interrupt wait"; + } + + ns = n->name; + + if (max) + s = format (s, "%-30v%=17.2e%=16d%=16.2e%=16.2e%=16.2e", + ns, maxc, maxn, maxcn, x, v); + else + s = format (s, "%-30v%=12s%16Ld%16Ld%16Ld%16.2e%16.2f", ns, state, + c, p, d, x, v); + + if (ns != n->name) + vec_free (ns); + + if (misc_info) + { + s = format (s, "\n%U%v", format_white_space, indent + 4, misc_info); + vec_free (misc_info); + } + + return s; +} + +static clib_error_t 
* +show_node_runtime (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n; + f64 time_now; + u32 node_index; + vlib_node_t ***node_dups = 0; + f64 *vectors_per_main_loop = 0; + f64 *last_vector_length_per_node = 0; + + time_now = vlib_time_now (vm); + + if (unformat (input, "%U", unformat_vlib_node, vm, &node_index)) + { + n = vlib_get_node (vm, node_index); + vlib_node_sync_stats (vm, n); + vlib_cli_output (vm, "%U\n", format_vlib_node_stats, vm, 0, 0); + vlib_cli_output (vm, "%U\n", format_vlib_node_stats, vm, n, 0); + } + else + { + vlib_node_t **nodes; + uword i, j; + f64 dt; + u64 n_input, n_output, n_drop, n_punt; + u64 n_internal_vectors, n_internal_calls; + u64 n_clocks, l, v, c, d; + int brief = 1; + int max = 0; + vlib_main_t **stat_vms = 0, *stat_vm; + + /* Suppress nodes with zero calls since last clear */ + if (unformat (input, "brief") || unformat (input, "b")) + brief = 1; + if (unformat (input, "verbose") || unformat (input, "v")) + brief = 0; + if (unformat (input, "max") || unformat (input, "m")) + max = 1; + + if (vec_len (vlib_mains) == 0) + vec_add1 (stat_vms, vm); + else + { + for (i = 0; i < vec_len (vlib_mains); i++) + { + stat_vm = vlib_mains[i]; + if (stat_vm) + vec_add1 (stat_vms, stat_vm); + } + } + + /* + * Barrier sync across stats scraping. + * Otherwise, the counts will be grossly inaccurate. 
+ */ + vlib_worker_thread_barrier_sync (vm); + + for (j = 0; j < vec_len (stat_vms); j++) + { + stat_vm = stat_vms[j]; + nm = &stat_vm->node_main; + + for (i = 0; i < vec_len (nm->nodes); i++) + { + n = nm->nodes[i]; + vlib_node_sync_stats (stat_vm, n); + } + + nodes = vec_dup (nm->nodes); + + vec_add1 (node_dups, nodes); + vec_add1 (vectors_per_main_loop, + vlib_last_vectors_per_main_loop_as_f64 (stat_vm)); + vec_add1 (last_vector_length_per_node, + vlib_last_vector_length_per_node (stat_vm)); + } + vlib_worker_thread_barrier_release (vm); + + + for (j = 0; j < vec_len (stat_vms); j++) + { + stat_vm = stat_vms[j]; + nodes = node_dups[j]; + + vec_sort_with_function (nodes, node_cmp); + + n_input = n_output = n_drop = n_punt = n_clocks = 0; + n_internal_vectors = n_internal_calls = 0; + for (i = 0; i < vec_len (nodes); i++) + { + n = nodes[i]; + + l = n->stats_total.clocks - n->stats_last_clear.clocks; + n_clocks += l; + + v = n->stats_total.vectors - n->stats_last_clear.vectors; + c = n->stats_total.calls - n->stats_last_clear.calls; + + switch (n->type) + { + default: + continue; + + case VLIB_NODE_TYPE_INTERNAL: + n_output += (n->flags & VLIB_NODE_FLAG_IS_OUTPUT) ? v : 0; + n_drop += (n->flags & VLIB_NODE_FLAG_IS_DROP) ? v : 0; + n_punt += (n->flags & VLIB_NODE_FLAG_IS_PUNT) ? 
v : 0; + if (!(n->flags & VLIB_NODE_FLAG_IS_OUTPUT)) + { + n_internal_vectors += v; + n_internal_calls += c; + } + if (n->flags & VLIB_NODE_FLAG_IS_HANDOFF) + n_input += v; + break; + + case VLIB_NODE_TYPE_INPUT: + n_input += v; + break; + } + } + + if (vec_len (vlib_mains)) + { + vlib_worker_thread_t *w = vlib_worker_threads + j; + if (j > 0) + vlib_cli_output (vm, "---------------"); + + if (w->lcore_id > -1) + vlib_cli_output (vm, "Thread %d %s (lcore %u)", j, w->name, + w->lcore_id); + else + vlib_cli_output (vm, "Thread %d %s", j, w->name); + } + + dt = time_now - nm->time_last_runtime_stats_clear; + vlib_cli_output + (vm, + "Time %.1f, average vectors/node %.2f, last %d main loops %.2f per node %.2f" + "\n vector rates in %.4e, out %.4e, drop %.4e, punt %.4e", + dt, + (n_internal_calls > 0 + ? (f64) n_internal_vectors / (f64) n_internal_calls + : 0), + 1 << VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE, + vectors_per_main_loop[j], + last_vector_length_per_node[j], + (f64) n_input / dt, + (f64) n_output / dt, (f64) n_drop / dt, (f64) n_punt / dt); + + vlib_cli_output (vm, "%U", format_vlib_node_stats, stat_vm, 0, max); + for (i = 0; i < vec_len (nodes); i++) + { + c = + nodes[i]->stats_total.calls - + nodes[i]->stats_last_clear.calls; + d = + nodes[i]->stats_total.suspends - + nodes[i]->stats_last_clear.suspends; + if (c || d || !brief) + { + vlib_cli_output (vm, "%U", format_vlib_node_stats, stat_vm, + nodes[i], max); + } + } + vec_free (nodes); + } + vec_free (stat_vms); + vec_free (node_dups); + vec_free (vectors_per_main_loop); + vec_free (last_vector_length_per_node); + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_node_runtime_command, static) = { + .path = "show runtime", + .short_help = "Show packet processing runtime", + .function = show_node_runtime, + .is_mp_safe = 1, +}; +/* *INDENT-ON* */ + +static clib_error_t * +clear_node_runtime (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_node_main_t *nm; + 
vlib_node_t *n; + int i, j; + vlib_main_t **stat_vms = 0, *stat_vm; + vlib_node_runtime_t *r; + + if (vec_len (vlib_mains) == 0) + vec_add1 (stat_vms, vm); + else + { + for (i = 0; i < vec_len (vlib_mains); i++) + { + stat_vm = vlib_mains[i]; + if (stat_vm) + vec_add1 (stat_vms, stat_vm); + } + } + + vlib_worker_thread_barrier_sync (vm); + + for (j = 0; j < vec_len (stat_vms); j++) + { + stat_vm = stat_vms[j]; + nm = &stat_vm->node_main; + + for (i = 0; i < vec_len (nm->nodes); i++) + { + n = nm->nodes[i]; + vlib_node_sync_stats (stat_vm, n); + n->stats_last_clear = n->stats_total; + + r = vlib_node_get_runtime (stat_vm, n->index); + r->max_clock = 0; + } + /* Note: input/output rates computed using vlib_global_main */ + nm->time_last_runtime_stats_clear = vlib_time_now (vm); + } + + vlib_worker_thread_barrier_release (vm); + + vec_free (stat_vms); + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (clear_node_runtime_command, static) = { + .path = "clear runtime", + .short_help = "Clear packet processing runtime statistics", + .function = clear_node_runtime, +}; +/* *INDENT-ON* */ + +/* Dummy function to get us linked in. */ +void +vlib_node_cli_reference (void) +{ +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/node_format.c b/src/vlib/node_format.c new file mode 100644 index 00000000000..e9dde40fa70 --- /dev/null +++ b/src/vlib/node_format.c @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * node_format.c: node formatting + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vlib/vlib.h> + +u8 * +format_vlib_node_graph (u8 * s, va_list * va) +{ + vlib_node_main_t *nm = va_arg (*va, vlib_node_main_t *); + vlib_node_t *n = va_arg (*va, vlib_node_t *); + int i, j; + uword indent; + typedef struct + { + u32 next_node; + u32 next_slot; + u32 prev_node; + } tmp_t; + tmp_t *tmps = 0; + tmp_t empty = {.next_node = ~0,.prev_node = ~0 }; + + if (!n) + return format (s, "%=26s%=26s%=26s", "Name", "Next", "Previous"); + + s = format (s, "%-26v", n->name); + + indent = format_get_indent (s); + + for (i = j = 0; i < vec_len (n->next_nodes); i++) + { + if (n->next_nodes[i] == VLIB_INVALID_NODE_INDEX) + continue; + vec_validate_init_empty (tmps, j, empty); + tmps[j].next_node = n->next_nodes[i]; + tmps[j].next_slot = i; + j++; + } + + j = 0; + /* *INDENT-OFF* */ + clib_bitmap_foreach (i, n->prev_node_bitmap, ({ + vec_validate_init_empty (tmps, j, empty); + tmps[j].prev_node = i; + j++; + })); + /* *INDENT-ON* */ + + for (i = 0; i < vec_len (tmps); i++) + { + if (i > 0) + s = format (s, "\n%U", format_white_space, indent); + + if (tmps[i].next_node != ~0) + { + vlib_node_t *x; + u8 *t = 0; + + x = vec_elt (nm->nodes, tmps[i].next_node); + t = format (t, "%v [%d]", x->name, tmps[i].next_slot); + s = format (s, "%=26v", t); + vec_free (t); + } + else + s = format (s, "%26s", ""); + + if (tmps[i].prev_node != ~0) + { + vlib_node_t *x; + x = vec_elt (nm->nodes, tmps[i].prev_node); + s = format (s, "%=26v", x->name); + } + } + + vec_free (tmps); + + return s; +} + +u8 * +format_vlib_node_and_next (u8 * s, va_list * va) +{ + vlib_main_t *vm = va_arg (*va, vlib_main_t *); + vlib_node_t *n = va_arg (*va, vlib_node_t *); + u32 next_index = va_arg (*va, u32); + vlib_node_t *n_next; + u32 *ni; + + ni = vec_elt_at_index (n->next_nodes, next_index); + n_next = vlib_get_node (vm, ni[0]); + return format (s, "%v -> %v", n->name, n_next->name); +} + +u8 * +format_vlib_node_name (u8 * s, va_list * va) +{ + vlib_main_t *vm = va_arg (*va, vlib_main_t 
*); + u32 node_index = va_arg (*va, u32); + vlib_node_t *n = vlib_get_node (vm, node_index); + + return format (s, "%v", n->name); +} + +u8 * +format_vlib_next_node_name (u8 * s, va_list * va) +{ + vlib_main_t *vm = va_arg (*va, vlib_main_t *); + u32 node_index = va_arg (*va, u32); + u32 next_index = va_arg (*va, u32); + vlib_node_t *next = vlib_get_next_node (vm, node_index, next_index); + return format (s, "%v", next->name); +} + +/* Parse node name -> node index. */ +uword +unformat_vlib_node (unformat_input_t * input, va_list * args) +{ + vlib_main_t *vm = va_arg (*args, vlib_main_t *); + u32 *result = va_arg (*args, u32 *); + + return unformat_user (input, unformat_hash_vec_string, + vm->node_main.node_by_name, result); +} + +u8 * +format_vlib_time (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + f64 time = va_arg (*va, f64); + return format (s, "%12.4f", time); +} + +u8 * +format_vlib_cpu_time (u8 * s, va_list * va) +{ + vlib_main_t *vm = va_arg (*va, vlib_main_t *); + u64 cpu_time = va_arg (*va, u64); + f64 dt; + + dt = + (cpu_time - + vm->clib_time.init_cpu_time) * vm->clib_time.seconds_per_clock; + return format (s, "%U", format_vlib_time, vm, dt); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h new file mode 100644 index 00000000000..2116739602e --- /dev/null +++ b/src/vlib/node_funcs.h @@ -0,0 +1,1130 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * node_funcs.h: processing nodes global functions/inlines + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** \file + vlib node functions +*/ + + +#ifndef included_vlib_node_funcs_h +#define included_vlib_node_funcs_h + +#include <vppinfra/fifo.h> + +/** \brief Get vlib node by index. + @warning This function will ASSERT if @c i is out of range. + @param vm vlib_main_t pointer, varies by thread + @param i node index. 
+ @return pointer to the requested vlib_node_t. +*/ + +always_inline vlib_node_t * +vlib_get_node (vlib_main_t * vm, u32 i) +{ + return vec_elt (vm->node_main.nodes, i); +} + +/** \brief Get vlib node by graph arc (next) index. + @param vm vlib_main_t pointer, varies by thread + @param node_index index of original node + @param next_index graph arc index + @return pointer to the vlib_node_t at the end of the indicated arc +*/ + +always_inline vlib_node_t * +vlib_get_next_node (vlib_main_t * vm, u32 node_index, u32 next_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n; + + n = vec_elt (nm->nodes, node_index); + ASSERT (next_index < vec_len (n->next_nodes)); + return vlib_get_node (vm, n->next_nodes[next_index]); +} + +/** \brief Get node runtime by node index. + @param vm vlib_main_t pointer, varies by thread + @param node_index index of node + @return pointer to the indicated vlib_node_runtime_t +*/ + +always_inline vlib_node_runtime_t * +vlib_node_get_runtime (vlib_main_t * vm, u32 node_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n = vec_elt (nm->nodes, node_index); + vlib_process_t *p; + if (n->type != VLIB_NODE_TYPE_PROCESS) + return vec_elt_at_index (nm->nodes_by_type[n->type], n->runtime_index); + else + { + p = vec_elt (nm->processes, n->runtime_index); + return &p->node_runtime; + } +} + +/** \brief Get node runtime private data by node index. + @param vm vlib_main_t pointer, varies by thread + @param node_index index of the node + @return pointer to the indicated vlib_node_runtime_t private data +*/ + +always_inline void * +vlib_node_get_runtime_data (vlib_main_t * vm, u32 node_index) +{ + vlib_node_runtime_t *r = vlib_node_get_runtime (vm, node_index); + return r->runtime_data; +} + +/** \brief Set node runtime private data. 
+ @param vm vlib_main_t pointer, varies by thread + @param node_index index of the node + @param runtime_data arbitrary runtime private data + @param n_runtime_data_bytes size of runtime private data +*/ + +always_inline void +vlib_node_set_runtime_data (vlib_main_t * vm, u32 node_index, + void *runtime_data, u32 n_runtime_data_bytes) +{ + vlib_node_t *n = vlib_get_node (vm, node_index); + vlib_node_runtime_t *r = vlib_node_get_runtime (vm, node_index); + + n->runtime_data_bytes = n_runtime_data_bytes; + vec_free (n->runtime_data); + vec_add (n->runtime_data, runtime_data, n_runtime_data_bytes); + + ASSERT (vec_len (n->runtime_data) <= sizeof (vlib_node_runtime_t) - + STRUCT_OFFSET_OF (vlib_node_runtime_t, runtime_data)); + + if (vec_len (n->runtime_data) > 0) + clib_memcpy (r->runtime_data, n->runtime_data, vec_len (n->runtime_data)); +} + +/** \brief Set node dispatch state. + @param vm vlib_main_t pointer, varies by thread + @param node_index index of the node + @param new_state new state for node, see vlib_node_state_t +*/ +always_inline void +vlib_node_set_state (vlib_main_t * vm, u32 node_index, + vlib_node_state_t new_state) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n; + vlib_node_runtime_t *r; + + n = vec_elt (nm->nodes, node_index); + if (n->type == VLIB_NODE_TYPE_PROCESS) + { + vlib_process_t *p = vec_elt (nm->processes, n->runtime_index); + r = &p->node_runtime; + + /* When disabling make sure flags are cleared. 
*/ + p->flags &= ~(VLIB_PROCESS_RESUME_PENDING + | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK + | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT); + } + else + r = vec_elt_at_index (nm->nodes_by_type[n->type], n->runtime_index); + + ASSERT (new_state < VLIB_N_NODE_STATE); + + if (n->type == VLIB_NODE_TYPE_INPUT) + { + ASSERT (nm->input_node_counts_by_state[n->state] > 0); + nm->input_node_counts_by_state[n->state] -= 1; + nm->input_node_counts_by_state[new_state] += 1; + } + + n->state = new_state; + r->state = new_state; +} + +always_inline void +vlib_node_set_interrupt_pending (vlib_main_t * vm, u32 node_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n = vec_elt (nm->nodes, node_index); + ASSERT (n->type == VLIB_NODE_TYPE_INPUT); + vec_add1 (nm->pending_interrupt_node_runtime_indices, n->runtime_index); +} + +always_inline vlib_process_t * +vlib_get_process_from_node (vlib_main_t * vm, vlib_node_t * node) +{ + vlib_node_main_t *nm = &vm->node_main; + ASSERT (node->type == VLIB_NODE_TYPE_PROCESS); + return vec_elt (nm->processes, node->runtime_index); +} + +/* Fetches frame with given handle. */ +always_inline vlib_frame_t * +vlib_get_frame_no_check (vlib_main_t * vm, uword frame_index) +{ + vlib_frame_t *f; + u32 cpu_index = frame_index & VLIB_CPU_MASK; + u32 offset = frame_index & VLIB_OFFSET_MASK; + vm = vlib_mains ? vlib_mains[cpu_index] : vm; + f = vm->heap_base + offset; + return f; +} + +always_inline u32 +vlib_frame_index_no_check (vlib_main_t * vm, vlib_frame_t * f) +{ + u32 i; + + ASSERT (((uword) f & VLIB_CPU_MASK) == 0); + + vm = vlib_mains ? 
vlib_mains[f->cpu_index] : vm; + + i = ((u8 *) f - (u8 *) vm->heap_base); + return i | f->cpu_index; +} + +always_inline vlib_frame_t * +vlib_get_frame (vlib_main_t * vm, uword frame_index) +{ + vlib_frame_t *f = vlib_get_frame_no_check (vm, frame_index); + ASSERT (f->flags & VLIB_FRAME_IS_ALLOCATED); + return f; +} + +always_inline u32 +vlib_frame_index (vlib_main_t * vm, vlib_frame_t * f) +{ + uword i = vlib_frame_index_no_check (vm, f); + ASSERT (vlib_get_frame (vm, i) == f); + return i; +} + +/* Byte alignment for vector arguments. */ +#define VLIB_FRAME_VECTOR_ALIGN (1 << 4) + +always_inline u32 +vlib_frame_vector_byte_offset (u32 scalar_size) +{ + return round_pow2 (sizeof (vlib_frame_t) + scalar_size, + VLIB_FRAME_VECTOR_ALIGN); +} + +/** \brief Get pointer to frame vector data. + @param f vlib_frame_t pointer + @return pointer to first vector element in frame +*/ +always_inline void * +vlib_frame_vector_args (vlib_frame_t * f) +{ + return (void *) f + vlib_frame_vector_byte_offset (f->scalar_size); +} + +/** \brief Get pointer to frame scalar data. + + @warning This is almost certainly not the function you wish to call. + See @ref vlib_frame_vector_args instead. 
+ + @param f vlib_frame_t pointer + + @return arbitrary node scalar data + + @sa vlib_frame_vector_args +*/ +always_inline void * +vlib_frame_args (vlib_frame_t * f) +{ + return vlib_frame_vector_args (f) - f->scalar_size; +} + +always_inline vlib_next_frame_t * +vlib_node_runtime_get_next_frame (vlib_main_t * vm, + vlib_node_runtime_t * n, u32 next_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_next_frame_t *nf; + + ASSERT (next_index < n->n_next_nodes); + nf = vec_elt_at_index (nm->next_frames, n->next_frame_index + next_index); + + if (CLIB_DEBUG > 0) + { + vlib_node_t *node, *next; + node = vec_elt (nm->nodes, n->node_index); + next = vec_elt (nm->nodes, node->next_nodes[next_index]); + ASSERT (nf->node_runtime_index == next->runtime_index); + } + + return nf; +} + +/** \brief Get pointer to frame by (@c node_index, @c next_index). + + @warning This is not a function that you should call directly. + See @ref vlib_get_next_frame instead. + + @param vm vlib_main_t pointer, varies by thread + @param node_index index of the node + @param next_index graph arc index + + @return pointer to the requested vlib_next_frame_t + + @sa vlib_get_next_frame +*/ + +always_inline vlib_next_frame_t * +vlib_node_get_next_frame (vlib_main_t * vm, u32 node_index, u32 next_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n; + vlib_node_runtime_t *r; + + n = vec_elt (nm->nodes, node_index); + r = vec_elt_at_index (nm->nodes_by_type[n->type], n->runtime_index); + return vlib_node_runtime_get_next_frame (vm, r, next_index); +} + +vlib_frame_t *vlib_get_next_frame_internal (vlib_main_t * vm, + vlib_node_runtime_t * node, + u32 next_index, + u32 alloc_new_frame); + +#define vlib_get_next_frame_macro(vm,node,next_index,vectors,n_vectors_left,alloc_new_frame) \ +do { \ + vlib_frame_t * _f \ + = vlib_get_next_frame_internal ((vm), (node), (next_index), \ + (alloc_new_frame)); \ + u32 _n = _f->n_vectors; \ + (vectors) = vlib_frame_vector_args (_f) + _n * sizeof 
((vectors)[0]); \ + (n_vectors_left) = VLIB_FRAME_SIZE - _n; \ +} while (0) + + +/** \brief Get pointer to next frame vector data by + (@c vlib_node_runtime_t, @c next_index). + Standard single/dual loop boilerplate element. + @attention This is a MACRO, with SIDE EFFECTS. + + @param vm vlib_main_t pointer, varies by thread + @param node current node vlib_node_runtime_t pointer + @param next_index requested graph arc index + + @return @c vectors -- pointer to next available vector slot + @return @c n_vectors_left -- number of vector slots available +*/ +#define vlib_get_next_frame(vm,node,next_index,vectors,n_vectors_left) \ + vlib_get_next_frame_macro (vm, node, next_index, \ + vectors, n_vectors_left, \ + /* alloc new frame */ 0) + +#define vlib_get_new_next_frame(vm,node,next_index,vectors,n_vectors_left) \ + vlib_get_next_frame_macro (vm, node, next_index, \ + vectors, n_vectors_left, \ + /* alloc new frame */ 1) + +/** \brief Release pointer to next frame vector data. + Standard single/dual loop boilerplate element. + @param vm vlib_main_t pointer, varies by thread + @param r current node vlib_node_runtime_t pointer + @param next_index graph arc index + @param n_packets_left number of slots still available in vector +*/ +void +vlib_put_next_frame (vlib_main_t * vm, + vlib_node_runtime_t * r, + u32 next_index, u32 n_packets_left); + +/* Combination get plus put. Returns vector argument just added. 
*/ +#define vlib_set_next_frame(vm,node,next_index,v) \ +({ \ + uword _n_left; \ + vlib_get_next_frame ((vm), (node), (next_index), (v), _n_left); \ + ASSERT (_n_left > 0); \ + vlib_put_next_frame ((vm), (node), (next_index), _n_left - 1); \ + (v); \ +}) + +always_inline void +vlib_set_next_frame_buffer (vlib_main_t * vm, + vlib_node_runtime_t * node, + u32 next_index, u32 buffer_index) +{ + u32 *p; + p = vlib_set_next_frame (vm, node, next_index, p); + p[0] = buffer_index; +} + +vlib_frame_t *vlib_get_frame_to_node (vlib_main_t * vm, u32 to_node_index); +void vlib_put_frame_to_node (vlib_main_t * vm, u32 to_node_index, + vlib_frame_t * f); + +always_inline vlib_process_t * +vlib_get_current_process (vlib_main_t * vm) +{ + vlib_node_main_t *nm = &vm->node_main; + return vec_elt (nm->processes, nm->current_process_index); +} + +always_inline uword +vlib_in_process_context (vlib_main_t * vm) +{ + return vm->node_main.current_process_index != ~0; +} + +always_inline uword +vlib_current_process (vlib_main_t * vm) +{ + return vlib_get_current_process (vm)->node_runtime.node_index; +} + +/** Returns TRUE if a process suspend time is less than 1us + @param dt - remaining poll time in seconds + @returns 1 if dt < 1e-6, 0 otherwise +*/ +always_inline uword +vlib_process_suspend_time_is_zero (f64 dt) +{ + return dt < 1e-6; +} + +/** Suspend a vlib cooperative multi-tasking thread for a period of time + @param vm - vlib_main_t * + @param dt - suspend interval in seconds + @returns VLIB_PROCESS_RESUME_LONGJMP_RESUME, routinely ignored +*/ + +always_inline uword +vlib_process_suspend (vlib_main_t * vm, f64 dt) +{ + uword r; + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p = vec_elt (nm->processes, nm->current_process_index); + u64 dt_cpu = dt * vm->clib_time.clocks_per_second; + + if (vlib_process_suspend_time_is_zero (dt)) + return VLIB_PROCESS_RESUME_LONGJMP_RESUME; + + p->flags |= VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK; + r = clib_setjmp (&p->resume_longjmp, 
VLIB_PROCESS_RESUME_LONGJMP_SUSPEND); + if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND) + { + p->resume_cpu_time = clib_cpu_time_now () + dt_cpu; + clib_longjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_SUSPEND); + } + + return r; +} + +always_inline void +vlib_process_free_event_type (vlib_process_t * p, uword t, + uword is_one_time_event) +{ + ASSERT (!pool_is_free_index (p->event_type_pool, t)); + pool_put_index (p->event_type_pool, t); + if (is_one_time_event) + p->one_time_event_type_bitmap = + clib_bitmap_andnoti (p->one_time_event_type_bitmap, t); +} + +always_inline void +vlib_process_maybe_free_event_type (vlib_process_t * p, uword t) +{ + ASSERT (!pool_is_free_index (p->event_type_pool, t)); + if (clib_bitmap_get (p->one_time_event_type_bitmap, t)) + vlib_process_free_event_type (p, t, /* is_one_time_event */ 1); +} + +always_inline void * +vlib_process_get_event_data (vlib_main_t * vm, + uword * return_event_type_opaque) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p; + vlib_process_event_type_t *et; + uword t, l; + void *event_data_vector; + + p = vec_elt (nm->processes, nm->current_process_index); + + /* Find first type with events ready. + Return invalid type when there's nothing there. */ + t = clib_bitmap_first_set (p->non_empty_event_type_bitmap); + if (t == ~0) + return 0; + + p->non_empty_event_type_bitmap = + clib_bitmap_andnoti (p->non_empty_event_type_bitmap, t); + + l = _vec_len (p->pending_event_data_by_type_index[t]); + ASSERT (l > 0); + event_data_vector = p->pending_event_data_by_type_index[t]; + p->pending_event_data_by_type_index[t] = 0; + + et = pool_elt_at_index (p->event_type_pool, t); + + /* Return user's opaque value and possibly index. */ + *return_event_type_opaque = et->opaque; + + vlib_process_maybe_free_event_type (p, t); + + return event_data_vector; +} + +/* Return event data vector for later reuse. We reuse event data to avoid + repeatedly allocating event vectors in cases where we care about speed. 
*/ +always_inline void +vlib_process_put_event_data (vlib_main_t * vm, void *event_data) +{ + vlib_node_main_t *nm = &vm->node_main; + vec_add1 (nm->recycled_event_data_vectors, event_data); +} + +/** Return the first event type which has occurred and a vector of per-event + data of that type, or a timeout indication + + @param vm - vlib_main_t pointer + @param data_vector - pointer to a (uword *) vector to receive event data + @returns either an event type and a vector of per-event instance data, + or ~0 to indicate a timeout. +*/ + +always_inline uword +vlib_process_get_events (vlib_main_t * vm, uword ** data_vector) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p; + vlib_process_event_type_t *et; + uword r, t, l; + + p = vec_elt (nm->processes, nm->current_process_index); + + /* Find first type with events ready. + Return invalid type when there's nothing there. */ + t = clib_bitmap_first_set (p->non_empty_event_type_bitmap); + if (t == ~0) + return t; + + p->non_empty_event_type_bitmap = + clib_bitmap_andnoti (p->non_empty_event_type_bitmap, t); + + l = _vec_len (p->pending_event_data_by_type_index[t]); + if (data_vector) + vec_add (*data_vector, p->pending_event_data_by_type_index[t], l); + _vec_len (p->pending_event_data_by_type_index[t]) = 0; + + et = pool_elt_at_index (p->event_type_pool, t); + + /* Return user's opaque value. */ + r = et->opaque; + + vlib_process_maybe_free_event_type (p, t); + + return r; +} + +always_inline uword +vlib_process_get_events_helper (vlib_process_t * p, uword t, + uword ** data_vector) +{ + uword l; + + p->non_empty_event_type_bitmap = + clib_bitmap_andnoti (p->non_empty_event_type_bitmap, t); + + l = _vec_len (p->pending_event_data_by_type_index[t]); + if (data_vector) + vec_add (*data_vector, p->pending_event_data_by_type_index[t], l); + _vec_len (p->pending_event_data_by_type_index[t]) = 0; + + vlib_process_maybe_free_event_type (p, t); + + return l; +} + +/* As above but query as specified type of event. 
Returns number of + events found. */ +always_inline uword +vlib_process_get_events_with_type (vlib_main_t * vm, uword ** data_vector, + uword with_type_opaque) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p; + uword t, *h; + + p = vec_elt (nm->processes, nm->current_process_index); + h = hash_get (p->event_type_index_by_type_opaque, with_type_opaque); + if (!h) + /* This can happen when an event has not yet been + signaled with given opaque type. */ + return 0; + + t = h[0]; + if (!clib_bitmap_get (p->non_empty_event_type_bitmap, t)) + return 0; + + return vlib_process_get_events_helper (p, t, data_vector); +} + +always_inline uword * +vlib_process_wait_for_event (vlib_main_t * vm) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p; + uword r; + + p = vec_elt (nm->processes, nm->current_process_index); + if (clib_bitmap_is_zero (p->non_empty_event_type_bitmap)) + { + p->flags |= VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT; + r = + clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND); + if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND) + clib_longjmp (&p->return_longjmp, + VLIB_PROCESS_RETURN_LONGJMP_SUSPEND); + } + + return p->non_empty_event_type_bitmap; +} + +always_inline uword +vlib_process_wait_for_one_time_event (vlib_main_t * vm, + uword ** data_vector, + uword with_type_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p; + uword r; + + p = vec_elt (nm->processes, nm->current_process_index); + ASSERT (!pool_is_free_index (p->event_type_pool, with_type_index)); + while (!clib_bitmap_get (p->non_empty_event_type_bitmap, with_type_index)) + { + p->flags |= VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT; + r = + clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND); + if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND) + clib_longjmp (&p->return_longjmp, + VLIB_PROCESS_RETURN_LONGJMP_SUSPEND); + } + + return vlib_process_get_events_helper (p, with_type_index, data_vector); +} + +always_inline 
uword +vlib_process_wait_for_event_with_type (vlib_main_t * vm, + uword ** data_vector, + uword with_type_opaque) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p; + uword r, *h; + + p = vec_elt (nm->processes, nm->current_process_index); + h = hash_get (p->event_type_index_by_type_opaque, with_type_opaque); + while (!h || !clib_bitmap_get (p->non_empty_event_type_bitmap, h[0])) + { + p->flags |= VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT; + r = + clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND); + if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND) + clib_longjmp (&p->return_longjmp, + VLIB_PROCESS_RETURN_LONGJMP_SUSPEND); + + /* See if unknown event type has been signaled now. */ + if (!h) + h = hash_get (p->event_type_index_by_type_opaque, with_type_opaque); + } + + return vlib_process_get_events_helper (p, h[0], data_vector); +} + +/** Suspend a cooperative multi-tasking thread + Waits for an event, or for the indicated number of seconds to elapse + @param vm - vlib_main_t pointer + @param dt - timeout, in seconds. + @returns the remaining time interval +*/ + +always_inline f64 +vlib_process_wait_for_event_or_clock (vlib_main_t * vm, f64 dt) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p; + f64 wakeup_time; + uword r; + + p = vec_elt (nm->processes, nm->current_process_index); + + if (vlib_process_suspend_time_is_zero (dt) + || !clib_bitmap_is_zero (p->non_empty_event_type_bitmap)) + return dt; + + wakeup_time = vlib_time_now (vm) + dt; + + /* Suspend waiting for both clock and event to occur. 
*/ + p->flags |= (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT + | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK); + + r = clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND); + if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND) + { + p->resume_cpu_time = (clib_cpu_time_now () + + (dt * vm->clib_time.clocks_per_second)); + clib_longjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_SUSPEND); + } + + /* Return amount of time still left to sleep. + If <= 0 then we've been waken up by the clock (and not an event). */ + return wakeup_time - vlib_time_now (vm); +} + +always_inline vlib_process_event_type_t * +vlib_process_new_event_type (vlib_process_t * p, uword with_type_opaque) +{ + vlib_process_event_type_t *et; + pool_get (p->event_type_pool, et); + et->opaque = with_type_opaque; + return et; +} + +always_inline uword +vlib_process_create_one_time_event (vlib_main_t * vm, uword node_index, + uword with_type_opaque) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n = vlib_get_node (vm, node_index); + vlib_process_t *p = vec_elt (nm->processes, n->runtime_index); + vlib_process_event_type_t *et; + uword t; + + et = vlib_process_new_event_type (p, with_type_opaque); + t = et - p->event_type_pool; + p->one_time_event_type_bitmap = + clib_bitmap_ori (p->one_time_event_type_bitmap, t); + return t; +} + +always_inline void +vlib_process_delete_one_time_event (vlib_main_t * vm, uword node_index, + uword t) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n = vlib_get_node (vm, node_index); + vlib_process_t *p = vec_elt (nm->processes, n->runtime_index); + + ASSERT (clib_bitmap_get (p->one_time_event_type_bitmap, t)); + vlib_process_free_event_type (p, t, /* is_one_time_event */ 1); +} + +always_inline void * +vlib_process_signal_event_helper (vlib_node_main_t * nm, + vlib_node_t * n, + vlib_process_t * p, + uword t, + uword n_data_elts, uword n_data_elt_bytes) +{ + uword p_flags, add_to_pending, delete_from_wheel; + void 
*data_to_be_written_by_caller; + + ASSERT (!pool_is_free_index (p->event_type_pool, t)); + + vec_validate (p->pending_event_data_by_type_index, t); + + /* Resize data vector and return caller's data to be written. */ + { + void *data_vec = p->pending_event_data_by_type_index[t]; + uword l; + + if (!data_vec && vec_len (nm->recycled_event_data_vectors)) + { + data_vec = vec_pop (nm->recycled_event_data_vectors); + _vec_len (data_vec) = 0; + } + + l = vec_len (data_vec); + + data_vec = _vec_resize (data_vec, + /* length_increment */ n_data_elts, + /* total size after increment */ + (l + n_data_elts) * n_data_elt_bytes, + /* header_bytes */ 0, /* data_align */ 0); + + p->pending_event_data_by_type_index[t] = data_vec; + data_to_be_written_by_caller = data_vec + l * n_data_elt_bytes; + } + + p->non_empty_event_type_bitmap = + clib_bitmap_ori (p->non_empty_event_type_bitmap, t); + + p_flags = p->flags; + + /* Event was already signalled? */ + add_to_pending = (p_flags & VLIB_PROCESS_RESUME_PENDING) == 0; + + /* Process will resume when suspend time elapses? */ + delete_from_wheel = 0; + if (p_flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK) + { + /* Waiting for both event and clock? */ + if (p_flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT) + delete_from_wheel = 1; + else + /* Waiting only for clock. Event will be queue and may be + handled when timer expires. */ + add_to_pending = 0; + } + + /* Never add current process to pending vector since current process is + already running. 
*/ + add_to_pending &= nm->current_process_index != n->runtime_index; + + if (add_to_pending) + { + u32 x = vlib_timing_wheel_data_set_suspended_process (n->runtime_index); + p->flags = p_flags | VLIB_PROCESS_RESUME_PENDING; + vec_add1 (nm->data_from_advancing_timing_wheel, x); + if (delete_from_wheel) + timing_wheel_delete (&nm->timing_wheel, x); + } + + return data_to_be_written_by_caller; +} + +always_inline void * +vlib_process_signal_event_data (vlib_main_t * vm, + uword node_index, + uword type_opaque, + uword n_data_elts, uword n_data_elt_bytes) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n = vlib_get_node (vm, node_index); + vlib_process_t *p = vec_elt (nm->processes, n->runtime_index); + uword *h, t; + + h = hash_get (p->event_type_index_by_type_opaque, type_opaque); + if (!h) + { + vlib_process_event_type_t *et = + vlib_process_new_event_type (p, type_opaque); + t = et - p->event_type_pool; + hash_set (p->event_type_index_by_type_opaque, type_opaque, t); + } + else + t = h[0]; + + return vlib_process_signal_event_helper (nm, n, p, t, n_data_elts, + n_data_elt_bytes); +} + +always_inline void * +vlib_process_signal_event_at_time (vlib_main_t * vm, + f64 dt, + uword node_index, + uword type_opaque, + uword n_data_elts, uword n_data_elt_bytes) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n = vlib_get_node (vm, node_index); + vlib_process_t *p = vec_elt (nm->processes, n->runtime_index); + uword *h, t; + + h = hash_get (p->event_type_index_by_type_opaque, type_opaque); + if (!h) + { + vlib_process_event_type_t *et = + vlib_process_new_event_type (p, type_opaque); + t = et - p->event_type_pool; + hash_set (p->event_type_index_by_type_opaque, type_opaque, t); + } + else + t = h[0]; + + if (vlib_process_suspend_time_is_zero (dt)) + return vlib_process_signal_event_helper (nm, n, p, t, n_data_elts, + n_data_elt_bytes); + else + { + vlib_signal_timed_event_data_t *te; + u64 dt_cpu = dt * vm->clib_time.clocks_per_second; + + 
pool_get_aligned (nm->signal_timed_event_data_pool, te, sizeof (te[0])); + + te->n_data_elts = n_data_elts; + te->n_data_elt_bytes = n_data_elt_bytes; + te->n_data_bytes = n_data_elts * n_data_elt_bytes; + + /* Assert that structure fields are big enough. */ + ASSERT (te->n_data_elts == n_data_elts); + ASSERT (te->n_data_elt_bytes == n_data_elt_bytes); + ASSERT (te->n_data_bytes == n_data_elts * n_data_elt_bytes); + + te->process_node_index = n->runtime_index; + te->event_type_index = t; + + timing_wheel_insert (&nm->timing_wheel, clib_cpu_time_now () + dt_cpu, + vlib_timing_wheel_data_set_timed_event (te - + nm-> + signal_timed_event_data_pool)); + + /* Inline data big enough to hold event? */ + if (te->n_data_bytes < sizeof (te->inline_event_data)) + return te->inline_event_data; + else + { + te->event_data_as_vector = 0; + vec_resize (te->event_data_as_vector, te->n_data_bytes); + return te->event_data_as_vector; + } + } +} + +always_inline void * +vlib_process_signal_one_time_event_data (vlib_main_t * vm, + uword node_index, + uword type_index, + uword n_data_elts, + uword n_data_elt_bytes) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n = vlib_get_node (vm, node_index); + vlib_process_t *p = vec_elt (nm->processes, n->runtime_index); + return vlib_process_signal_event_helper (nm, n, p, type_index, n_data_elts, + n_data_elt_bytes); +} + +always_inline void +vlib_process_signal_event (vlib_main_t * vm, + uword node_index, uword type_opaque, uword data) +{ + uword *d = vlib_process_signal_event_data (vm, node_index, type_opaque, + 1 /* elts */ , sizeof (uword)); + d[0] = data; +} + +always_inline void +vlib_process_signal_event_pointer (vlib_main_t * vm, + uword node_index, + uword type_opaque, void *data) +{ + void **d = vlib_process_signal_event_data (vm, node_index, type_opaque, + 1 /* elts */ , sizeof (data)); + d[0] = data; +} + +always_inline void +vlib_process_signal_one_time_event (vlib_main_t * vm, + uword node_index, + uword type_index, 
uword data) +{ + uword *d = + vlib_process_signal_one_time_event_data (vm, node_index, type_index, + 1 /* elts */ , sizeof (uword)); + d[0] = data; +} + +always_inline void +vlib_signal_one_time_waiting_process (vlib_main_t * vm, + vlib_one_time_waiting_process_t * p) +{ + vlib_process_signal_one_time_event (vm, p->node_index, p->one_time_event, + /* data */ ~0); + memset (p, ~0, sizeof (p[0])); +} + +always_inline void +vlib_signal_one_time_waiting_process_vector (vlib_main_t * vm, + vlib_one_time_waiting_process_t + ** wps) +{ + vlib_one_time_waiting_process_t *wp; + vec_foreach (wp, *wps) vlib_signal_one_time_waiting_process (vm, wp); + vec_free (*wps); +} + +always_inline void +vlib_current_process_wait_for_one_time_event (vlib_main_t * vm, + vlib_one_time_waiting_process_t + * p) +{ + p->node_index = vlib_current_process (vm); + p->one_time_event = vlib_process_create_one_time_event (vm, p->node_index, /* type opaque */ + ~0); + vlib_process_wait_for_one_time_event (vm, + /* don't care about data */ 0, + p->one_time_event); +} + +always_inline void +vlib_current_process_wait_for_one_time_event_vector (vlib_main_t * vm, + vlib_one_time_waiting_process_t + ** wps) +{ + vlib_one_time_waiting_process_t *wp; + vec_add2 (*wps, wp, 1); + vlib_current_process_wait_for_one_time_event (vm, wp); +} + +always_inline u32 +vlib_node_runtime_update_main_loop_vector_stats (vlib_main_t * vm, + vlib_node_runtime_t * node, + uword n_vectors) +{ + u32 i, d, vi0, vi1; + u32 i0, i1; + + ASSERT (is_pow2 (ARRAY_LEN (node->main_loop_vector_stats))); + i = ((vm->main_loop_count >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE) + & (ARRAY_LEN (node->main_loop_vector_stats) - 1)); + i0 = i ^ 0; + i1 = i ^ 1; + d = ((vm->main_loop_count >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE) + - + (node->main_loop_count_last_dispatch >> + VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE)); + vi0 = node->main_loop_vector_stats[i0]; + vi1 = node->main_loop_vector_stats[i1]; + vi0 = d == 0 ? vi0 : 0; + vi1 = d <= 1 ? 
vi1 : 0; + vi0 += n_vectors; + node->main_loop_vector_stats[i0] = vi0; + node->main_loop_vector_stats[i1] = vi1; + node->main_loop_count_last_dispatch = vm->main_loop_count; + /* Return previous counter. */ + return node->main_loop_vector_stats[i1]; +} + +always_inline f64 +vlib_node_vectors_per_main_loop_as_float (vlib_main_t * vm, u32 node_index) +{ + vlib_node_runtime_t *rt = vlib_node_get_runtime (vm, node_index); + u32 v; + + v = vlib_node_runtime_update_main_loop_vector_stats (vm, rt, /* n_vectors */ + 0); + return (f64) v / (1 << VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE); +} + +always_inline u32 +vlib_node_vectors_per_main_loop_as_integer (vlib_main_t * vm, u32 node_index) +{ + vlib_node_runtime_t *rt = vlib_node_get_runtime (vm, node_index); + u32 v; + + v = vlib_node_runtime_update_main_loop_vector_stats (vm, rt, /* n_vectors */ + 0); + return v >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE; +} + +void +vlib_frame_free (vlib_main_t * vm, vlib_node_runtime_t * r, vlib_frame_t * f); + +/* Add next node to given node in given slot. */ +uword +vlib_node_add_next_with_slot (vlib_main_t * vm, + uword node, uword next_node, uword slot); + +/* As above but adds to end of node's next vector. */ +always_inline uword +vlib_node_add_next (vlib_main_t * vm, uword node, uword next_node) +{ + return vlib_node_add_next_with_slot (vm, node, next_node, ~0); +} + +/* Add next node to given node in given slot. */ +uword +vlib_node_add_named_next_with_slot (vlib_main_t * vm, + uword node, char *next_name, uword slot); + +/* As above but adds to end of node's next vector. */ +always_inline uword +vlib_node_add_named_next (vlib_main_t * vm, uword node, char *name) +{ + return vlib_node_add_named_next_with_slot (vm, node, name, ~0); +} + +/* Query node given name. */ +vlib_node_t *vlib_get_node_by_name (vlib_main_t * vm, u8 * name); + +/* Rename a node. */ +void vlib_node_rename (vlib_main_t * vm, u32 node_index, char *fmt, ...); + +/* Register new packet processing node. 
Nodes can be registered + dynamically via this call or statically via the VLIB_REGISTER_NODE + macro. */ +u32 vlib_register_node (vlib_main_t * vm, vlib_node_registration_t * r); + +/* Register all static nodes registered via VLIB_REGISTER_NODE. */ +void vlib_register_all_static_nodes (vlib_main_t * vm); + +/* Start a process. */ +void vlib_start_process (vlib_main_t * vm, uword process_index); + +/* Sync up runtime and main node stats. */ +void vlib_node_sync_stats (vlib_main_t * vm, vlib_node_t * n); + +/* Node graph initialization function. */ +clib_error_t *vlib_node_main_init (vlib_main_t * vm); + +format_function_t format_vlib_node_graph; +format_function_t format_vlib_node_name; +format_function_t format_vlib_next_node_name; +format_function_t format_vlib_node_and_next; +format_function_t format_vlib_cpu_time; +format_function_t format_vlib_time; +/* Parse node name -> node index. */ +unformat_function_t unformat_vlib_node; + +always_inline void +vlib_node_increment_counter (vlib_main_t * vm, u32 node_index, + u32 counter_index, u64 increment) +{ + vlib_node_t *n = vlib_get_node (vm, node_index); + vlib_error_main_t *em = &vm->error_main; + u32 node_counter_base_index = n->error_heap_index; + em->counters[node_counter_base_index + counter_index] += increment; +} + +#endif /* included_vlib_node_funcs_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/parse.c b/src/vlib/parse.c new file mode 100644 index 00000000000..1c4500ce85a --- /dev/null +++ b/src/vlib/parse.c @@ -0,0 +1,1007 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vlib/parse.h> + +#define PARSE_DEBUG 0 + +u16 word_type_index, number_type_index, eof_type_index, rule_eof_type_index, + plus_type_index, minus_type_index, star_type_index, slash_type_index, + lpar_type_index, rpar_type_index; + +u8 * +format_vlib_parse_value (u8 * s, va_list * args) +{ + vlib_parse_main_t *pm = va_arg (*args, vlib_parse_main_t *); + vlib_parse_type_t *type; + vlib_parse_value_t *v; + u16 type_index; + + s = format (s, "%d items:\n", vec_len (pm->parse_value)); + vec_foreach (v, pm->parse_value) + { + type_index = v->type; + type = pool_elt_at_index (pm->parse_types, type_index); + if (type->format_value) + s = format (s, "[%d]: %U\n", v - pm->parse_value, + type->format_value, v); + else + s = format (s, "[%d]: (nofun)\n", v - pm->parse_value); + } + return s; +} + +static u8 * +format_vlib_parse_match (u8 * s, va_list * args) +{ + vlib_parse_match_t m = va_arg (*args, vlib_parse_match_t); + char *t = 0; + switch (m) + { +#define _(a) case VLIB_PARSE_##a: t = #a; break; + foreach_parse_match_type +#undef _ + default: + t = 0; + break; + } + + if (t) + return format (s, "%s", t); + else + return format (s, "unknown 0x%x", m); +} + +static u8 * +format_vlib_parse_item (u8 * s, va_list * args) +{ + vlib_parse_main_t *pm = va_arg (*args, vlib_parse_main_t *); + vlib_parse_item_t *item = va_arg (*args, vlib_parse_item_t *); + vlib_parse_type_t *type = pool_elt_at_index (pm->parse_types, item->type); + + if (item->type == word_type_index) + s = format (s, "%s", item->value.as_pointer); + else + s = format (s, "<%s>", 
type->name); + return s; +} + +static u8 * +format_vlib_parse_graph (u8 * s, va_list * args) +{ + vlib_parse_main_t *pm = va_arg (*args, vlib_parse_main_t *); + vlib_parse_graph_t *node = va_arg (*args, vlib_parse_graph_t *); + vlib_parse_item_t *item; + vlib_parse_type_t *type; + + /* $$$ hash table */ + /* *INDENT-OFF* */ + pool_foreach (type, pm->parse_types, + ({ + if (type->rule_index == node - pm->parse_graph) + s = format (s, "\n<%s>\n", type->name); + })); +/* *INDENT-ON* */ + + if (pm->root_index == (node - pm->parse_graph)) + s = format (s, "\n<root>\n"); + + item = pool_elt_at_index (pm->parse_items, node->item); + + s = format (s, "[%d] %U ", node - pm->parse_graph, + format_vlib_parse_item, pm, item); + + if (node->peer == (u32) ~ 0) + s = format (s, "peer nil "); + else + s = format (s, "peer %4u ", node->peer); + + if (node->deeper == (u32) ~ 0) + s = format (s, "deeper nil "); + else + s = format (s, "deeper %4u ", node->deeper); + + return s; +} + +void +dump_parse_graph (void) +{ + vlib_parse_main_t *pm = &vlib_parse_main; + vlib_parse_graph_t *node; + + /* *INDENT-OFF* */ + pool_foreach (node, pm->parse_graph, ({ + fformat(stdout, "%U\n", format_vlib_parse_graph, pm, node); + })); +/* *INDENT-ON* */ +} + +always_inline void +parse_cleanup_value (vlib_parse_main_t * pm, vlib_parse_value_t * pv) +{ + vlib_parse_type_t *type = pool_elt_at_index (pm->parse_types, pv->type); + if (type->value_cleanup_function) + type->value_cleanup_function (pv); +} + +static void +parse_reset (vlib_parse_main_t * pm, u8 * input) +{ + vlib_lex_token_t *t; + vlib_parse_value_t *pv; + + vlib_lex_reset (pm->lex_main, input); + + vec_foreach (t, pm->tokens) vlib_lex_cleanup_token (t); + + vec_foreach (pv, pm->parse_value) parse_cleanup_value (pm, pv); + + _vec_len (pm->parse_value) = 0; + _vec_len (pm->tokens) = 0; + pm->current_token_index = 0; +} + +static void +parse_help (vlib_parse_main_t * pm, u32 index) +{ + vlib_parse_graph_t *node; + vlib_parse_item_t *item; + 
vlib_parse_type_t *type; + vlib_main_t *vm = pm->vlib_main; + u8 *help_input; + int i; + + /* Work on a private copy of the lexer input so it can be edited in place. */ + help_input = vec_dup (pm->lex_main->input_vector); + + /* Cut the copy at the last '?' (the help request itself). */ + for (i = vec_len (help_input) - 1; i >= 0; i--) + if (help_input[i] == '?') + { + help_input[i] = 0; + _vec_len (help_input) = i; + break; + } + + /* Strip every trailing blank or tab; i ends on the last kept character, or -1 if none is left. */ + for (i = vec_len (help_input) - 1; i >= 0; i--) + { + if (help_input[i] != ' ' && help_input[i] != '\t') + break; + help_input[i] = 0; + } + _vec_len (help_input) = i + 1; + + /* Print one help line per peer at this level of the parse graph. */ + while (index != (u32) ~ 0) + { + node = pool_elt_at_index (pm->parse_graph, index); + item = pool_elt_at_index (pm->parse_items, node->item); + type = pool_elt_at_index (pm->parse_types, item->type); + + if (item->type == eof_type_index && vec_len (pm->match_items) == 0) + /* do nothing */ ; + else if (item->type == word_type_index) + vlib_cli_output (vm, "%s %s\n", help_input, item->value.as_pointer); + else + vlib_cli_output (vm, "%s <%s>\n", help_input, type->name); + index = node->peer; + } + vec_free (help_input); +} + +static vlib_parse_match_t +parse_eval_internal (vlib_parse_main_t * pm, u32 index) +{ + vlib_parse_graph_t *node; + vlib_parse_item_t *item; + vlib_parse_type_t *type; + vlib_parse_value_t value, *pv; + vlib_parse_match_t rv; + u32 *partial_matches = 0; + vlib_lex_token_t *t; + u32 save_token_index = (u32) ~ 0, save_match_items = 0; + int had_value = 0; + + if (pm->current_token_index >= vec_len (pm->tokens)) + return VLIB_PARSE_MATCH_FAIL; + + /* current token */ + t = vec_elt_at_index (pm->tokens, pm->current_token_index); + + /* Help ? 
*/ + if (PREDICT_FALSE (t->token == VLIB_LEX_qmark)) + { + parse_help (pm, index); + _vec_len (pm->match_items) = 0; + return VLIB_PARSE_MATCH_DONE; + } + + /* Across all peers at this level of the parse graph */ + while (index != (u32) ~ 0) + { + node = pool_elt_at_index (pm->parse_graph, index); + item = pool_elt_at_index (pm->parse_items, node->item); + type = pool_elt_at_index (pm->parse_types, item->type); + + /* + * Save the token index. We may have to back up several + * trie plies. Type-specific match functions can consume + * multiple tokens, and they may not be optimally careful + */ + save_token_index = pm->current_token_index; + save_match_items = vec_len (pm->match_items); + vec_add1 (pm->match_items, node->item); + + if (PARSE_DEBUG > 1) + clib_warning ("Try to match token %U against node %d", + format_vlib_lex_token, pm->lex_main, t, index); + + /* Call the type-specific match function */ + rv = type->match_function (pm, type, t, &value); + + if (PARSE_DEBUG > 1) + clib_warning ("returned %U", format_vlib_parse_match, rv); + + switch (rv) + { + case VLIB_PARSE_MATCH_VALUE: + /* + * Matched, and returned a value to append to the + * set of args passed to the action function + */ + value.type = item->type; + vec_add1 (pm->parse_value, value); + had_value = 1; + /* fallthrough */ + + case VLIB_PARSE_MATCH_FULL: + unambiguous_partial_match: + /* Consume the matched token */ + pm->current_token_index++; + + /* continue matching along this path */ + rv = parse_eval_internal (pm, node->deeper); + + /* this is not the right path */ + if (rv == VLIB_PARSE_MATCH_FAIL) + { + if (had_value) + { + /* Delete the value */ + value = pm->parse_value[vec_len (pm->parse_value) - 1]; + parse_cleanup_value (pm, &value); + _vec_len (pm->parse_value) -= 1; + /* + * The value pushed for this node is gone. Clear the flag so + * a later sibling that fails without pushing a value does + * not pop one that belongs to an outer level. + */ + had_value = 0; + } + /* Continue with the next sibling */ + pm->current_token_index = save_token_index; + _vec_len (pm->match_items) = save_match_items; + index = node->peer; + break; + } + return rv; + + case VLIB_PARSE_MATCH_PARTIAL: 
+ /* Partial (substring) match, remember it but keep going */ + vec_add1 (partial_matches, node - pm->parse_graph); + index = node->peer; + break; + + case VLIB_PARSE_MATCH_FAIL: + /* Continue with the next sibling */ + index = node->peer; + _vec_len (pm->match_items) = save_match_items; + break; + + case VLIB_PARSE_MATCH_DONE: + /* Parse complete, invoke the action function */ + if (PARSE_DEBUG > 0) + clib_warning ("parse_value: %U", format_vlib_parse_value, pm); + + { + vlib_parse_eval_function_t *f = item->value.as_pointer; + if (f) + rv = f (pm, item, pm->parse_value); + } + + vec_foreach (pv, pm->parse_value) parse_cleanup_value (pm, pv); + _vec_len (pm->parse_value) = 0; + _vec_len (pm->match_items) = 0; + return rv; + + case VLIB_PARSE_MATCH_AMBIGUOUS: + case VLIB_PARSE_MATCH_EVAL_FAIL: + case VLIB_PARSE_MATCH_RULE: + _vec_len (pm->match_items) = save_match_items; + return rv; + } + } + + /* + * Out of siblings. If we have exactly one partial match + * we win + */ + if (vec_len (partial_matches) == 1) + { + index = partial_matches[0]; + node = pool_elt_at_index (pm->parse_graph, index); + vec_free (partial_matches); + goto unambiguous_partial_match; + } + + /* Ordinary loser */ + rv = VLIB_PARSE_MATCH_FAIL; + + /* Ambiguous loser */ + if (vec_len (partial_matches) > 1) + { + vec_free (partial_matches); + rv = VLIB_PARSE_MATCH_AMBIGUOUS; + } + + _vec_len (pm->match_items) = save_match_items; + return rv; +} + +vlib_parse_match_t +rule_match (vlib_parse_main_t * pm, vlib_parse_type_t * type, + vlib_lex_token_t * t, vlib_parse_value_t * valuep) +{ + vlib_parse_match_t rv; + static int recursion_level; + + if (PARSE_DEBUG > 1) + clib_warning ("[%d]: try to match type %s graph index %d", + recursion_level, type->name, type->rule_index); + recursion_level++; + rv = parse_eval_internal (pm, type->rule_index); + recursion_level--; + + /* Break the recursive unwind here... 
*/ + if (rv == VLIB_PARSE_MATCH_RULE) + { + if (PARSE_DEBUG > 1) + clib_warning ("[%d]: type %s matched", recursion_level, type->name); + + return VLIB_PARSE_MATCH_FULL; + } + else + { + if (PARSE_DEBUG > 1) + clib_warning ("[%d]: type %s returns %U", recursion_level, type->name, + format_vlib_parse_match, rv); + } + return rv; +} + +static int +parse_eval (vlib_parse_main_t * pm, u8 * input) +{ + vlib_lex_token_t *t; + + parse_reset (pm, input); + + /* Tokenize the entire input vector */ + do + { + vec_add2 (pm->tokens, t, 1); + vlib_lex_get_token (pm->lex_main, t); + } + while (t->token != VLIB_LEX_eof); + + /* Feed it to the parser */ + return parse_eval_internal (pm, pm->root_index); +} + +/* Temporary vlib stub */ +vlib_parse_match_t +vlib_parse_eval (u8 * input) +{ + return parse_eval (&vlib_parse_main, input); +} + +u16 +parse_type_find_or_create (vlib_parse_main_t * pm, vlib_parse_type_t * t) +{ + uword *p; + vlib_parse_type_t *n; + u8 *name_copy; + + p = hash_get_mem (pm->parse_type_by_name_hash, t->name); + if (p) + return p[0]; + + pool_get (pm->parse_types, n); + *n = *t; + n->rule_index = (u32) ~ 0; + + name_copy = format (0, "%s%c", n->name, 0); + + hash_set_mem (pm->parse_type_by_name_hash, name_copy, n - pm->parse_types); + return n - pm->parse_types; +} + +u16 +parse_type_find_by_name (vlib_parse_main_t * pm, char *name) +{ + uword *p; + + p = hash_get_mem (pm->parse_type_by_name_hash, name); + if (p) + return p[0]; + + return (u16) ~ 0; +} + +u32 +parse_item_find_or_create (vlib_parse_main_t * pm, vlib_parse_item_t * item) +{ + uword *p; + vlib_parse_item_t *i; + + /* Exact match the entire item */ + p = mhash_get (&pm->parse_item_hash, item); + if (p) + return p[0]; + + pool_get (pm->parse_items, i); + *i = *item; + + mhash_set (&pm->parse_item_hash, i, i - pm->parse_items, 0); + return i - pm->parse_items; +} + +static void +parse_type_and_graph_init (vlib_parse_main_t * pm) +{ + u32 eof_index; + vlib_parse_type_t type; + vlib_parse_item_t item; 
+ + memset (&type, 0, sizeof (type)); + +#define foreach_token_type \ + _ (eof) \ + _ (rule_eof) \ + _ (word) \ + _ (number) \ + _ (plus) \ + _ (minus) \ + _ (star) \ + _ (slash) \ + _ (lpar) \ + _ (rpar) + +#define _(a) a##_type_index = parse_type_find_by_name (pm, #a); + foreach_token_type +#undef _ + memset (&item, 0, sizeof (item)); + item.type = eof_type_index; + + eof_index = parse_item_find_or_create (pm, &item); + pm->root_index = (u32) ~ 0; + +#if 0 + pool_get (pm->parse_graph, g); + memset (g, 0xff, sizeof (*g)); + g->item = eof_index; + pm->root_index = 0; +#endif +} + + + +static void +tokenize (vlib_parse_main_t * pm, parse_registration_t * pr) +{ + vlib_lex_token_t *t; + pm->register_input = format (pm->register_input, + "%s%c", pr->initializer, 0); + + parse_reset (pm, pm->register_input); + + do + { + vec_add2 (pm->tokens, t, 1); + vlib_lex_get_token (pm->lex_main, t); + } + while (t->token != VLIB_LEX_eof); + _vec_len (pm->register_input) = 0; +} + +static int +is_typed_rule (vlib_parse_main_t * pm) +{ + vlib_lex_token_t *t = vec_elt_at_index (pm->tokens, 0); + + /* <mytype> = blah blah blah */ + if (vec_len (pm->tokens) >= 4 + && t[0].token == VLIB_LEX_lt + && t[1].token == VLIB_LEX_word + && t[2].token == VLIB_LEX_gt && t[3].token == VLIB_LEX_equals) + return 1; + return 0; +} + +static int +token_matches_graph_node (vlib_parse_main_t * pm, + vlib_lex_token_t * t, + vlib_parse_graph_t * node, + vlib_parse_item_t * item, + vlib_parse_type_t * type, u32 * token_increment) +{ + /* EOFs don't match */ + if (t->token == VLIB_LEX_eof) + return 0; + + /* New chain element is a word */ + if (t->token == VLIB_LEX_word) + { + /* but the item in hand is not a word */ + if (item->type != word_type_index) + return 0; + + /* Or it's not this particular word */ + if (strcmp (t->value.as_pointer, item->value.as_pointer)) + return 0; + *token_increment = 1; + return 1; + } + /* New chain element is a type-name: < TYPE-NAME > */ + if (t->token == VLIB_LEX_lt) + { 
+ u16 token_type_index; + + /* < TYPE > */ + if (t[1].token != VLIB_LEX_word || t[2].token != VLIB_LEX_gt) + { + /* clib_warning takes the format string as its first argument. */ + clib_warning ("broken type name in '%s'", pm->register_input); + return 0; + } + + token_type_index = parse_type_find_by_name (pm, t[1].value.as_pointer); + if (token_type_index == (u16) ~ 0) + { + clib_warning ("unknown type '%s'", t[1].value.as_pointer); + return 0; + } + + /* It's a known type but does not match. */ + if (item->type != token_type_index) + return 0; + + *token_increment = 3; + return 1; + } + clib_warning ("BUG: t->token = %d", t->token); + return 0; +} + +u32 +generate_subgraph_from_tokens (vlib_parse_main_t * pm, + vlib_lex_token_t * t, + u32 * new_subgraph_depth, + parse_registration_t * pr, int not_a_rule) +{ + vlib_parse_graph_t *g, *last_g; + vlib_parse_item_t new_item; + u32 rv = (u32) ~ 0, new_item_index, last_index = (u32) ~ 0; + u16 token_type_index; + u32 depth = 0; + + while (t < pm->tokens + vec_len (pm->tokens)) + { + memset (&new_item, 0, sizeof (new_item)); + + if (t->token == VLIB_LEX_word) + { + new_item.type = word_type_index; + new_item.value.as_pointer = vec_dup ((u8 *) t->value.as_pointer); + new_item_index = parse_item_find_or_create (pm, &new_item); + t++; + } + else if (t->token == VLIB_LEX_lt) + { + if (t[1].token != VLIB_LEX_word || t[2].token != VLIB_LEX_gt) + { + clib_warning ("broken type name in '%s'", pm->register_input); + goto screwed; + } + token_type_index = parse_type_find_by_name (pm, + t[1].value.as_pointer); + if (token_type_index == (u16) ~ 0) + { + clib_warning ("unknown type 2 '%s'", t[1].value.as_pointer); + goto screwed; + } + + new_item.type = token_type_index; + new_item.value.as_pointer = 0; + new_item_index = parse_item_find_or_create (pm, &new_item); + t += 3; /* skip < <type-name> and > */ + } + else if (t->token == VLIB_LEX_eof) + { + screwed: + new_item.type = not_a_rule ? 
eof_type_index : rule_eof_type_index; + new_item.value.as_pointer = pr->eof_match; + new_item_index = parse_item_find_or_create (pm, &new_item); + t++; + } + else + { + clib_warning ("unexpected token %U index %d in '%s'", + format_vlib_lex_token, pm->lex_main, t, + t - pm->tokens, pm->register_input); + goto screwed; + } + + pool_get (pm->parse_graph, g); + memset (g, 0xff, sizeof (*g)); + g->item = new_item_index; + depth++; + + if (rv == (u32) ~ 0) + { + rv = g - pm->parse_graph; + last_index = rv; + } + else + { + last_g = pool_elt_at_index (pm->parse_graph, last_index); + last_index = last_g->deeper = g - pm->parse_graph; + } + } + *new_subgraph_depth = depth; + return rv; +} + +static u32 +measure_depth (vlib_parse_main_t * pm, u32 index) +{ + vlib_parse_graph_t *node; + vlib_parse_item_t *item; + u32 max = 0; + u32 depth; + + if (index == (u32) ~ 0) + return 0; + + node = pool_elt_at_index (pm->parse_graph, index); + item = pool_elt_at_index (pm->parse_items, node->item); + + if (item->type == eof_type_index) + return 1; + + while (index != (u32) ~ 0) + { + node = pool_elt_at_index (pm->parse_graph, index); + depth = measure_depth (pm, node->deeper); + if (max < depth) + max = depth; + index = node->peer; + } + + return max + 1; +} + +static void +add_subgraph_to_graph (vlib_parse_main_t * pm, + u32 last_matching_index, + u32 graph_root_index, + u32 new_subgraph_index, u32 new_subgraph_depth) +{ + vlib_parse_graph_t *parent_node; + int new_subgraph_longest = 1; + u32 current_peer_index; + u32 current_depth; + vlib_parse_graph_t *current_peer = 0; + vlib_parse_graph_t *new_subgraph_node = + pool_elt_at_index (pm->parse_graph, new_subgraph_index); + + /* + * Case 1: top-level peer. 
Splice into the top-level + * peer chain according to rule depth + */ + if (last_matching_index == (u32) ~ 0) + { + u32 index = graph_root_index; + while (1) + { + current_peer = pool_elt_at_index (pm->parse_graph, index); + current_depth = measure_depth (pm, index); + if (current_depth < new_subgraph_depth + || current_peer->peer == (u32) ~ 0) + break; + index = current_peer->peer; + } + new_subgraph_node->peer = current_peer->peer; + current_peer->peer = new_subgraph_index; + return; + } + + parent_node = pool_elt_at_index (pm->parse_graph, last_matching_index); + current_peer_index = parent_node->deeper; + + while (current_peer_index != (u32) ~ 0) + { + current_peer = pool_elt_at_index (pm->parse_graph, current_peer_index); + current_depth = measure_depth (pm, current_peer_index); + if (current_depth < new_subgraph_depth) + break; + new_subgraph_longest = 0; + current_peer_index = current_peer->peer; + } + + ASSERT (current_peer); + + if (new_subgraph_longest) + { + new_subgraph_node->peer = parent_node->deeper; + parent_node->deeper = new_subgraph_index; + } + else + { + new_subgraph_node->peer = current_peer->peer; + current_peer->peer = new_subgraph_index; + } +} + +static clib_error_t * +parse_register_one (vlib_parse_main_t * pm, parse_registration_t * pr) +{ + u32 graph_root_index; + u16 subgraph_type_index = (u16) ~ 0; + vlib_parse_type_t *subgraph_type = 0; + vlib_lex_token_t *t; + vlib_parse_graph_t *node; + u32 node_index, last_index, token_increment, new_subgraph_index; + u32 new_subgraph_depth, last_matching_index; + vlib_parse_item_t *item; + vlib_parse_type_t *type; + + int use_main_graph = 1; + + tokenize (pm, pr); + + /* A typed rule? 
*/ + if (is_typed_rule (pm)) + { + /* Get the type and its current subgraph root, if any */ + t = vec_elt_at_index (pm->tokens, 1); + subgraph_type_index = parse_type_find_by_name (pm, t->value.as_pointer); + if (subgraph_type_index == (u16) ~ 0) + return clib_error_return (0, "undeclared type '%s'", + t->value.as_pointer); + subgraph_type = + pool_elt_at_index (pm->parse_types, subgraph_type_index); + graph_root_index = subgraph_type->rule_index; + /* Skip "mytype> = */ + t += 3; + use_main_graph = 0; + } + else + { + /* top-level graph */ + graph_root_index = pm->root_index; + t = vec_elt_at_index (pm->tokens, 0); + } + + last_matching_index = (u32) ~ 0; + last_index = node_index = graph_root_index; + + /* Find the first token which isn't already being parsed */ + while (t < pm->tokens + vec_len (pm->tokens) && node_index != (u32) ~ 0) + { + node = pool_elt_at_index (pm->parse_graph, node_index); + item = pool_elt_at_index (pm->parse_items, node->item); + type = pool_elt_at_index (pm->parse_types, item->type); + last_index = node_index; + + if (token_matches_graph_node + (pm, t, node, item, type, &token_increment)) + { + t += token_increment; + last_matching_index = node_index; + node_index = node->deeper; + } + else + node_index = node->peer; + } + + new_subgraph_index = + generate_subgraph_from_tokens (pm, t, &new_subgraph_depth, pr, + use_main_graph); + + /* trivial cases: first graph node or first type rule */ + if (graph_root_index == (u32) ~ 0) + { + if (use_main_graph) + pm->root_index = new_subgraph_index; + else + subgraph_type->rule_index = new_subgraph_index; + return 0; + } + + add_subgraph_to_graph (pm, last_matching_index, graph_root_index, + new_subgraph_index, new_subgraph_depth); + return 0; +} + +static clib_error_t * +parse_register (vlib_main_t * vm, + parse_registration_t * lo, + parse_registration_t * hi, vlib_parse_main_t * pm) +{ + parse_registration_t *pr; + + for (pr = lo; pr < hi; pr = vlib_elf_section_data_next (pr, 0)) + vec_add1 
(pm->parse_registrations, pr); + + return 0; +} + +static clib_error_t * +parse_register_one_type (vlib_parse_main_t * pm, vlib_parse_type_t * rp) +{ + (void) parse_type_find_or_create (pm, (vlib_parse_type_t *) rp); + return 0; +} + +static clib_error_t * +parse_type_register (vlib_main_t * vm, + vlib_parse_type_t * lo, + vlib_parse_type_t * hi, vlib_parse_main_t * pm) +{ + clib_error_t *error = 0; + vlib_parse_type_t *ptr; + + for (ptr = lo; ptr < hi; ptr = vlib_elf_section_data_next (ptr, 0)) + { + error = parse_register_one_type (pm, ptr); + if (error) + goto done; + } + +done: + return error; +} + +clib_error_t *vlib_stdlex_init (vlib_main_t * vm) __attribute__ ((weak)); +clib_error_t * +vlib_stdlex_init (vlib_main_t * vm) +{ + (void) vlib_lex_add_table ("ignore_everything"); + return 0; +} + +static int +compute_rule_length (parse_registration_t * r) +{ + int length, i; + vlib_parse_main_t *pm = &vlib_parse_main; + + if (r->rule_length) + return r->rule_length; + + length = 0; + + tokenize (pm, r); + length = vec_len (pm->tokens); + + /* Account for "<foo> = " in "<foo> = bar" etc. 
*/ + if (is_typed_rule (pm)) + length -= 2; + + for (i = 0; i < vec_len (pm->tokens); i++) + { + switch (pm->tokens[i].token) + { + case VLIB_LEX_lt: + case VLIB_LEX_gt: + length -= 1; + + default: + break; + } + } + + ASSERT (length > 0); + r->rule_length = length; + return length; +} + +static int +rule_length_compare (parse_registration_t * r1, parse_registration_t * r2) +{ + compute_rule_length (r1); + compute_rule_length (r2); + /* Descending sort */ + return r2->rule_length - r1->rule_length; +} + + +static clib_error_t * +parse_init (vlib_main_t * vm) +{ + vlib_parse_main_t *pm = &vlib_parse_main; + vlib_lex_main_t *lm = &vlib_lex_main; + vlib_elf_section_bounds_t *b, *bounds; + clib_error_t *error = 0; + parse_registration_t *rule; + int i; + + if ((error = vlib_call_init_function (vm, lex_onetime_init))) + return error; + + if ((error = vlib_stdlex_init (vm))) + return error; + + if ((error = vlib_call_init_function (vm, parse_builtin_init))) + return error; + + pm->vlib_main = vm; + pm->lex_main = lm; + + mhash_init (&pm->parse_item_hash, sizeof (u32), sizeof (vlib_parse_item_t)); + pm->parse_type_by_name_hash = hash_create_string (0, sizeof (u32)); + + vec_validate (pm->parse_value, 16); + vec_validate (pm->tokens, 16); + vec_validate (pm->register_input, 32); + vec_validate (pm->match_items, 16); + + _vec_len (pm->parse_value) = 0; + _vec_len (pm->tokens) = 0; + _vec_len (pm->register_input) = 0; + _vec_len (pm->match_items) = 0; + + bounds = vlib_get_elf_section_bounds (vm, "parse_type_registrations"); + vec_foreach (b, bounds) + { + error = parse_type_register (vm, b->lo, b->hi, pm); + if (error) + break; + } + vec_free (bounds); + + parse_type_and_graph_init (pm); + + bounds = vlib_get_elf_section_bounds (vm, "parse_registrations"); + vec_foreach (b, bounds) + { + error = parse_register (vm, b->lo, b->hi, pm); + if (error) + break; + } + vec_free (bounds); + + vec_sort_with_function (pm->parse_registrations, rule_length_compare); + + for (i = 0; i < 
vec_len (pm->parse_registrations); i++) + { + rule = pm->parse_registrations[i]; + parse_register_one (pm, rule); + } + + return error; +} + +VLIB_INIT_FUNCTION (parse_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/parse.h b/src/vlib/parse.h new file mode 100644 index 00000000000..036e744723b --- /dev/null +++ b/src/vlib/parse.h @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_vlib_parse_h +#define included_vlib_parse_h + +#include <vlib/vlib.h> +#include <vlib/lex.h> +#include <vppinfra/mhash.h> + +typedef struct +{ + /* Word aligned value. */ + union + { + u8 as_u8[32 - 1 * sizeof (u16)]; + void *as_pointer; + uword as_uword; + word as_word; + u64 as_u64; + } value; + + /* 16 bit type at end so that 30 bytes of value are aligned. */ + u16 type; +} __attribute ((packed)) + vlib_parse_value_t; + +/* Instance of a type. */ + typedef struct + { + u32 + type; + + u32 + origin; + + u32 + help_index; + + union + { + void * + as_pointer; + uword + as_uword; + } value; + } vlib_parse_item_t; + + typedef struct + { + /* Index of item for this node. */ + u32 + item; + + /* Graph index of peer (sibling) node (linked list of peers). */ + u32 + peer; + + /* Graph index of deeper (child) node (linked list of children). 
*/ + u32 + deeper; + } vlib_parse_graph_t; + +#define foreach_parse_match_type \ + _(MATCH_DONE) \ + _(MATCH_RULE) \ + _(MATCH_FAIL) \ + _(MATCH_FULL) \ + _(MATCH_VALUE) \ + _(MATCH_PARTIAL) \ + _(MATCH_AMBIGUOUS) \ + _(MATCH_EVAL_FAIL) + + typedef enum + { +#define _(a) VLIB_PARSE_##a, + foreach_parse_match_type +#undef _ + } vlib_parse_match_t; + + struct vlib_parse_type; + struct vlib_parse_main; + + typedef + vlib_parse_match_t (vlib_parse_match_function_t) + (struct vlib_parse_main *, + struct vlib_parse_type *, vlib_lex_token_t *, vlib_parse_value_t *); + typedef void (vlib_parse_value_cleanup_function_t) (vlib_parse_value_t + *); + + typedef struct vlib_parse_type + { + /* Type name. */ + char * + name; + + vlib_parse_match_function_t * + match_function; + + vlib_parse_value_cleanup_function_t * + value_cleanup_function; + + format_function_t * + format_value; + + u32 + rule_index; + } vlib_parse_type_t; + + typedef struct + { + char * + initializer; + void * + eof_match; + int + rule_length; + } parse_registration_t; + + typedef struct vlib_parse_main + { + /* (type, origin, help, value) tuples */ + vlib_parse_item_t * + parse_items; + mhash_t + parse_item_hash; + + /* (item, peer, deeper) tuples */ + vlib_parse_graph_t * + parse_graph; + u32 + root_index; + + u8 * + register_input; + + /* parser types */ + vlib_parse_type_t * + parse_types; + uword * + parse_type_by_name_hash; + + /* Vector of MATCH_VALUEs */ + vlib_parse_value_t * + parse_value; + u32 * + match_items; + + /* Parse registrations */ + parse_registration_t ** + parse_registrations; + + /* Token vector */ + vlib_lex_token_t * + tokens; + u32 + current_token_index; + + vlib_lex_main_t * + lex_main; + vlib_main_t * + vlib_main; + } vlib_parse_main_t; + + vlib_parse_main_t + vlib_parse_main; + + typedef + vlib_parse_match_t (vlib_parse_eval_function_t) + (vlib_parse_main_t *, vlib_parse_item_t *, vlib_parse_value_t *); + +vlib_parse_match_t +vlib_parse_eval (u8 * input); + + format_function_t 
format_vlib_parse_value; + +/* FIXME need these to be global? */ + vlib_parse_match_function_t rule_match, eof_match, word_match, + number_match; + +#define _PARSE_REGISTRATION_DATA(x) \ +VLIB_ELF_SECTION_DATA(x##_registration,parse_registration_t,parse_registrations) + +#define PARSE_INIT(x, s, e) \ +static _PARSE_REGISTRATION_DATA(x) = { \ + .initializer = s, \ + .eof_match = e, \ +}; + +#define _PARSE_TYPE_REGISTRATION_DATA(x) \ +VLIB_ELF_SECTION_DATA(x##_type_registration,vlib_parse_type_t, \ +parse_type_registrations) + +#define PARSE_TYPE_INIT(n, m, c, f) \ +static _PARSE_TYPE_REGISTRATION_DATA(n) = { \ + .name = #n, \ + .match_function = m, \ + .value_cleanup_function = c, \ + .format_value = f, \ +}; + +#endif /* included_vlib_parse_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/parse_builtin.c b/src/vlib/parse_builtin.c new file mode 100644 index 00000000000..0ce716b539e --- /dev/null +++ b/src/vlib/parse_builtin.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <vlib/parse.h> + +always_inline void * +parse_last_match_value (vlib_parse_main_t * pm) +{ + vlib_parse_item_t *i; + i = pool_elt_at_index (pm->parse_items, + vec_elt (pm->match_items, + vec_len (pm->match_items) - 1)); + return i->value.as_pointer; +} + +vlib_parse_match_t +eof_match (vlib_parse_main_t * pm, vlib_parse_type_t * type, + vlib_lex_token_t * t, vlib_parse_value_t * valuep) +{ + return t->token == + VLIB_LEX_eof ? VLIB_PARSE_MATCH_DONE : VLIB_PARSE_MATCH_FAIL; +} + +PARSE_TYPE_INIT (eof, eof_match, 0 /* cleanup value */ , + 0 /* format value */ ); + +vlib_parse_match_t +rule_eof_match (vlib_parse_main_t * pm, vlib_parse_type_t * type, + vlib_lex_token_t * t, vlib_parse_value_t * valuep) +{ + vlib_parse_match_function_t *fp = parse_last_match_value (pm); + pm->current_token_index--; + return fp ? fp (pm, type, t, valuep) : VLIB_PARSE_MATCH_RULE; +} + +PARSE_TYPE_INIT (rule_eof, rule_eof_match, 0, 0); + +vlib_parse_match_t +word_match (vlib_parse_main_t * pm, vlib_parse_type_t * type, + vlib_lex_token_t * t, vlib_parse_value_t * valuep) +{ + u8 *tv, *iv; + int i; + + if (t->token != VLIB_LEX_word) + return VLIB_PARSE_MATCH_FAIL; + + tv = t->value.as_pointer; + iv = parse_last_match_value (pm); + + for (i = 0; tv[i]; i++) + { + if (tv[i] != iv[i]) + return VLIB_PARSE_MATCH_FAIL; + } + + return iv[i] == 0 ? 
VLIB_PARSE_MATCH_FULL : VLIB_PARSE_MATCH_PARTIAL; +} + +PARSE_TYPE_INIT (word, word_match, 0 /* clnup value */ , + 0 /* format value */ ); + +vlib_parse_match_t +number_match (vlib_parse_main_t * pm, vlib_parse_type_t * type, + vlib_lex_token_t * t, vlib_parse_value_t * valuep) +{ + if (t->token == VLIB_LEX_number) + { + valuep->value.as_uword = t->value.as_uword; + return VLIB_PARSE_MATCH_VALUE; + } + return VLIB_PARSE_MATCH_FAIL; +} + +static u8 * +format_value_number (u8 * s, va_list * args) +{ + vlib_parse_value_t *v = va_arg (*args, vlib_parse_value_t *); + uword a = v->value.as_uword; + + if (BITS (uword) == 64) + s = format (s, "%lld(0x%llx)", a, a); + else + s = format (s, "%ld(0x%lx)", a, a); + return s; +} + +PARSE_TYPE_INIT (number, number_match, 0 /* cln value */ , + format_value_number /* fmt value */ ); + + +#define foreach_vanilla_lex_match_function \ + _(plus) \ + _(minus) \ + _(star) \ + _(slash) \ + _(lpar) \ + _(rpar) + +#define LEX_MATCH_DEBUG 0 + +#define _(name) \ +vlib_parse_match_t name##_match (vlib_parse_main_t *pm, \ + vlib_parse_type_t *type, \ + vlib_lex_token_t *t, \ + vlib_parse_value_t *valuep) \ +{ \ + if (LEX_MATCH_DEBUG > 0) \ + clib_warning ("against %U returns %s", \ + format_vlib_lex_token, pm->lex_main, t, \ + (t->token == VLIB_LEX_##name) \ + ? "VLIB_PARSE_MATCH_FULL" : \ + "VLIB_PARSE_MATCH_FAIL"); \ + if (t->token == VLIB_LEX_##name) \ + return VLIB_PARSE_MATCH_FULL; \ + return VLIB_PARSE_MATCH_FAIL; \ +} \ + \ +PARSE_TYPE_INIT (name, name##_match, 0 /* cln value */, \ + 0 /* fmt val */); + +foreach_vanilla_lex_match_function +#undef _ +/* So we're linked in. 
*/ +static clib_error_t * +parse_builtin_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (parse_builtin_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/pci/linux_pci.c b/src/vlib/pci/linux_pci.c new file mode 100644 index 00000000000..f9ee47ac145 --- /dev/null +++ b/src/vlib/pci/linux_pci.c @@ -0,0 +1,642 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * pci.c: Linux user space PCI bus management. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> +#include <vlib/pci/pci.h> +#include <vlib/unix/unix.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <dirent.h> +#include <sys/ioctl.h> +#include <net/if.h> +#include <linux/ethtool.h> +#include <linux/sockios.h> + +typedef struct +{ + /* /sys/bus/pci/devices/... directory name for this device. */ + u8 *dev_dir_name; + + /* Resource file descriptors. */ + int *resource_fds; + + /* File descriptor for config space read/write. */ + int config_fd; + + /* File descriptor for /dev/uio%d */ + int uio_fd; + + /* Minor device for uio device. */ + u32 uio_minor; + + /* Index given by unix_file_add. */ + u32 unix_file_index; + +} linux_pci_device_t; + +/* Pool of PCI devices. */ +typedef struct +{ + vlib_main_t *vlib_main; + linux_pci_device_t *linux_pci_devices; +} linux_pci_main_t; + +extern linux_pci_main_t linux_pci_main; + +/* Call to allocate/initialize the pci subsystem. + This is not an init function so that users can explicitly enable + pci only when it's needed. 
*/ +clib_error_t *pci_bus_init (vlib_main_t * vm); + +clib_error_t *vlib_pci_bind_to_uio (vlib_pci_device_t * d, + char *uio_driver_name); + +linux_pci_main_t linux_pci_main; + +clib_error_t * +vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name) +{ + clib_error_t *error = 0; + u8 *s = 0; + DIR *dir = 0; + struct dirent *e; + int fd; + u8 *dev_dir_name = format (0, "/sys/bus/pci/devices/%U", + format_vlib_pci_addr, &d->bus_address); + + /* if uio sub-directory exists, we are fine, device is + already bound to UIO driver */ + s = format (s, "%v/uio%c", dev_dir_name, 0); + if (access ((char *) s, F_OK) == 0) + goto done; + vec_reset_length (s); + + /* walk trough all linux interfaces and if interface belonging to + this device is founf check if interface is admin up */ + dir = opendir ("/sys/class/net"); + s = format (s, "%U%c", format_vlib_pci_addr, &d->bus_address, 0); + + if (!dir) + { + error = clib_error_return (0, "Skipping PCI device %U: failed to " + "read /sys/class/net", + format_vlib_pci_addr, &d->bus_address); + goto done; + } + + fd = socket (PF_INET, SOCK_DGRAM, 0); + if (fd < 0) + { + error = clib_error_return_unix (0, "socket"); + goto done; + } + + while ((e = readdir (dir))) + { + struct ifreq ifr; + struct ethtool_drvinfo drvinfo; + + if (e->d_name[0] == '.') /* skip . and .. 
*/ + continue; + + memset (&ifr, 0, sizeof ifr); + memset (&drvinfo, 0, sizeof drvinfo); + ifr.ifr_data = (char *) &drvinfo; + strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1); + drvinfo.cmd = ETHTOOL_GDRVINFO; + if (ioctl (fd, SIOCETHTOOL, &ifr) < 0) + { + /* Some interfaces (eg "lo") don't support this ioctl */ + if ((errno != ENOTSUP) && (errno != ENODEV)) + clib_unix_warning ("ioctl fetch intf %s bus info error", + e->d_name); + continue; + } + + if (strcmp ((char *) s, drvinfo.bus_info)) + continue; + + memset (&ifr, 0, sizeof (ifr)); + strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1); + if (ioctl (fd, SIOCGIFFLAGS, &ifr) < 0) + { + error = clib_error_return_unix (0, "ioctl fetch intf %s flags", + e->d_name); + close (fd); + goto done; + } + + if (ifr.ifr_flags & IFF_UP) + { + error = clib_error_return (0, "Skipping PCI device %U as host " + "interface %s is up", + format_vlib_pci_addr, &d->bus_address, + e->d_name); + close (fd); + goto done; + } + } + + close (fd); + vec_reset_length (s); + + s = format (s, "%v/driver/unbind%c", dev_dir_name, 0); + vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); + vec_reset_length (s); + + s = format (s, "/sys/bus/pci/drivers/%s/new_id%c", uio_driver_name, 0); + vlib_sysfs_write ((char *) s, "0x%04x 0x%04x", d->vendor_id, d->device_id); + vec_reset_length (s); + + s = format (s, "/sys/bus/pci/drivers/%s/bind%c", uio_driver_name, 0); + vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); + +done: + closedir (dir); + vec_free (s); + vec_free (dev_dir_name); + return error; +} + + +static clib_error_t * +scan_uio_dir (void *arg, u8 * path_name, u8 * file_name) +{ + linux_pci_device_t *l = arg; + unformat_input_t input; + + unformat_init_string (&input, (char *) file_name, vec_len (file_name)); + + if (!unformat (&input, "uio%d", &l->uio_minor)) + abort (); + + unformat_free (&input); + return 0; +} + +static clib_error_t * +linux_pci_uio_read_ready (unix_file_t * uf) +{ + 
vlib_pci_main_t *pm = &pci_main; + vlib_pci_device_t *d; + int __attribute__ ((unused)) rv; + + u32 icount; + rv = read (uf->file_descriptor, &icount, 4); + + d = pool_elt_at_index (pm->pci_devs, uf->private_data); + + if (d->interrupt_handler) + d->interrupt_handler (d); + + vlib_pci_intr_enable (d); + + return /* no error */ 0; +} + +static clib_error_t * +linux_pci_uio_error_ready (unix_file_t * uf) +{ + u32 error_index = (u32) uf->private_data; + + return clib_error_return (0, "pci device %d: error", error_index); +} + +static void +add_device (vlib_pci_device_t * dev, linux_pci_device_t * pdev) +{ + vlib_pci_main_t *pm = &pci_main; + linux_pci_main_t *lpm = &linux_pci_main; + linux_pci_device_t *l; + + pool_get (lpm->linux_pci_devices, l); + l[0] = pdev[0]; + + l->dev_dir_name = vec_dup (l->dev_dir_name); + + dev->os_handle = l - lpm->linux_pci_devices; + + { + u8 *uio_dir = format (0, "%s/uio", l->dev_dir_name); + foreach_directory_file ((char *) uio_dir, scan_uio_dir, l, /* scan_dirs */ + 1); + vec_free (uio_dir); + } + + { + char *uio_name = (char *) format (0, "/dev/uio%d%c", l->uio_minor, 0); + l->uio_fd = open (uio_name, O_RDWR); + if (l->uio_fd < 0) + clib_unix_error ("open `%s'", uio_name); + vec_free (uio_name); + } + + { + unix_file_t template = { 0 }; + unix_main_t *um = &unix_main; + + template.read_function = linux_pci_uio_read_ready; + template.file_descriptor = l->uio_fd; + template.error_function = linux_pci_uio_error_ready; + template.private_data = dev - pm->pci_devs; + + l->unix_file_index = unix_file_add (um, &template); + } +} + +static void +linux_pci_device_free (linux_pci_device_t * l) +{ + int i; + for (i = 0; i < vec_len (l->resource_fds); i++) + if (l->resource_fds[i] > 0) + close (l->resource_fds[i]); + if (l->config_fd > 0) + close (l->config_fd); + if (l->uio_fd > 0) + close (l->uio_fd); + vec_free (l->resource_fds); + vec_free (l->dev_dir_name); +} + +/* Configuration space read/write. 
*/ +clib_error_t * +vlib_pci_read_write_config (vlib_pci_device_t * dev, + vlib_read_or_write_t read_or_write, + uword address, void *data, u32 n_bytes) +{ + linux_pci_main_t *lpm = &linux_pci_main; + linux_pci_device_t *p; + int n; + + p = pool_elt_at_index (lpm->linux_pci_devices, dev->os_handle); + + if (read_or_write == VLIB_READ) + n = pread (p->config_fd, data, n_bytes, address); + else + n = pwrite (p->config_fd, data, n_bytes, address); + + if (n != n_bytes) + return clib_error_return_unix (0, "%s", + read_or_write == VLIB_READ + ? "read" : "write"); + + return 0; +} + +static clib_error_t * +os_map_pci_resource_internal (uword os_handle, + u32 resource, u8 * addr, void **result) +{ + linux_pci_main_t *pm = &linux_pci_main; + linux_pci_device_t *p; + struct stat stat_buf; + u8 *file_name; + int fd; + clib_error_t *error; + int flags = MAP_SHARED; + + error = 0; + p = pool_elt_at_index (pm->linux_pci_devices, os_handle); + + file_name = format (0, "%v/resource%d%c", p->dev_dir_name, resource, 0); + fd = open ((char *) file_name, O_RDWR); + if (fd < 0) + { + error = clib_error_return_unix (0, "open `%s'", file_name); + goto done; + } + + if (fstat (fd, &stat_buf) < 0) + { + error = clib_error_return_unix (0, "fstat `%s'", file_name); + goto done; + } + + vec_validate (p->resource_fds, resource); + p->resource_fds[resource] = fd; + if (addr != 0) + flags |= MAP_FIXED; + + *result = mmap (addr, + /* size */ stat_buf.st_size, + PROT_READ | PROT_WRITE, flags, + /* file */ fd, + /* offset */ 0); + if (*result == (void *) -1) + { + error = clib_error_return_unix (0, "mmap `%s'", file_name); + goto done; + } + +done: + if (error) + { + if (fd >= 0) + close (fd); + } + vec_free (file_name); + return error; +} + +clib_error_t * +vlib_pci_map_resource (vlib_pci_device_t * dev, u32 resource, void **result) +{ + return (os_map_pci_resource_internal + (dev->os_handle, resource, 0 /* addr */ , + result)); +} + +clib_error_t * +vlib_pci_map_resource_fixed (vlib_pci_device_t 
* dev, + u32 resource, u8 * addr, void **result) +{ + return (os_map_pci_resource_internal + (dev->os_handle, resource, addr, result)); +} + +void +vlib_pci_free_device (vlib_pci_device_t * dev) +{ + linux_pci_main_t *pm = &linux_pci_main; + linux_pci_device_t *l; + + l = pool_elt_at_index (pm->linux_pci_devices, dev->os_handle); + linux_pci_device_free (l); + pool_put (pm->linux_pci_devices, l); +} + +pci_device_registration_t * __attribute__ ((unused)) +pci_device_next_registered (pci_device_registration_t * r) +{ + uword i; + + /* Null vendor id marks end of initialized list. */ + for (i = 0; r->supported_devices[i].vendor_id != 0; i++) + ; + + return clib_elf_section_data_next (r, i * sizeof (r->supported_devices[0])); +} + +static clib_error_t * +init_device_from_registered (vlib_main_t * vm, + vlib_pci_device_t * dev, + linux_pci_device_t * pdev) +{ + vlib_pci_main_t *pm = &pci_main; + pci_device_registration_t *r; + pci_device_id_t *i; + clib_error_t *error; + + r = pm->pci_device_registrations; + + while (r) + { + for (i = r->supported_devices; i->vendor_id != 0; i++) + if (i->vendor_id == dev->vendor_id && i->device_id == dev->device_id) + { + error = vlib_pci_bind_to_uio (dev, "uio_pci_generic"); + if (error) + { + clib_error_report (error); + continue; + } + + add_device (dev, pdev); + dev->interrupt_handler = r->interrupt_handler; + return r->init_function (vm, dev); + } + r = r->next_registration; + } + /* No driver, close the PCI config-space FD */ + close (pdev->config_fd); + return 0; +} + +static clib_error_t * +init_device (vlib_main_t * vm, + vlib_pci_device_t * dev, linux_pci_device_t * pdev) +{ + return init_device_from_registered (vm, dev, pdev); +} + +static clib_error_t * +scan_device (void *arg, u8 * dev_dir_name, u8 * ignored) +{ + vlib_main_t *vm = arg; + vlib_pci_main_t *pm = &pci_main; + int fd; + u8 *f; + clib_error_t *error = 0; + vlib_pci_device_t *dev; + linux_pci_device_t pdev = { 0 }; + u32 tmp; + + f = format (0, "%v/config%c", 
dev_dir_name, 0); + fd = open ((char *) f, O_RDWR); + + /* Try read-only access if write fails. */ + if (fd < 0) + fd = open ((char *) f, O_RDONLY); + + if (fd < 0) + { + error = clib_error_return_unix (0, "open `%s'", f); + goto done; + } + + pool_get (pm->pci_devs, dev); + + /* You can only read more that 64 bytes of config space as root; so we try to + read the full space but fall back to just the first 64 bytes. */ + if (read (fd, &dev->config_data, sizeof (dev->config_data)) != + sizeof (dev->config_data) + && read (fd, &dev->config0, + sizeof (dev->config0)) != sizeof (dev->config0)) + { + pool_put (pm->pci_devs, dev); + error = clib_error_return_unix (0, "read `%s'", f); + close (fd); + goto done; + } + + { + static pci_config_header_t all_ones; + if (all_ones.vendor_id == 0) + memset (&all_ones, ~0, sizeof (all_ones)); + + if (!memcmp (&dev->config0.header, &all_ones, sizeof (all_ones))) + { + pool_put (pm->pci_devs, dev); + error = clib_error_return (0, "invalid PCI config for `%s'", f); + close (fd); + goto done; + } + } + + if (dev->config0.header.header_type == 0) + pci_config_type0_little_to_host (&dev->config0); + else + pci_config_type1_little_to_host (&dev->config1); + + /* Parse bus, dev, function from directory name. 
*/ + { + unformat_input_t input; + + unformat_init_string (&input, (char *) dev_dir_name, + vec_len (dev_dir_name)); + + if (!unformat (&input, "/sys/bus/pci/devices/%U", + unformat_vlib_pci_addr, &dev->bus_address)) + abort (); + + unformat_free (&input); + + } + + + pdev.config_fd = fd; + pdev.dev_dir_name = dev_dir_name; + + hash_set (pm->pci_dev_index_by_pci_addr, dev->bus_address.as_u32, + dev - pm->pci_devs); + + error = init_device (vm, dev, &pdev); + + vec_reset_length (f); + f = format (f, "%v/vpd%c", dev_dir_name, 0); + fd = open ((char *) f, O_RDONLY); + if (fd >= 0) + { + while (1) + { + u8 tag[3]; + u8 *data = 0; + int len; + + if (read (fd, &tag, 3) != 3) + break; + + if (tag[0] != 0x82 && tag[0] != 0x90 && tag[0] != 0x91) + break; + + len = (tag[2] << 8) | tag[1]; + vec_validate (data, len); + + if (read (fd, data, len) != len) + { + vec_free (data); + break; + } + if (tag[0] == 0x82) + dev->product_name = data; + else if (tag[0] == 0x90) + dev->vpd_r = data; + else if (tag[0] == 0x91) + dev->vpd_w = data; + + data = 0; + } + close (fd); + } + + vec_reset_length (f); + f = format (f, "%v/driver%c", dev_dir_name, 0); + dev->driver_name = vlib_sysfs_link_to_name ((char *) f); + + dev->numa_node = -1; + vec_reset_length (f); + f = format (f, "%v/numa_node%c", dev_dir_name, 0); + vlib_sysfs_read ((char *) f, "%u", &dev->numa_node); + + vec_reset_length (f); + f = format (f, "%v/class%c", dev_dir_name, 0); + vlib_sysfs_read ((char *) f, "0x%x", &tmp); + dev->device_class = tmp >> 8; + + vec_reset_length (f); + f = format (f, "%v/vendor%c", dev_dir_name, 0); + vlib_sysfs_read ((char *) f, "0x%x", &tmp); + dev->vendor_id = tmp; + + vec_reset_length (f); + f = format (f, "%v/device%c", dev_dir_name, 0); + vlib_sysfs_read ((char *) f, "0x%x", &tmp); + dev->device_id = tmp; + +done: + vec_free (f); + return error; +} + +clib_error_t * +linux_pci_init (vlib_main_t * vm) +{ + vlib_pci_main_t *pm = &pci_main; + clib_error_t *error; + + pm->vlib_main = vm; + + if 
((error = vlib_call_init_function (vm, unix_input_init))) + return error; + + ASSERT (sizeof (vlib_pci_addr_t) == sizeof (u32)); + pm->pci_dev_index_by_pci_addr = hash_create (0, sizeof (uword)); + + error = foreach_directory_file ("/sys/bus/pci/devices", scan_device, vm, + /* scan_dirs */ 0); + + /* Complain and continue. might not be root, etc. */ + if (error) + clib_error_report (error); + + return error; +} + +VLIB_INIT_FUNCTION (linux_pci_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/pci/pci.c b/src/vlib/pci/pci.c new file mode 100644 index 00000000000..7100064df42 --- /dev/null +++ b/src/vlib/pci/pci.c @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * pci.c: Linux user space PCI bus management. 
+ * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vlib/vlib.h> +#include <vlib/pci/pci.h> +#include <vlib/unix/unix.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <dirent.h> +#include <sys/ioctl.h> +#include <net/if.h> +#include <linux/ethtool.h> +#include <linux/sockios.h> + +vlib_pci_main_t pci_main; + +vlib_pci_device_t * +vlib_get_pci_device (vlib_pci_addr_t * addr) +{ + vlib_pci_main_t *pm = &pci_main; + uword *p; + p = hash_get (pm->pci_dev_index_by_pci_addr, addr->as_u32); + + if (p == 0) + return 0; + + return vec_elt_at_index (pm->pci_devs, p[0]); +} + +static clib_error_t * +show_pci_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_pci_main_t *pm = &pci_main; + vlib_pci_device_t *d; + int show_all = 0; + u8 *s = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "all")) + show_all = 1; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + vlib_cli_output (vm, "%-13s%-5s%-12s%-13s%-16s%-32s%s", + "Address", "Sock", "VID:PID", "Link Speed", "Driver", + "Product Name", "Vital Product Data"); + + /* *INDENT-OFF* */ + pool_foreach (d, pm->pci_devs, ({ + + if (d->device_class != PCI_CLASS_NETWORK_ETHERNET && !show_all) + continue; + + vec_reset_length (s); + + if (d->numa_node >= 0) + s = format (s, " %d", d->numa_node); + + vlib_cli_output (vm, "%-13U%-5v%04x:%04x %-13U%-16s%-32v%U", + format_vlib_pci_addr, &d->bus_address, s, + d->vendor_id, d->device_id, + format_vlib_pci_link_speed, d, + d->driver_name ? 
(char *) d->driver_name : "", + d->product_name, + format_vlib_pci_vpd, d->vpd_r, 0); + })); +/* *INDENT-ON* */ + + vec_free (s); + return 0; +} + +uword +unformat_vlib_pci_addr (unformat_input_t * input, va_list * args) +{ + vlib_pci_addr_t *addr = va_arg (*args, vlib_pci_addr_t *); + u32 x[4]; + + if (!unformat (input, "%x:%x:%x.%x", &x[0], &x[1], &x[2], &x[3])) + return 0; + + addr->domain = x[0]; + addr->bus = x[1]; + addr->slot = x[2]; + addr->function = x[3]; + + return 1; +} + +u8 * +format_vlib_pci_addr (u8 * s, va_list * va) +{ + vlib_pci_addr_t *addr = va_arg (*va, vlib_pci_addr_t *); + return format (s, "%04x:%02x:%02x.%x", addr->domain, addr->bus, + addr->slot, addr->function); +} + +u8 * +format_vlib_pci_handle (u8 * s, va_list * va) +{ + vlib_pci_addr_t *addr = va_arg (*va, vlib_pci_addr_t *); + return format (s, "%x/%x/%x", addr->bus, addr->slot, addr->function); +} + +u8 * +format_vlib_pci_link_speed (u8 * s, va_list * va) +{ + vlib_pci_device_t *d = va_arg (*va, vlib_pci_device_t *); + pcie_config_regs_t *r = + pci_config_find_capability (&d->config0, PCI_CAP_ID_PCIE); + int width; + + if (!r) + return format (s, "unknown"); + + width = (r->link_status >> 4) & 0x3f; + + if ((r->link_status & 0xf) == 1) + return format (s, "2.5 GT/s x%u", width); + if ((r->link_status & 0xf) == 2) + return format (s, "5.0 GT/s x%u", width); + if ((r->link_status & 0xf) == 3) + return format (s, "8.0 GT/s x%u", width); + return format (s, "unknown"); +} + +u8 * +format_vlib_pci_vpd (u8 * s, va_list * args) +{ + u8 *data = va_arg (*args, u8 *); + u8 *id = va_arg (*args, u8 *); + uword indent = format_get_indent (s); + char *string_types[] = { "PN", "EC", "SN", "MN", 0 }; + uword p = 0; + int first_line = 1; + + if (vec_len (data) < 3) + return s; + + while (p + 3 < vec_len (data)) + { + + if (data[p] == 0 && data[p + 1] == 0) + return s; + + if (p + data[p + 2] > vec_len (data)) + return s; + + if (id == 0) + { + int is_string = 0; + char **c = string_types; + + while 
(c[0]) + { + if (*(u16 *) & data[p] == *(u16 *) c[0]) + is_string = 1; + c++; + } + + if (data[p + 2]) + { + if (!first_line) + s = format (s, "\n%U", format_white_space, indent); + else + { + first_line = 0; + s = format (s, " "); + } + + s = format (s, "%c%c: ", data[p], data[p + 1]); + if (is_string) + vec_add (s, data + p + 3, data[p + 2]); + else + { + int i; + const int max_bytes = 8; + s = format (s, "0x"); + for (i = 0; i < clib_min (data[p + 2], max_bytes); i++) + s = format (s, " %02x", data[p + 3 + i]); + + if (data[p + 2] > max_bytes) + s = format (s, " ..."); + } + } + } + else if (*(u16 *) & data[p] == *(u16 *) id) + { + vec_add (s, data + p + 3, data[p + 2]); + return s; + } + + p += 3 + data[p + 2]; + } + + return s; +} + + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_pci_command, static) = { + .path = "show pci", + .short_help = "show pci [all]", + .function = show_pci_fn, +}; +/* *INDENT-ON* */ + +clib_error_t * +pci_bus_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (pci_bus_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/pci/pci.h b/src/vlib/pci/pci.h new file mode 100644 index 00000000000..811a6ff2336 --- /dev/null +++ b/src/vlib/pci/pci.h @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * pci.h: PCI definitions. 
+ * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_pci_h +#define included_vlib_pci_h + +#include <vlib/vlib.h> +#include <vlib/pci/pci_config.h> + +typedef CLIB_PACKED (union + { + struct + { +u16 domain; u8 bus; u8 slot: 5; u8 function:3;}; + u32 as_u32;}) vlib_pci_addr_t; + +typedef struct vlib_pci_device +{ + /* Operating system handle for this device. */ + uword os_handle; + + vlib_pci_addr_t bus_address; + + /* First 64 bytes of configuration space. 
*/ + union + { + pci_config_type0_regs_t config0; + pci_config_type1_regs_t config1; + u8 config_data[256]; + }; + + /* Interrupt handler */ + void (*interrupt_handler) (struct vlib_pci_device * dev); + + /* Driver name */ + u8 *driver_name; + + /* Numa Node */ + int numa_node; + + /* Device data */ + u16 device_class; + u16 vendor_id; + u16 device_id; + + /* Vital Product Data */ + u8 *product_name; + u8 *vpd_r; + u8 *vpd_w; + + /* Private data */ + uword private_data; + +} vlib_pci_device_t; + +typedef struct +{ + u16 vendor_id, device_id; +} pci_device_id_t; + +typedef struct _pci_device_registration +{ + /* Driver init function. */ + clib_error_t *(*init_function) (vlib_main_t * vm, vlib_pci_device_t * dev); + + /* Interrupt handler */ + void (*interrupt_handler) (vlib_pci_device_t * dev); + + /* List of registrations */ + struct _pci_device_registration *next_registration; + + /* Vendor/device ids supported by this driver. */ + pci_device_id_t supported_devices[]; +} pci_device_registration_t; + +/* Pool of PCI devices. */ +typedef struct +{ + vlib_main_t *vlib_main; + vlib_pci_device_t *pci_devs; + pci_device_registration_t *pci_device_registrations; + uword *pci_dev_index_by_pci_addr; +} vlib_pci_main_t; + +extern vlib_pci_main_t pci_main; + +#define PCI_REGISTER_DEVICE(x,...) \ + __VA_ARGS__ pci_device_registration_t x; \ +static void __vlib_add_pci_device_registration_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_add_pci_device_registration_##x (void) \ +{ \ + vlib_pci_main_t * pm = &pci_main; \ + x.next_registration = pm->pci_device_registrations; \ + pm->pci_device_registrations = &x; \ +} \ +__VA_ARGS__ pci_device_registration_t x + +clib_error_t *vlib_pci_bind_to_uio (vlib_pci_device_t * d, + char *uio_driver_name); + +/* Configuration space read/write. 
*/ +clib_error_t *vlib_pci_read_write_config (vlib_pci_device_t * dev, + vlib_read_or_write_t read_or_write, + uword address, + void *data, u32 n_bytes); + +#define _(t) \ +static inline clib_error_t * \ +vlib_pci_read_config_##t (vlib_pci_device_t * dev, \ + uword address, t * data) \ +{ \ + return vlib_pci_read_write_config (dev, VLIB_READ,address, data, \ + sizeof (data[0])); \ +} + +_(u32); +_(u16); +_(u8); + +#undef _ + +#define _(t) \ +static inline clib_error_t * \ +vlib_pci_write_config_##t (vlib_pci_device_t * dev, uword address, \ + t * data) \ +{ \ + return vlib_pci_read_write_config (dev, VLIB_WRITE, \ + address, data, sizeof (data[0])); \ +} + +_(u32); +_(u16); +_(u8); + +#undef _ + +static inline clib_error_t * +vlib_pci_intr_enable (vlib_pci_device_t * dev) +{ + u16 command; + clib_error_t *err; + + err = vlib_pci_read_config_u16 (dev, 4, &command); + + if (err) + return err; + + command &= ~PCI_COMMAND_INTX_DISABLE; + + return vlib_pci_write_config_u16 (dev, 4, &command); +} + +static inline clib_error_t * +vlib_pci_intr_disable (vlib_pci_device_t * dev) +{ + u16 command; + clib_error_t *err; + + err = vlib_pci_read_config_u16 (dev, 4, &command); + + if (err) + return err; + + command |= PCI_COMMAND_INTX_DISABLE; + + return vlib_pci_write_config_u16 (dev, 4, &command); +} + +static inline clib_error_t * +vlib_pci_bus_master_enable (vlib_pci_device_t * dev) +{ + clib_error_t *err; + u16 command; + + /* Set bus master enable (BME) */ + err = vlib_pci_read_config_u16 (dev, 4, &command); + + if (err) + return err; + + if (!(command & PCI_COMMAND_BUS_MASTER)) + return 0; + + command |= PCI_COMMAND_BUS_MASTER; + + return vlib_pci_write_config_u16 (dev, 4, &command); +} + +clib_error_t *vlib_pci_map_resource (vlib_pci_device_t * dev, u32 resource, + void **result); + +clib_error_t *vlib_pci_map_resource_fixed (vlib_pci_device_t * dev, + u32 resource, u8 * addr, + void **result); + +vlib_pci_device_t *vlib_get_pci_device (vlib_pci_addr_t * addr); +/* Free's 
device. */ +void vlib_pci_free_device (vlib_pci_device_t * dev); + +unformat_function_t unformat_vlib_pci_addr; +format_function_t format_vlib_pci_addr; +format_function_t format_vlib_pci_handle; +format_function_t format_vlib_pci_link_speed; +format_function_t format_vlib_pci_vpd; + +#endif /* included_vlib_pci_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/pci/pci_config.h b/src/vlib/pci/pci_config.h new file mode 100644 index 00000000000..92e56af6d57 --- /dev/null +++ b/src/vlib/pci/pci_config.h @@ -0,0 +1,731 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * pci.h: PCI definitions. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_pci_config_h +#define included_vlib_pci_config_h + +#include <vppinfra/byte_order.h> +#include <vppinfra/error.h> + +typedef enum +{ + PCI_CLASS_NOT_DEFINED = 0x0000, + PCI_CLASS_NOT_DEFINED_VGA = 0x0001, + + PCI_CLASS_STORAGE_SCSI = 0x0100, + PCI_CLASS_STORAGE_IDE = 0x0101, + PCI_CLASS_STORAGE_FLOPPY = 0x0102, + PCI_CLASS_STORAGE_IPI = 0x0103, + PCI_CLASS_STORAGE_RAID = 0x0104, + PCI_CLASS_STORAGE_OTHER = 0x0180, + PCI_CLASS_STORAGE = 0x0100, + + PCI_CLASS_NETWORK_ETHERNET = 0x0200, + PCI_CLASS_NETWORK_TOKEN_RING = 0x0201, + PCI_CLASS_NETWORK_FDDI = 0x0202, + PCI_CLASS_NETWORK_ATM = 0x0203, + PCI_CLASS_NETWORK_OTHER = 0x0280, + PCI_CLASS_NETWORK = 0x0200, + + PCI_CLASS_DISPLAY_VGA = 0x0300, + PCI_CLASS_DISPLAY_XGA = 0x0301, + PCI_CLASS_DISPLAY_3D = 0x0302, + PCI_CLASS_DISPLAY_OTHER = 0x0380, + PCI_CLASS_DISPLAY = 0x0300, + + PCI_CLASS_MULTIMEDIA_VIDEO = 0x0400, + PCI_CLASS_MULTIMEDIA_AUDIO = 0x0401, + PCI_CLASS_MULTIMEDIA_PHONE = 0x0402, + PCI_CLASS_MULTIMEDIA_OTHER = 0x0480, + PCI_CLASS_MULTIMEDIA = 0x0400, + + PCI_CLASS_MEMORY_RAM = 0x0500, + PCI_CLASS_MEMORY_FLASH = 0x0501, + PCI_CLASS_MEMORY_OTHER = 0x0580, + PCI_CLASS_MEMORY = 0x0500, + + PCI_CLASS_BRIDGE_HOST = 0x0600, + PCI_CLASS_BRIDGE_ISA = 0x0601, + PCI_CLASS_BRIDGE_EISA = 0x0602, + PCI_CLASS_BRIDGE_MC = 0x0603, + PCI_CLASS_BRIDGE_PCI = 0x0604, + PCI_CLASS_BRIDGE_PCMCIA = 0x0605, + PCI_CLASS_BRIDGE_NUBUS = 0x0606, + PCI_CLASS_BRIDGE_CARDBUS = 0x0607, + PCI_CLASS_BRIDGE_RACEWAY = 0x0608, + 
PCI_CLASS_BRIDGE_OTHER = 0x0680, + PCI_CLASS_BRIDGE = 0x0600, + + PCI_CLASS_COMMUNICATION_SERIAL = 0x0700, + PCI_CLASS_COMMUNICATION_PARALLEL = 0x0701, + PCI_CLASS_COMMUNICATION_MULTISERIAL = 0x0702, + PCI_CLASS_COMMUNICATION_MODEM = 0x0703, + PCI_CLASS_COMMUNICATION_OTHER = 0x0780, + PCI_CLASS_COMMUNICATION = 0x0700, + + PCI_CLASS_SYSTEM_PIC = 0x0800, + PCI_CLASS_SYSTEM_DMA = 0x0801, + PCI_CLASS_SYSTEM_TIMER = 0x0802, + PCI_CLASS_SYSTEM_RTC = 0x0803, + PCI_CLASS_SYSTEM_PCI_HOTPLUG = 0x0804, + PCI_CLASS_SYSTEM_OTHER = 0x0880, + PCI_CLASS_SYSTEM = 0x0800, + + PCI_CLASS_INPUT_KEYBOARD = 0x0900, + PCI_CLASS_INPUT_PEN = 0x0901, + PCI_CLASS_INPUT_MOUSE = 0x0902, + PCI_CLASS_INPUT_SCANNER = 0x0903, + PCI_CLASS_INPUT_GAMEPORT = 0x0904, + PCI_CLASS_INPUT_OTHER = 0x0980, + PCI_CLASS_INPUT = 0x0900, + + PCI_CLASS_DOCKING_GENERIC = 0x0a00, + PCI_CLASS_DOCKING_OTHER = 0x0a80, + PCI_CLASS_DOCKING = 0x0a00, + + PCI_CLASS_PROCESSOR_386 = 0x0b00, + PCI_CLASS_PROCESSOR_486 = 0x0b01, + PCI_CLASS_PROCESSOR_PENTIUM = 0x0b02, + PCI_CLASS_PROCESSOR_ALPHA = 0x0b10, + PCI_CLASS_PROCESSOR_POWERPC = 0x0b20, + PCI_CLASS_PROCESSOR_MIPS = 0x0b30, + PCI_CLASS_PROCESSOR_CO = 0x0b40, + PCI_CLASS_PROCESSOR = 0x0b00, + + PCI_CLASS_SERIAL_FIREWIRE = 0x0c00, + PCI_CLASS_SERIAL_ACCESS = 0x0c01, + PCI_CLASS_SERIAL_SSA = 0x0c02, + PCI_CLASS_SERIAL_USB = 0x0c03, + PCI_CLASS_SERIAL_FIBER = 0x0c04, + PCI_CLASS_SERIAL_SMBUS = 0x0c05, + PCI_CLASS_SERIAL = 0x0c00, + + PCI_CLASS_INTELLIGENT_I2O = 0x0e00, + PCI_CLASS_INTELLIGENT = 0x0e00, + + PCI_CLASS_SATELLITE_TV = 0x0f00, + PCI_CLASS_SATELLITE_AUDIO = 0x0f01, + PCI_CLASS_SATELLITE_VOICE = 0x0f03, + PCI_CLASS_SATELLITE_DATA = 0x0f04, + PCI_CLASS_SATELLITE = 0x0f00, + + PCI_CLASS_CRYPT_NETWORK = 0x1000, + PCI_CLASS_CRYPT_ENTERTAINMENT = 0x1001, + PCI_CLASS_CRYPT_OTHER = 0x1080, + PCI_CLASS_CRYPT = 0x1000, + + PCI_CLASS_SP_DPIO = 0x1100, + PCI_CLASS_SP_OTHER = 0x1180, + PCI_CLASS_SP = 0x1100, +} pci_device_class_t; + +static inline pci_device_class_t 
+pci_device_class_base (pci_device_class_t c) +{ + return c & ~0xff; +} + +/* + * Under PCI, each device has 256 bytes of configuration address space, + * of which the first 64 bytes are standardized as follows: + */ +typedef struct +{ + u16 vendor_id; + u16 device_id; + + u16 command; +#define PCI_COMMAND_IO (1 << 0) /* Enable response in I/O space */ +#define PCI_COMMAND_MEMORY (1 << 1) /* Enable response in Memory space */ +#define PCI_COMMAND_BUS_MASTER (1 << 2) /* Enable bus mastering */ +#define PCI_COMMAND_SPECIAL (1 << 3) /* Enable response to special cycles */ +#define PCI_COMMAND_WRITE_INVALIDATE (1 << 4) /* Use memory write and invalidate */ +#define PCI_COMMAND_VGA_PALETTE_SNOOP (1 << 5) +#define PCI_COMMAND_PARITY (1 << 6) +#define PCI_COMMAND_WAIT (1 << 7) /* Enable address/data stepping */ +#define PCI_COMMAND_SERR (1 << 8) /* Enable SERR */ +#define PCI_COMMAND_BACK_TO_BACK_WRITE (1 << 9) +#define PCI_COMMAND_INTX_DISABLE (1 << 10) /* INTx Emulation Disable */ + + u16 status; +#define PCI_STATUS_INTX_PENDING (1 << 3) +#define PCI_STATUS_CAPABILITY_LIST (1 << 4) +#define PCI_STATUS_66MHZ (1 << 5) /* Support 66 Mhz PCI 2.1 bus */ +#define PCI_STATUS_UDF (1 << 6) /* Support User Definable Features (obsolete) */ +#define PCI_STATUS_BACK_TO_BACK_WRITE (1 << 7) /* Accept fast-back to back */ +#define PCI_STATUS_PARITY_ERROR (1 << 8) /* Detected parity error */ +#define PCI_STATUS_DEVSEL_GET(x) ((x >> 9) & 3) /* DEVSEL timing */ +#define PCI_STATUS_DEVSEL_FAST (0 << 9) +#define PCI_STATUS_DEVSEL_MEDIUM (1 << 9) +#define PCI_STATUS_DEVSEL_SLOW (2 << 9) +#define PCI_STATUS_SIG_TARGET_ABORT (1 << 11) /* Set on target abort */ +#define PCI_STATUS_REC_TARGET_ABORT (1 << 12) /* Master ack of " */ +#define PCI_STATUS_REC_MASTER_ABORT (1 << 13) /* Set on master abort */ +#define PCI_STATUS_SIG_SYSTEM_ERROR (1 << 14) /* Set when we drive SERR */ +#define PCI_STATUS_DETECTED_PARITY_ERROR (1 << 15) + + u8 revision_id; + u8 programming_interface_class; /* Reg. 
Level Programming Interface */ + + pci_device_class_t device_class:16; + + u8 cache_size; + u8 latency_timer; + + u8 header_type; +#define PCI_HEADER_TYPE_NORMAL 0 +#define PCI_HEADER_TYPE_BRIDGE 1 +#define PCI_HEADER_TYPE_CARDBUS 2 + + u8 bist; +#define PCI_BIST_CODE_MASK 0x0f /* Return result */ +#define PCI_BIST_START 0x40 /* 1 to start BIST, 2 secs or less */ +#define PCI_BIST_CAPABLE 0x80 /* 1 if BIST capable */ +} pci_config_header_t; + +/* Byte swap config header. */ +always_inline void +pci_config_header_little_to_host (pci_config_header_t * r) +{ + if (!CLIB_ARCH_IS_BIG_ENDIAN) + return; +#define _(f,t) r->f = clib_byte_swap_##t (r->f) + _(vendor_id, u16); + _(device_id, u16); + _(command, u16); + _(status, u16); + _(device_class, u16); +#undef _ +} + +/* Header type 0 (normal devices) */ +typedef struct +{ + pci_config_header_t header; + + /* + * Base addresses specify locations in memory or I/O space. + * Decoded size can be determined by writing a value of + * 0xffffffff to the register, and reading it back. Only + * 1 bits are decoded. 
+ */ + u32 base_address[6]; + + u16 cardbus_cis; + + u16 subsystem_vendor_id; + u16 subsystem_id; + + u32 rom_address; +#define PCI_ROM_ADDRESS 0x30 /* Bits 31..11 are address, 10..1 reserved */ +#define PCI_ROM_ADDRESS_ENABLE 0x01 +#define PCI_ROM_ADDRESS_MASK (~0x7ffUL) + + u8 first_capability_offset; + CLIB_PAD_FROM_TO (0x35, 0x3c); + + u8 interrupt_line; + u8 interrupt_pin; + u8 min_grant; + u8 max_latency; + + u8 capability_data[0]; +} pci_config_type0_regs_t; + +always_inline void +pci_config_type0_little_to_host (pci_config_type0_regs_t * r) +{ + int i; + if (!CLIB_ARCH_IS_BIG_ENDIAN) + return; + pci_config_header_little_to_host (&r->header); +#define _(f,t) r->f = clib_byte_swap_##t (r->f) + for (i = 0; i < ARRAY_LEN (r->base_address); i++) + _(base_address[i], u32); + _(cardbus_cis, u16); + _(subsystem_vendor_id, u16); + _(subsystem_id, u16); + _(rom_address, u32); +#undef _ +} + +/* Header type 1 (PCI-to-PCI bridges) */ +typedef struct +{ + pci_config_header_t header; + + u32 base_address[2]; + + /* Primary/secondary bus number. */ + u8 primary_bus; + u8 secondary_bus; + + /* Highest bus number behind the bridge */ + u8 subordinate_bus; + + u8 secondary_bus_latency_timer; + + /* I/O range behind bridge. */ + u8 io_base, io_limit; + + /* Secondary status register, only bit 14 used */ + u16 secondary_status; + + /* Memory range behind bridge in units of 64k bytes. */ + u16 memory_base, memory_limit; +#define PCI_MEMORY_RANGE_TYPE_MASK 0x0fUL +#define PCI_MEMORY_RANGE_MASK (~0x0fUL) + + u16 prefetchable_memory_base, prefetchable_memory_limit; +#define PCI_PREF_RANGE_TYPE_MASK 0x0fUL +#define PCI_PREF_RANGE_TYPE_32 0x00 +#define PCI_PREF_RANGE_TYPE_64 0x01 +#define PCI_PREF_RANGE_MASK (~0x0fUL) + + u32 prefetchable_memory_base_upper_32bits; + u32 prefetchable_memory_limit_upper_32bits; + u16 io_base_upper_16bits; + u16 io_limit_upper_16bits; + + /* Same as for type 0. 
*/ + u8 capability_list_offset; + CLIB_PAD_FROM_TO (0x35, 0x37); + + u32 rom_address; + CLIB_PAD_FROM_TO (0x3c, 0x3e); + + u16 bridge_control; +#define PCI_BRIDGE_CTL_PARITY 0x01 /* Enable parity detection on secondary interface */ +#define PCI_BRIDGE_CTL_SERR 0x02 /* The same for SERR forwarding */ +#define PCI_BRIDGE_CTL_NO_ISA 0x04 /* Disable bridging of ISA ports */ +#define PCI_BRIDGE_CTL_VGA 0x08 /* Forward VGA addresses */ +#define PCI_BRIDGE_CTL_MASTER_ABORT 0x20 /* Report master aborts */ +#define PCI_BRIDGE_CTL_BUS_RESET 0x40 /* Secondary bus reset */ +#define PCI_BRIDGE_CTL_FAST_BACK 0x80 /* Fast Back2Back enabled on secondary interface */ + + u8 capability_data[0]; +} pci_config_type1_regs_t; + +always_inline void +pci_config_type1_little_to_host (pci_config_type1_regs_t * r) +{ + int i; + if (!CLIB_ARCH_IS_BIG_ENDIAN) + return; + pci_config_header_little_to_host (&r->header); +#define _(f,t) r->f = clib_byte_swap_##t (r->f) + for (i = 0; i < ARRAY_LEN (r->base_address); i++) + _(base_address[i], u32); + _(secondary_status, u16); + _(memory_base, u16); + _(memory_limit, u16); + _(prefetchable_memory_base, u16); + _(prefetchable_memory_limit, u16); + _(prefetchable_memory_base_upper_32bits, u32); + _(prefetchable_memory_limit_upper_32bits, u32); + _(io_base_upper_16bits, u16); + _(io_limit_upper_16bits, u16); + _(rom_address, u32); + _(bridge_control, u16); +#undef _ +} + +/* Capabilities. */ +typedef enum pci_capability_type +{ + /* Power Management */ + PCI_CAP_ID_PM = 1, + + /* Accelerated Graphics Port */ + PCI_CAP_ID_AGP = 2, + + /* Vital Product Data */ + PCI_CAP_ID_VPD = 3, + + /* Slot Identification */ + PCI_CAP_ID_SLOTID = 4, + + /* Message Signalled Interrupts */ + PCI_CAP_ID_MSI = 5, + + /* CompactPCI HotSwap */ + PCI_CAP_ID_CHSWP = 6, + + /* PCI-X */ + PCI_CAP_ID_PCIX = 7, + + /* Hypertransport. 
*/ + PCI_CAP_ID_HYPERTRANSPORT = 8, + + /* PCI Standard Hot-Plug Controller */ + PCI_CAP_ID_SHPC = 0xc, + + /* PCI Express */ + PCI_CAP_ID_PCIE = 0x10, + + /* MSI-X */ + PCI_CAP_ID_MSIX = 0x11, +} pci_capability_type_t; + +/* Common header for capabilities: an 8-bit capability id followed by the offset of the next capability (as consumed by pci_config_find_capability). */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + enum pci_capability_type type:8; + u8 next_offset;}) pci_capability_regs_t; +/* *INDENT-ON* */ +/* Walk the capability list of the config space image t and return a pointer to the first capability whose id equals cap_type. Returns 0 when the status register lacks PCI_STATUS_CAPABILITY_LIST, when the chain reaches an offset below 0x40, when an id of 0xff is read, or after 48 entries have been examined. Offsets are used relative to t, so t is expected to point at a buffer holding the config space beyond the header - TODO confirm required size with callers. */ +always_inline void * +pci_config_find_capability (pci_config_type0_regs_t * t, int cap_type) +{ + pci_capability_regs_t *c; + u32 next_offset; + u32 ttl = 48; /* iteration bound so a looping list terminates; presumably (256 - 0x40) / 4 possible slots */ + + if (!(t->header.status & PCI_STATUS_CAPABILITY_LIST)) + return 0; + + next_offset = t->first_capability_offset; + while (ttl-- && next_offset >= 0x40) + { + c = (void *) t + (next_offset & ~3); /* low two offset bits are masked off before use */ + if ((u8) c->type == 0xff) + break; + if (c->type == cap_type) + return c; + next_offset = c->next_offset; + } + return 0; +} + +/* Power Management Registers */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u16 capabilities; +#define PCI_PM_CAP_VER_MASK 0x0007 /* Version */ +#define PCI_PM_CAP_PME_CLOCK 0x0008 /* PME clock required */ +#define PCI_PM_CAP_RESERVED 0x0010 /* Reserved field */ +#define PCI_PM_CAP_DSI 0x0020 /* Device specific initialization */ +#define PCI_PM_CAP_AUX_POWER 0x01C0 /* Auxiliary power support mask */ +#define PCI_PM_CAP_D1 0x0200 /* D1 power state support */ +#define PCI_PM_CAP_D2 0x0400 /* D2 power state support */ +#define PCI_PM_CAP_PME 0x0800 /* PME pin supported */ +#define PCI_PM_CAP_PME_MASK 0xF800 /* PME Mask of all supported states */ +#define PCI_PM_CAP_PME_D0 0x0800 /* PME# from D0 */ +#define PCI_PM_CAP_PME_D1 0x1000 /* PME# from D1 */ +#define PCI_PM_CAP_PME_D2 0x2000 /* PME# from D2 */ +#define PCI_PM_CAP_PME_D3 0x4000 /* PME# from D3 (hot) */ +#define PCI_PM_CAP_PME_D3cold 0x8000 /* PME# from D3 (cold) */ + u16 control; +#define PCI_PM_CTRL_STATE_MASK 0x0003 /* Current power state (D0 to D3) */ +#define PCI_PM_CTRL_PME_ENABLE 
0x0100 /* PME pin enable */ +#define PCI_PM_CTRL_DATA_SEL_MASK 0x1e00 /* Data select (??) */ +#define PCI_PM_CTRL_DATA_SCALE_MASK 0x6000 /* Data scale (??) */ +#define PCI_PM_CTRL_PME_STATUS 0x8000 /* PME pin status */ + u8 extensions; +#define PCI_PM_PPB_B2_B3 0x40 /* Stop clock when in D3hot (??) */ +#define PCI_PM_BPCC_ENABLE 0x80 /* Bus power/clock control enable (??) */ + u8 data;}) pci_power_management_regs_t; +/* *INDENT-ON* */ + +/* AGP registers */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u8 version; + u8 rest_of_capability_flags; u32 status; u32 command; + /* Command & status common bits. */ +#define PCI_AGP_RQ_MASK 0xff000000 /* Maximum number of requests - 1 */ +#define PCI_AGP_SBA 0x0200 /* Sideband addressing supported */ +#define PCI_AGP_64BIT 0x0020 /* 64-bit addressing supported */ +#define PCI_AGP_ALLOW_TRANSACTIONS 0x0100 /* Allow processing of AGP transactions */ +#define PCI_AGP_FW 0x0010 /* FW transfers supported/forced */ +#define PCI_AGP_RATE4 0x0004 /* 4x transfer rate supported */ +#define PCI_AGP_RATE2 0x0002 /* 2x transfer rate supported */ +#define PCI_AGP_RATE1 0x0001 /* 1x transfer rate supported */ + }) pci_agp_regs_t; +/* *INDENT-ON* */ + +/* Vital Product Data */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u16 address; +#define PCI_VPD_ADDR_MASK 0x7fff /* Address mask */ +#define PCI_VPD_ADDR_F 0x8000 /* Write 0, 1 indicates completion */ + u32 data;}) pci_vpd_regs_t; +/* *INDENT-ON* */ + +/* Slot Identification */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u8 esr; +#define PCI_SID_ESR_NSLOTS 0x1f /* Number of expansion slots available */ +#define PCI_SID_ESR_FIC 0x20 /* First In Chassis Flag */ + u8 chassis;}) pci_sid_regs_t; +/* *INDENT-ON* */ + +/* Message Signalled Interrupts registers */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u16 flags; +#define 
PCI_MSI_FLAGS_ENABLE (1 << 0) /* MSI feature enabled */ +#define PCI_MSI_FLAGS_GET_MAX_QUEUE_SIZE(x) ((x >> 1) & 0x7) +#define PCI_MSI_FLAGS_MAX_QUEUE_SIZE(x) (((x) & 0x7) << 1) +#define PCI_MSI_FLAGS_GET_QUEUE_SIZE(x) ((x >> 4) & 0x7) +#define PCI_MSI_FLAGS_QUEUE_SIZE(x) (((x) & 0x7) << 4) +#define PCI_MSI_FLAGS_64BIT (1 << 7) /* 64-bit addresses allowed */ +#define PCI_MSI_FLAGS_MASKBIT (1 << 8) /* 64-bit mask bits allowed */ + u32 address; u32 data; u32 mask_bits;}) pci_msi32_regs_t; +/* *INDENT-ON* */ + +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u16 flags; + u32 address[2]; + u32 data; u32 mask_bits;}) pci_msi64_regs_t; +/* *INDENT-ON* */ + +/* CompactPCI Hotswap Register */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u16 control_status; +#define PCI_CHSWP_DHA 0x01 /* Device Hiding Arm */ +#define PCI_CHSWP_EIM 0x02 /* ENUM# Signal Mask */ +#define PCI_CHSWP_PIE 0x04 /* Pending Insert or Extract */ +#define PCI_CHSWP_LOO 0x08 /* LED On / Off */ +#define PCI_CHSWP_PI 0x30 /* Programming Interface */ +#define PCI_CHSWP_EXT 0x40 /* ENUM# status - extraction */ +#define PCI_CHSWP_INS 0x80 /* ENUM# status - insertion */ + }) pci_chswp_regs_t; +/* *INDENT-ON* */ + +/* PCIX registers */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u16 command; +#define PCIX_CMD_DPERR_E 0x0001 /* Data Parity Error Recovery Enable */ +#define PCIX_CMD_ERO 0x0002 /* Enable Relaxed Ordering */ +#define PCIX_CMD_MAX_READ 0x000c /* Max Memory Read Byte Count */ +#define PCIX_CMD_MAX_SPLIT 0x0070 /* Max Outstanding Split Transactions */ +#define PCIX_CMD_VERSION(x) (((x) >> 12) & 3) /* Version */ + u32 status; +#define PCIX_STATUS_DEVFN 0x000000ff /* A copy of devfn */ +#define PCIX_STATUS_BUS 0x0000ff00 /* A copy of bus nr */ +#define PCIX_STATUS_64BIT 0x00010000 /* 64-bit device */ +#define PCIX_STATUS_133MHZ 0x00020000 /* 133 MHz capable */ +#define 
PCIX_STATUS_SPL_DISC 0x00040000 /* Split Completion Discarded */ +#define PCIX_STATUS_UNX_SPL 0x00080000 /* Unexpected Split Completion */ +#define PCIX_STATUS_COMPLEX 0x00100000 /* Device Complexity */ +#define PCIX_STATUS_MAX_READ 0x00600000 /* Designed Max Memory Read Count */ +#define PCIX_STATUS_MAX_SPLIT 0x03800000 /* Designed Max Outstanding Split Transactions */ +#define PCIX_STATUS_MAX_CUM 0x1c000000 /* Designed Max Cumulative Read Size */ +#define PCIX_STATUS_SPL_ERR 0x20000000 /* Rcvd Split Completion Error Msg */ +#define PCIX_STATUS_266MHZ 0x40000000 /* 266 MHz capable */ +#define PCIX_STATUS_533MHZ 0x80000000 /* 533 MHz capable */ + }) pcix_config_regs_t; +/* *INDENT-ON* */ +/* Convert a byte count into a size code computed as min_log2 (bytes) - 7, so 128 maps to 0 and 4096 to 5 if min_log2 is floor(log2) - TODO confirm. Asserts that bytes is a power of two no larger than 4096. NOTE(review): there is no lower-bound check, so values below 128 yield a negative code. */ +static inline int +pcie_size_to_code (int bytes) +{ + ASSERT (is_pow2 (bytes)); + ASSERT (bytes <= 4096); + return min_log2 (bytes) - 7; +} +/* Inverse of pcie_size_to_code: returns 1 << (code + 7) and asserts the result does not exceed 4096. NOTE(review): code itself is not range-checked before the shift. */ +static inline int +pcie_code_to_size (int code) +{ + int size = 1 << (code + 7); + ASSERT (size <= 4096); + return size; +} + +/* PCI Express capability registers */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u16 pcie_capabilities; +#define PCIE_CAP_VERSION(x) (((x) >> 0) & 0xf) +#define PCIE_CAP_DEVICE_TYPE(x) (((x) >> 4) & 0xf) +#define PCIE_DEVICE_TYPE_ENDPOINT 0 +#define PCIE_DEVICE_TYPE_LEGACY_ENDPOINT 1 +#define PCIE_DEVICE_TYPE_ROOT_PORT 4 + /* Upstream/downstream port of PCI Express switch. */ +#define PCIE_DEVICE_TYPE_SWITCH_UPSTREAM 5 +#define PCIE_DEVICE_TYPE_SWITCH_DOWNSTREAM 6 +#define PCIE_DEVICE_TYPE_PCIE_TO_PCI_BRIDGE 7 +#define PCIE_DEVICE_TYPE_PCI_TO_PCIE_BRIDGE 8 + /* Root complex integrated endpoint. 
*/ +#define PCIE_DEVICE_TYPE_ROOT_COMPLEX_ENDPOINT 9 +#define PCIE_DEVICE_TYPE_ROOT_COMPLEX_EVENT_COLLECTOR 10 +#define PCIE_CAP_SLOW_IMPLEMENTED (1 << 8) +#define PCIE_CAP_MSI_IRQ(x) (((x) >> 9) & 0x1f) + u32 dev_capabilities; +#define PCIE_DEVCAP_MAX_PAYLOAD(x) (128 << (((x) >> 0) & 0x7)) +#define PCIE_DEVCAP_PHANTOM_BITS(x) (((x) >> 3) & 0x3) +#define PCIE_DEVCAP_EXTENTED_TAG (1 << 5) +#define PCIE_DEVCAP_L0S 0x1c0 /* L0s Acceptable Latency */ +#define PCIE_DEVCAP_L1 0xe00 /* L1 Acceptable Latency */ +#define PCIE_DEVCAP_ATN_BUT 0x1000 /* Attention Button Present */ +#define PCIE_DEVCAP_ATN_IND 0x2000 /* Attention Indicator Present */ +#define PCIE_DEVCAP_PWR_IND 0x4000 /* Power Indicator Present */ +#define PCIE_DEVCAP_PWR_VAL 0x3fc0000 /* Slot Power Limit Value */ +#define PCIE_DEVCAP_PWR_SCL 0xc000000 /* Slot Power Limit Scale */ + u16 dev_control; +#define PCIE_CTRL_CERE 0x0001 /* Correctable Error Reporting En. */ +#define PCIE_CTRL_NFERE 0x0002 /* Non-Fatal Error Reporting Enable */ +#define PCIE_CTRL_FERE 0x0004 /* Fatal Error Reporting Enable */ +#define PCIE_CTRL_URRE 0x0008 /* Unsupported Request Reporting En. 
*/ +#define PCIE_CTRL_RELAX_EN 0x0010 /* Enable relaxed ordering */ +#define PCIE_CTRL_MAX_PAYLOAD(n) (((n) & 7) << 5) +#define PCIE_CTRL_EXT_TAG 0x0100 /* Extended Tag Field Enable */ +#define PCIE_CTRL_PHANTOM 0x0200 /* Phantom Functions Enable */ +#define PCIE_CTRL_AUX_PME 0x0400 /* Auxiliary Power PM Enable */ +#define PCIE_CTRL_NOSNOOP_EN 0x0800 /* Enable No Snoop */ +#define PCIE_CTRL_MAX_READ_REQUEST(n) (((n) & 7) << 12) + u16 dev_status; +#define PCIE_DEVSTA_AUXPD 0x10 /* AUX Power Detected */ +#define PCIE_DEVSTA_TRPND 0x20 /* Transactions Pending */ + u32 link_capabilities; u16 link_control; u16 link_status; + u32 slot_capabilities; + u16 slot_control; u16 slot_status; u16 root_control; +#define PCIE_RTCTL_SECEE 0x01 /* System Error on Correctable Error */ +#define PCIE_RTCTL_SENFEE 0x02 /* System Error on Non-Fatal Error */ +#define PCIE_RTCTL_SEFEE 0x04 /* System Error on Fatal Error */ +#define PCIE_RTCTL_PMEIE 0x08 /* PME Interrupt Enable */ +#define PCIE_RTCTL_CRSSVE 0x10 /* CRS Software Visibility Enable */ + u16 root_capabilities; + u32 root_status; + u32 dev_capabilities2; + u16 dev_control2; + u16 dev_status2; + u32 link_capabilities2; + u16 link_control2; + u16 link_status2; + u32 slot_capabilities2; u16 slot_control2; + u16 slot_status2;}) pcie_config_regs_t; +/* *INDENT-ON* */ + +/* PCI express extended capabilities. */ +typedef enum pcie_capability_type +{ + PCIE_CAP_ADVANCED_ERROR = 1, + PCIE_CAP_VC = 2, + PCIE_CAP_DSN = 3, + PCIE_CAP_PWR = 4, +} pcie_capability_type_t; + +/* Common header for capabilities. 
*/ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { +enum pcie_capability_type type:16; u16 version: 4; u16 next_capability:12;}) + /* *INDENT-ON* */ +pcie_capability_regs_t; + +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pcie_capability_regs_t header; u32 uncorrectable_status; +#define PCIE_ERROR_UNC_LINK_TRAINING (1 << 0) +#define PCIE_ERROR_UNC_DATA_LINK_PROTOCOL (1 << 4) +#define PCIE_ERROR_UNC_SURPRISE_DOWN (1 << 5) +#define PCIE_ERROR_UNC_POISONED_TLP (1 << 12) +#define PCIE_ERROR_UNC_FLOW_CONTROL (1 << 13) +#define PCIE_ERROR_UNC_COMPLETION_TIMEOUT (1 << 14) +#define PCIE_ERROR_UNC_COMPLETER_ABORT (1 << 15) +#define PCIE_ERROR_UNC_UNEXPECTED_COMPLETION (1 << 16) +#define PCIE_ERROR_UNC_RX_OVERFLOW (1 << 17) +#define PCIE_ERROR_UNC_MALFORMED_TLP (1 << 18) +#define PCIE_ERROR_UNC_CRC_ERROR (1 << 19) +#define PCIE_ERROR_UNC_UNSUPPORTED_REQUEST (1 << 20) + u32 uncorrectable_mask; + u32 uncorrectable_severity; u32 correctable_status; +#define PCIE_ERROR_COR_RX_ERROR (1 << 0) +#define PCIE_ERROR_COR_BAD_TLP (1 << 6) +#define PCIE_ERROR_COR_BAD_DLLP (1 << 7) +#define PCIE_ERROR_COR_REPLAY_ROLLOVER (1 << 8) +#define PCIE_ERROR_COR_REPLAY_TIMER (1 << 12) +#define PCIE_ERROR_COR_ADVISORY (1 << 13) + u32 correctable_mask; + u32 control; + u32 log[4]; + u32 root_command; + u32 root_status; u16 correctable_error_source; + u16 error_source;}) pcie_advanced_error_regs_t; +/* *INDENT-ON* */ + +/* Virtual Channel */ +#define PCI_VC_PORT_REG1 4 +#define PCI_VC_PORT_REG2 8 +#define PCI_VC_PORT_CTRL 12 +#define PCI_VC_PORT_STATUS 14 +#define PCI_VC_RES_CAP 16 +#define PCI_VC_RES_CTRL 20 +#define PCI_VC_RES_STATUS 26 + +/* Power Budgeting */ +#define PCI_PWR_DSR 4 /* Data Select Register */ +#define PCI_PWR_DATA 8 /* Data Register */ +#define PCI_PWR_DATA_BASE(x) ((x) & 0xff) /* Base Power */ +#define PCI_PWR_DATA_SCALE(x) (((x) >> 8) & 3) /* Data Scale */ +#define PCI_PWR_DATA_PM_SUB(x) (((x) >> 10) & 7) /* PM Sub State */ +#define PCI_PWR_DATA_PM_STATE(x) (((x) 
>> 13) & 3) /* PM State */ +#define PCI_PWR_DATA_TYPE(x) (((x) >> 15) & 7) /* Type */ +#define PCI_PWR_DATA_RAIL(x) (((x) >> 18) & 7) /* Power Rail */ +#define PCI_PWR_CAP 12 /* Capability */ +#define PCI_PWR_CAP_BUDGET(x) ((x) & 1) /* Included in system budget */ + +#endif /* included_vlib_pci_config_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/physmem.h b/src/vlib/physmem.h new file mode 100644 index 00000000000..9e7d52a6226 --- /dev/null +++ b/src/vlib/physmem.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * physmem.h: virtual <-> physical memory mapping for VLIB buffers + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_physmem_h +#define included_vlib_physmem_h +/* An address range described by start, end and size; vlib_physmem_is_virtual treats it as the half-open interval [start, end). */ +typedef struct +{ + uword start, end, size; +} vlib_physmem_region_t; +/* State used by the helpers below to convert between pointers, region offsets and per-page table entries. */ +typedef struct +{ + vlib_physmem_region_t virtual; + + uword log2_n_bytes_per_page; + + /* Presumably (1 << log2_n_bytes_per_page) - 1; ANDed with an offset to obtain the byte position within its page. */ + uword page_mask; + /* Vector indexed by page number (offset >> log2_n_bytes_per_page); entries are asserted non-zero before use. */ + u64 *page_table; + + /* is fake physmem */ + u8 is_fake; +} vlib_physmem_main_t; +/* Translate region offset o into page_table[page index] + offset within the page (the physical address, per the function name). Asserts that o is inside the region and that the page has a non-zero table entry. */ +always_inline u64 +vlib_physmem_offset_to_physical (vlib_physmem_main_t * pm, uword o) +{ + uword page_index = o >> pm->log2_n_bytes_per_page; + ASSERT (o < pm->virtual.size); + ASSERT (pm->page_table[page_index] != 0); + return (vec_elt (pm->page_table, page_index) + (o & pm->page_mask)); +} +/* Return non-zero when address p lies in [virtual.start, virtual.end). */ +always_inline int +vlib_physmem_is_virtual (vlib_physmem_main_t * pm, uword p) +{ + return p >= pm->virtual.start && p < pm->virtual.end; +} +/* Convert pointer p into its byte offset from virtual.start; asserts that p lies inside the region. */ +always_inline uword +vlib_physmem_offset_of (vlib_physmem_main_t * pm, void *p) +{ + uword a = pointer_to_uword (p); + uword o; + + ASSERT (vlib_physmem_is_virtual (pm, a)); + o = a - pm->virtual.start; + + /* Offset must fit in 32 bits. 
NOTE(review): o is already a uword, so the comparison below is always true and does not enforce a 32-bit limit - confirm intent. */ + ASSERT ((uword) o == a - pm->virtual.start); + + return o; +} +/* Inverse of vlib_physmem_offset_of: return the pointer at virtual.start + offset; asserts offset < virtual.size. */ +always_inline void * +vlib_physmem_at_offset (vlib_physmem_main_t * pm, uword offset) +{ + ASSERT (offset < pm->virtual.size); + return uword_to_pointer (pm->virtual.start + offset, void *); +} + +#endif /* included_vlib_physmem_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/threads.c b/src/vlib/threads.c new file mode 100644 index 00000000000..c5e58bc001a --- /dev/null +++ b/src/vlib/threads.c @@ -0,0 +1,1492 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#define _GNU_SOURCE + +#include <signal.h> +#include <math.h> +#include <vppinfra/format.h> +#include <vlib/vlib.h> + +#include <vlib/threads.h> +#include <vlib/unix/cj.h> + + +#if DPDK==1 +#include <rte_config.h> +#include <rte_common.h> +#include <rte_eal.h> +#include <rte_launch.h> +#include <rte_lcore.h> +#endif +DECLARE_CJ_GLOBAL_LOG; + +#define FRAME_QUEUE_NELTS 32 + + +#if DPDK==1 +/* + * Weak definitions of DPDK symbols used in this file. + * Needed for linking test programs without DPDK libs. 
+ */ +unsigned __thread __attribute__ ((weak)) RTE_PER_LCORE (_lcore_id); +struct lcore_config __attribute__ ((weak)) lcore_config[]; +unsigned __attribute__ ((weak)) rte_socket_id (); +int __attribute__ ((weak)) rte_eal_remote_launch (); +#endif +u32 +vl (void *p) +{ + return vec_len (p); +} + +vlib_worker_thread_t *vlib_worker_threads; +vlib_thread_main_t vlib_thread_main; + +uword +os_get_cpu_number (void) +{ + void *sp; + uword n; + u32 len; + + len = vec_len (vlib_thread_stacks); + if (len == 0) + return 0; + + /* Get any old stack address. */ + sp = &sp; + + n = ((uword) sp - (uword) vlib_thread_stacks[0]) + >> VLIB_LOG2_THREAD_STACK_SIZE; + + /* "processes" have their own stacks, and they always run in thread 0 */ + n = n >= len ? 0 : n; + + return n; +} + +uword +os_get_ncpus (void) +{ + u32 len; + + len = vec_len (vlib_thread_stacks); + if (len == 0) + return 1; + else + return len; +} + +void +vlib_set_thread_name (char *name) +{ + int pthread_setname_np (pthread_t __target_thread, const char *__name); + int rv; + pthread_t thread = pthread_self (); + + if (thread) + { + rv = pthread_setname_np (thread, name); + if (rv) + clib_warning ("pthread_setname_np returned %d", rv); + } +} + +static int +sort_registrations_by_no_clone (void *a0, void *a1) +{ + vlib_thread_registration_t **tr0 = a0; + vlib_thread_registration_t **tr1 = a1; + + return ((i32) ((*tr0)->no_data_structure_clone) + - ((i32) ((*tr1)->no_data_structure_clone))); +} + +static uword * +vlib_sysfs_list_to_bitmap (char *filename) +{ + FILE *fp; + uword *r = 0; + + fp = fopen (filename, "r"); + + if (fp != NULL) + { + u8 *buffer = 0; + vec_validate (buffer, 256 - 1); + if (fgets ((char *) buffer, 256, fp)) + { + unformat_input_t in; + unformat_init_string (&in, (char *) buffer, + strlen ((char *) buffer)); + if (unformat (&in, "%U", unformat_bitmap_list, &r) != 1) + clib_warning ("unformat_bitmap_list failed"); + unformat_free (&in); + } + vec_free (buffer); + fclose (fp); + } + return r; +} + 
+ +/* Called early in the init sequence */ + +clib_error_t * +vlib_thread_init (vlib_main_t * vm) +{ + vlib_thread_main_t *tm = &vlib_thread_main; + vlib_worker_thread_t *w; + vlib_thread_registration_t *tr; + u32 n_vlib_mains = 1; + u32 first_index = 1; + u32 i; + uword *avail_cpu; + + /* get bitmaps of active cpu cores and sockets */ + tm->cpu_core_bitmap = + vlib_sysfs_list_to_bitmap ("/sys/devices/system/cpu/online"); + tm->cpu_socket_bitmap = + vlib_sysfs_list_to_bitmap ("/sys/devices/system/node/online"); + + avail_cpu = clib_bitmap_dup (tm->cpu_core_bitmap); + + /* skip cores */ + for (i = 0; i < tm->skip_cores; i++) + { + uword c = clib_bitmap_first_set (avail_cpu); + if (c == ~0) + return clib_error_return (0, "no available cpus to skip"); + + avail_cpu = clib_bitmap_set (avail_cpu, c, 0); + } + + /* grab cpu for main thread */ + if (!tm->main_lcore) + { + tm->main_lcore = clib_bitmap_first_set (avail_cpu); + if (tm->main_lcore == (u8) ~ 0) + return clib_error_return (0, "no available cpus to be used for the" + " main thread"); + } + else + { + if (clib_bitmap_get (avail_cpu, tm->main_lcore) == 0) + return clib_error_return (0, "cpu %u is not available to be used" + " for the main thread", tm->main_lcore); + } + avail_cpu = clib_bitmap_set (avail_cpu, tm->main_lcore, 0); + + /* assume that there is socket 0 only if there is no data from sysfs */ + if (!tm->cpu_socket_bitmap) + tm->cpu_socket_bitmap = clib_bitmap_set (0, 0, 1); + + /* pin main thread to main_lcore */ +#if DPDK==0 + { + cpu_set_t cpuset; + CPU_ZERO (&cpuset); + CPU_SET (tm->main_lcore, &cpuset); + pthread_setaffinity_np (pthread_self (), sizeof (cpu_set_t), &cpuset); + } +#endif + + /* as many threads as stacks... 
*/ + vec_validate_aligned (vlib_worker_threads, vec_len (vlib_thread_stacks) - 1, + CLIB_CACHE_LINE_BYTES); + + /* Preallocate thread 0 */ + _vec_len (vlib_worker_threads) = 1; + w = vlib_worker_threads; + w->thread_mheap = clib_mem_get_heap (); + w->thread_stack = vlib_thread_stacks[0]; + w->lcore_id = tm->main_lcore; + w->lwp = syscall (SYS_gettid); + w->thread_id = pthread_self (); + tm->n_vlib_mains = 1; + + if (tm->sched_policy != ~0) + { + struct sched_param sched_param; + if (!sched_getparam (w->lwp, &sched_param)) + { + if (tm->sched_priority != ~0) + sched_param.sched_priority = tm->sched_priority; + sched_setscheduler (w->lwp, tm->sched_policy, &sched_param); + } + } + + /* assign threads to cores and set n_vlib_mains */ + tr = tm->next; + + while (tr) + { + vec_add1 (tm->registrations, tr); + tr = tr->next; + } + + vec_sort_with_function (tm->registrations, sort_registrations_by_no_clone); + + for (i = 0; i < vec_len (tm->registrations); i++) + { + int j; + tr = tm->registrations[i]; + tr->first_index = first_index; + first_index += tr->count; + n_vlib_mains += (tr->no_data_structure_clone == 0) ? 
tr->count : 0; + + /* construct coremask */ + if (tr->use_pthreads || !tr->count) + continue; + + if (tr->coremask) + { + uword c; + /* *INDENT-OFF* */ + clib_bitmap_foreach (c, tr->coremask, ({ + if (clib_bitmap_get(avail_cpu, c) == 0) + return clib_error_return (0, "cpu %u is not available to be used" + " for the '%s' thread",c, tr->name); + + avail_cpu = clib_bitmap_set(avail_cpu, c, 0); + })); +/* *INDENT-ON* */ + + } + else + { + for (j = 0; j < tr->count; j++) + { + uword c = clib_bitmap_first_set (avail_cpu); + if (c == ~0) + return clib_error_return (0, + "no available cpus to be used for" + " the '%s' thread", tr->name); + + avail_cpu = clib_bitmap_set (avail_cpu, c, 0); + tr->coremask = clib_bitmap_set (tr->coremask, c, 1); + } + } + } + + clib_bitmap_free (avail_cpu); + + tm->n_vlib_mains = n_vlib_mains; + + vec_validate_aligned (vlib_worker_threads, first_index - 1, + CLIB_CACHE_LINE_BYTES); + + return 0; +} + +vlib_worker_thread_t * +vlib_alloc_thread (vlib_main_t * vm) +{ + vlib_worker_thread_t *w; + + if (vec_len (vlib_worker_threads) >= vec_len (vlib_thread_stacks)) + { + clib_warning ("out of worker threads... 
Quitting..."); + exit (1); + } + vec_add2 (vlib_worker_threads, w, 1); + w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; + return w; +} + +vlib_frame_queue_t * +vlib_frame_queue_alloc (int nelts) +{ + vlib_frame_queue_t *fq; + + fq = clib_mem_alloc_aligned (sizeof (*fq), CLIB_CACHE_LINE_BYTES); + memset (fq, 0, sizeof (*fq)); + fq->nelts = nelts; + fq->vector_threshold = 128; // packets + vec_validate_aligned (fq->elts, nelts - 1, CLIB_CACHE_LINE_BYTES); + + if (1) + { + if (((uword) & fq->tail) & (CLIB_CACHE_LINE_BYTES - 1)) + fformat (stderr, "WARNING: fq->tail unaligned\n"); + if (((uword) & fq->head) & (CLIB_CACHE_LINE_BYTES - 1)) + fformat (stderr, "WARNING: fq->head unaligned\n"); + if (((uword) fq->elts) & (CLIB_CACHE_LINE_BYTES - 1)) + fformat (stderr, "WARNING: fq->elts unaligned\n"); + + if (sizeof (fq->elts[0]) % CLIB_CACHE_LINE_BYTES) + fformat (stderr, "WARNING: fq->elts[0] size %d\n", + sizeof (fq->elts[0])); + if (nelts & (nelts - 1)) + { + fformat (stderr, "FATAL: nelts MUST be a power of 2\n"); + abort (); + } + } + + return (fq); +} + +void vl_msg_api_handler_no_free (void *) __attribute__ ((weak)); +void +vl_msg_api_handler_no_free (void *v) +{ +} + +/* Turned off, save as reference material... 
*/ +#if 0 +static inline int +vlib_frame_queue_dequeue_internal (int thread_id, + vlib_main_t * vm, vlib_node_main_t * nm) +{ + vlib_frame_queue_t *fq = vlib_frame_queues[thread_id]; + vlib_frame_queue_elt_t *elt; + vlib_frame_t *f; + vlib_pending_frame_t *p; + vlib_node_runtime_t *r; + u32 node_runtime_index; + int msg_type; + u64 before; + int processed = 0; + + ASSERT (vm == vlib_mains[thread_id]); + + while (1) + { + if (fq->head == fq->tail) + return processed; + + elt = fq->elts + ((fq->head + 1) & (fq->nelts - 1)); + + if (!elt->valid) + return processed; + + before = clib_cpu_time_now (); + + f = elt->frame; + node_runtime_index = elt->node_runtime_index; + msg_type = elt->msg_type; + + switch (msg_type) + { + case VLIB_FRAME_QUEUE_ELT_FREE_BUFFERS: + vlib_buffer_free (vm, vlib_frame_vector_args (f), f->n_vectors); + /* note fallthrough... */ + case VLIB_FRAME_QUEUE_ELT_FREE_FRAME: + r = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL], + node_runtime_index); + vlib_frame_free (vm, r, f); + break; + case VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME: + vec_add2 (vm->node_main.pending_frames, p, 1); + f->flags |= (VLIB_FRAME_PENDING | VLIB_FRAME_FREE_AFTER_DISPATCH); + p->node_runtime_index = elt->node_runtime_index; + p->frame_index = vlib_frame_index (vm, f); + p->next_frame_index = VLIB_PENDING_FRAME_NO_NEXT_FRAME; + fq->dequeue_vectors += (u64) f->n_vectors; + break; + case VLIB_FRAME_QUEUE_ELT_API_MSG: + vl_msg_api_handler_no_free (f); + break; + default: + clib_warning ("bogus frame queue message, type %d", msg_type); + break; + } + elt->valid = 0; + fq->dequeues++; + fq->dequeue_ticks += clib_cpu_time_now () - before; + CLIB_MEMORY_BARRIER (); + fq->head++; + processed++; + } + ASSERT (0); + return processed; +} + +int +vlib_frame_queue_dequeue (int thread_id, + vlib_main_t * vm, vlib_node_main_t * nm) +{ + return vlib_frame_queue_dequeue_internal (thread_id, vm, nm); +} + +int +vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index, + 
u32 frame_queue_index, vlib_frame_t * frame, + vlib_frame_queue_msg_type_t type) +{ + vlib_frame_queue_t *fq = vlib_frame_queues[frame_queue_index]; + vlib_frame_queue_elt_t *elt; + u32 save_count; + u64 new_tail; + u64 before = clib_cpu_time_now (); + + ASSERT (fq); + + new_tail = __sync_add_and_fetch (&fq->tail, 1); + + /* Wait until a ring slot is available */ + while (new_tail >= fq->head + fq->nelts) + { + f64 b4 = vlib_time_now_ticks (vm, before); + vlib_worker_thread_barrier_check (vm, b4); + /* Bad idea. Dequeue -> enqueue -> dequeue -> trouble */ + // vlib_frame_queue_dequeue (vm->cpu_index, vm, nm); + } + + elt = fq->elts + (new_tail & (fq->nelts - 1)); + + /* this would be very bad... */ + while (elt->valid) + { + } + + /* Once we enqueue the frame, frame->n_vectors is owned elsewhere... */ + save_count = frame->n_vectors; + + elt->frame = frame; + elt->node_runtime_index = node_runtime_index; + elt->msg_type = type; + CLIB_MEMORY_BARRIER (); + elt->valid = 1; + + return save_count; +} +#endif /* 0 */ + +/* To be called by vlib worker threads upon startup */ +void +vlib_worker_thread_init (vlib_worker_thread_t * w) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + + /* + * Note: disabling signals in worker threads as follows + * prevents the api post-mortem dump scheme from working + * { + * sigset_t s; + * sigfillset (&s); + * pthread_sigmask (SIG_SETMASK, &s, 0); + * } + */ + + clib_mem_set_heap (w->thread_mheap); + + if (vec_len (tm->thread_prefix) && w->registration->short_name) + { + w->name = format (0, "%v_%s_%d%c", tm->thread_prefix, + w->registration->short_name, w->instance_id, '\0'); + vlib_set_thread_name ((char *) w->name); + } + + if (!w->registration->use_pthreads) + { + + /* Initial barrier sync, for both worker and i/o threads */ + clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, 1); + + while (*vlib_worker_threads->wait_at_barrier) + ; + + clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, -1); + } +} + 
+void * +vlib_worker_thread_bootstrap_fn (void *arg) +{ + void *rv; + vlib_worker_thread_t *w = arg; + + w->lwp = syscall (SYS_gettid); + w->thread_id = pthread_self (); + + rv = (void *) clib_calljmp + ((uword (*)(uword)) w->thread_function, + (uword) arg, w->thread_stack + VLIB_THREAD_STACK_SIZE); + /* NOTREACHED, we hope */ + return rv; +} + +static int +vlib_launch_thread (void *fp, vlib_worker_thread_t * w, unsigned lcore_id) +{ + void *(*fp_arg) (void *) = fp; + + w->lcore_id = lcore_id; +#if DPDK==1 + if (!w->registration->use_pthreads) + if (rte_eal_remote_launch) /* do we have dpdk linked */ + return rte_eal_remote_launch (fp, (void *) w, lcore_id); + else + return -1; + else +#endif + { + int ret; + pthread_t worker; + cpu_set_t cpuset; + CPU_ZERO (&cpuset); + CPU_SET (lcore_id, &cpuset); + + ret = pthread_create (&worker, NULL /* attr */ , fp_arg, (void *) w); + if (ret == 0) + return pthread_setaffinity_np (worker, sizeof (cpu_set_t), &cpuset); + else + return ret; + } +} + +static clib_error_t * +start_workers (vlib_main_t * vm) +{ + int i, j; + vlib_worker_thread_t *w; + vlib_main_t *vm_clone; + void *oldheap; + vlib_thread_main_t *tm = &vlib_thread_main; + vlib_thread_registration_t *tr; + vlib_node_runtime_t *rt; + u32 n_vlib_mains = tm->n_vlib_mains; + u32 worker_thread_index; + u8 *main_heap = clib_mem_get_per_cpu_heap (); + mheap_t *main_heap_header = mheap_header (main_heap); + + vec_reset_length (vlib_worker_threads); + + /* Set up the main thread */ + vec_add2_aligned (vlib_worker_threads, w, 1, CLIB_CACHE_LINE_BYTES); + w->elog_track.name = "main thread"; + elog_track_register (&vm->elog_main, &w->elog_track); + + if (vec_len (tm->thread_prefix)) + { + w->name = format (0, "%v_main%c", tm->thread_prefix, '\0'); + vlib_set_thread_name ((char *) w->name); + } + + /* + * Truth of the matter: we always use at least two + * threads. So, make the main heap thread-safe + * and make the event log thread-safe. 
+ */ + main_heap_header->flags |= MHEAP_FLAG_THREAD_SAFE; + vm->elog_main.lock = + clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES); + vm->elog_main.lock[0] = 0; + + if (n_vlib_mains > 1) + { + vec_validate (vlib_mains, tm->n_vlib_mains - 1); + _vec_len (vlib_mains) = 0; + vec_add1 (vlib_mains, vm); + + vlib_worker_threads->wait_at_barrier = + clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES); + vlib_worker_threads->workers_at_barrier = + clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES); + + /* Ask for an initial barrier sync */ + *vlib_worker_threads->workers_at_barrier = 0; + *vlib_worker_threads->wait_at_barrier = 1; + + worker_thread_index = 1; + + for (i = 0; i < vec_len (tm->registrations); i++) + { + vlib_node_main_t *nm, *nm_clone; + vlib_buffer_main_t *bm_clone; + vlib_buffer_free_list_t *fl_clone, *fl_orig; + vlib_buffer_free_list_t *orig_freelist_pool; + int k; + + tr = tm->registrations[i]; + + if (tr->count == 0) + continue; + + for (k = 0; k < tr->count; k++) + { + vec_add2 (vlib_worker_threads, w, 1); + if (tr->mheap_size) + w->thread_mheap = + mheap_alloc (0 /* use VM */ , tr->mheap_size); + else + w->thread_mheap = main_heap; + w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; + w->thread_function = tr->function; + w->thread_function_arg = w; + w->instance_id = k; + w->registration = tr; + + w->elog_track.name = + (char *) format (0, "%s %d", tr->name, k + 1); + vec_add1 (w->elog_track.name, 0); + elog_track_register (&vm->elog_main, &w->elog_track); + + if (tr->no_data_structure_clone) + continue; + + /* Fork vlib_global_main et al. 
Look for bugs here */ + oldheap = clib_mem_set_heap (w->thread_mheap); + + vm_clone = clib_mem_alloc (sizeof (*vm_clone)); + clib_memcpy (vm_clone, vlib_mains[0], sizeof (*vm_clone)); + + vm_clone->cpu_index = worker_thread_index; + vm_clone->heap_base = w->thread_mheap; + vm_clone->mbuf_alloc_list = 0; + memset (&vm_clone->random_buffer, 0, + sizeof (vm_clone->random_buffer)); + + nm = &vlib_mains[0]->node_main; + nm_clone = &vm_clone->node_main; + /* fork next frames array, preserving node runtime indices */ + nm_clone->next_frames = vec_dup (nm->next_frames); + for (j = 0; j < vec_len (nm_clone->next_frames); j++) + { + vlib_next_frame_t *nf = &nm_clone->next_frames[j]; + u32 save_node_runtime_index; + u32 save_flags; + + save_node_runtime_index = nf->node_runtime_index; + save_flags = nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH; + vlib_next_frame_init (nf); + nf->node_runtime_index = save_node_runtime_index; + nf->flags = save_flags; + } + + /* fork the frame dispatch queue */ + nm_clone->pending_frames = 0; + vec_validate (nm_clone->pending_frames, 10); /* $$$$$?????? 
*/ + _vec_len (nm_clone->pending_frames) = 0; + + /* fork nodes */ + nm_clone->nodes = 0; + for (j = 0; j < vec_len (nm->nodes); j++) + { + vlib_node_t *n; + n = clib_mem_alloc_no_fail (sizeof (*n)); + clib_memcpy (n, nm->nodes[j], sizeof (*n)); + /* none of the copied nodes have enqueue rights given out */ + n->owner_node_index = VLIB_INVALID_NODE_INDEX; + memset (&n->stats_total, 0, sizeof (n->stats_total)); + memset (&n->stats_last_clear, 0, + sizeof (n->stats_last_clear)); + vec_add1 (nm_clone->nodes, n); + } + nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] = + vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]); + + nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] = + vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]); + vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) + rt->cpu_index = vm_clone->cpu_index; + + nm_clone->processes = vec_dup (nm->processes); + + /* zap the (per worker) frame freelists, etc */ + nm_clone->frame_sizes = 0; + nm_clone->frame_size_hash = 0; + + /* Packet trace buffers are guaranteed to be empty, nothing to do here */ + + clib_mem_set_heap (oldheap); + vec_add1 (vlib_mains, vm_clone); + + vm_clone->error_main.counters = + vec_dup (vlib_mains[0]->error_main.counters); + vm_clone->error_main.counters_last_clear = + vec_dup (vlib_mains[0]->error_main.counters_last_clear); + + /* Fork the vlib_buffer_main_t free lists, etc. 
*/ + bm_clone = vec_dup (vm_clone->buffer_main); + vm_clone->buffer_main = bm_clone; + + orig_freelist_pool = bm_clone->buffer_free_list_pool; + bm_clone->buffer_free_list_pool = 0; + + /* *INDENT-OFF* */ + pool_foreach (fl_orig, orig_freelist_pool, + ({ + pool_get_aligned (bm_clone->buffer_free_list_pool, + fl_clone, CLIB_CACHE_LINE_BYTES); + ASSERT (fl_orig - orig_freelist_pool + == fl_clone - bm_clone->buffer_free_list_pool); + + fl_clone[0] = fl_orig[0]; + fl_clone->aligned_buffers = 0; + fl_clone->unaligned_buffers = 0; + fl_clone->n_alloc = 0; + })); +/* *INDENT-ON* */ + + worker_thread_index++; + } + } + } + else + { + /* only have non-data-structure copy threads to create... */ + for (i = 0; i < vec_len (tm->registrations); i++) + { + tr = tm->registrations[i]; + + for (j = 0; j < tr->count; j++) + { + vec_add2 (vlib_worker_threads, w, 1); + if (tr->mheap_size) + w->thread_mheap = + mheap_alloc (0 /* use VM */ , tr->mheap_size); + else + w->thread_mheap = main_heap; + w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; + w->thread_function = tr->function; + w->thread_function_arg = w; + w->instance_id = j; + w->elog_track.name = + (char *) format (0, "%s %d", tr->name, j + 1); + w->registration = tr; + vec_add1 (w->elog_track.name, 0); + elog_track_register (&vm->elog_main, &w->elog_track); + } + } + } + + worker_thread_index = 1; + + for (i = 0; i < vec_len (tm->registrations); i++) + { + int j; + + tr = tm->registrations[i]; + + if (tr->use_pthreads || tm->use_pthreads) + { + for (j = 0; j < tr->count; j++) + { + w = vlib_worker_threads + worker_thread_index++; + if (vlib_launch_thread (vlib_worker_thread_bootstrap_fn, w, 0) < + 0) + clib_warning ("Couldn't start '%s' pthread ", tr->name); + } + } + else + { + uword c; + /* *INDENT-OFF* */ + clib_bitmap_foreach (c, tr->coremask, ({ + w = vlib_worker_threads + worker_thread_index++; + if (vlib_launch_thread (vlib_worker_thread_bootstrap_fn, w, c) < 0) + clib_warning ("Couldn't start DPDK lcore 
%d", c); + + })); +/* *INDENT-ON* */ + } + } + vlib_worker_thread_barrier_sync (vm); + vlib_worker_thread_barrier_release (vm); + return 0; +} + +VLIB_MAIN_LOOP_ENTER_FUNCTION (start_workers); + +void +vlib_worker_thread_node_runtime_update (void) +{ + int i, j; + vlib_worker_thread_t *w; + vlib_main_t *vm; + vlib_node_main_t *nm, *nm_clone; + vlib_node_t **old_nodes_clone; + vlib_main_t *vm_clone; + vlib_node_runtime_t *rt, *old_rt; + void *oldheap; + never_inline void + vlib_node_runtime_sync_stats (vlib_main_t * vm, + vlib_node_runtime_t * r, + uword n_calls, + uword n_vectors, uword n_clocks); + + ASSERT (os_get_cpu_number () == 0); + + if (vec_len (vlib_mains) == 0) + return; + + vm = vlib_mains[0]; + nm = &vm->node_main; + + ASSERT (os_get_cpu_number () == 0); + ASSERT (*vlib_worker_threads->wait_at_barrier == 1); + + /* + * Scrape all runtime stats, so we don't lose node runtime(s) with + * pending counts, or throw away worker / io thread counts. + */ + for (j = 0; j < vec_len (nm->nodes); j++) + { + vlib_node_t *n; + n = nm->nodes[j]; + vlib_node_sync_stats (vm, n); + } + + for (i = 1; i < vec_len (vlib_mains); i++) + { + vlib_node_t *n; + + vm_clone = vlib_mains[i]; + nm_clone = &vm_clone->node_main; + + for (j = 0; j < vec_len (nm_clone->nodes); j++) + { + n = nm_clone->nodes[j]; + + rt = vlib_node_get_runtime (vm_clone, n->index); + vlib_node_runtime_sync_stats (vm_clone, rt, 0, 0, 0); + } + } + + for (i = 1; i < vec_len (vlib_mains); i++) + { + vlib_node_runtime_t *rt; + w = vlib_worker_threads + i; + oldheap = clib_mem_set_heap (w->thread_mheap); + + vm_clone = vlib_mains[i]; + + /* Re-clone error heap */ + u64 *old_counters = vm_clone->error_main.counters; + u64 *old_counters_all_clear = vm_clone->error_main.counters_last_clear; + clib_memcpy (&vm_clone->error_main, &vm->error_main, + sizeof (vm->error_main)); + j = vec_len (vm->error_main.counters) - 1; + vec_validate_aligned (old_counters, j, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned 
(old_counters_all_clear, j, CLIB_CACHE_LINE_BYTES); + vm_clone->error_main.counters = old_counters; + vm_clone->error_main.counters_last_clear = old_counters_all_clear; + + nm_clone = &vm_clone->node_main; + vec_free (nm_clone->next_frames); + nm_clone->next_frames = vec_dup (nm->next_frames); + + for (j = 0; j < vec_len (nm_clone->next_frames); j++) + { + vlib_next_frame_t *nf = &nm_clone->next_frames[j]; + u32 save_node_runtime_index; + u32 save_flags; + + save_node_runtime_index = nf->node_runtime_index; + save_flags = nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH; + vlib_next_frame_init (nf); + nf->node_runtime_index = save_node_runtime_index; + nf->flags = save_flags; + } + + old_nodes_clone = nm_clone->nodes; + nm_clone->nodes = 0; + + /* re-fork nodes */ + for (j = 0; j < vec_len (nm->nodes); j++) + { + vlib_node_t *old_n_clone; + vlib_node_t *new_n, *new_n_clone; + + new_n = nm->nodes[j]; + old_n_clone = old_nodes_clone[j]; + + new_n_clone = clib_mem_alloc_no_fail (sizeof (*new_n_clone)); + clib_memcpy (new_n_clone, new_n, sizeof (*new_n)); + /* none of the copied nodes have enqueue rights given out */ + new_n_clone->owner_node_index = VLIB_INVALID_NODE_INDEX; + + if (j >= vec_len (old_nodes_clone)) + { + /* new node, set to zero */ + memset (&new_n_clone->stats_total, 0, + sizeof (new_n_clone->stats_total)); + memset (&new_n_clone->stats_last_clear, 0, + sizeof (new_n_clone->stats_last_clear)); + } + else + { + /* Copy stats if the old data is valid */ + clib_memcpy (&new_n_clone->stats_total, + &old_n_clone->stats_total, + sizeof (new_n_clone->stats_total)); + clib_memcpy (&new_n_clone->stats_last_clear, + &old_n_clone->stats_last_clear, + sizeof (new_n_clone->stats_last_clear)); + + /* keep previous node state */ + new_n_clone->state = old_n_clone->state; + } + vec_add1 (nm_clone->nodes, new_n_clone); + } + /* Free the old node clone */ + for (j = 0; j < vec_len (old_nodes_clone); j++) + clib_mem_free (old_nodes_clone[j]); + vec_free (old_nodes_clone); + 
+ vec_free (nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]); + + nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] = + vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]); + + /* clone input node runtime */ + old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]; + + nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] = + vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]); + + vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) + { + rt->cpu_index = vm_clone->cpu_index; + } + + for (j = 0; j < vec_len (old_rt); j++) + { + rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index); + rt->state = old_rt[j].state; + } + + vec_free (old_rt); + + nm_clone->processes = vec_dup (nm->processes); + + clib_mem_set_heap (oldheap); + + // vnet_main_fork_fixup (i); + } +} + +u32 +unformat_sched_policy (unformat_input_t * input, va_list * args) +{ + u32 *r = va_arg (*args, u32 *); + + if (0); +#define _(v,f,s) else if (unformat (input, s)) *r = SCHED_POLICY_##f; + foreach_sched_policy +#undef _ + else + return 0; + return 1; +} + +static clib_error_t * +cpu_config (vlib_main_t * vm, unformat_input_t * input) +{ + vlib_thread_registration_t *tr; + uword *p; + vlib_thread_main_t *tm = &vlib_thread_main; + u8 *name; + u64 coremask; + uword *bitmap; + u32 count; + + tm->thread_registrations_by_name = hash_create_string (0, sizeof (uword)); + + tm->n_thread_stacks = 1; /* account for main thread */ + tm->sched_policy = ~0; + tm->sched_priority = ~0; + + tr = tm->next; + + while (tr) + { + hash_set_mem (tm->thread_registrations_by_name, tr->name, (uword) tr); + tr = tr->next; + } + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "use-pthreads")) + tm->use_pthreads = 1; + else if (unformat (input, "thread-prefix %v", &tm->thread_prefix)) + ; + else if (unformat (input, "main-core %u", &tm->main_lcore)) + ; + else if (unformat (input, "skip-cores %u", &tm->skip_cores)) + ; + else if (unformat (input, "coremask-%s %llx", &name, 
&coremask)) + { + p = hash_get_mem (tm->thread_registrations_by_name, name); + if (p == 0) + return clib_error_return (0, "no such thread type '%s'", name); + + tr = (vlib_thread_registration_t *) p[0]; + + if (tr->use_pthreads) + return clib_error_return (0, + "coremask cannot be set for '%s' threads", + name); + + tr->coremask = clib_bitmap_set_multiple + (tr->coremask, 0, coremask, BITS (coremask)); + tr->count = clib_bitmap_count_set_bits (tr->coremask); + } + else if (unformat (input, "corelist-%s %U", &name, unformat_bitmap_list, + &bitmap)) + { + p = hash_get_mem (tm->thread_registrations_by_name, name); + if (p == 0) + return clib_error_return (0, "no such thread type '%s'", name); + + tr = (vlib_thread_registration_t *) p[0]; + + if (tr->use_pthreads) + return clib_error_return (0, + "corelist cannot be set for '%s' threads", + name); + + tr->coremask = bitmap; + tr->count = clib_bitmap_count_set_bits (tr->coremask); + } + else + if (unformat + (input, "scheduler-policy %U", unformat_sched_policy, + &tm->sched_policy)) + ; + else if (unformat (input, "scheduler-priority %u", &tm->sched_priority)) + ; + else if (unformat (input, "%s %u", &name, &count)) + { + p = hash_get_mem (tm->thread_registrations_by_name, name); + if (p == 0) + return clib_error_return (0, "no such thread type 3 '%s'", name); + + tr = (vlib_thread_registration_t *) p[0]; + if (tr->fixed_count) + return clib_error_return + (0, "number of %s threads not configurable", tr->name); + tr->count = count; + } + else + break; + } + + if (tm->sched_priority != ~0) + { + if (tm->sched_policy == SCHED_FIFO || tm->sched_policy == SCHED_RR) + { + u32 prio_max = sched_get_priority_max (tm->sched_policy); + u32 prio_min = sched_get_priority_min (tm->sched_policy); + if (tm->sched_priority > prio_max) + tm->sched_priority = prio_max; + if (tm->sched_priority < prio_min) + tm->sched_priority = prio_min; + } + else + { + return clib_error_return + (0, + "scheduling priority (%d) is not allowed for 
`normal` scheduling policy", + tm->sched_priority); + } + } + tr = tm->next; + + if (!tm->thread_prefix) + tm->thread_prefix = format (0, "vpp"); + + while (tr) + { + tm->n_thread_stacks += tr->count; + tm->n_pthreads += tr->count * tr->use_pthreads; + tm->n_eal_threads += tr->count * (tr->use_pthreads == 0); + tr = tr->next; + } + + return 0; +} + +VLIB_EARLY_CONFIG_FUNCTION (cpu_config, "cpu"); + +#if !defined (__x86_64__) && !defined (__aarch64__) && !defined (__powerpc64__) && !defined(__arm__) +void +__sync_fetch_and_add_8 (void) +{ + fformat (stderr, "%s called\n", __FUNCTION__); + abort (); +} + +void +__sync_add_and_fetch_8 (void) +{ + fformat (stderr, "%s called\n", __FUNCTION__); + abort (); +} +#endif + +void vnet_main_fixup (vlib_fork_fixup_t which) __attribute__ ((weak)); +void +vnet_main_fixup (vlib_fork_fixup_t which) +{ +} + +void +vlib_worker_thread_fork_fixup (vlib_fork_fixup_t which) +{ + vlib_main_t *vm = vlib_get_main (); + + if (vlib_mains == 0) + return; + + ASSERT (os_get_cpu_number () == 0); + vlib_worker_thread_barrier_sync (vm); + + switch (which) + { + case VLIB_WORKER_THREAD_FORK_FIXUP_NEW_SW_IF_INDEX: + vnet_main_fixup (VLIB_WORKER_THREAD_FORK_FIXUP_NEW_SW_IF_INDEX); + break; + + default: + ASSERT (0); + } + vlib_worker_thread_barrier_release (vm); +} + +void +vlib_worker_thread_barrier_sync (vlib_main_t * vm) +{ + f64 deadline; + u32 count; + + if (!vlib_mains) + return; + + count = vec_len (vlib_mains) - 1; + + /* Tolerate recursive calls */ + if (++vlib_worker_threads[0].recursion_level > 1) + return; + + vlib_worker_threads[0].barrier_sync_count++; + + ASSERT (os_get_cpu_number () == 0); + + deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT; + + *vlib_worker_threads->wait_at_barrier = 1; + while (*vlib_worker_threads->workers_at_barrier != count) + { + if (vlib_time_now (vm) > deadline) + { + fformat (stderr, "%s: worker thread deadlock\n", __FUNCTION__); + os_panic (); + } + } +} + +void +vlib_worker_thread_barrier_release 
(vlib_main_t * vm) +{ + f64 deadline; + + if (!vlib_mains) + return; + + if (--vlib_worker_threads[0].recursion_level > 0) + return; + + deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT; + + *vlib_worker_threads->wait_at_barrier = 0; + + while (*vlib_worker_threads->workers_at_barrier > 0) + { + if (vlib_time_now (vm) > deadline) + { + fformat (stderr, "%s: worker thread deadlock\n", __FUNCTION__); + os_panic (); + } + } +} + +/* + * Check the frame queue to see if any frames are available. + * If so, pull the packets off the frames and put them to + * the handoff node. + */ +static inline int +vlib_frame_queue_dequeue_internal (vlib_main_t * vm, + vlib_frame_queue_main_t * fqm) +{ + u32 thread_id = vm->cpu_index; + vlib_frame_queue_t *fq = fqm->vlib_frame_queues[thread_id]; + vlib_frame_queue_elt_t *elt; + u32 *from, *to; + vlib_frame_t *f; + int msg_type; + int processed = 0; + u32 n_left_to_node; + u32 vectors = 0; + + ASSERT (fq); + ASSERT (vm == vlib_mains[thread_id]); + + if (PREDICT_FALSE (fqm->node_index == ~0)) + return 0; + /* + * Gather trace data for frame queues + */ + if (PREDICT_FALSE (fq->trace)) + { + frame_queue_trace_t *fqt; + frame_queue_nelt_counter_t *fqh; + u32 elix; + + fqt = &fqm->frame_queue_traces[thread_id]; + + fqt->nelts = fq->nelts; + fqt->head = fq->head; + fqt->head_hint = fq->head_hint; + fqt->tail = fq->tail; + fqt->threshold = fq->vector_threshold; + fqt->n_in_use = fqt->tail - fqt->head; + if (fqt->n_in_use >= fqt->nelts) + { + // if beyond max then use max + fqt->n_in_use = fqt->nelts - 1; + } + + /* Record the number of elements in use in the histogram */ + fqh = &fqm->frame_queue_histogram[thread_id]; + fqh->count[fqt->n_in_use]++; + + /* Record a snapshot of the elements in use */ + for (elix = 0; elix < fqt->nelts; elix++) + { + elt = fq->elts + ((fq->head + 1 + elix) & (fq->nelts - 1)); + if (1 || elt->valid) + { + fqt->n_vectors[elix] = elt->n_vectors; + } + } + fqt->written = 1; + } + + while (1) + { + if (fq->head 
== fq->tail) + { + fq->head_hint = fq->head; + return processed; + } + + elt = fq->elts + ((fq->head + 1) & (fq->nelts - 1)); + + if (!elt->valid) + { + fq->head_hint = fq->head; + return processed; + } + + from = elt->buffer_index; + msg_type = elt->msg_type; + + ASSERT (msg_type == VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME); + ASSERT (elt->n_vectors <= VLIB_FRAME_SIZE); + + f = vlib_get_frame_to_node (vm, fqm->node_index); + + to = vlib_frame_vector_args (f); + + n_left_to_node = elt->n_vectors; + + while (n_left_to_node >= 4) + { + to[0] = from[0]; + to[1] = from[1]; + to[2] = from[2]; + to[3] = from[3]; + to += 4; + from += 4; + n_left_to_node -= 4; + } + + while (n_left_to_node > 0) + { + to[0] = from[0]; + to++; + from++; + n_left_to_node--; + } + + vectors += elt->n_vectors; + f->n_vectors = elt->n_vectors; + vlib_put_frame_to_node (vm, fqm->node_index, f); + + elt->valid = 0; + elt->n_vectors = 0; + elt->msg_type = 0xfefefefe; + CLIB_MEMORY_BARRIER (); + fq->head++; + processed++; + + /* + * Limit the number of packets pushed into the graph + */ + if (vectors >= fq->vector_threshold) + { + fq->head_hint = fq->head; + return processed; + } + } + ASSERT (0); + return processed; +} + +static_always_inline void +vlib_worker_thread_internal (vlib_main_t * vm) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); + u64 cpu_time_now = clib_cpu_time_now (); + vlib_frame_queue_main_t *fqm; + + vec_alloc (nm->pending_interrupt_node_runtime_indices, 32); + + while (1) + { + vlib_worker_thread_barrier_check (); + + vec_foreach (fqm, tm->frame_queue_mains) + vlib_frame_queue_dequeue_internal (vm, fqm); + + vlib_node_runtime_t *n; + vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]) + { + cpu_time_now = dispatch_node (vm, n, VLIB_NODE_TYPE_INPUT, + VLIB_NODE_STATE_POLLING, /* frame */ 0, + cpu_time_now); + } + + /* Next handle interrupts. 
*/ + { + uword l = _vec_len (nm->pending_interrupt_node_runtime_indices); + uword i; + if (l > 0) + { + _vec_len (nm->pending_interrupt_node_runtime_indices) = 0; + for (i = 0; i < l; i++) + { + n = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT], + nm-> + pending_interrupt_node_runtime_indices + [i]); + cpu_time_now = + dispatch_node (vm, n, VLIB_NODE_TYPE_INPUT, + VLIB_NODE_STATE_INTERRUPT, + /* frame */ 0, + cpu_time_now); + } + } + } + + if (_vec_len (nm->pending_frames)) + { + int i; + cpu_time_now = clib_cpu_time_now (); + for (i = 0; i < _vec_len (nm->pending_frames); i++) + { + vlib_pending_frame_t *p; + + p = nm->pending_frames + i; + + cpu_time_now = dispatch_pending_node (vm, p, cpu_time_now); + } + _vec_len (nm->pending_frames) = 0; + } + vlib_increment_main_loop_counter (vm); + + /* Record time stamp in case there are no enabled nodes and above + calls do not update time stamp. */ + cpu_time_now = clib_cpu_time_now (); + } +} + +void +vlib_worker_thread_fn (void *arg) +{ + vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg; + vlib_main_t *vm = vlib_get_main (); + + ASSERT (vm->cpu_index == os_get_cpu_number ()); + + vlib_worker_thread_init (w); + clib_time_init (&vm->clib_time); + clib_mem_set_heap (w->thread_mheap); + +#if DPDK > 0 + /* Wait until the dpdk init sequence is complete */ + vlib_thread_main_t *tm = vlib_get_thread_main (); + while (tm->worker_thread_release == 0) + vlib_worker_thread_barrier_check (); +#endif + + vlib_worker_thread_internal (vm); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_THREAD (worker_thread_reg, static) = { + .name = "workers", + .short_name = "wk", + .function = vlib_worker_thread_fn, +}; +/* *INDENT-ON* */ + +u32 +vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + vlib_frame_queue_main_t *fqm; + vlib_frame_queue_t *fq; + int i; + + if (frame_queue_nelts == 0) + frame_queue_nelts = FRAME_QUEUE_NELTS; + + vec_add2 
(tm->frame_queue_mains, fqm, 1); + + fqm->node_index = node_index; + + vec_validate (fqm->vlib_frame_queues, tm->n_vlib_mains - 1); + _vec_len (fqm->vlib_frame_queues) = 0; + for (i = 0; i < tm->n_vlib_mains; i++) + { + fq = vlib_frame_queue_alloc (frame_queue_nelts); + vec_add1 (fqm->vlib_frame_queues, fq); + } + + return (fqm - tm->frame_queue_mains); +} + +clib_error_t * +threads_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (threads_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/threads.h b/src/vlib/threads.h new file mode 100644 index 00000000000..34ab5be8650 --- /dev/null +++ b/src/vlib/threads.h @@ -0,0 +1,470 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef included_vlib_threads_h +#define included_vlib_threads_h + +#include <vlib/main.h> +#include <linux/sched.h> + +extern vlib_main_t **vlib_mains; + +void vlib_set_thread_name (char *name); + +/* arg is actually a vlib__thread_t * */ +typedef void (vlib_thread_function_t) (void *arg); + +typedef struct vlib_thread_registration_ +{ + /* constructor generated list of thread registrations */ + struct vlib_thread_registration_ *next; + + /* config parameters */ + char *name; + char *short_name; + vlib_thread_function_t *function; + uword mheap_size; + int fixed_count; + u32 count; + int no_data_structure_clone; + u32 frame_queue_nelts; + + /* All threads of this type run on pthreads */ + int use_pthreads; + u32 first_index; + uword *coremask; +} vlib_thread_registration_t; + +/* + * Frames have their cpu / vlib_main_t index in the low-order N bits + * Make VLIB_MAX_CPUS a power-of-two, please... + */ + +#ifndef VLIB_MAX_CPUS +#define VLIB_MAX_CPUS 256 +#endif + +#if VLIB_MAX_CPUS > CLIB_MAX_MHEAPS +#error Please increase number of per-cpu mheaps +#endif + +#define VLIB_CPU_MASK (VLIB_MAX_CPUS - 1) /* 0x3f, max */ +#define VLIB_OFFSET_MASK (~VLIB_CPU_MASK) + +#define VLIB_LOG2_THREAD_STACK_SIZE (20) +#define VLIB_THREAD_STACK_SIZE (1<<VLIB_LOG2_THREAD_STACK_SIZE) + +typedef enum +{ + VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME, +} vlib_frame_queue_msg_type_t; + +typedef struct +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + volatile u32 valid; + u32 msg_type; + u32 n_vectors; + u32 last_n_vectors; + + /* 256 * 4 = 1024 bytes, even mult of cache line size */ + u32 buffer_index[VLIB_FRAME_SIZE]; +} +vlib_frame_queue_elt_t; + +typedef struct +{ + /* First cache line */ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + volatile u32 *wait_at_barrier; + volatile u32 *workers_at_barrier; + + /* Second Cache Line */ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline1); + void *thread_mheap; + u8 *thread_stack; + void (*thread_function) (void *); + void *thread_function_arg; + i64 
recursion_level; + elog_track_t elog_track; + u32 instance_id; + vlib_thread_registration_t *registration; + u8 *name; + u64 barrier_sync_count; + + long lwp; + int lcore_id; + pthread_t thread_id; +} vlib_worker_thread_t; + +extern vlib_worker_thread_t *vlib_worker_threads; + +typedef struct +{ + /* enqueue side */ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + volatile u64 tail; + u64 enqueues; + u64 enqueue_ticks; + u64 enqueue_vectors; + u32 enqueue_full_events; + + /* dequeue side */ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline1); + volatile u64 head; + u64 dequeues; + u64 dequeue_ticks; + u64 dequeue_vectors; + u64 trace; + u64 vector_threshold; + + /* dequeue hint to enqueue side */ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline2); + volatile u64 head_hint; + + /* read-only, constant, shared */ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline3); + vlib_frame_queue_elt_t *elts; + u32 nelts; +} +vlib_frame_queue_t; + +typedef struct +{ + u32 node_index; + vlib_frame_queue_t **vlib_frame_queues; + + /* for frame queue tracing */ + frame_queue_trace_t *frame_queue_traces; + frame_queue_nelt_counter_t *frame_queue_histogram; +} vlib_frame_queue_main_t; + +/* Called early, in thread 0's context */ +clib_error_t *vlib_thread_init (vlib_main_t * vm); + +vlib_worker_thread_t *vlib_alloc_thread (vlib_main_t * vm); + +int vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index, + u32 frame_queue_index, vlib_frame_t * frame, + vlib_frame_queue_msg_type_t type); + +int vlib_frame_queue_dequeue (int thread_id, + vlib_main_t * vm, vlib_node_main_t * nm); + +u64 dispatch_node (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_node_type_t type, + vlib_node_state_t dispatch_state, + vlib_frame_t * frame, u64 last_time_stamp); + +u64 dispatch_pending_node (vlib_main_t * vm, + vlib_pending_frame_t * p, u64 last_time_stamp); + +void vlib_worker_thread_node_runtime_update (void); + +void vlib_create_worker_threads (vlib_main_t * vm, int n, + void (*thread_function) (void *)); + +void 
vlib_worker_thread_init (vlib_worker_thread_t * w); +u32 vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts); + +/* Check for a barrier sync request every 30ms */ +#define BARRIER_SYNC_DELAY (0.030000) + +#if CLIB_DEBUG > 0 +/* long barrier timeout, for gdb... */ +#define BARRIER_SYNC_TIMEOUT (600.1) +#else +#define BARRIER_SYNC_TIMEOUT (1.0) +#endif + +void vlib_worker_thread_barrier_sync (vlib_main_t * vm); +void vlib_worker_thread_barrier_release (vlib_main_t * vm); + +always_inline void +vlib_smp_unsafe_warning (void) +{ + if (CLIB_DEBUG > 0) + { + if (os_get_cpu_number ()) + fformat (stderr, "%s: SMP unsafe warning...\n", __FUNCTION__); + } +} + +typedef enum +{ + VLIB_WORKER_THREAD_FORK_FIXUP_ILLEGAL = 0, + VLIB_WORKER_THREAD_FORK_FIXUP_NEW_SW_IF_INDEX, +} vlib_fork_fixup_t; + +void vlib_worker_thread_fork_fixup (vlib_fork_fixup_t which); + +static inline void +vlib_worker_thread_barrier_check (void) +{ + if (PREDICT_FALSE (*vlib_worker_threads->wait_at_barrier)) + { + clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, 1); + while (*vlib_worker_threads->wait_at_barrier) + ; + clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, -1); + } +} + +#define foreach_vlib_main(body) \ +do { \ + vlib_main_t ** __vlib_mains = 0, *this_vlib_main; \ + int ii; \ + \ + if (vec_len (vlib_mains) == 0) \ + vec_add1 (__vlib_mains, &vlib_global_main); \ + else \ + { \ + for (ii = 0; ii < vec_len (vlib_mains); ii++) \ + { \ + this_vlib_main = vlib_mains[ii]; \ + if (this_vlib_main) \ + vec_add1 (__vlib_mains, this_vlib_main); \ + } \ + } \ + \ + for (ii = 0; ii < vec_len (__vlib_mains); ii++) \ + { \ + this_vlib_main = __vlib_mains[ii]; \ + /* body uses this_vlib_main... 
*/ \ + (body); \ + } \ + vec_free (__vlib_mains); \ +} while (0); + +#define foreach_sched_policy \ + _(SCHED_OTHER, OTHER, "other") \ + _(SCHED_BATCH, BATCH, "batch") \ + _(SCHED_IDLE, IDLE, "idle") \ + _(SCHED_FIFO, FIFO, "fifo") \ + _(SCHED_RR, RR, "rr") + +typedef enum +{ +#define _(v,f,s) SCHED_POLICY_##f = v, + foreach_sched_policy +#undef _ + SCHED_POLICY_N, +} sched_policy_t; + +typedef struct +{ + /* Link list of registrations, built by constructors */ + vlib_thread_registration_t *next; + + /* Vector of registrations, w/ non-data-structure clones at the top */ + vlib_thread_registration_t **registrations; + + uword *thread_registrations_by_name; + + vlib_worker_thread_t *worker_threads; + + /* + * Launch all threads as pthreads, + * not eal_rte_launch (strict affinity) threads + */ + int use_pthreads; + + /* Number of vlib_main / vnet_main clones */ + u32 n_vlib_mains; + + /* Number of thread stacks to create */ + u32 n_thread_stacks; + + /* Number of pthreads */ + u32 n_pthreads; + + /* Number of DPDK eal threads */ + u32 n_eal_threads; + + /* Number of cores to skip, must match the core mask */ + u32 skip_cores; + + /* Thread prefix name */ + u8 *thread_prefix; + + /* main thread lcore */ + u8 main_lcore; + + /* Bitmap of available CPU cores */ + uword *cpu_core_bitmap; + + /* Bitmap of available CPU sockets (NUMA nodes) */ + uword *cpu_socket_bitmap; + + /* Worker handoff queues */ + vlib_frame_queue_main_t *frame_queue_mains; + + /* worker thread initialization barrier */ + volatile u32 worker_thread_release; + + /* scheduling policy */ + u32 sched_policy; + + /* scheduling policy priority */ + u32 sched_priority; + +} vlib_thread_main_t; + +extern vlib_thread_main_t vlib_thread_main; + +#define VLIB_REGISTER_THREAD(x,...) 
\ + __VA_ARGS__ vlib_thread_registration_t x; \ +static void __vlib_add_thread_registration_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_add_thread_registration_##x (void) \ +{ \ + vlib_thread_main_t * tm = &vlib_thread_main; \ + x.next = tm->next; \ + tm->next = &x; \ +} \ +__VA_ARGS__ vlib_thread_registration_t x + +always_inline u32 +vlib_num_workers () +{ + return vlib_thread_main.n_vlib_mains - 1; +} + +always_inline u32 +vlib_get_worker_cpu_index (u32 worker_index) +{ + return worker_index + 1; +} + +always_inline u32 +vlib_get_worker_index (u32 cpu_index) +{ + return cpu_index - 1; +} + +always_inline u32 +vlib_get_current_worker_index () +{ + return os_get_cpu_number () - 1; +} + +always_inline vlib_main_t * +vlib_get_worker_vlib_main (u32 worker_index) +{ + vlib_main_t *vm; + vlib_thread_main_t *tm = &vlib_thread_main; + ASSERT (worker_index < tm->n_vlib_mains - 1); + vm = vlib_mains[worker_index + 1]; + ASSERT (vm); + return vm; +} + +static inline void +vlib_put_frame_queue_elt (vlib_frame_queue_elt_t * hf) +{ + CLIB_MEMORY_BARRIER (); + hf->valid = 1; +} + +static inline vlib_frame_queue_elt_t * +vlib_get_frame_queue_elt (u32 frame_queue_index, u32 index) +{ + vlib_frame_queue_t *fq; + vlib_frame_queue_elt_t *elt; + vlib_thread_main_t *tm = &vlib_thread_main; + vlib_frame_queue_main_t *fqm = + vec_elt_at_index (tm->frame_queue_mains, frame_queue_index); + u64 new_tail; + + fq = fqm->vlib_frame_queues[index]; + ASSERT (fq); + + new_tail = __sync_add_and_fetch (&fq->tail, 1); + + /* Wait until a ring slot is available */ + while (new_tail >= fq->head_hint + fq->nelts) + vlib_worker_thread_barrier_check (); + + elt = fq->elts + (new_tail & (fq->nelts - 1)); + + /* this would be very bad... 
*/ + while (elt->valid) + ; + + elt->msg_type = VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME; + elt->last_n_vectors = elt->n_vectors = 0; + + return elt; +} + +static inline vlib_frame_queue_t * +is_vlib_frame_queue_congested (u32 frame_queue_index, + u32 index, + u32 queue_hi_thresh, + vlib_frame_queue_t ** + handoff_queue_by_worker_index) +{ + vlib_frame_queue_t *fq; + vlib_thread_main_t *tm = &vlib_thread_main; + vlib_frame_queue_main_t *fqm = + vec_elt_at_index (tm->frame_queue_mains, frame_queue_index); + + fq = handoff_queue_by_worker_index[index]; + if (fq != (vlib_frame_queue_t *) (~0)) + return fq; + + fq = fqm->vlib_frame_queues[index]; + ASSERT (fq); + + if (PREDICT_FALSE (fq->tail >= (fq->head_hint + queue_hi_thresh))) + { + /* a valid entry in the array will indicate the queue has reached + * the specified threshold and is congested + */ + handoff_queue_by_worker_index[index] = fq; + fq->enqueue_full_events++; + return fq; + } + + return NULL; +} + +static inline vlib_frame_queue_elt_t * +vlib_get_worker_handoff_queue_elt (u32 frame_queue_index, + u32 vlib_worker_index, + vlib_frame_queue_elt_t ** + handoff_queue_elt_by_worker_index) +{ + vlib_frame_queue_elt_t *elt; + + if (handoff_queue_elt_by_worker_index[vlib_worker_index]) + return handoff_queue_elt_by_worker_index[vlib_worker_index]; + + elt = vlib_get_frame_queue_elt (frame_queue_index, vlib_worker_index); + + handoff_queue_elt_by_worker_index[vlib_worker_index] = elt; + + return elt; +} + +#endif /* included_vlib_threads_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/threads_cli.c b/src/vlib/threads_cli.c new file mode 100644 index 00000000000..ee632279db5 --- /dev/null +++ b/src/vlib/threads_cli.c @@ -0,0 +1,579 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#define _GNU_SOURCE + +#include <vppinfra/format.h> +#include <vlib/vlib.h> + +#include <vlib/threads.h> +#include <vlib/unix/unix.h> + +#if DPDK==1 +#include <rte_config.h> +#include <rte_common.h> +#include <rte_eal.h> +#include <rte_launch.h> +#include <rte_lcore.h> +#endif + +static u8 * +format_sched_policy_and_priority (u8 * s, va_list * args) +{ + long i = va_arg (*args, long); + struct sched_param sched_param; + u8 *t = 0; + + switch (sched_getscheduler (i)) + { +#define _(v,f,str) case SCHED_POLICY_##f: t = (u8 *) str; break; + foreach_sched_policy +#undef _ + } + if (sched_getparam (i, &sched_param) == 0) + return format (s, "%s (%d)", t, sched_param.sched_priority); + else + return format (s, "%s (n/a)", t); +} + +static clib_error_t * +show_threads_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_worker_thread_t *w; + int i; + + vlib_cli_output (vm, "%-7s%-20s%-12s%-8s%-25s%-7s%-7s%-7s%-10s", + "ID", "Name", "Type", "LWP", "Sched Policy (Priority)", + "lcore", "Core", "Socket", "State"); + +#if !defined(__powerpc64__) + for (i = 0; i < vec_len (vlib_worker_threads); i++) + { + w = vlib_worker_threads + i; + u8 *line = NULL; + + line = format (line, "%-7d%-20s%-12s%-8d", + i, + w->name ? w->name : (u8 *) "", + w->registration ? 
w->registration->name : "", w->lwp); + + line = format (line, "%-25U", format_sched_policy_and_priority, w->lwp); + + int lcore = -1; + cpu_set_t cpuset; + CPU_ZERO (&cpuset); + int ret = -1; + + ret = + pthread_getaffinity_np (w->thread_id, sizeof (cpu_set_t), &cpuset); + if (!ret) + { + int c; + for (c = 0; c < CPU_SETSIZE; c++) + if (CPU_ISSET (c, &cpuset)) + { + if (lcore > -1) + { + lcore = -2; + break; + } + lcore = c; + } + } + else + { + lcore = w->lcore_id; + } + + if (lcore > -1) + { + const char *sys_cpu_path = "/sys/devices/system/cpu/cpu"; + int socket_id = -1; + int core_id = -1; + u8 *p = 0; + + p = format (p, "%s%u/topology/core_id%c", sys_cpu_path, lcore, 0); + vlib_sysfs_read ((char *) p, "%d", &core_id); + + vec_reset_length (p); + p = + format (p, + "%s%u/topology/physical_package_id%c", + sys_cpu_path, lcore, 0); + vlib_sysfs_read ((char *) p, "%d", &socket_id); + vec_free (p); + + line = format (line, "%-7u%-7u%-7u%", lcore, core_id, socket_id); +#if DPDK==1 + ASSERT (lcore <= RTE_MAX_LCORE); + switch (lcore_config[lcore].state) + { + case WAIT: + line = format (line, "wait"); + break; + case RUNNING: + line = format (line, "running"); + break; + case FINISHED: + line = format (line, "finished"); + break; + default: + line = format (line, "unknown"); + } +#endif + } + else + { + line = + format (line, "%-7s%-7s%-7s%", (lcore == -2) ? 
"M" : "n/a", "n/a", + "n/a"); + } + + vlib_cli_output (vm, "%v", line); + vec_free (line); + } +#endif + + return 0; +} + + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_threads_command, static) = { + .path = "show threads", + .short_help = "Show threads", + .function = show_threads_fn, +}; +/* *INDENT-ON* */ + +/* + * Trigger threads to grab frame queue trace data + */ +static clib_error_t * +trace_frame_queue (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + clib_error_t *error = NULL; + frame_queue_trace_t *fqt; + frame_queue_nelt_counter_t *fqh; + vlib_thread_main_t *tm = vlib_get_thread_main (); + vlib_frame_queue_main_t *fqm; + u32 num_fq; + u32 fqix; + u32 enable = 2; + u32 index = ~(u32) 0; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "on")) + enable = 1; + else if (unformat (line_input, "off")) + enable = 0; + else if (unformat (line_input, "index %u"), &index) + ; + else + return clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + } + + unformat_free (line_input); + + if (enable > 1) + return clib_error_return (0, "expecting on or off"); + + if (vec_len (tm->frame_queue_mains) == 0) + return clib_error_return (0, "no worker handoffs exist"); + + if (index > vec_len (tm->frame_queue_mains) - 1) + return clib_error_return (0, + "expecting valid worker handoff queue index"); + + fqm = vec_elt_at_index (tm->frame_queue_mains, index); + + num_fq = vec_len (fqm->vlib_frame_queues); + if (num_fq == 0) + { + vlib_cli_output (vm, "No frame queues exist\n"); + return error; + } + + // Allocate storage for trace if necessary + vec_validate_aligned (fqm->frame_queue_traces, num_fq - 1, + CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (fqm->frame_queue_histogram, num_fq - 1, + CLIB_CACHE_LINE_BYTES); + + for (fqix = 0; 
fqix < num_fq; fqix++) + { + fqt = &fqm->frame_queue_traces[fqix]; + fqh = &fqm->frame_queue_histogram[fqix]; + + memset (fqt->n_vectors, 0xff, sizeof (fqt->n_vectors)); + fqt->written = 0; + memset (fqh, 0, sizeof (*fqh)); + fqm->vlib_frame_queues[fqix]->trace = enable; + } + return error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_trace_frame_queue,static) = { + .path = "trace frame-queue", + .short_help = "trace frame-queue (on|off)", + .function = trace_frame_queue, + .is_mp_safe = 1, +}; +/* *INDENT-ON* */ + + +/* + * Adding two counters and compute percent of total + * Round up, e.g. 0.000001 => 1% + */ +static u32 +compute_percent (u64 * two_counters, u64 total) +{ + if (total == 0) + { + return 0; + } + else + { + return (((two_counters[0] + two_counters[1]) * 100) + + (total - 1)) / total; + } +} + +/* + * Display frame queue trace data gathered by threads. + */ +static clib_error_t * +show_frame_queue_internal (vlib_main_t * vm, + vlib_frame_queue_main_t * fqm, u32 histogram) +{ + clib_error_t *error = NULL; + frame_queue_trace_t *fqt; + frame_queue_nelt_counter_t *fqh; + u32 num_fq; + u32 fqix; + + num_fq = vec_len (fqm->frame_queue_traces); + if (num_fq == 0) + { + vlib_cli_output (vm, "No trace data for frame queues\n"); + return error; + } + + if (histogram) + { + vlib_cli_output (vm, "0-1 2-3 4-5 6-7 8-9 10-11 12-13 14-15 " + "16-17 18-19 20-21 22-23 24-25 26-27 28-29 30-31\n"); + } + + for (fqix = 0; fqix < num_fq; fqix++) + { + fqt = &(fqm->frame_queue_traces[fqix]); + + vlib_cli_output (vm, "Thread %d %v\n", fqix, + vlib_worker_threads[fqix].name); + + if (fqt->written == 0) + { + vlib_cli_output (vm, " no trace data\n"); + continue; + } + + if (histogram) + { + fqh = &(fqm->frame_queue_histogram[fqix]); + u32 nelt; + u64 total = 0; + + for (nelt = 0; nelt < FRAME_QUEUE_MAX_NELTS; nelt++) + { + total += fqh->count[nelt]; + } + + /* + * Print in pairs to condense the output. 
+ * Allow entries with 0 counts to be clearly identified, by rounding up. + * Any non-zero value will be displayed as at least one percent. This + * also means the sum of percentages can be > 100, but that is fine. The + * histogram is counted from the last time "trace frame on" was issued. + */ + vlib_cli_output (vm, + "%3d%% %3d%% %3d%% %3d%% %3d%% %3d%% %3d%% %3d%% " + "%3d%% %3d%% %3d%% %3d%% %3d%% %3d%% %3d%% %3d%%\n", + compute_percent (&fqh->count[0], total), + compute_percent (&fqh->count[2], total), + compute_percent (&fqh->count[4], total), + compute_percent (&fqh->count[6], total), + compute_percent (&fqh->count[8], total), + compute_percent (&fqh->count[10], total), + compute_percent (&fqh->count[12], total), + compute_percent (&fqh->count[14], total), + compute_percent (&fqh->count[16], total), + compute_percent (&fqh->count[18], total), + compute_percent (&fqh->count[20], total), + compute_percent (&fqh->count[22], total), + compute_percent (&fqh->count[24], total), + compute_percent (&fqh->count[26], total), + compute_percent (&fqh->count[28], total), + compute_percent (&fqh->count[30], total)); + } + else + { + vlib_cli_output (vm, + " vector-threshold %d ring size %d in use %d\n", + fqt->threshold, fqt->nelts, fqt->n_in_use); + vlib_cli_output (vm, " head %12d head_hint %12d tail %12d\n", + fqt->head, fqt->head_hint, fqt->tail); + vlib_cli_output (vm, + " %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d\n", + fqt->n_vectors[0], fqt->n_vectors[1], + fqt->n_vectors[2], fqt->n_vectors[3], + fqt->n_vectors[4], fqt->n_vectors[5], + fqt->n_vectors[6], fqt->n_vectors[7], + fqt->n_vectors[8], fqt->n_vectors[9], + fqt->n_vectors[10], fqt->n_vectors[11], + fqt->n_vectors[12], fqt->n_vectors[13], + fqt->n_vectors[14], fqt->n_vectors[15]); + + if (fqt->nelts > 16) + { + vlib_cli_output (vm, + " %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d\n", + fqt->n_vectors[16], fqt->n_vectors[17], + fqt->n_vectors[18], fqt->n_vectors[19], + 
fqt->n_vectors[20], fqt->n_vectors[21], + fqt->n_vectors[22], fqt->n_vectors[23], + fqt->n_vectors[24], fqt->n_vectors[25], + fqt->n_vectors[26], fqt->n_vectors[27], + fqt->n_vectors[28], fqt->n_vectors[29], + fqt->n_vectors[30], fqt->n_vectors[31]); + } + } + + } + return error; +} + +static clib_error_t * +show_frame_queue_trace (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + vlib_frame_queue_main_t *fqm; + clib_error_t *error; + + vec_foreach (fqm, tm->frame_queue_mains) + { + vlib_cli_output (vm, "Worker handoff queue index %u (next node '%U'):", + fqm - tm->frame_queue_mains, + format_vlib_node_name, vm, fqm->node_index); + error = show_frame_queue_internal (vm, fqm, 0); + if (error) + return error; + } + return 0; +} + +static clib_error_t * +show_frame_queue_histogram (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + vlib_frame_queue_main_t *fqm; + clib_error_t *error; + + vec_foreach (fqm, tm->frame_queue_mains) + { + vlib_cli_output (vm, "Worker handoff queue index %u (next node '%U'):", + fqm - tm->frame_queue_mains, + format_vlib_node_name, vm, fqm->node_index); + error = show_frame_queue_internal (vm, fqm, 1); + if (error) + return error; + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_show_frame_queue_trace,static) = { + .path = "show frame-queue", + .short_help = "show frame-queue trace", + .function = show_frame_queue_trace, +}; +/* *INDENT-ON* */ + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_show_frame_queue_histogram,static) = { + .path = "show frame-queue histogram", + .short_help = "show frame-queue histogram", + .function = show_frame_queue_histogram, +}; +/* *INDENT-ON* */ + + +/* + * Modify the number of elements on the frame_queues + */ +static clib_error_t * +test_frame_queue_nelts (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + 
unformat_input_t _line_input, *line_input = &_line_input; + vlib_thread_main_t *tm = vlib_get_thread_main (); + vlib_frame_queue_main_t *fqm; + clib_error_t *error = NULL; + u32 num_fq; + u32 fqix; + u32 nelts = 0; + u32 index = ~(u32) 0; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "nelts %u", &nelts)) + ; + else if (unformat (line_input, "index %u", &index)) + ; + else + return clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + } + + unformat_free (line_input); + + if (index > vec_len (tm->frame_queue_mains) - 1) + return clib_error_return (0, + "expecting valid worker handoff queue index"); + + fqm = vec_elt_at_index (tm->frame_queue_mains, index); + + if ((nelts != 4) && (nelts != 8) && (nelts != 16) && (nelts != 32)) + { + return clib_error_return (0, "expecting 4,8,16,32"); + } + + num_fq = vec_len (fqm->vlib_frame_queues); + if (num_fq == 0) + { + vlib_cli_output (vm, "No frame queues exist\n"); + return error; + } + + for (fqix = 0; fqix < num_fq; fqix++) + { + fqm->vlib_frame_queues[fqix]->nelts = nelts; + } + + return error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_test_frame_queue_nelts,static) = { + .path = "test frame-queue nelts", + .short_help = "test frame-queue nelts (4,8,16,32)", + .function = test_frame_queue_nelts, +}; +/* *INDENT-ON* */ + + +/* + * Modify the max number of packets pulled off the frame queues + */ +static clib_error_t * +test_frame_queue_threshold (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + vlib_thread_main_t *tm = vlib_get_thread_main (); + vlib_frame_queue_main_t *fqm; + clib_error_t *error = NULL; + u32 num_fq; + u32 fqix; + u32 threshold = ~(u32) 0; + u32 index = ~(u32) 0; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while 
(unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "threshold %u", &threshold)) + ; + else if (unformat (line_input, "index %u", &index)) + ; + else + return clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + } + + unformat_free (line_input); + + if (index > vec_len (tm->frame_queue_mains) - 1) + return clib_error_return (0, + "expecting valid worker handoff queue index"); + + fqm = vec_elt_at_index (tm->frame_queue_mains, index); + + + if (threshold == ~(u32) 0) + { + vlib_cli_output (vm, "expecting threshold value\n"); + return error; + } + + if (threshold == 0) + threshold = ~0; + + num_fq = vec_len (fqm->vlib_frame_queues); + if (num_fq == 0) + { + vlib_cli_output (vm, "No frame queues exist\n"); + return error; + } + + for (fqix = 0; fqix < num_fq; fqix++) + { + fqm->vlib_frame_queues[fqix]->vector_threshold = threshold; + } + + return error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_test_frame_queue_threshold,static) = { + .path = "test frame-queue threshold", + .short_help = "test frame-queue threshold N (0=no limit)", + .function = test_frame_queue_threshold, +}; +/* *INDENT-ON* */ + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/trace.c b/src/vlib/trace.c new file mode 100644 index 00000000000..dcdb837f16c --- /dev/null +++ b/src/vlib/trace.c @@ -0,0 +1,545 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * trace.c: VLIB trace buffer. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> +#include <vlib/threads.h> + +/* Helper function for nodes which only trace buffer data. */ +void +vlib_trace_frame_buffers_only (vlib_main_t * vm, + vlib_node_runtime_t * node, + u32 * buffers, + uword n_buffers, + uword next_buffer_stride, + uword n_buffer_data_bytes_in_trace) +{ + u32 n_left, *from; + + n_left = n_buffers; + from = buffers; + + while (n_left >= 4) + { + u32 bi0, bi1; + vlib_buffer_t *b0, *b1; + u8 *t0, *t1; + + /* Prefetch next iteration. 
*/ + vlib_prefetch_buffer_with_index (vm, from[2], LOAD); + vlib_prefetch_buffer_with_index (vm, from[3], LOAD); + + bi0 = from[0]; + bi1 = from[1]; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + t0 = vlib_add_trace (vm, node, b0, n_buffer_data_bytes_in_trace); + clib_memcpy (t0, b0->data + b0->current_data, + n_buffer_data_bytes_in_trace); + } + if (b1->flags & VLIB_BUFFER_IS_TRACED) + { + t1 = vlib_add_trace (vm, node, b1, n_buffer_data_bytes_in_trace); + clib_memcpy (t1, b1->data + b1->current_data, + n_buffer_data_bytes_in_trace); + } + from += 2; + n_left -= 2; + } + + while (n_left >= 1) + { + u32 bi0; + vlib_buffer_t *b0; + u8 *t0; + + bi0 = from[0]; + + b0 = vlib_get_buffer (vm, bi0); + + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + t0 = vlib_add_trace (vm, node, b0, n_buffer_data_bytes_in_trace); + clib_memcpy (t0, b0->data + b0->current_data, + n_buffer_data_bytes_in_trace); + } + from += 1; + n_left -= 1; + } +} + +/* Free up all trace buffer memory. */ +always_inline void +clear_trace_buffer (void) +{ + int i; + vlib_trace_main_t *tm; + + /* *INDENT-OFF* */ + foreach_vlib_main ( + ({ + void *mainheap; + + tm = &this_vlib_main->trace_main; + mainheap = clib_mem_set_heap (this_vlib_main->heap_base); + + tm->trace_active_hint = 0; + + for (i = 0; i < vec_len (tm->trace_buffer_pool); i++) + if (! 
pool_is_free_index (tm->trace_buffer_pool, i)) + vec_free (tm->trace_buffer_pool[i]); + pool_free (tm->trace_buffer_pool); + clib_mem_set_heap (mainheap); + })); + /* *INDENT-ON* */ +} + +static u8 * +format_vlib_trace (u8 * s, va_list * va) +{ + vlib_main_t *vm = va_arg (*va, vlib_main_t *); + vlib_trace_header_t *h = va_arg (*va, vlib_trace_header_t *); + vlib_trace_header_t *e = vec_end (h); + vlib_node_t *node, *prev_node; + clib_time_t *ct = &vm->clib_time; + f64 t; + + prev_node = 0; + while (h < e) + { + node = vlib_get_node (vm, h->node_index); + + if (node != prev_node) + { + t = + (h->time - vm->cpu_time_main_loop_start) * ct->seconds_per_clock; + s = + format (s, "\n%U: %v", format_time_interval, "h:m:s:u", t, + node->name); + } + prev_node = node; + + if (node->format_trace) + s = format (s, "\n %U", node->format_trace, vm, node, h->data); + else + s = format (s, "\n %U", node->format_buffer, h->data); + + h = vlib_trace_header_next (h); + } + + return s; +} + +/* Root of all trace cli commands. */ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (trace_cli_command,static) = { + .path = "trace", + .short_help = "Packet tracer commands", +}; +/* *INDENT-ON* */ + +static int +trace_cmp (void *a1, void *a2) +{ + vlib_trace_header_t **t1 = a1; + vlib_trace_header_t **t2 = a2; + i64 dt = t1[0]->time - t2[0]->time; + return dt < 0 ? -1 : (dt > 0 ? 
+1 : 0); +} + +/* + * Return 1 if this packet passes the trace filter, or 0 otherwise + */ +u32 +filter_accept (vlib_trace_main_t * tm, vlib_trace_header_t * h) +{ + vlib_trace_header_t *e = vec_end (h); + + if (tm->filter_flag == 0) + return 1; + + if (tm->filter_flag == FILTER_FLAG_INCLUDE) + { + while (h < e) + { + if (h->node_index == tm->filter_node_index) + return 1; + h = vlib_trace_header_next (h); + } + return 0; + } + else /* FILTER_FLAG_EXCLUDE */ + { + while (h < e) + { + if (h->node_index == tm->filter_node_index) + return 0; + h = vlib_trace_header_next (h); + } + return 1; + } + + return 0; +} + +/* + * Remove traces from the trace buffer pool that don't pass the filter + */ +void +trace_apply_filter (vlib_main_t * vm) +{ + vlib_trace_main_t *tm = &vm->trace_main; + vlib_trace_header_t **h; + vlib_trace_header_t ***traces_to_remove = 0; + u32 index; + u32 trace_index; + u32 n_accepted; + + u32 accept; + + if (tm->filter_flag == FILTER_FLAG_NONE) + return; + + /* + * Ideally we would retain the first N traces that pass the filter instead + * of any N traces. + */ + n_accepted = 0; + /* *INDENT-OFF* */ + pool_foreach (h, tm->trace_buffer_pool, + ({ + accept = filter_accept(tm, h[0]); + + if ((n_accepted == tm->filter_count) || !accept) + vec_add1 (traces_to_remove, h); + else + n_accepted++; + })); + /* *INDENT-ON* */ + + /* remove all traces that we don't want to keep */ + for (index = 0; index < vec_len (traces_to_remove); index++) + { + trace_index = traces_to_remove[index] - tm->trace_buffer_pool; + _vec_len (tm->trace_buffer_pool[trace_index]) = 0; + pool_put_index (tm->trace_buffer_pool, trace_index); + } + + vec_free (traces_to_remove); +} + +static clib_error_t * +cli_show_trace_buffer (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_trace_main_t *tm; + vlib_trace_header_t **h, **traces; + u32 i, index = 0; + char *fmt; + u8 *s = 0; + u32 max; + + /* + * By default display only this many traces. 
To display more, explicitly + * specify a max. This prevents unexpectedly huge outputs. + */ + max = 50; + while (unformat_check_input (input) != (uword) UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "max %d", &max)) + ; + else + return clib_error_create ("expected 'max COUNT', got `%U'", + format_unformat_error, input); + } + + + /* Get active traces from pool. */ + + /* *INDENT-OFF* */ + foreach_vlib_main ( + ({ + void *mainheap; + + fmt = "------------------- Start of thread %d %s -------------------\n"; + s = format (s, fmt, index, vlib_worker_threads[index].name); + + tm = &this_vlib_main->trace_main; + + mainheap = clib_mem_set_heap (this_vlib_main->heap_base); + + trace_apply_filter(this_vlib_main); + + traces = 0; + pool_foreach (h, tm->trace_buffer_pool, + ({ + vec_add1 (traces, h[0]); + })); + + if (vec_len (traces) == 0) + { + clib_mem_set_heap (mainheap); + s = format (s, "No packets in trace buffer\n"); + goto done; + } + + /* Sort them by increasing time. */ + vec_sort_with_function (traces, trace_cmp); + + for (i = 0; i < vec_len (traces); i++) + { + if (i == max) + { + vlib_cli_output (vm, "Limiting display to %d packets." 
+ " To display more specify max.", max); + goto done; + } + + clib_mem_set_heap (mainheap); + + s = format (s, "Packet %d\n%U\n\n", i + 1, + format_vlib_trace, vm, traces[i]); + + mainheap = clib_mem_set_heap (this_vlib_main->heap_base); + } + + done: + vec_free (traces); + clib_mem_set_heap (mainheap); + + index++; + })); + /* *INDENT-ON* */ + + vlib_cli_output (vm, "%v", s); + vec_free (s); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_trace_cli,static) = { + .path = "show trace", + .short_help = "Show trace buffer [max COUNT]", + .function = cli_show_trace_buffer, +}; +/* *INDENT-ON* */ + +static clib_error_t * +cli_add_trace_buffer (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + vlib_trace_main_t *tm; + vlib_trace_node_t *tn; + u32 node_index, add; + u8 verbose = 0; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != (uword) UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "%U %d", + unformat_vlib_node, vm, &node_index, &add)) + ; + else if (unformat (line_input, "verbose")) + verbose = 1; + else + return clib_error_create ("expected NODE COUNT, got `%U'", + format_unformat_error, line_input); + } + + /* *INDENT-OFF* */ + foreach_vlib_main (( + { + void *oldheap; + tm = &this_vlib_main->trace_main; + tm->trace_active_hint = 1; + tm->verbose = verbose; + oldheap = + clib_mem_set_heap (this_vlib_main->heap_base); + vec_validate (tm->nodes, node_index); + tn = tm->nodes + node_index; + tn->limit += add; clib_mem_set_heap (oldheap); + })); + /* *INDENT-ON* */ + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (add_trace_cli,static) = { + .path = "trace add", + .short_help = "Trace given number of packets", + .function = cli_add_trace_buffer, +}; +/* *INDENT-ON* */ + + +/* + * Configure a filter for packet traces. 
+ * + * This supplements the packet trace feature so that only packets matching + * the filter are included in the trace. Currently the only filter is to + * keep packets that include a certain node in the trace or exclude a certain + * node in the trace. + * + * The count of traced packets in the "trace add" command is still used to + * create a certain number of traces. The "trace filter" command specifies + * how many of those packets should be retained in the trace. + * + * For example, 1Mpps of traffic is arriving and one of those packets is being + * dropped. To capture the trace for only that dropped packet, you can do: + * trace filter include error-drop 1 + * trace add dpdk-input 1000000 + * <wait one second> + * show trace + * + * Note that the filter could be implemented by capturing all traces and just + * reducing traces displayed by the "show trace" function. But that would + * require a lot of memory for storing the traces, making that infeasible. + * + * To remove traces from the trace pool that do not include a certain node + * requires that the trace be "complete" before applying the filter. To + * accomplish this, the trace pool is filtered upon each iteration of the + * main vlib loop. Doing so keeps the number of allocated traces down to a + * reasonably low number. This requires that tracing for a buffer is not + * performed after the vlib main loop iteration completes. i.e. you can't + * save away a buffer temporarily then inject it back into the graph and + * expect that the trace_index is still valid (such as a traffic manager might + * do). A new trace buffer should be allocated for those types of packets. + * + * The filter can be extended to support multiple nodes and other match + * criteria (e.g. input sw_if_index, mac address) but for now just checks if + * a specified node is in the trace or not in the trace.
+ */ +static clib_error_t * +cli_filter_trace (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_trace_main_t *tm = &vm->trace_main; + u32 filter_node_index; + u32 filter_flag; + u32 filter_count; + void *mainheap; + + if (unformat (input, "include %U %d", + unformat_vlib_node, vm, &filter_node_index, &filter_count)) + { + filter_flag = FILTER_FLAG_INCLUDE; + } + else if (unformat (input, "exclude %U %d", + unformat_vlib_node, vm, &filter_node_index, + &filter_count)) + { + filter_flag = FILTER_FLAG_EXCLUDE; + } + else if (unformat (input, "none")) + { + filter_flag = FILTER_FLAG_NONE; + filter_node_index = 0; + filter_count = 0; + } + else + return + clib_error_create + ("expected 'include NODE COUNT' or 'exclude NODE COUNT' or 'none', got `%U'", + format_unformat_error, input); + + /* *INDENT-OFF* */ + foreach_vlib_main ( + ({ + tm = &this_vlib_main->trace_main; + tm->filter_node_index = filter_node_index; + tm->filter_flag = filter_flag; + tm->filter_count = filter_count; + + /* + * Clear the trace limits to stop any in-progress tracing + * Prevents runaway trace allocations when the filter changes (or is removed) + */ + mainheap = clib_mem_set_heap (this_vlib_main->heap_base); + vec_free (tm->nodes); + clib_mem_set_heap (mainheap); + })); + /* *INDENT-ON* */ + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (filter_trace_cli,static) = { + .path = "trace filter", + .short_help = "filter trace output - include NODE COUNT | exclude NODE COUNT | none", + .function = cli_filter_trace, +}; +/* *INDENT-ON* */ + +static clib_error_t * +cli_clear_trace_buffer (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + clear_trace_buffer (); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (clear_trace_cli,static) = { + .path = "clear trace", + .short_help = "Clear trace buffer and free memory", + .function = cli_clear_trace_buffer, +}; +/* *INDENT-ON* */ + +/* Dummy function to get us linked in. 
*/ +void +vlib_trace_cli_reference (void) +{ +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/trace.h b/src/vlib/trace.h new file mode 100644 index 00000000000..fc0fc5c8ed4 --- /dev/null +++ b/src/vlib/trace.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * trace.h: VLIB trace buffer. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_trace_h +#define included_vlib_trace_h + +#include <vppinfra/pool.h> + +typedef struct +{ + /* CPU time stamp trace was made. */ + u64 time; + + /* Node which generated this trace. */ + u32 node_index; + + /* Number of data words in this trace. */ + u32 n_data; + + /* Trace data follows. */ + u8 data[0]; +} vlib_trace_header_t; + +typedef struct +{ + /* Current number of traces in buffer. */ + u32 count; + + /* Max. number of traces to be added to buffer. */ + u32 limit; +} vlib_trace_node_t; + +typedef struct +{ + /* Pool of trace buffers. */ + vlib_trace_header_t **trace_buffer_pool; + + u32 last_main_loop_count; + u32 filter_node_index; + u32 filter_flag; +#define FILTER_FLAG_NONE 0 +#define FILTER_FLAG_INCLUDE 1 +#define FILTER_FLAG_EXCLUDE 2 + u32 filter_count; + + /* set on trace add, cleared on clear trace */ + u32 trace_active_hint; + + /* Per node trace counts. */ + vlib_trace_node_t *nodes; + + /* verbosity */ + int verbose; +} vlib_trace_main_t; + +#endif /* included_vlib_trace_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/trace_funcs.h b/src/vlib/trace_funcs.h new file mode 100644 index 00000000000..5280eae9904 --- /dev/null +++ b/src/vlib/trace_funcs.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * trace_funcs.h: VLIB trace buffer. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_trace_funcs_h +#define included_vlib_trace_funcs_h + +always_inline void +vlib_validate_trace (vlib_trace_main_t * tm, vlib_buffer_t * b) +{ + /* + * this assert seems right, but goes off constantly. 
+ * disabling it appears to make the pain go away + */ + ASSERT (1 || b->flags & VLIB_BUFFER_IS_TRACED); + ASSERT (!pool_is_free_index (tm->trace_buffer_pool, b->trace_index)); +} + +always_inline void * +vlib_add_trace (vlib_main_t * vm, + vlib_node_runtime_t * r, vlib_buffer_t * b, u32 n_data_bytes) +{ + vlib_trace_main_t *tm = &vm->trace_main; + vlib_trace_header_t *h; + u32 n_data_words; + + vlib_validate_trace (tm, b); + + n_data_bytes = round_pow2 (n_data_bytes, sizeof (h[0])); + n_data_words = n_data_bytes / sizeof (h[0]); + vec_add2_aligned (tm->trace_buffer_pool[b->trace_index], h, + 1 + n_data_words, sizeof (h[0])); + + h->time = vm->cpu_time_last_node_dispatch; + h->n_data = n_data_words; + h->node_index = r->node_index; + + return h->data; +} + +always_inline vlib_trace_header_t * +vlib_trace_header_next (vlib_trace_header_t * h) +{ + return h + 1 + h->n_data; +} + +always_inline void +vlib_free_trace (vlib_main_t * vm, vlib_buffer_t * b) +{ + vlib_trace_main_t *tm = &vm->trace_main; + vlib_validate_trace (tm, b); + _vec_len (tm->trace_buffer_pool[b->trace_index]) = 0; + pool_put_index (tm->trace_buffer_pool, b->trace_index); +} + +always_inline void +vlib_trace_next_frame (vlib_main_t * vm, + vlib_node_runtime_t * r, u32 next_index) +{ + vlib_next_frame_t *nf; + nf = vlib_node_runtime_get_next_frame (vm, r, next_index); + nf->flags |= VLIB_FRAME_TRACE; +} + +void trace_apply_filter (vlib_main_t * vm); + +/* Mark buffer as traced and allocate trace buffer. */ +always_inline void +vlib_trace_buffer (vlib_main_t * vm, + vlib_node_runtime_t * r, + u32 next_index, vlib_buffer_t * b, int follow_chain) +{ + vlib_trace_main_t *tm = &vm->trace_main; + vlib_trace_header_t **h; + + /* + * Apply filter to existing traces to keep number of allocated traces low. + * Performed each time around the main loop. 
+ */ + if (tm->last_main_loop_count != vm->main_loop_count) + { + tm->last_main_loop_count = vm->main_loop_count; + trace_apply_filter (vm); + } + + vlib_trace_next_frame (vm, r, next_index); + + pool_get (tm->trace_buffer_pool, h); + + do + { + b->flags |= VLIB_BUFFER_IS_TRACED; + b->trace_index = h - tm->trace_buffer_pool; + } + while (follow_chain && (b = vlib_get_next_buffer (vm, b))); +} + +always_inline void +vlib_buffer_copy_trace_flag (vlib_main_t * vm, vlib_buffer_t * b, + u32 bi_target) +{ + vlib_buffer_t *b_target = vlib_get_buffer (vm, bi_target); + b_target->flags |= b->flags & VLIB_BUFFER_IS_TRACED; + b_target->trace_index = b->trace_index; +} + +always_inline u32 +vlib_get_trace_count (vlib_main_t * vm, vlib_node_runtime_t * rt) +{ + vlib_trace_main_t *tm = &vm->trace_main; + vlib_trace_node_t *tn; + int n; + + if (rt->node_index >= vec_len (tm->nodes)) + return 0; + tn = tm->nodes + rt->node_index; + n = tn->limit - tn->count; + ASSERT (n >= 0); + + return n; +} + +always_inline void +vlib_set_trace_count (vlib_main_t * vm, vlib_node_runtime_t * rt, u32 count) +{ + vlib_trace_main_t *tm = &vm->trace_main; + vlib_trace_node_t *tn = vec_elt_at_index (tm->nodes, rt->node_index); + + ASSERT (count <= tn->limit); + tn->count = tn->limit - count; +} + +/* Helper function for nodes which only trace buffer data. */ +void +vlib_trace_frame_buffers_only (vlib_main_t * vm, + vlib_node_runtime_t * node, + u32 * buffers, + uword n_buffers, + uword next_buffer_stride, + uword n_buffer_data_bytes_in_trace); + +#endif /* included_vlib_trace_funcs_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/cj.c b/src/vlib/unix/cj.c new file mode 100644 index 00000000000..33ba163abca --- /dev/null +++ b/src/vlib/unix/cj.c @@ -0,0 +1,271 @@ +/* + *------------------------------------------------------------------ + * cj.c + * + * Copyright (c) 2013 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +/** + * @file + * Circular joournal diagnostic mechanism. + * + * The @c cj thread-safe circular log buffer scheme is occasionally useful + * when chasing bugs. Calls to it should not be checked in. + */ +/*? %%clicmd:group_label Circular Journal %% ?*/ +/*? %%syscfg:group_label Circular Journal %% ?*/ + +#include <stdio.h> +#include <vlib/vlib.h> + +#include <vlib/unix/cj.h> + +cj_main_t cj_main; + +void +cj_log (u32 type, void *data0, void *data1) +{ + u64 new_tail; + cj_main_t *cjm = &cj_main; + cj_record_t *r; + + if (cjm->enable == 0) + return; + + new_tail = __sync_add_and_fetch (&cjm->tail, 1); + + r = (cj_record_t *) & (cjm->records[new_tail & (cjm->num_records - 1)]); + r->time = vlib_time_now (cjm->vlib_main); + r->cpu = os_get_cpu_number (); + r->type = type; + r->data[0] = pointer_to_uword (data0); + r->data[1] = pointer_to_uword (data1); +} + +void +cj_stop (void) +{ + cj_main_t *cjm = &cj_main; + + cjm->enable = 0; +} + + +clib_error_t * +cj_init (vlib_main_t * vm) +{ + cj_main_t *cjm = &cj_main; + + cjm->vlib_main = vm; + return 0; +} + +VLIB_INIT_FUNCTION (cj_init); + +static clib_error_t * +cj_config (vlib_main_t * vm, unformat_input_t * input) +{ + cj_main_t *cjm = &cj_main; + int matched = 0; + int enable = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "records %d", 
&cjm->num_records)) + matched = 1; + else if (unformat (input, "on")) + enable = 1; + else + return clib_error_return (0, "cj_config: unknown input '%U'", + format_unformat_error, input); + } + + if (matched == 0) + return 0; + + cjm->num_records = max_pow2 (cjm->num_records); + vec_validate (cjm->records, cjm->num_records - 1); + memset (cjm->records, 0xff, cjm->num_records * sizeof (cj_record_t)); + cjm->tail = ~0; + cjm->enable = enable; + + return 0; +} + +/*? + * Configure the circular journal diagnostic mechanism. This is only useful + * if you, the deveoper, have written code to make use of the circular + * journal. + * + * @cfgcmd{records, <number>} + * Configure the number of records to allocate for the circular journal. + * + * @cfgcmd{on} + * Enable the collection of records in the circular journal at the + * earliest opportunity. +?*/ +VLIB_CONFIG_FUNCTION (cj_config, "cj"); + +void +cj_enable_disable (int is_enable) +{ + cj_main_t *cjm = &cj_main; + + if (cjm->num_records) + cjm->enable = is_enable; + else + vlib_cli_output (cjm->vlib_main, "CJ not configured..."); +} + +static inline void +cj_dump_one_record (cj_record_t * r) +{ + fprintf (stderr, "[%d]: %10.6f T%02d %llx %llx\n", + r->cpu, r->time, r->type, (long long unsigned int) r->data[0], + (long long unsigned int) r->data[1]); +} + +static void +cj_dump_internal (u8 filter0_enable, u64 filter0, + u8 filter1_enable, u64 filter1) +{ + cj_main_t *cjm = &cj_main; + cj_record_t *r; + u32 i, index; + + if (cjm->num_records == 0) + { + fprintf (stderr, "CJ not configured...\n"); + return; + } + + if (cjm->tail == (u64) ~ 0) + { + fprintf (stderr, "No data collected...\n"); + return; + } + + /* Has the trace wrapped? 
*/ + index = (cjm->tail + 1) & (cjm->num_records - 1); + r = &(cjm->records[index]); + + if (r->cpu != (u32) ~ 0) + { + /* Yes, dump from tail + 1 to the end */ + for (i = index; i < cjm->num_records; i++) + { + if (filter0_enable && (r->data[0] != filter0)) + goto skip; + if (filter1_enable && (r->data[1] != filter1)) + goto skip; + cj_dump_one_record (r); + skip: + r++; + } + } + /* + * Dump from the beginning through the final tail. cjm->tail is a + * free-running counter rather than a slot number, so bound the walk by + * index (the slot after the newest record) to stay inside records[] and + * avoid re-printing what the loop above already dumped. + */ + r = cjm->records; + for (i = 0; i < index; i++) + { + if (filter0_enable && (r->data[0] != filter0)) + goto skip2; + if (filter1_enable && (r->data[1] != filter1)) + goto skip2; + cj_dump_one_record (r); + skip2: + r++; + } +} + +void +cj_dump (void) +{ + cj_dump_internal (0, 0, 0, 0); +} + +void +cj_dump_filter_data0 (u64 filter0) +{ + cj_dump_internal (1 /* enable f0 */ , filter0, 0, 0); +} + +void +cj_dump_filter_data1 (u64 filter1) +{ + cj_dump_internal (0, 0, 1 /* enable f1 */ , filter1); +} + +void +cj_dump_filter_data12 (u64 filter0, u64 filter1) +{ + cj_dump_internal (1, filter0, 1, filter1); +} + +static clib_error_t * +cj_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + int is_enable = -1; + int is_dump = -1; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "enable") || unformat (input, "on")) + is_enable = 1; + else if (unformat (input, "disable") || unformat (input, "off")) + is_enable = 0; + else if (unformat (input, "dump")) + is_dump = 1; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (is_enable >= 0) + cj_enable_disable (is_enable); + + if (is_dump > 0) + cj_dump (); + + return 0; +} + +/*? + * Enable, disable the collection of diagnostic data into a + * circular journal or dump the circular journal diagnostic data. + * This is only useful if you, the developer, have written code to make + * use of the circular journal.
+ * + * When dumping the data it is formatted and sent to @c stderr of the + * VPP process; when running VPP in <code>unix interactive</code> mode + * this is typically the same place as the Debug CLI. +?*/ + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cj_command,static) = { + .path = "cj", + .short_help = "cj <enable | disable | dump>", + .function = cj_command_fn, +}; +/* *INDENT-ON* */ + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/cj.h b/src/vlib/unix/cj.h new file mode 100644 index 00000000000..67626afee2b --- /dev/null +++ b/src/vlib/unix/cj.h @@ -0,0 +1,79 @@ +/* + *------------------------------------------------------------------ + * cj.h + * + * Copyright (c) 2013 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#ifndef __included_cj_h__ +#define __included_cj_h__ + +typedef struct +{ + f64 time; + u32 cpu; + u32 type; + u64 data[2]; +} cj_record_t; + +typedef struct +{ + volatile u64 tail; + cj_record_t *records; + u32 num_records; + volatile u32 enable; + + vlib_main_t *vlib_main; +} cj_main_t; + +void cj_log (u32 type, void *data0, void *data1); + +/* + * Supply in application main, so we can log from any library... + * Declare a weak reference in the library, off you go. 
+ */ + +#define DECLARE_CJ_GLOBAL_LOG \ +void cj_global_log (unsigned type, void * data0, void * data1) \ + __attribute__ ((weak)); \ + \ +unsigned __cj_type; \ +void * __cj_data0; \ +void * __cj_data1; \ + \ +void \ +cj_global_log (unsigned type, void * data0, void * data1) \ +{ \ + __cj_type = type; \ + __cj_data0 = data0; \ + __cj_data1 = data1; \ +} + +#define CJ_GLOBAL_LOG_PROTOTYPE +void +cj_global_log (unsigned type, void *data0, void *data1) +__attribute__ ((weak)); + +void cj_stop (void); + +#endif /* __included_cj_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c new file mode 100644 index 00000000000..69fca6ec7bc --- /dev/null +++ b/src/vlib/unix/cli.c @@ -0,0 +1,2989 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * cli.c: Unix stdin/socket CLI. 
+ * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +/** + * @file + * @brief Unix stdin/socket command line interface. + * Provides a command line interface so humans can interact with VPP. + * This is predominantly a debugging and testing mechanism. + */ +/*? %%clicmd:group_label Command line session %% ?*/ +/*? %%syscfg:group_label Command line session %% ?*/ + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vppinfra/timer.h> + +#include <ctype.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <termios.h> +#include <signal.h> +#include <unistd.h> +#include <arpa/telnet.h> +#include <sys/ioctl.h> + +/** ANSI escape code. */ +#define ESC "\x1b" + +/** ANSI Control Sequence Introducer. */ +#define CSI ESC "[" + +/** ANSI clear screen. */ +#define ANSI_CLEAR CSI "2J" CSI "1;1H" +/** ANSI reset color settings. */ +#define ANSI_RESET CSI "0m" +/** ANSI Start bold text. 
*/ +#define ANSI_BOLD CSI "1m" +/** ANSI Start dim (faint) text. */ +#define ANSI_DIM CSI "2m" +/** ANSI Start dark red text. */ +#define ANSI_DRED ANSI_DIM CSI "31m" +/** ANSI Start bright red text. */ +#define ANSI_BRED ANSI_BOLD CSI "31m" +/** ANSI clear line cursor is on. */ +#define ANSI_CLEARLINE CSI "2K" +/** ANSI scroll screen down one line. */ +#define ANSI_SCROLLDN CSI "1T" +/** ANSI save cursor position. */ +#define ANSI_SAVECURSOR CSI "s" +/** ANSI restore cursor position if previously saved. */ +#define ANSI_RESTCURSOR CSI "u" + +/** Maximum depth into a byte stream from which to compile a Telnet + * protocol message. This is a safety measure. */ +#define UNIX_CLI_MAX_DEPTH_TELNET 24 + +/** Unix standard in */ +#define UNIX_CLI_STDIN_FD 0 + + +/** A CLI banner line. */ +typedef struct +{ + u8 *line; /**< The line to print. */ + u32 length; /**< The length of the line without terminating NUL. */ +} unix_cli_banner_t; + +#define _(a) { .line = (u8 *)(a), .length = sizeof(a) - 1 } +/** Plain welcome banner. */ +static unix_cli_banner_t unix_cli_banner[] = { + _(" _______ _ _ _____ ___ \n"), + _(" __/ __/ _ \\ (_)__ | | / / _ \\/ _ \\\n"), + _(" _/ _// // / / / _ \\ | |/ / ___/ ___/\n"), + _(" /_/ /____(_)_/\\___/ |___/_/ /_/ \n"), + _("\n") +}; + +/** ANSI color welcome banner. */ +static unix_cli_banner_t unix_cli_banner_color[] = { + _(ANSI_BRED " _______ _ " ANSI_RESET " _ _____ ___ \n"), + _(ANSI_BRED " __/ __/ _ \\ (_)__ " ANSI_RESET " | | / / _ \\/ _ \\\n"), + _(ANSI_BRED " _/ _// // / / / _ \\" ANSI_RESET " | |/ / ___/ ___/\n"), + _(ANSI_BRED " /_/ /____(_)_/\\___/" ANSI_RESET " |___/_/ /_/ \n"), + _("\n") +}; + +#undef _ + +/** Pager line index */ +typedef struct +{ + /** Index into pager_vector */ + u32 line; + + /** Offset of the string in the line */ + u32 offset; + + /** Length of the string in the line */ + u32 length; +} unix_cli_pager_index_t; + + +/** Unix CLI session.
*/ +typedef struct +{ + /** The file index held by unix.c */ + u32 unix_file_index; + + /** Vector of output pending write to file descriptor. */ + u8 *output_vector; + + /** Vector of input saved by Unix input node to be processed by + CLI process. */ + u8 *input_vector; + + /** This session has command history. */ + u8 has_history; + /** Array of vectors of commands in the history. */ + u8 **command_history; + /** The command currently pointed at by the history cursor. */ + u8 *current_command; + /** How far from the end of the history array the user has browsed. */ + i32 excursion; + + /** Maximum number of history entries this session will store. */ + u32 history_limit; + + /** Current command line counter */ + u32 command_number; + + /** The string being searched for in the history. */ + u8 *search_key; + /** If non-zero then the CLI is searching in the history array. + * - @c -1 means search backwards. + * - @c 1 means search forwards. + */ + int search_mode; + + /** Position of the insert cursor on the current input line */ + u32 cursor; + + /** Line mode or char mode */ + u8 line_mode; + + /** Set if the CRLF mode wants CR + LF */ + u8 crlf_mode; + + /** Can we do ANSI output? */ + u8 ansi_capable; + + /** Has the session started? */ + u8 started; + + /** Disable the pager? */ + u8 no_pager; + + /** Pager buffer */ + u8 **pager_vector; + + /** Index of line fragments in the pager buffer */ + unix_cli_pager_index_t *pager_index; + + /** Line number of top of page */ + u32 pager_start; + + /** Terminal width */ + u32 width; + + /** Terminal height */ + u32 height; + + /** Process node identifier */ + u32 process_node_index; +} unix_cli_file_t; + +/** Resets the pager buffer and other data. + * @param f The CLI session whose pager needs to be reset. 
 + */
+always_inline void
+unix_cli_pager_reset (unix_cli_file_t * f)
+{
+  u8 **p;
+
+  f->pager_start = 0;
+
+  vec_free (f->pager_index);
+  f->pager_index = 0;
+
+  vec_foreach (p, f->pager_vector)
+  {
+    vec_free (*p); /* release each buffered string */
+  }
+  vec_free (f->pager_vector); /* then the vector of string pointers itself */
+  f->pager_vector = 0;
+}
+
+/** Release storage used by a CLI session.
+ * Frees the pending output and input vectors and resets the pager. The
+ * history-related vectors (command_history, current_command, search_key)
+ * are not released by this function.
+ * @param f The CLI session whose storage needs to be released.
+ */
+always_inline void
+unix_cli_file_free (unix_cli_file_t * f)
+{
+  vec_free (f->output_vector);
+  vec_free (f->input_vector);
+  unix_cli_pager_reset (f);
+}
+
+/** CLI actions.
+ * Values produced by the input matcher for both the line editor and the
+ * pager. */
+typedef enum
+{
+  UNIX_CLI_PARSE_ACTION_NOACTION = 0, /**< No action */
+  UNIX_CLI_PARSE_ACTION_CRLF, /**< Carriage return, newline or enter */
+  UNIX_CLI_PARSE_ACTION_TAB, /**< Tab key */
+  UNIX_CLI_PARSE_ACTION_ERASE, /**< Erase cursor left */
+  UNIX_CLI_PARSE_ACTION_ERASERIGHT, /**< Erase cursor right */
+  UNIX_CLI_PARSE_ACTION_UP, /**< Up arrow */
+  UNIX_CLI_PARSE_ACTION_DOWN, /**< Down arrow */
+  UNIX_CLI_PARSE_ACTION_LEFT, /**< Left arrow */
+  UNIX_CLI_PARSE_ACTION_RIGHT, /**< Right arrow */
+  UNIX_CLI_PARSE_ACTION_HOME, /**< Home key (jump to start of line) */
+  UNIX_CLI_PARSE_ACTION_END, /**< End key (jump to end of line) */
+  UNIX_CLI_PARSE_ACTION_WORDLEFT, /**< Jump cursor to start of left word */
+  UNIX_CLI_PARSE_ACTION_WORDRIGHT, /**< Jump cursor to start of right word */
+  UNIX_CLI_PARSE_ACTION_ERASELINELEFT, /**< Erase line to left of cursor */
+  UNIX_CLI_PARSE_ACTION_ERASELINERIGHT, /**< Erase line to right & including cursor */
+  UNIX_CLI_PARSE_ACTION_CLEAR, /**< Clear the terminal */
+  UNIX_CLI_PARSE_ACTION_REVSEARCH, /**< Search backwards in command history */
+  UNIX_CLI_PARSE_ACTION_FWDSEARCH, /**< Search forwards in command history */
+  UNIX_CLI_PARSE_ACTION_YANK, /**< Undo last erase action */
+  UNIX_CLI_PARSE_ACTION_TELNETIAC, /**< Telnet control code (IAC byte seen) */
+
+  UNIX_CLI_PARSE_ACTION_PAGER_CRLF, /**< Enter pressed (CR, CRLF, LF, etc) */
+  UNIX_CLI_PARSE_ACTION_PAGER_QUIT, /**< Exit the pager session */
+  UNIX_CLI_PARSE_ACTION_PAGER_NEXT, /**< Scroll to next page (space bar) */
+  UNIX_CLI_PARSE_ACTION_PAGER_DN, /**< Scroll to next line */
+  UNIX_CLI_PARSE_ACTION_PAGER_UP, /**< Scroll to previous line */
+  UNIX_CLI_PARSE_ACTION_PAGER_TOP, /**< Scroll to first line */
+  UNIX_CLI_PARSE_ACTION_PAGER_BOTTOM, /**< Scroll to last line */
+  UNIX_CLI_PARSE_ACTION_PAGER_PGDN, /**< Scroll to next page (page-down key) */
+  UNIX_CLI_PARSE_ACTION_PAGER_PGUP, /**< Scroll to previous page */
+  UNIX_CLI_PARSE_ACTION_PAGER_REDRAW, /**< Clear and redraw the page on the terminal */
+  UNIX_CLI_PARSE_ACTION_PAGER_SEARCH, /**< Search the pager buffer */
+
+  UNIX_CLI_PARSE_ACTION_PARTIALMATCH, /**< Action parser found a partial match */
+  UNIX_CLI_PARSE_ACTION_NOMATCH /**< Action parser did not find any match */
+} unix_cli_parse_action_t;
+
+/** @brief Mapping of input buffer strings to action values.
+ * @note This won't work as a hash since we need to be able to do
+ * partial matches on the string.
+ */
+typedef struct
+{
+  u8 *input; /**< Input string to match; NULL marks the end of a table. */
+  u32 len; /**< Length of input without final NUL. */
+  unix_cli_parse_action_t action; /**< Action to take when matched. */
+} unix_cli_parse_actions_t;
+
+/** @brief Given a capital ASCII letter character return a @c NUL terminated
+ * string with the control code for that letter.
+ *
+ * The control code is computed as the character value minus the value of
+ * '@'.
+ *
+ * @param c An ASCII character.
+ * @return A @c NUL terminated string of type @c u8[].
+ *
+ * @par Example
+ * @c CTL('A') returns <code>{ 0x01, 0x00 }</code> as a @c u8[].
+ */
+#define CTL(c) (u8[]){ (c) - '@', 0 }
+
+#define _(a,b) { .input = (u8 *)(a), .len = sizeof(a) - 1, .action = (b) }
+/**
+ * Patterns to match on a CLI input stream. 
 + * @showinitializer
+ */
+static unix_cli_parse_actions_t unix_cli_parse_strings[] = {
+  /* Line handling. Entries are tried in order and the first complete match
+   * wins, so a longer sequence must be listed before any of its prefixes. */
+  _("\r\n", UNIX_CLI_PARSE_ACTION_CRLF), /* Must be before '\r' */
+  _("\n", UNIX_CLI_PARSE_ACTION_CRLF),
+  _("\r\0", UNIX_CLI_PARSE_ACTION_CRLF), /* Telnet does this */
+  _("\r", UNIX_CLI_PARSE_ACTION_CRLF),
+
+  /* Unix shell control codes */
+  _(CTL ('B'), UNIX_CLI_PARSE_ACTION_LEFT),
+  _(CTL ('F'), UNIX_CLI_PARSE_ACTION_RIGHT),
+  _(CTL ('P'), UNIX_CLI_PARSE_ACTION_UP),
+  _(CTL ('N'), UNIX_CLI_PARSE_ACTION_DOWN),
+  _(CTL ('A'), UNIX_CLI_PARSE_ACTION_HOME),
+  _(CTL ('E'), UNIX_CLI_PARSE_ACTION_END),
+  _(CTL ('D'), UNIX_CLI_PARSE_ACTION_ERASERIGHT),
+  _(CTL ('U'), UNIX_CLI_PARSE_ACTION_ERASELINELEFT),
+  _(CTL ('K'), UNIX_CLI_PARSE_ACTION_ERASELINERIGHT),
+  _(CTL ('Y'), UNIX_CLI_PARSE_ACTION_YANK),
+  _(CTL ('L'), UNIX_CLI_PARSE_ACTION_CLEAR),
+  _(ESC "b", UNIX_CLI_PARSE_ACTION_WORDLEFT), /* Alt-B */
+  _(ESC "f", UNIX_CLI_PARSE_ACTION_WORDRIGHT), /* Alt-F */
+  _("\b", UNIX_CLI_PARSE_ACTION_ERASE), /* ^H */
+  _("\x7f", UNIX_CLI_PARSE_ACTION_ERASE), /* Backspace */
+  _("\t", UNIX_CLI_PARSE_ACTION_TAB), /* ^I */
+
+  /* VT100 Normal mode - Broadest support */
+  _(CSI "A", UNIX_CLI_PARSE_ACTION_UP),
+  _(CSI "B", UNIX_CLI_PARSE_ACTION_DOWN),
+  _(CSI "C", UNIX_CLI_PARSE_ACTION_RIGHT),
+  _(CSI "D", UNIX_CLI_PARSE_ACTION_LEFT),
+  _(CSI "H", UNIX_CLI_PARSE_ACTION_HOME),
+  _(CSI "F", UNIX_CLI_PARSE_ACTION_END),
+  _(CSI "3~", UNIX_CLI_PARSE_ACTION_ERASERIGHT), /* Delete */
+  _(CSI "1;5D", UNIX_CLI_PARSE_ACTION_WORDLEFT), /* C-Left */
+  _(CSI "1;5C", UNIX_CLI_PARSE_ACTION_WORDRIGHT), /* C-Right */
+
+  /* VT100 Application mode - Some Gnome Terminal functions use these */
+  _(ESC "OA", UNIX_CLI_PARSE_ACTION_UP),
+  _(ESC "OB", UNIX_CLI_PARSE_ACTION_DOWN),
+  _(ESC "OC", UNIX_CLI_PARSE_ACTION_RIGHT),
+  _(ESC "OD", UNIX_CLI_PARSE_ACTION_LEFT),
+  _(ESC "OH", UNIX_CLI_PARSE_ACTION_HOME),
+  _(ESC "OF", UNIX_CLI_PARSE_ACTION_END),
+
+  /* ANSI X3.41-1974 - sent by Microsoft Telnet and PuTTY */
+  _(CSI "1~", UNIX_CLI_PARSE_ACTION_HOME),
+  _(CSI "4~", UNIX_CLI_PARSE_ACTION_END),
+
+  /* Emacs-ish history search */
+  _(CTL ('S'), UNIX_CLI_PARSE_ACTION_FWDSEARCH),
+  _(CTL ('R'), UNIX_CLI_PARSE_ACTION_REVSEARCH),
+
+  /* Other protocol things */
+  _("\xff", UNIX_CLI_PARSE_ACTION_TELNETIAC), /* IAC */
+  _("\0", UNIX_CLI_PARSE_ACTION_NOACTION), /* NUL */
+  _(NULL, UNIX_CLI_PARSE_ACTION_NOMATCH) /* NULL input ends the table; its len is not meaningful */
+};
+
+/**
+ * Patterns to match when a CLI session is in the pager.
+ * Uses the same layout and ordering rules as the line-editing table: first
+ * complete match wins and a NULL input terminates the list.
+ * @showinitializer
+ */
+static unix_cli_parse_actions_t unix_cli_parse_pager[] = {
+  /* Line handling */
+  _("\r\n", UNIX_CLI_PARSE_ACTION_PAGER_CRLF), /* Must be before '\r' */
+  _("\n", UNIX_CLI_PARSE_ACTION_PAGER_CRLF),
+  _("\r\0", UNIX_CLI_PARSE_ACTION_PAGER_CRLF), /* Telnet does this */
+  _("\r", UNIX_CLI_PARSE_ACTION_PAGER_CRLF),
+
+  /* Pager commands */
+  _(" ", UNIX_CLI_PARSE_ACTION_PAGER_NEXT),
+  _("q", UNIX_CLI_PARSE_ACTION_PAGER_QUIT),
+  _(CTL ('L'), UNIX_CLI_PARSE_ACTION_PAGER_REDRAW),
+  _(CTL ('R'), UNIX_CLI_PARSE_ACTION_PAGER_REDRAW),
+  _("/", UNIX_CLI_PARSE_ACTION_PAGER_SEARCH),
+
+  /* VT100 */
+  _(CSI "A", UNIX_CLI_PARSE_ACTION_PAGER_UP),
+  _(CSI "B", UNIX_CLI_PARSE_ACTION_PAGER_DN),
+  _(CSI "H", UNIX_CLI_PARSE_ACTION_PAGER_TOP),
+  _(CSI "F", UNIX_CLI_PARSE_ACTION_PAGER_BOTTOM),
+
+  /* VT100 Application mode */
+  _(ESC "OA", UNIX_CLI_PARSE_ACTION_PAGER_UP),
+  _(ESC "OB", UNIX_CLI_PARSE_ACTION_PAGER_DN),
+  _(ESC "OH", UNIX_CLI_PARSE_ACTION_PAGER_TOP),
+  _(ESC "OF", UNIX_CLI_PARSE_ACTION_PAGER_BOTTOM),
+
+  /* ANSI X3.41-1974 */
+  _(CSI "1~", UNIX_CLI_PARSE_ACTION_PAGER_TOP),
+  _(CSI "4~", UNIX_CLI_PARSE_ACTION_PAGER_BOTTOM),
+  _(CSI "5~", UNIX_CLI_PARSE_ACTION_PAGER_PGUP),
+  _(CSI "6~", UNIX_CLI_PARSE_ACTION_PAGER_PGDN),
+
+  /* Other protocol things */
+  _("\xff", UNIX_CLI_PARSE_ACTION_TELNETIAC), /* IAC */
+  _("\0", UNIX_CLI_PARSE_ACTION_NOACTION), /* NUL */
+  _(NULL, UNIX_CLI_PARSE_ACTION_NOMATCH) /* NULL input ends the table */
+};
+
+#undef _
+
+/** CLI session events. 
*/
+typedef enum
+{
+  UNIX_CLI_PROCESS_EVENT_READ_READY, /**< A file descriptor has data to be read. */
+  UNIX_CLI_PROCESS_EVENT_QUIT, /**< A CLI session wants to close. */
+} unix_cli_process_event_type_t;
+
+/** CLI global state. */
+typedef struct
+{
+  /** Prompt string for CLI; a vector, emitted using its vector length. */
+  u8 *cli_prompt;
+
+  /** Pool of CLI sessions, indexed by CLI file index. */
+  unix_cli_file_t *cli_file_pool;
+
+  /** Vector of process node indices that are currently unused
+   * (NOTE(review): presumably kept for reuse by new sessions - confirm). */
+  u32 *unused_cli_process_node_indices;
+
+  /** The session index of the stdin cli */
+  u32 stdin_cli_file_index;
+
+  /** File pool index of current input. */
+  u32 current_input_file_index;
+} unix_cli_main_t;
+
+/** The single file-scope instance of the CLI global state. */
+static unix_cli_main_t unix_cli_main;
+
+/**
+ * @brief Search for a byte sequence in the action list.
+ *
+ * Searches the @ref unix_cli_parse_actions_t list in @a a for a match with
+ * the bytes in @a input of maximum length @a ilen bytes.
+ * When a match is made @a *matched indicates how many bytes were matched.
+ * Returns a value from the enum @ref unix_cli_parse_action_t to indicate
+ * whether no match was found, a partial match was found or a complete
+ * match was found and what action, if any, should be taken.
+ *
+ * @param[in] a Actions list to search within.
+ * @param[in] input String fragment to search for.
+ * @param[in] ilen Length of the string in 'input'.
+ * @param[out] matched Pointer to an integer that will contain the number
+ * of bytes matched when a complete match is found.
+ *
+ * @return Action from @ref unix_cli_parse_action_t that the string fragment
+ * matches.
+ * @ref UNIX_CLI_PARSE_ACTION_PARTIALMATCH is returned when the
+ * whole input string matches the start of at least one action.
+ * @ref UNIX_CLI_PARSE_ACTION_NOMATCH is returned when there is no
+ * match at all. 
+ */ +static unix_cli_parse_action_t +unix_cli_match_action (unix_cli_parse_actions_t * a, + u8 * input, u32 ilen, i32 * matched) +{ + u8 partial = 0; + + while (a->input) + { + if (ilen >= a->len) + { + /* see if the start of the input buffer exactly matches the current + * action string. */ + if (memcmp (input, a->input, a->len) == 0) + { + *matched = a->len; + return a->action; + } + } + else + { + /* if the first ilen characters match, flag this as a partial - + * meaning keep collecting bytes in case of a future match */ + if (memcmp (input, a->input, ilen) == 0) + partial = 1; + } + + /* check next action */ + a++; + } + + return partial ? + UNIX_CLI_PARSE_ACTION_PARTIALMATCH : UNIX_CLI_PARSE_ACTION_NOMATCH; +} + + +/** Add bytes to the output vector and then flagg the I/O system that bytes + * are available to be sent. + */ +static void +unix_cli_add_pending_output (unix_file_t * uf, + unix_cli_file_t * cf, + u8 * buffer, uword buffer_bytes) +{ + unix_main_t *um = &unix_main; + + vec_add (cf->output_vector, buffer, buffer_bytes); + if (vec_len (cf->output_vector) > 0) + { + int skip_update = 0 != (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); + uf->flags |= UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + if (!skip_update) + um->file_update (uf, UNIX_FILE_UPDATE_MODIFY); + } +} + +/** Delete all bytes from the output vector and flag the I/O system + * that no more bytes are available to be sent. + */ +static void +unix_cli_del_pending_output (unix_file_t * uf, + unix_cli_file_t * cf, uword n_bytes) +{ + unix_main_t *um = &unix_main; + + vec_delete (cf->output_vector, n_bytes, 0); + if (vec_len (cf->output_vector) <= 0) + { + int skip_update = 0 == (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); + uf->flags &= ~UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + if (!skip_update) + um->file_update (uf, UNIX_FILE_UPDATE_MODIFY); + } +} + +/** @brief A bit like strchr with a buffer length limit. 
+ * Search a buffer for the first instance of a character up to the limit of + * the buffer length. If found then return the position of that character. + * + * The key departure from strchr is that if the character is not found then + * return the buffer length. + * + * @param chr The byte value to search for. + * @param str The buffer in which to search for the value. + * @param len The depth into the buffer to search. + * + * @return The index of the first occurence of \c chr. If \c chr is not + * found then \c len instead. + */ +always_inline word +unix_vlib_findchr (u8 chr, u8 * str, word len) +{ + word i = 0; + for (i = 0; i < len; i++, str++) + { + if (*str == chr) + return i; + } + return len; +} + +/** @brief Send a buffer to the CLI stream if possible, enqueue it otherwise. + * Attempts to write given buffer to the file descriptor of the given + * Unix CLI session. If that session already has data in the output buffer + * or if the write attempt tells us to try again later then the given buffer + * is appended to the pending output buffer instead. + * + * This is typically called only from \c unix_vlib_cli_output_cooked since + * that is where CRLF handling occurs or from places where we explicitly do + * not want cooked handling. + * + * @param cf Unix CLI session of the desired stream to write to. + * @param uf The Unix file structure of the desired stream to write to. + * @param buffer Pointer to the buffer that needs to be written. + * @param buffer_bytes The number of bytes from \c buffer to write. 
+ */ +static void +unix_vlib_cli_output_raw (unix_cli_file_t * cf, + unix_file_t * uf, u8 * buffer, uword buffer_bytes) +{ + int n = 0; + + if (vec_len (cf->output_vector) == 0) + n = write (uf->file_descriptor, buffer, buffer_bytes); + + if (n < 0 && errno != EAGAIN) + { + clib_unix_warning ("write"); + } + else if ((word) n < (word) buffer_bytes) + { + /* We got EAGAIN or we already have stuff in the buffer; + * queue up whatever didn't get sent for later. */ + if (n < 0) + n = 0; + unix_cli_add_pending_output (uf, cf, buffer + n, buffer_bytes - n); + } +} + +/** @brief Process a buffer for CRLF handling before outputting it to the CLI. + * + * @param cf Unix CLI session of the desired stream to write to. + * @param uf The Unix file structure of the desired stream to write to. + * @param buffer Pointer to the buffer that needs to be written. + * @param buffer_bytes The number of bytes from \c buffer to write. + */ +static void +unix_vlib_cli_output_cooked (unix_cli_file_t * cf, + unix_file_t * uf, + u8 * buffer, uword buffer_bytes) +{ + word end = 0, start = 0; + + while (end < buffer_bytes) + { + if (cf->crlf_mode) + { + /* iterate the line on \n's so we can insert a \r before it */ + end = unix_vlib_findchr ('\n', + buffer + start, + buffer_bytes - start) + start; + } + else + { + /* otherwise just send the whole buffer */ + end = buffer_bytes; + } + + unix_vlib_cli_output_raw (cf, uf, buffer + start, end - start); + + if (cf->crlf_mode) + { + if (end < buffer_bytes) + { + unix_vlib_cli_output_raw (cf, uf, (u8 *) "\r\n", 2); + end++; /* skip the \n that we already sent */ + } + start = end; + } + } +} + +/** @brief Output the CLI prompt */ +static void +unix_cli_cli_prompt (unix_cli_file_t * cf, unix_file_t * uf) +{ + unix_cli_main_t *cm = &unix_cli_main; + + unix_vlib_cli_output_raw (cf, uf, cm->cli_prompt, vec_len (cm->cli_prompt)); +} + +/** @brief Output a pager prompt and show number of buffered lines */ +static void +unix_cli_pager_prompt (unix_cli_file_t 
* cf, unix_file_t * uf) +{ + u8 *prompt; + u32 h; + + h = cf->pager_start + (cf->height - 1); + if (h > vec_len (cf->pager_index)) + h = vec_len (cf->pager_index); + + prompt = format (0, "\r%s-- more -- (%d-%d/%d)%s", + cf->ansi_capable ? ANSI_BOLD : "", + cf->pager_start + 1, + h, + vec_len (cf->pager_index), + cf->ansi_capable ? ANSI_RESET : ""); + + unix_vlib_cli_output_cooked (cf, uf, prompt, vec_len (prompt)); + + vec_free (prompt); +} + +/** @brief Output a pager "skipping" message */ +static void +unix_cli_pager_message (unix_cli_file_t * cf, unix_file_t * uf, + char *message, char *postfix) +{ + u8 *prompt; + + prompt = format (0, "\r%s-- %s --%s%s", + cf->ansi_capable ? ANSI_BOLD : "", + message, cf->ansi_capable ? ANSI_RESET : "", postfix); + + unix_vlib_cli_output_cooked (cf, uf, prompt, vec_len (prompt)); + + vec_free (prompt); +} + +/** @brief Erase the printed pager prompt */ +static void +unix_cli_pager_prompt_erase (unix_cli_file_t * cf, unix_file_t * uf) +{ + if (cf->ansi_capable) + { + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\r", 1); + unix_vlib_cli_output_cooked (cf, uf, + (u8 *) ANSI_CLEARLINE, + sizeof (ANSI_CLEARLINE) - 1); + } + else + { + int i; + + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\r", 1); + for (i = 0; i < cf->width - 1; i++) + unix_vlib_cli_output_cooked (cf, uf, (u8 *) " ", 1); + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\r", 1); + } +} + +/** @brief Uses an ANSI escape sequence to move the cursor */ +static void +unix_cli_ansi_cursor (unix_cli_file_t * cf, unix_file_t * uf, u16 x, u16 y) +{ + u8 *str; + + str = format (0, "%s%d;%dH", CSI, y, x); + + unix_vlib_cli_output_cooked (cf, uf, str, vec_len (str)); + + vec_free (str); +} + +/** Redraw the currently displayed page of text. + * @param cf CLI session to redraw the pager buffer of. + * @param uf Unix file of the CLI session. 
+ */ +static void +unix_cli_pager_redraw (unix_cli_file_t * cf, unix_file_t * uf) +{ + unix_cli_pager_index_t *pi = NULL; + u8 *line = NULL; + word i; + + /* No active pager? Do nothing. */ + if (!vec_len (cf->pager_index)) + return; + + if (cf->ansi_capable) + { + /* If we have ANSI, send the clear screen sequence */ + unix_vlib_cli_output_cooked (cf, uf, + (u8 *) ANSI_CLEAR, + sizeof (ANSI_CLEAR) - 1); + } + else + { + /* Otherwise make sure we're on a blank line */ + unix_cli_pager_prompt_erase (cf, uf); + } + + /* (Re-)send the current page of content */ + for (i = 0; i < cf->height - 1 && + i + cf->pager_start < vec_len (cf->pager_index); i++) + { + pi = &cf->pager_index[cf->pager_start + i]; + line = cf->pager_vector[pi->line] + pi->offset; + + unix_vlib_cli_output_cooked (cf, uf, line, pi->length); + } + /* if the last line didn't end in newline, add a newline */ + if (pi && line[pi->length - 1] != '\n') + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1); + + unix_cli_pager_prompt (cf, uf); +} + +/** @brief Process and add a line to the pager index. + * In normal operation this function will take the given character string + * found in @c line and with length @c len_or_index and iterates the over the + * contents, adding each line of text discovered within it to the + * pager index. Lines are identified by newlines ("<code>\\n</code>") and by + * strings longer than the width of the terminal. + * + * If instead @c line is @c NULL then @c len_or_index is taken to mean the + * index of an existing line in the pager buffer; this simply means that the + * input line does not need to be cloned since we alreayd have it. This is + * typical if we are reindexing the pager buffer. + * + * @param cf The CLI session whose pager we are adding to. + * @param line The string of text to be indexed into the pager buffer. + * If @c line is @c NULL then the mode of operation + * changes slightly; see the description above. 
+ * @param len_or_index If @c line is a pointer to a string then this parameter + * indicates the length of that string; Otherwise this + * value provides the index in the pager buffer of an + * existing string to be indexed. + */ +static void +unix_cli_pager_add_line (unix_cli_file_t * cf, u8 * line, word len_or_index) +{ + u8 *p; + word i, j, k; + word line_index, len; + u32 width = cf->width; + unix_cli_pager_index_t *pi; + + if (line == NULL) + { + /* Use a line already in the pager buffer */ + line_index = len_or_index; + p = cf->pager_vector[line_index]; + len = vec_len (p); + } + else + { + len = len_or_index; + /* Add a copy of the raw string to the pager buffer */ + p = vec_new (u8, len); + clib_memcpy (p, line, len); + + /* store in pager buffer */ + line_index = vec_len (cf->pager_vector); + vec_add1 (cf->pager_vector, p); + } + + i = 0; + while (i < len) + { + /* Find the next line, or run to terminal width, or run to EOL */ + int l = len - i; + j = unix_vlib_findchr ((u8) '\n', p, l < width ? l : width); + + if (j < l && p[j] == '\n') /* incl \n */ + j++; + + /* Add the line to the index */ + k = vec_len (cf->pager_index); + vec_validate (cf->pager_index, k); + pi = &cf->pager_index[k]; + + pi->line = line_index; + pi->offset = i; + pi->length = j; + + i += j; + p += j; + } +} + +/** @brief Reindex entire pager buffer. + * Resets the current pager index and then re-adds the lines in the pager + * buffer to the index. + * + * Additionally this function attempts to retain the current page start + * line offset by searching for the same top-of-screen line in the new index. + * + * @param cf The CLI session whose pager buffer should be reindexed. + */ +static void +unix_cli_pager_reindex (unix_cli_file_t * cf) +{ + word i, old_line, old_offset; + unix_cli_pager_index_t *pi; + + /* If there is nothing in the pager buffer then make sure the index + * is empty and move on. 
+ */ + if (cf->pager_vector == 0) + { + vec_reset_length (cf->pager_index); + return; + } + + /* Retain a pointer to the current page start line so we can + * find it later + */ + pi = &cf->pager_index[cf->pager_start]; + old_line = pi->line; + old_offset = pi->offset; + + /* Re-add the buffered lines to the index */ + vec_reset_length (cf->pager_index); + vec_foreach_index (i, cf->pager_vector) + { + unix_cli_pager_add_line (cf, NULL, i); + } + + /* Attempt to re-locate the previously stored page start line */ + vec_foreach_index (i, cf->pager_index) + { + pi = &cf->pager_index[i]; + + if (pi->line == old_line && + (pi->offset <= old_offset || pi->offset + pi->length > old_offset)) + { + /* Found it! */ + cf->pager_start = i; + break; + } + } + + /* In case the start line was not found (rare), ensure the pager start + * index is within bounds + */ + if (cf->pager_start >= vec_len (cf->pager_index)) + { + if (!cf->height || vec_len (cf->pager_index) < (cf->height - 1)) + cf->pager_start = 0; + else + cf->pager_start = vec_len (cf->pager_index) - (cf->height - 1); + } +} + +/** VLIB CLI output function. + * + * If the terminal has a pager configured then this function takes care + * of collating output into the pager buffer; ensuring only the first page + * is displayed and any lines in excess of the first page are buffered. + * + * If the maximum number of index lines in the buffer is exceeded then the + * pager is cancelled and the contents of the current buffer are sent to the + * terminal. + * + * If there is no pager configured then the output is sent directly to the + * terminal. + * + * @param cli_file_index Index of the CLI session where this output is + * directed. + * @param buffer String of printabe bytes to be output. + * @param buffer_bytes The number of bytes in @c buffer to be output. 
+ */ +static void +unix_vlib_cli_output (uword cli_file_index, u8 * buffer, uword buffer_bytes) +{ + unix_main_t *um = &unix_main; + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf; + unix_file_t *uf; + + cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); + uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + + if (cf->no_pager || um->cli_pager_buffer_limit == 0 || cf->height == 0) + { + unix_vlib_cli_output_cooked (cf, uf, buffer, buffer_bytes); + } + else + { + word row = vec_len (cf->pager_index); + u8 *line; + unix_cli_pager_index_t *pi; + + /* Index and add the output lines to the pager buffer. */ + unix_cli_pager_add_line (cf, buffer, buffer_bytes); + + /* Now iterate what was added to display the lines. + * If we reach the bottom of the page, display a prompt. + */ + while (row < vec_len (cf->pager_index)) + { + if (row < cf->height - 1) + { + /* output this line */ + pi = &cf->pager_index[row]; + line = cf->pager_vector[pi->line] + pi->offset; + unix_vlib_cli_output_cooked (cf, uf, line, pi->length); + + /* if the last line didn't end in newline, and we're at the + * bottom of the page, add a newline */ + if (line[pi->length - 1] != '\n' && row == cf->height - 2) + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1); + } + else + { + /* Display the pager prompt every 10 lines */ + if (!(row % 10)) + unix_cli_pager_prompt (cf, uf); + } + row++; + } + + /* Check if we went over the pager buffer limit */ + if (vec_len (cf->pager_index) > um->cli_pager_buffer_limit) + { + /* Stop using the pager for the remainder of this CLI command */ + cf->no_pager = 2; + + /* If we likely printed the prompt, erase it */ + if (vec_len (cf->pager_index) > cf->height - 1) + unix_cli_pager_prompt_erase (cf, uf); + + /* Dump out the contents of the buffer */ + for (row = cf->pager_start + (cf->height - 1); + row < vec_len (cf->pager_index); row++) + { + pi = &cf->pager_index[row]; + line = cf->pager_vector[pi->line] + pi->offset; + 
unix_vlib_cli_output_cooked (cf, uf, line, pi->length); + } + + unix_cli_pager_reset (cf); + } + } +} + +/** Identify whether a terminal type is ANSI capable. + * + * Compares the string given in @c term with a list of terminal types known + * to support ANSI escape sequences. + * + * This list contains, for example, @c xterm, @c screen and @c ansi. + * + * @param term A string with a terminal type in it. + * @param len The length of the string in @c term. + * + * @return @c 1 if the terminal type is recognized as supporting ANSI + * terminal sequences; @c 0 otherwise. + */ +static u8 +unix_cli_terminal_type (u8 * term, uword len) +{ + /* This may later be better done as a hash of some sort. */ +#define _(a) do { \ + if (strncasecmp(a, (char *)term, (size_t)len) == 0) return 1; \ + } while(0) + + _("xterm"); + _("xterm-color"); + _("xterm-256color"); /* iTerm on Mac */ + _("screen"); + _("ansi"); /* Microsoft Telnet */ +#undef _ + + return 0; +} + +/** @brief Emit initial welcome banner and prompt on a connection. */ +static void +unix_cli_file_welcome (unix_cli_main_t * cm, unix_cli_file_t * cf) +{ + unix_main_t *um = &unix_main; + unix_file_t *uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + unix_cli_banner_t *banner; + int i, len; + + /* + * Put the first bytes directly into the buffer so that further output is + * queued until everything is ready. (oterwise initial prompt can appear + * mid way through VPP initialization) + */ + unix_cli_add_pending_output (uf, cf, (u8 *) "\r", 1); + + if (!um->cli_no_banner) + { + if (cf->ansi_capable) + { + banner = unix_cli_banner_color; + len = ARRAY_LEN (unix_cli_banner_color); + } + else + { + banner = unix_cli_banner; + len = ARRAY_LEN (unix_cli_banner); + } + + for (i = 0; i < len; i++) + { + unix_vlib_cli_output_cooked (cf, uf, + banner[i].line, banner[i].length); + } + } + + /* Prompt. 
*/ + unix_cli_cli_prompt (cf, uf); + + cf->started = 1; +} + +/** @brief A failsafe triggered on a timer to ensure we send the prompt + * to telnet sessions that fail to negotiate the terminal type. */ +static void +unix_cli_file_welcome_timer (any arg, f64 delay) +{ + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf; + (void) delay; + + /* Check the connection didn't close already */ + if (pool_is_free_index (cm->cli_file_pool, (uword) arg)) + return; + + cf = pool_elt_at_index (cm->cli_file_pool, (uword) arg); + + if (!cf->started) + unix_cli_file_welcome (cm, cf); +} + +/** @brief A mostly no-op Telnet state machine. + * Process Telnet command bytes in a way that ensures we're mostly + * transparent to the Telnet protocol. That is, it's mostly a no-op. + * + * @return -1 if we need more bytes, otherwise a positive integer number of + * bytes to consume from the input_vector, not including the initial + * IAC byte. + */ +static i32 +unix_cli_process_telnet (unix_main_t * um, + unix_cli_file_t * cf, + unix_file_t * uf, u8 * input_vector, uword len) +{ + /* Input_vector starts at IAC byte. + * See if we have a complete message; if not, return -1 so we wait for more. + * if we have a complete message, consume those bytes from the vector. + */ + i32 consume = 0; + + if (len == 1) + return -1; /* want more bytes */ + + switch (input_vector[1]) + { + case IAC: + /* two IAC's in a row means to pass through 0xff. + * since that makes no sense here, just consume it. 
+ */ + consume = 1; + break; + + case WILL: + case WONT: + case DO: + case DONT: + /* Expect 3 bytes */ + if (vec_len (input_vector) < 3) + return -1; /* want more bytes */ + + consume = 2; + break; + + case SB: + { + /* Sub option - search ahead for IAC SE to end it */ + i32 i; + for (i = 3; i < len && i < UNIX_CLI_MAX_DEPTH_TELNET; i++) + { + if (input_vector[i - 1] == IAC && input_vector[i] == SE) + { + /* We have a complete message; see if we care about it */ + switch (input_vector[2]) + { + case TELOPT_TTYPE: + if (input_vector[3] != 0) + break; + /* See if the terminal type is ANSI capable */ + cf->ansi_capable = + unix_cli_terminal_type (input_vector + 4, i - 5); + /* If session not started, we can release the pause */ + if (!cf->started) + /* Send the welcome banner and initial prompt */ + unix_cli_file_welcome (&unix_cli_main, cf); + break; + + case TELOPT_NAWS: + /* Window size */ + if (i != 8) /* check message is correct size */ + break; + cf->width = + clib_net_to_host_u16 (*((u16 *) (input_vector + 3))); + cf->height = + clib_net_to_host_u16 (*((u16 *) (input_vector + 5))); + /* reindex pager buffer */ + unix_cli_pager_reindex (cf); + /* redraw page */ + unix_cli_pager_redraw (cf, uf); + break; + + default: + break; + } + /* Consume it all */ + consume = i; + break; + } + } + + if (i == UNIX_CLI_MAX_DEPTH_TELNET) + consume = 1; /* hit max search depth, advance one byte */ + + if (consume == 0) + return -1; /* want more bytes */ + + break; + } + + case GA: + case EL: + case EC: + case AO: + case IP: + case BREAK: + case DM: + case NOP: + case SE: + case EOR: + case ABORT: + case SUSP: + case xEOF: + /* Simple one-byte messages */ + consume = 1; + break; + + case AYT: + /* Are You There - trigger a visible response */ + consume = 1; + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "fd.io VPP\n", 10); + break; + + default: + /* Unknown command! Eat the IAC byte */ + break; + } + + return consume; +} + +/** @brief Process actionable input. 
+ * Based on the \c action process the input; this typically involves
+ * searching the command history or editing the current command line.
+ */
+static int
+unix_cli_line_process_one (unix_cli_main_t * cm,
+                           unix_main_t * um,
+                           unix_cli_file_t * cf,
+                           unix_file_t * uf,
+                           u8 input, unix_cli_parse_action_t action)
+{
+  u8 *prev;                     /* history entry being recalled (UP/DOWN) */
+  int j, delta;                 /* j: scratch counter / byte count; delta: step or distance */
+  /* Return value: 0 only from the CRLF case below (a line was completed);
+   * every other action breaks out of the switch and returns 1. */
+  switch (action)
+    {
+    case UNIX_CLI_PARSE_ACTION_NOACTION:
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_REVSEARCH:
+    case UNIX_CLI_PARSE_ACTION_FWDSEARCH:
+      if (!cf->has_history || !cf->history_limit)
+        break;
+      if (cf->search_mode == 0)
+        {
+          /* Erase the current command (if any) */
+          for (j = 0; j < (vec_len (cf->current_command)); j++)
+            unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b \b", 3);
+
+          vec_reset_length (cf->search_key);
+          vec_reset_length (cf->current_command);
+          /* search_mode doubles as the direction: -1 reverse, +1 forward */
+          if (action == UNIX_CLI_PARSE_ACTION_REVSEARCH)
+            cf->search_mode = -1;
+          else
+            cf->search_mode = 1;
+          cf->cursor = 0;
+        }
+      else
+        {
+          /* Already searching: step to the next candidate and retry */
+          if (action == UNIX_CLI_PARSE_ACTION_REVSEARCH)
+            cf->search_mode = -1;
+          else
+            cf->search_mode = 1;
+
+          cf->excursion += cf->search_mode;
+          goto search_again;    /* label lives in the PARTIALMATCH/NOMATCH case */
+        }
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_ERASELINELEFT:
+      /* Erase the command from the cursor to the start */
+
+      /* Shimmy forwards to the new end of line position */
+      delta = vec_len (cf->current_command) - cf->cursor;
+      for (j = cf->cursor; j > delta; j--)
+        unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+      /* Zap from here to the end of what is currently displayed */
+      for (; j < (vec_len (cf->current_command)); j++)
+        unix_vlib_cli_output_cooked (cf, uf, (u8 *) " ", 1);
+      /* Get back to the start of the line */
+      for (j = 0; j < (vec_len (cf->current_command)); j++)
+        unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+
+      /* Keep only the bytes that were at/after the cursor */
+      j = vec_len (cf->current_command) - cf->cursor;
+      memmove (cf->current_command, cf->current_command + cf->cursor, j);
+      _vec_len (cf->current_command) = j;
+
+      /* Print the new contents */
+      unix_vlib_cli_output_cooked (cf, uf, cf->current_command, j);
+      /* Shimmy back to the start */
+      for (j = 0; j < (vec_len (cf->current_command)); j++)
+        unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+      cf->cursor = 0;
+
+      cf->search_mode = 0;
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_ERASELINERIGHT:
+      /* Erase the command from the cursor to the end */
+
+      /* Zap from cursor to end of what is currently displayed */
+      for (j = cf->cursor; j < (vec_len (cf->current_command)); j++)
+        unix_vlib_cli_output_cooked (cf, uf, (u8 *) " ", 1);
+      /* Get back to where we were */
+      for (j = cf->cursor; j < (vec_len (cf->current_command)); j++)
+        unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+
+      /* Truncate the line at the cursor */
+      _vec_len (cf->current_command) = cf->cursor;
+
+      cf->search_mode = 0;
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_LEFT:
+      if (cf->cursor > 0)
+        {
+          unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+          cf->cursor--;
+        }
+
+      cf->search_mode = 0;
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_RIGHT:
+      if (cf->cursor < vec_len (cf->current_command))
+        {
+          /* have to emit the character under the cursor */
+          unix_vlib_cli_output_cooked (cf, uf,
+                                       cf->current_command + cf->cursor, 1);
+          cf->cursor++;
+        }
+
+      cf->search_mode = 0;
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_UP:
+    case UNIX_CLI_PARSE_ACTION_DOWN:
+      if (!cf->has_history || !cf->history_limit)
+        break;
+      cf->search_mode = 0;
+      /* Erase the command */
+      for (j = cf->cursor; j < (vec_len (cf->current_command)); j++)
+        unix_vlib_cli_output_cooked (cf, uf, (u8 *) " ", 1);
+      for (j = 0; j < (vec_len (cf->current_command)); j++)
+        unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b \b", 3);
+      vec_reset_length (cf->current_command);
+      if (vec_len (cf->command_history))
+        {
+          /* excursion is the index into command_history being shown */
+          if (action == UNIX_CLI_PARSE_ACTION_UP)
+            delta = -1;
+          else
+            delta = 1;
+
+          cf->excursion += delta;
+
+          if (cf->excursion == vec_len (cf->command_history))
+            {
+              /* down-arrowed to last entry - want a blank line */
+              _vec_len (cf->current_command) = 0;
+            }
+          else if (cf->excursion < 0)
+            {
+              /* up-arrowed over the start to the end, want a blank line */
+              cf->excursion = vec_len (cf->command_history);
+              _vec_len (cf->current_command) = 0;
+            }
+          else
+            {
+              if (cf->excursion > (i32) vec_len (cf->command_history) - 1)
+                /* down-arrowed past end - wrap to start */
+                cf->excursion = 0;
+
+              /* Print the command at the current position */
+              prev = cf->command_history[cf->excursion];
+              vec_validate (cf->current_command, vec_len (prev) - 1);
+
+              clib_memcpy (cf->current_command, prev, vec_len (prev));
+              _vec_len (cf->current_command) = vec_len (prev);
+              unix_vlib_cli_output_cooked (cf, uf, cf->current_command,
+                                           vec_len (cf->current_command));
+            }
+          cf->cursor = vec_len (cf->current_command);
+
+          break;
+        }
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_HOME:
+      if (vec_len (cf->current_command) && cf->cursor > 0)
+        {
+          while (cf->cursor)
+            {
+              unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+              cf->cursor--;
+            }
+        }
+
+      cf->search_mode = 0;
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_END:
+      if (vec_len (cf->current_command) &&
+          cf->cursor < vec_len (cf->current_command))
+        {
+          /* Re-emit the tail of the line to walk the terminal cursor to the end */
+          unix_vlib_cli_output_cooked (cf, uf,
+                                       cf->current_command + cf->cursor,
+                                       vec_len (cf->current_command) -
+                                       cf->cursor);
+          cf->cursor = vec_len (cf->current_command);
+        }
+
+      cf->search_mode = 0;
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_WORDLEFT:
+      if (vec_len (cf->current_command) && cf->cursor > 0)
+        {
+          j = cf->cursor;
+
+          unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+          j--;
+
+          /* Skip any whitespace to the left, then run to the start of the word */
+          while (j && isspace (cf->current_command[j]))
+            {
+              unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+              j--;
+            }
+          while (j && !isspace (cf->current_command[j]))
+            {
+              if (isspace (cf->current_command[j - 1]))
+                break;
+              unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+              j--;
+            }
+
+          cf->cursor = j;
+        }
+
+      cf->search_mode = 0;
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_WORDRIGHT:
+      if (vec_len (cf->current_command) &&
+          cf->cursor < vec_len (cf->current_command))
+        {
+          int e = vec_len (cf->current_command);
+          j = cf->cursor;
+          /* Skip the rest of this word, then the whitespace after it */
+          while (j < e && !isspace (cf->current_command[j]))
+            j++;
+          while (j < e && isspace (cf->current_command[j]))
+            j++;
+          unix_vlib_cli_output_cooked (cf, uf,
+                                       cf->current_command + cf->cursor,
+                                       j - cf->cursor);
+          cf->cursor = j;
+        }
+
+      cf->search_mode = 0;
+      break;
+
+
+    case UNIX_CLI_PARSE_ACTION_ERASE:
+      if (vec_len (cf->current_command))
+        {
+          if (cf->cursor == vec_len (cf->current_command))
+            {
+              /* Cursor at end of line: simply drop the last byte */
+              unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b \b", 3);
+              _vec_len (cf->current_command)--;
+              cf->cursor--;
+            }
+          else if (cf->cursor > 0)
+            {
+              /* shift everything at & to the right of the cursor left by 1 */
+              j = vec_len (cf->current_command) - cf->cursor;
+              memmove (cf->current_command + cf->cursor - 1,
+                       cf->current_command + cf->cursor, j);
+              _vec_len (cf->current_command)--;
+              cf->cursor--;
+              /* redraw the rest of the line */
+              unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+              unix_vlib_cli_output_cooked (cf, uf,
+                                           cf->current_command + cf->cursor,
+                                           j);
+              unix_vlib_cli_output_cooked (cf, uf, (u8 *) " \b\b", 3);
+              /* and shift the terminal cursor back where it should be */
+              while (--j)
+                unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+            }
+        }
+      cf->search_mode = 0;
+      cf->excursion = 0;
+      vec_reset_length (cf->search_key);
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_ERASERIGHT:
+      if (vec_len (cf->current_command))
+        {
+          if (cf->cursor < vec_len (cf->current_command))
+            {
+              /* shift everything to the right of the cursor left by 1 */
+              j = vec_len (cf->current_command) - cf->cursor - 1;
+              memmove (cf->current_command + cf->cursor,
+                       cf->current_command + cf->cursor + 1, j);
+              _vec_len (cf->current_command)--;
+              /* redraw the rest of the line */
+              unix_vlib_cli_output_cooked (cf, uf,
+                                           cf->current_command + cf->cursor,
+                                           j);
+              unix_vlib_cli_output_cooked (cf, uf, (u8 *) " \b", 2);
+              /* and shift the terminal cursor back where it should be */
+              if (j)
+                {
+                  unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+                  while (--j)
+                    unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+                }
+            }
+        }
+      else if (input == 'D' - '@')
+        {
+          /* ^D with no command entered = quit */
+          unix_vlib_cli_output_cooked (cf, uf, (u8 *) "quit\n", 5);
+          vlib_process_signal_event (um->vlib_main,
+                                     vlib_current_process (um->vlib_main),
+                                     UNIX_CLI_PROCESS_EVENT_QUIT,
+                                     cf - cm->cli_file_pool);
+        }
+      cf->search_mode = 0;
+      cf->excursion = 0;
+      vec_reset_length (cf->search_key);
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_CLEAR:
+      /* If we're in ANSI mode, clear the screen.
+       * Then redraw the prompt and any existing command input, then put
+       * the cursor back where it was in that line.
+       */
+      if (cf->ansi_capable)
+        unix_vlib_cli_output_cooked (cf, uf,
+                                     (u8 *) ANSI_CLEAR,
+                                     sizeof (ANSI_CLEAR) - 1);
+      else
+        unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+
+      unix_vlib_cli_output_raw (cf, uf,
+                                cm->cli_prompt, vec_len (cm->cli_prompt));
+      unix_vlib_cli_output_raw (cf, uf,
+                                cf->current_command,
+                                vec_len (cf->current_command));
+      for (j = cf->cursor; j < vec_len (cf->current_command); j++)
+        unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1);
+
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_TAB:
+    case UNIX_CLI_PARSE_ACTION_YANK:
+      /* TODO: not implemented; the input is silently ignored */
+      break;
+
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_QUIT:
+    pager_quit:                 /* also reached by goto from the PAGER_NEXT and PAGER_CRLF cases */
+      unix_cli_pager_prompt_erase (cf, uf);
+      unix_cli_pager_reset (cf);
+      unix_cli_cli_prompt (cf, uf);
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_NEXT:
+    case UNIX_CLI_PARSE_ACTION_PAGER_PGDN:
+      /* show next page of the buffer */
+      if (cf->height + cf->pager_start < vec_len (cf->pager_index))
+        {
+          u8 *line = NULL;
+          unix_cli_pager_index_t *pi = NULL;
+
+          /* m: index of the first pager line not yet on screen */
+          int m = cf->pager_start + (cf->height - 1);
+          unix_cli_pager_prompt_erase (cf, uf);
+          for (j = m;
+               j < vec_len (cf->pager_index) && cf->pager_start < m;
+               j++, cf->pager_start++)
+            {
+              pi = &cf->pager_index[j];
+              line = cf->pager_vector[pi->line] + pi->offset;
+              unix_vlib_cli_output_cooked (cf, uf, line, pi->length);
+            }
+          /* if the last line didn't end in newline, add a newline */
+          if (pi && line[pi->length - 1] != '\n')
+            unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+          unix_cli_pager_prompt (cf, uf);
+        }
+      else
+        {
+          if (action == UNIX_CLI_PARSE_ACTION_PAGER_NEXT)
+            /* no more in buffer, exit, but only if it was <space> */
+            goto pager_quit;
+        }
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_DN:
+    case UNIX_CLI_PARSE_ACTION_PAGER_CRLF:
+      /* display the next line of the buffer */
+      if (cf->pager_start < vec_len (cf->pager_index) - (cf->height - 1))
+        {
+          u8 *line;
+          unix_cli_pager_index_t *pi;
+
+          unix_cli_pager_prompt_erase (cf, uf);
+          pi = &cf->pager_index[cf->pager_start + (cf->height - 1)];
+          line = cf->pager_vector[pi->line] + pi->offset;
+          unix_vlib_cli_output_cooked (cf, uf, line, pi->length);
+          cf->pager_start++;
+          /* if the last line didn't end in newline, add a newline */
+          if (line[pi->length - 1] != '\n')
+            unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+          unix_cli_pager_prompt (cf, uf);
+        }
+      else
+        {
+          if (action == UNIX_CLI_PARSE_ACTION_PAGER_CRLF)
+            /* no more in buffer, exit, but only if it was <enter> */
+            goto pager_quit;
+        }
+
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_UP:
+      /* scroll the page back one line */
+      if (cf->pager_start > 0)
+        {
+          u8 *line = NULL;
+          unix_cli_pager_index_t *pi = NULL;
+
+          cf->pager_start--;
+          if (cf->ansi_capable)
+            {
+              /* ANSI terminal: scroll down and draw only the new top line */
+              pi = &cf->pager_index[cf->pager_start];
+              line = cf->pager_vector[pi->line] + pi->offset;
+              unix_cli_pager_prompt_erase (cf, uf);
+              unix_vlib_cli_output_cooked (cf, uf, (u8 *) ANSI_SCROLLDN,
+                                           sizeof (ANSI_SCROLLDN) - 1);
+              unix_vlib_cli_output_cooked (cf, uf, (u8 *) ANSI_SAVECURSOR,
+                                           sizeof (ANSI_SAVECURSOR) - 1);
+              unix_cli_ansi_cursor (cf, uf, 1, 1);
+              unix_vlib_cli_output_cooked (cf, uf, (u8 *) ANSI_CLEARLINE,
+                                           sizeof (ANSI_CLEARLINE) - 1);
+              unix_vlib_cli_output_cooked (cf, uf, line, pi->length);
+              unix_vlib_cli_output_cooked (cf, uf, (u8 *) ANSI_RESTCURSOR,
+                                           sizeof (ANSI_RESTCURSOR) - 1);
+              unix_cli_pager_prompt_erase (cf, uf);
+              unix_cli_pager_prompt (cf, uf);
+            }
+          else
+            {
+              /* Non-ANSI terminal: re-emit the whole page */
+              int m = cf->pager_start + (cf->height - 1);
+              unix_cli_pager_prompt_erase (cf, uf);
+              for (j = cf->pager_start;
+                   j < vec_len (cf->pager_index) && j < m; j++)
+                {
+                  pi = &cf->pager_index[j];
+                  line = cf->pager_vector[pi->line] + pi->offset;
+                  unix_vlib_cli_output_cooked (cf, uf, line, pi->length);
+                }
+              /* if the last line didn't end in newline, add a newline */
+              if (pi && line[pi->length - 1] != '\n')
+                unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+              unix_cli_pager_prompt (cf, uf);
+            }
+        }
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_TOP:
+      /* back to the first page of the buffer */
+      if (cf->pager_start > 0)
+        {
+          u8 *line = NULL;
+          unix_cli_pager_index_t *pi = NULL;
+
+          cf->pager_start = 0;
+          int m = cf->pager_start + (cf->height - 1);
+          unix_cli_pager_prompt_erase (cf, uf);
+          for (j = cf->pager_start; j < vec_len (cf->pager_index) && j < m;
+               j++)
+            {
+              pi = &cf->pager_index[j];
+              line = cf->pager_vector[pi->line] + pi->offset;
+              unix_vlib_cli_output_cooked (cf, uf, line, pi->length);
+            }
+          /* if the last line didn't end in newline, add a newline */
+          if (pi && line[pi->length - 1] != '\n')
+            unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+          unix_cli_pager_prompt (cf, uf);
+        }
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_BOTTOM:
+      /* skip to the last page of the buffer */
+      if (cf->pager_start < vec_len (cf->pager_index) - (cf->height - 1))
+        {
+          u8 *line = NULL;
+          unix_cli_pager_index_t *pi = NULL;
+
+          cf->pager_start = vec_len (cf->pager_index) - (cf->height - 1);
+          unix_cli_pager_prompt_erase (cf, uf);
+          unix_cli_pager_message (cf, uf, "skipping", "\n");
+          for (j = cf->pager_start; j < vec_len (cf->pager_index); j++)
+            {
+              pi = &cf->pager_index[j];
+              line = cf->pager_vector[pi->line] + pi->offset;
+              unix_vlib_cli_output_cooked (cf, uf, line, pi->length);
+            }
+          /* if the last line didn't end in newline, add a newline */
+          if (pi && line[pi->length - 1] != '\n')
+            unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+          unix_cli_pager_prompt (cf, uf);
+        }
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_PGUP:
+      /* wander back one page in the buffer */
+      if (cf->pager_start > 0)
+        {
+          u8 *line = NULL;
+          unix_cli_pager_index_t *pi = NULL;
+          int m;
+
+          /* Step back by one screenful less a line, clamping at the top */
+          if (cf->pager_start >= cf->height)
+            cf->pager_start -= cf->height - 1;
+          else
+            cf->pager_start = 0;
+          m = cf->pager_start + cf->height - 1;
+          unix_cli_pager_prompt_erase (cf, uf);
+          for (j = cf->pager_start; j < vec_len (cf->pager_index) && j < m;
+               j++)
+            {
+              pi = &cf->pager_index[j];
+              line = cf->pager_vector[pi->line] + pi->offset;
+              unix_vlib_cli_output_cooked (cf, uf, line, pi->length);
+            }
+          /* if the last line didn't end in newline, add a newline */
+          if (pi && line[pi->length - 1] != '\n')
+            unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+          unix_cli_pager_prompt (cf, uf);
+        }
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_REDRAW:
+      /* Redraw the current pager screen */
+      unix_cli_pager_redraw (cf, uf);
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_PAGER_SEARCH:
+      /* search forwards in the buffer (not implemented: no-op) */
+      break;
+
+
+    case UNIX_CLI_PARSE_ACTION_CRLF:
+    crlf:                       /* also reached by goto after a failed history search */
+      unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1);
+
+      if (cf->has_history && cf->history_limit)
+        {
+          /* History is full: discard the oldest entry to make room */
+          if (cf->command_history
+              && vec_len (cf->command_history) >= cf->history_limit)
+            {
+              vec_free (cf->command_history[0]);
+              vec_delete (cf->command_history, 1, 0);
+            }
+          /* Don't add blank lines to the cmd history */
+          if (vec_len (cf->current_command))
+            {
+              /* Don't duplicate the previous command */
+              j = vec_len (cf->command_history);
+              if (j == 0 ||
+                  (vec_len (cf->current_command) !=
+                   vec_len (cf->command_history[j - 1])
+                   || memcmp (cf->current_command, cf->command_history[j - 1],
+                              vec_len (cf->current_command)) != 0))
+                {
+                  /* copy the command to the history */
+                  u8 *c = 0;
+                  vec_append (c, cf->current_command);
+                  vec_add1 (cf->command_history, c);
+                  cf->command_number++;
+                }
+            }
+          cf->excursion = vec_len (cf->command_history);
+        }
+
+      cf->search_mode = 0;
+      vec_reset_length (cf->search_key);
+      cf->cursor = 0;
+
+      return 0;                 /* line complete: the only path that returns 0 */
+
+    case UNIX_CLI_PARSE_ACTION_PARTIALMATCH:
+    case UNIX_CLI_PARSE_ACTION_NOMATCH:
+      if (vec_len (cf->pager_index))
+        {
+          /* no-op for now */
+        }
+      else if (cf->has_history && cf->search_mode && isprint (input))
+        {
+          int k, limit, offset;
+          u8 *item;
+
+          vec_add1 (cf->search_key, input);
+
+        search_again:           /* entered by goto from the REVSEARCH/FWDSEARCH case */
+          /* Visit each history entry at most once, starting at excursion
+           * and stepping in the search direction with wrap-around. */
+          for (j = 0; j < vec_len (cf->command_history); j++)
+            {
+              if (cf->excursion > (i32) vec_len (cf->command_history) - 1)
+                cf->excursion = 0;
+              else if (cf->excursion < 0)
+                cf->excursion = vec_len (cf->command_history) - 1;
+
+              item = cf->command_history[cf->excursion];
+
+              limit = (vec_len (cf->search_key) > vec_len (item)) ?
+                vec_len (item) : vec_len (cf->search_key);
+
+              /* Look for the first 'limit' bytes of the key anywhere in item */
+              for (offset = 0; offset <= vec_len (item) - limit; offset++)
+                {
+                  for (k = 0; k < limit; k++)
+                    {
+                      if (item[k + offset] != cf->search_key[k])
+                        goto next_offset;
+                    }
+                  goto found_at_offset;
+
+                next_offset:
+                  ;
+                }
+              goto next;
+
+            found_at_offset:
+              /* Replace the displayed command with the matching entry */
+              for (j = 0; j < vec_len (cf->current_command); j++)
+                unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b \b", 3);
+
+              vec_validate (cf->current_command, vec_len (item) - 1);
+              clib_memcpy (cf->current_command, item, vec_len (item));
+              _vec_len (cf->current_command) = vec_len (item);
+
+              unix_vlib_cli_output_cooked (cf, uf, cf->current_command,
+                                           vec_len (cf->current_command));
+              cf->cursor = vec_len (cf->current_command);
+              goto found;
+
+            next:
+              cf->excursion += cf->search_mode;
+            }
+
+          /* Nothing matched: report it, clear the search state and finish
+           * the (now empty) line through the CRLF path. */
+          unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\nNo match...", 12);
+          vec_reset_length (cf->search_key);
+          vec_reset_length (cf->current_command);
+          cf->search_mode = 0;
+          cf->cursor = 0;
+          goto crlf;
+        }
+      else if (isprint (input)) /* skip any errant control codes */
+        {
+          if (cf->cursor == vec_len (cf->current_command))
+            {
+              /* Append to end */
+              vec_add1 (cf->current_command, input);
+              cf->cursor++;
+
+              /* Echo the character back to the client */
+              unix_vlib_cli_output_raw (cf, uf, &input, 1);
+            }
+          else
+            {
+              /* Insert at cursor: resize +1 byte, move everything over */
+              j = vec_len (cf->current_command) - cf->cursor;
+              vec_add1 (cf->current_command, (u8) 'A'); /* placeholder byte, overwritten by the memmove below */
+              memmove (cf->current_command + cf->cursor + 1,
+                       cf->current_command + cf->cursor, j);
+              cf->current_command[cf->cursor] = input;
+              /* Redraw the line */
+              j++;
+              unix_vlib_cli_output_raw (cf, uf,
+                                        cf->current_command + cf->cursor, j);
+              /* Put terminal cursor back */
+              while (--j)
+                unix_vlib_cli_output_raw (cf, uf, (u8 *) "\b", 1);
+              cf->cursor++;
+            }
+        }
+      else
+        {
+          /* no-op - not printable or otherwise not actionable */
+        }
+
+    found:                      /* target of the successful history search above */
+
+      break;
+
+    case UNIX_CLI_PARSE_ACTION_TELNETIAC:
+      break;
+    }
+  return 1;                     /* no complete line yet */
+}
+
+/** @brief Process input bytes on a stream to provide line editing and
+ * command history in the CLI.
+ *
+ * Walks cf->input_vector, matching each position against the pager or
+ * command-line action table and dispatching the result.
+ *
+ * @return 1 when more input is wanted (all bytes consumed, or a partial
+ *         control/telnet sequence is pending); 0 when a CRLF completed a
+ *         command line, in which case the consumed bytes have been removed
+ *         from cf->input_vector. */
+static int
+unix_cli_line_edit (unix_cli_main_t * cm,
+                    unix_main_t * um, unix_cli_file_t * cf)
+{
+  unix_file_t *uf = pool_elt_at_index (um->file_pool, cf->unix_file_index);
+  int i;
+
+  for (i = 0; i < vec_len (cf->input_vector); i++)
+    {
+      unix_cli_parse_action_t action;
+      i32 matched = 0;
+      unix_cli_parse_actions_t *a;
+
+      /* If we're in the pager mode, search the pager actions */
+      a =
+        vec_len (cf->pager_index) ? unix_cli_parse_pager :
+        unix_cli_parse_strings;
+
+      /* See if the input buffer is some sort of control code */
+      action = unix_cli_match_action (a, &cf->input_vector[i],
+                                      vec_len (cf->input_vector) - i,
+                                      &matched);
+
+      switch (action)
+        {
+        case UNIX_CLI_PARSE_ACTION_PARTIALMATCH:
+          if (i)
+            {
+              /* There was a partial match which means we need more bytes
+               * than the input buffer currently has.
+               * Since the bytes before here have been processed, shift
+               * the remaining contents to the start of the input buffer.
+               */
+              vec_delete (cf->input_vector, i, 0);
+            }
+          return 1;             /* wait for more */
+
+        case UNIX_CLI_PARSE_ACTION_TELNETIAC:
+          /* process telnet options */
+          matched = unix_cli_process_telnet (um, cf, uf,
+                                             cf->input_vector + i,
+                                             vec_len (cf->input_vector) - i);
+          if (matched < 0)
+            {
+              if (i)
+                {
+                  /* There was a partial match which means we need more bytes
+                   * than the input buffer currently has.
+                   * Since the bytes before here have been processed, shift
+                   * the remaining contents to the start of the input buffer.
+                   */
+                  vec_delete (cf->input_vector, i, 0);
+                }
+              return 1;         /* wait for more */
+            }
+          break;
+
+        default:
+          /* process the action */
+          if (!unix_cli_line_process_one (cm, um, cf, uf,
+                                          cf->input_vector[i], action))
+            {
+              /* CRLF found. Consume the bytes from the input_vector */
+              vec_delete (cf->input_vector, i + matched, 0);
+              /* And tell our caller to execute cf->current_command */
+              return 0;
+            }
+        }
+
+      /* Skip the bytes the matcher/telnet handler reported; the loop's own
+       * i++ then advances one further. */
+      i += matched;
+    }
+
+  vec_reset_length (cf->input_vector);
+  return 1;
+}
+
+/** @brief Process input to a CLI session.
+ *
+ * Obtains a command line (directly in line mode, via the line editor
+ * otherwise), optionally logs it, hands it to vlib_cli_input() and then
+ * issues either the CLI prompt or the pager prompt. Repeats while
+ * cf->input_vector still holds data. */
+static void
+unix_cli_process_input (unix_cli_main_t * cm, uword cli_file_index)
+{
+  unix_main_t *um = &unix_main;
+  unix_file_t *uf;
+  unix_cli_file_t *cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index);
+  unformat_input_t input;
+  int vlib_parse_eval (u8 *);
+
+more:
+  /* Try vlibplex first.  Someday... (disabled by the constant 0) */
+  if (0 && vlib_parse_eval (cf->input_vector) == 0)
+    goto done;
+
+  if (cf->line_mode)
+    {
+      /* just treat whatever we got as a complete line of input */
+      cf->current_command = cf->input_vector;
+    }
+  else
+    {
+      /* Line edit, echo, etc. */
+      if (unix_cli_line_edit (cm, um, cf))
+        /* want more input */
+        return;
+    }
+
+  if (um->log_fd)
+    {
+      static u8 *lv;            /* log line buffer, reused across calls */
+      vec_reset_length (lv);
+      lv = format (lv, "%U[%d]: %v",
+                   format_timeval, 0 /* current bat-time */ ,
+                   0 /* current bat-format */ ,
+                   cli_file_index, cf->input_vector);
+      {
+        /* Best-effort log write; the result is deliberately ignored */
+        int rv __attribute__ ((unused)) =
+          write (um->log_fd, lv, vec_len (lv));
+      }
+    }
+
+  /* Copy our input command to a new string */
+  unformat_init_vector (&input, cf->current_command);
+
+  /* Remove leading white space from input. */
+  (void) unformat (&input, "");
+
+  cm->current_input_file_index = cli_file_index;
+  cf->pager_start = 0;          /* start a new pager session */
+
+  if (unformat_check_input (&input) != UNFORMAT_END_OF_INPUT)
+    vlib_cli_input (um->vlib_main, &input, unix_vlib_cli_output,
+                    cli_file_index);
+
+  /* Zero buffer since otherwise unformat_free will call vec_free on it. */
+  input.buffer = 0;
+
+  unformat_free (&input);
+
+  /* Re-fetch pointer since pool may have moved. */
+  cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index);
+  uf = pool_elt_at_index (um->file_pool, cf->unix_file_index);
+
+done:
+  /* reset vector; we'll re-use it later */
+  if (cf->line_mode)
+    vec_reset_length (cf->input_vector);
+  else
+    vec_reset_length (cf->current_command);
+
+  if (cf->no_pager == 2)
+    {
+      /* Pager was programmatically disabled */
+      unix_cli_pager_message (cf, uf, "pager buffer overflowed", "\n");
+      cf->no_pager = um->cli_no_pager;
+    }
+
+  if (vec_len (cf->pager_index) == 0
+      || vec_len (cf->pager_index) < cf->height)
+    {
+      /* There was no need for the pager */
+      unix_cli_pager_reset (cf);
+
+      /* Prompt. */
+      unix_cli_cli_prompt (cf, uf);
+    }
+  else
+    {
+      /* Display the pager prompt */
+      unix_cli_pager_prompt (cf, uf);
+    }
+
+  /* Any residual data in the input vector? */
+  if (vec_len (cf->input_vector))
+    goto more;
+}
+
+/** Destroy a CLI session.
+ * @note If we destroy the @c stdin session this additionally signals
+ * the shutdown of VPP.
+ */ +static void +unix_cli_kill (unix_cli_main_t * cm, uword cli_file_index) +{ + unix_main_t *um = &unix_main; + unix_cli_file_t *cf; + unix_file_t *uf; + int i; + + cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); + uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + + /* Quit/EOF on stdin means quit program. */ + if (uf->file_descriptor == UNIX_CLI_STDIN_FD) + clib_longjmp (&um->vlib_main->main_loop_exit, VLIB_MAIN_LOOP_EXIT_CLI); + + vec_free (cf->current_command); + vec_free (cf->search_key); + + for (i = 0; i < vec_len (cf->command_history); i++) + vec_free (cf->command_history[i]); + + vec_free (cf->command_history); + + unix_file_del (um, uf); + + unix_cli_file_free (cf); + pool_put (cm->cli_file_pool, cf); +} + +/** Handle system events. */ +static uword +unix_cli_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + unix_cli_main_t *cm = &unix_cli_main; + uword i, *data = 0; + + while (1) + { + unix_cli_process_event_type_t event_type; + vlib_process_wait_for_event (vm); + event_type = vlib_process_get_events (vm, &data); + + switch (event_type) + { + case UNIX_CLI_PROCESS_EVENT_READ_READY: + for (i = 0; i < vec_len (data); i++) + unix_cli_process_input (cm, data[i]); + break; + + case UNIX_CLI_PROCESS_EVENT_QUIT: + /* Kill this process. */ + for (i = 0; i < vec_len (data); i++) + unix_cli_kill (cm, data[i]); + goto done; + } + + if (data) + _vec_len (data) = 0; + } + +done: + vec_free (data); + + vlib_node_set_state (vm, rt->node_index, VLIB_NODE_STATE_DISABLED); + + /* Add node index so we can re-use this process later. */ + vec_add1 (cm->unused_cli_process_node_indices, rt->node_index); + + return 0; +} + +/** Called when a CLI session file descriptor can be written to without + * blocking. 
*/ +static clib_error_t * +unix_cli_write_ready (unix_file_t * uf) +{ + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf; + int n; + + cf = pool_elt_at_index (cm->cli_file_pool, uf->private_data); + + /* Flush output vector. */ + n = write (uf->file_descriptor, + cf->output_vector, vec_len (cf->output_vector)); + + if (n < 0 && errno != EAGAIN) + return clib_error_return_unix (0, "write"); + + else if (n > 0) + unix_cli_del_pending_output (uf, cf, n); + + return /* no error */ 0; +} + +/** Called when a CLI session file descriptor has data to be read. */ +static clib_error_t * +unix_cli_read_ready (unix_file_t * uf) +{ + unix_main_t *um = &unix_main; + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf; + uword l; + int n, n_read, n_try; + + cf = pool_elt_at_index (cm->cli_file_pool, uf->private_data); + + n = n_try = 4096; + while (n == n_try) + { + l = vec_len (cf->input_vector); + vec_resize (cf->input_vector, l + n_try); + + n = read (uf->file_descriptor, cf->input_vector + l, n_try); + + /* Error? */ + if (n < 0 && errno != EAGAIN) + return clib_error_return_unix (0, "read"); + + n_read = n < 0 ? 0 : n; + _vec_len (cf->input_vector) = l + n_read; + } + + if (!(n < 0)) + vlib_process_signal_event (um->vlib_main, + cf->process_node_index, + (n_read == 0 + ? UNIX_CLI_PROCESS_EVENT_QUIT + : UNIX_CLI_PROCESS_EVENT_READ_READY), + /* event data */ uf->private_data); + + return /* no error */ 0; +} + +/** Store a new CLI session. + * @param name The name of the session. + * @param fd The file descriptor for the session I/O. + * @return The session ID. 
+ */ +static u32 +unix_cli_file_add (unix_cli_main_t * cm, char *name, int fd) +{ + unix_main_t *um = &unix_main; + unix_cli_file_t *cf; + unix_file_t template = { 0 }; + vlib_main_t *vm = um->vlib_main; + vlib_node_t *n; + + name = (char *) format (0, "unix-cli-%s", name); + + if (vec_len (cm->unused_cli_process_node_indices) > 0) + { + uword l = vec_len (cm->unused_cli_process_node_indices); + + /* Find node and give it new name. */ + n = vlib_get_node (vm, cm->unused_cli_process_node_indices[l - 1]); + vec_free (n->name); + n->name = (u8 *) name; + + vlib_node_set_state (vm, n->index, VLIB_NODE_STATE_POLLING); + + _vec_len (cm->unused_cli_process_node_indices) = l - 1; + } + else + { + static vlib_node_registration_t r = { + .function = unix_cli_process, + .type = VLIB_NODE_TYPE_PROCESS, + .process_log2_n_stack_bytes = 16, + }; + + r.name = name; + vlib_register_node (vm, &r); + vec_free (name); + + n = vlib_get_node (vm, r.index); + } + + pool_get (cm->cli_file_pool, cf); + memset (cf, 0, sizeof (*cf)); + + template.read_function = unix_cli_read_ready; + template.write_function = unix_cli_write_ready; + template.file_descriptor = fd; + template.private_data = cf - cm->cli_file_pool; + + cf->process_node_index = n->index; + cf->unix_file_index = unix_file_add (um, &template); + cf->output_vector = 0; + cf->input_vector = 0; + + vlib_start_process (vm, n->runtime_index); + + vlib_process_t *p = vlib_get_process_from_node (vm, n); + p->output_function = unix_vlib_cli_output; + p->output_function_arg = cf - cm->cli_file_pool; + + return cf - cm->cli_file_pool; +} + +/** Telnet listening socket has a new connection. 
*/ +static clib_error_t * +unix_cli_listen_read_ready (unix_file_t * uf) +{ + unix_main_t *um = &unix_main; + unix_cli_main_t *cm = &unix_cli_main; + clib_socket_t *s = &um->cli_listen_socket; + clib_socket_t client; + char *client_name; + clib_error_t *error; + unix_cli_file_t *cf; + u32 cf_index; + + error = clib_socket_accept (s, &client); + if (error) + return error; + + client_name = (char *) format (0, "%U%c", format_sockaddr, &client.peer, 0); + + cf_index = unix_cli_file_add (cm, client_name, client.fd); + cf = pool_elt_at_index (cm->cli_file_pool, cf_index); + + /* No longer need CLIB version of socket. */ + clib_socket_free (&client); + + vec_free (client_name); + + /* if we're supposed to run telnet session in character mode (default) */ + if (um->cli_line_mode == 0) + { + /* + * Set telnet client character mode, echo on, suppress "go-ahead". + * Technically these should be negotiated, but this works. + */ + u8 charmode_option[] = { + IAC, WONT, TELOPT_LINEMODE, /* server will do char-by-char */ + IAC, DONT, TELOPT_LINEMODE, /* client should do char-by-char */ + IAC, WILL, TELOPT_SGA, /* server willl supress GA */ + IAC, DO, TELOPT_SGA, /* client should supress Go Ahead */ + IAC, WILL, TELOPT_ECHO, /* server will do echo */ + IAC, DONT, TELOPT_ECHO, /* client should not echo */ + IAC, DO, TELOPT_TTYPE, /* client should tell us its term type */ + IAC, SB, TELOPT_TTYPE, 1, IAC, SE, /* now tell me ttype */ + IAC, DO, TELOPT_NAWS, /* client should tell us its window sz */ + IAC, SB, TELOPT_NAWS, 1, IAC, SE, /* now tell me window size */ + }; + + /* Enable history on this CLI */ + cf->history_limit = um->cli_history_limit; + cf->has_history = cf->history_limit != 0; + + /* Make sure this session is in line mode */ + cf->line_mode = 0; + + /* We need CRLF */ + cf->crlf_mode = 1; + + /* Setup the pager */ + cf->no_pager = um->cli_no_pager; + + uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + + /* Send the telnet options */ + 
unix_vlib_cli_output_raw (cf, uf, charmode_option, + ARRAY_LEN (charmode_option)); + + /* In case the client doesn't negotiate terminal type, use + * a timer to kick off the initial prompt. */ + timer_call (unix_cli_file_welcome_timer, cf_index, 1); + } + + return error; +} + +/** The system terminal has informed us that the window size + * has changed. + */ +static void +unix_cli_resize_interrupt (int signum) +{ + unix_main_t *um = &unix_main; + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf = pool_elt_at_index (cm->cli_file_pool, + cm->stdin_cli_file_index); + unix_file_t *uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + struct winsize ws; + (void) signum; + + /* Terminal resized, fetch the new size */ + if (ioctl (UNIX_CLI_STDIN_FD, TIOCGWINSZ, &ws) < 0) + { + /* "Should never happen..." */ + clib_unix_warning ("TIOCGWINSZ"); + /* We can't trust ws.XXX... */ + return; + } + cf->width = ws.ws_col; + cf->height = ws.ws_row; + + /* Reindex the pager buffer */ + unix_cli_pager_reindex (cf); + + /* Redraw the page */ + unix_cli_pager_redraw (cf, uf); +} + +/** Handle configuration directives in the @em unix section. */ +static clib_error_t * +unix_cli_config (vlib_main_t * vm, unformat_input_t * input) +{ + unix_main_t *um = &unix_main; + unix_cli_main_t *cm = &unix_cli_main; + int flags; + clib_error_t *error = 0; + unix_cli_file_t *cf; + u32 cf_index; + struct termios tio; + struct sigaction sa; + struct winsize ws; + u8 *term; + + /* We depend on unix flags being set. */ + if ((error = vlib_call_config_function (vm, unix_config))) + return error; + + if (um->flags & UNIX_FLAG_INTERACTIVE) + { + /* Set stdin to be non-blocking. 
*/ + if ((flags = fcntl (UNIX_CLI_STDIN_FD, F_GETFL, 0)) < 0) + flags = 0; + (void) fcntl (UNIX_CLI_STDIN_FD, F_SETFL, flags | O_NONBLOCK); + + cf_index = unix_cli_file_add (cm, "stdin", UNIX_CLI_STDIN_FD); + cf = pool_elt_at_index (cm->cli_file_pool, cf_index); + cm->stdin_cli_file_index = cf_index; + + /* If stdin is a tty and we are using chacracter mode, enable + * history on the CLI and set the tty line discipline accordingly. */ + if (isatty (UNIX_CLI_STDIN_FD) && um->cli_line_mode == 0) + { + /* Capture terminal resize events */ + memset (&sa, 0, sizeof (sa)); + sa.sa_handler = unix_cli_resize_interrupt; + if (sigaction (SIGWINCH, &sa, 0) < 0) + clib_panic ("sigaction"); + + /* Retrieve the current terminal size */ + ioctl (UNIX_CLI_STDIN_FD, TIOCGWINSZ, &ws); + cf->width = ws.ws_col; + cf->height = ws.ws_row; + + if (cf->width == 0 || cf->height == 0) + /* We have a tty, but no size. Stick to line mode. */ + goto notty; + + /* Setup the history */ + cf->history_limit = um->cli_history_limit; + cf->has_history = cf->history_limit != 0; + + /* Setup the pager */ + cf->no_pager = um->cli_no_pager; + + /* We're going to be in char by char mode */ + cf->line_mode = 0; + + /* Save the original tty state so we can restore it later */ + tcgetattr (UNIX_CLI_STDIN_FD, &um->tio_stdin); + um->tio_isset = 1; + + /* Tweak the tty settings */ + tio = um->tio_stdin; + /* echo off, canonical mode off, ext'd input processing off */ + tio.c_lflag &= ~(ECHO | ICANON | IEXTEN); + tio.c_cc[VMIN] = 1; /* 1 byte at a time */ + tio.c_cc[VTIME] = 0; /* no timer */ + tcsetattr (UNIX_CLI_STDIN_FD, TCSAFLUSH, &tio); + + /* See if we can do ANSI/VT100 output */ + term = (u8 *) getenv ("TERM"); + if (term != NULL) + cf->ansi_capable = unix_cli_terminal_type (term, + strlen ((char *) + term)); + } + else + { + notty: + /* No tty, so make sure these things are off */ + cf->no_pager = 1; + cf->history_limit = 0; + cf->has_history = 0; + cf->line_mode = 1; + } + + /* Send banner and initial 
prompt */ + unix_cli_file_welcome (cm, cf); + } + + /* If we have socket config, LISTEN, otherwise, don't */ + clib_socket_t *s = &um->cli_listen_socket; + if (s->config && s->config[0] != 0) + { + /* CLI listen. */ + unix_file_t template = { 0 }; + + s->flags = SOCKET_IS_SERVER; /* listen, don't connect */ + error = clib_socket_init (s); + + if (error) + return error; + + template.read_function = unix_cli_listen_read_ready; + template.file_descriptor = s->fd; + + unix_file_add (um, &template); + } + + /* Set CLI prompt. */ + if (!cm->cli_prompt) + cm->cli_prompt = format (0, "VLIB: "); + + return 0; +} + +/*? + * This module has no configurable parameters. +?*/ +VLIB_CONFIG_FUNCTION (unix_cli_config, "unix-cli"); + +/** Called when VPP is shutting down, this restores the system + * terminal state if previously saved. + */ +static clib_error_t * +unix_cli_exit (vlib_main_t * vm) +{ + unix_main_t *um = &unix_main; + + /* If stdin is a tty and we saved the tty state, reset the tty state */ + if (isatty (UNIX_CLI_STDIN_FD) && um->tio_isset) + tcsetattr (UNIX_CLI_STDIN_FD, TCSAFLUSH, &um->tio_stdin); + + return 0; +} + +VLIB_MAIN_LOOP_EXIT_FUNCTION (unix_cli_exit); + +/** Set the CLI prompt. + * @param prompt The C string to set the prompt to. + * @note This setting is global; it impacts all current + * and future CLI sessions. + */ +void +vlib_unix_cli_set_prompt (char *prompt) +{ + char *fmt = (prompt[strlen (prompt) - 1] == ' ') ? "%s" : "%s "; + unix_cli_main_t *cm = &unix_cli_main; + if (cm->cli_prompt) + vec_free (cm->cli_prompt); + cm->cli_prompt = format (0, fmt, prompt); +} + +/** CLI command to quit the terminal session. + * @note If this is a stdin session then this will + * shutdown VPP also. 
+ */ +static clib_error_t * +unix_cli_quit (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + unix_cli_main_t *cm = &unix_cli_main; + + vlib_process_signal_event (vm, + vlib_current_process (vm), + UNIX_CLI_PROCESS_EVENT_QUIT, + cm->current_input_file_index); + return 0; +} + +/*? + * Terminates the current CLI session. + * + * If VPP is running in @em interactive mode and this is the console session + * (that is, the session on @c stdin) then this will also terminate VPP. +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (unix_cli_quit_command, static) = { + .path = "quit", + .short_help = "Exit CLI", + .function = unix_cli_quit, +}; +/* *INDENT-ON* */ + +/** CLI command to execute a VPP command script. */ +static clib_error_t * +unix_cli_exec (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + char *file_name; + int fd; + unformat_input_t sub_input; + clib_error_t *error; + + file_name = 0; + fd = -1; + error = 0; + + if (!unformat (input, "%s", &file_name)) + { + error = clib_error_return (0, "expecting file name, got `%U'", + format_unformat_error, input); + goto done; + } + + fd = open (file_name, O_RDONLY); + if (fd < 0) + { + error = clib_error_return_unix (0, "failed to open `%s'", file_name); + goto done; + } + + /* Make sure its a regular file. */ + { + struct stat s; + + if (fstat (fd, &s) < 0) + { + error = clib_error_return_unix (0, "failed to stat `%s'", file_name); + goto done; + } + + if (!(S_ISREG (s.st_mode) || S_ISLNK (s.st_mode))) + { + error = clib_error_return (0, "not a regular file `%s'", file_name); + goto done; + } + } + + unformat_init_unix_file (&sub_input, fd); + + vlib_cli_input (vm, &sub_input, 0, 0); + unformat_free (&sub_input); + +done: + if (fd > 0) + close (fd); + vec_free (file_name); + + return error; +} + +/*? + * Executes a sequence of CLI commands which are read from a file. 
 *
 * If a command is unrecognised or otherwise invalid then the usual CLI
 * feedback will be generated, however execution of subsequent commands
 * from the file will continue.
?*/
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (cli_exec, static) = {
  .path = "exec",
  .short_help = "Execute commands from file",
  .function = unix_cli_exec,
  .is_mp_safe = 1,
};
/* *INDENT-ON* */

/** CLI command to show various unix error statistics.
 *
 * Accepts an optional integer: the maximum number of history entries to
 * print. Entries are collected newest-first from the error history ring
 * and then printed in reverse collection order, so the oldest collected
 * entry is printed first.
 */
static clib_error_t *
unix_show_errors (vlib_main_t * vm,
                  unformat_input_t * input, vlib_cli_command_t * cmd)
{
  unix_main_t *um = &unix_main;
  clib_error_t *error = 0;
  int i, n_errors_to_show;
  unix_error_history_t *unix_errors = 0;

  /* Default: effectively "all"; clamped to the ring size below. */
  n_errors_to_show = 1 << 30;

  if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
    {
      if (!unformat (input, "%d", &n_errors_to_show))
        {
          error =
            clib_error_return (0,
                               "expecting integer number of errors to show, got `%U'",
                               format_unformat_error, input);
          goto done;
        }
    }

  n_errors_to_show =
    clib_min (ARRAY_LEN (um->error_history), n_errors_to_show);

  /* Start at the slot just before error_history_index, wrapping to the
   * last slot when the index is 0. */
  i =
    um->error_history_index >
    0 ? um->error_history_index - 1 : ARRAY_LEN (um->error_history) - 1;

  while (n_errors_to_show > 0)
    {
      unix_error_history_t *eh = um->error_history + i;

      /* A slot without an error ends the walk. */
      if (!eh->error)
        break;

      vec_add1 (unix_errors, eh[0]);
      n_errors_to_show -= 1;
      if (i == 0)
        i = ARRAY_LEN (um->error_history) - 1;
      else
        i--;
    }

  if (vec_len (unix_errors) == 0)
    vlib_cli_output (vm, "no Unix errors so far");
  else
    {
      vlib_cli_output (vm, "%Ld total errors seen", um->n_total_errors);
      for (i = vec_len (unix_errors) - 1; i >= 0; i--)
        {
          unix_error_history_t *eh = vec_elt_at_index (unix_errors, i);
          vlib_cli_output (vm, "%U: %U",
                           format_time_interval, "h:m:s:u", eh->time,
                           format_clib_error, eh->error);
        }
      vlib_cli_output (vm, "%U: time now",
                       format_time_interval, "h:m:s:u", vlib_time_now (vm));
    }

done:
  vec_free (unix_errors);
  return error;
}

/* *INDENT-OFF* */
VLIB_CLI_COMMAND (cli_unix_show_errors, static) = {
  .path = "show unix-errors",
  .short_help = "Show Unix system call error history",
  .function = unix_show_errors,
};
/* *INDENT-ON* */

/** CLI command to show session command history.
 *
 * Prints each stored command of the current input file prefixed with a
 * number derived from the session's command counter, or a notice when
 * history is off or its limit is zero.
 */
static clib_error_t *
unix_cli_show_history (vlib_main_t * vm,
                       unformat_input_t * input, vlib_cli_command_t * cmd)
{
  unix_cli_main_t *cm = &unix_cli_main;
  unix_cli_file_t *cf;
  int i, j;

  cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index);

  if (cf->has_history && cf->history_limit)
    {
      /* Number assigned to the first stored entry. */
      i = 1 + cf->command_number - vec_len (cf->command_history);
      for (j = 0; j < vec_len (cf->command_history); j++)
        vlib_cli_output (vm, "%d %v\n", i + j, cf->command_history[j]);
    }
  else
    {
      vlib_cli_output (vm, "History not enabled.\n");
    }

  return 0;
}

/*?
 * Displays the command history for the current session, if any.
?*/
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (cli_unix_cli_show_history, static) = {
  .path = "history",
  .short_help = "Show current session command history",
  .function = unix_cli_show_history,
};
/* *INDENT-ON* */

/** CLI command to show terminal status.
 *
 * Prints the per-session settings held in the current unix_cli_file_t
 * (mode, size, ANSI, history, pager, CRLF) together with the pager buffer
 * limit stored in unix_main.
 */
static clib_error_t *
unix_cli_show_terminal (vlib_main_t * vm,
                        unformat_input_t * input, vlib_cli_command_t * cmd)
{
  unix_main_t *um = &unix_main;
  unix_cli_main_t *cm = &unix_cli_main;
  unix_cli_file_t *cf;
  vlib_node_t *n;

  cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index);
  /* The terminal name shown is the name of the session's process node. */
  n = vlib_get_node (vm, cf->process_node_index);

  vlib_cli_output (vm, "Terminal name: %v\n", n->name);
  vlib_cli_output (vm, "Terminal mode: %s\n", cf->line_mode ?
                   "line-by-line" : "char-by-char");
  vlib_cli_output (vm, "Terminal width: %d\n", cf->width);
  vlib_cli_output (vm, "Terminal height: %d\n", cf->height);
  vlib_cli_output (vm, "ANSI capable: %s\n",
                   cf->ansi_capable ? "yes" : "no");
  /* The suffix is printed only when history is on but its limit is 0. */
  vlib_cli_output (vm, "History enabled: %s%s\n",
                   cf->has_history ? "yes" : "no", !cf->has_history
                   || cf->history_limit ? "" :
                   " (disabled by history limit)");
  if (cf->has_history)
    vlib_cli_output (vm, "History limit: %d\n", cf->history_limit);
  /* Each suffix is printed only when the pager is on but the terminal
   * height, respectively the buffer limit, is 0. */
  vlib_cli_output (vm, "Pager enabled: %s%s%s\n",
                   cf->no_pager ? "no" : "yes",
                   cf->no_pager
                   || cf->height ? "" : " (disabled by terminal height)",
                   cf->no_pager
                   || um->cli_pager_buffer_limit ? "" :
                   " (disabled by buffer limit)");
  if (!cf->no_pager)
    vlib_cli_output (vm, "Pager limit: %d\n", um->cli_pager_buffer_limit);
  vlib_cli_output (vm, "CRLF mode: %s\n",
                   cf->crlf_mode ? "CR+LF" : "LF");

  return 0;
}

/*?
 * Displays various information about the state of the current terminal
 * session.
+ * + * @cliexpar + * @cliexstart{show terminal} + * Terminal name: unix-cli-stdin + * Terminal mode: char-by-char + * Terminal width: 123 + * Terminal height: 48 + * ANSI capable: yes + * History enabled: yes + * History limit: 50 + * Pager enabled: yes + * Pager limit: 100000 + * CRLF mode: LF + * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_unix_cli_show_terminal, static) = { + .path = "show terminal", + .short_help = "Show current session terminal settings", + .function = unix_cli_show_terminal, +}; +/* *INDENT-ON* */ + +/** CLI command to set terminal pager settings. */ +static clib_error_t * +unix_cli_set_terminal_pager (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unix_main_t *um = &unix_main; + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf; + unformat_input_t _line_input, *line_input = &_line_input; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index); + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "on")) + cf->no_pager = 0; + else if (unformat (line_input, "off")) + cf->no_pager = 1; + else if (unformat (line_input, "limit %u", &um->cli_pager_buffer_limit)) + vlib_cli_output (vm, + "Pager limit set to %u lines; note, this is global.\n", + um->cli_pager_buffer_limit); + else + return clib_error_return (0, "unknown parameter: `%U`", + format_unformat_error, line_input); + } + + unformat_free (line_input); + + return 0; +} + +/*? + * Enables or disables the terminal pager for this session. Generally + * this defaults to enabled. + * + * Additionally allows the pager buffer size to be set; though note that + * this value is set globally and not per session. 
+?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_unix_cli_set_terminal_pager, static) = { + .path = "set terminal pager", + .short_help = "set terminal pager [on|off] [limit <lines>]", + .function = unix_cli_set_terminal_pager, +}; +/* *INDENT-ON* */ + +/** CLI command to set terminal history settings. */ +static clib_error_t * +unix_cli_set_terminal_history (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf; + unformat_input_t _line_input, *line_input = &_line_input; + u32 limit; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index); + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "on")) + cf->has_history = 1; + else if (unformat (line_input, "off")) + cf->has_history = 0; + else if (unformat (line_input, "limit %u", &cf->history_limit)) + ; + else + return clib_error_return (0, "unknown parameter: `%U`", + format_unformat_error, line_input); + + /* If we reduced history size, or turned it off, purge the history */ + limit = cf->has_history ? cf->history_limit : 0; + + while (cf->command_history && vec_len (cf->command_history) >= limit) + { + vec_free (cf->command_history[0]); + vec_delete (cf->command_history, 1, 0); + } + } + + unformat_free (line_input); + + return 0; +} + +/*? + * Enables or disables the command history function of the current + * terminal. Generally this defaults to enabled. + * + * This command also allows the maximum size of the history buffer for + * this session to be altered. +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_unix_cli_set_terminal_history, static) = { + .path = "set terminal history", + .short_help = "set terminal history [on|off] [limit <lines>]", + .function = unix_cli_set_terminal_history, +}; +/* *INDENT-ON* */ + +/** CLI command to set terminal ANSI settings. 
*/
static clib_error_t *
unix_cli_set_terminal_ansi (vlib_main_t * vm,
                            unformat_input_t * input,
                            vlib_cli_command_t * cmd)
{
  unix_cli_main_t *cm = &unix_cli_main;
  unix_cli_file_t *cf;

  cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index);

  /* Exactly one keyword is consumed, directly from the command input;
   * anything other than "on" or "off" is reported as an error. */
  if (unformat (input, "on"))
    cf->ansi_capable = 1;
  else if (unformat (input, "off"))
    cf->ansi_capable = 0;
  else
    return clib_error_return (0, "unknown parameter: `%U`",
                              format_unformat_error, input);

  return 0;
}

/*?
 * Enables or disables the use of ANSI control sequences by this terminal.
 * The default will vary based on terminal detection at the start of the
 * session.
 *
 * ANSI control sequences are used in a small number of places to provide,
 * for example, color text output and to control the cursor in the pager.
?*/
/* *INDENT-OFF* */
VLIB_CLI_COMMAND (cli_unix_cli_set_terminal_ansi, static) = {
  .path = "set terminal ansi",
  .short_help = "set terminal ansi [on|off]",
  .function = unix_cli_set_terminal_ansi,
};
/* *INDENT-ON* */

/* Init hook for this module; it performs no work itself. */
static clib_error_t *
unix_cli_init (vlib_main_t * vm)
{
  return 0;
}

VLIB_INIT_FUNCTION (unix_cli_init);

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */
diff --git a/src/vlib/unix/dir.dox b/src/vlib/unix/dir.dox
new file mode 100644
index 00000000000..1380fa56b37
--- /dev/null
+++ b/src/vlib/unix/dir.dox
@@ -0,0 +1,28 @@
/*
 * Copyright (c) 2016 Comcast Cable Communications Management, LLC.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Doxygen directory documentation */ + +/** +@dir +@brief VLIB Unix interface + +VLIB application library Unix interface layer. + +*/ +/*? %%clicmd:group_label Unix Interface %% ?*/ +/*? %%syscfg:group_label Unix Interface %% ?*/ + diff --git a/src/vlib/unix/input.c b/src/vlib/unix/input.c new file mode 100644 index 00000000000..07096ed27dc --- /dev/null +++ b/src/vlib/unix/input.c @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * input.c: Unix file input + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <signal.h> + +/* FIXME autoconf */ +#define HAVE_LINUX_EPOLL + +#ifdef HAVE_LINUX_EPOLL + +#include <sys/epoll.h> + +typedef struct +{ + int epoll_fd; + struct epoll_event *epoll_events; + + /* Statistics. */ + u64 epoll_files_ready; + u64 epoll_waits; +} linux_epoll_main_t; + +static linux_epoll_main_t linux_epoll_main; + +static void +linux_epoll_file_update (unix_file_t * f, unix_file_update_type_t update_type) +{ + unix_main_t *um = &unix_main; + linux_epoll_main_t *em = &linux_epoll_main; + struct epoll_event e; + + memset (&e, 0, sizeof (e)); + + e.events = EPOLLIN; + if (f->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE) + e.events |= EPOLLOUT; + if (f->flags & UNIX_FILE_EVENT_EDGE_TRIGGERED) + e.events |= EPOLLET; + e.data.u32 = f - um->file_pool; + + if (epoll_ctl (em->epoll_fd, + (update_type == UNIX_FILE_UPDATE_ADD + ? EPOLL_CTL_ADD + : (update_type == UNIX_FILE_UPDATE_MODIFY + ? 
EPOLL_CTL_MOD + : EPOLL_CTL_DEL)), f->file_descriptor, &e) < 0) + clib_warning ("epoll_ctl"); +} + +static uword +linux_epoll_input (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + unix_main_t *um = &unix_main; + linux_epoll_main_t *em = &linux_epoll_main; + struct epoll_event *e; + int n_fds_ready; + + { + vlib_node_main_t *nm = &vm->node_main; + u64 t = nm->cpu_time_next_process_ready; + f64 timeout; + int timeout_ms, max_timeout_ms = 10; + f64 vector_rate = vlib_last_vectors_per_main_loop (vm); + + if (t == ~0ULL) + { + timeout = 10e-3; + timeout_ms = max_timeout_ms; + } + else + { + timeout = + (((i64) t - (i64) clib_cpu_time_now ()) + * vm->clib_time.seconds_per_clock) + /* subtract off some slop time */ - 50e-6; + + if (timeout < 1e3) + { + /* We have event happenning in less than 1 ms so + don't allow epoll to wait */ + timeout_ms = 0; + } + else + { + timeout_ms = timeout * 1e3; + + /* Must be between 1 and 10 ms. */ + timeout_ms = clib_max (1, timeout_ms); + timeout_ms = clib_min (max_timeout_ms, timeout_ms); + } + } + + /* If we still have input nodes polling (e.g. vnet packet generator) + don't sleep. */ + if (nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] > 0) + timeout_ms = 0; + + /* + * When busy: don't wait & only epoll for input + * every 1024 times through main loop. + */ + if (vector_rate > 1 || vm->api_queue_nonempty) + { + timeout_ms = 0; + node->input_main_loops_per_call = 1024; + } + else + /* We're not busy; go to sleep for a while. */ + node->input_main_loops_per_call = 0; + + /* Allow any signal to wakeup our sleep. 
*/ + { + static sigset_t unblock_all_signals; + n_fds_ready = epoll_pwait (em->epoll_fd, + em->epoll_events, + vec_len (em->epoll_events), + timeout_ms, &unblock_all_signals); + + /* This kludge is necessary to run over absurdly old kernels */ + if (n_fds_ready < 0 && errno == ENOSYS) + { + n_fds_ready = epoll_wait (em->epoll_fd, + em->epoll_events, + vec_len (em->epoll_events), timeout_ms); + } + } + } + + if (n_fds_ready < 0) + { + if (unix_error_is_fatal (errno)) + vlib_panic_with_error (vm, clib_error_return_unix (0, "epoll_wait")); + + /* non fatal error (e.g. EINTR). */ + return 0; + } + + em->epoll_waits += 1; + em->epoll_files_ready += n_fds_ready; + + for (e = em->epoll_events; e < em->epoll_events + n_fds_ready; e++) + { + u32 i = e->data.u32; + unix_file_t *f = pool_elt_at_index (um->file_pool, i); + clib_error_t *errors[4]; + int n_errors = 0; + + if (PREDICT_TRUE (!(e->events & EPOLLERR))) + { + if (e->events & EPOLLIN) + { + errors[n_errors] = f->read_function (f); + n_errors += errors[n_errors] != 0; + } + if (e->events & EPOLLOUT) + { + errors[n_errors] = f->write_function (f); + n_errors += errors[n_errors] != 0; + } + } + else + { + if (f->error_function) + { + errors[n_errors] = f->error_function (f); + n_errors += errors[n_errors] != 0; + } + else + close (f->file_descriptor); + } + + ASSERT (n_errors < ARRAY_LEN (errors)); + for (i = 0; i < n_errors; i++) + { + unix_save_error (um, errors[i]); + } + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (linux_epoll_input_node,static) = { + .function = linux_epoll_input, + .type = VLIB_NODE_TYPE_PRE_INPUT, + .name = "unix-epoll-input", +}; +/* *INDENT-ON* */ + +clib_error_t * +linux_epoll_input_init (vlib_main_t * vm) +{ + linux_epoll_main_t *em = &linux_epoll_main; + unix_main_t *um = &unix_main; + + /* Allocate some events. 
*/ + vec_resize (em->epoll_events, VLIB_FRAME_SIZE); + + em->epoll_fd = epoll_create (vec_len (em->epoll_events)); + if (em->epoll_fd < 0) + return clib_error_return_unix (0, "epoll_create"); + + um->file_update = linux_epoll_file_update; + + return 0; +} + +VLIB_INIT_FUNCTION (linux_epoll_input_init); + +#endif /* HAVE_LINUX_EPOLL */ + +static clib_error_t * +unix_input_init (vlib_main_t * vm) +{ + return vlib_call_init_function (vm, linux_epoll_input_init); +} + +VLIB_INIT_FUNCTION (unix_input_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c new file mode 100644 index 00000000000..562778e0e5d --- /dev/null +++ b/src/vlib/unix/main.c @@ -0,0 +1,557 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * main.c: Unix main routine + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <vlib/unix/plugin.h> + +#include <signal.h> +#include <sys/ucontext.h> +#include <syslog.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +/** Default CLI pager limit is not configured in startup.conf */ +#define UNIX_CLI_DEFAULT_PAGER_LIMIT 100000 + +/** Default CLI history depth if not configured in startup.conf */ +#define UNIX_CLI_DEFAULT_HISTORY 50 + + +unix_main_t unix_main; + +static clib_error_t * +unix_main_init (vlib_main_t * vm) +{ + unix_main_t *um = &unix_main; + um->vlib_main = vm; + return vlib_call_init_function (vm, unix_input_init); +} + +VLIB_INIT_FUNCTION (unix_main_init); + +static void +unix_signal_handler (int signum, siginfo_t * si, ucontext_t * uc) +{ + uword fatal; + u8 *msg = 0; + + msg = format (msg, "received signal %U, PC %U", + format_signal, signum, format_ucontext_pc, uc); + + if (signum == SIGSEGV) + msg = format (msg, ", faulting address %p", si->si_addr); + + switch (signum) + { + /* these (caught) signals cause the application to exit */ + case SIGTERM: + if (unix_main.vlib_main->main_loop_exit_set) + { + syslog (LOG_ERR | LOG_DAEMON, "received SIGTERM, exiting..."); + + clib_longjmp (&unix_main.vlib_main->main_loop_exit, + VLIB_MAIN_LOOP_EXIT_CLI); + } + /* fall through */ + case SIGQUIT: + case SIGINT: + case SIGILL: + case SIGBUS: + case SIGSEGV: + case SIGHUP: + case SIGFPE: + fatal = 1; + break; + + /* by default, print a message and continue */ + default: + fatal = 0; + break; + } + + /* Null terminate. 
*/ + vec_add1 (msg, 0); + + if (fatal) + { + syslog (LOG_ERR | LOG_DAEMON, "%s", msg); + os_exit (1); + } + else + clib_warning ("%s", msg); + + vec_free (msg); +} + +static clib_error_t * +setup_signal_handlers (unix_main_t * um) +{ + uword i; + struct sigaction sa; + + for (i = 1; i < 32; i++) + { + memset (&sa, 0, sizeof (sa)); + sa.sa_sigaction = (void *) unix_signal_handler; + sa.sa_flags = SA_SIGINFO; + + switch (i) + { + /* these signals take the default action */ + case SIGABRT: + case SIGKILL: + case SIGSTOP: + case SIGUSR1: + case SIGUSR2: + continue; + + /* ignore SIGPIPE, SIGCHLD */ + case SIGPIPE: + case SIGCHLD: + sa.sa_sigaction = (void *) SIG_IGN; + break; + + /* catch and handle all other signals */ + default: + break; + } + + if (sigaction (i, &sa, 0) < 0) + return clib_error_return_unix (0, "sigaction %U", format_signal, i); + } + + return 0; +} + +static void +unix_error_handler (void *arg, u8 * msg, int msg_len) +{ + unix_main_t *um = arg; + + /* Echo to stderr when interactive. */ + if (um->flags & UNIX_FLAG_INTERACTIVE) + { + CLIB_UNUSED (int r) = write (2, msg, msg_len); + } + else + { + char save = msg[msg_len - 1]; + + /* Null Terminate. */ + msg[msg_len - 1] = 0; + + syslog (LOG_ERR | LOG_DAEMON, "%s", msg); + + msg[msg_len - 1] = save; + } +} + +void +vlib_unix_error_report (vlib_main_t * vm, clib_error_t * error) +{ + unix_main_t *um = &unix_main; + + if (um->flags & UNIX_FLAG_INTERACTIVE || error == 0) + return; + + { + char save; + u8 *msg; + u32 msg_len; + + msg = error->what; + msg_len = vec_len (msg); + + /* Null Terminate. 
*/ + save = msg[msg_len - 1]; + msg[msg_len - 1] = 0; + + syslog (LOG_ERR | LOG_DAEMON, "%s", msg); + + msg[msg_len - 1] = save; + } +} + +static uword +startup_config_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + unix_main_t *um = &unix_main; + u8 *buf = 0; + uword l, n = 1; + + vlib_process_suspend (vm, 2.0); + + while (um->unix_config_complete == 0) + vlib_process_suspend (vm, 0.1); + + if (um->startup_config_filename) + { + unformat_input_t sub_input; + int fd; + struct stat s; + char *fn = (char *) um->startup_config_filename; + + fd = open (fn, O_RDONLY); + if (fd < 0) + { + clib_warning ("failed to open `%s'", fn); + return 0; + } + + if (fstat (fd, &s) < 0) + { + clib_warning ("failed to stat `%s'", fn); + bail: + close (fd); + return 0; + } + + if (!(S_ISREG (s.st_mode) || S_ISLNK (s.st_mode))) + { + clib_warning ("not a regular file: `%s'", fn); + goto bail; + } + + while (n > 0) + { + l = vec_len (buf); + vec_resize (buf, 4096); + n = read (fd, buf + l, 4096); + if (n > 0) + { + _vec_len (buf) = l + n; + if (n < 4096) + break; + } + else + break; + } + if (um->log_fd && vec_len (buf)) + { + u8 *lv = 0; + lv = format (lv, "%U: ***** Startup Config *****\n%v", + format_timeval, 0 /* current bat-time */ , + 0 /* current bat-format */ , + buf); + { + int rv __attribute__ ((unused)) = + write (um->log_fd, lv, vec_len (lv)); + } + vec_reset_length (lv); + lv = format (lv, "%U: ***** End Startup Config *****\n", + format_timeval, 0 /* current bat-time */ , + 0 /* current bat-format */ ); + { + int rv __attribute__ ((unused)) = + write (um->log_fd, lv, vec_len (lv)); + } + vec_free (lv); + } + + if (vec_len (buf)) + { + unformat_init_vector (&sub_input, buf); + vlib_cli_input (vm, &sub_input, 0, 0); + /* frees buf for us */ + unformat_free (&sub_input); + } + close (fd); + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (startup_config_node,static) = { + .function = startup_config_process, + .type = 
VLIB_NODE_TYPE_PROCESS, + .name = "startup-config-process", +}; +/* *INDENT-ON* */ + +static clib_error_t * +unix_config (vlib_main_t * vm, unformat_input_t * input) +{ + unix_main_t *um = &unix_main; + clib_error_t *error = 0; + + /* Defaults */ + um->cli_pager_buffer_limit = UNIX_CLI_DEFAULT_PAGER_LIMIT; + um->cli_history_limit = UNIX_CLI_DEFAULT_HISTORY; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + char *cli_prompt; + if (unformat (input, "interactive")) + um->flags |= UNIX_FLAG_INTERACTIVE; + else if (unformat (input, "nodaemon")) + um->flags |= UNIX_FLAG_NODAEMON; + else if (unformat (input, "cli-prompt %s", &cli_prompt)) + vlib_unix_cli_set_prompt (cli_prompt); + else + if (unformat (input, "cli-listen %s", &um->cli_listen_socket.config)) + ; + else if (unformat (input, "cli-line-mode")) + um->cli_line_mode = 1; + else if (unformat (input, "cli-no-banner")) + um->cli_no_banner = 1; + else if (unformat (input, "cli-no-pager")) + um->cli_no_pager = 1; + else if (unformat (input, "cli-pager-buffer-limit %d", + &um->cli_pager_buffer_limit)) + ; + else + if (unformat (input, "cli-history-limit %d", &um->cli_history_limit)) + ; + else if (unformat (input, "full-coredump")) + { + int fd; + + fd = open ("/proc/self/coredump_filter", O_WRONLY); + if (fd >= 0) + { + if (write (fd, "0x6f\n", 5) != 5) + clib_unix_warning ("coredump filter write failed!"); + close (fd); + } + else + clib_unix_warning ("couldn't open /proc/self/coredump_filter"); + } + else if (unformat (input, "startup-config %s", + &um->startup_config_filename)) + ; + else if (unformat (input, "exec %s", &um->startup_config_filename)) + ; + else if (unformat (input, "log %s", &um->log_filename)) + { + um->log_fd = open ((char *) um->log_filename, + O_CREAT | O_WRONLY | O_APPEND, 0644); + if (um->log_fd < 0) + { + clib_warning ("couldn't open log '%s'\n", um->log_filename); + um->log_fd = 0; + } + else + { + u8 *lv = 0; + lv = format (0, "%U: ***** Start: PID %d *****\n", + 
format_timeval, 0 /* current bat-time */ , + 0 /* current bat-format */ , + getpid ()); + { + int rv __attribute__ ((unused)) = + write (um->log_fd, lv, vec_len (lv)); + } + vec_free (lv); + } + } + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (!(um->flags & UNIX_FLAG_INTERACTIVE)) + { + error = setup_signal_handlers (um); + if (error) + return error; + + openlog (vm->name, LOG_CONS | LOG_PERROR | LOG_PID, LOG_DAEMON); + clib_error_register_handler (unix_error_handler, um); + + if (!(um->flags & UNIX_FLAG_NODAEMON) && daemon ( /* chdir to / */ 0, + /* stdin/stdout/stderr -> /dev/null */ + 0) < 0) + clib_error_return (0, "daemon () fails"); + } + um->unix_config_complete = 1; + + return 0; +} + +/* unix { ... } configuration. */ +/*? + * + * @cfgcmd{interactive} + * Attach CLI to stdin/out and provide a debugging command line interface. + * Implies @c nodaemon. + * + * @cfgcmd{nodaemon} + * Do not fork or background the VPP process. Typically used when invoking + * VPP applications from a process monitor. + * + * @cfgcmd{exec, <filename>} + * @par <code>startup-config <filename></code> + * Read startup operational configuration from @c filename. + * The contents of the file will be performed as though entered at the CLI. + * The two keywords are aliases for the same function; if both are specified, + * only the last will have an effect. + * + * @cfgcmd{log, <filename>} + * Logs the startup configuration and all subsequent CLI commands in + * @c filename. + * Very useful in situations where folks don't remember or can't be bothered + * to include CLI commands in bug reports. + * + * @cfgcmd{full-coredump} + * Ask the Linux kernel to dump all memory-mapped address regions, instead + * of just text+data+bss. + * + * @cfgcmd{cli-listen, <address:port>} + * Bind the CLI to listen at the address and port given. @clocalhost + * on TCP port @c 5002, given as <tt>cli-listen localhost:5002</tt>, + * is typical. 
 *
 * @cfgcmd{cli-line-mode}
 * Disable character-by-character I/O on stdin. Useful when combined with,
 * for example, <tt>emacs M-x gud-gdb</tt>.
 *
 * @cfgcmd{cli-prompt, <string>}
 * Configure the CLI prompt to be @c string.
 *
 * @cfgcmd{cli-history-limit, <nn>}
 * Limit command history to @c nn lines. A value of @c 0
 * disables command history. Default value: @c 50
 *
 * @cfgcmd{cli-no-banner}
 * Disable the login banner on stdin and Telnet connections.
 *
 * @cfgcmd{cli-no-pager}
 * Disable the output pager.
 *
 * @cfgcmd{cli-pager-buffer-limit, <nn>}
 * Limit pager buffer to @c nn lines of output.
 * A value of @c 0 disables the pager. Default value: @c 100000
?*/
VLIB_CONFIG_FUNCTION (unix_config, "unix");

/* Main-loop exit hook: closes the syslog connection. */
static clib_error_t *
unix_exit (vlib_main_t * vm)
{
  /* Close syslog connection. */
  closelog ();
  return 0;
}

VLIB_MAIN_LOOP_EXIT_FUNCTION (unix_exit);

u8 **vlib_thread_stacks;

/* Entry point run on the first allocated thread stack (see the
 * clib_calljmp call in vlib_unix_main): builds the command-line input and
 * runs vlib_main, returning its result. */
static uword
thread0 (uword arg)
{
  vlib_main_t *vm = (vlib_main_t *) arg;
  unformat_input_t input;
  int i;

  unformat_init_command_line (&input, (char **) vm->argv);
  i = vlib_main (vm, &input);
  unformat_free (&input);

  return i;
}

/* Program entry for Unix builds.
 * Records argv and the heap, runs early plugin init and the early config
 * functions, allocates and write-protects the base page of each thread
 * stack, then switches to the first stack and runs thread0.
 * Returns the plugin-init error code, 1 when an early config function
 * fails, otherwise the value returned through clib_calljmp. */
int
vlib_unix_main (int argc, char *argv[])
{
  vlib_main_t *vm = &vlib_global_main;  /* one and only time for this! */
  vlib_thread_main_t *tm = &vlib_thread_main;
  unformat_input_t input;
  u8 *thread_stacks;
  clib_error_t *e;
  int i;

  vm->argv = (u8 **) argv;
  vm->name = argv[0];
  vm->heap_base = clib_mem_get_heap ();
  ASSERT (vm->heap_base);

  i = vlib_plugin_early_init (vm);
  if (i)
    return i;

  unformat_init_command_line (&input, (char **) vm->argv);
  if (vm->init_functions_called == 0)
    vm->init_functions_called = hash_create (0, /* value bytes */ 0);
  e = vlib_call_all_config_functions (vm, &input, 1 /* early */ );
  if (e != 0)
    {
      clib_error_report (e);
      return 1;
    }
  unformat_free (&input);

  /*
   * allocate n x VLIB_THREAD_STACK_SIZE stacks, aligned to a
   * VLIB_THREAD_STACK_SIZE boundary
   * See also: os_get_cpu_number() in vlib/vlib/threads.c
   */
  thread_stacks = clib_mem_alloc_aligned
    ((uword) tm->n_thread_stacks * VLIB_THREAD_STACK_SIZE,
     VLIB_THREAD_STACK_SIZE);

  vec_validate (vlib_thread_stacks, tm->n_thread_stacks - 1);
  for (i = 0; i < vec_len (vlib_thread_stacks); i++)
    {
      vlib_thread_stacks[i] = thread_stacks;

      /*
       * Disallow writes to the bottom page of the stack, to
       * catch stack overflows.
       */
      if (mprotect (thread_stacks, clib_mem_get_page_size (), PROT_READ) < 0)
        clib_unix_warning ("thread stack");

      thread_stacks += VLIB_THREAD_STACK_SIZE;
    }

  /* The stack pointer handed over is the high end of stack 0. */
  i = clib_calljmp (thread0, (uword) vm,
                    (void *) (vlib_thread_stacks[0] +
                              VLIB_THREAD_STACK_SIZE));
  return i;
}

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */
diff --git a/src/vlib/unix/mc_socket.c b/src/vlib/unix/mc_socket.c
new file mode 100644
index 00000000000..9c12ad3b559
--- /dev/null
+++ b/src/vlib/unix/mc_socket.c
@@ -0,0 +1,1049 @@
/*
 * mc_socket.c: socket based multicast for vlib mc
 *
 * Copyright (c) 2010 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vlib/vlib.h> +#include <vlib/unix/mc_socket.h> + +#include <sys/ioctl.h> /* for FIONBIO */ +#include <netinet/tcp.h> /* for TCP_NODELAY */ +#include <net/if.h> /* for struct ifreq */ + +static u8 * +format_socket_peer_id (u8 * s, va_list * args) +{ + u64 peer_id_as_u64 = va_arg (*args, u64); + mc_peer_id_t peer_id; + peer_id.as_u64 = peer_id_as_u64; + u32 a = mc_socket_peer_id_get_address (peer_id); + u32 p = mc_socket_peer_id_get_port (peer_id); + + s = format (s, "%U:%04x", format_network_address, AF_INET, &a, ntohs (p)); + + return s; +} + +typedef void (mc_msg_handler_t) (mc_main_t * mcm, void *msg, + u32 buffer_index); + +always_inline void +msg_handler (mc_main_t * mcm, + u32 buffer_index, u32 handler_frees_buffer, void *_h) +{ + vlib_main_t *vm = mcm->vlib_main; + mc_msg_handler_t *h = _h; + vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index); + void *the_msg = vlib_buffer_get_current (b); + + h (mcm, the_msg, buffer_index); + if (!handler_frees_buffer) + vlib_buffer_free_one (vm, buffer_index); +} + +static uword +append_buffer_index_to_iovec (vlib_main_t * vm, + u32 buffer_index, struct iovec **iovs_return) +{ + struct iovec *i; + vlib_buffer_t *b; + u32 bi = buffer_index; + u32 l = 0; + + while (1) + { + b = vlib_get_buffer (vm, bi); + vec_add2 (*iovs_return, i, 1); + i->iov_base = vlib_buffer_get_current (b); + i->iov_len = b->current_length; + l += i->iov_len; + if (!(b->flags & 
VLIB_BUFFER_NEXT_PRESENT)) + break; + bi = b->next_buffer; + } + + return l; +} + +static clib_error_t * +sendmsg_helper (mc_socket_main_t * msm, + int socket, struct sockaddr_in *tx_addr, u32 buffer_index) +{ + vlib_main_t *vm = msm->mc_main.vlib_main; + struct msghdr h; + word n_bytes, n_bytes_tx, n_retries; + + memset (&h, 0, sizeof (h)); + h.msg_name = tx_addr; + h.msg_namelen = sizeof (tx_addr[0]); + + if (msm->iovecs) + _vec_len (msm->iovecs) = 0; + + n_bytes = append_buffer_index_to_iovec (vm, buffer_index, &msm->iovecs); + ASSERT (n_bytes <= msm->mc_main.transport.max_packet_size); + if (n_bytes > msm->mc_main.transport.max_packet_size) + clib_error ("sending packet larger than interace MTU %d bytes", n_bytes); + + h.msg_iov = msm->iovecs; + h.msg_iovlen = vec_len (msm->iovecs); + + n_retries = 0; + while ((n_bytes_tx = sendmsg (socket, &h, /* flags */ 0)) != n_bytes + && errno == EAGAIN) + n_retries++; + if (n_bytes_tx != n_bytes) + { + clib_unix_warning ("sendmsg"); + return 0; + } + if (n_retries) + { + ELOG_TYPE_DECLARE (e) = + { + .format = "sendmsg-helper: %d retries",.format_args = "i4",}; + struct + { + u32 retries; + } *ed = 0; + + ed = ELOG_DATA (&vm->elog_main, e); + ed->retries = n_retries; + } + return 0; +} + +static clib_error_t * +tx_buffer (void *transport, mc_transport_type_t type, u32 buffer_index) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) transport; + vlib_main_t *vm = msm->mc_main.vlib_main; + mc_multicast_socket_t *ms = &msm->multicast_sockets[type]; + clib_error_t *error; + error = sendmsg_helper (msm, ms->socket, &ms->tx_addr, buffer_index); + if (type != MC_TRANSPORT_USER_REQUEST_TO_RELAY) + vlib_buffer_free_one (vm, buffer_index); + return error; +} + +static clib_error_t * +tx_ack (void *transport, mc_peer_id_t dest_peer_id, u32 buffer_index) +{ + struct sockaddr_in tx_addr; + mc_socket_main_t *msm = (mc_socket_main_t *) transport; + vlib_main_t *vm = msm->mc_main.vlib_main; + clib_error_t *error; + + memset (&tx_addr, 0, 
sizeof (tx_addr)); + tx_addr.sin_family = AF_INET; + tx_addr.sin_addr.s_addr = mc_socket_peer_id_get_address (dest_peer_id); + tx_addr.sin_port = mc_socket_peer_id_get_port (dest_peer_id); + + error = sendmsg_helper (msm, msm->ack_socket, &tx_addr, buffer_index); + vlib_buffer_free_one (vm, buffer_index); + return error; +} + +static clib_error_t * +recvmsg_helper (mc_socket_main_t * msm, + int socket, + struct sockaddr_in *rx_addr, + u32 * buffer_index, u32 drop_message) +{ + vlib_main_t *vm = msm->mc_main.vlib_main; + vlib_buffer_t *b; + uword n_left, n_alloc, n_mtu, i, i_rx; + const uword buffer_size = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES; + word n_bytes_left; + + /* Make sure we have at least a MTU worth of buffers. */ + n_mtu = msm->rx_mtu_n_buffers; + n_left = vec_len (msm->rx_buffers); + if (n_left < n_mtu) + { + uword max_alloc = 8 * n_mtu; + vec_validate (msm->rx_buffers, max_alloc - 1); + n_alloc = + vlib_buffer_alloc (vm, msm->rx_buffers + n_left, max_alloc - n_left); + _vec_len (msm->rx_buffers) = n_left + n_alloc; + } + + ASSERT (vec_len (msm->rx_buffers) >= n_mtu); + vec_validate (msm->iovecs, n_mtu - 1); + + /* Allocate RX buffers from end of rx_buffers. + Turn them into iovecs to pass to readv. 
*/ + i_rx = vec_len (msm->rx_buffers) - 1; + for (i = 0; i < n_mtu; i++) + { + b = vlib_get_buffer (vm, msm->rx_buffers[i_rx - i]); + msm->iovecs[i].iov_base = b->data; + msm->iovecs[i].iov_len = buffer_size; + } + _vec_len (msm->iovecs) = n_mtu; + + { + struct msghdr h; + + memset (&h, 0, sizeof (h)); + if (rx_addr) + { + h.msg_name = rx_addr; + h.msg_namelen = sizeof (rx_addr[0]); + } + h.msg_iov = msm->iovecs; + h.msg_iovlen = vec_len (msm->iovecs); + + n_bytes_left = recvmsg (socket, &h, 0); + if (n_bytes_left < 0) + return clib_error_return_unix (0, "recvmsg"); + } + + if (drop_message) + { + *buffer_index = ~0; + return 0; + } + + *buffer_index = msm->rx_buffers[i_rx]; + while (1) + { + b = vlib_get_buffer (vm, msm->rx_buffers[i_rx]); + + b->flags = 0; + b->current_data = 0; + b->current_length = + n_bytes_left < buffer_size ? n_bytes_left : buffer_size; + + n_bytes_left -= buffer_size; + + if (n_bytes_left <= 0) + break; + + i_rx--; + b->flags |= VLIB_BUFFER_NEXT_PRESENT; + b->next_buffer = msm->rx_buffers[i_rx]; + } + + _vec_len (msm->rx_buffers) = i_rx; + + return 0 /* no error */ ; +} + +static clib_error_t * +mastership_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + mc_main_t *mcm = &msm->mc_main; + mc_multicast_socket_t *ms = + &msm->multicast_sockets[MC_TRANSPORT_MASTERSHIP]; + clib_error_t *error; + u32 bi; + + error = recvmsg_helper (msm, ms->socket, /* rx_addr */ 0, &bi, /* drop_message */ + 0); + if (!error) + msg_handler (mcm, bi, + /* handler_frees_buffer */ 0, + mc_msg_master_assert_handler); + + return error; +} + +static clib_error_t * +to_relay_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + mc_main_t *mcm = &msm->mc_main; + vlib_main_t *vm = msm->mc_main.vlib_main; + mc_multicast_socket_t *ms_to_relay = + &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_TO_RELAY]; + mc_multicast_socket_t *ms_from_relay = + 
&msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY]; + clib_error_t *error; + u32 bi; + u32 is_master = mcm->relay_state == MC_RELAY_STATE_MASTER; + + /* Not the ordering master? Turf the msg */ + error = recvmsg_helper (msm, ms_to_relay->socket, /* rx_addr */ 0, &bi, + /* drop_message */ !is_master); + + /* If we are the master, number and rebroadcast the msg. */ + if (!error && is_master) + { + vlib_buffer_t *b = vlib_get_buffer (vm, bi); + mc_msg_user_request_t *mp = vlib_buffer_get_current (b); + mp->global_sequence = clib_host_to_net_u32 (mcm->relay_global_sequence); + mcm->relay_global_sequence++; + error = + sendmsg_helper (msm, ms_from_relay->socket, &ms_from_relay->tx_addr, + bi); + vlib_buffer_free_one (vm, bi); + } + + return error; +} + +static clib_error_t * +from_relay_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + mc_main_t *mcm = &msm->mc_main; + mc_multicast_socket_t *ms = + &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY]; + clib_error_t *error; + u32 bi; + + error = recvmsg_helper (msm, ms->socket, /* rx_addr */ 0, &bi, /* drop_message */ + 0); + if (!error) + { + msg_handler (mcm, bi, /* handler_frees_buffer */ 1, + mc_msg_user_request_handler); + } + return error; +} + +static clib_error_t * +join_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + mc_main_t *mcm = &msm->mc_main; + vlib_main_t *vm = mcm->vlib_main; + mc_multicast_socket_t *ms = &msm->multicast_sockets[MC_TRANSPORT_JOIN]; + clib_error_t *error; + u32 bi; + + error = recvmsg_helper (msm, ms->socket, /* rx_addr */ 0, &bi, /* drop_message */ + 0); + if (!error) + { + vlib_buffer_t *b = vlib_get_buffer (vm, bi); + mc_msg_join_or_leave_request_t *mp = vlib_buffer_get_current (b); + + switch (clib_host_to_net_u32 (mp->type)) + { + case MC_MSG_TYPE_join_or_leave_request: + msg_handler (mcm, bi, /* handler_frees_buffer */ 0, + 
mc_msg_join_or_leave_request_handler); + break; + + case MC_MSG_TYPE_join_reply: + msg_handler (mcm, bi, /* handler_frees_buffer */ 0, + mc_msg_join_reply_handler); + break; + + default: + ASSERT (0); + break; + } + } + return error; +} + +static clib_error_t * +ack_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + mc_main_t *mcm = &msm->mc_main; + clib_error_t *error; + u32 bi; + + error = recvmsg_helper (msm, msm->ack_socket, /* rx_addr */ 0, &bi, + /* drop_message */ 0); + if (!error) + msg_handler (mcm, bi, /* handler_frees_buffer */ 0, + mc_msg_user_ack_handler); + return error; +} + +static void +catchup_cleanup (mc_socket_main_t * msm, + mc_socket_catchup_t * c, unix_main_t * um, unix_file_t * uf) +{ + hash_unset (msm->catchup_index_by_file_descriptor, uf->file_descriptor); + unix_file_del (um, uf); + vec_free (c->input_vector); + vec_free (c->output_vector); + pool_put (msm->catchups, c); +} + +static mc_socket_catchup_t * +find_catchup_from_file_descriptor (mc_socket_main_t * msm, + int file_descriptor) +{ + uword *p = + hash_get (msm->catchup_index_by_file_descriptor, file_descriptor); + return p ? 
pool_elt_at_index (msm->catchups, p[0]) : 0; +} + +static clib_error_t * +catchup_socket_read_ready (unix_file_t * uf, int is_server) +{ + unix_main_t *um = &unix_main; + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + mc_main_t *mcm = &msm->mc_main; + mc_socket_catchup_t *c = + find_catchup_from_file_descriptor (msm, uf->file_descriptor); + word l, n, is_eof; + + l = vec_len (c->input_vector); + vec_resize (c->input_vector, 4096); + n = + read (uf->file_descriptor, c->input_vector + l, + vec_len (c->input_vector) - l); + is_eof = n == 0; + + if (n < 0) + { + if (errno == EAGAIN) + n = 0; + else + { + catchup_cleanup (msm, c, um, uf); + return clib_error_return_unix (0, "read"); + } + } + + _vec_len (c->input_vector) = l + n; + + if (is_eof && vec_len (c->input_vector) > 0) + { + if (is_server) + { + mc_msg_catchup_request_handler (mcm, (void *) c->input_vector, + c - msm->catchups); + _vec_len (c->input_vector) = 0; + } + else + { + mc_msg_catchup_reply_handler (mcm, (void *) c->input_vector, + c - msm->catchups); + c->input_vector = 0; /* reply handler is responsible for freeing vector */ + catchup_cleanup (msm, c, um, uf); + } + } + + return 0 /* no error */ ; +} + +static clib_error_t * +catchup_server_read_ready (unix_file_t * uf) +{ + return catchup_socket_read_ready (uf, /* is_server */ 1); +} + +static clib_error_t * +catchup_client_read_ready (unix_file_t * uf) +{ + if (MC_EVENT_LOGGING) + { + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + vlib_main_t *vm = msm->mc_main.vlib_main; + + ELOG_TYPE (e, "catchup_client_read_ready"); + ELOG (&vm->elog_main, e, 0); + } + return catchup_socket_read_ready (uf, /* is_server */ 0); +} + +static clib_error_t * +catchup_socket_write_ready (unix_file_t * uf, int is_server) +{ + unix_main_t *um = &unix_main; + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + mc_socket_catchup_t *c = + find_catchup_from_file_descriptor (msm, uf->file_descriptor); + clib_error_t *error = 
0; + int n; + + if (c->connect_in_progress) + { + u32 len, value; + + c->connect_in_progress = 0; + len = sizeof (value); + if (getsockopt (c->socket, SOL_SOCKET, SO_ERROR, &value, &len) < 0) + { + error = clib_error_return_unix (0, "getsockopt SO_ERROR"); + goto error_quit; + } + if (value != 0) + { + error = + clib_error_return_code (0, value, CLIB_ERROR_ERRNO_VALID, + "connect fails"); + goto error_quit; + } + } + + while (1) + { + u32 n_this_write; + + n_this_write = + clib_min (vec_len (c->output_vector) - c->output_vector_n_written, + msm->rx_mtu_n_bytes - + 64 /* ip + tcp + option allowance */ ); + + if (n_this_write <= 0) + break; + + do + { + n = write (uf->file_descriptor, + c->output_vector + c->output_vector_n_written, + n_this_write); + } + while (n < 0 && errno == EAGAIN); + + if (n < 0) + { + error = clib_error_return_unix (0, "write"); + goto error_quit; + } + c->output_vector_n_written += n; + } + + if (c->output_vector_n_written >= vec_len (c->output_vector)) + { + if (!is_server) + { + uf->flags &= ~UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + unix_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY); + /* Send EOF to other side. 
*/ + shutdown (uf->file_descriptor, SHUT_WR); + return error; + } + else + { + error_quit: + catchup_cleanup (msm, c, um, uf); + } + } + return error; +} + +static clib_error_t * +catchup_server_write_ready (unix_file_t * uf) +{ + return catchup_socket_write_ready (uf, /* is_server */ 1); +} + +static clib_error_t * +catchup_client_write_ready (unix_file_t * uf) +{ + return catchup_socket_write_ready (uf, /* is_server */ 0); +} + +static clib_error_t * +catchup_socket_error_ready (unix_file_t * uf) +{ + unix_main_t *um = &unix_main; + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + mc_socket_catchup_t *c = + find_catchup_from_file_descriptor (msm, uf->file_descriptor); + catchup_cleanup (msm, c, um, uf); + return clib_error_return (0, "error"); +} + +static clib_error_t * +catchup_listen_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + struct sockaddr_in client_addr; + int client_len; + mc_socket_catchup_t *c; + unix_file_t template = { 0 }; + + pool_get (msm->catchups, c); + memset (c, 0, sizeof (c[0])); + + client_len = sizeof (client_addr); + + /* Acquires the non-blocking attrib from the server socket. 
*/ + c->socket = accept (uf->file_descriptor, + (struct sockaddr *) &client_addr, + (socklen_t *) & client_len); + + if (c->socket < 0) + { + pool_put (msm->catchups, c); + return clib_error_return_unix (0, "accept"); + } + + if (MC_EVENT_LOGGING) + { + mc_main_t *mcm = &msm->mc_main; + vlib_main_t *vm = mcm->vlib_main; + + ELOG_TYPE_DECLARE (e) = + { + .format = "catchup accepted from 0x%lx",.format_args = "i4",}; + struct + { + u32 addr; + } *ed = 0; + + ed = ELOG_DATA (&vm->elog_main, e); + ed->addr = ntohl (client_addr.sin_addr.s_addr); + } + + /* Disable the Nagle algorithm, ship catchup pkts immediately */ + { + int one = 1; + if ((setsockopt (c->socket, IPPROTO_TCP, + TCP_NODELAY, (void *) &one, sizeof (one))) < 0) + { + clib_unix_warning ("catchup socket: set TCP_NODELAY"); + } + } + + template.read_function = catchup_server_read_ready; + template.write_function = catchup_server_write_ready; + template.error_function = catchup_socket_error_ready; + template.file_descriptor = c->socket; + template.private_data = pointer_to_uword (msm); + c->unix_file_index = unix_file_add (&unix_main, &template); + hash_set (msm->catchup_index_by_file_descriptor, c->socket, + c - msm->catchups); + + return 0; +} + +/* Return and bind to an unused port. */ +static word +find_and_bind_to_free_port (word sock, word port) +{ + for (; port < 1 << 16; port++) + { + struct sockaddr_in a; + + memset (&a, 0, sizeof (a)); /* Warnings be gone */ + + a.sin_family = PF_INET; + a.sin_addr.s_addr = INADDR_ANY; + a.sin_port = htons (port); + + if (bind (sock, (struct sockaddr *) &a, sizeof (a)) >= 0) + break; + } + + return port < 1 << 16 ? 
port : -1; +} + +static clib_error_t * +setup_mutlicast_socket (mc_socket_main_t * msm, + mc_multicast_socket_t * ms, + char *type, uword udp_port) +{ + int one = 1; + struct ip_mreq mcast_req; + + if (!msm->multicast_ttl) + msm->multicast_ttl = 1; + + /* mastership (multicast) TX socket */ + if ((ms->socket = socket (PF_INET, SOCK_DGRAM, IPPROTO_UDP)) < 0) + return clib_error_return_unix (0, "%s socket", type); + + { + u8 ttl = msm->multicast_ttl; + + if ((setsockopt (ms->socket, IPPROTO_IP, + IP_MULTICAST_TTL, (void *) &ttl, sizeof (ttl))) < 0) + return clib_error_return_unix (0, "%s set multicast ttl", type); + } + + if (setsockopt (ms->socket, SOL_SOCKET, SO_REUSEADDR, &one, sizeof (one)) < + 0) + return clib_error_return_unix (0, "%s setsockopt SO_REUSEADDR", type); + + memset (&ms->tx_addr, 0, sizeof (ms->tx_addr)); + ms->tx_addr.sin_family = AF_INET; + ms->tx_addr.sin_addr.s_addr = + htonl (msm->multicast_tx_ip4_address_host_byte_order); + ms->tx_addr.sin_port = htons (udp_port); + + if (bind (ms->socket, (struct sockaddr *) &ms->tx_addr, + sizeof (ms->tx_addr)) < 0) + return clib_error_return_unix (0, "%s bind", type); + + memset (&mcast_req, 0, sizeof (mcast_req)); + mcast_req.imr_multiaddr.s_addr = + htonl (msm->multicast_tx_ip4_address_host_byte_order); + mcast_req.imr_interface.s_addr = msm->if_ip4_address_net_byte_order; + + if ((setsockopt (ms->socket, IPPROTO_IP, + IP_ADD_MEMBERSHIP, (void *) &mcast_req, + sizeof (mcast_req))) < 0) + return clib_error_return_unix (0, "%s IP_ADD_MEMBERSHIP setsockopt", + type); + + if (ioctl (ms->socket, FIONBIO, &one) < 0) + return clib_error_return_unix (0, "%s set FIONBIO", type); + + /* FIXME remove this when we support tx_ready. 
*/ + { + u32 len = 1 << 20; + socklen_t sl = sizeof (len); + if (setsockopt (ms->socket, SOL_SOCKET, SO_SNDBUF, &len, sl) < 0) + clib_unix_error ("setsockopt"); + } + + return 0; +} + +static clib_error_t * +socket_setup (mc_socket_main_t * msm) +{ + int one = 1; + clib_error_t *error; + u32 port; + + if (!msm->base_multicast_udp_port_host_byte_order) + msm->base_multicast_udp_port_host_byte_order = + 0xffff - ((MC_N_TRANSPORT_TYPE + 2 /* ack socket, catchup socket */ ) + - 1); + + port = msm->base_multicast_udp_port_host_byte_order; + + error = setup_mutlicast_socket (msm, + &msm->multicast_sockets + [MC_TRANSPORT_MASTERSHIP], "mastership", + port++); + if (error) + return error; + + error = setup_mutlicast_socket (msm, + &msm->multicast_sockets[MC_TRANSPORT_JOIN], + "join", port++); + if (error) + return error; + + error = setup_mutlicast_socket (msm, + &msm->multicast_sockets + [MC_TRANSPORT_USER_REQUEST_TO_RELAY], + "to relay", port++); + if (error) + return error; + + error = setup_mutlicast_socket (msm, + &msm->multicast_sockets + [MC_TRANSPORT_USER_REQUEST_FROM_RELAY], + "from relay", port++); + if (error) + return error; + + /* ACK rx socket */ + msm->ack_socket = socket (PF_INET, SOCK_DGRAM, IPPROTO_UDP); + if (msm->ack_socket < 0) + return clib_error_return_unix (0, "ack socket"); + + msm->ack_udp_port = find_and_bind_to_free_port (msm->ack_socket, port++); + + if (ioctl (msm->ack_socket, FIONBIO, &one) < 0) + return clib_error_return_unix (0, "ack socket FIONBIO"); + + msm->catchup_server_socket = socket (AF_INET, SOCK_STREAM, 0); + if (msm->catchup_server_socket < 0) + return clib_error_return_unix (0, "catchup server socket"); + + msm->catchup_tcp_port = + find_and_bind_to_free_port (msm->catchup_server_socket, port++); + + if (ioctl (msm->catchup_server_socket, FIONBIO, &one) < 0) + return clib_error_return_unix (0, "catchup server socket FIONBIO"); + + if (listen (msm->catchup_server_socket, 5) < 0) + return clib_error_return_unix (0, "catchup server 
socket listen"); + + /* epoll setup for multicast mastership socket */ + { + unix_file_t template = { 0 }; + + template.read_function = mastership_socket_read_ready; + template.file_descriptor = + msm->multicast_sockets[MC_TRANSPORT_MASTERSHIP].socket; + template.private_data = (uword) msm; + unix_file_add (&unix_main, &template); + + /* epoll setup for multicast to_relay socket */ + template.read_function = to_relay_socket_read_ready; + template.file_descriptor = + msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_TO_RELAY].socket; + template.private_data = (uword) msm; + unix_file_add (&unix_main, &template); + + /* epoll setup for multicast from_relay socket */ + template.read_function = from_relay_socket_read_ready; + template.file_descriptor = + msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY].socket; + template.private_data = (uword) msm; + unix_file_add (&unix_main, &template); + + template.read_function = join_socket_read_ready; + template.file_descriptor = + msm->multicast_sockets[MC_TRANSPORT_JOIN].socket; + template.private_data = (uword) msm; + unix_file_add (&unix_main, &template); + + /* epoll setup for ack rx socket */ + template.read_function = ack_socket_read_ready; + template.file_descriptor = msm->ack_socket; + template.private_data = (uword) msm; + unix_file_add (&unix_main, &template); + + /* epoll setup for TCP catchup server */ + template.read_function = catchup_listen_read_ready; + template.file_descriptor = msm->catchup_server_socket; + template.private_data = (uword) msm; + unix_file_add (&unix_main, &template); + } + + return 0; +} + +static void * +catchup_add_pending_output (mc_socket_catchup_t * c, uword n_bytes, + u8 * set_output_vector) +{ + unix_file_t *uf = pool_elt_at_index (unix_main.file_pool, + c->unix_file_index); + u8 *result = 0; + + if (set_output_vector) + c->output_vector = set_output_vector; + else + vec_add2 (c->output_vector, result, n_bytes); + if (vec_len (c->output_vector) > 0) + { + int skip_update = 0 
!= (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); + uf->flags |= UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + if (!skip_update) + unix_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY); + } + return result; +} + +static uword +catchup_request_fun (void *transport_main, + u32 stream_index, mc_peer_id_t catchup_peer_id) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) transport_main; + mc_main_t *mcm = &msm->mc_main; + vlib_main_t *vm = mcm->vlib_main; + mc_socket_catchup_t *c; + struct sockaddr_in addr; + unix_main_t *um = &unix_main; + int one = 1; + + pool_get (msm->catchups, c); + memset (c, 0, sizeof (*c)); + + c->socket = socket (AF_INET, SOCK_STREAM, 0); + if (c->socket < 0) + { + clib_unix_warning ("socket"); + return 0; + } + + if (ioctl (c->socket, FIONBIO, &one) < 0) + { + clib_unix_warning ("FIONBIO"); + return 0; + } + + memset (&addr, 0, sizeof (addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = mc_socket_peer_id_get_address (catchup_peer_id); + addr.sin_port = mc_socket_peer_id_get_port (catchup_peer_id); + + c->connect_in_progress = 1; + + if (MC_EVENT_LOGGING) + { + ELOG_TYPE_DECLARE (e) = + { + .format = "connecting to peer 0x%Lx",.format_args = "i8",}; + struct + { + u64 peer; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->peer = catchup_peer_id.as_u64; + } + + if (connect (c->socket, (const void *) &addr, sizeof (addr)) + < 0 && errno != EINPROGRESS) + { + clib_unix_warning ("connect to %U fails", + format_socket_peer_id, catchup_peer_id); + return 0; + } + + { + unix_file_t template = { 0 }; + + template.read_function = catchup_client_read_ready; + template.write_function = catchup_client_write_ready; + template.error_function = catchup_socket_error_ready; + template.file_descriptor = c->socket; + template.private_data = (uword) msm; + c->unix_file_index = unix_file_add (um, &template); + + hash_set (msm->catchup_index_by_file_descriptor, c->socket, + c - msm->catchups); + } + + { + mc_msg_catchup_request_t *mp; + mp = 
catchup_add_pending_output (c, sizeof (mp[0]), /* set_output_vector */ + 0); + mp->peer_id = msm->mc_main.transport.our_catchup_peer_id; + mp->stream_index = stream_index; + mc_byte_swap_msg_catchup_request (mp); + } + + return c - msm->catchups; +} + +static void +catchup_send_fun (void *transport_main, uword opaque, u8 * data) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) transport_main; + mc_socket_catchup_t *c = pool_elt_at_index (msm->catchups, opaque); + catchup_add_pending_output (c, 0, data); +} + +static int +find_interface_ip4_address (char *if_name, u32 * ip4_address, u32 * mtu) +{ + int fd; + struct ifreq ifr; + struct sockaddr_in *sa; + + /* Dig up our IP address */ + fd = socket (PF_INET, AF_INET, 0); + if (fd < 0) + { + clib_unix_error ("socket"); + return -1; + } + + ifr.ifr_addr.sa_family = AF_INET; + strncpy (ifr.ifr_name, if_name, sizeof (ifr.ifr_name) - 1); + if (ioctl (fd, SIOCGIFADDR, &ifr) < 0) + { + clib_unix_error ("ioctl(SIOCFIGADDR)"); + close (fd); + return -1; + } + + sa = (void *) &ifr.ifr_addr; + clib_memcpy (ip4_address, &sa->sin_addr.s_addr, sizeof (ip4_address[0])); + + if (ioctl (fd, SIOCGIFMTU, &ifr) < 0) + { + close (fd); + return -1; + } + if (mtu) + *mtu = ifr.ifr_mtu - ( /* IP4 header */ 20 + /* UDP header */ 8); + + close (fd); + + return 0; +} + +clib_error_t * +mc_socket_main_init (mc_socket_main_t * msm, char **intfc_probe_list, + int n_intfcs_to_probe) +{ + clib_error_t *error; + mc_main_t *mcm; + u32 mtu; + + mcm = &msm->mc_main; + + /* 239.255.0.7 */ + if (!msm->multicast_tx_ip4_address_host_byte_order) + msm->multicast_tx_ip4_address_host_byte_order = 0xefff0007; + + { + u32 i, a, win; + + win = 0; + if (msm->multicast_interface_name) + { + win = + !find_interface_ip4_address (msm->multicast_interface_name, &a, + &mtu); + } + else + { + for (i = 0; i < n_intfcs_to_probe; i++) + if (!find_interface_ip4_address (intfc_probe_list[i], &a, &mtu)) + { + win = 1; + msm->multicast_interface_name = intfc_probe_list[i]; + 
break; + } + } + + if (!win) + return clib_error_return (0, "can't find interface ip4 address"); + + msm->if_ip4_address_net_byte_order = a; + } + + msm->rx_mtu_n_bytes = mtu; + msm->rx_mtu_n_buffers = + msm->rx_mtu_n_bytes / VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES; + msm->rx_mtu_n_buffers += + (msm->rx_mtu_n_bytes % VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES) != 0; + + error = socket_setup (msm); + if (error) + return error; + + mcm->transport.our_ack_peer_id = + mc_socket_set_peer_id (msm->if_ip4_address_net_byte_order, + msm->ack_udp_port); + + mcm->transport.our_catchup_peer_id = + mc_socket_set_peer_id (msm->if_ip4_address_net_byte_order, + msm->catchup_tcp_port); + + mcm->transport.tx_buffer = tx_buffer; + mcm->transport.tx_ack = tx_ack; + mcm->transport.catchup_request_fun = catchup_request_fun; + mcm->transport.catchup_send_fun = catchup_send_fun; + mcm->transport.format_peer_id = format_socket_peer_id; + mcm->transport.opaque = msm; + mcm->transport.max_packet_size = mtu; + + mc_main_init (mcm, "socket"); + + return error; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/mc_socket.h b/src/vlib/unix/mc_socket.h new file mode 100644 index 00000000000..273c9ad430b --- /dev/null +++ b/src/vlib/unix/mc_socket.h @@ -0,0 +1,137 @@ +/* + * mc_socket.h: socket based multicast for vlib mc + * + * Copyright (c) 2010 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __included_mc_socket_h__ +#define __included_mc_socket_h__ + +#include <vlib/unix/unix.h> +#include <netinet/in.h> + +typedef struct +{ + int socket; + struct sockaddr_in tx_addr; +} mc_multicast_socket_t; + +/* TCP catchup socket */ +typedef struct +{ + int socket; + u32 unix_file_index; + + u8 *input_vector; + u8 *output_vector; + u32 output_vector_n_written; + + u32 connect_in_progress; +} mc_socket_catchup_t; + +typedef struct mc_socket_main_t +{ + mc_main_t mc_main; + + /* Multicast mastership/to-relay/from-relay sockets. */ + mc_multicast_socket_t multicast_sockets[MC_N_TRANSPORT_TYPE]; + + /* Unicast UDP ack sockets */ + int ack_socket; + + /* TCP catchup server socket */ + int catchup_server_socket; + + /* Pool of stream-private catchup sockets */ + mc_socket_catchup_t *catchups; + + uword *catchup_index_by_file_descriptor; + + u32 rx_mtu_n_bytes; + + /* Receive MTU in bytes and VLIB buffers. */ + u32 rx_mtu_n_buffers; + + /* Vector of RX VLIB buffers. */ + u32 *rx_buffers; + /* Vector of scatter/gather descriptors for sending/receiving VLIB buffers + via kernel. */ + struct iovec *iovecs; + + /* IP address of interface to use for multicast. */ + u32 if_ip4_address_net_byte_order; + + u32 ack_udp_port; + u32 catchup_tcp_port; + + /* Interface on which to listen for multicasts. */ + char *multicast_interface_name; + + /* Multicast address to use (e.g. 0xefff0000). + Host byte order. */ + u32 multicast_tx_ip4_address_host_byte_order; + + /* TTL to use for multicasts. */ + u32 multicast_ttl; + + /* Multicast ports for mastership, joins, etc. will be chosen + starting at the given port in host byte order. + A total of MC_N_TRANSPORT_TYPE ports will be used. 
*/ + u32 base_multicast_udp_port_host_byte_order; +} mc_socket_main_t; + +always_inline u32 +mc_socket_peer_id_get_address (mc_peer_id_t i) +{ + u32 a = ((i.as_u8[0] << 24) + | (i.as_u8[1] << 16) | (i.as_u8[2] << 8) | (i.as_u8[3] << 0)); + return clib_host_to_net_u32 (a); +} + +always_inline u32 +mc_socket_peer_id_get_port (mc_peer_id_t i) +{ + return clib_host_to_net_u16 ((i.as_u8[4] << 8) | i.as_u8[5]); +} + +static_always_inline mc_peer_id_t +mc_socket_set_peer_id (u32 address_net_byte_order, u32 port_host_byte_order) +{ + mc_peer_id_t i; + u32 a = ntohl (address_net_byte_order); + u32 p = port_host_byte_order; + i.as_u8[0] = (a >> 24) & 0xff; + i.as_u8[1] = (a >> 16) & 0xff; + i.as_u8[2] = (a >> 8) & 0xff; + i.as_u8[3] = (a >> 0) & 0xff; + i.as_u8[4] = (p >> 8) & 0xff; + i.as_u8[5] = (p >> 0) & 0xff; + i.as_u8[6] = 0; + i.as_u8[7] = 0; + return i; +} + +clib_error_t *mc_socket_main_init (mc_socket_main_t * msm, + char **intfc_probe_list, + int n_intfcs_to_probe); +#endif /* __included_mc_socket_h__ */ + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c new file mode 100644 index 00000000000..80ab7b9d6f8 --- /dev/null +++ b/src/vlib/unix/physmem.c @@ -0,0 +1,470 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * physmem.c: Unix physical memory + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/unix/physmem.h> + +static physmem_main_t physmem_main; + +static void * +unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes, + uword alignment) +{ + physmem_main_t *pm = &physmem_main; + uword lo_offset, hi_offset; + uword *to_free = 0; + +#if DPDK > 0 + clib_warning ("unsafe alloc!"); +#endif + + /* IO memory is always at least cache aligned. */ + alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES); + + while (1) + { + mheap_get_aligned (pm->heap, n_bytes, + /* align */ alignment, + /* align offset */ 0, + &lo_offset); + + /* Allocation failed? */ + if (lo_offset == ~0) + break; + + /* Make sure allocation does not span DMA physical chunk boundary. 
*/ + hi_offset = lo_offset + n_bytes - 1; + + if ((lo_offset >> vpm->log2_n_bytes_per_page) == + (hi_offset >> vpm->log2_n_bytes_per_page)) + break; + + /* Allocation would span chunk boundary, queue it to be freed as soon as + we find suitable chunk. */ + vec_add1 (to_free, lo_offset); + } + + if (to_free != 0) + { + uword i; + for (i = 0; i < vec_len (to_free); i++) + mheap_put (pm->heap, to_free[i]); + vec_free (to_free); + } + + return lo_offset != ~0 ? pm->heap + lo_offset : 0; +} + +static void +unix_physmem_free (void *x) +{ + physmem_main_t *pm = &physmem_main; + + /* Return object to region's heap. */ + mheap_put (pm->heap, x - pm->heap); +} + +static void +htlb_shutdown (void) +{ + physmem_main_t *pm = &physmem_main; + + if (!pm->shmid) + return; + shmctl (pm->shmid, IPC_RMID, 0); + pm->shmid = 0; +} + +/* try to use huge TLB pgs if possible */ +static int +htlb_init (vlib_main_t * vm) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + physmem_main_t *pm = &physmem_main; + u64 hugepagesize, pagesize; + u64 pfn, seek_loc; + u64 cur, physaddr, ptbits; + int fd, i; + + pm->shmid = shmget (11 /* key, my amp goes to 11 */ , pm->mem_size, + IPC_CREAT | SHM_HUGETLB | SHM_R | SHM_W); + if (pm->shmid < 0) + { + clib_unix_warning ("shmget"); + return 0; + } + + pm->mem = shmat (pm->shmid, NULL, 0 /* flags */ ); + if (pm->mem == 0) + { + shmctl (pm->shmid, IPC_RMID, 0); + return 0; + } + + memset (pm->mem, 0, pm->mem_size); + + /* $$$ get page size info from /proc/meminfo */ + hugepagesize = 2 << 20; + pagesize = 4 << 10; + vpm->log2_n_bytes_per_page = min_log2 (hugepagesize); + vec_resize (vpm->page_table, pm->mem_size / hugepagesize); + + vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page); + vpm->virtual.start = pointer_to_uword (pm->mem); + vpm->virtual.size = pm->mem_size; + vpm->virtual.end = vpm->virtual.start + vpm->virtual.size; + + fd = open ("/proc/self/pagemap", O_RDONLY); + + if (fd < 0) + { + (void) shmdt (pm->mem); + return 0; + } + + pm->heap = 
mheap_alloc_with_flags (pm->mem, pm->mem_size, + /* Don't want mheap mmap/munmap with IO memory. */ + MHEAP_FLAG_DISABLE_VM); + + cur = pointer_to_uword (pm->mem); + i = 0; + + while (cur < pointer_to_uword (pm->mem) + pm->mem_size) + { + pfn = (u64) cur / pagesize; + seek_loc = pfn * sizeof (u64); + if (lseek (fd, seek_loc, SEEK_SET) != seek_loc) + { + clib_unix_warning ("lseek to 0x%llx", seek_loc); + shmctl (pm->shmid, IPC_RMID, 0); + close (fd); + return 0; + } + if (read (fd, &ptbits, sizeof (ptbits)) != (sizeof (ptbits))) + { + clib_unix_warning ("read ptbits"); + shmctl (pm->shmid, IPC_RMID, 0); + close (fd); + return 0; + } + + /* bits 0-54 are the physical page number */ + physaddr = (ptbits & 0x7fffffffffffffULL) * pagesize; + if (CLIB_DEBUG > 1) + fformat (stderr, "pm: virtual 0x%llx physical 0x%llx\n", + cur, physaddr); + vpm->page_table[i++] = physaddr; + + cur += hugepagesize; + } + close (fd); + atexit (htlb_shutdown); + return 1; +} + +int vlib_app_physmem_init (vlib_main_t * vm, + physmem_main_t * pm, int) __attribute__ ((weak)); +int +vlib_app_physmem_init (vlib_main_t * vm, physmem_main_t * pm, int x) +{ + return 0; +} + +clib_error_t * +unix_physmem_init (vlib_main_t * vm, int physical_memory_required) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + physmem_main_t *pm = &physmem_main; + clib_error_t *error = 0; + + /* Avoid multiple calls. */ + if (vm->os_physmem_alloc_aligned) + return error; + + vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; + vm->os_physmem_free = unix_physmem_free; + pm->mem = MAP_FAILED; + + if (pm->mem_size == 0) + pm->mem_size = 16 << 20; + + /* OK, Mr. 
App, you tell us */ + if (vlib_app_physmem_init (vm, pm, physical_memory_required)) + return 0; + + if (!pm->no_hugepages && htlb_init (vm)) + { + fformat (stderr, "%s: use huge pages\n", __FUNCTION__); + return 0; + } + + pm->mem = + mmap (0, pm->mem_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (pm->mem == MAP_FAILED) + { + error = clib_error_return_unix (0, "mmap"); + goto done; + } + + pm->heap = mheap_alloc (pm->mem, pm->mem_size); + + /* Identity map with a single page. */ + vpm->log2_n_bytes_per_page = min_log2 (pm->mem_size); + vec_add1 (vpm->page_table, pointer_to_uword (pm->mem)); + + vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page); + vpm->virtual.start = pointer_to_uword (pm->mem); + vpm->virtual.size = pm->mem_size; + vpm->virtual.end = vpm->virtual.start + vpm->virtual.size; + vpm->is_fake = 1; + + fformat (stderr, "%s: use fake dma pages\n", __FUNCTION__); + +done: + if (error) + { + if (pm->mem != MAP_FAILED) + munmap (pm->mem, pm->mem_size); + } + return error; +} + +static clib_error_t * +show_physmem (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ +#if DPDK > 0 + vlib_cli_output (vm, "Not supported with DPDK drivers."); +#else + physmem_main_t *pm = &physmem_main; + + if (pm->heap) + vlib_cli_output (vm, "%U", format_mheap, pm->heap, /* verbose */ 1); + else + vlib_cli_output (vm, "No physmem allocated."); +#endif + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_physmem_command, static) = { + .path = "show physmem", + .short_help = "Show physical memory allocation", + .function = show_physmem, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_affinity (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + cpu_set_t set; + cpu_set_t *setp = &set; + int i, rv; + u8 *s = 0; + int first_set_bit_in_run = -1; + int last_set_bit_in_run = -1; + int output_done = 0; + + rv = sched_getaffinity (0 /* pid, 0 = this proc */ , + sizeof (*setp), setp); + if (rv 
< 0) + { + vlib_cli_output (vm, "Couldn't get affinity mask: %s\n", + strerror (errno)); + return 0; + } + + for (i = 0; i < 64; i++) + { + if (CPU_ISSET (i, setp)) + { + if (first_set_bit_in_run == -1) + { + first_set_bit_in_run = i; + last_set_bit_in_run = i; + if (output_done) + s = format (s, ","); + s = format (s, "%d-", i); + output_done = 1; + } + else + { + if (i == (last_set_bit_in_run + 1)) + last_set_bit_in_run = i; + } + } + else + { + if (first_set_bit_in_run != -1) + { + if (first_set_bit_in_run == (i - 1)) + { + _vec_len (s) -= 2 + ((first_set_bit_in_run / 10)); + } + s = format (s, "%d", last_set_bit_in_run); + first_set_bit_in_run = -1; + last_set_bit_in_run = -1; + } + } + } + + if (first_set_bit_in_run != -1) + s = format (s, "%d", first_set_bit_in_run); + + vlib_cli_output (vm, "Process runs on: %v", s); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_affinity_command, static) = { + .path = "show affinity", + .short_help = "Show process cpu affinity", + .function = show_affinity, +}; +/* *INDENT-ON* */ + +static clib_error_t * +set_affinity (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + cpu_set_t set; + cpu_set_t *setp = &set; + int i, rv; + int another_round; + u32 first, last; + + memset (setp, 0, sizeof (*setp)); + + do + { + another_round = 0; + if (unformat (input, "%d-%d,", &first, &last)) + { + if (first > 64 || last > 64) + { + barf1: + vlib_cli_output (vm, "range %d-%d invalid", first, last); + return 0; + } + + for (i = first; i <= last; i++) + CPU_SET (i, setp); + another_round = 1; + } + else if (unformat (input, "%d-%d", &first, &last)) + { + if (first > 64 || last > 64) + goto barf1; + + for (i = first; i <= last; i++) + CPU_SET (i, setp); + } + else if (unformat (input, "%d,", &first)) + { + if (first > 64) + { + barf2: + vlib_cli_output (vm, "cpu %d invalid", first); + return 0; + } + CPU_SET (first, setp); + another_round = 1; + } + else if (unformat (input, "%d", &first)) + { + if 
(first > 64) + goto barf2; + + CPU_SET (first, setp); + } + } + while (another_round); + + rv = sched_setaffinity (0 /* pid, 0 = this proc */ , + sizeof (*setp), setp); + + if (rv < 0) + { + vlib_cli_output (vm, "Couldn't get affinity mask: %s\n", + strerror (errno)); + return 0; + } + return show_affinity (vm, input, cmd); +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (set_affinity_command, static) = { + .path = "set affinity", + .short_help = "Set process cpu affinity", + .function = set_affinity, +}; +/* *INDENT-ON* */ + +static clib_error_t * +vlib_physmem_configure (vlib_main_t * vm, unformat_input_t * input) +{ + physmem_main_t *pm = &physmem_main; + u32 size_in_mb; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "no-huge") || unformat (input, "no-huge-pages")) + pm->no_hugepages = 1; + + else if (unformat (input, "size-in-mb %d", &size_in_mb) || + unformat (input, "size %d", &size_in_mb)) + pm->mem_size = size_in_mb << 20; + else + return unformat_parse_error (input); + } + + unformat_free (input); + return 0; +} + +VLIB_EARLY_CONFIG_FUNCTION (vlib_physmem_configure, "physmem"); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/physmem.h b/src/vlib/unix/physmem.h new file mode 100644 index 00000000000..5519a7d6f3e --- /dev/null +++ b/src/vlib/unix/physmem.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_physmem_h__ +#define __included_physmem_h__ + +/* Manage I/O physical memory. */ +#define _GNU_SOURCE +#include <sched.h> +#include <vppinfra/cache.h> +#include <vppinfra/error.h> +#include <vppinfra/mheap.h> +#include <vppinfra/os.h> + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> + +#include <sys/fcntl.h> /* for open */ +#include <sys/file.h> /* for flock */ +#include <sys/ioctl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/ipc.h> +#include <sys/shm.h> + +typedef struct +{ + /* Virtual memory via mmaped. */ + void *mem; + + /* Size in bytes. */ + uword mem_size; + + /* Heap allocated out of virtual memory. */ + void *heap; + + /* huge TLB segment id */ + int shmid; + + /* should we try to use htlb ? */ + int no_hugepages; + +} physmem_main_t; + +#endif /* __included_physmem_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/plugin.c b/src/vlib/unix/plugin.c new file mode 100644 index 00000000000..b3d5be02ed6 --- /dev/null +++ b/src/vlib/unix/plugin.c @@ -0,0 +1,260 @@ +/* + * plugin.c: plugin handling + * + * Copyright (c) 2011 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vlib/unix/plugin.h> +#include <dlfcn.h> +#include <dirent.h> + +plugin_main_t vlib_plugin_main; + +void +vlib_set_get_handoff_structure_cb (void *cb) +{ + plugin_main_t *pm = &vlib_plugin_main; + pm->handoff_structure_get_cb = cb; +} + +static void * +vnet_get_handoff_structure (void) +{ + void *(*fp) (void); + + fp = vlib_plugin_main.handoff_structure_get_cb; + if (fp == 0) + return 0; + else + return (*fp) (); +} + +static int +load_one_plugin (plugin_main_t * pm, plugin_info_t * pi, int from_early_init) +{ + void *handle, *register_handle; + clib_error_t *(*fp) (vlib_main_t *, void *, int); + clib_error_t *error; + void *handoff_structure; + + handle = dlopen ((char *) pi->name, RTLD_LAZY); + + /* + * Note: this can happen if the plugin has an undefined symbol reference, + * so print a warning. Otherwise, the poor slob won't know what happened. + * Ask me how I know that... + */ + if (handle == 0) + { + clib_warning ("%s", dlerror ()); + return -1; + } + + pi->handle = handle; + + + register_handle = dlsym (pi->handle, "vlib_plugin_register"); + if (register_handle == 0) + { + dlclose (handle); + clib_warning ("Plugin missing vlib_plugin_register: %s\n", + (char *) pi->name); + return 1; + } + + fp = register_handle; + + handoff_structure = vnet_get_handoff_structure (); + + if (handoff_structure == 0) + error = clib_error_return (0, "handoff structure callback returned 0"); + else + error = (*fp) (pm->vlib_main, handoff_structure, from_early_init); + + if (error) + { + clib_error_report (error); + dlclose (handle); + return 1; + } + + clib_warning ("Loaded plugin: %s", pi->name); + + return 0; +} + +static u8 ** +split_plugin_path (plugin_main_t * pm) +{ + int i; + u8 **rv = 0; + u8 *path = pm->plugin_path; + u8 *this = 0; + + for (i = 0; i < vec_len (pm->plugin_path); i++) + { + if (path[i] != ':') + { + vec_add1 (this, path[i]); + continue; + } + vec_add1 (this, 0); + vec_add1 (rv, this); + this = 0; + } + if (this) + { + vec_add1 (this, 0); 
+ vec_add1 (rv, this); + } + return rv; +} + +int +vlib_load_new_plugins (plugin_main_t * pm, int from_early_init) +{ + DIR *dp; + struct dirent *entry; + struct stat statb; + uword *p; + plugin_info_t *pi; + u8 **plugin_path; + int i; + + plugin_path = split_plugin_path (pm); + + for (i = 0; i < vec_len (plugin_path); i++) + { + dp = opendir ((char *) plugin_path[i]); + + if (dp == 0) + continue; + + while ((entry = readdir (dp))) + { + u8 *plugin_name; + + if (pm->plugin_name_filter) + { + int j; + for (j = 0; j < vec_len (pm->plugin_name_filter); j++) + if (entry->d_name[j] != pm->plugin_name_filter[j]) + goto next; + } + + plugin_name = format (0, "%s/%s%c", plugin_path[i], + entry->d_name, 0); + + /* Only accept .so */ + char *ext = strrchr ((const char *) plugin_name, '.'); + /* unreadable */ + if (!ext || (strcmp (ext, ".so") != 0) || + stat ((char *) plugin_name, &statb) < 0) + { + ignore: + vec_free (plugin_name); + continue; + } + + /* a dir or other things which aren't plugins */ + if (!S_ISREG (statb.st_mode)) + goto ignore; + + p = hash_get_mem (pm->plugin_by_name_hash, plugin_name); + if (p == 0) + { + vec_add2 (pm->plugin_info, pi, 1); + pi->name = plugin_name; + pi->file_info = statb; + + if (load_one_plugin (pm, pi, from_early_init)) + { + vec_free (plugin_name); + _vec_len (pm->plugin_info) = vec_len (pm->plugin_info) - 1; + continue; + } + memset (pi, 0, sizeof (*pi)); + hash_set_mem (pm->plugin_by_name_hash, plugin_name, + pi - pm->plugin_info); + } + next: + ; + } + closedir (dp); + vec_free (plugin_path[i]); + } + vec_free (plugin_path); + return 0; +} + +char *vlib_plugin_path __attribute__ ((weak)); +char *vlib_plugin_path = ""; +char *vlib_plugin_name_filter __attribute__ ((weak)); +char *vlib_plugin_name_filter = 0; + +int +vlib_plugin_early_init (vlib_main_t * vm) +{ + plugin_main_t *pm = &vlib_plugin_main; + + pm->plugin_path = format (0, "%s%c", vlib_plugin_path, 0); + + clib_warning ("plugin path %s", pm->plugin_path); + + if 
(vlib_plugin_name_filter) + pm->plugin_name_filter = format (0, "%s%c", vlib_plugin_name_filter, 0); + + pm->plugin_by_name_hash = hash_create_string (0, sizeof (uword)); + pm->vlib_main = vm; + + return vlib_load_new_plugins (pm, 1 /* from_early_init */ ); +} + +static clib_error_t * +vlib_plugins_show_cmd_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + plugin_main_t *pm = &vlib_plugin_main; + u8 *s = 0; + u8 *key = 0; + uword *value = 0; + int index = 1; + + s = format (s, " Plugin path is: %s\n", pm->plugin_path); + if (vlib_plugin_name_filter) + s = format (s, " Plugin filter: %s\n", vlib_plugin_name_filter); + + s = format (s, " Plugins loaded: \n"); + hash_foreach_mem (key, value, pm->plugin_by_name_hash, + { + if (key != 0) + s = format (s, " %d.%s\n", index, key); index++;} + ); + + vlib_cli_output (vm, "%v", s); + vec_free (s); + return 0; +} + +VLIB_CLI_COMMAND (plugins_show_cmd, static) = +{ +.path = "show plugins",.short_help = "show loaded plugins",.function = + vlib_plugins_show_cmd_fn,}; +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/plugin.h b/src/vlib/unix/plugin.h new file mode 100644 index 00000000000..c17053bd306 --- /dev/null +++ b/src/vlib/unix/plugin.h @@ -0,0 +1,98 @@ +/* + * plugin.h: plugin handling + * + * Copyright (c) 2011 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __included_plugin_h__ +#define __included_plugin_h__ + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +/* + * vlib plugin scheme + * + * Almost anything which can be made to work in a vlib unix + * application will also work in a vlib plugin. + * + * The elf-section magic which registers static objects + * works so long as plugins are present when the vlib unix process + * starts. But wait: there's more... + * + * If an application calls vlib_load_new_plugins() -- possibly after + * changing vlib_plugin_main.plugin_path / vlib_plugin_main.plugin_name_filter, + * -- new plugins will be loaded. That, in turn, allows considerable + * flexibility in terms of adding feature code or fixing bugs without + * requiring the data-plane process to restart. + * + * When the plugin mechanism loads a plugin, it uses dlsym to locate + * and call the plugin's function vlib_plugin_register() if it exists. + * A plugin which expects to be loaded after the vlib application + * starts uses this callback to modify the application. If vlib_plugin_register + * returns non-zero, the plugin mechanism dlclose()'s the plugin. + * + * Applications control the plugin search path and name filter by + * declaring the variables vlib_plugin_path and vlib_plugin_name_filter. + * libvlib_unix.la supplies weak references for these symbols which + * effectively disable the scheme. In order for the elf-section magic to + * work, static plugins must be loaded at the earliest possible moment. + * + * An application can change these parameters at any time and call + * vlib_load_new_plugins().
+ */ + + + +typedef struct +{ + u8 *name; + struct stat file_info; + void *handle; +} plugin_info_t; + +typedef struct +{ + /* loaded plugin info */ + plugin_info_t *plugin_info; + uword *plugin_by_name_hash; + + /* path and name filter */ + u8 *plugin_path; + u8 *plugin_name_filter; + + /* handoff structure get callback */ + void *handoff_structure_get_cb; + + /* usual */ + vlib_main_t *vlib_main; +} plugin_main_t; + +extern plugin_main_t vlib_plugin_main; + +int vlib_plugin_early_init (vlib_main_t * vm); +int vlib_load_new_plugins (plugin_main_t * pm, int from_early_init); + +#endif /* __included_plugin_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h new file mode 100644 index 00000000000..ea0d417b2b1 --- /dev/null +++ b/src/vlib/unix/unix.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * unix.h: Unix specific main state + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_unix_unix_h +#define included_unix_unix_h + +#include <vppinfra/socket.h> +#include <termios.h> + +struct unix_file; +typedef clib_error_t *(unix_file_function_t) (struct unix_file * f); + +typedef struct unix_file +{ + /* Unix file descriptor from open/socket. */ + u32 file_descriptor; + + u32 flags; +#define UNIX_FILE_DATA_AVAILABLE_TO_WRITE (1 << 0) +#define UNIX_FILE_EVENT_EDGE_TRIGGERED (1 << 1) + + /* Data available for function's use. */ + uword private_data; + + /* Functions to be called when read/write data becomes ready. 
*/ + unix_file_function_t *read_function, *write_function, *error_function; +} unix_file_t; + +typedef struct +{ + f64 time; + clib_error_t *error; +} unix_error_history_t; + +typedef enum +{ + UNIX_FILE_UPDATE_ADD, + UNIX_FILE_UPDATE_MODIFY, + UNIX_FILE_UPDATE_DELETE, +} unix_file_update_type_t; + +typedef struct +{ + /* Back pointer to main structure. */ + vlib_main_t *vlib_main; + + u32 flags; + /* Run interactively or as daemon (background process). */ +#define UNIX_FLAG_INTERACTIVE (1 << 0) +#define UNIX_FLAG_NODAEMON (1 << 1) + + /* Pool of files to poll for input/output. */ + unix_file_t *file_pool; + + /* CLI listen socket. */ + clib_socket_t cli_listen_socket; + + void (*file_update) (unix_file_t * file, + unix_file_update_type_t update_type); + + /* Circular buffer of last unix errors. */ + unix_error_history_t error_history[128]; + u32 error_history_index; + u64 n_total_errors; + + /* startup-config filename */ + u8 *startup_config_filename; + + /* unix config complete */ + volatile int unix_config_complete; + + /* CLI log file. GIGO. */ + u8 *log_filename; + int log_fd; + + /* Don't put CLI connections into character mode */ + int cli_line_mode; + + /* Maximum amount of command line history to keep per session */ + u32 cli_history_limit; + + /* Suppress the welcome banner at CLI session start */ + int cli_no_banner; + + /* Maximum pager buffer size */ + u32 cli_pager_buffer_limit; + + /* Suppress the pager */ + int cli_no_pager; + + /* Store the original state of stdin when it's a tty */ + struct termios tio_stdin; + int tio_isset; +} unix_main_t; + +/* Global main structure. 
*/ +extern unix_main_t unix_main; + +always_inline uword +unix_file_add (unix_main_t * um, unix_file_t * template) +{ + unix_file_t *f; + pool_get (um->file_pool, f); + f[0] = template[0]; + um->file_update (f, UNIX_FILE_UPDATE_ADD); + return f - um->file_pool; +} + +always_inline void +unix_file_del (unix_main_t * um, unix_file_t * f) +{ + um->file_update (f, UNIX_FILE_UPDATE_DELETE); + close (f->file_descriptor); + f->file_descriptor = ~0; + pool_put (um->file_pool, f); +} + +always_inline uword +unix_file_set_data_available_to_write (u32 unix_file_index, + uword is_available) +{ + unix_file_t *uf = pool_elt_at_index (unix_main.file_pool, unix_file_index); + uword was_available = (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); + if ((was_available != 0) != (is_available != 0)) + { + uf->flags ^= UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + unix_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY); + } + return was_available != 0; +} + +always_inline void +unix_save_error (unix_main_t * um, clib_error_t * error) +{ + unix_error_history_t *eh = um->error_history + um->error_history_index; + clib_error_free_vector (eh->error); + eh->error = error; + eh->time = vlib_time_now (um->vlib_main); + um->n_total_errors += 1; + if (++um->error_history_index >= ARRAY_LEN (um->error_history)) + um->error_history_index = 0; +} + +/* Main function for Unix VLIB. */ +int vlib_unix_main (int argc, char *argv[]); + +/* Call to allocate/initialize physical DMA memory subsystem. + This is not an init function so that users can explicitly enable/disable + physmem when its not needed. */ +clib_error_t *unix_physmem_init (vlib_main_t * vm, + int fail_if_physical_memory_not_present); + +static inline int +unix_physmem_is_fake (vlib_main_t * vm) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + return vpm->is_fake; +} + +/* Set prompt for CLI. 
*/ +void vlib_unix_cli_set_prompt (char *prompt); + +static inline unix_main_t * +vlib_unix_get_main (void) +{ + return &unix_main; +} + +/* thread stack array; vec_len = max number of threads */ +extern u8 **vlib_thread_stacks; + +/* utils */ + +clib_error_t *vlib_sysfs_write (char *file_name, char *fmt, ...); + +clib_error_t *vlib_sysfs_read (char *file_name, char *fmt, ...); + +u8 *vlib_sysfs_link_to_name (char *link); + +int vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size); + +clib_error_t *foreach_directory_file (char *dir_name, + clib_error_t * (*f) (void *arg, + u8 * path_name, + u8 * file_name), + void *arg, int scan_dirs); + +#endif /* included_unix_unix_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/util.c b/src/vlib/unix/util.c new file mode 100644 index 00000000000..edc3e591baf --- /dev/null +++ b/src/vlib/unix/util.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * pci.c: Linux user space PCI bus management. 
+ * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vlib/vlib.h> +#include <vlib/unix/unix.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <dirent.h> + +clib_error_t * +foreach_directory_file (char *dir_name, + clib_error_t * (*f) (void *arg, u8 * path_name, + u8 * file_name), void *arg, + int scan_dirs) +{ + DIR *d; + struct dirent *e; + clib_error_t *error = 0; + u8 *s, *t; + + d = opendir (dir_name); + if (!d) + { + if (errno == ENOENT) + return 0; + return clib_error_return_unix (0, "open `%s'", dir_name); + } + + s = t = 0; + while (1) + { + e = readdir (d); + if (!e) + break; + if (scan_dirs) + { + if (e->d_type == DT_DIR + && (!strcmp (e->d_name, ".") || !strcmp (e->d_name, ".."))) + continue; + } + else + { + if (e->d_type == DT_DIR) + continue; + } + + s = format (s, "%s/%s", dir_name, e->d_name); + t = format (t, "%s", e->d_name); + error = f (arg, s, t); + _vec_len (s) = 0; + _vec_len (t) = 0; + + if (error) + break; + } + + vec_free (s); + closedir (d); + + return error; +} + +clib_error_t * +vlib_sysfs_write (char *file_name, char *fmt, ...) +{ + u8 *s; + int fd; + clib_error_t *error = 0; + + fd = open (file_name, O_WRONLY); + if (fd < 0) + return clib_error_return_unix (0, "open `%s'", file_name); + + va_list va; + va_start (va, fmt); + s = va_format (0, fmt, &va); + va_end (va); + + if (write (fd, s, vec_len (s)) < 0) + error = clib_error_return_unix (0, "write `%s'", file_name); + + vec_free (s); + close (fd); + return error; +} + +clib_error_t * +vlib_sysfs_read (char *file_name, char *fmt, ...) 
+{ + unformat_input_t input; + u8 *s = 0; + int fd; + ssize_t sz; + uword result; + + fd = open (file_name, O_RDONLY); + if (fd < 0) + return clib_error_return_unix (0, "open `%s'", file_name); + + vec_validate (s, 4095); + + sz = read (fd, s, vec_len (s)); + if (sz < 0) + { + close (fd); + vec_free (s); + return clib_error_return_unix (0, "read `%s'", file_name); + } + + _vec_len (s) = sz; + unformat_init_vector (&input, s); + + va_list va; + va_start (va, fmt); + result = va_unformat (&input, fmt, &va); + va_end (va); + + vec_free (s); + close (fd); + + if (result == 0) + return clib_error_return (0, "unformat error"); + + return 0; +} + +u8 * +vlib_sysfs_link_to_name (char *link) +{ + char *p, buffer[64]; + unformat_input_t in; + u8 *s = 0; + int r; + + r = readlink (link, buffer, sizeof (buffer) - 1); + + if (r < 0) + return 0; + + buffer[r] = 0; + p = strrchr (buffer, '/'); + + if (!p) + return 0; + + unformat_init_string (&in, p + 1, strlen (p + 1)); + if (unformat (&in, "%s", &s) != 1) + clib_unix_warning ("no string?"); + unformat_free (&in); + + return s; +} + +int +vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size) +{ + struct stat sb; + u8 *p = 0; + int r = -1; + + p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); + + if (stat ((char *) p, &sb) == 0) + { + if (S_ISDIR (sb.st_mode) == 0) + goto done; + } + else if (numa_node == 0) + { + vec_reset_length (p); + p = format (p, "/sys/kernel/mm%c", 0); + if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) + goto done; + } + else + goto done; + + _vec_len (p) -= 1; + p = format (p, "/hugepages/hugepages-%ukB/free_hugepages%c", page_size, 0); + vlib_sysfs_read ((char *) p, "%d", &r); + +done: + vec_free (p); + return r; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/vlib.h b/src/vlib/vlib.h new file mode 100644 index 00000000000..b146a49b7f2 --- /dev/null +++ 
b/src/vlib/vlib.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * vlib.h: top-level include file + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_h +#define included_vlib_h + +#include <vppinfra/clib.h> +#include <vppinfra/elf_clib.h> + +/* Generic definitions. 
*/ +#include <vlib/defs.h> + +/* Forward declarations of structs to avoid circular dependencies. */ +struct vlib_main_t; + +/* All includes in alphabetical order. */ +#include <vlib/buffer.h> +#include <vlib/cli.h> +#include <vlib/counter.h> +#include <vlib/error.h> +#include <vlib/init.h> +#include <vlib/mc.h> +#include <vlib/node.h> +#include <vlib/physmem.h> +#include <vlib/trace.h> + +/* Main include depends on other vlib/ includes so we put it last. */ +#include <vlib/main.h> + +/* Inline/extern function declarations. */ +#include <vlib/threads.h> +#include <vlib/buffer_funcs.h> +#include <vlib/cli_funcs.h> +#include <vlib/error_funcs.h> +#include <vlib/format_funcs.h> +#include <vlib/node_funcs.h> +#include <vlib/trace_funcs.h> +#include <vlib/global_funcs.h> + +#include <vlib/buffer_node.h> + +#endif /* included_vlib_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/vlib_process_doc.h b/src/vlib/vlib_process_doc.h new file mode 100644 index 00000000000..a47c5e4bbe4 --- /dev/null +++ b/src/vlib/vlib_process_doc.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#error do not #include this file! + +/** \file + + Cooperative multi-tasking thread support. + + Vlib provides a lightweight cooperative multi-tasking thread + model. Context switching costs a setjmp/longjmp pair.
It's not + unreasonable to put vlib threads to sleep for 10us. + + The graph node scheduler invokes these processes in much the same + way as traditional vector-processing run-to-completion graph + nodes; plus-or-minus a setjmp/longjmp pair required to switch + stacks. Simply set the vlib_node_registration_t type field to + VLIB_NODE_TYPE_PROCESS. Process is a misnomer; these are threads. + + As of this writing, the default stack size is 1<<15; + 32kb. Initialize the node registration's + process_log2_n_stack_bytes member as needed. The graph node + dispatcher makes some effort to detect stack overrun. We map a + no-access page below each thread stack. + + Process node dispatch functions are expected to be while(1) { } + loops which suspend when not otherwise occupied, and which must + not run for unreasonably long periods of time. Unreasonably long + is an application-dependent concept. Over the years, we have + constructed frame-size sensitive control-plane nodes which will + use a much higher fraction of the available CPU bandwidth when the + frame size is low. Classic example: modifying forwarding + tables. So long as the table-builder leaves the forwarding tables + in a valid state, one can suspend the table builder to avoid + dropping packets as a result of control-plane activity. + + Process nodes can suspend for fixed amounts of time, or until another + entity signals an event, or both. See the example below. + + When running in VLIB process context, one must pay strict attention to + loop invariant issues. If one walks a data structure and calls a + function which may suspend, one had best know by construction that it + cannot change. Often, it's best to simply make a snapshot copy of a + data structure, walk the copy at leisure, then free the copy.
+ + Here's an example: + + <code><pre> + \#define EXAMPLE_POLL_PERIOD 10.0 + + static uword + example_process (vlib_main_t * vm, vlib_node_runtime_t * rt, + vlib_frame_t * f) + { + f64 poll_time_remaining; + uword event_type, *event_data = 0; + + poll_time_remaining = EXAMPLE_POLL_PERIOD; + while (1) + { + int i; + + // Sleep until next periodic call due, + // or until we receive event(s) + // + poll_time_remaining = + vlib_process_wait_for_event_or_clock (vm, poll_time_remaining); + + event_type = vlib_process_get_events (vm, &event_data); + switch (event_type) + { + case ~0: // no events => timeout + break; + + case EVENT1: + for (i = 0; i < vec_len (event_data); i++) + handle_event1 (vm, event_data[i]); + break; + + case EVENT2: + for (i = 0; i < vec_len (event_data); i++) + handle_event2 (vm, event_data[i]); + break; + + // ... and so forth for each event type + + default: + // This should never happen... + clib_warning ("BUG: unhandled event type %d", + event_type); + break; + } + vec_reset_length (event_data); + + // Timer expired, call periodic function + if (vlib_process_suspend_time_is_zero (poll_time_remaining)) + { + example_periodic (vm); + poll_time_remaining = EXAMPLE_POLL_PERIOD; + } + } + // NOTREACHED + return 0; + } + + static VLIB_REGISTER_NODE (example_node) = { + .function = example_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "example-process", + }; + </pre></code> + + In this example, the VLIB process node waits for an event to + occur, or for 10 seconds to elapse. The code demuxes on the event + type, calling the appropriate handler function. + + Each call to vlib_process_get_events returns a vector of + per-event-type data passed to successive vlib_process_signal_event + calls; vec_len (event_data) >= 1. It is an error to process only + event_data[0]. + + Resetting the event_data vector-length to 0 by calling + vec_reset_length (event_data) - instead of calling vec_free (...)
+ - means that the event scheme doesn't burn cycles continuously + allocating and freeing the event data vector. This is a common + coding pattern, well worth using when appropriate. +*/ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ |