From 7cd468a3d7dee7d6c92f69a0bb7061ae208ec727 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 19 Dec 2016 23:05:39 +0100 Subject: Reorganize source tree to use single autotools instance Change-Id: I7b51f88292e057c6443b12224486f2d0c9f8ae23 Signed-off-by: Damjan Marion --- src/vlib/buffer.c | 1987 ++++++++++++++++++++++++++++ src/vlib/buffer.h | 417 ++++++ src/vlib/buffer_funcs.h | 755 +++++++++++ src/vlib/buffer_node.h | 337 +++++ src/vlib/cli.c | 1173 +++++++++++++++++ src/vlib/cli.h | 192 +++ src/vlib/cli_funcs.h | 58 + src/vlib/counter.c | 151 +++ src/vlib/counter.h | 379 ++++++ src/vlib/defs.h | 82 ++ src/vlib/dir.dox | 23 + src/vlib/elog_samples.c | 122 ++ src/vlib/error.c | 338 +++++ src/vlib/error.h | 101 ++ src/vlib/error_funcs.h | 88 ++ src/vlib/format.c | 196 +++ src/vlib/format_funcs.h | 75 ++ src/vlib/global_funcs.h | 45 + src/vlib/i2c.c | 231 ++++ src/vlib/i2c.h | 67 + src/vlib/init.c | 168 +++ src/vlib/init.h | 238 ++++ src/vlib/lex.c | 271 ++++ src/vlib/lex.h | 145 +++ src/vlib/main.c | 1703 ++++++++++++++++++++++++ src/vlib/main.h | 333 +++++ src/vlib/mc.c | 2609 +++++++++++++++++++++++++++++++++++++ src/vlib/mc.h | 687 ++++++++++ src/vlib/node.c | 631 +++++++++ src/vlib/node.h | 725 +++++++++++ src/vlib/node_cli.c | 466 +++++++ src/vlib/node_format.c | 187 +++ src/vlib/node_funcs.h | 1130 ++++++++++++++++ src/vlib/parse.c | 1007 +++++++++++++++ src/vlib/parse.h | 221 ++++ src/vlib/parse_builtin.c | 150 +++ src/vlib/pci/linux_pci.c | 642 ++++++++++ src/vlib/pci/pci.c | 264 ++++ src/vlib/pci/pci.h | 251 ++++ src/vlib/pci/pci_config.h | 731 +++++++++++ src/vlib/physmem.h | 108 ++ src/vlib/threads.c | 1492 +++++++++++++++++++++ src/vlib/threads.h | 470 +++++++ src/vlib/threads_cli.c | 579 +++++++++ src/vlib/trace.c | 545 ++++++++ src/vlib/trace.h | 100 ++ src/vlib/trace_funcs.h | 185 +++ src/vlib/unix/cj.c | 271 ++++ src/vlib/unix/cj.h | 79 ++ src/vlib/unix/cli.c | 2989 +++++++++++++++++++++++++++++++++++++++++++ src/vlib/unix/dir.dox | 28 + src/vlib/unix/input.c | 265 ++++ src/vlib/unix/main.c | 557 ++++++++ src/vlib/unix/mc_socket.c | 1049 +++++++++++++++ src/vlib/unix/mc_socket.h | 137 ++ src/vlib/unix/physmem.c | 470 +++++++ src/vlib/unix/physmem.h | 65 + src/vlib/unix/plugin.c | 260 ++++ src/vlib/unix/plugin.h | 98 ++ src/vlib/unix/unix.h | 232 ++++ src/vlib/unix/util.c | 231 ++++ src/vlib/vlib.h | 86 ++ src/vlib/vlib_process_doc.h | 147 +++ 63 files changed, 29819 insertions(+) create mode 100644 src/vlib/buffer.c create mode 100644 src/vlib/buffer.h create mode 100644 src/vlib/buffer_funcs.h create mode 100644 src/vlib/buffer_node.h create mode 100644 src/vlib/cli.c create mode 100644 src/vlib/cli.h create mode 100644 src/vlib/cli_funcs.h create mode 100644 src/vlib/counter.c create mode 100644 src/vlib/counter.h create mode 100644 src/vlib/defs.h create mode 100644 src/vlib/dir.dox create mode 100644 src/vlib/elog_samples.c create mode 100644 src/vlib/error.c create mode 100644 src/vlib/error.h create mode 100644 src/vlib/error_funcs.h create mode 100644 src/vlib/format.c create mode 100644 src/vlib/format_funcs.h create mode 100644 src/vlib/global_funcs.h create mode 100644 src/vlib/i2c.c create mode 100644 src/vlib/i2c.h create mode 100644 src/vlib/init.c create mode 100644 src/vlib/init.h create mode 100644 src/vlib/lex.c create mode 100644 src/vlib/lex.h create mode 100644 src/vlib/main.c create mode 100644 src/vlib/main.h create mode 100644 src/vlib/mc.c create mode 100644 src/vlib/mc.h create mode 100644 src/vlib/node.c create mode 100644 src/vlib/node.h create mode 100644 src/vlib/node_cli.c create mode 100644 src/vlib/node_format.c create mode 100644 src/vlib/node_funcs.h create mode 100644 src/vlib/parse.c create mode 100644 src/vlib/parse.h create mode 100644 src/vlib/parse_builtin.c create mode 100644 src/vlib/pci/linux_pci.c create mode 100644 src/vlib/pci/pci.c create mode 100644 src/vlib/pci/pci.h create mode 100644 src/vlib/pci/pci_config.h create mode 100644 src/vlib/physmem.h create mode 100644 src/vlib/threads.c create mode 100644 src/vlib/threads.h create mode 100644 src/vlib/threads_cli.c create mode 100644 src/vlib/trace.c create mode 100644 src/vlib/trace.h create mode 100644 src/vlib/trace_funcs.h create mode 100644 src/vlib/unix/cj.c create mode 100644 src/vlib/unix/cj.h create mode 100644 src/vlib/unix/cli.c create mode 100644 src/vlib/unix/dir.dox create mode 100644 src/vlib/unix/input.c create mode 100644 src/vlib/unix/main.c create mode 100644 src/vlib/unix/mc_socket.c create mode 100644 src/vlib/unix/mc_socket.h create mode 100644 src/vlib/unix/physmem.c create mode 100644 src/vlib/unix/physmem.h create mode 100644 src/vlib/unix/plugin.c create mode 100644 src/vlib/unix/plugin.h create mode 100644 src/vlib/unix/unix.h create mode 100644 src/vlib/unix/util.c create mode 100644 src/vlib/vlib.h create mode 100644 src/vlib/vlib_process_doc.h (limited to 'src/vlib') diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c new file mode 100644 index 00000000..4bf6d125 --- /dev/null +++ b/src/vlib/buffer.c @@ -0,0 +1,1987 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * buffer.c: allocate/free network buffers. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @cond (!DPDK) + * @file + * + * Allocate/free network buffers. + */ + +#if DPDK > 0 +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#include + +#if DPDK > 0 +#pragma weak rte_mem_virt2phy +#pragma weak rte_eal_has_hugepages +#pragma weak rte_socket_id +#pragma weak rte_pktmbuf_pool_create +#endif + +uword +vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, + vlib_buffer_t * b_first) +{ + vlib_buffer_t *b = b_first; + uword l_first = b_first->current_length; + uword l = 0; + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + { + b = vlib_get_buffer (vm, b->next_buffer); + l += b->current_length; + } + b_first->total_length_not_including_first_buffer = l; + b_first->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + return l + l_first; +} + +u8 * +format_vlib_buffer (u8 * s, va_list * args) +{ + vlib_buffer_t *b = va_arg (*args, vlib_buffer_t *); +#if DPDK > 0 + uword indent = format_get_indent (s); + + s = format (s, "current data %d, length %d, free-list %d", + b->current_data, b->current_length, b->free_list_index); + + if (b->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID) + s = format (s, ", totlen-nifb %d", + b->total_length_not_including_first_buffer); + + if (b->flags & VLIB_BUFFER_IS_TRACED) + s = format (s, ", trace 0x%x", b->trace_index); + + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + { + vlib_main_t *vm = vlib_get_main (); + u32 next_buffer = b->next_buffer; + b = vlib_get_buffer (vm, next_buffer); + + s = format (s, "\n%Unext-buffer 0x%x, segment length %d", + format_white_space, indent, next_buffer, b->current_length); + } + +#else + + s = format (s, "current data %d, length %d, free-list %d", + b->current_data, b->current_length, b->free_list_index); + + if (b->flags & VLIB_BUFFER_IS_TRACED) + s = format (s, ", trace 0x%x", b->trace_index); + + if (b->flags & VLIB_BUFFER_NEXT_PRESENT) + s = format (s, ", next-buffer 0x%x", b->next_buffer); +#endif + + return s; +} + +u8 * +format_vlib_buffer_and_data (u8 * s, va_list * args) +{ + vlib_buffer_t *b = va_arg (*args, vlib_buffer_t *); + + s = format (s, "%U, %U", + format_vlib_buffer, b, + format_hex_bytes, vlib_buffer_get_current (b), 64); + + return s; +} + +#if DPDK == 0 +static u8 * +format_vlib_buffer_known_state (u8 * s, va_list * args) +{ + vlib_buffer_known_state_t state = va_arg (*args, vlib_buffer_known_state_t); + char *t; + + switch (state) + { + case VLIB_BUFFER_UNKNOWN: + t = "unknown"; + break; + + case VLIB_BUFFER_KNOWN_ALLOCATED: + t = "known-allocated"; + break; + + case VLIB_BUFFER_KNOWN_FREE: + t = "known-free"; + break; + + default: + t = "invalid"; + break; + } + + return format (s, "%s", t); +} +#endif + +u8 * +format_vlib_buffer_contents (u8 * s, va_list * va) +{ + vlib_main_t *vm = va_arg (*va, vlib_main_t *); + vlib_buffer_t *b = va_arg (*va, vlib_buffer_t *); + + while (1) + { + vec_add (s, vlib_buffer_get_current (b), b->current_length); + if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) + break; + b = vlib_get_buffer (vm, b->next_buffer); + } + + return s; +} + +#if DPDK == 0 +static u8 * +vlib_validate_buffer_helper (vlib_main_t * vm, + u32 bi, + uword follow_buffer_next, uword ** unique_hash) +{ + vlib_buffer_t *b = vlib_get_buffer (vm, bi); + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *fl; + + if (pool_is_free_index (bm->buffer_free_list_pool, b->free_list_index)) + return format (0, "unknown free list 0x%x", b->free_list_index); + + fl = pool_elt_at_index (bm->buffer_free_list_pool, b->free_list_index); + + if ((signed) b->current_data < (signed) -VLIB_BUFFER_PRE_DATA_SIZE) + return format (0, "current data %d before pre-data", b->current_data); +#if DPDK == 0 + if (b->current_data + b->current_length > fl->n_data_bytes) + return format (0, "%d-%d beyond end of buffer %d", + b->current_data, b->current_length, fl->n_data_bytes); +#endif + + if (follow_buffer_next && (b->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + vlib_buffer_known_state_t k; + u8 *msg, *result; + + k = vlib_buffer_is_known (vm, b->next_buffer); + if (k != VLIB_BUFFER_KNOWN_ALLOCATED) + return format (0, "next 0x%x: %U", + b->next_buffer, format_vlib_buffer_known_state, k); + + if (unique_hash) + { + if (hash_get (*unique_hash, b->next_buffer)) + return format (0, "duplicate buffer 0x%x", b->next_buffer); + + hash_set1 (*unique_hash, b->next_buffer); + } + + msg = vlib_validate_buffer (vm, b->next_buffer, follow_buffer_next); + if (msg) + { + result = format (0, "next 0x%x: %v", b->next_buffer, msg); + vec_free (msg); + return result; + } + } + + return 0; +} + +u8 * +vlib_validate_buffer (vlib_main_t * vm, u32 bi, uword follow_buffer_next) +{ + return vlib_validate_buffer_helper (vm, bi, follow_buffer_next, + /* unique_hash */ 0); +} + +u8 * +vlib_validate_buffers (vlib_main_t * vm, + u32 * buffers, + uword next_buffer_stride, + uword n_buffers, + vlib_buffer_known_state_t known_state, + uword follow_buffer_next) +{ + uword i, *hash; + u32 bi, *b = buffers; + vlib_buffer_known_state_t k; + u8 *msg = 0, *result = 0; + + hash = hash_create (0, 0); + for (i = 0; i < n_buffers; i++) + { + bi = b[0]; + b += next_buffer_stride; + + /* Buffer is not unique. */ + if (hash_get (hash, bi)) + { + msg = format (0, "not unique"); + goto done; + } + + k = vlib_buffer_is_known (vm, bi); + if (k != known_state) + { + msg = format (0, "is %U; expected %U", + format_vlib_buffer_known_state, k, + format_vlib_buffer_known_state, known_state); + goto done; + } + + msg = vlib_validate_buffer_helper (vm, bi, follow_buffer_next, &hash); + if (msg) + goto done; + + hash_set1 (hash, bi); + } + +done: + if (msg) + { + result = format (0, "0x%x: %v", bi, msg); + vec_free (msg); + } + hash_free (hash); + return result; +} +#endif + +vlib_main_t **vlib_mains; + +#if DPDK == 0 +/* When dubugging validate that given buffers are either known allocated + or known free. */ +static void +vlib_buffer_validate_alloc_free (vlib_main_t * vm, + u32 * buffers, + uword n_buffers, + vlib_buffer_known_state_t expected_state) +{ + u32 *b; + uword i, bi, is_free; + + if (CLIB_DEBUG == 0) + return; + + ASSERT (os_get_cpu_number () == 0); + + /* smp disaster check */ + if (vlib_mains) + ASSERT (vm == vlib_mains[0]); + + is_free = expected_state == VLIB_BUFFER_KNOWN_ALLOCATED; + b = buffers; + for (i = 0; i < n_buffers; i++) + { + vlib_buffer_known_state_t known; + + bi = b[0]; + b += 1; + known = vlib_buffer_is_known (vm, bi); + if (known != expected_state) + { + ASSERT (0); + vlib_panic_with_msg + (vm, "%s %U buffer 0x%x", + is_free ? "freeing" : "allocating", + format_vlib_buffer_known_state, known, bi); + } + + vlib_buffer_set_known_state + (vm, bi, + is_free ? VLIB_BUFFER_KNOWN_FREE : VLIB_BUFFER_KNOWN_ALLOCATED); + } +} +#endif + +#define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32)) + +/* Make sure we have at least given number of unaligned buffers. */ +static void +fill_unaligned (vlib_main_t * vm, + vlib_buffer_free_list_t * free_list, + uword n_unaligned_buffers) +{ + word la = vec_len (free_list->aligned_buffers); + word lu = vec_len (free_list->unaligned_buffers); + + /* Aligned come in aligned copy-sized chunks. */ + ASSERT (la % BUFFERS_PER_COPY == 0); + + ASSERT (la >= n_unaligned_buffers); + + while (lu < n_unaligned_buffers) + { + /* Copy 4 buffers from end of aligned vector to unaligned vector. */ + vec_add (free_list->unaligned_buffers, + free_list->aligned_buffers + la - BUFFERS_PER_COPY, + BUFFERS_PER_COPY); + la -= BUFFERS_PER_COPY; + lu += BUFFERS_PER_COPY; + } + _vec_len (free_list->aligned_buffers) = la; +} + +/* After free aligned buffers may not contain even sized chunks. */ +static void +trim_aligned (vlib_buffer_free_list_t * f) +{ + uword l, n_trim; + + /* Add unaligned to aligned before trim. */ + l = vec_len (f->unaligned_buffers); + if (l > 0) + { + vec_add_aligned (f->aligned_buffers, f->unaligned_buffers, l, + /* align */ sizeof (vlib_copy_unit_t)); + + _vec_len (f->unaligned_buffers) = 0; + } + + /* Remove unaligned buffers from end of aligned vector and save for next trim. */ + l = vec_len (f->aligned_buffers); + n_trim = l % BUFFERS_PER_COPY; + if (n_trim) + { + /* Trim aligned -> unaligned. */ + vec_add (f->unaligned_buffers, f->aligned_buffers + l - n_trim, n_trim); + + /* Remove from aligned. */ + _vec_len (f->aligned_buffers) = l - n_trim; + } +} + +static void +merge_free_lists (vlib_buffer_free_list_t * dst, + vlib_buffer_free_list_t * src) +{ + uword l; + u32 *d; + + trim_aligned (src); + trim_aligned (dst); + + l = vec_len (src->aligned_buffers); + if (l > 0) + { + vec_add2_aligned (dst->aligned_buffers, d, l, + /* align */ sizeof (vlib_copy_unit_t)); + clib_memcpy (d, src->aligned_buffers, l * sizeof (d[0])); + vec_free (src->aligned_buffers); + } + + l = vec_len (src->unaligned_buffers); + if (l > 0) + { + vec_add (dst->unaligned_buffers, src->unaligned_buffers, l); + vec_free (src->unaligned_buffers); + } +} + +always_inline u32 +vlib_buffer_get_free_list_with_size (vlib_main_t * vm, u32 size) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + size = vlib_buffer_round_size (size); + uword *p = hash_get (bm->free_list_by_size, size); + return p ? p[0] : ~0; +} + +/* Add buffer free list. */ +static u32 +vlib_buffer_create_free_list_helper (vlib_main_t * vm, + u32 n_data_bytes, + u32 is_public, u32 is_default, u8 * name) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; +#if DPDK > 0 + int i; + + ASSERT (os_get_cpu_number () == 0); + + if (!is_default && pool_elts (bm->buffer_free_list_pool) == 0) + { + u32 default_free_free_list_index; + + /* *INDENT-OFF* */ + default_free_free_list_index = + vlib_buffer_create_free_list_helper + (vm, + /* default buffer size */ VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, + /* is_public */ 1, + /* is_default */ 1, + (u8 *) "default"); + /* *INDENT-ON* */ + ASSERT (default_free_free_list_index == + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + + if (n_data_bytes == VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES && is_public) + return default_free_free_list_index; + } + + pool_get_aligned (bm->buffer_free_list_pool, f, CLIB_CACHE_LINE_BYTES); + + memset (f, 0, sizeof (f[0])); + f->index = f - bm->buffer_free_list_pool; + f->n_data_bytes = vlib_buffer_round_size (n_data_bytes); + f->min_n_buffers_each_physmem_alloc = 16; + f->name = clib_mem_is_heap_object (name) ? name : format (0, "%s", name); + + /* Setup free buffer template. */ + f->buffer_init_template.free_list_index = f->index; + + if (is_public) + { + uword *p = hash_get (bm->free_list_by_size, f->n_data_bytes); + if (!p) + hash_set (bm->free_list_by_size, f->n_data_bytes, f->index); + } + + for (i = 1; i < vec_len (vlib_mains); i++) + { + vlib_buffer_main_t *wbm = vlib_mains[i]->buffer_main; + vlib_buffer_free_list_t *wf; + pool_get_aligned (wbm->buffer_free_list_pool, + wf, CLIB_CACHE_LINE_BYTES); + ASSERT (f - bm->buffer_free_list_pool == + wf - wbm->buffer_free_list_pool); + wf[0] = f[0]; + wf->aligned_buffers = 0; + wf->unaligned_buffers = 0; + wf->n_alloc = 0; + } +#else + + if (!is_default && pool_elts (bm->buffer_free_list_pool) == 0) + { + u32 default_free_free_list_index; + + default_free_free_list_index = vlib_buffer_create_free_list_helper (vm, + /* default buffer size */ + VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, + /* is_public */ + 1, + /* is_default */ + 1, + (u8 + *) + "default"); + ASSERT (default_free_free_list_index == + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + + if (n_data_bytes == VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES && is_public) + return default_free_free_list_index; + } + + pool_get_aligned (bm->buffer_free_list_pool, f, CLIB_CACHE_LINE_BYTES); + + memset (f, 0, sizeof (f[0])); + f->index = f - bm->buffer_free_list_pool; + f->n_data_bytes = vlib_buffer_round_size (n_data_bytes); + f->min_n_buffers_each_physmem_alloc = 256; + f->name = clib_mem_is_heap_object (name) ? name : format (0, "%s", name); + + /* Setup free buffer template. */ + f->buffer_init_template.free_list_index = f->index; + + if (is_public) + { + uword *p = hash_get (bm->free_list_by_size, f->n_data_bytes); + if (!p) + hash_set (bm->free_list_by_size, f->n_data_bytes, f->index); + } +#endif + + return f->index; +} + +u32 +vlib_buffer_create_free_list (vlib_main_t * vm, u32 n_data_bytes, + char *fmt, ...) +{ + va_list va; + u8 *name; + + va_start (va, fmt); + name = va_format (0, fmt, &va); + va_end (va); + + return vlib_buffer_create_free_list_helper (vm, n_data_bytes, + /* is_public */ 0, + /* is_default */ 0, + name); +} + +u32 +vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes, + char *fmt, ...) +{ + u32 i = vlib_buffer_get_free_list_with_size (vm, n_data_bytes); + + if (i == ~0) + { + va_list va; + u8 *name; + + va_start (va, fmt); + name = va_format (0, fmt, &va); + va_end (va); + + i = vlib_buffer_create_free_list_helper (vm, n_data_bytes, + /* is_public */ 1, + /* is_default */ 0, + name); + } + + return i; +} + +static void +del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) +{ + u32 i; +#if DPDK > 0 + struct rte_mbuf *mb; + vlib_buffer_t *b; + + for (i = 0; i < vec_len (f->unaligned_buffers); i++) + { + b = vlib_get_buffer (vm, f->unaligned_buffers[i]); + mb = rte_mbuf_from_vlib_buffer (b); + ASSERT (rte_mbuf_refcnt_read (mb) == 1); + rte_pktmbuf_free (mb); + } + for (i = 0; i < vec_len (f->aligned_buffers); i++) + { + b = vlib_get_buffer (vm, f->aligned_buffers[i]); + mb = rte_mbuf_from_vlib_buffer (b); + ASSERT (rte_mbuf_refcnt_read (mb) == 1); + rte_pktmbuf_free (mb); + } + vec_free (f->name); +#else + + for (i = 0; i < vec_len (f->buffer_memory_allocated); i++) + vm->os_physmem_free (f->buffer_memory_allocated[i]); + vec_free (f->name); + vec_free (f->buffer_memory_allocated); +#endif + vec_free (f->unaligned_buffers); + vec_free (f->aligned_buffers); +} + +/* Add buffer free list. */ +void +vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; + u32 merge_index; +#if DPDK > 0 + int i; + + ASSERT (os_get_cpu_number () == 0); + + f = vlib_buffer_get_free_list (vm, free_list_index); + + merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); + if (merge_index != ~0 && merge_index != free_list_index) + { + merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool, + merge_index), f); + } + + del_free_list (vm, f); + + /* Poison it. */ + memset (f, 0xab, sizeof (f[0])); + + pool_put (bm->buffer_free_list_pool, f); + + for (i = 1; i < vec_len (vlib_mains); i++) + { + bm = vlib_mains[i]->buffer_main; + f = vlib_buffer_get_free_list (vlib_mains[i], free_list_index);; + memset (f, 0xab, sizeof (f[0])); + pool_put (bm->buffer_free_list_pool, f); + } +#else + + f = vlib_buffer_get_free_list (vm, free_list_index); + + ASSERT (vec_len (f->unaligned_buffers) + vec_len (f->aligned_buffers) == + f->n_alloc); + merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); + if (merge_index != ~0 && merge_index != free_list_index) + { + merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool, + merge_index), f); + } + + del_free_list (vm, f); + + /* Poison it. */ + memset (f, 0xab, sizeof (f[0])); + + pool_put (bm->buffer_free_list_pool, f); +#endif +} + +/* Make sure free list has at least given number of free buffers. */ +static uword +fill_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * fl, uword min_free_buffers) +{ +#if DPDK > 0 + vlib_buffer_t *b; + int n, i; + u32 bi; + u32 n_remaining = 0, n_alloc = 0; + unsigned socket_id = rte_socket_id ? rte_socket_id () : 0; + struct rte_mempool *rmp = vm->buffer_main->pktmbuf_pools[socket_id]; + struct rte_mbuf *mb; + + /* Too early? */ + if (PREDICT_FALSE (rmp == 0)) + return 0; + + trim_aligned (fl); + + /* Already have enough free buffers on free list? */ + n = min_free_buffers - vec_len (fl->aligned_buffers); + if (n <= 0) + return min_free_buffers; + + /* Always allocate round number of buffers. */ + n = round_pow2 (n, BUFFERS_PER_COPY); + + /* Always allocate new buffers in reasonably large sized chunks. */ + n = clib_max (n, fl->min_n_buffers_each_physmem_alloc); + + vec_validate (vm->mbuf_alloc_list, n - 1); + + if (rte_mempool_get_bulk (rmp, vm->mbuf_alloc_list, n) < 0) + return 0; + + _vec_len (vm->mbuf_alloc_list) = n; + + for (i = 0; i < n; i++) + { + mb = vm->mbuf_alloc_list[i]; + + ASSERT (rte_mbuf_refcnt_read (mb) == 0); + rte_mbuf_refcnt_set (mb, 1); + + b = vlib_buffer_from_rte_mbuf (mb); + bi = vlib_get_buffer_index (vm, b); + + vec_add1_aligned (fl->aligned_buffers, bi, sizeof (vlib_copy_unit_t)); + n_alloc++; + n_remaining--; + + vlib_buffer_init_for_free_list (b, fl); + + if (fl->buffer_init_function) + fl->buffer_init_function (vm, fl, &bi, 1); + } + + fl->n_alloc += n; + + return n; +#else + vlib_buffer_t *buffers, *b; + int n, n_bytes, i; + u32 *bi; + u32 n_remaining, n_alloc, n_this_chunk; + + trim_aligned (fl); + + /* Already have enough free buffers on free list? */ + n = min_free_buffers - vec_len (fl->aligned_buffers); + if (n <= 0) + return min_free_buffers; + + /* Always allocate round number of buffers. */ + n = round_pow2 (n, BUFFERS_PER_COPY); + + /* Always allocate new buffers in reasonably large sized chunks. */ + n = clib_max (n, fl->min_n_buffers_each_physmem_alloc); + + n_remaining = n; + n_alloc = 0; + while (n_remaining > 0) + { + n_this_chunk = clib_min (n_remaining, 16); + + n_bytes = n_this_chunk * (sizeof (b[0]) + fl->n_data_bytes); + + /* drb: removed power-of-2 ASSERT */ + buffers = vm->os_physmem_alloc_aligned (&vm->physmem_main, + n_bytes, + sizeof (vlib_buffer_t)); + if (!buffers) + return n_alloc; + + /* Record chunk as being allocated so we can free it later. */ + vec_add1 (fl->buffer_memory_allocated, buffers); + + fl->n_alloc += n_this_chunk; + n_alloc += n_this_chunk; + n_remaining -= n_this_chunk; + + b = buffers; + vec_add2_aligned (fl->aligned_buffers, bi, n_this_chunk, + sizeof (vlib_copy_unit_t)); + for (i = 0; i < n_this_chunk; i++) + { + bi[i] = vlib_get_buffer_index (vm, b); + + if (CLIB_DEBUG > 0) + vlib_buffer_set_known_state (vm, bi[i], VLIB_BUFFER_KNOWN_FREE); + b = vlib_buffer_next_contiguous (b, fl->n_data_bytes); + } + + memset (buffers, 0, n_bytes); + + /* Initialize all new buffers. */ + b = buffers; + for (i = 0; i < n_this_chunk; i++) + { + vlib_buffer_init_for_free_list (b, fl); + b = vlib_buffer_next_contiguous (b, fl->n_data_bytes); + } + + if (fl->buffer_init_function) + fl->buffer_init_function (vm, fl, bi, n_this_chunk); + } + return n_alloc; +#endif +} + +always_inline uword +copy_alignment (u32 * x) +{ + return (pointer_to_uword (x) / sizeof (x[0])) % BUFFERS_PER_COPY; +} + +static u32 +alloc_from_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * free_list, + u32 * alloc_buffers, u32 n_alloc_buffers) +{ + u32 *dst, *u_src; + uword u_len, n_left; + uword n_unaligned_start, n_unaligned_end, n_filled; + +#if DPDK == 0 + ASSERT (os_get_cpu_number () == 0); + +#endif + n_left = n_alloc_buffers; + dst = alloc_buffers; + n_unaligned_start = ((BUFFERS_PER_COPY - copy_alignment (dst)) + & (BUFFERS_PER_COPY - 1)); + + n_filled = fill_free_list (vm, free_list, n_alloc_buffers); + if (n_filled == 0) + return 0; + + n_left = n_filled < n_left ? n_filled : n_left; + n_alloc_buffers = n_left; + + if (n_unaligned_start >= n_left) + { + n_unaligned_start = n_left; + n_unaligned_end = 0; + } + else + n_unaligned_end = copy_alignment (dst + n_alloc_buffers); + + fill_unaligned (vm, free_list, n_unaligned_start + n_unaligned_end); + + u_len = vec_len (free_list->unaligned_buffers); + u_src = free_list->unaligned_buffers + u_len - 1; + + if (n_unaligned_start) + { + uword n_copy = n_unaligned_start; + if (n_copy > n_left) + n_copy = n_left; + n_left -= n_copy; + + while (n_copy > 0) + { + *dst++ = *u_src--; + n_copy--; + u_len--; + } + + /* Now dst should be aligned. */ + if (n_left > 0) + ASSERT (pointer_to_uword (dst) % sizeof (vlib_copy_unit_t) == 0); + } + + /* Aligned copy. */ + { + vlib_copy_unit_t *d, *s; + uword n_copy; + + if (vec_len (free_list->aligned_buffers) < + ((n_left / BUFFERS_PER_COPY) * BUFFERS_PER_COPY)) + abort (); + + n_copy = n_left / BUFFERS_PER_COPY; + n_left = n_left % BUFFERS_PER_COPY; + + /* Remove buffers from aligned free list. */ + _vec_len (free_list->aligned_buffers) -= n_copy * BUFFERS_PER_COPY; + + s = (vlib_copy_unit_t *) vec_end (free_list->aligned_buffers); + d = (vlib_copy_unit_t *) dst; + + /* Fast path loop. */ + while (n_copy >= 4) + { + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + n_copy -= 4; + s += 4; + d += 4; + } + + while (n_copy >= 1) + { + d[0] = s[0]; + n_copy -= 1; + s += 1; + d += 1; + } + + dst = (void *) d; + } + + /* Unaligned copy. */ + ASSERT (n_unaligned_end == n_left); + while (n_left > 0) + { + *dst++ = *u_src--; + n_left--; + u_len--; + } + + if (!free_list->unaligned_buffers) + ASSERT (u_len == 0); + else + _vec_len (free_list->unaligned_buffers) = u_len; + +#if DPDK == 0 + /* Verify that buffers are known free. */ + vlib_buffer_validate_alloc_free (vm, alloc_buffers, + n_alloc_buffers, VLIB_BUFFER_KNOWN_FREE); +#endif + + return n_alloc_buffers; +} + +/* Allocate a given number of buffers into given array. + Returns number actually allocated which will be either zero or + number requested. */ +u32 +vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_main_t *bm = vm->buffer_main; +#if DPDK == 0 + ASSERT (os_get_cpu_number () == 0); +#endif + + return alloc_from_free_list + (vm, + pool_elt_at_index (bm->buffer_free_list_pool, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX), + buffers, n_buffers); +} + +u32 +vlib_buffer_alloc_from_free_list (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; + f = pool_elt_at_index (bm->buffer_free_list_pool, free_list_index); + return alloc_from_free_list (vm, f, buffers, n_buffers); +} + +always_inline void +add_buffer_to_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * f, + u32 buffer_index, u8 do_init) +{ + vlib_buffer_t *b; + b = vlib_get_buffer (vm, buffer_index); + if (PREDICT_TRUE (do_init)) + vlib_buffer_init_for_free_list (b, f); + vec_add1_aligned (f->aligned_buffers, buffer_index, + sizeof (vlib_copy_unit_t)); +} + +always_inline vlib_buffer_free_list_t * +buffer_get_free_list (vlib_main_t * vm, vlib_buffer_t * b, u32 * index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + u32 i; + + *index = i = b->free_list_index; + return pool_elt_at_index (bm->buffer_free_list_pool, i); +} + +void * +vlib_set_buffer_free_callback (vlib_main_t * vm, void *fp) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + void *rv = bm->buffer_free_callback; + + bm->buffer_free_callback = fp; + return rv; +} + +#if DPDK == 0 +void vnet_buffer_free_dpdk_mb (vlib_buffer_t * b) __attribute__ ((weak)); +void +vnet_buffer_free_dpdk_mb (vlib_buffer_t * b) +{ +} + +#endif +static_always_inline void +vlib_buffer_free_inline (vlib_main_t * vm, + u32 * buffers, u32 n_buffers, u32 follow_buffer_next) +{ +#if DPDK > 0 + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *fl; + u32 fi; + int i; + u32 (*cb) (vlib_main_t * vm, u32 * buffers, u32 n_buffers, + u32 follow_buffer_next); + + cb = bm->buffer_free_callback; + + if (PREDICT_FALSE (cb != 0)) + n_buffers = (*cb) (vm, buffers, n_buffers, follow_buffer_next); + + if (!n_buffers) + return; + + for (i = 0; i < n_buffers; i++) + { + vlib_buffer_t *b; + struct rte_mbuf *mb; + + b = vlib_get_buffer (vm, buffers[i]); + + fl = buffer_get_free_list (vm, b, &fi); + + /* The only current use of this callback: multicast recycle */ + if (PREDICT_FALSE (fl->buffers_added_to_freelist_function != 0)) + { + int j; + + add_buffer_to_free_list + (vm, fl, buffers[i], (b->flags & VLIB_BUFFER_RECYCLE) == 0); + + for (j = 0; j < vec_len (bm->announce_list); j++) + { + if (fl == bm->announce_list[j]) + goto already_announced; + } + vec_add1 (bm->announce_list, fl); + already_announced: + ; + } + else + { + if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_RECYCLE) == 0)) + { + mb = rte_mbuf_from_vlib_buffer (b); + ASSERT (rte_mbuf_refcnt_read (mb) == 1); + rte_pktmbuf_free (mb); + } + } + } + if (vec_len (bm->announce_list)) + { + vlib_buffer_free_list_t *fl; + for (i = 0; i < vec_len (bm->announce_list); i++) + { + fl = bm->announce_list[i]; + fl->buffers_added_to_freelist_function (vm, fl); + } + _vec_len (bm->announce_list) = 0; + } +#else + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *fl; + static u32 *next_to_free[2]; /* smp bad */ + u32 i_next_to_free, *b, *n, *f, fi; + uword n_left; + int i; + static vlib_buffer_free_list_t **announce_list; + vlib_buffer_free_list_t *fl0 = 0, *fl1 = 0; + u32 bi0 = (u32) ~ 0, bi1 = (u32) ~ 0, fi0, fi1 = (u32) ~ 0; + u8 free0, free1 = 0, free_next0, free_next1; + u32 (*cb) (vlib_main_t * vm, u32 * buffers, u32 n_buffers, + u32 follow_buffer_next); + + ASSERT (os_get_cpu_number () == 0); + + cb = bm->buffer_free_callback; + + if (PREDICT_FALSE (cb != 0)) + n_buffers = (*cb) (vm, buffers, n_buffers, follow_buffer_next); + + if (!n_buffers) + return; + + /* Use first buffer to get default free list. */ + { + u32 bi0 = buffers[0]; + vlib_buffer_t *b0; + + b0 = vlib_get_buffer (vm, bi0); + fl = buffer_get_free_list (vm, b0, &fi); + if (fl->buffers_added_to_freelist_function) + vec_add1 (announce_list, fl); + } + + vec_validate (next_to_free[0], n_buffers - 1); + vec_validate (next_to_free[1], n_buffers - 1); + + i_next_to_free = 0; + n_left = n_buffers; + b = buffers; + +again: + /* Verify that buffers are known allocated. */ + vlib_buffer_validate_alloc_free (vm, b, + n_left, VLIB_BUFFER_KNOWN_ALLOCATED); + + vec_add2_aligned (fl->aligned_buffers, f, n_left, + /* align */ sizeof (vlib_copy_unit_t)); + + n = next_to_free[i_next_to_free]; + while (n_left >= 4) + { + vlib_buffer_t *b0, *b1, *binit0, *binit1, dummy_buffers[2]; + + bi0 = b[0]; + bi1 = b[1]; + + f[0] = bi0; + f[1] = bi1; + f += 2; + b += 2; + n_left -= 2; + + /* Prefetch buffers for next iteration. */ + vlib_prefetch_buffer_with_index (vm, b[0], WRITE); + vlib_prefetch_buffer_with_index (vm, b[1], WRITE); + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + free0 = (b0->flags & VLIB_BUFFER_RECYCLE) == 0; + free1 = (b1->flags & VLIB_BUFFER_RECYCLE) == 0; + + /* Must be before init which will over-write buffer flags. */ + if (follow_buffer_next) + { + n[0] = b0->next_buffer; + free_next0 = free0 && (b0->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; + n += free_next0; + + n[0] = b1->next_buffer; + free_next1 = free1 && (b1->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; + n += free_next1; + } + else + free_next0 = free_next1 = 0; + + /* Must be before init which will over-write buffer free list. */ + fi0 = b0->free_list_index; + fi1 = b1->free_list_index; + + if (PREDICT_FALSE (fi0 != fi || fi1 != fi)) + goto slow_path_x2; + + binit0 = free0 ? b0 : &dummy_buffers[0]; + binit1 = free1 ? b1 : &dummy_buffers[1]; + + vlib_buffer_init_two_for_free_list (binit0, binit1, fl); + continue; + + slow_path_x2: + /* Backup speculation. */ + f -= 2; + n -= free_next0 + free_next1; + + _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + + fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); + fl1 = pool_elt_at_index (bm->buffer_free_list_pool, fi1); + + add_buffer_to_free_list (vm, fl0, bi0, free0); + if (PREDICT_FALSE (fl0->buffers_added_to_freelist_function != 0)) + { + int i; + for (i = 0; i < vec_len (announce_list); i++) + if (fl0 == announce_list[i]) + goto no_fl0; + vec_add1 (announce_list, fl0); + } + no_fl0: + if (PREDICT_FALSE (fl1->buffers_added_to_freelist_function != 0)) + { + int i; + for (i = 0; i < vec_len (announce_list); i++) + if (fl1 == announce_list[i]) + goto no_fl1; + vec_add1 (announce_list, fl1); + } + + no_fl1: + add_buffer_to_free_list (vm, fl1, bi1, free1); + + /* Possibly change current free list. */ + if (fi0 != fi && fi1 != fi) + { + fi = fi1; + fl = pool_elt_at_index (bm->buffer_free_list_pool, fi); + } + + vec_add2_aligned (fl->aligned_buffers, f, n_left, + /* align */ sizeof (vlib_copy_unit_t)); + } + + while (n_left >= 1) + { + vlib_buffer_t *b0, *binit0, dummy_buffers[1]; + + bi0 = b[0]; + f[0] = bi0; + f += 1; + b += 1; + n_left -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + free0 = (b0->flags & VLIB_BUFFER_RECYCLE) == 0; + + /* Must be before init which will over-write buffer flags. */ + if (follow_buffer_next) + { + n[0] = b0->next_buffer; + free_next0 = free0 && (b0->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; + n += free_next0; + } + else + free_next0 = 0; + + /* Must be before init which will over-write buffer free list. */ + fi0 = b0->free_list_index; + + if (PREDICT_FALSE (fi0 != fi)) + goto slow_path_x1; + + binit0 = free0 ? b0 : &dummy_buffers[0]; + + vlib_buffer_init_for_free_list (binit0, fl); + continue; + + slow_path_x1: + /* Backup speculation. */ + f -= 1; + n -= free_next0; + + _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + + fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); + + add_buffer_to_free_list (vm, fl0, bi0, free0); + if (PREDICT_FALSE (fl0->buffers_added_to_freelist_function != 0)) + { + int i; + for (i = 0; i < vec_len (announce_list); i++) + if (fl0 == announce_list[i]) + goto no_fl00; + vec_add1 (announce_list, fl0); + } + + no_fl00: + fi = fi0; + fl = pool_elt_at_index (bm->buffer_free_list_pool, fi); + + vec_add2_aligned (fl->aligned_buffers, f, n_left, + /* align */ sizeof (vlib_copy_unit_t)); + } + + if (follow_buffer_next && ((n_left = n - next_to_free[i_next_to_free]) > 0)) + { + b = next_to_free[i_next_to_free]; + i_next_to_free ^= 1; + goto again; + } + + _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + + if (vec_len (announce_list)) + { + vlib_buffer_free_list_t *fl; + for (i = 0; i < vec_len (announce_list); i++) + { + fl = announce_list[i]; + fl->buffers_added_to_freelist_function (vm, fl); + } + _vec_len (announce_list) = 0; + } +#endif +} + +void +vlib_buffer_free (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ + 1); +} + +void +vlib_buffer_free_no_next (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ + 0); +} + +#if DPDK == 0 +/* Copy template packet data into buffers as they are allocated. */ +static void +vlib_packet_template_buffer_init (vlib_main_t * vm, + vlib_buffer_free_list_t * fl, + u32 * buffers, u32 n_buffers) +{ + vlib_packet_template_t *t = + uword_to_pointer (fl->buffer_init_function_opaque, + vlib_packet_template_t *); + uword i; + + for (i = 0; i < n_buffers; i++) + { + vlib_buffer_t *b = vlib_get_buffer (vm, buffers[i]); + ASSERT (b->current_length == vec_len (t->packet_data)); + clib_memcpy (vlib_buffer_get_current (b), t->packet_data, + b->current_length); + } +} +#endif + +void +vlib_packet_template_init (vlib_main_t * vm, + vlib_packet_template_t * t, + void *packet_data, + uword n_packet_data_bytes, + uword min_n_buffers_each_physmem_alloc, + char *fmt, ...) +{ +#if DPDK > 0 + va_list va; + __attribute__ ((unused)) u8 *name; + + va_start (va, fmt); + name = va_format (0, fmt, &va); + va_end (va); + + vlib_worker_thread_barrier_sync (vm); + memset (t, 0, sizeof (t[0])); + + vec_add (t->packet_data, packet_data, n_packet_data_bytes); + + vlib_worker_thread_barrier_release (vm); +#else + vlib_buffer_free_list_t *fl; + va_list va; + u8 *name; + + va_start (va, fmt); + name = va_format (0, fmt, &va); + va_end (va); + + memset (t, 0, sizeof (t[0])); + + vec_add (t->packet_data, packet_data, n_packet_data_bytes); + t->min_n_buffers_each_physmem_alloc = min_n_buffers_each_physmem_alloc; + + t->free_list_index = vlib_buffer_create_free_list_helper + (vm, n_packet_data_bytes, + /* is_public */ 1, + /* is_default */ 0, + name); + + ASSERT (t->free_list_index != 0); + fl = vlib_buffer_get_free_list (vm, t->free_list_index); + fl->min_n_buffers_each_physmem_alloc = t->min_n_buffers_each_physmem_alloc; + + fl->buffer_init_function = vlib_packet_template_buffer_init; + fl->buffer_init_function_opaque = pointer_to_uword (t); + + fl->buffer_init_template.current_data = 0; + fl->buffer_init_template.current_length = n_packet_data_bytes; + fl->buffer_init_template.flags = 0; +#endif +} + +void * +vlib_packet_template_get_packet (vlib_main_t * vm, + vlib_packet_template_t * t, u32 * bi_result) +{ + u32 bi; + vlib_buffer_t *b; + + if (vlib_buffer_alloc (vm, &bi, 1) != 1) + return 0; + + *bi_result = bi; + + b = vlib_get_buffer (vm, bi); + clib_memcpy (vlib_buffer_get_current (b), + t->packet_data, vec_len (t->packet_data)); + b->current_length = vec_len (t->packet_data); + + return b->data; +} + +#if DPDK == 0 +void +vlib_packet_template_get_packet_helper (vlib_main_t * vm, + vlib_packet_template_t * t) +{ + word n = t->min_n_buffers_each_physmem_alloc; + word l = vec_len (t->packet_data); + word n_alloc; + + ASSERT (l > 0); + ASSERT (vec_len (t->free_buffers) == 0); + + vec_validate (t->free_buffers, n - 1); + n_alloc = vlib_buffer_alloc_from_free_list (vm, t->free_buffers, + n, t->free_list_index); + _vec_len (t->free_buffers) = n_alloc; +} + +#endif +/* Append given data to end of buffer, possibly allocating new buffers. */ +u32 +vlib_buffer_add_data (vlib_main_t * vm, + u32 free_list_index, + u32 buffer_index, void *data, u32 n_data_bytes) +{ + u32 n_buffer_bytes, n_left, n_left_this_buffer, bi; + vlib_buffer_t *b; + void *d; + + bi = buffer_index; + if (bi == 0 + && 1 != vlib_buffer_alloc_from_free_list (vm, &bi, 1, free_list_index)) + goto out_of_buffers; + + d = data; + n_left = n_data_bytes; + n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, free_list_index); + + b = vlib_get_buffer (vm, bi); + b->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID; + + /* Get to the end of the chain before we try to append data... */ + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + b = vlib_get_buffer (vm, b->next_buffer); + + while (1) + { + u32 n; + + ASSERT (n_buffer_bytes >= b->current_length); + n_left_this_buffer = + n_buffer_bytes - (b->current_data + b->current_length); + n = clib_min (n_left_this_buffer, n_left); + clib_memcpy (vlib_buffer_get_current (b) + b->current_length, d, n); + b->current_length += n; + n_left -= n; + if (n_left == 0) + break; + + d += n; + if (1 != + vlib_buffer_alloc_from_free_list (vm, &b->next_buffer, 1, + free_list_index)) + goto out_of_buffers; + + b->flags |= VLIB_BUFFER_NEXT_PRESENT; + + b = vlib_get_buffer (vm, b->next_buffer); + } + + return bi; + +out_of_buffers: + clib_error ("out of buffers"); + return bi; +} + +u16 +vlib_buffer_chain_append_data_with_alloc (vlib_main_t * vm, + u32 free_list_index, + vlib_buffer_t * first, + vlib_buffer_t ** last, + void *data, u16 data_len) +{ + vlib_buffer_t *l = *last; + u32 n_buffer_bytes = + vlib_buffer_free_list_buffer_size (vm, free_list_index); + u16 copied = 0; + ASSERT (n_buffer_bytes >= l->current_length + l->current_data); + while (data_len) + { + u16 max = n_buffer_bytes - l->current_length - l->current_data; + if (max == 0) + { + if (1 != + vlib_buffer_alloc_from_free_list (vm, &l->next_buffer, 1, + free_list_index)) + return copied; + *last = l = vlib_buffer_chain_buffer (vm, first, l, l->next_buffer); + max = n_buffer_bytes - l->current_length - l->current_data; + } + + u16 len = (data_len > max) ? max : data_len; + clib_memcpy (vlib_buffer_get_current (l) + l->current_length, + data + copied, len); + vlib_buffer_chain_increase_length (first, l, len); + data_len -= len; + copied += len; + } + return copied; +} + +#if DPDK > 0 +clib_error_t * +vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, + unsigned socket_id) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_physmem_main_t *vpm = &vm->physmem_main; + struct rte_mempool *rmp; + int i; + + if (!rte_pktmbuf_pool_create) + return clib_error_return (0, "not linked with DPDK"); + + vec_validate_aligned (bm->pktmbuf_pools, socket_id, CLIB_CACHE_LINE_BYTES); + + /* pool already exists, nothing to do */ + if (bm->pktmbuf_pools[socket_id]) + return 0; + + u8 *pool_name = format (0, "mbuf_pool_socket%u%c", socket_id, 0); + + rmp = rte_pktmbuf_pool_create ((char *) pool_name, /* pool name */ + num_mbufs, /* number of mbufs */ + 512, /* cache size */ + VLIB_BUFFER_HDR_SIZE, /* priv size */ + VLIB_BUFFER_PRE_DATA_SIZE + VLIB_BUFFER_DATA_SIZE, /* dataroom size */ + socket_id); /* cpu socket */ + + if (rmp) + { + { + uword this_pool_end; + uword this_pool_start; + uword this_pool_size; + uword save_vpm_start, save_vpm_end, save_vpm_size; + struct rte_mempool_memhdr *memhdr; + + this_pool_start = ~0ULL; + this_pool_end = 0LL; + + STAILQ_FOREACH (memhdr, &rmp->mem_list, next) + { + if (((uword) (memhdr->addr + memhdr->len)) > this_pool_end) + this_pool_end = (uword) (memhdr->addr + memhdr->len); + if (((uword) memhdr->addr) < this_pool_start) + this_pool_start = (uword) (memhdr->addr); + } + ASSERT (this_pool_start < ~0ULL && this_pool_end > 0); + this_pool_size = this_pool_end - this_pool_start; + + if (CLIB_DEBUG > 1) + { + clib_warning ("%s: pool start %llx pool end %llx pool size %lld", + pool_name, this_pool_start, this_pool_end, + this_pool_size); + clib_warning + ("before: virtual.start %llx virtual.end %llx virtual.size %lld", + vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); + } + + save_vpm_start = vpm->virtual.start; + save_vpm_end = vpm->virtual.end; + save_vpm_size = vpm->virtual.size; + + if ((this_pool_start < vpm->virtual.start) || vpm->virtual.start == 0) + vpm->virtual.start = this_pool_start; + if (this_pool_end > vpm->virtual.end) + vpm->virtual.end = this_pool_end; + + vpm->virtual.size = vpm->virtual.end - vpm->virtual.start; + + if (CLIB_DEBUG > 1) + { + clib_warning + ("after: virtual.start %llx virtual.end %llx virtual.size %lld", + vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); + } + + /* check if fits into buffer index range */ + if ((u64) vpm->virtual.size > + ((u64) 1 << (32 + CLIB_LOG2_CACHE_LINE_BYTES))) + { + clib_warning ("physmem: virtual size out of range!"); + vpm->virtual.start = save_vpm_start; + vpm->virtual.end = save_vpm_end; + vpm->virtual.size = save_vpm_size; + rmp = 0; + } + } + if (rmp) + { + bm->pktmbuf_pools[socket_id] = rmp; + vec_free (pool_name); + return 0; + } + } + + vec_free (pool_name); + + /* no usable pool for this socket, try to use pool from another one */ + for (i = 0; i < vec_len (bm->pktmbuf_pools); i++) + { + if (bm->pktmbuf_pools[i]) + { + clib_warning + ("WARNING: Failed to allocate mempool for CPU socket %u. " + "Threads running on socket %u will use socket %u mempool.", + socket_id, socket_id, i); + bm->pktmbuf_pools[socket_id] = bm->pktmbuf_pools[i]; + return 0; + } + } + + return clib_error_return (0, "failed to allocate mempool on socket %u", + socket_id); +} +#endif + +static void +vlib_serialize_tx (serialize_main_header_t * m, serialize_stream_t * s) +{ + vlib_main_t *vm; + vlib_serialize_buffer_main_t *sm; + uword n, n_bytes_to_write; + vlib_buffer_t *last; + + n_bytes_to_write = s->current_buffer_index; + sm = + uword_to_pointer (s->data_function_opaque, + vlib_serialize_buffer_main_t *); + vm = sm->vlib_main; + + ASSERT (sm->tx.max_n_data_bytes_per_chain > 0); + if (serialize_stream_is_end_of_stream (s) + || sm->tx.n_total_data_bytes + n_bytes_to_write > + sm->tx.max_n_data_bytes_per_chain) + { + vlib_process_t *p = vlib_get_current_process (vm); + + last = vlib_get_buffer (vm, sm->last_buffer); + last->current_length = n_bytes_to_write; + + vlib_set_next_frame_buffer (vm, &p->node_runtime, sm->tx.next_index, + sm->first_buffer); + + sm->first_buffer = sm->last_buffer = ~0; + sm->tx.n_total_data_bytes = 0; + } + + else if (n_bytes_to_write == 0 && s->n_buffer_bytes == 0) + { + ASSERT (sm->first_buffer == ~0); + ASSERT (sm->last_buffer == ~0); + n = + vlib_buffer_alloc_from_free_list (vm, &sm->first_buffer, 1, + sm->tx.free_list_index); + if (n != 1) + serialize_error (m, + clib_error_create + ("vlib_buffer_alloc_from_free_list fails")); + sm->last_buffer = sm->first_buffer; + s->n_buffer_bytes = + vlib_buffer_free_list_buffer_size (vm, sm->tx.free_list_index); + } + + if (n_bytes_to_write > 0) + { + vlib_buffer_t *prev = vlib_get_buffer (vm, sm->last_buffer); + n = + vlib_buffer_alloc_from_free_list (vm, &sm->last_buffer, 1, + sm->tx.free_list_index); + if (n != 1) + serialize_error (m, + clib_error_create + ("vlib_buffer_alloc_from_free_list fails")); + sm->tx.n_total_data_bytes += n_bytes_to_write; + prev->current_length = n_bytes_to_write; + prev->next_buffer = sm->last_buffer; + prev->flags |= VLIB_BUFFER_NEXT_PRESENT; + } + + if (sm->last_buffer != ~0) + { + last = vlib_get_buffer (vm, sm->last_buffer); + s->buffer = vlib_buffer_get_current (last); + s->current_buffer_index = 0; + ASSERT (last->current_data == s->current_buffer_index); + } +} + +static void +vlib_serialize_rx (serialize_main_header_t * m, serialize_stream_t * s) +{ + vlib_main_t *vm; + vlib_serialize_buffer_main_t *sm; + vlib_buffer_t *last; + + sm = + uword_to_pointer (s->data_function_opaque, + vlib_serialize_buffer_main_t *); + vm = sm->vlib_main; + + if (serialize_stream_is_end_of_stream (s)) + return; + + if (sm->last_buffer != ~0) + { + last = vlib_get_buffer (vm, sm->last_buffer); + + if (last->flags & VLIB_BUFFER_NEXT_PRESENT) + sm->last_buffer = last->next_buffer; + else + { + vlib_buffer_free (vm, &sm->first_buffer, /* count */ 1); + sm->first_buffer = sm->last_buffer = ~0; + } + } + + if (sm->last_buffer == ~0) + { + while (clib_fifo_elts (sm->rx.buffer_fifo) == 0) + { + sm->rx.ready_one_time_event = + vlib_process_create_one_time_event (vm, vlib_current_process (vm), + ~0); + vlib_process_wait_for_one_time_event (vm, /* no event data */ 0, + sm->rx.ready_one_time_event); + } + + clib_fifo_sub1 (sm->rx.buffer_fifo, sm->first_buffer); + sm->last_buffer = sm->first_buffer; + } + + ASSERT (sm->last_buffer != ~0); + + last = vlib_get_buffer (vm, sm->last_buffer); + s->current_buffer_index = 0; + s->buffer = vlib_buffer_get_current (last); + s->n_buffer_bytes = last->current_length; +} + +static void +serialize_open_vlib_helper (serialize_main_t * m, + vlib_main_t * vm, + vlib_serialize_buffer_main_t * sm, uword is_read) +{ + /* Initialize serialize main but save overflow buffer for re-use between calls. */ + { + u8 *save = m->stream.overflow_buffer; + memset (m, 0, sizeof (m[0])); + m->stream.overflow_buffer = save; + if (save) + _vec_len (save) = 0; + } + + sm->first_buffer = sm->last_buffer = ~0; + if (is_read) + clib_fifo_reset (sm->rx.buffer_fifo); + else + sm->tx.n_total_data_bytes = 0; + sm->vlib_main = vm; + m->header.data_function = is_read ? vlib_serialize_rx : vlib_serialize_tx; + m->stream.data_function_opaque = pointer_to_uword (sm); +} + +void +serialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, + vlib_serialize_buffer_main_t * sm) +{ + serialize_open_vlib_helper (m, vm, sm, /* is_read */ 0); +} + +void +unserialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, + vlib_serialize_buffer_main_t * sm) +{ + serialize_open_vlib_helper (m, vm, sm, /* is_read */ 1); +} + +u32 +serialize_close_vlib_buffer (serialize_main_t * m) +{ + vlib_serialize_buffer_main_t *sm + = uword_to_pointer (m->stream.data_function_opaque, + vlib_serialize_buffer_main_t *); + vlib_buffer_t *last; + serialize_stream_t *s = &m->stream; + + last = vlib_get_buffer (sm->vlib_main, sm->last_buffer); + last->current_length = s->current_buffer_index; + + if (vec_len (s->overflow_buffer) > 0) + { + sm->last_buffer + = vlib_buffer_add_data (sm->vlib_main, sm->tx.free_list_index, + sm->last_buffer == ~0 ? 0 : sm->last_buffer, + s->overflow_buffer, + vec_len (s->overflow_buffer)); + _vec_len (s->overflow_buffer) = 0; + } + + return sm->first_buffer; +} + +void +unserialize_close_vlib_buffer (serialize_main_t * m) +{ + vlib_serialize_buffer_main_t *sm + = uword_to_pointer (m->stream.data_function_opaque, + vlib_serialize_buffer_main_t *); + if (sm->first_buffer != ~0) + vlib_buffer_free_one (sm->vlib_main, sm->first_buffer); + clib_fifo_reset (sm->rx.buffer_fifo); + if (m->stream.overflow_buffer) + _vec_len (m->stream.overflow_buffer) = 0; +} + +static u8 * +format_vlib_buffer_free_list (u8 * s, va_list * va) +{ + vlib_buffer_free_list_t *f = va_arg (*va, vlib_buffer_free_list_t *); +#if DPDK > 0 + u32 threadnum = va_arg (*va, u32); + uword bytes_alloc, bytes_free, n_free, size; + + if (!f) + return format (s, "%=7s%=30s%=12s%=12s%=12s%=12s%=12s%=12s", + "Thread", "Name", "Index", "Size", "Alloc", "Free", + "#Alloc", "#Free"); + + size = sizeof (vlib_buffer_t) + f->n_data_bytes; + n_free = vec_len (f->aligned_buffers) + vec_len (f->unaligned_buffers); + bytes_alloc = size * f->n_alloc; + bytes_free = size * n_free; + + s = format (s, "%7d%30s%12d%12d%=12U%=12U%=12d%=12d", threadnum, +#else + uword bytes_alloc, bytes_free, n_free, size; + + if (!f) + return format (s, "%=30s%=12s%=12s%=12s%=12s%=12s%=12s", + "Name", "Index", "Size", "Alloc", "Free", "#Alloc", + "#Free"); + + size = sizeof (vlib_buffer_t) + f->n_data_bytes; + n_free = vec_len (f->aligned_buffers) + vec_len (f->unaligned_buffers); + bytes_alloc = size * f->n_alloc; + bytes_free = size * n_free; + + s = format (s, "%30s%12d%12d%=12U%=12U%=12d%=12d", +#endif + f->name, f->index, f->n_data_bytes, + format_memory_size, bytes_alloc, + format_memory_size, bytes_free, f->n_alloc, n_free); + + return s; +} + +static clib_error_t * +show_buffers (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ +#if DPDK > 0 + vlib_buffer_main_t *bm; + vlib_buffer_free_list_t *f; + vlib_main_t *curr_vm; + u32 vm_index = 0; + + vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, 0, 0); + + do + { + curr_vm = vec_len (vlib_mains) ? vlib_mains[vm_index] : vm; + bm = curr_vm->buffer_main; + + /* *INDENT-OFF* */ + pool_foreach (f, bm->buffer_free_list_pool, ({ + vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, f, vm_index); + })); + /* *INDENT-ON* */ + + vm_index++; + } + while (vm_index < vec_len (vlib_mains)); + +#else + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; + + vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, 0); + /* *INDENT-OFF* */ + pool_foreach (f, bm->buffer_free_list_pool, ({ + vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, f); + })); +/* *INDENT-ON* */ + +#endif + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_buffers_command, static) = { + .path = "show buffers", + .short_help = "Show packet buffer allocation", + .function = show_buffers, +}; +/* *INDENT-ON* */ + +#if DPDK > 0 +#if CLIB_DEBUG > 0 + +u32 *vlib_buffer_state_validation_lock; +uword *vlib_buffer_state_validation_hash; +void *vlib_buffer_state_heap; + +static clib_error_t * +buffer_state_validation_init (vlib_main_t * vm) +{ + void *oldheap; + + vlib_buffer_state_heap = mheap_alloc (0, 10 << 20); + + oldheap = clib_mem_set_heap (vlib_buffer_state_heap); + + vlib_buffer_state_validation_hash = hash_create (0, sizeof (uword)); + vec_validate_aligned (vlib_buffer_state_validation_lock, 0, + CLIB_CACHE_LINE_BYTES); + clib_mem_set_heap (oldheap); + return 0; +} + +VLIB_INIT_FUNCTION (buffer_state_validation_init); +#endif +#endif + + +/** @endcond */ +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h new file mode 100644 index 00000000..5f1e62f0 --- /dev/null +++ b/src/vlib/buffer.h @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * buffer.h: VLIB buffers + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_buffer_h +#define included_vlib_buffer_h + +#include +#include +#include +#include +#include /* for vlib_error_t */ + +#if DPDK > 0 +#include +#define VLIB_BUFFER_DATA_SIZE (2048) +#define VLIB_BUFFER_PRE_DATA_SIZE RTE_PKTMBUF_HEADROOM +#else +#include /* for __PRE_DATA_SIZE */ +#define VLIB_BUFFER_DATA_SIZE (512) +#define VLIB_BUFFER_PRE_DATA_SIZE __PRE_DATA_SIZE +#endif + +#if defined (CLIB_HAVE_VEC128) || defined (__aarch64__) +typedef u8x16 vlib_copy_unit_t; +#else +typedef u64 vlib_copy_unit_t; +#endif + +/** \file + vlib buffer structure definition and a few select + access methods. This structure and the buffer allocation + mechanism should perhaps live in vnet, but it would take a lot + of typing to make it so. +*/ + +/* VLIB buffer representation. */ +typedef struct +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + /* Offset within data[] that we are currently processing. + If negative current header points into predata area. */ + i16 current_data; /**< signed offset in data[], pre_data[] + that we are currently processing. + If negative current header points into predata area. + */ + u16 current_length; /**< Nbytes between current data and + the end of this buffer. + */ + u32 flags; /**< buffer flags: +
VLIB_BUFFER_IS_TRACED: trace this buffer. +
VLIB_BUFFER_NEXT_PRESENT: this is a multi-chunk buffer. +
VLIB_BUFFER_TOTAL_LENGTH_VALID: as it says +
VLIB_BUFFER_REPL_FAIL: packet replication failure +
VLIB_BUFFER_RECYCLE: as it says +
VLIB_BUFFER_FLOW_REPORT: buffer is a flow report, + set to avoid adding it to a flow report +
VLIB_BUFFER_FLAG_USER(n): user-defined bit N + */ +#define VLIB_BUFFER_IS_TRACED (1 << 0) +#define VLIB_BUFFER_LOG2_NEXT_PRESENT (1) +#define VLIB_BUFFER_NEXT_PRESENT (1 << VLIB_BUFFER_LOG2_NEXT_PRESENT) +#define VLIB_BUFFER_IS_RECYCLED (1 << 2) +#define VLIB_BUFFER_TOTAL_LENGTH_VALID (1 << 3) +#define VLIB_BUFFER_REPL_FAIL (1 << 4) +#define VLIB_BUFFER_RECYCLE (1 << 5) +#define VLIB_BUFFER_FLOW_REPORT (1 << 6) + + /* User defined buffer flags. */ +#define LOG2_VLIB_BUFFER_FLAG_USER(n) (32 - (n)) +#define VLIB_BUFFER_FLAG_USER(n) (1 << LOG2_VLIB_BUFFER_FLAG_USER(n)) + + u32 free_list_index; /**< Buffer free list that this buffer was + allocated from and will be freed to. + */ + + u32 total_length_not_including_first_buffer; + /**< Only valid for first buffer in chain. Current length plus + total length given here give total number of bytes in buffer chain. + */ + + u32 next_buffer; /**< Next buffer for this linked-list of buffers. + Only valid if VLIB_BUFFER_NEXT_PRESENT flag is set. + */ + + vlib_error_t error; /**< Error code for buffers to be enqueued + to error handler. + */ + u32 current_config_index; /**< Used by feature subgraph arcs to + visit enabled feature nodes + */ + + u8 feature_arc_index; /**< Used to identify feature arcs by intermediate + feature node + */ + + u8 dont_waste_me[3]; /**< Available space in the (precious) + first 32 octets of buffer metadata + Before allocating any of it, discussion required! + */ + + u32 opaque[8]; /**< Opaque data used by sub-graphs for their own purposes. + See .../vnet/vnet/buffer.h + */ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline1); + + u32 trace_index; /**< Specifies index into trace buffer + if VLIB_PACKET_IS_TRACED flag is set. + */ + u32 recycle_count; /**< Used by L2 path recycle code */ + u32 opaque2[14]; /**< More opaque data, currently unused */ + + /***** end of second cache line */ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline2); + u8 pre_data[VLIB_BUFFER_PRE_DATA_SIZE]; /**< Space for inserting data + before buffer start. + Packet rewrite string will be + rewritten backwards and may extend + back before buffer->data[0]. + Must come directly before packet data. + */ + + u8 data[0]; /**< Packet data. Hardware DMA here */ +} vlib_buffer_t; /* Must be a multiple of 64B. */ + +#define VLIB_BUFFER_HDR_SIZE (sizeof(vlib_buffer_t) - VLIB_BUFFER_PRE_DATA_SIZE) + +/** \brief Prefetch buffer metadata. + The first 64 bytes of buffer contains most header information + + @param b - (vlib_buffer_t *) pointer to the buffer + @param type - LOAD, STORE. In most cases, STORE is the right answer +*/ + +#define vlib_prefetch_buffer_header(b,type) CLIB_PREFETCH (b, 64, type) + +always_inline vlib_buffer_t * +vlib_buffer_next_contiguous (vlib_buffer_t * b, u32 buffer_bytes) +{ + return (void *) (b + 1) + buffer_bytes; +} + +always_inline void +vlib_buffer_struct_is_sane (vlib_buffer_t * b) +{ + ASSERT (sizeof (b[0]) % 64 == 0); + + /* Rewrite data must be before and contiguous with packet data. */ + ASSERT (b->pre_data + VLIB_BUFFER_PRE_DATA_SIZE == b->data); +} + +/** \brief Get pointer to current data to process + + @param b - (vlib_buffer_t *) pointer to the buffer + @return - (void *) (b->data + b->current_data) +*/ + +always_inline void * +vlib_buffer_get_current (vlib_buffer_t * b) +{ + /* Check bounds. */ + ASSERT ((signed) b->current_data >= (signed) -VLIB_BUFFER_PRE_DATA_SIZE); + return b->data + b->current_data; +} + +/** \brief Advance current data pointer by the supplied (signed!) amount + + @param b - (vlib_buffer_t *) pointer to the buffer + @param l - (word) signed increment +*/ +always_inline void +vlib_buffer_advance (vlib_buffer_t * b, word l) +{ + ASSERT (b->current_length >= l); + b->current_data += l; + b->current_length -= l; +} + +/** \brief Reset current header & length to state they were in when + packet was received. + + @param b - (vlib_buffer_t *) pointer to the buffer +*/ + +always_inline void +vlib_buffer_reset (vlib_buffer_t * b) +{ + b->current_length += clib_max (b->current_data, 0); + b->current_data = 0; +} + +/** \brief Get pointer to buffer's opaque data array + + @param b - (vlib_buffer_t *) pointer to the buffer + @return - (void *) b->opaque +*/ +always_inline void * +vlib_get_buffer_opaque (vlib_buffer_t * b) +{ + return (void *) b->opaque; +} + +/** \brief Get pointer to buffer's opaque2 data array + + @param b - (vlib_buffer_t *) pointer to the buffer + @return - (void *) b->opaque2 +*/ +always_inline void * +vlib_get_buffer_opaque2 (vlib_buffer_t * b) +{ + return (void *) b->opaque2; +} + +/* Forward declaration. */ +struct vlib_main_t; + +typedef struct vlib_buffer_free_list_t +{ + /* Template buffer used to initialize first 16 bytes of buffers + allocated on this free list. */ + vlib_buffer_t buffer_init_template; + + /* Our index into vlib_main_t's buffer_free_list_pool. */ + u32 index; + + /* Number of data bytes for buffers in this free list. */ + u32 n_data_bytes; + + /* Number of buffers to allocate when we need to allocate new buffers + from physmem heap. */ + u32 min_n_buffers_each_physmem_alloc; + + /* Total number of buffers allocated from this free list. */ + u32 n_alloc; + + /* Vector of free buffers. Each element is a byte offset into I/O heap. + Aligned vectors always has naturally aligned vlib_copy_unit_t sized chunks + of buffer indices. Unaligned vector has any left over. This is meant to + speed up copy routines. */ + u32 *aligned_buffers, *unaligned_buffers; + + /* Memory chunks allocated for this free list + recorded here so they can be freed when free list + is deleted. */ + void **buffer_memory_allocated; + + /* Free list name. */ + u8 *name; + + /* Callback functions to initialize newly allocated buffers. + If null buffers are zeroed. */ + void (*buffer_init_function) (struct vlib_main_t * vm, + struct vlib_buffer_free_list_t * fl, + u32 * buffers, u32 n_buffers); + + /* Callback function to announce that buffers have been + added to the freelist */ + void (*buffers_added_to_freelist_function) + (struct vlib_main_t * vm, struct vlib_buffer_free_list_t * fl); + + uword buffer_init_function_opaque; +} __attribute__ ((aligned (16))) vlib_buffer_free_list_t; + +typedef struct +{ + /* Buffer free callback, for subversive activities */ + u32 (*buffer_free_callback) (struct vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, u32 follow_buffer_next); + /* Pool of buffer free lists. + Multiple free lists exist for packet generator which uses + separate free lists for each packet stream --- so as to avoid + initializing static data for each packet generated. */ + vlib_buffer_free_list_t *buffer_free_list_pool; +#define VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX (0) +#define VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES VLIB_BUFFER_DATA_SIZE + + /* Hash table mapping buffer size (rounded to next unit of + sizeof (vlib_buffer_t)) to free list index. */ + uword *free_list_by_size; + + /* Hash table mapping buffer index into number + 0 => allocated but free, 1 => allocated and not-free. + If buffer index is not in hash table then this buffer + has never been allocated. */ + uword *buffer_known_hash; + + /* List of free-lists needing Blue Light Special announcements */ + vlib_buffer_free_list_t **announce_list; + + /* Vector of rte_mempools per socket */ +#if DPDK == 1 + struct rte_mempool **pktmbuf_pools; +#endif +} vlib_buffer_main_t; + +typedef struct +{ + struct vlib_main_t *vlib_main; + + u32 first_buffer, last_buffer; + + union + { + struct + { + /* Total accumulated bytes in chain starting with first_buffer. */ + u32 n_total_data_bytes; + + /* Max number of bytes to accumulate in chain starting with first_buffer. + As this limit is reached buffers are enqueued to next node. */ + u32 max_n_data_bytes_per_chain; + + /* Next node to enqueue buffers to relative to current process node. */ + u32 next_index; + + /* Free list to use to allocate new buffers. */ + u32 free_list_index; + } tx; + + struct + { + /* CLIB fifo of buffer indices waiting to be unserialized. */ + u32 *buffer_fifo; + + /* Event type used to signal that RX buffers have been added to fifo. */ + uword ready_one_time_event; + } rx; + }; +} vlib_serialize_buffer_main_t; + +void serialize_open_vlib_buffer (serialize_main_t * m, struct vlib_main_t *vm, + vlib_serialize_buffer_main_t * sm); +void unserialize_open_vlib_buffer (serialize_main_t * m, + struct vlib_main_t *vm, + vlib_serialize_buffer_main_t * sm); + +u32 serialize_close_vlib_buffer (serialize_main_t * m); +void unserialize_close_vlib_buffer (serialize_main_t * m); +void *vlib_set_buffer_free_callback (struct vlib_main_t *vm, void *fp); + +always_inline u32 +serialize_vlib_buffer_n_bytes (serialize_main_t * m) +{ + serialize_stream_t *s = &m->stream; + vlib_serialize_buffer_main_t *sm + = uword_to_pointer (m->stream.data_function_opaque, + vlib_serialize_buffer_main_t *); + return sm->tx.n_total_data_bytes + s->current_buffer_index + + vec_len (s->overflow_buffer); +} + +#if DPDK > 0 +#define rte_mbuf_from_vlib_buffer(x) (((struct rte_mbuf *)x) - 1) +#define vlib_buffer_from_rte_mbuf(x) ((vlib_buffer_t *)(x+1)) +#endif + +/* + */ + +/** \brief Compile time buffer trajectory tracing option + Turn this on if you run into "bad monkey" contexts, + and you want to know exactly which nodes they've visited... + See vlib/main.c... +*/ +#define VLIB_BUFFER_TRACE_TRAJECTORY 0 + +#if VLIB_BUFFER_TRACE_TRAJECTORY > 0 +#define VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b) (b)->pre_data[0]=0 +#else +#define VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b) +#endif /* VLIB_BUFFER_TRACE_TRAJECTORY */ + +#endif /* included_vlib_buffer_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h new file mode 100644 index 00000000..75716eca --- /dev/null +++ b/src/vlib/buffer_funcs.h @@ -0,0 +1,755 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * buffer_funcs.h: VLIB buffer related functions/inlines + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_buffer_funcs_h +#define included_vlib_buffer_funcs_h + +#include + +/** \file + vlib buffer access methods. +*/ + + +/** \brief Translate buffer index into buffer pointer + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffer_index - (u32) buffer index + @return - (vlib_buffer_t *) buffer pointer +*/ +always_inline vlib_buffer_t * +vlib_get_buffer (vlib_main_t * vm, u32 buffer_index) +{ + return vlib_physmem_at_offset (&vm->physmem_main, ((uword) buffer_index) + << CLIB_LOG2_CACHE_LINE_BYTES); +} + +/** \brief Translate buffer pointer into buffer index + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param p - (void *) buffer pointer + @return - (u32) buffer index +*/ +always_inline u32 +vlib_get_buffer_index (vlib_main_t * vm, void *p) +{ + uword offset = vlib_physmem_offset_of (&vm->physmem_main, p); + ASSERT ((offset % (1 << CLIB_LOG2_CACHE_LINE_BYTES)) == 0); + return offset >> CLIB_LOG2_CACHE_LINE_BYTES; +} + +/** \brief Get next buffer in buffer linklist, or zero for end of list. + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param b - (void *) buffer pointer + @return - (vlib_buffer_t *) next buffer, or NULL +*/ +always_inline vlib_buffer_t * +vlib_get_next_buffer (vlib_main_t * vm, vlib_buffer_t * b) +{ + return (b->flags & VLIB_BUFFER_NEXT_PRESENT + ? vlib_get_buffer (vm, b->next_buffer) : 0); +} + +uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, + vlib_buffer_t * b_first); + +/** \brief Get length in bytes of the buffer chain + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param b - (void *) buffer pointer + @return - (uword) length of buffer chain +*/ +always_inline uword +vlib_buffer_length_in_chain (vlib_main_t * vm, vlib_buffer_t * b) +{ + uword l = b->current_length + b->total_length_not_including_first_buffer; + if (PREDICT_FALSE ((b->flags & (VLIB_BUFFER_NEXT_PRESENT + | VLIB_BUFFER_TOTAL_LENGTH_VALID)) + == VLIB_BUFFER_NEXT_PRESENT)) + return vlib_buffer_length_in_chain_slow_path (vm, b); + return l; +} + +/** \brief Get length in bytes of the buffer index buffer chain + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param bi - (u32) buffer index + @return - (uword) length of buffer chain +*/ +always_inline uword +vlib_buffer_index_length_in_chain (vlib_main_t * vm, u32 bi) +{ + vlib_buffer_t *b = vlib_get_buffer (vm, bi); + return vlib_buffer_length_in_chain (vm, b); +} + +/** \brief Copy buffer contents to memory + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffer_index - (u32) buffer index + @param contents - (u8 *) memory, must be large enough + @return - (uword) length of buffer chain +*/ +always_inline uword +vlib_buffer_contents (vlib_main_t * vm, u32 buffer_index, u8 * contents) +{ + uword content_len = 0; + uword l; + vlib_buffer_t *b; + + while (1) + { + b = vlib_get_buffer (vm, buffer_index); + l = b->current_length; + clib_memcpy (contents + content_len, b->data + b->current_data, l); + content_len += l; + if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) + break; + buffer_index = b->next_buffer; + } + + return content_len; +} + +/* Return physical address of buffer->data start. */ +always_inline u64 +vlib_get_buffer_data_physical_address (vlib_main_t * vm, u32 buffer_index) +{ + return vlib_physmem_offset_to_physical (&vm->physmem_main, + (((uword) buffer_index) << + CLIB_LOG2_CACHE_LINE_BYTES) + + STRUCT_OFFSET_OF (vlib_buffer_t, + data)); +} + +/** \brief Prefetch buffer metadata by buffer index + The first 64 bytes of buffer contains most header information + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param bi - (u32) buffer index + @param type - LOAD, STORE. In most cases, STORE is the right answer +*/ +/* Prefetch buffer header given index. */ +#define vlib_prefetch_buffer_with_index(vm,bi,type) \ + do { \ + vlib_buffer_t * _b = vlib_get_buffer (vm, bi); \ + vlib_prefetch_buffer_header (_b, type); \ + } while (0) + +#if 0 +/* Iterate over known allocated vlib bufs. You probably do not want + * to do this! + @param vm the vlib_main_t + @param bi found allocated buffer index + @param body operation to perform on buffer index + function executes body for each allocated buffer index + */ +#define vlib_buffer_foreach_allocated(vm,bi,body) \ +do { \ + vlib_main_t * _vmain = (vm); \ + vlib_buffer_main_t * _bmain = &_vmain->buffer_main; \ + hash_pair_t * _vbpair; \ + hash_foreach_pair(_vbpair, _bmain->buffer_known_hash, ({ \ + if (VLIB_BUFFER_KNOWN_ALLOCATED == _vbpair->value[0]) { \ + (bi) = _vbpair->key; \ + body; \ + } \ + })); \ +} while (0) +#endif + +#if DPDK == 0 + +typedef enum +{ + /* Index is unknown. */ + VLIB_BUFFER_UNKNOWN, + + /* Index is known and free/allocated. */ + VLIB_BUFFER_KNOWN_FREE, + VLIB_BUFFER_KNOWN_ALLOCATED, +} vlib_buffer_known_state_t; + +always_inline vlib_buffer_known_state_t +vlib_buffer_is_known (vlib_main_t * vm, u32 buffer_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + ASSERT (os_get_cpu_number () == 0); + + uword *p = hash_get (bm->buffer_known_hash, buffer_index); + return p ? p[0] : VLIB_BUFFER_UNKNOWN; +} + +always_inline void +vlib_buffer_set_known_state (vlib_main_t * vm, + u32 buffer_index, + vlib_buffer_known_state_t state) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + ASSERT (os_get_cpu_number () == 0); + hash_set (bm->buffer_known_hash, buffer_index, state); +} + +/* Validates sanity of a single buffer. + Returns format'ed vector with error message if any. */ +u8 *vlib_validate_buffer (vlib_main_t * vm, u32 buffer_index, + uword follow_chain); + +#endif /* DPDK == 0 */ + +clib_error_t *vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, + unsigned socket_id); + +/** \brief Allocate buffers into supplied array + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffers - (u32 * ) buffer index array + @param n_buffers - (u32) number of buffers requested + @return - (u32) number of buffers actually allocated, may be + less than the number requested or zero +*/ +u32 vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers); + +always_inline u32 +vlib_buffer_round_size (u32 size) +{ + return round_pow2 (size, sizeof (vlib_buffer_t)); +} + +/** \brief Allocate buffers from specific freelist into supplied array + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffers - (u32 * ) buffer index array + @param n_buffers - (u32) number of buffers requested + @return - (u32) number of buffers actually allocated, may be + less than the number requested or zero +*/ +u32 vlib_buffer_alloc_from_free_list (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, u32 free_list_index); + +/** \brief Free buffers + Frees the entire buffer chain for each buffer + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffers - (u32 * ) buffer index array + @param n_buffers - (u32) number of buffers to free + +*/ +void vlib_buffer_free (vlib_main_t * vm, + /* pointer to first buffer */ + u32 * buffers, + /* number of buffers to free */ + u32 n_buffers); + +/** \brief Free buffers, does not free the buffer chain for each buffer + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffers - (u32 * ) buffer index array + @param n_buffers - (u32) number of buffers to free + +*/ +void vlib_buffer_free_no_next (vlib_main_t * vm, + /* pointer to first buffer */ + u32 * buffers, + /* number of buffers to free */ + u32 n_buffers); + +/** \brief Free one buffer + Shorthand to free a single buffer chain. + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param buffer_index - (u32) buffer index to free +*/ +always_inline void +vlib_buffer_free_one (vlib_main_t * vm, u32 buffer_index) +{ + vlib_buffer_free (vm, &buffer_index, /* n_buffers */ 1); +} + +/* Add/delete buffer free lists. */ +u32 vlib_buffer_create_free_list (vlib_main_t * vm, u32 n_data_bytes, + char *fmt, ...); +void vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index); + +/* Find already existing public free list with given size or create one. */ +u32 vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes, + char *fmt, ...); + +always_inline vlib_buffer_free_list_t * +vlib_buffer_get_free_list (vlib_main_t * vm, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; + + f = pool_elt_at_index (bm->buffer_free_list_pool, free_list_index); + + /* Sanity: indices must match. */ + ASSERT (f->index == free_list_index); + + return f; +} + +always_inline u32 +vlib_buffer_free_list_buffer_size (vlib_main_t * vm, u32 free_list_index) +{ + vlib_buffer_free_list_t *f = + vlib_buffer_get_free_list (vm, free_list_index); + return f->n_data_bytes; +} + +void vlib_aligned_memcpy (void *_dst, void *_src, int n_bytes); + +/* Reasonably fast buffer copy routine. */ +always_inline void +vlib_copy_buffers (u32 * dst, u32 * src, u32 n) +{ + while (n >= 4) + { + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + dst += 4; + src += 4; + n -= 4; + } + while (n > 0) + { + dst[0] = src[0]; + dst += 1; + src += 1; + n -= 1; + } +} + +always_inline void * +vlib_physmem_alloc_aligned (vlib_main_t * vm, clib_error_t ** error, + uword n_bytes, uword alignment) +{ + void *r = + vm->os_physmem_alloc_aligned (&vm->physmem_main, n_bytes, alignment); + if (!r) + *error = + clib_error_return (0, "failed to allocate %wd bytes of I/O memory", + n_bytes); + else + *error = 0; + return r; +} + +/* By default allocate I/O memory with cache line alignment. */ +always_inline void * +vlib_physmem_alloc (vlib_main_t * vm, clib_error_t ** error, uword n_bytes) +{ + return vlib_physmem_alloc_aligned (vm, error, n_bytes, + CLIB_CACHE_LINE_BYTES); +} + +always_inline void +vlib_physmem_free (vlib_main_t * vm, void *mem) +{ + return vm->os_physmem_free (mem); +} + +always_inline u64 +vlib_physmem_virtual_to_physical (vlib_main_t * vm, void *mem) +{ + vlib_physmem_main_t *pm = &vm->physmem_main; + uword o = pointer_to_uword (mem) - pm->virtual.start; + return vlib_physmem_offset_to_physical (pm, o); +} + +/* Append given data to end of buffer, possibly allocating new buffers. */ +u32 vlib_buffer_add_data (vlib_main_t * vm, + u32 free_list_index, + u32 buffer_index, void *data, u32 n_data_bytes); + +/* duplicate all buffers in chain */ +always_inline vlib_buffer_t * +vlib_buffer_copy (vlib_main_t * vm, vlib_buffer_t * b) +{ + vlib_buffer_t *s, *d, *fd; + uword n_alloc, n_buffers = 1; + u32 *new_buffers = 0; + u32 flag_mask = VLIB_BUFFER_NEXT_PRESENT | VLIB_BUFFER_TOTAL_LENGTH_VALID; + int i; + + s = b; + while (s->flags & VLIB_BUFFER_NEXT_PRESENT) + { + n_buffers++; + s = vlib_get_buffer (vm, s->next_buffer); + } + + vec_validate (new_buffers, n_buffers - 1); + n_alloc = vlib_buffer_alloc (vm, new_buffers, n_buffers); + ASSERT (n_alloc == n_buffers); + + /* 1st segment */ + s = b; + fd = d = vlib_get_buffer (vm, new_buffers[0]); + d->current_data = s->current_data; + d->current_length = s->current_length; + d->flags = s->flags & flag_mask; + d->total_length_not_including_first_buffer = + s->total_length_not_including_first_buffer; + clib_memcpy (d->opaque, s->opaque, sizeof (s->opaque)); + clib_memcpy (vlib_buffer_get_current (d), + vlib_buffer_get_current (s), s->current_length); + + /* next segments */ + for (i = 1; i < n_buffers; i++) + { + /* previous */ + d->next_buffer = new_buffers[i]; + /* current */ + s = vlib_get_buffer (vm, s->next_buffer); + d = vlib_get_buffer (vm, new_buffers[i]); + d->current_data = s->current_data; + d->current_length = s->current_length; + clib_memcpy (vlib_buffer_get_current (d), + vlib_buffer_get_current (s), s->current_length); + d->flags = s->flags & flag_mask; + } + + return fd; +} + +/* + * vlib_buffer_chain_* functions provide a way to create long buffers. + * When DPDK is enabled, the 'hidden' DPDK header is taken care of transparently. + */ + +/* Initializes the buffer as an empty packet with no chained buffers. */ +always_inline void +vlib_buffer_chain_init (vlib_buffer_t * first) +{ + first->total_length_not_including_first_buffer = 0; + first->current_length = 0; + first->flags &= ~VLIB_BUFFER_NEXT_PRESENT; + first->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; +} + +/* The provided next_bi buffer index is appended to the end of the packet. */ +always_inline vlib_buffer_t * +vlib_buffer_chain_buffer (vlib_main_t * vm, + vlib_buffer_t * first, + vlib_buffer_t * last, u32 next_bi) +{ + vlib_buffer_t *next_buffer = vlib_get_buffer (vm, next_bi); + last->next_buffer = next_bi; + last->flags |= VLIB_BUFFER_NEXT_PRESENT; + next_buffer->current_length = 0; + next_buffer->flags &= ~VLIB_BUFFER_NEXT_PRESENT; + return next_buffer; +} + +/* Increases or decreases the packet length. + * It does not allocate or deallocate new buffers. + * Therefore, the added length must be compatible + * with the last buffer. */ +always_inline void +vlib_buffer_chain_increase_length (vlib_buffer_t * first, + vlib_buffer_t * last, i32 len) +{ + last->current_length += len; + if (first != last) + first->total_length_not_including_first_buffer += len; +} + +/* Copy data to the end of the packet and increases its length. + * It does not allocate new buffers. + * Returns the number of copied bytes. */ +always_inline u16 +vlib_buffer_chain_append_data (vlib_main_t * vm, + u32 free_list_index, + vlib_buffer_t * first, + vlib_buffer_t * last, void *data, u16 data_len) +{ + u32 n_buffer_bytes = + vlib_buffer_free_list_buffer_size (vm, free_list_index); + ASSERT (n_buffer_bytes >= last->current_length + last->current_data); + u16 len = clib_min (data_len, + n_buffer_bytes - last->current_length - + last->current_data); + clib_memcpy (vlib_buffer_get_current (last) + last->current_length, data, + len); + vlib_buffer_chain_increase_length (first, last, len); + return len; +} + +/* Copy data to the end of the packet and increases its length. + * Allocates additional buffers from the free list if necessary. + * Returns the number of copied bytes. + * 'last' value is modified whenever new buffers are allocated and + * chained and points to the last buffer in the chain. */ +u16 +vlib_buffer_chain_append_data_with_alloc (vlib_main_t * vm, + u32 free_list_index, + vlib_buffer_t * first, + vlib_buffer_t ** last, + void *data, u16 data_len); +void vlib_buffer_chain_validate (vlib_main_t * vm, vlib_buffer_t * first); + +format_function_t format_vlib_buffer, format_vlib_buffer_and_data, + format_vlib_buffer_contents; + +typedef struct +{ + /* Vector of packet data. */ + u8 *packet_data; + + /* Note: the next three fields are unused if DPDK == 1 */ + + /* Number of buffers to allocate in each call to physmem + allocator. */ + u32 min_n_buffers_each_physmem_alloc; + + /* Buffer free list for this template. */ + u32 free_list_index; + + u32 *free_buffers; +} vlib_packet_template_t; + +void vlib_packet_template_get_packet_helper (vlib_main_t * vm, + vlib_packet_template_t * t); + +void vlib_packet_template_init (vlib_main_t * vm, + vlib_packet_template_t * t, + void *packet_data, + uword n_packet_data_bytes, + uword min_n_buffers_each_physmem_alloc, + char *fmt, ...); + +void *vlib_packet_template_get_packet (vlib_main_t * vm, + vlib_packet_template_t * t, + u32 * bi_result); + +always_inline void +vlib_packet_template_free (vlib_main_t * vm, vlib_packet_template_t * t) +{ + vec_free (t->packet_data); +} + +always_inline u32 +unserialize_vlib_buffer_n_bytes (serialize_main_t * m) +{ + serialize_stream_t *s = &m->stream; + vlib_serialize_buffer_main_t *sm + = uword_to_pointer (m->stream.data_function_opaque, + vlib_serialize_buffer_main_t *); + vlib_main_t *vm = sm->vlib_main; + u32 n, *f; + + n = s->n_buffer_bytes - s->current_buffer_index; + if (sm->last_buffer != ~0) + { + vlib_buffer_t *b = vlib_get_buffer (vm, sm->last_buffer); + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + { + b = vlib_get_buffer (vm, b->next_buffer); + n += b->current_length; + } + } + + /* *INDENT-OFF* */ + clib_fifo_foreach (f, sm->rx.buffer_fifo, ({ + n += vlib_buffer_index_length_in_chain (vm, f[0]); + })); +/* *INDENT-ON* */ + + return n; +} + +typedef union +{ + vlib_buffer_t b; + vlib_copy_unit_t i[sizeof (vlib_buffer_t) / sizeof (vlib_copy_unit_t)]; +} +vlib_buffer_union_t; + +/* Set a buffer quickly into "uninitialized" state. We want this to + be extremely cheap and arrange for all fields that need to be + initialized to be in the first 128 bits of the buffer. */ +always_inline void +vlib_buffer_init_for_free_list (vlib_buffer_t * _dst, + vlib_buffer_free_list_t * fl) +{ + vlib_buffer_union_t *dst = (vlib_buffer_union_t *) _dst; + vlib_buffer_union_t *src = + (vlib_buffer_union_t *) & fl->buffer_init_template; + + /* Make sure vlib_buffer_t is cacheline aligned and sized */ + ASSERT (STRUCT_OFFSET_OF (vlib_buffer_t, cacheline0) == 0); + ASSERT (STRUCT_OFFSET_OF (vlib_buffer_t, cacheline1) == + CLIB_CACHE_LINE_BYTES); + ASSERT (STRUCT_OFFSET_OF (vlib_buffer_t, cacheline2) == + CLIB_CACHE_LINE_BYTES * 2); + + /* Make sure buffer template is sane. */ + ASSERT (fl->index == fl->buffer_init_template.free_list_index); + + /* Copy template from src->current_data thru src->free_list_index */ + dst->i[0] = src->i[0]; + if (1 * sizeof (dst->i[0]) < 16) + dst->i[1] = src->i[1]; + if (2 * sizeof (dst->i[0]) < 16) + dst->i[2] = src->i[2]; + + /* Make sure it really worked. */ +#define _(f) ASSERT (dst->b.f == src->b.f) + _(current_data); + _(current_length); + _(flags); + _(free_list_index); +#undef _ + ASSERT (dst->b.total_length_not_including_first_buffer == 0); +} + +always_inline void +vlib_buffer_init_two_for_free_list (vlib_buffer_t * _dst0, + vlib_buffer_t * _dst1, + vlib_buffer_free_list_t * fl) +{ + vlib_buffer_union_t *dst0 = (vlib_buffer_union_t *) _dst0; + vlib_buffer_union_t *dst1 = (vlib_buffer_union_t *) _dst1; + vlib_buffer_union_t *src = + (vlib_buffer_union_t *) & fl->buffer_init_template; + + /* Make sure buffer template is sane. */ + ASSERT (fl->index == fl->buffer_init_template.free_list_index); + + /* Copy template from src->current_data thru src->free_list_index */ + dst0->i[0] = dst1->i[0] = src->i[0]; + if (1 * sizeof (dst0->i[0]) < 16) + dst0->i[1] = dst1->i[1] = src->i[1]; + if (2 * sizeof (dst0->i[0]) < 16) + dst0->i[2] = dst1->i[2] = src->i[2]; + + /* Make sure it really worked. */ +#define _(f) ASSERT (dst0->b.f == src->b.f && dst1->b.f == src->b.f) + _(current_data); + _(current_length); + _(flags); + _(free_list_index); +#undef _ + ASSERT (dst0->b.total_length_not_including_first_buffer == 0); + ASSERT (dst1->b.total_length_not_including_first_buffer == 0); +} + +#if CLIB_DEBUG > 0 +extern u32 *vlib_buffer_state_validation_lock; +extern uword *vlib_buffer_state_validation_hash; +extern void *vlib_buffer_state_heap; +#endif + +static inline void +vlib_validate_buffer_in_use (vlib_buffer_t * b, u32 expected) +{ +#if CLIB_DEBUG > 0 + uword *p; + void *oldheap; + + oldheap = clib_mem_set_heap (vlib_buffer_state_heap); + + while (__sync_lock_test_and_set (vlib_buffer_state_validation_lock, 1)) + ; + + p = hash_get (vlib_buffer_state_validation_hash, b); + + /* If we don't know about b, declare it to be in the expected state */ + if (!p) + { + hash_set (vlib_buffer_state_validation_hash, b, expected); + goto out; + } + + if (p[0] != expected) + { + void cj_stop (void); + u32 bi; + vlib_main_t *vm = &vlib_global_main; + + cj_stop (); + + bi = vlib_get_buffer_index (vm, b); + + clib_mem_set_heap (oldheap); + clib_warning ("%.6f buffer %llx (%d): %s, not %s", + vlib_time_now (vm), bi, + p[0] ? "busy" : "free", expected ? "busy" : "free"); + os_panic (); + } +out: + CLIB_MEMORY_BARRIER (); + *vlib_buffer_state_validation_lock = 0; + clib_mem_set_heap (oldheap); +#endif +} + +static inline void +vlib_validate_buffer_set_in_use (vlib_buffer_t * b, u32 expected) +{ +#if CLIB_DEBUG > 0 + void *oldheap; + + oldheap = clib_mem_set_heap (vlib_buffer_state_heap); + + while (__sync_lock_test_and_set (vlib_buffer_state_validation_lock, 1)) + ; + + hash_set (vlib_buffer_state_validation_hash, b, expected); + + CLIB_MEMORY_BARRIER (); + *vlib_buffer_state_validation_lock = 0; + clib_mem_set_heap (oldheap); +#endif +} + +#endif /* included_vlib_buffer_funcs_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/buffer_node.h b/src/vlib/buffer_node.h new file mode 100644 index 00000000..8a779049 --- /dev/null +++ b/src/vlib/buffer_node.h @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * buffer_node.h: VLIB buffer handling node helper macros/inlines + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_buffer_node_h +#define included_vlib_buffer_node_h + +/** \file + vlib buffer/node functions +*/ + +/** \brief Finish enqueueing two buffers forward in the graph. + Standard dual loop boilerplate element. This is a MACRO, + with MULTIPLE SIDE EFFECTS. In the ideal case, + next_index == next0 == next1, + which means that the speculative enqueue at the top of the dual loop + has correctly dealt with both packets. In that case, the macro does + nothing at all. + + @param vm vlib_main_t pointer, varies by thread + @param node current node vlib_node_runtime_t pointer + @param next_index speculated next index used for both packets + @param to_next speculated vector pointer used for both packets + @param n_left_to_next number of slots left in speculated vector + @param bi0 first buffer index + @param bi1 second buffer index + @param next0 actual next index to be used for the first packet + @param next1 actual next index to be used for the second packet + + @return @c next_index -- speculative next index to be used for future packets + @return @c to_next -- speculative frame to be used for future packets + @return @c n_left_to_next -- number of slots left in speculative frame +*/ + +#define vlib_validate_buffer_enqueue_x2(vm,node,next_index,to_next,n_left_to_next,bi0,bi1,next0,next1) \ +do { \ + int enqueue_code = (next0 != next_index) + 2*(next1 != next_index); \ + \ + if (PREDICT_FALSE (enqueue_code != 0)) \ + { \ + switch (enqueue_code) \ + { \ + case 1: \ + /* A B A */ \ + to_next[-2] = bi1; \ + to_next -= 1; \ + n_left_to_next += 1; \ + vlib_set_next_frame_buffer (vm, node, next0, bi0); \ + break; \ + \ + case 2: \ + /* A A B */ \ + to_next -= 1; \ + n_left_to_next += 1; \ + vlib_set_next_frame_buffer (vm, node, next1, bi1); \ + break; \ + \ + case 3: \ + /* A B B or A B C */ \ + to_next -= 2; \ + n_left_to_next += 2; \ + vlib_set_next_frame_buffer (vm, node, next0, bi0); \ + vlib_set_next_frame_buffer (vm, node, next1, bi1); \ + if (next0 == next1) \ + { \ + vlib_put_next_frame (vm, node, next_index, \ + n_left_to_next); \ + next_index = next1; \ + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); \ + } \ + } \ + } \ +} while (0) + + +/** \brief Finish enqueueing four buffers forward in the graph. + Standard quad loop boilerplate element. This is a MACRO, + with MULTIPLE SIDE EFFECTS. In the ideal case, + next_index == next0 == next1 == next2 == next3, + which means that the speculative enqueue at the top of the quad loop + has correctly dealt with all four packets. In that case, the macro does + nothing at all. + + @param vm vlib_main_t pointer, varies by thread + @param node current node vlib_node_runtime_t pointer + @param next_index speculated next index used for both packets + @param to_next speculated vector pointer used for both packets + @param n_left_to_next number of slots left in speculated vector + @param bi0 first buffer index + @param bi1 second buffer index + @param bi2 third buffer index + @param bi3 fourth buffer index + @param next0 actual next index to be used for the first packet + @param next1 actual next index to be used for the second packet + @param next2 actual next index to be used for the third packet + @param next3 actual next index to be used for the fourth packet + + @return @c next_index -- speculative next index to be used for future packets + @return @c to_next -- speculative frame to be used for future packets + @return @c n_left_to_next -- number of slots left in speculative frame +*/ + +#define vlib_validate_buffer_enqueue_x4(vm,node,next_index,to_next,n_left_to_next,bi0,bi1,bi2,bi3,next0,next1,next2,next3) \ +do { \ + /* After the fact: check the [speculative] enqueue to "next" */ \ + u32 fix_speculation = next_index != next0 || next_index != next1 \ + || next_index != next2 || next_index != next3; \ + if (PREDICT_FALSE(fix_speculation)) \ + { \ + /* rewind... */ \ + to_next -= 4; \ + n_left_to_next += 4; \ + \ + /* If bi0 belongs to "next", send it there */ \ + if (next_index == next0) \ + { \ + to_next[0] = bi0; \ + to_next++; \ + n_left_to_next --; \ + } \ + else /* send it where it needs to go */ \ + vlib_set_next_frame_buffer (vm, node, next0, bi0); \ + \ + if (next_index == next1) \ + { \ + to_next[0] = bi1; \ + to_next++; \ + n_left_to_next --; \ + } \ + else \ + vlib_set_next_frame_buffer (vm, node, next1, bi1); \ + \ + if (next_index == next2) \ + { \ + to_next[0] = bi2; \ + to_next++; \ + n_left_to_next --; \ + } \ + else \ + vlib_set_next_frame_buffer (vm, node, next2, bi2); \ + \ + if (next_index == next3) \ + { \ + to_next[0] = bi3; \ + to_next++; \ + n_left_to_next --; \ + } \ + else \ + vlib_set_next_frame_buffer (vm, node, next3, bi3); \ + \ + /* Change speculation: last 2 packets went to the same node */ \ + if (next2 == next3) \ + { \ + vlib_put_next_frame (vm, node, next_index, n_left_to_next); \ + next_index = next3; \ + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); \ + } \ + } \ + } while(0); + +/** \brief Finish enqueueing one buffer forward in the graph. + Standard single loop boilerplate element. This is a MACRO, + with MULTIPLE SIDE EFFECTS. In the ideal case, + next_index == next0, + which means that the speculative enqueue at the top of the single loop + has correctly dealt with the packet in hand. In that case, the macro does + nothing at all. + + @param vm vlib_main_t pointer, varies by thread + @param node current node vlib_node_runtime_t pointer + @param next_index speculated next index used for both packets + @param to_next speculated vector pointer used for both packets + @param n_left_to_next number of slots left in speculated vector + @param bi0 first buffer index + @param next0 actual next index to be used for the first packet + + @return @c next_index -- speculative next index to be used for future packets + @return @c to_next -- speculative frame to be used for future packets + @return @c n_left_to_next -- number of slots left in speculative frame +*/ +#define vlib_validate_buffer_enqueue_x1(vm,node,next_index,to_next,n_left_to_next,bi0,next0) \ +do { \ + if (PREDICT_FALSE (next0 != next_index)) \ + { \ + vlib_put_next_frame (vm, node, next_index, n_left_to_next + 1); \ + next_index = next0; \ + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); \ + \ + to_next[0] = bi0; \ + to_next += 1; \ + n_left_to_next -= 1; \ + } \ +} while (0) + +always_inline uword +generic_buffer_node_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + uword sizeof_trace, + void *opaque1, + uword opaque2, + void (*two_buffers) (vlib_main_t * vm, + void *opaque1, + uword opaque2, + vlib_buffer_t * b0, + vlib_buffer_t * b1, + u32 * next0, u32 * next1), + void (*one_buffer) (vlib_main_t * vm, + void *opaque1, uword opaque2, + vlib_buffer_t * b0, + u32 * next0)) +{ + u32 n_left_from, *from, *to_next; + u32 next_index; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + if (node->flags & VLIB_NODE_FLAG_TRACE) + vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors, + /* stride */ 1, sizeof_trace); + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + vlib_buffer_t *p0, *p1; + u32 pi0, next0; + u32 pi1, next1; + + /* Prefetch next iteration. */ + { + vlib_buffer_t *p2, *p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, 64, LOAD); + CLIB_PREFETCH (p3->data, 64, LOAD); + } + + pi0 = to_next[0] = from[0]; + pi1 = to_next[1] = from[1]; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + p0 = vlib_get_buffer (vm, pi0); + p1 = vlib_get_buffer (vm, pi1); + + two_buffers (vm, opaque1, opaque2, p0, p1, &next0, &next1); + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + pi0, pi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t *p0; + u32 pi0, next0; + + pi0 = from[0]; + to_next[0] = pi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, pi0); + + one_buffer (vm, opaque1, opaque2, p0, &next0); + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + pi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +#endif /* included_vlib_buffer_node_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/cli.c b/src/vlib/cli.c new file mode 100644 index 00000000..2d141115 --- /dev/null +++ b/src/vlib/cli.c @@ -0,0 +1,1173 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * cli.c: command line interface + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include + +/* Root of all show commands. */ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (vlib_cli_show_command, static) = { + .path = "show", + .short_help = "Show commands", +}; +/* *INDENT-ON* */ + +/* Root of all clear commands. */ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (vlib_cli_clear_command, static) = { + .path = "clear", + .short_help = "Clear commands", +}; +/* *INDENT-ON* */ + +/* Root of all set commands. */ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (vlib_cli_set_command, static) = { + .path = "set", + .short_help = "Set commands", +}; +/* *INDENT-ON* */ + +/* Root of all test commands. */ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (vlib_cli_test_command, static) = { + .path = "test", + .short_help = "Test commands", +}; +/* *INDENT-ON* */ + +/* Returns bitmap of commands which match key. */ +static uword * +vlib_cli_sub_command_match (vlib_cli_command_t * c, unformat_input_t * input) +{ + int i, n; + uword *match = 0; + vlib_cli_parse_position_t *p; + + unformat_skip_white_space (input); + + for (i = 0;; i++) + { + uword k; + + k = unformat_get_input (input); + switch (k) + { + case 'a' ... 'z': + case 'A' ... 'Z': + case '0' ... '9': + case '-': + case '_': + break; + + case ' ': + case '\t': + case '\r': + case '\n': + case UNFORMAT_END_OF_INPUT: + /* White space or end of input removes any non-white + matches that were before possible. */ + if (i < vec_len (c->sub_command_positions) + && clib_bitmap_count_set_bits (match) > 1) + { + p = vec_elt_at_index (c->sub_command_positions, i); + for (n = 0; n < vec_len (p->bitmaps); n++) + match = clib_bitmap_andnot (match, p->bitmaps[n]); + } + goto done; + + default: + unformat_put_input (input); + goto done; + } + + if (i >= vec_len (c->sub_command_positions)) + { + no_match: + clib_bitmap_free (match); + return 0; + } + + p = vec_elt_at_index (c->sub_command_positions, i); + if (vec_len (p->bitmaps) == 0) + goto no_match; + + n = k - p->min_char; + if (n < 0 || n >= vec_len (p->bitmaps)) + goto no_match; + + if (i == 0) + match = clib_bitmap_dup (p->bitmaps[n]); + else + match = clib_bitmap_and (match, p->bitmaps[n]); + + if (clib_bitmap_is_zero (match)) + goto no_match; + } + +done: + return match; +} + +/* Looks for string based sub-input formatted { SUB-INPUT }. */ +uword +unformat_vlib_cli_sub_input (unformat_input_t * i, va_list * args) +{ + unformat_input_t *sub_input = va_arg (*args, unformat_input_t *); + u8 *s; + uword c; + + while (1) + { + c = unformat_get_input (i); + switch (c) + { + case ' ': + case '\t': + case '\n': + case '\r': + case '\f': + break; + + case '{': + default: + /* Put back paren. */ + if (c != UNFORMAT_END_OF_INPUT) + unformat_put_input (i); + + if (c == '{' && unformat (i, "%v", &s)) + { + unformat_init_vector (sub_input, s); + return 1; + } + return 0; + } + } + return 0; +} + +static vlib_cli_command_t * +get_sub_command (vlib_cli_main_t * cm, vlib_cli_command_t * parent, u32 si) +{ + vlib_cli_sub_command_t *s = vec_elt_at_index (parent->sub_commands, si); + return vec_elt_at_index (cm->commands, s->index); +} + +static uword +unformat_vlib_cli_sub_command (unformat_input_t * i, va_list * args) +{ + vlib_main_t *vm = va_arg (*args, vlib_main_t *); + vlib_cli_command_t *c = va_arg (*args, vlib_cli_command_t *); + vlib_cli_command_t **result = va_arg (*args, vlib_cli_command_t **); + vlib_cli_main_t *cm = &vm->cli_main; + uword *match_bitmap, is_unique, index; + + { + vlib_cli_sub_rule_t *sr; + vlib_cli_parse_rule_t *r; + vec_foreach (sr, c->sub_rules) + { + void **d; + r = vec_elt_at_index (cm->parse_rules, sr->rule_index); + vec_add2 (cm->parse_rule_data, d, 1); + vec_reset_length (d[0]); + if (r->data_size) + d[0] = _vec_resize (d[0], + /* length increment */ 1, + r->data_size, + /* header_bytes */ 0, + /* data align */ sizeof (uword)); + if (unformat_user (i, r->unformat_function, vm, d[0])) + { + *result = vec_elt_at_index (cm->commands, sr->command_index); + return 1; + } + } + } + + match_bitmap = vlib_cli_sub_command_match (c, i); + is_unique = clib_bitmap_count_set_bits (match_bitmap) == 1; + index = ~0; + if (is_unique) + { + index = clib_bitmap_first_set (match_bitmap); + *result = get_sub_command (cm, c, index); + } + clib_bitmap_free (match_bitmap); + + return is_unique; +} + +static u8 * +format_vlib_cli_command_help (u8 * s, va_list * args) +{ + vlib_cli_command_t *c = va_arg (*args, vlib_cli_command_t *); + int is_long = va_arg (*args, int); + if (is_long && c->long_help) + s = format (s, "%s", c->long_help); + else if (c->short_help) + s = format (s, "%s", c->short_help); + else + s = format (s, "%v commands", c->path); + return s; +} + +static u8 * +format_vlib_cli_parse_rule_name (u8 * s, va_list * args) +{ + vlib_cli_parse_rule_t *r = va_arg (*args, vlib_cli_parse_rule_t *); + return format (s, "<%U>", format_c_identifier, r->name); +} + +static u8 * +format_vlib_cli_path (u8 * s, va_list * args) +{ + u8 *path = va_arg (*args, u8 *); + int i, in_rule; + in_rule = 0; + for (i = 0; i < vec_len (path); i++) + { + switch (path[i]) + { + case '%': + in_rule = 1; + vec_add1 (s, '<'); /* start of */ + break; + + case '_': + /* _ -> space in rules. */ + vec_add1 (s, in_rule ? ' ' : '_'); + break; + + case ' ': + if (in_rule) + { + vec_add1 (s, '>'); /* end of */ + in_rule = 0; + } + vec_add1 (s, ' '); + break; + + default: + vec_add1 (s, path[i]); + break; + } + } + + if (in_rule) + vec_add1 (s, '>'); /* terminate */ + + return s; +} + +static vlib_cli_command_t * +all_subs (vlib_cli_main_t * cm, vlib_cli_command_t * subs, u32 command_index) +{ + vlib_cli_command_t *c = vec_elt_at_index (cm->commands, command_index); + vlib_cli_sub_command_t *sc; + vlib_cli_sub_rule_t *sr; + + if (c->function) + vec_add1 (subs, c[0]); + + vec_foreach (sr, c->sub_rules) + subs = all_subs (cm, subs, sr->command_index); + vec_foreach (sc, c->sub_commands) subs = all_subs (cm, subs, sc->index); + + return subs; +} + +static int +vlib_cli_cmp_rule (void *a1, void *a2) +{ + vlib_cli_sub_rule_t *r1 = a1; + vlib_cli_sub_rule_t *r2 = a2; + + return vec_cmp (r1->name, r2->name); +} + +static int +vlib_cli_cmp_command (void *a1, void *a2) +{ + vlib_cli_command_t *c1 = a1; + vlib_cli_command_t *c2 = a2; + + return vec_cmp (c1->path, c2->path); +} + +static clib_error_t * +vlib_cli_dispatch_sub_commands (vlib_main_t * vm, + vlib_cli_main_t * cm, + unformat_input_t * input, + uword parent_command_index) +{ + vlib_cli_command_t *parent, *c; + clib_error_t *error = 0; + unformat_input_t sub_input; + u8 *string; + uword is_main_dispatch = cm == &vm->cli_main; + + parent = vec_elt_at_index (cm->commands, parent_command_index); + if (is_main_dispatch && unformat (input, "help")) + { + uword help_at_end_of_line, i; + + help_at_end_of_line = + unformat_check_input (input) == UNFORMAT_END_OF_INPUT; + while (1) + { + c = parent; + if (unformat_user + (input, unformat_vlib_cli_sub_command, vm, c, &parent)) + ; + + else if (!(unformat_check_input (input) == UNFORMAT_END_OF_INPUT)) + goto unknown; + + else + break; + } + + /* help SUB-COMMAND => long format help. + "help" at end of line: show all commands. */ + if (!help_at_end_of_line) + vlib_cli_output (vm, "%U", format_vlib_cli_command_help, c, + /* is_long */ 1); + + else if (vec_len (c->sub_commands) + vec_len (c->sub_rules) == 0) + vlib_cli_output (vm, "%v: no sub-commands", c->path); + + else + { + vlib_cli_sub_command_t *sc; + vlib_cli_sub_rule_t *sr, *subs; + + subs = vec_dup (c->sub_rules); + + /* Add in rules if any. */ + vec_foreach (sc, c->sub_commands) + { + vec_add2 (subs, sr, 1); + sr->name = sc->name; + sr->command_index = sc->index; + sr->rule_index = ~0; + } + + vec_sort_with_function (subs, vlib_cli_cmp_rule); + + for (i = 0; i < vec_len (subs); i++) + { + vlib_cli_command_t *d; + vlib_cli_parse_rule_t *r; + + d = vec_elt_at_index (cm->commands, subs[i].command_index); + r = + subs[i].rule_index != ~0 ? vec_elt_at_index (cm->parse_rules, + subs + [i].rule_index) : + 0; + + if (r) + vlib_cli_output + (vm, " %-30U %U", + format_vlib_cli_parse_rule_name, r, + format_vlib_cli_command_help, d, /* is_long */ 0); + else + vlib_cli_output + (vm, " %-30v %U", + subs[i].name, + format_vlib_cli_command_help, d, /* is_long */ 0); + } + + vec_free (subs); + } + } + + else if (is_main_dispatch + && (unformat (input, "choices") || unformat (input, "?"))) + { + vlib_cli_command_t *sub, *subs; + + subs = all_subs (cm, 0, parent_command_index); + vec_sort_with_function (subs, vlib_cli_cmp_command); + vec_foreach (sub, subs) + vlib_cli_output (vm, " %-40U %U", + format_vlib_cli_path, sub->path, + format_vlib_cli_command_help, sub, /* is_long */ 0); + vec_free (subs); + } + + else if (unformat (input, "comment %v", &string)) + { + vec_free (string); + } + + else if (unformat (input, "uncomment %U", + unformat_vlib_cli_sub_input, &sub_input)) + { + error = + vlib_cli_dispatch_sub_commands (vm, cm, &sub_input, + parent_command_index); + unformat_free (&sub_input); + } + + else + if (unformat_user (input, unformat_vlib_cli_sub_command, vm, parent, &c)) + { + unformat_input_t *si; + uword has_sub_commands = + vec_len (c->sub_commands) + vec_len (c->sub_rules) > 0; + + si = input; + if (unformat_user (input, unformat_vlib_cli_sub_input, &sub_input)) + si = &sub_input; + + if (has_sub_commands) + error = vlib_cli_dispatch_sub_commands (vm, cm, si, c - cm->commands); + + if (has_sub_commands && !error) + /* Found valid sub-command. */ ; + + else if (c->function) + { + clib_error_t *c_error; + + /* Skip white space for benefit of called function. */ + unformat_skip_white_space (si); + + if (unformat (si, "?")) + { + vlib_cli_output (vm, " %-40U %U", format_vlib_cli_path, c->path, format_vlib_cli_command_help, c, /* is_long */ + 0); + } + else + { + if (!c->is_mp_safe) + vlib_worker_thread_barrier_sync (vm); + + c_error = c->function (vm, si, c); + + if (!c->is_mp_safe) + vlib_worker_thread_barrier_release (vm); + + if (c_error) + { + error = + clib_error_return (0, "%v: %v", c->path, c_error->what); + clib_error_free (c_error); + /* Free sub input. */ + if (si != input) + unformat_free (si); + + return error; + } + } + + /* Free any previous error. */ + clib_error_free (error); + } + + else if (!error) + error = clib_error_return (0, "%v: no sub-commands", c->path); + + /* Free sub input. */ + if (si != input) + unformat_free (si); + } + + else + goto unknown; + + return error; + +unknown: + if (parent->path) + return clib_error_return (0, "%v: unknown input `%U'", parent->path, + format_unformat_error, input); + else + return clib_error_return (0, "unknown input `%U'", format_unformat_error, + input); +} + + +void vlib_unix_error_report (vlib_main_t *, clib_error_t *) + __attribute__ ((weak)); + +void +vlib_unix_error_report (vlib_main_t * vm, clib_error_t * error) +{ +} + +/* Process CLI input. */ +void +vlib_cli_input (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_output_function_t * function, uword function_arg) +{ + vlib_process_t *cp = vlib_get_current_process (vm); + vlib_cli_main_t *cm = &vm->cli_main; + clib_error_t *error; + vlib_cli_output_function_t *save_function; + uword save_function_arg; + + save_function = cp->output_function; + save_function_arg = cp->output_function_arg; + + cp->output_function = function; + cp->output_function_arg = function_arg; + + do + { + vec_reset_length (cm->parse_rule_data); + error = vlib_cli_dispatch_sub_commands (vm, &vm->cli_main, input, /* parent */ + 0); + } + while (!error && !unformat (input, "%U", unformat_eof)); + + if (error) + { + vlib_cli_output (vm, "%v", error->what); + vlib_unix_error_report (vm, error); + clib_error_free (error); + } + + cp->output_function = save_function; + cp->output_function_arg = save_function_arg; +} + +/* Output to current CLI connection. */ +void +vlib_cli_output (vlib_main_t * vm, char *fmt, ...) +{ + vlib_process_t *cp = vlib_get_current_process (vm); + va_list va; + u8 *s; + + va_start (va, fmt); + s = va_format (0, fmt, &va); + va_end (va); + + /* Terminate with \n if not present. */ + if (vec_len (s) > 0 && s[vec_len (s) - 1] != '\n') + vec_add1 (s, '\n'); + + if ((!cp) || (!cp->output_function)) + fformat (stdout, "%v", s); + else + cp->output_function (cp->output_function_arg, s, vec_len (s)); + + vec_free (s); +} + +static clib_error_t * +show_memory_usage (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + int verbose = 0; + clib_error_t *error; + u32 index = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose")) + verbose = 1; + else + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + return error; + } + } + + /* *INDENT-OFF* */ + foreach_vlib_main ( + ({ + vlib_cli_output (vm, "Thread %d %v\n", index, vlib_worker_threads[index].name); + vlib_cli_output (vm, "%U\n", format_mheap, clib_per_cpu_mheaps[index], verbose); + index++; + })); + /* *INDENT-ON* */ + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_memory_usage_command, static) = { + .path = "show memory", + .short_help = "Show current memory usage", + .function = show_memory_usage, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_cpu (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ +#define _(a,b,c) vlib_cli_output (vm, "%-25s " b, a ":", c); + _("Model name", "%U", format_cpu_model_name); + _("Microarchitecture", "%U", format_cpu_uarch); + _("Flags", "%U", format_cpu_flags); + _("Base frequency", "%.2f GHz", + ((f64) vm->clib_time.clocks_per_second) * 1e-9); +#undef _ + return 0; +} + +/*? + * Displays various information about the CPU. + * + * @cliexpar + * @cliexstart{show cpu} + * Model name: Intel(R) Xeon(R) CPU E5-2667 v4 @ 3.20GHz + * Microarchitecture: Broadwell (Broadwell-EP/EX) + * Flags: sse3 ssse3 sse41 sse42 avx avx2 aes + * Base Frequency: 3.20 GHz + * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_cpu_command, static) = { + .path = "show cpu", + .short_help = "Show cpu information", + .function = show_cpu, +}; + +/* *INDENT-ON* */ +static clib_error_t * +enable_disable_memory_trace (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + clib_error_t *error = 0; + int enable; + + if (!unformat_user (input, unformat_vlib_enable_disable, &enable)) + { + error = clib_error_return (0, "expecting enable/on or disable/off"); + goto done; + } + + clib_mem_trace (enable); + +done: + return error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (enable_disable_memory_trace_command, static) = { + .path = "memory-trace", + .short_help = "Enable/disable memory allocation trace", + .function = enable_disable_memory_trace, +}; +/* *INDENT-ON* */ + + +static clib_error_t * +test_heap_validate (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + clib_error_t *error = 0; + void *heap; + mheap_t *mheap; + + if (unformat (input, "on")) + { + /* *INDENT-OFF* */ + foreach_vlib_main({ + heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + mheap = mheap_header(heap); + mheap->flags |= MHEAP_FLAG_VALIDATE; + // Turn off small object cache because it delays detection of errors + mheap->flags &= ~MHEAP_FLAG_SMALL_OBJECT_CACHE; + }); + /* *INDENT-ON* */ + + } + else if (unformat (input, "off")) + { + /* *INDENT-OFF* */ + foreach_vlib_main({ + heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + mheap = mheap_header(heap); + mheap->flags &= ~MHEAP_FLAG_VALIDATE; + mheap->flags |= MHEAP_FLAG_SMALL_OBJECT_CACHE; + }); + /* *INDENT-ON* */ + } + else if (unformat (input, "now")) + { + /* *INDENT-OFF* */ + foreach_vlib_main({ + heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + mheap = mheap_header(heap); + mheap_validate(heap); + }); + /* *INDENT-ON* */ + vlib_cli_output (vm, "heap validation complete"); + + } + else + { + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + return error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_test_heap_validate,static) = { + .path = "test heap-validate", + .short_help = " validate heap on future allocs/frees or right now", + .function = test_heap_validate, +}; +/* *INDENT-ON* */ + +#ifdef TEST_CODE +/* + * A trivial test harness to verify the per-process output_function + * is working correcty. + */ + +static clib_error_t * +sleep_ten_seconds (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + u16 i; + u16 my_id = rand (); + + vlib_cli_output (vm, "Starting 10 seconds sleep with id %u\n", my_id); + + for (i = 0; i < 10; i++) + { + vlib_process_wait_for_event_or_clock (vm, 1.0); + vlib_cli_output (vm, "Iteration number %u, my id: %u\n", i, my_id); + } + vlib_cli_output (vm, "Done with sleep with id %u\n", my_id); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (ping_command, static) = { + .path = "test sleep", + .function = sleep_ten_seconds, + .short_help = "Sleep for 10 seconds", +}; +/* *INDENT-ON* */ +#endif /* ifdef TEST_CODE */ + +static uword +vlib_cli_normalize_path (char *input, char **result) +{ + char *i = input; + char *s = 0; + uword l = 0; + uword index_of_last_space = ~0; + + while (*i != 0) + { + u8 c = *i++; + /* Multiple white space -> single space. */ + switch (c) + { + case ' ': + case '\t': + case '\n': + case '\r': + if (l > 0 && s[l - 1] != ' ') + { + vec_add1 (s, ' '); + l++; + } + break; + + default: + if (l > 0 && s[l - 1] == ' ') + index_of_last_space = vec_len (s); + vec_add1 (s, c); + l++; + break; + } + } + + /* Remove any extra space at end. */ + if (l > 0 && s[l - 1] == ' ') + _vec_len (s) -= 1; + + *result = s; + return index_of_last_space; +} + +always_inline uword +parent_path_len (char *path) +{ + word i; + for (i = vec_len (path) - 1; i >= 0; i--) + { + if (path[i] == ' ') + return i; + } + return ~0; +} + +static void +add_sub_command (vlib_cli_main_t * cm, uword parent_index, uword child_index) +{ + vlib_cli_command_t *p, *c; + vlib_cli_sub_command_t *sub_c; + u8 *sub_name; + word i, l; + + p = vec_elt_at_index (cm->commands, parent_index); + c = vec_elt_at_index (cm->commands, child_index); + + l = parent_path_len (c->path); + if (l == ~0) + sub_name = vec_dup ((u8 *) c->path); + else + { + ASSERT (l + 1 < vec_len (c->path)); + sub_name = 0; + vec_add (sub_name, c->path + l + 1, vec_len (c->path) - (l + 1)); + } + + if (sub_name[0] == '%') + { + uword *q; + vlib_cli_sub_rule_t *sr; + + /* Remove %. */ + vec_delete (sub_name, 1, 0); + + if (!p->sub_rule_index_by_name) + p->sub_rule_index_by_name = hash_create_vec ( /* initial length */ 32, + sizeof (sub_name[0]), + sizeof (uword)); + q = hash_get_mem (p->sub_rule_index_by_name, sub_name); + if (q) + { + sr = vec_elt_at_index (p->sub_rules, q[0]); + ASSERT (sr->command_index == child_index); + return; + } + + q = hash_get_mem (cm->parse_rule_index_by_name, sub_name); + if (!q) + { + clib_error ("reference to unknown rule `%%%v' in path `%v'", + sub_name, c->path); + return; + } + + hash_set_mem (p->sub_rule_index_by_name, sub_name, + vec_len (p->sub_rules)); + vec_add2 (p->sub_rules, sr, 1); + sr->name = sub_name; + sr->rule_index = q[0]; + sr->command_index = child_index; + return; + } + + if (!p->sub_command_index_by_name) + p->sub_command_index_by_name = hash_create_vec ( /* initial length */ 32, + sizeof (c->path[0]), + sizeof (uword)); + + /* Check if sub-command has already been created. */ + if (hash_get_mem (p->sub_command_index_by_name, sub_name)) + { + vec_free (sub_name); + return; + } + + vec_add2 (p->sub_commands, sub_c, 1); + sub_c->index = child_index; + sub_c->name = sub_name; + hash_set_mem (p->sub_command_index_by_name, sub_c->name, + sub_c - p->sub_commands); + + vec_validate (p->sub_command_positions, vec_len (sub_c->name) - 1); + for (i = 0; i < vec_len (sub_c->name); i++) + { + int n; + vlib_cli_parse_position_t *pos; + + pos = vec_elt_at_index (p->sub_command_positions, i); + + if (!pos->bitmaps) + pos->min_char = sub_c->name[i]; + + n = sub_c->name[i] - pos->min_char; + if (n < 0) + { + pos->min_char = sub_c->name[i]; + vec_insert (pos->bitmaps, -n, 0); + n = 0; + } + + vec_validate (pos->bitmaps, n); + pos->bitmaps[n] = + clib_bitmap_ori (pos->bitmaps[n], sub_c - p->sub_commands); + } +} + +static void +vlib_cli_make_parent (vlib_cli_main_t * cm, uword ci) +{ + uword p_len, pi, *p; + char *p_path; + vlib_cli_command_t *c, *parent; + + /* Root command (index 0) should have already been added. */ + ASSERT (vec_len (cm->commands) > 0); + + c = vec_elt_at_index (cm->commands, ci); + p_len = parent_path_len (c->path); + + /* No space? Parent is root command. */ + if (p_len == ~0) + { + add_sub_command (cm, 0, ci); + return; + } + + p_path = 0; + vec_add (p_path, c->path, p_len); + + p = hash_get_mem (cm->command_index_by_path, p_path); + + /* Parent exists? */ + if (!p) + { + /* Parent does not exist; create it. */ + vec_add2 (cm->commands, parent, 1); + parent->path = p_path; + hash_set_mem (cm->command_index_by_path, parent->path, + parent - cm->commands); + pi = parent - cm->commands; + } + else + { + pi = p[0]; + vec_free (p_path); + } + + add_sub_command (cm, pi, ci); + + /* Create parent's parent. */ + if (!p) + vlib_cli_make_parent (cm, pi); +} + +always_inline uword +vlib_cli_command_is_empty (vlib_cli_command_t * c) +{ + return (c->long_help == 0 && c->short_help == 0 && c->function == 0); +} + +clib_error_t * +vlib_cli_register (vlib_main_t * vm, vlib_cli_command_t * c) +{ + vlib_cli_main_t *cm = &vm->cli_main; + clib_error_t *error = 0; + uword ci, *p; + char *normalized_path; + + if ((error = vlib_call_init_function (vm, vlib_cli_init))) + return error; + + (void) vlib_cli_normalize_path (c->path, &normalized_path); + + if (!cm->command_index_by_path) + cm->command_index_by_path = hash_create_vec ( /* initial length */ 32, + sizeof (c->path[0]), + sizeof (uword)); + + /* See if command already exists with given path. */ + p = hash_get_mem (cm->command_index_by_path, normalized_path); + if (p) + { + vlib_cli_command_t *d; + + ci = p[0]; + d = vec_elt_at_index (cm->commands, ci); + + /* If existing command was created via vlib_cli_make_parent + replaced it with callers data. */ + if (vlib_cli_command_is_empty (d)) + { + vlib_cli_command_t save = d[0]; + + ASSERT (!vlib_cli_command_is_empty (c)); + + /* Copy callers fields. */ + d[0] = c[0]; + + /* Save internal fields. */ + d->path = save.path; + d->sub_commands = save.sub_commands; + d->sub_command_index_by_name = save.sub_command_index_by_name; + d->sub_command_positions = save.sub_command_positions; + d->sub_rules = save.sub_rules; + } + else + error = + clib_error_return (0, "duplicate command name with path %v", + normalized_path); + + vec_free (normalized_path); + if (error) + return error; + } + else + { + /* Command does not exist: create it. */ + + /* Add root command (index 0). */ + if (vec_len (cm->commands) == 0) + { + /* Create command with index 0; path is empty string. */ + vec_resize (cm->commands, 1); + } + + ci = vec_len (cm->commands); + hash_set_mem (cm->command_index_by_path, normalized_path, ci); + vec_add1 (cm->commands, c[0]); + + c = vec_elt_at_index (cm->commands, ci); + c->path = normalized_path; + + /* Don't inherit from registration. */ + c->sub_commands = 0; + c->sub_command_index_by_name = 0; + c->sub_command_positions = 0; + } + + vlib_cli_make_parent (cm, ci); + return 0; +} + +clib_error_t * +vlib_cli_register_parse_rule (vlib_main_t * vm, vlib_cli_parse_rule_t * r_reg) +{ + vlib_cli_main_t *cm = &vm->cli_main; + vlib_cli_parse_rule_t *r; + clib_error_t *error = 0; + u8 *r_name; + uword *p; + + if (!cm->parse_rule_index_by_name) + cm->parse_rule_index_by_name = hash_create_vec ( /* initial length */ 32, + sizeof (r->name[0]), + sizeof (uword)); + + /* Make vector copy of name. */ + r_name = format (0, "%s", r_reg->name); + + if ((p = hash_get_mem (cm->parse_rule_index_by_name, r_name))) + { + vec_free (r_name); + return clib_error_return (0, "duplicate parse rule name `%s'", + r_reg->name); + } + + vec_add2 (cm->parse_rules, r, 1); + r[0] = r_reg[0]; + r->name = (char *) r_name; + hash_set_mem (cm->parse_rule_index_by_name, r->name, r - cm->parse_rules); + + return error; +} + +#if 0 +/* $$$ turn back on again someday, maybe */ +static clib_error_t *vlib_cli_register_parse_rules (vlib_main_t * vm, + vlib_cli_parse_rule_t * + lo, + vlib_cli_parse_rule_t * + hi) + __attribute__ ((unused)) +{ + clib_error_t *error = 0; + vlib_cli_parse_rule_t *r; + + for (r = lo; r < hi; r = clib_elf_section_data_next (r, 0)) + { + if (!r->name || strlen (r->name) == 0) + { + error = clib_error_return (0, "parse rule with no name"); + goto done; + } + + error = vlib_cli_register_parse_rule (vm, r); + if (error) + goto done; + } + +done: + return error; +} +#endif + +static clib_error_t * +vlib_cli_init (vlib_main_t * vm) +{ + vlib_cli_main_t *cm = &vm->cli_main; + clib_error_t *error = 0; + vlib_cli_command_t *cmd; + + cmd = cm->cli_command_registrations; + + while (cmd) + { + error = vlib_cli_register (vm, cmd); + if (error) + return error; + cmd = cmd->next_cli_command; + } + return error; +} + +VLIB_INIT_FUNCTION (vlib_cli_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/cli.h b/src/vlib/cli.h new file mode 100644 index 00000000..009c7e82 --- /dev/null +++ b/src/vlib/cli.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * cli.h: command line interface + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_cli_h +#define included_vlib_cli_h + +#include + +struct vlib_cli_command_t; + +typedef struct +{ + u32 min_char; + + /* Indexed by name[position] - min_char. */ + uword **bitmaps; +} vlib_cli_parse_position_t; + +typedef struct +{ + u8 *name; + + u32 index; +} vlib_cli_sub_command_t; + +typedef struct +{ + u8 *name; + + u32 rule_index; + + u32 command_index; +} vlib_cli_sub_rule_t; + +typedef struct +{ + char *name; + char *short_help; + char *long_help; + + /* Number of bytes in parsed data. Zero for vector. */ + uword data_size; + + unformat_function_t *unformat_function; + + /* Opaque for unformat function. */ + uword unformat_function_arg[2]; +} vlib_cli_parse_rule_t; + +/* CLI command callback function. */ +typedef clib_error_t *(vlib_cli_command_function_t) + (struct vlib_main_t * vm, + unformat_input_t * input, struct vlib_cli_command_t * cmd); + +typedef struct vlib_cli_command_t +{ + /* Command path (e.g. "show something"). + Spaces delimit elements of path. */ + char *path; + + /* Short/long help strings. */ + char *short_help; + char *long_help; + + /* Callback function. */ + vlib_cli_command_function_t *function; + + /* Opaque. */ + uword function_arg; + + /* Known MP-safe? */ + uword is_mp_safe; + + /* Sub commands for this command. */ + vlib_cli_sub_command_t *sub_commands; + + /* Hash table mapping name (e.g. last path element) to sub command index. */ + uword *sub_command_index_by_name; + + /* bitmap[p][c][i] says whether sub-command i has character + c in position p. */ + vlib_cli_parse_position_t *sub_command_positions; + + /* Hash table mapping name (e.g. last path element) to sub rule index. */ + uword *sub_rule_index_by_name; + + /* Vector of possible parse rules for this path. */ + vlib_cli_sub_rule_t *sub_rules; + + /* List of CLI commands, built by constructors */ + struct vlib_cli_command_t *next_cli_command; + +} vlib_cli_command_t; + +typedef void (vlib_cli_output_function_t) (uword arg, + u8 * buffer, uword buffer_bytes); +typedef struct +{ + /* Vector of all known commands. */ + vlib_cli_command_t *commands; + + /* Hash table mapping normalized path to index into all_commands. */ + uword *command_index_by_path; + + /* Vector of all known parse rules. */ + vlib_cli_parse_rule_t *parse_rules; + + /* Hash table mapping parse rule name to index into parse_rule vector. */ + uword *parse_rule_index_by_name; + + /* Data parsed for rules. */ + void **parse_rule_data; + + /* registration list added by constructors */ + vlib_cli_command_t *cli_command_registrations; +} vlib_cli_main_t; + +#define VLIB_CLI_COMMAND(x,...) \ + __VA_ARGS__ vlib_cli_command_t x; \ +static void __vlib_cli_command_registration_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_cli_command_registration_##x (void) \ +{ \ + vlib_main_t * vm = vlib_get_main(); \ + vlib_cli_main_t *cm = &vm->cli_main; \ + x.next_cli_command = cm->cli_command_registrations; \ + cm->cli_command_registrations = &x; \ +} \ +__VA_ARGS__ vlib_cli_command_t x +#define VLIB_CLI_PARSE_RULE(x) \ + vlib_cli_parse_rule_t x +/* Output to current CLI connection. */ +void vlib_cli_output (struct vlib_main_t *vm, char *fmt, ...); + +/* Process CLI input. */ +void vlib_cli_input (struct vlib_main_t *vm, + unformat_input_t * input, + vlib_cli_output_function_t * function, + uword function_arg); + +clib_error_t *vlib_cli_register (struct vlib_main_t *vm, + vlib_cli_command_t * c); +clib_error_t *vlib_cli_register_parse_rule (struct vlib_main_t *vm, + vlib_cli_parse_rule_t * c); + +uword unformat_vlib_cli_sub_input (unformat_input_t * i, va_list * args); + +#endif /* included_vlib_cli_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/cli_funcs.h b/src/vlib/cli_funcs.h new file mode 100644 index 00000000..78aef73b --- /dev/null +++ b/src/vlib/cli_funcs.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * cli_funcs.h: VLIB CLI related functions/inlines + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_cli_funcs_h +#define included_vlib_cli_funcs_h + +always_inline void * +vlib_cli_get_parse_rule_result (vlib_main_t * vm, uword index) +{ + vlib_cli_main_t *cm = &vm->cli_main; + return vec_elt (cm->parse_rule_data, index); +} + +#endif /* included_vlib_cli_funcs_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/counter.c b/src/vlib/counter.c new file mode 100644 index 00000000..9f66e04d --- /dev/null +++ b/src/vlib/counter.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * counter.c: simple and packet/byte counters + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include + +void +vlib_clear_simple_counters (vlib_simple_counter_main_t * cm) +{ + uword i, j; + u16 *my_minis; + + for (i = 0; i < vec_len (cm->minis); i++) + { + my_minis = cm->minis[i]; + + for (j = 0; j < vec_len (my_minis); j++) + { + cm->maxi[j] += my_minis[j]; + my_minis[j] = 0; + } + } + + j = vec_len (cm->maxi); + if (j > 0) + vec_validate (cm->value_at_last_clear, j - 1); + for (i = 0; i < j; i++) + cm->value_at_last_clear[i] = cm->maxi[i]; +} + +void +vlib_clear_combined_counters (vlib_combined_counter_main_t * cm) +{ + uword i, j; + vlib_mini_counter_t *my_minis; + + for (i = 0; i < vec_len (cm->minis); i++) + { + my_minis = cm->minis[i]; + + for (j = 0; j < vec_len (my_minis); j++) + { + cm->maxi[j].packets += my_minis[j].packets; + cm->maxi[j].bytes += my_minis[j].bytes; + my_minis[j].packets = 0; + my_minis[j].bytes = 0; + } + } + + j = vec_len (cm->maxi); + if (j > 0) + vec_validate (cm->value_at_last_clear, j - 1); + + for (i = 0; i < j; i++) + { + vlib_counter_t *c = vec_elt_at_index (cm->value_at_last_clear, i); + + c[0] = cm->maxi[i]; + } +} + +void +vlib_validate_simple_counter (vlib_simple_counter_main_t * cm, u32 index) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + int i; + + vec_validate (cm->minis, tm->n_vlib_mains - 1); + for (i = 0; i < tm->n_vlib_mains; i++) + vec_validate_aligned (cm->minis[i], index, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (cm->maxi, index, CLIB_CACHE_LINE_BYTES); +} + +void +vlib_validate_combined_counter (vlib_combined_counter_main_t * cm, u32 index) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + int i; + + vec_validate (cm->minis, tm->n_vlib_mains - 1); + for (i = 0; i < tm->n_vlib_mains; i++) + vec_validate_aligned (cm->minis[i], index, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (cm->maxi, index, CLIB_CACHE_LINE_BYTES); +} + +void +serialize_vlib_simple_counter_main (serialize_main_t * m, va_list * va) +{ + clib_warning ("unimplemented"); +} + +void +unserialize_vlib_simple_counter_main (serialize_main_t * m, va_list * va) +{ + clib_warning ("unimplemented"); +} + +void +serialize_vlib_combined_counter_main (serialize_main_t * m, va_list * va) +{ + clib_warning ("unimplemented"); +} + +void +unserialize_vlib_combined_counter_main (serialize_main_t * m, va_list * va) +{ + clib_warning ("unimplemented"); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/counter.h b/src/vlib/counter.h new file mode 100644 index 00000000..a7903206 --- /dev/null +++ b/src/vlib/counter.h @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * counter.h: simple and packet/byte counters + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_counter_h +#define included_vlib_counter_h + +/** \file + + Optimized thread-safe counters. + + Each vlib_[simple|combined]_counter_main_t consists of a single + vector of thread-safe / atomically-updated u64 counters [the + "maxi" vector], and a (u16 **) per-thread vector [the "minis" + vector] of narrow, per-thread counters. + + The idea is to drastically reduce the number of atomic operations. + In the case of packet counts, we divide the number of atomic ops + by 2**16, etc. +*/ + +/** A collection of simple counters */ + +typedef struct +{ + u16 **minis; /**< Per-thread u16 non-atomic counters */ + u64 *maxi; /**< Shared wide counters */ + u64 *value_at_last_clear; /**< Counter values as of last clear. */ + u64 *value_at_last_serialize; /**< Values as of last serialize. */ + u32 last_incremental_serialize_index; /**< Last counter index + serialized incrementally. */ + + char *name; /**< The counter collection's name. */ +} vlib_simple_counter_main_t; + +/** Increment a simple counter + @param cm - (vlib_simple_counter_main_t *) simple counter main pointer + @param cpu_index - (u32) the current cpu index + @param index - (u32) index of the counter to increment + @param increment - (u32) quantitiy to add to the counter +*/ +always_inline void +vlib_increment_simple_counter (vlib_simple_counter_main_t * cm, + u32 cpu_index, u32 index, u32 increment) +{ + u16 *my_minis; + u16 *mini; + u32 old, new; + + my_minis = cm->minis[cpu_index]; + mini = vec_elt_at_index (my_minis, index); + old = mini[0]; + new = old + increment; + mini[0] = new; + + if (PREDICT_FALSE (mini[0] != new)) + { + __sync_fetch_and_add (&cm->maxi[index], new); + my_minis[index] = 0; + } +} + +/** Get the value of a simple counter + Scrapes the entire set of mini counters. Innacurate unless + worker threads which might increment the counter are + barrier-synchronized + + @param cm - (vlib_simple_counter_main_t *) simple counter main pointer + @param index - (u32) index of the counter to fetch + @returns - (u64) current counter value +*/ +always_inline u64 +vlib_get_simple_counter (vlib_simple_counter_main_t * cm, u32 index) +{ + u16 *my_minis, *mini; + u64 v; + int i; + + ASSERT (index < vec_len (cm->maxi)); + + v = 0; + + for (i = 0; i < vec_len (cm->minis); i++) + { + my_minis = cm->minis[i]; + mini = vec_elt_at_index (my_minis, index); + v += mini[0]; + } + + v += cm->maxi[index]; + + if (index < vec_len (cm->value_at_last_clear)) + { + ASSERT (v >= cm->value_at_last_clear[index]); + v -= cm->value_at_last_clear[index]; + } + + return v; +} + +/** Clear a simple counter + Clears the set of per-thread u16 counters, and the u64 counter + + @param cm - (vlib_simple_counter_main_t *) simple counter main pointer + @param index - (u32) index of the counter to clear +*/ +always_inline void +vlib_zero_simple_counter (vlib_simple_counter_main_t * cm, u32 index) +{ + u16 *my_minis; + int i; + + ASSERT (index < vec_len (cm->maxi)); + + for (i = 0; i < vec_len (cm->minis); i++) + { + my_minis = cm->minis[i]; + my_minis[index] = 0; + } + + cm->maxi[index] = 0; + + if (index < vec_len (cm->value_at_last_clear)) + cm->value_at_last_clear[index] = 0; +} + +/** Combined counter to hold both packets and byte differences. + */ +typedef struct +{ + u64 packets; /**< packet counter */ + u64 bytes; /**< byte counter */ +} vlib_counter_t; + +/** Add two combined counters, results in the first counter + @param [in,out] a - (vlib_counter_t *) dst counter + @param b - (vlib_counter_t *) src counter +*/ + +always_inline void +vlib_counter_add (vlib_counter_t * a, vlib_counter_t * b) +{ + a->packets += b->packets; + a->bytes += b->bytes; +} + +/** Subtract combined counters, results in the first counter + @param [in,out] a - (vlib_counter_t *) dst counter + @param b - (vlib_counter_t *) src counter +*/ +always_inline void +vlib_counter_sub (vlib_counter_t * a, vlib_counter_t * b) +{ + ASSERT (a->packets >= b->packets); + ASSERT (a->bytes >= b->bytes); + a->packets -= b->packets; + a->bytes -= b->bytes; +} + +/** Clear a combined counter + @param a - (vlib_counter_t *) counter to clear +*/ +always_inline void +vlib_counter_zero (vlib_counter_t * a) +{ + a->packets = a->bytes = 0; +} + +/** Mini combined counter */ +typedef struct +{ + u16 packets; /**< Packet count */ + i16 bytes; /**< Byte count */ +} vlib_mini_counter_t; + +/** A collection of combined counters */ +typedef struct +{ + vlib_mini_counter_t **minis; /**< Per-thread u16 non-atomic counter pairs */ + vlib_counter_t *maxi; /**< Shared wide counter pairs */ + vlib_counter_t *value_at_last_clear; /**< Counter values as of last clear. */ + vlib_counter_t *value_at_last_serialize; /**< Counter values as of last serialize. */ + u32 last_incremental_serialize_index; /**< Last counter index serialized incrementally. */ + char *name; /**< The counter collection's name. */ +} vlib_combined_counter_main_t; + +/** Clear a collection of simple counters + @param cm - (vlib_simple_counter_main_t *) collection to clear +*/ +void vlib_clear_simple_counters (vlib_simple_counter_main_t * cm); + +/** Clear a collection of combined counters + @param cm - (vlib_combined_counter_main_t *) collection to clear +*/ +void vlib_clear_combined_counters (vlib_combined_counter_main_t * cm); + +/** Increment a combined counter + @param cm - (vlib_combined_counter_main_t *) comined counter main pointer + @param cpu_index - (u32) the current cpu index + @param index - (u32) index of the counter to increment + @param packet_increment - (u32) number of packets to add to the counter + @param byte_increment - (u32) number of bytes to add to the counter +*/ + +always_inline void +vlib_increment_combined_counter (vlib_combined_counter_main_t * cm, + u32 cpu_index, + u32 index, + u32 packet_increment, u32 byte_increment) +{ + vlib_mini_counter_t *my_minis, *mini; + u32 old_packets, new_packets; + i32 old_bytes, new_bytes; + + /* Use this CPU's mini counter array */ + my_minis = cm->minis[cpu_index]; + + mini = vec_elt_at_index (my_minis, index); + old_packets = mini->packets; + old_bytes = mini->bytes; + + new_packets = old_packets + packet_increment; + new_bytes = old_bytes + byte_increment; + + mini->packets = new_packets; + mini->bytes = new_bytes; + + /* Bytes always overflow before packets.. */ + if (PREDICT_FALSE (mini->bytes != new_bytes)) + { + vlib_counter_t *maxi = vec_elt_at_index (cm->maxi, index); + + __sync_fetch_and_add (&maxi->packets, new_packets); + __sync_fetch_and_add (&maxi->bytes, new_bytes); + + mini->packets = 0; + mini->bytes = 0; + } +} + +/** Get the value of a combined counter, never called in the speed path + Scrapes the entire set of mini counters. Innacurate unless + worker threads which might increment the counter are + barrier-synchronized + + @param cm - (vlib_combined_counter_main_t *) combined counter main pointer + @param index - (u32) index of the combined counter to fetch + @param result [out] - (vlib_counter_t *) result stored here +*/ + +static inline void +vlib_get_combined_counter (vlib_combined_counter_main_t * cm, + u32 index, vlib_counter_t * result) +{ + vlib_mini_counter_t *my_minis, *mini; + vlib_counter_t *maxi; + int i; + + result->packets = 0; + result->bytes = 0; + + for (i = 0; i < vec_len (cm->minis); i++) + { + my_minis = cm->minis[i]; + + mini = vec_elt_at_index (my_minis, index); + result->packets += mini->packets; + result->bytes += mini->bytes; + } + + maxi = vec_elt_at_index (cm->maxi, index); + result->packets += maxi->packets; + result->bytes += maxi->bytes; + + if (index < vec_len (cm->value_at_last_clear)) + vlib_counter_sub (result, &cm->value_at_last_clear[index]); +} + +/** Clear a combined counter + Clears the set of per-thread u16 counters, and the shared vlib_counter_t + + @param cm - (vlib_combined_counter_main_t *) combined counter main pointer + @param index - (u32) index of the counter to clear +*/ +always_inline void +vlib_zero_combined_counter (vlib_combined_counter_main_t * cm, u32 index) +{ + vlib_mini_counter_t *mini, *my_minis; + int i; + + for (i = 0; i < vec_len (cm->minis); i++) + { + my_minis = cm->minis[i]; + + mini = vec_elt_at_index (my_minis, index); + mini->packets = 0; + mini->bytes = 0; + } + + vlib_counter_zero (&cm->maxi[index]); + if (index < vec_len (cm->value_at_last_clear)) + vlib_counter_zero (&cm->value_at_last_clear[index]); +} + +/** validate a simple counter + @param cm - (vlib_simple_counter_main_t *) pointer to the counter collection + @param index - (u32) index of the counter to validate +*/ + +void vlib_validate_simple_counter (vlib_simple_counter_main_t * cm, + u32 index); +/** validate a combined counter + @param cm - (vlib_combined_counter_main_t *) pointer to the counter + collection + @param index - (u32) index of the counter to validate +*/ + +void vlib_validate_combined_counter (vlib_combined_counter_main_t * cm, + u32 index); + +/** Obtain the number of simple or combined counters allocated. + A macro which reduces to to vec_len(cm->maxi), the answer in either + case. + + @param cm - (vlib_simple_counter_main_t) or + (vlib_combined_counter_main_t) the counter collection to interrogate + @returns vec_len(cm->maxi) +*/ +#define vlib_counter_len(cm) vec_len((cm)->maxi) + +serialize_function_t serialize_vlib_simple_counter_main, + unserialize_vlib_simple_counter_main; +serialize_function_t serialize_vlib_combined_counter_main, + unserialize_vlib_combined_counter_main; + +#endif /* included_vlib_counter_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/defs.h b/src/vlib/defs.h new file mode 100644 index 00000000..ad58bc04 --- /dev/null +++ b/src/vlib/defs.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * defs.h: VLIB generic C definitions + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_defs_h +#define included_vlib_defs_h + +/* Receive or transmit. */ +typedef enum +{ + VLIB_RX, + VLIB_TX, + VLIB_N_RX_TX = 2, /* Used to size arrays. */ +} vlib_rx_or_tx_t; + +#define vlib_foreach_rx_tx(v) for (v = 0; v < VLIB_N_RX_TX; v++) + +/* Read/write. */ +typedef enum +{ + VLIB_READ, + VLIB_WRITE, +} vlib_read_or_write_t; + +/* Up/down. */ +typedef enum +{ + VLIB_DOWN = 0, + VLIB_UP = 1, +} vlib_up_or_down_t; + +/* Enable/disable. */ +typedef enum +{ + VLIB_DISABLE = 0, + VLIB_ENABLE = 1, +} vlib_enable_or_disable_t; + +#endif /* included_vlib_defs_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/dir.dox b/src/vlib/dir.dox new file mode 100644 index 00000000..4806e7a9 --- /dev/null +++ b/src/vlib/dir.dox @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Copyright (c) 2016 Comcast Cable Communications Management, LLC. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* Doxygen directory documentation */ +/** +@dir +@brief VLIB application library source. +*/ +/*? %%clicmd:group_label VLIB application library%% ?*/ + diff --git a/src/vlib/elog_samples.c b/src/vlib/elog_samples.c new file mode 100644 index 00000000..a8c800df --- /dev/null +++ b/src/vlib/elog_samples.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +static inline void +elog_four_int_sample (u32 * data) +{ + ELOG_TYPE_DECLARE (e) = + { + .format = "four int: first %d second %d third %d fourth %d",.format_args = + "i4i4i4i4",}; + struct + { + u32 data[4]; + } *ed; + ed = ELOG_DATA (&vlib_global_main.elog_main, e); + ed->data[0] = data[0]; + ed->data[1] = data[1]; + ed->data[2] = data[2]; + ed->data[3] = data[3]; +} + +static inline void +elog_four_int_track_sample (u32 * data) +{ + ELOG_TYPE_DECLARE (e) = + { + .format = + "four_int_track: first %d second %d third %d fourth %d",.format_args = + "i4i4i4i4",}; + struct + { + u32 data[4]; + } *ed; + ELOG_TRACK (sample_track); + ed = ELOG_TRACK_DATA (&vlib_global_main.elog_main, e, sample_track); + ed->data[0] = data[0]; + ed->data[1] = data[1]; + ed->data[2] = data[2]; + ed->data[3] = data[3]; +} + +static inline void +elog_enum_sample (u8 which) +{ + ELOG_TYPE_DECLARE (e) = + { + .format = "my enum: %s",.format_args = "t1",.n_enum_strings = + 2,.enum_strings = + { + "string 1", "string 2",},}; + struct + { + u8 which; + } *ed; + ed = ELOG_DATA (&vlib_global_main.elog_main, e); + ed->which = which; +} + +static inline void +elog_one_datum_sample (u32 data) +{ + ELOG_TYPE_DECLARE (e) = + { + .format = "one datum: %d",.format_args = "i4",}; + + elog (&vlib_global_main.elog_main, &e, data); +} + +static clib_error_t * +test_elog_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + int i; + u32 samples[4]; + + for (i = 0; i < 10; i++) + { + samples[0] = i; + samples[1] = i + 1; + samples[2] = i + 2; + samples[3] = i + 3; + + elog_four_int_sample (samples); + elog_four_int_track_sample (samples); + elog_enum_sample (0); + elog_enum_sample (1); + elog_one_datum_sample (i); + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (test_elog_command, static) = { + .path = "test elog sample", + .short_help = "test elog sample", + .function = test_elog_command_fn, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/error.c b/src/vlib/error.c new file mode 100644 index 00000000..a2c23176 --- /dev/null +++ b/src/vlib/error.c @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * error.c: VLIB error handler + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include + +uword +vlib_error_drop_buffers (vlib_main_t * vm, + vlib_node_runtime_t * node, + u32 * buffers, + u32 next_buffer_stride, + u32 n_buffers, + u32 next_index, + u32 drop_error_node, u32 drop_error_code) +{ + u32 n_left_this_frame, n_buffers_left, *args, n_args_left; + vlib_error_t drop_error; + + drop_error = vlib_error_set (drop_error_node, drop_error_code); + + n_buffers_left = n_buffers; + while (n_buffers_left > 0) + { + vlib_get_next_frame (vm, node, next_index, args, n_args_left); + + n_left_this_frame = clib_min (n_buffers_left, n_args_left); + n_buffers_left -= n_left_this_frame; + n_args_left -= n_left_this_frame; + + while (n_left_this_frame >= 4) + { + u32 bi0, bi1, bi2, bi3; + vlib_buffer_t *b0, *b1, *b2, *b3; + + args[0] = bi0 = buffers[0]; + args[1] = bi1 = buffers[1]; + args[2] = bi2 = buffers[2]; + args[3] = bi3 = buffers[3]; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + b2 = vlib_get_buffer (vm, bi2); + b3 = vlib_get_buffer (vm, bi3); + + b0->error = drop_error; + b1->error = drop_error; + b2->error = drop_error; + b3->error = drop_error; + + buffers += 4; + args += 4; + n_left_this_frame -= 4; + } + + while (n_left_this_frame >= 1) + { + u32 bi0; + vlib_buffer_t *b0; + + args[0] = bi0 = buffers[0]; + + b0 = vlib_get_buffer (vm, bi0); + b0->error = drop_error; + + buffers += 1; + args += 1; + n_left_this_frame -= 1; + } + + vlib_put_next_frame (vm, node, next_index, n_args_left); + } + + return n_buffers; +} + +/* Convenience node to drop a vector of buffers with a "misc error". */ +static uword +misc_drop_buffers (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return vlib_error_drop_buffers (vm, node, vlib_frame_args (frame), + /* buffer stride */ 1, + frame->n_vectors, + /* next */ 0, + node->node_index, + /* error */ 0); +} + +static char *misc_drop_buffers_error_strings[] = { + [0] = "misc. errors", +}; + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (misc_drop_buffers_node,static) = { + .function = misc_drop_buffers, + .name = "misc-drop-buffers", + .vector_size = sizeof (u32), + .n_errors = 1, + .n_next_nodes = 1, + .next_nodes = { + "error-drop", + }, + .error_strings = misc_drop_buffers_error_strings, +}; +/* *INDENT-ON* */ + +/* Reserves given number of error codes for given node. */ +void +vlib_register_errors (vlib_main_t * vm, + u32 node_index, u32 n_errors, char *error_strings[]) +{ + vlib_error_main_t *em = &vm->error_main; + vlib_node_t *n = vlib_get_node (vm, node_index); + uword l; + + ASSERT (os_get_cpu_number () == 0); + + /* Free up any previous error strings. */ + if (n->n_errors > 0) + heap_dealloc (em->error_strings_heap, n->error_heap_handle); + + n->n_errors = n_errors; + n->error_strings = error_strings; + + if (n_errors == 0) + return; + + n->error_heap_index = + heap_alloc (em->error_strings_heap, n_errors, n->error_heap_handle); + + l = vec_len (em->error_strings_heap); + + clib_memcpy (vec_elt_at_index (em->error_strings_heap, n->error_heap_index), + error_strings, n_errors * sizeof (error_strings[0])); + + /* Allocate a counter/elog type for each error. */ + vec_validate (em->counters, l - 1); + vec_validate (vm->error_elog_event_types, l - 1); + + /* Zero counters for re-registrations of errors. */ + if (n->error_heap_index + n_errors <= vec_len (em->counters_last_clear)) + clib_memcpy (em->counters + n->error_heap_index, + em->counters_last_clear + n->error_heap_index, + n_errors * sizeof (em->counters[0])); + else + memset (em->counters + n->error_heap_index, + 0, n_errors * sizeof (em->counters[0])); + + { + elog_event_type_t t; + uword i; + + memset (&t, 0, sizeof (t)); + for (i = 0; i < n_errors; i++) + { + t.format = (char *) format (0, "%v %s: %%d", + n->name, error_strings[i]); + vm->error_elog_event_types[n->error_heap_index + i] = t; + } + } +} + +static clib_error_t * +show_errors (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_error_main_t *em = &vm->error_main; + vlib_node_t *n; + u32 code, i, ni; + u64 c; + int index = 0; + int verbose = 0; + u64 *sums = 0; + + if (unformat (input, "verbose %d", &verbose)) + ; + else if (unformat (input, "verbose")) + verbose = 1; + + vec_validate (sums, vec_len (em->counters)); + + if (verbose) + vlib_cli_output (vm, "%=10s%=40s%=20s%=6s", "Count", "Node", "Reason", + "Index"); + else + vlib_cli_output (vm, "%=10s%=40s%=6s", "Count", "Node", "Reason"); + + + /* *INDENT-OFF* */ + foreach_vlib_main(({ + em = &this_vlib_main->error_main; + + if (verbose) + vlib_cli_output(vm, "Thread %u (%v):", index, + vlib_worker_threads[index].name); + + for (ni = 0; ni < vec_len (this_vlib_main->node_main.nodes); ni++) + { + n = vlib_get_node (this_vlib_main, ni); + for (code = 0; code < n->n_errors; code++) + { + i = n->error_heap_index + code; + c = em->counters[i]; + if (i < vec_len (em->counters_last_clear)) + c -= em->counters_last_clear[i]; + sums[i] += c; + + if (c == 0 && verbose < 2) + continue; + + if (verbose) + vlib_cli_output (vm, "%10Ld%=40v%=20s%=6d", c, n->name, + em->error_strings_heap[i], i); + else + vlib_cli_output (vm, "%10d%=40v%s", c, n->name, + em->error_strings_heap[i]); + } + } + index++; + })); + /* *INDENT-ON* */ + + if (verbose) + vlib_cli_output (vm, "Total:"); + + for (ni = 0; ni < vec_len (vm->node_main.nodes); ni++) + { + n = vlib_get_node (vm, ni); + for (code = 0; code < n->n_errors; code++) + { + i = n->error_heap_index + code; + if (sums[i]) + { + if (verbose) + vlib_cli_output (vm, "%10Ld%=40v%=20s%=10d", sums[i], n->name, + em->error_strings_heap[i], i); + } + } + } + + vec_free (sums); + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_show_errors, static) = { + .path = "show errors", + .short_help = "Show error counts", + .function = show_errors, +}; +/* *INDENT-ON* */ + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_show_node_counters, static) = { + .path = "show node counters", + .short_help = "Show node counters", + .function = show_errors, +}; +/* *INDENT-ON* */ + +static clib_error_t * +clear_error_counters (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_error_main_t *em; + u32 i; + + /* *INDENT-OFF* */ + foreach_vlib_main(({ + em = &this_vlib_main->error_main; + vec_validate (em->counters_last_clear, vec_len (em->counters) - 1); + for (i = 0; i < vec_len (em->counters); i++) + em->counters_last_clear[i] = em->counters[i]; + })); + /* *INDENT-ON* */ + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_clear_error_counters, static) = { + .path = "clear errors", + .short_help = "Clear error counters", + .function = clear_error_counters, +}; +/* *INDENT-ON* */ + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_clear_node_counters, static) = { + .path = "clear node counters", + .short_help = "Clear node counters", + .function = clear_error_counters, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/error.h b/src/vlib/error.h new file mode 100644 index 00000000..df2075c3 --- /dev/null +++ b/src/vlib/error.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * error.h: drop/punt error packets + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_error_h +#define included_vlib_error_h + +/* Combined 16 bit node & 16 bit code as 32 bit number. */ +typedef u32 vlib_error_t; + +always_inline u32 +vlib_error_get_node (vlib_error_t e) +{ + return e >> 12; +} + +always_inline u32 +vlib_error_get_code (vlib_error_t e) +{ + return e & 0xfff; +} + +always_inline vlib_error_t +vlib_error_set (u32 node_index, u32 code) +{ + ASSERT (node_index < (1 << 20)); + ASSERT (code < (1 << 12)); + return (node_index << 12) | code; +} + +always_inline vlib_error_t +vlib_error_set_code (vlib_error_t e, u32 code) +{ + ASSERT (vlib_error_get_code (e) == 0); + ASSERT (code < (1 << 12)); + e |= code; + return e; +} + +typedef struct +{ + /* Error counters. */ + u64 *counters; + + /* Counter values as of last counter clear. */ + u64 *counters_last_clear; + + /* Error name strings in heap. Heap index + indexes counter vector. */ + char **error_strings_heap; +} vlib_error_main_t; + +/* Per node error registration. */ +void vlib_register_errors (struct vlib_main_t *vm, + u32 node_index, + u32 n_errors, char *error_strings[]); + +#endif /* included_vlib_error_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/error_funcs.h b/src/vlib/error_funcs.h new file mode 100644 index 00000000..1a3602e9 --- /dev/null +++ b/src/vlib/error_funcs.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * error_funcs.h: VLIB error handling + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_error_funcs_h +#define included_vlib_error_funcs_h + +#include + +always_inline void +vlib_error_elog_count (vlib_main_t * vm, uword counter, uword increment) +{ + elog_main_t *em = &vm->elog_main; + if (VLIB_ELOG_MAIN_LOOP > 0 && increment > 0) + elog (em, vec_elt_at_index (vm->error_elog_event_types, counter), + increment); +} + +always_inline void +vlib_error_count (vlib_main_t * vm, uword node_index, + uword counter, uword increment) +{ + vlib_node_t *n = vlib_get_node (vm, node_index); + vlib_error_main_t *em = &vm->error_main; + + ASSERT (counter < n->n_errors); + counter += n->error_heap_index; + + ASSERT (counter < vec_len (em->counters)); + em->counters[counter] += increment; + + vlib_error_elog_count (vm, counter, increment); +} + +/* Drop all buffers in frame with given error code. */ +uword +vlib_error_drop_buffers (vlib_main_t * vm, + vlib_node_runtime_t * node, + u32 * buffers, + u32 next_buffer_stride, + u32 n_buffers, + u32 error_next_index, + u32 error_node, u32 error_code); + +#endif /* included_vlib_error_funcs_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/format.c b/src/vlib/format.c new file mode 100644 index 00000000..79a4d686 --- /dev/null +++ b/src/vlib/format.c @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * format.c: generic network formatting/unformating + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include + +u8 * +format_vlib_rx_tx (u8 * s, va_list * args) +{ + vlib_rx_or_tx_t r = va_arg (*args, vlib_rx_or_tx_t); + char *t; + + switch (r) + { + case VLIB_RX: + t = "rx"; + break; + case VLIB_TX: + t = "tx"; + break; + default: + t = "INVALID"; + break; + } + + vec_add (s, t, strlen (t)); + return s; +} + +u8 * +format_vlib_read_write (u8 * s, va_list * args) +{ + vlib_rx_or_tx_t r = va_arg (*args, vlib_rx_or_tx_t); + char *t; + + switch (r) + { + case VLIB_READ: + t = "read"; + break; + case VLIB_WRITE: + t = "write"; + break; + default: + t = "INVALID"; + break; + } + + vec_add (s, t, strlen (t)); + return s; +} + +/* Formats buffer data as printable ascii or as hex. */ +u8 * +format_vlib_buffer_data (u8 * s, va_list * args) +{ + u8 *data = va_arg (*args, u8 *); + u32 n_data_bytes = va_arg (*args, u32); + u32 i, is_printable; + + is_printable = 1; + for (i = 0; i < n_data_bytes && is_printable; i++) + { + u8 c = data[i]; + if (c < 0x20) + is_printable = 0; + else if (c >= 0x7f) + is_printable = 0; + } + + if (is_printable) + vec_add (s, data, n_data_bytes); + else + s = format (s, "%U", format_hex_bytes, data, n_data_bytes); + + return s; +} + +/* Enable/on => 1; disable/off => 0. */ +uword +unformat_vlib_enable_disable (unformat_input_t * input, va_list * args) +{ + int *result = va_arg (*args, int *); + int enable; + + if (unformat (input, "enable") || unformat (input, "on")) + enable = 1; + else if (unformat (input, "disable") || unformat (input, "off")) + enable = 0; + else + return 0; + + *result = enable; + return 1; +} + +/* rx/tx => VLIB_RX/VLIB_TX. */ +uword +unformat_vlib_rx_tx (unformat_input_t * input, va_list * args) +{ + int *result = va_arg (*args, int *); + if (unformat (input, "rx")) + *result = VLIB_RX; + else if (unformat (input, "tx")) + *result = VLIB_TX; + else + return 0; + return 1; +} + +/* Parse an int either %d or 0x%x. */ +uword +unformat_vlib_number (unformat_input_t * input, va_list * args) +{ + int *result = va_arg (*args, int *); + + return (unformat (input, "0x%x", result) || unformat (input, "%d", result)); +} + +/* Parse a-zA-Z0-9_ token and hash to value. */ +uword +unformat_vlib_number_by_name (unformat_input_t * input, va_list * args) +{ + uword *hash = va_arg (*args, uword *); + int *result = va_arg (*args, int *); + uword *p; + u8 *token; + int i; + + if (!unformat_user (input, unformat_token, "a-zA-Z0-9_", &token)) + return 0; + + /* Null terminate. */ + if (vec_len (token) > 0 && token[vec_len (token) - 1] != 0) + vec_add1 (token, 0); + + /* Check for exact match. */ + p = hash_get_mem (hash, token); + if (p) + goto done; + + /* Convert to upper case & try match. */ + for (i = 0; i < vec_len (token); i++) + if (token[i] >= 'a' && token[i] <= 'z') + token[i] = 'A' + token[i] - 'a'; + p = hash_get_mem (hash, token); + +done: + vec_free (token); + if (p) + *result = p[0]; + return p != 0; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/format_funcs.h b/src/vlib/format_funcs.h new file mode 100644 index 00000000..f60b8940 --- /dev/null +++ b/src/vlib/format_funcs.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * format_funcs.h: VLIB formatting/unformating + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_format_h +#define included_vlib_format_h + +/* Format vlib_rx_or_tx_t/vlib_read_or_write_t enum as string. */ +u8 *format_vlib_rx_tx (u8 * s, va_list * args); +u8 *format_vlib_read_write (u8 * s, va_list * args); + +/* Formats buffer data as printable ascii or as hex. */ +u8 *format_vlib_buffer_data (u8 * s, va_list * args); + +/* Enable/on => 1; disable/off => 0. */ +uword unformat_vlib_enable_disable (unformat_input_t * input, va_list * args); + +/* rx/tx => VLIB_RX/VLIB_TX. */ +uword unformat_vlib_rx_tx (unformat_input_t * input, va_list * args); + +/* Parse a-zA-Z0-9_ token and hash to value. */ +uword unformat_vlib_number_by_name (unformat_input_t * input, va_list * args); + +/* Parse an int either %d or 0x%x. */ +uword unformat_vlib_number (unformat_input_t * input, va_list * args); + +/* Flag to format_vlib_*_header functions to tell them not to recurse + into the next layer's header. For example, tells format_vlib_ethernet_header + not to format ip header. */ +#define FORMAT_VLIB_HEADER_NO_RECURSION (~0) + +#endif /* included_vlib_format_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/global_funcs.h b/src/vlib/global_funcs.h new file mode 100644 index 00000000..bbdbdef5 --- /dev/null +++ b/src/vlib/global_funcs.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * global_funcs.h: global data structure access functions + */ + +#ifndef included_vlib_global_funcs_h_ +#define included_vlib_global_funcs_h_ + +always_inline vlib_main_t * +vlib_get_main (void) +{ + vlib_main_t *vm; + vm = vlib_mains ? vlib_mains[os_get_cpu_number ()] : &vlib_global_main; + ASSERT (vm); + return vm; +} + +always_inline vlib_thread_main_t * +vlib_get_thread_main () +{ + return &vlib_thread_main; +} + +#endif /* included_vlib_global_funcs_h_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/i2c.c b/src/vlib/i2c.c new file mode 100644 index 00000000..97f5bb21 --- /dev/null +++ b/src/vlib/i2c.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +static inline void +i2c_delay (i2c_bus_t * b, f64 timeout) +{ + vlib_main_t *vm = vlib_get_main (); + vlib_time_wait (vm, timeout); +} + +static void +i2c_wait_for_scl (i2c_bus_t * b) +{ + f64 t = 0; + + while (t < b->hold_time) + { + int sda, scl; + i2c_delay (b, b->rise_fall_time); + b->get_bits (b, &scl, &sda); + + if (scl) + return; + + t += b->rise_fall_time; + } + b->timeout = 1; +} + +static void +i2c_start (i2c_bus_t * b) +{ + b->timeout = 0; + + b->put_bits (b, 1, 1); + i2c_wait_for_scl (b); + + if (vlib_i2c_bus_timed_out (b)) + return; + + b->put_bits (b, 1, 0); + i2c_delay (b, b->hold_time); + b->put_bits (b, 0, 0); + i2c_delay (b, b->hold_time); +} + +static void +i2c_stop (i2c_bus_t * b) +{ + b->put_bits (b, 0, 0); + i2c_delay (b, b->rise_fall_time); + + b->put_bits (b, 1, 0); + i2c_delay (b, b->hold_time); + + b->put_bits (b, 1, 1); + i2c_delay (b, b->hold_time); +} + +static void +i2c_write_bit (i2c_bus_t * b, int sda) +{ + b->put_bits (b, 0, sda); + i2c_delay (b, b->rise_fall_time); + + b->put_bits (b, 1, sda); + i2c_wait_for_scl (b); + i2c_delay (b, b->hold_time); + + b->put_bits (b, 0, sda); + i2c_delay (b, b->rise_fall_time); +} + +static void +i2c_read_bit (i2c_bus_t * b, int *sda) +{ + int scl; + + b->put_bits (b, 1, 1); + i2c_wait_for_scl (b); + i2c_delay (b, b->hold_time); + + b->get_bits (b, &scl, sda); + + b->put_bits (b, 0, 1); + i2c_delay (b, b->rise_fall_time); +} + +static void +i2c_write_byte (i2c_bus_t * b, u8 data) +{ + int i, sda; + + for (i = 7; i >= 0; i--) + { + i2c_write_bit (b, (data >> i) & 1); + if (b->timeout) + return; + } + + b->put_bits (b, 0, 1); + i2c_delay (b, b->rise_fall_time); + + i2c_read_bit (b, &sda); + + if (sda) + b->timeout = 1; +} + + +static void +i2c_read_byte (i2c_bus_t * b, u8 * data, int ack) +{ + int i, sda; + + *data = 0; + + b->put_bits (b, 0, 1); + i2c_delay (b, b->rise_fall_time); + + for (i = 7; i >= 0; i--) + { + i2c_read_bit (b, &sda); + if (b->timeout) + return; + + *data |= (sda != 0) << i; + } + + i2c_write_bit (b, ack == 0); +} + + +void +vlib_i2c_init (i2c_bus_t * b) +{ + f64 tick; + if (!b->clock) + b->clock = 400000; + + tick = 1.0 / b->clock; + + /* Spend 40% of time in low and high states */ + if (!b->hold_time) + b->hold_time = 0.4 * tick; + + /* Spend 10% of time waiting for rise and fall */ + if (!b->rise_fall_time) + b->rise_fall_time = 0.1 * tick; +} + +void +vlib_i2c_xfer (i2c_bus_t * bus, i2c_msg_t * msgs) +{ + i2c_msg_t *msg; + int i; + + vec_foreach (msg, msgs) + { + i2c_start (bus); + i2c_write_byte (bus, + (msg->addr << 1) + (msg->flags == I2C_MSG_FLAG_READ)); + + if (msg->flags & I2C_MSG_FLAG_READ) + for (i = 0; i < msg->len; i++) + { + i2c_read_byte (bus, &msg->buffer[i], /* ack */ i + 1 != msg->len); + if (bus->timeout) + goto done; + } + + else + for (i = 0; i < msg->len; i++) + { + i2c_write_byte (bus, msg->buffer[i]); + if (bus->timeout) + goto done; + } + } + +done: + i2c_stop (bus); +} + +void +vlib_i2c_read_eeprom (i2c_bus_t * bus, u8 i2c_addr, u16 start_addr, + u16 length, u8 * data) +{ + i2c_msg_t *msg = 0; + u8 start_address[1]; + + vec_validate (msg, 1); + + start_address[0] = start_addr; + msg[0].addr = i2c_addr; + msg[0].flags = I2C_MSG_FLAG_WRITE; + msg[0].buffer = (u8 *) & start_address; + msg[0].len = 1; + + msg[1].addr = i2c_addr; + msg[1].flags = I2C_MSG_FLAG_READ; + msg[1].buffer = data; + msg[1].len = length; + + vlib_i2c_xfer (bus, msg); + + vec_free (msg); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/i2c.h b/src/vlib/i2c.h new file mode 100644 index 00000000..b79bdc75 --- /dev/null +++ b/src/vlib/i2c.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_vlib_i2c_h +#define included_vlib_i2c_h + +#include + + +#define I2C_MSG_FLAG_WRITE 0 +#define I2C_MSG_FLAG_READ 1 + +typedef struct +{ + u8 addr; + u8 flags; + u16 len; + u8 *buffer; +} i2c_msg_t; + +typedef struct i2c_bus_t +{ + void (*put_bits) (struct i2c_bus_t * b, int scl, int sda); + void (*get_bits) (struct i2c_bus_t * b, int *scl, int *sda); + + int timeout; + u32 clock; + f64 hold_time; + f64 rise_fall_time; + + /* Private data */ + uword private_data; + +} i2c_bus_t; + +void vlib_i2c_init (i2c_bus_t * bus); +void vlib_i2c_xfer (i2c_bus_t * bus, i2c_msg_t * msgs); +void vlib_i2c_read_eeprom (i2c_bus_t * bus, u8 i2c_addr, u16 start_addr, + u16 length, u8 * data); + +static inline int +vlib_i2c_bus_timed_out (i2c_bus_t * bus) +{ + return bus->timeout; +} + +#endif /* included_vlib_i2c_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/init.c b/src/vlib/init.c new file mode 100644 index 00000000..8d478451 --- /dev/null +++ b/src/vlib/init.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * init.c: mechanism for functions to be called at init/exit. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include + +clib_error_t * +vlib_call_init_exit_functions (vlib_main_t * vm, + _vlib_init_function_list_elt_t * head, + int call_once) +{ + clib_error_t *error = 0; + _vlib_init_function_list_elt_t *i; + + i = head; + while (i) + { + if (call_once && !hash_get (vm->init_functions_called, i->f)) + { + if (call_once) + hash_set1 (vm->init_functions_called, i->f); + error = i->f (vm); + if (error) + return error; + } + i = i->next_init_function; + } + return error; +} + +clib_error_t * +vlib_call_all_init_functions (vlib_main_t * vm) +{ + /* Call dummy functions to make sure purely static modules are + linked in. */ +#define _(f) vlib_##f##_reference (); + foreach_vlib_module_reference; +#undef _ + + return vlib_call_init_exit_functions + (vm, vm->init_function_registrations, 1 /* call_once */ ); +} + +clib_error_t * +vlib_call_all_main_loop_enter_functions (vlib_main_t * vm) +{ + return vlib_call_init_exit_functions + (vm, vm->main_loop_enter_function_registrations, 1 /* call_once */ ); +} + +clib_error_t * +vlib_call_all_main_loop_exit_functions (vlib_main_t * vm) +{ + return vlib_call_init_exit_functions + (vm, vm->main_loop_exit_function_registrations, 1 /* call_once */ ); +} + +clib_error_t * +vlib_call_all_config_functions (vlib_main_t * vm, + unformat_input_t * input, int is_early) +{ + clib_error_t *error = 0; + vlib_config_function_runtime_t *c, **all; + uword *hash = 0, *p; + uword i; + + hash = hash_create_string (0, sizeof (uword)); + all = 0; + + c = vm->config_function_registrations; + + while (c) + { + hash_set_mem (hash, c->name, vec_len (all)); + vec_add1 (all, c); + unformat_init (&c->input, 0, 0); + c = c->next_registration; + } + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + u8 *s, *v; + + if (!unformat (input, "%s %v", &s, &v) || !(p = hash_get_mem (hash, s))) + { + error = clib_error_create ("unknown input `%s %v'", s, v); + goto done; + } + + c = all[p[0]]; + if (vec_len (c->input.buffer) > 0) + vec_add1 (c->input.buffer, ' '); + vec_add (c->input.buffer, v, vec_len (v)); + vec_free (v); + vec_free (s); + } + + for (i = 0; i < vec_len (all); i++) + { + c = all[i]; + + /* Is this an early config? Are we doing early configs? */ + if (is_early ^ c->is_early) + continue; + + /* Already called? */ + if (hash_get (vm->init_functions_called, c->function)) + continue; + hash_set1 (vm->init_functions_called, c->function); + + error = c->function (vm, &c->input); + if (error) + goto done; + } + +done: + for (i = 0; i < vec_len (all); i++) + { + c = all[i]; + unformat_free (&c->input); + } + vec_free (all); + hash_free (hash); + return error; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/init.h b/src/vlib/init.h new file mode 100644 index 00000000..4fa5b304 --- /dev/null +++ b/src/vlib/init.h @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * init.h: mechanism for functions to be called at init/exit. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_init_h +#define included_vlib_init_h + +#include +#include +#include + +/* Init/exit functions: called at start/end of main routine. Init + functions are typically used to register and setup packet + processing nodes. */ + +typedef clib_error_t *(vlib_init_function_t) (struct vlib_main_t * vm); + +typedef struct _vlib_init_function_list_elt +{ + struct _vlib_init_function_list_elt *next_init_function; + vlib_init_function_t *f; +} _vlib_init_function_list_elt_t; + +/* Configuration functions: called with configuration input just before + main polling loop starts. */ +typedef clib_error_t *(vlib_config_function_t) (struct vlib_main_t * vm, + unformat_input_t * input); + +typedef struct vlib_config_function_runtime_t +{ + /* Function to call. Set to null once function has already been called. */ + vlib_config_function_t *function; + + /* Input for function. */ + unformat_input_t input; + + /* next config function registration */ + struct vlib_config_function_runtime_t *next_registration; + + /* To be invoked as soon as the clib heap is available */ + u8 is_early; + + /* Name used to distinguish input on command line. */ + char name[32]; +} vlib_config_function_runtime_t; + +#define _VLIB_INIT_FUNCTION_SYMBOL(x, type) \ + _vlib_##type##_function_##x + +#define VLIB_INIT_FUNCTION_SYMBOL(x) \ + _VLIB_INIT_FUNCTION_SYMBOL(x, init) +#define VLIB_MAIN_LOOP_ENTER_FUNCTION_SYMBOL(x) \ + _VLIB_INIT_FUNCTION_SYMBOL(x, main_loop_enter) +#define VLIB_MAIN_LOOP_EXIT_FUNCTION_SYMBOL(x) \ + _VLIB_INIT_FUNCTION_SYMBOL(x, main_loop_exit) +#define VLIB_CONFIG_FUNCTION_SYMBOL(x) \ + _VLIB_INIT_FUNCTION_SYMBOL(x, config) + +/* Declaration is global (e.g. not static) so that init functions can + be called from other modules to resolve init function depend. */ + +#define VLIB_DECLARE_INIT_FUNCTION(x, tag) \ +vlib_init_function_t * _VLIB_INIT_FUNCTION_SYMBOL (x, tag) = x; \ +static void __vlib_add_##tag##_function_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_add_##tag##_function_##x (void) \ +{ \ + vlib_main_t * vm = vlib_get_main(); \ + static _vlib_init_function_list_elt_t _vlib_init_function; \ + _vlib_init_function.next_init_function \ + = vm->tag##_function_registrations; \ + vm->tag##_function_registrations = &_vlib_init_function; \ + _vlib_init_function.f = &x; \ +} + +#define VLIB_INIT_FUNCTION(x) VLIB_DECLARE_INIT_FUNCTION(x,init) + +#define VLIB_MAIN_LOOP_ENTER_FUNCTION(x) \ + VLIB_DECLARE_INIT_FUNCTION(x,main_loop_enter) +#define VLIB_MAIN_LOOP_EXIT_FUNCTION(x) \ +VLIB_DECLARE_INIT_FUNCTION(x,main_loop_exit) + +#define VLIB_CONFIG_FUNCTION(x,n,...) \ + __VA_ARGS__ vlib_config_function_runtime_t \ + VLIB_CONFIG_FUNCTION_SYMBOL(x); \ +static void __vlib_add_config_function_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_add_config_function_##x (void) \ +{ \ + vlib_main_t * vm = vlib_get_main(); \ + VLIB_CONFIG_FUNCTION_SYMBOL(x).next_registration \ + = vm->config_function_registrations; \ + vm->config_function_registrations \ + = &VLIB_CONFIG_FUNCTION_SYMBOL(x); \ +} \ + vlib_config_function_runtime_t \ + VLIB_CONFIG_FUNCTION_SYMBOL (x) \ + = { \ + .name = n, \ + .function = x, \ + .is_early = 0, \ + } + +#define VLIB_EARLY_CONFIG_FUNCTION(x,n,...) \ + __VA_ARGS__ vlib_config_function_runtime_t \ + VLIB_CONFIG_FUNCTION_SYMBOL(x); \ +static void __vlib_add_config_function_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_add_config_function_##x (void) \ +{ \ + vlib_main_t * vm = vlib_get_main(); \ + VLIB_CONFIG_FUNCTION_SYMBOL(x).next_registration \ + = vm->config_function_registrations; \ + vm->config_function_registrations \ + = &VLIB_CONFIG_FUNCTION_SYMBOL(x); \ +} \ + vlib_config_function_runtime_t \ + VLIB_CONFIG_FUNCTION_SYMBOL (x) \ + = { \ + .name = n, \ + .function = x, \ + .is_early = 1, \ + } + +/* Call given init function: used for init function dependencies. */ +#define vlib_call_init_function(vm, x) \ + ({ \ + extern vlib_init_function_t * VLIB_INIT_FUNCTION_SYMBOL (x); \ + vlib_init_function_t * _f = VLIB_INIT_FUNCTION_SYMBOL (x); \ + clib_error_t * _error = 0; \ + if (! hash_get (vm->init_functions_called, _f)) \ + { \ + hash_set1 (vm->init_functions_called, _f); \ + _error = _f (vm); \ + } \ + _error; \ + }) + +/* Don't call given init function: used to suppress parts of the netstack */ +#define vlib_mark_init_function_complete(vm, x) \ + ({ \ + extern vlib_init_function_t * VLIB_INIT_FUNCTION_SYMBOL (x); \ + vlib_init_function_t * _f = VLIB_INIT_FUNCTION_SYMBOL (x); \ + hash_set1 (vm->init_functions_called, _f); \ + }) + +#define vlib_call_post_graph_init_function(vm, x) \ + ({ \ + extern vlib_init_function_t * VLIB_POST_GRAPH_INIT_FUNCTION_SYMBOL (x); \ + vlib_init_function_t * _f = VLIB_POST_GRAPH_INIT_FUNCTION_SYMBOL (x); \ + clib_error_t * _error = 0; \ + if (! hash_get (vm->init_functions_called, _f)) \ + { \ + hash_set1 (vm->init_functions_called, _f); \ + _error = _f (vm); \ + } \ + _error; \ + }) + +#define vlib_call_config_function(vm, x) \ + ({ \ + vlib_config_function_runtime_t * _r; \ + clib_error_t * _error = 0; \ + extern vlib_config_function_runtime_t \ + VLIB_CONFIG_FUNCTION_SYMBOL (x); \ + \ + _r = &VLIB_CONFIG_FUNCTION_SYMBOL (x); \ + if (! hash_get (vm->init_functions_called, _r->function)) \ + { \ + hash_set1 (vm->init_functions_called, _r->function); \ + _error = _r->function (vm, &_r->input); \ + } \ + _error; \ + }) + +/* External functions. */ +clib_error_t *vlib_call_all_init_functions (struct vlib_main_t *vm); +clib_error_t *vlib_call_all_config_functions (struct vlib_main_t *vm, + unformat_input_t * input, + int is_early); +clib_error_t *vlib_call_all_main_loop_enter_functions (struct vlib_main_t + *vm); +clib_error_t *vlib_call_all_main_loop_exit_functions (struct vlib_main_t *vm); +clib_error_t *vlib_call_init_exit_functions (struct vlib_main_t *vm, + _vlib_init_function_list_elt_t * + head, int call_once); + +#define foreach_vlib_module_reference \ + _ (node_cli) \ + _ (trace_cli) + +/* Dummy function to get node_cli.c linked in. */ +#define _(x) void vlib_##x##_reference (void); +foreach_vlib_module_reference +#undef _ +#endif /* included_vlib_init_h */ +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/lex.c b/src/vlib/lex.c new file mode 100644 index 00000000..1cc8f167 --- /dev/null +++ b/src/vlib/lex.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include + +vlib_lex_main_t vlib_lex_main; + +#define LEX_DEBUG 0 + +u8 * +format_vlib_lex_token (u8 * s, va_list * args) +{ + vlib_lex_main_t *lm = va_arg (*args, vlib_lex_main_t *); + vlib_lex_token_t *t = va_arg (*args, vlib_lex_token_t *); + + if (t->token == VLIB_LEX_word) + s = format (s, "%s", t->value.as_pointer); + else + s = format (s, "%s", lm->lex_token_names[t->token]); + return s; +} + +void +vlib_lex_get_token (vlib_lex_main_t * lm, vlib_lex_token_t * rv) +{ + u8 c; + vlib_lex_table_t *t; + vlib_lex_table_entry_t *e; + uword tv; + + if (PREDICT_FALSE (lm->pushback_sp >= 0)) + { + rv[0] = lm->pushback_vector[lm->pushback_sp--]; + return; + } + + rv->value.as_uword = ~0; + + while (1) + { + if (PREDICT_FALSE (lm->current_index >= vec_len (lm->input_vector))) + { + rv->token = VLIB_LEX_eof; + return; + } + + t = vec_elt_at_index (lm->lex_tables, lm->current_table_index); + c = (lm->input_vector[lm->current_index++]) & 0x7f; + e = &t->entries[c]; + lm->current_table_index = e->next_table_index; + + switch (e->action) + { + case VLIB_LEX_IGNORE: + continue; + + case VLIB_LEX_START_NUMBER: + lm->current_token_value = 0; + /* fallthru */ + + case VLIB_LEX_ADD_TO_NUMBER: + lm->current_number_base = e->token; + lm->current_token_value *= lm->current_number_base; + tv = c - '0'; + if (tv >= lm->current_number_base) + { + tv = 10 + c - 'A'; + if (tv >= lm->current_number_base) + tv = 10 + c - 'a'; + } + lm->current_token_value += tv; + continue; + + case VLIB_LEX_ADD_TO_TOKEN: + vec_add1 (lm->token_buffer, c); + continue; + + case VLIB_LEX_KEYWORD_CHECK: + { + uword *p; + + vec_add1 (lm->token_buffer, 0); + + /* It's either a keyword or just a word. */ + p = hash_get_mem (lm->lex_keywords, lm->token_buffer); + if (p) + { + rv->token = p[0]; + if (LEX_DEBUG > 0) + clib_warning ("keyword '%s' token %s", + lm->token_buffer, + lm->lex_token_names[rv->token]); + } + else + { + /* it's a WORD */ + rv->token = VLIB_LEX_word; + rv->value.as_pointer = vec_dup (lm->token_buffer); + if (LEX_DEBUG > 0) + clib_warning ("%s, value '%s'", + lm->lex_token_names[VLIB_LEX_word], + rv->value.as_pointer); + } + _vec_len (lm->token_buffer) = 0; + + /* Rescan the character which terminated the keyword/word. */ + lm->current_index--; + return; + } + + case VLIB_LEX_RETURN_AND_RESCAN: + ASSERT (lm->current_index); + lm->current_index--; + /* note flow-through */ + + case VLIB_LEX_RETURN: + rv->token = e->token; + rv->value.as_uword = lm->current_token_value; + lm->current_token_value = ~0; + if (LEX_DEBUG > 0) + { + clib_warning + ("table %s char '%c'(0x%02x) next table %s return %s", + t->name, c, c, lm->lex_tables[e->next_table_index].name, + lm->lex_token_names[e->token]); + if (rv->token == VLIB_LEX_number) + clib_warning (" numeric value 0x%x (%d)", rv->value, + rv->value); + } + return; + } + } +} + +u16 +vlib_lex_add_token (vlib_lex_main_t * lm, char *token_name) +{ + uword *p; + u16 rv; + + p = hash_get_mem (lm->lex_tokens_by_name, token_name); + + if (p) + return p[0]; + + rv = vec_len (lm->lex_token_names); + hash_set_mem (lm->lex_tokens_by_name, token_name, rv); + vec_add1 (lm->lex_token_names, token_name); + + return rv; +} + +static u16 +add_keyword (vlib_lex_main_t * lm, char *keyword, char *token_name) +{ + uword *p; + u16 token; + + p = hash_get_mem (lm->lex_keywords, keyword); + + ASSERT (p == 0); + + token = vlib_lex_add_token (lm, token_name); + + hash_set_mem (lm->lex_keywords, keyword, token); + return token; +} + +u16 +vlib_lex_find_or_add_keyword (vlib_lex_main_t * lm, char *keyword, + char *token_name) +{ + uword *p = hash_get_mem (lm->lex_keywords, keyword); + return p ? p[0] : add_keyword (lm, keyword, token_name); +} + +void +vlib_lex_set_action_range (u32 table_index, u8 lo, u8 hi, u16 action, + u16 token, u32 next_table_index) +{ + int i; + vlib_lex_main_t *lm = &vlib_lex_main; + vlib_lex_table_t *t = pool_elt_at_index (lm->lex_tables, table_index); + + for (i = lo; i <= hi; i++) + { + ASSERT (i < ARRAY_LEN (t->entries)); + t->entries[i].action = action; + t->entries[i].token = token; + t->entries[i].next_table_index = next_table_index; + } +} + +u16 +vlib_lex_add_table (char *name) +{ + vlib_lex_main_t *lm = &vlib_lex_main; + vlib_lex_table_t *t; + uword *p; + + p = hash_get_mem (lm->lex_tables_by_name, name); + + ASSERT (p == 0); + + pool_get_aligned (lm->lex_tables, t, CLIB_CACHE_LINE_BYTES); + + t->name = name; + + hash_set_mem (lm->lex_tables_by_name, name, t - lm->lex_tables); + + vlib_lex_set_action_range (t - lm->lex_tables, 1, 0x7F, VLIB_LEX_IGNORE, ~0, + t - lm->lex_tables); + + vlib_lex_set_action_range (t - lm->lex_tables, 0, 0, VLIB_LEX_RETURN, + VLIB_LEX_eof, t - lm->lex_tables); + + return t - lm->lex_tables; +} + +void +vlib_lex_reset (vlib_lex_main_t * lm, u8 * input_vector) +{ + if (lm->pushback_vector) + _vec_len (lm->pushback_vector) = 0; + lm->pushback_sp = -1; + + lm->input_vector = input_vector; + lm->current_index = 0; +} + +static clib_error_t * +lex_onetime_init (vlib_main_t * vm) +{ + vlib_lex_main_t *lm = &vlib_lex_main; + + lm->lex_tables_by_name = hash_create_string (0, sizeof (uword)); + lm->lex_tokens_by_name = hash_create_string (0, sizeof (uword)); + lm->lex_keywords = hash_create_string (0, sizeof (uword)); + lm->pushback_sp = -1; + +#define _(f) { u16 tmp = vlib_lex_add_token (lm, #f); ASSERT (tmp == VLIB_LEX_##f); } + foreach_vlib_lex_global_token; +#undef _ + + vec_validate (lm->token_buffer, 127); + _vec_len (lm->token_buffer) = 0; + + return 0; +} + +VLIB_INIT_FUNCTION (lex_onetime_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/lex.h b/src/vlib/lex.h new file mode 100644 index 00000000..4ae58f46 --- /dev/null +++ b/src/vlib/lex.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_vlib_lex_h +#define included_vlib_lex_h + +#include +#include +#include +#include + +#define foreach_vlib_lex_global_token \ + _ (invalid) \ + _ (eof) \ + _ (word) \ + _ (number) \ + _ (lt) \ + _ (gt) \ + _ (dot) \ + _ (slash) \ + _ (qmark) \ + _ (equals) \ + _ (plus) \ + _ (minus) \ + _ (star) \ + _ (lpar) \ + _ (rpar) + +typedef enum +{ +#define _(f) VLIB_LEX_##f, + foreach_vlib_lex_global_token +#undef _ +} vlib_lex_global_token_t; + +typedef enum +{ + VLIB_LEX_IGNORE, + VLIB_LEX_ADD_TO_TOKEN, + VLIB_LEX_RETURN, + VLIB_LEX_RETURN_AND_RESCAN, + VLIB_LEX_KEYWORD_CHECK, + VLIB_LEX_START_NUMBER, + VLIB_LEX_ADD_TO_NUMBER, +} vlib_lex_action_t; + +typedef struct +{ + u16 action; + u16 next_table_index; + u16 token; +} vlib_lex_table_entry_t; + +typedef struct +{ + char *name; + vlib_lex_table_entry_t entries[128]; +} vlib_lex_table_t; + +typedef struct +{ + u32 token; + + union + { + uword as_uword; + void *as_pointer; + char *as_string; + } value; +} vlib_lex_token_t; + +typedef struct +{ + vlib_lex_table_t *lex_tables; + uword *lex_tables_by_name; + + /* Vector of token strings. */ + char **lex_token_names; + + /* Hash mapping c string name to token index. */ + uword *lex_tokens_by_name; + + /* Hash mapping c string keyword name to token index. */ + uword *lex_keywords; + + vlib_lex_token_t *pushback_vector; + + i32 pushback_sp; + + u32 current_table_index; + + uword current_token_value; + + uword current_number_base; + + /* Input string we are lex-ing. */ + u8 *input_vector; + + /* Current index into input vector. */ + u32 current_index; + + /* Re-used vector for forming token strings and hashing them. */ + u8 *token_buffer; +} vlib_lex_main_t; + +vlib_lex_main_t vlib_lex_main; + +always_inline void +vlib_lex_cleanup_token (vlib_lex_token_t * t) +{ + if (t->token == VLIB_LEX_word) + { + u8 *tv = t->value.as_pointer; + vec_free (tv); + } +} + +u16 vlib_lex_add_table (char *name); +void vlib_lex_get_token (vlib_lex_main_t * lm, vlib_lex_token_t * result); +u16 vlib_lex_add_token (vlib_lex_main_t * lm, char *token_name); +void vlib_lex_set_action_range (u32 table_index, u8 lo, u8 hi, u16 action, + u16 token, u32 next_table_index); +void vlib_lex_reset (vlib_lex_main_t * lm, u8 * input_vector); +format_function_t format_vlib_lex_token; + +#endif /* included_vlib_lex_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/main.c b/src/vlib/main.c new file mode 100644 index 00000000..6c6cad98 --- /dev/null +++ b/src/vlib/main.c @@ -0,0 +1,1703 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * main.c: main vector processing loop + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include + +#include + +CJ_GLOBAL_LOG_PROTOTYPE; + +/* Actually allocate a few extra slots of vector data to support + speculative vector enqueues which overflow vector data in next frame. */ +#define VLIB_FRAME_SIZE_ALLOC (VLIB_FRAME_SIZE + 4) + +u32 wraps; + +always_inline u32 +vlib_frame_bytes (u32 n_scalar_bytes, u32 n_vector_bytes) +{ + u32 n_bytes; + + /* Make room for vlib_frame_t plus scalar arguments. */ + n_bytes = vlib_frame_vector_byte_offset (n_scalar_bytes); + + /* Make room for vector arguments. + Allocate a few extra slots of vector data to support + speculative vector enqueues which overflow vector data in next frame. */ +#define VLIB_FRAME_SIZE_EXTRA 4 + n_bytes += (VLIB_FRAME_SIZE + VLIB_FRAME_SIZE_EXTRA) * n_vector_bytes; + + /* Magic number is first 32bit number after vector data. + Used to make sure that vector data is never overrun. */ +#define VLIB_FRAME_MAGIC (0xabadc0ed) + n_bytes += sizeof (u32); + + /* Pad to cache line. */ + n_bytes = round_pow2 (n_bytes, CLIB_CACHE_LINE_BYTES); + + return n_bytes; +} + +always_inline u32 * +vlib_frame_find_magic (vlib_frame_t * f, vlib_node_t * node) +{ + void *p = f; + + p += vlib_frame_vector_byte_offset (node->scalar_size); + + p += (VLIB_FRAME_SIZE + VLIB_FRAME_SIZE_EXTRA) * node->vector_size; + + return p; +} + +static vlib_frame_size_t * +get_frame_size_info (vlib_node_main_t * nm, + u32 n_scalar_bytes, u32 n_vector_bytes) +{ + uword key = (n_scalar_bytes << 16) | n_vector_bytes; + uword *p, i; + + p = hash_get (nm->frame_size_hash, key); + if (p) + i = p[0]; + else + { + i = vec_len (nm->frame_sizes); + vec_validate (nm->frame_sizes, i); + hash_set (nm->frame_size_hash, key, i); + } + + return vec_elt_at_index (nm->frame_sizes, i); +} + +static u32 +vlib_frame_alloc_to_node (vlib_main_t * vm, u32 to_node_index, + u32 frame_flags) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_frame_size_t *fs; + vlib_node_t *to_node; + vlib_frame_t *f; + u32 fi, l, n, scalar_size, vector_size; + + to_node = vlib_get_node (vm, to_node_index); + + scalar_size = to_node->scalar_size; + vector_size = to_node->vector_size; + + fs = get_frame_size_info (nm, scalar_size, vector_size); + n = vlib_frame_bytes (scalar_size, vector_size); + if ((l = vec_len (fs->free_frame_indices)) > 0) + { + /* Allocate from end of free list. */ + fi = fs->free_frame_indices[l - 1]; + f = vlib_get_frame_no_check (vm, fi); + _vec_len (fs->free_frame_indices) = l - 1; + } + else + { + f = clib_mem_alloc_aligned_no_fail (n, VLIB_FRAME_ALIGN); + f->cpu_index = vm->cpu_index; + fi = vlib_frame_index_no_check (vm, f); + } + + /* Poison frame when debugging. */ + if (CLIB_DEBUG > 0) + { + u32 save_cpu_index = f->cpu_index; + + memset (f, 0xfe, n); + + f->cpu_index = save_cpu_index; + } + + /* Insert magic number. */ + { + u32 *magic; + + magic = vlib_frame_find_magic (f, to_node); + *magic = VLIB_FRAME_MAGIC; + } + + f->flags = VLIB_FRAME_IS_ALLOCATED | frame_flags; + f->n_vectors = 0; + f->scalar_size = scalar_size; + f->vector_size = vector_size; + + fs->n_alloc_frames += 1; + + return fi; +} + +/* Allocate a frame for from FROM_NODE to TO_NODE via TO_NEXT_INDEX. + Returns frame index. */ +static u32 +vlib_frame_alloc (vlib_main_t * vm, vlib_node_runtime_t * from_node_runtime, + u32 to_next_index) +{ + vlib_node_t *from_node; + + from_node = vlib_get_node (vm, from_node_runtime->node_index); + ASSERT (to_next_index < vec_len (from_node->next_nodes)); + + return vlib_frame_alloc_to_node (vm, from_node->next_nodes[to_next_index], + /* frame_flags */ 0); +} + +vlib_frame_t * +vlib_get_frame_to_node (vlib_main_t * vm, u32 to_node_index) +{ + u32 fi = vlib_frame_alloc_to_node (vm, to_node_index, + /* frame_flags */ + VLIB_FRAME_FREE_AFTER_DISPATCH); + return vlib_get_frame (vm, fi); +} + +void +vlib_put_frame_to_node (vlib_main_t * vm, u32 to_node_index, vlib_frame_t * f) +{ + vlib_pending_frame_t *p; + vlib_node_t *to_node; + + if (f->n_vectors == 0) + return; + + to_node = vlib_get_node (vm, to_node_index); + + vec_add2 (vm->node_main.pending_frames, p, 1); + + f->flags |= VLIB_FRAME_PENDING; + p->frame_index = vlib_frame_index (vm, f); + p->node_runtime_index = to_node->runtime_index; + p->next_frame_index = VLIB_PENDING_FRAME_NO_NEXT_FRAME; +} + +/* Free given frame. */ +void +vlib_frame_free (vlib_main_t * vm, vlib_node_runtime_t * r, vlib_frame_t * f) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *node; + vlib_frame_size_t *fs; + u32 frame_index; + + ASSERT (f->flags & VLIB_FRAME_IS_ALLOCATED); + + node = vlib_get_node (vm, r->node_index); + fs = get_frame_size_info (nm, node->scalar_size, node->vector_size); + + frame_index = vlib_frame_index (vm, f); + + ASSERT (f->flags & VLIB_FRAME_IS_ALLOCATED); + + /* No next frames may point to freed frame. */ + if (CLIB_DEBUG > 0) + { + vlib_next_frame_t *nf; + vec_foreach (nf, vm->node_main.next_frames) + ASSERT (nf->frame_index != frame_index); + } + + f->flags &= ~VLIB_FRAME_IS_ALLOCATED; + + vec_add1 (fs->free_frame_indices, frame_index); + ASSERT (fs->n_alloc_frames > 0); + fs->n_alloc_frames -= 1; +} + +static clib_error_t * +show_frame_stats (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_frame_size_t *fs; + + vlib_cli_output (vm, "%=6s%=12s%=12s", "Size", "# Alloc", "# Free"); + vec_foreach (fs, nm->frame_sizes) + { + u32 n_alloc = fs->n_alloc_frames; + u32 n_free = vec_len (fs->free_frame_indices); + + if (n_alloc + n_free > 0) + vlib_cli_output (vm, "%=6d%=12d%=12d", + fs - nm->frame_sizes, n_alloc, n_free); + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_frame_stats_cli, static) = { + .path = "show vlib frame-allocation", + .short_help = "Show node dispatch frame statistics", + .function = show_frame_stats, +}; +/* *INDENT-ON* */ + +/* Change ownership of enqueue rights to given next node. */ +static void +vlib_next_frame_change_ownership (vlib_main_t * vm, + vlib_node_runtime_t * node_runtime, + u32 next_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_next_frame_t *next_frame; + vlib_node_t *node, *next_node; + + node = vec_elt (nm->nodes, node_runtime->node_index); + + /* Only internal & input nodes are allowed to call other nodes. */ + ASSERT (node->type == VLIB_NODE_TYPE_INTERNAL + || node->type == VLIB_NODE_TYPE_INPUT + || node->type == VLIB_NODE_TYPE_PROCESS); + + ASSERT (vec_len (node->next_nodes) == node_runtime->n_next_nodes); + + next_frame = + vlib_node_runtime_get_next_frame (vm, node_runtime, next_index); + next_node = vec_elt (nm->nodes, node->next_nodes[next_index]); + + if (next_node->owner_node_index != VLIB_INVALID_NODE_INDEX) + { + /* Get frame from previous owner. */ + vlib_next_frame_t *owner_next_frame; + vlib_next_frame_t tmp; + + owner_next_frame = + vlib_node_get_next_frame (vm, + next_node->owner_node_index, + next_node->owner_next_index); + + /* Swap target next frame with owner's. */ + tmp = owner_next_frame[0]; + owner_next_frame[0] = next_frame[0]; + next_frame[0] = tmp; + + /* + * If next_frame is already pending, we have to track down + * all pending frames and fix their next_frame_index fields. + */ + if (next_frame->flags & VLIB_FRAME_PENDING) + { + vlib_pending_frame_t *p; + if (next_frame->frame_index != ~0) + { + vec_foreach (p, nm->pending_frames) + { + if (p->frame_index == next_frame->frame_index) + { + p->next_frame_index = + next_frame - vm->node_main.next_frames; + } + } + } + } + } + else + { + /* No previous owner. Take ownership. */ + next_frame->flags |= VLIB_FRAME_OWNER; + } + + /* Record new owner. */ + next_node->owner_node_index = node->index; + next_node->owner_next_index = next_index; + + /* Now we should be owner. */ + ASSERT (next_frame->flags & VLIB_FRAME_OWNER); +} + +/* Make sure that magic number is still there. + Otherwise, it is likely that caller has overrun frame arguments. */ +always_inline void +validate_frame_magic (vlib_main_t * vm, + vlib_frame_t * f, vlib_node_t * n, uword next_index) +{ + vlib_node_t *next_node = vlib_get_node (vm, n->next_nodes[next_index]); + u32 *magic = vlib_frame_find_magic (f, next_node); + ASSERT (VLIB_FRAME_MAGIC == magic[0]); +} + +vlib_frame_t * +vlib_get_next_frame_internal (vlib_main_t * vm, + vlib_node_runtime_t * node, + u32 next_index, u32 allocate_new_next_frame) +{ + vlib_frame_t *f; + vlib_next_frame_t *nf; + u32 n_used; + + nf = vlib_node_runtime_get_next_frame (vm, node, next_index); + + /* Make sure this next frame owns right to enqueue to destination frame. */ + if (PREDICT_FALSE (!(nf->flags & VLIB_FRAME_OWNER))) + vlib_next_frame_change_ownership (vm, node, next_index); + + /* ??? Don't need valid flag: can use frame_index == ~0 */ + if (PREDICT_FALSE (!(nf->flags & VLIB_FRAME_IS_ALLOCATED))) + { + nf->frame_index = vlib_frame_alloc (vm, node, next_index); + nf->flags |= VLIB_FRAME_IS_ALLOCATED; + } + + f = vlib_get_frame (vm, nf->frame_index); + + /* Has frame been removed from pending vector (e.g. finished dispatching)? + If so we can reuse frame. */ + if ((nf->flags & VLIB_FRAME_PENDING) && !(f->flags & VLIB_FRAME_PENDING)) + { + nf->flags &= ~VLIB_FRAME_PENDING; + f->n_vectors = 0; + } + + /* Allocate new frame if current one is already full. */ + n_used = f->n_vectors; + if (n_used >= VLIB_FRAME_SIZE || (allocate_new_next_frame && n_used > 0)) + { + /* Old frame may need to be freed after dispatch, since we'll have + two redundant frames from node -> next node. */ + if (!(nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH)) + { + vlib_frame_t *f_old = vlib_get_frame (vm, nf->frame_index); + f_old->flags |= VLIB_FRAME_FREE_AFTER_DISPATCH; + } + + /* Allocate new frame to replace full one. */ + nf->frame_index = vlib_frame_alloc (vm, node, next_index); + f = vlib_get_frame (vm, nf->frame_index); + n_used = f->n_vectors; + } + + /* Should have free vectors in frame now. */ + ASSERT (n_used < VLIB_FRAME_SIZE); + + if (CLIB_DEBUG > 0) + { + validate_frame_magic (vm, f, + vlib_get_node (vm, node->node_index), next_index); + } + + return f; +} + +static void +vlib_put_next_frame_validate (vlib_main_t * vm, + vlib_node_runtime_t * rt, + u32 next_index, u32 n_vectors_left) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_next_frame_t *nf; + vlib_frame_t *f; + vlib_node_runtime_t *next_rt; + vlib_node_t *next_node; + u32 n_before, n_after; + + nf = vlib_node_runtime_get_next_frame (vm, rt, next_index); + f = vlib_get_frame (vm, nf->frame_index); + + ASSERT (n_vectors_left <= VLIB_FRAME_SIZE); + n_after = VLIB_FRAME_SIZE - n_vectors_left; + n_before = f->n_vectors; + + ASSERT (n_after >= n_before); + + next_rt = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL], + nf->node_runtime_index); + next_node = vlib_get_node (vm, next_rt->node_index); + if (n_after > 0 && next_node->validate_frame) + { + u8 *msg = next_node->validate_frame (vm, rt, f); + if (msg) + { + clib_warning ("%v", msg); + ASSERT (0); + } + vec_free (msg); + } +} + +void +vlib_put_next_frame (vlib_main_t * vm, + vlib_node_runtime_t * r, + u32 next_index, u32 n_vectors_left) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_next_frame_t *nf; + vlib_frame_t *f; + u32 n_vectors_in_frame; + + if (DPDK == 0 && CLIB_DEBUG > 0) + vlib_put_next_frame_validate (vm, r, next_index, n_vectors_left); + + nf = vlib_node_runtime_get_next_frame (vm, r, next_index); + f = vlib_get_frame (vm, nf->frame_index); + + /* Make sure that magic number is still there. Otherwise, caller + has overrun frame meta data. */ + if (CLIB_DEBUG > 0) + { + vlib_node_t *node = vlib_get_node (vm, r->node_index); + validate_frame_magic (vm, f, node, next_index); + } + + /* Convert # of vectors left -> number of vectors there. */ + ASSERT (n_vectors_left <= VLIB_FRAME_SIZE); + n_vectors_in_frame = VLIB_FRAME_SIZE - n_vectors_left; + + f->n_vectors = n_vectors_in_frame; + + /* If vectors were added to frame, add to pending vector. */ + if (PREDICT_TRUE (n_vectors_in_frame > 0)) + { + vlib_pending_frame_t *p; + u32 v0, v1; + + r->cached_next_index = next_index; + + if (!(f->flags & VLIB_FRAME_PENDING)) + { + __attribute__ ((unused)) vlib_node_t *node; + vlib_node_t *next_node; + vlib_node_runtime_t *next_runtime; + + node = vlib_get_node (vm, r->node_index); + next_node = vlib_get_next_node (vm, r->node_index, next_index); + next_runtime = vlib_node_get_runtime (vm, next_node->index); + + vec_add2 (nm->pending_frames, p, 1); + + p->frame_index = nf->frame_index; + p->node_runtime_index = nf->node_runtime_index; + p->next_frame_index = nf - nm->next_frames; + nf->flags |= VLIB_FRAME_PENDING; + f->flags |= VLIB_FRAME_PENDING; + + /* + * If we're going to dispatch this frame on another thread, + * force allocation of a new frame. Otherwise, we create + * a dangling frame reference. Each thread has its own copy of + * the next_frames vector. + */ + if (0 && r->cpu_index != next_runtime->cpu_index) + { + nf->frame_index = ~0; + nf->flags &= ~(VLIB_FRAME_PENDING | VLIB_FRAME_IS_ALLOCATED); + } + } + + /* Copy trace flag from next_frame and from runtime. */ + nf->flags |= + (nf->flags & VLIB_NODE_FLAG_TRACE) | (r-> + flags & VLIB_NODE_FLAG_TRACE); + + v0 = nf->vectors_since_last_overflow; + v1 = v0 + n_vectors_in_frame; + nf->vectors_since_last_overflow = v1; + if (PREDICT_FALSE (v1 < v0)) + { + vlib_node_t *node = vlib_get_node (vm, r->node_index); + vec_elt (node->n_vectors_by_next_node, next_index) += v0; + } + } +} + +/* Sync up runtime (32 bit counters) and main node stats (64 bit counters). */ +never_inline void +vlib_node_runtime_sync_stats (vlib_main_t * vm, + vlib_node_runtime_t * r, + uword n_calls, uword n_vectors, uword n_clocks) +{ + vlib_node_t *n = vlib_get_node (vm, r->node_index); + + n->stats_total.calls += n_calls + r->calls_since_last_overflow; + n->stats_total.vectors += n_vectors + r->vectors_since_last_overflow; + n->stats_total.clocks += n_clocks + r->clocks_since_last_overflow; + n->stats_total.max_clock = r->max_clock; + n->stats_total.max_clock_n = r->max_clock_n; + + r->calls_since_last_overflow = 0; + r->vectors_since_last_overflow = 0; + r->clocks_since_last_overflow = 0; +} + +always_inline void __attribute__ ((unused)) +vlib_process_sync_stats (vlib_main_t * vm, + vlib_process_t * p, + uword n_calls, uword n_vectors, uword n_clocks) +{ + vlib_node_runtime_t *rt = &p->node_runtime; + vlib_node_t *n = vlib_get_node (vm, rt->node_index); + vlib_node_runtime_sync_stats (vm, rt, n_calls, n_vectors, n_clocks); + n->stats_total.suspends += p->n_suspends; + p->n_suspends = 0; +} + +void +vlib_node_sync_stats (vlib_main_t * vm, vlib_node_t * n) +{ + vlib_node_runtime_t *rt; + + if (n->type == VLIB_NODE_TYPE_PROCESS) + { + /* Nothing to do for PROCESS nodes except in main thread */ + if (vm != &vlib_global_main) + return; + + vlib_process_t *p = vlib_get_process_from_node (vm, n); + n->stats_total.suspends += p->n_suspends; + p->n_suspends = 0; + rt = &p->node_runtime; + } + else + rt = + vec_elt_at_index (vm->node_main.nodes_by_type[n->type], + n->runtime_index); + + vlib_node_runtime_sync_stats (vm, rt, 0, 0, 0); + + /* Sync up runtime next frame vector counters with main node structure. */ + { + vlib_next_frame_t *nf; + uword i; + for (i = 0; i < rt->n_next_nodes; i++) + { + nf = vlib_node_runtime_get_next_frame (vm, rt, i); + vec_elt (n->n_vectors_by_next_node, i) += + nf->vectors_since_last_overflow; + nf->vectors_since_last_overflow = 0; + } + } +} + +always_inline u32 +vlib_node_runtime_update_stats (vlib_main_t * vm, + vlib_node_runtime_t * node, + uword n_calls, + uword n_vectors, uword n_clocks) +{ + u32 ca0, ca1, v0, v1, cl0, cl1, r; + + cl0 = cl1 = node->clocks_since_last_overflow; + ca0 = ca1 = node->calls_since_last_overflow; + v0 = v1 = node->vectors_since_last_overflow; + + ca1 = ca0 + n_calls; + v1 = v0 + n_vectors; + cl1 = cl0 + n_clocks; + + node->calls_since_last_overflow = ca1; + node->clocks_since_last_overflow = cl1; + node->vectors_since_last_overflow = v1; + node->max_clock_n = node->max_clock > n_clocks ? + node->max_clock_n : n_vectors; + node->max_clock = node->max_clock > n_clocks ? node->max_clock : n_clocks; + + r = vlib_node_runtime_update_main_loop_vector_stats (vm, node, n_vectors); + + if (PREDICT_FALSE (ca1 < ca0 || v1 < v0 || cl1 < cl0)) + { + node->calls_since_last_overflow = ca0; + node->clocks_since_last_overflow = cl0; + node->vectors_since_last_overflow = v0; + vlib_node_runtime_sync_stats (vm, node, n_calls, n_vectors, n_clocks); + } + + return r; +} + +always_inline void +vlib_process_update_stats (vlib_main_t * vm, + vlib_process_t * p, + uword n_calls, uword n_vectors, uword n_clocks) +{ + vlib_node_runtime_update_stats (vm, &p->node_runtime, + n_calls, n_vectors, n_clocks); +} + +static clib_error_t * +vlib_cli_elog_clear (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + elog_reset_buffer (&vm->elog_main); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (elog_clear_cli, static) = { + .path = "event-logger clear", + .short_help = "Clear the event log", + .function = vlib_cli_elog_clear, +}; +/* *INDENT-ON* */ + +#ifdef CLIB_UNIX +static clib_error_t * +elog_save_buffer (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + elog_main_t *em = &vm->elog_main; + char *file, *chroot_file; + clib_error_t *error = 0; + + if (!unformat (input, "%s", &file)) + { + vlib_cli_output (vm, "expected file name, got `%U'", + format_unformat_error, input); + return 0; + } + + /* It's fairly hard to get "../oopsie" through unformat; just in case */ + if (strstr (file, "..") || index (file, '/')) + { + vlib_cli_output (vm, "illegal characters in filename '%s'", file); + return 0; + } + + chroot_file = (char *) format (0, "/tmp/%s%c", file, 0); + + vec_free (file); + + vlib_cli_output (vm, "Saving %wd of %wd events to %s", + elog_n_events_in_buffer (em), + elog_buffer_capacity (em), chroot_file); + + vlib_worker_thread_barrier_sync (vm); + error = elog_write_file (em, chroot_file); + vlib_worker_thread_barrier_release (vm); + vec_free (chroot_file); + return error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (elog_save_cli, static) = { + .path = "event-logger save", + .short_help = "event-logger save (saves log in /tmp/)", + .function = elog_save_buffer, +}; +/* *INDENT-ON* */ + +static clib_error_t * +elog_stop (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + elog_main_t *em = &vm->elog_main; + + em->n_total_events_disable_limit = em->n_total_events; + + vlib_cli_output (vm, "Stopped the event logger..."); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (elog_stop_cli, static) = { + .path = "event-logger stop", + .short_help = "Stop the event-logger", + .function = elog_stop, +}; +/* *INDENT-ON* */ + +static clib_error_t * +elog_restart (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + elog_main_t *em = &vm->elog_main; + + em->n_total_events_disable_limit = ~0; + + vlib_cli_output (vm, "Restarted the event logger..."); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (elog_restart_cli, static) = { + .path = "event-logger restart", + .short_help = "Restart the event-logger", + .function = elog_restart, +}; +/* *INDENT-ON* */ + +static clib_error_t * +elog_resize (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + elog_main_t *em = &vm->elog_main; + u32 tmp; + + /* Stop the parade */ + elog_reset_buffer (&vm->elog_main); + + if (unformat (input, "%d", &tmp)) + { + elog_alloc (em, tmp); + em->n_total_events_disable_limit = ~0; + } + else + return clib_error_return (0, "Must specify how many events in the ring"); + + vlib_cli_output (vm, "Resized ring and restarted the event logger..."); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (elog_resize_cli, static) = { + .path = "event-logger resize", + .short_help = "event-logger resize ", + .function = elog_resize, +}; +/* *INDENT-ON* */ + +#endif /* CLIB_UNIX */ + +static void +elog_show_buffer_internal (vlib_main_t * vm, u32 n_events_to_show) +{ + elog_main_t *em = &vm->elog_main; + elog_event_t *e, *es; + f64 dt; + + /* Show events in VLIB time since log clock starts after VLIB clock. */ + dt = (em->init_time.cpu - vm->clib_time.init_cpu_time) + * vm->clib_time.seconds_per_clock; + + es = elog_peek_events (em); + vlib_cli_output (vm, "%d of %d events in buffer, logger %s", vec_len (es), + em->event_ring_size, + em->n_total_events < em->n_total_events_disable_limit ? + "running" : "stopped"); + vec_foreach (e, es) + { + vlib_cli_output (vm, "%18.9f: %U", + e->time + dt, format_elog_event, em, e); + n_events_to_show--; + if (n_events_to_show == 0) + break; + } + vec_free (es); + +} + +static clib_error_t * +elog_show_buffer (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + u32 n_events_to_show; + clib_error_t *error = 0; + + n_events_to_show = 250; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%d", &n_events_to_show)) + ; + else if (unformat (input, "all")) + n_events_to_show = ~0; + else + return unformat_parse_error (input); + } + elog_show_buffer_internal (vm, n_events_to_show); + return error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (elog_show_cli, static) = { + .path = "show event-logger", + .short_help = "Show event logger info", + .function = elog_show_buffer, +}; +/* *INDENT-ON* */ + +void +vlib_gdb_show_event_log (void) +{ + elog_show_buffer_internal (vlib_get_main (), (u32) ~ 0); +} + +static inline void +vlib_elog_main_loop_event (vlib_main_t * vm, + u32 node_index, + u64 time, u32 n_vectors, u32 is_return) +{ + vlib_main_t *evm = &vlib_global_main; + elog_main_t *em = &evm->elog_main; + + if (VLIB_ELOG_MAIN_LOOP && n_vectors) + elog_track (em, + /* event type */ + vec_elt_at_index (is_return + ? evm->node_return_elog_event_types + : evm->node_call_elog_event_types, + node_index), + /* track */ + (vm->cpu_index ? &vlib_worker_threads[vm->cpu_index]. + elog_track : &em->default_track), + /* data to log */ n_vectors); +} + +void +vlib_dump_context_trace (vlib_main_t * vm, u32 bi) +{ + vlib_node_main_t *vnm = &vm->node_main; + vlib_buffer_t *b; + u8 i, n; + + if (VLIB_BUFFER_TRACE_TRAJECTORY) + { + b = vlib_get_buffer (vm, bi); + n = b->pre_data[0]; + + fformat (stderr, "Context trace for bi %d b 0x%llx, visited %d\n", + bi, b, n); + + if (n == 0 || n > 20) + { + fformat (stderr, "n is unreasonable\n"); + return; + } + + + for (i = 0; i < n; i++) + { + u32 node_index; + + node_index = b->pre_data[i + 1]; + + if (node_index > vec_len (vnm->nodes)) + { + fformat (stderr, "Skip bogus node index %d\n", node_index); + continue; + } + + fformat (stderr, "%v (%d)\n", vnm->nodes[node_index]->name, + node_index); + } + } + else + { + fformat (stderr, + "in vlib/buffers.h, #define VLIB_BUFFER_TRACE_TRAJECTORY 1\n"); + } +} + + +/* static_always_inline */ u64 +dispatch_node (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_node_type_t type, + vlib_node_state_t dispatch_state, + vlib_frame_t * frame, u64 last_time_stamp) +{ + uword n, v; + u64 t; + vlib_node_main_t *nm = &vm->node_main; + vlib_next_frame_t *nf; + + if (CLIB_DEBUG > 0) + { + vlib_node_t *n = vlib_get_node (vm, node->node_index); + ASSERT (n->type == type); + } + + /* Only non-internal nodes may be disabled. */ + if (type != VLIB_NODE_TYPE_INTERNAL && node->state != dispatch_state) + { + ASSERT (type != VLIB_NODE_TYPE_INTERNAL); + return last_time_stamp; + } + + if ((type == VLIB_NODE_TYPE_PRE_INPUT || type == VLIB_NODE_TYPE_INPUT) + && dispatch_state != VLIB_NODE_STATE_INTERRUPT) + { + u32 c = node->input_main_loops_per_call; + /* Only call node when count reaches zero. */ + if (c) + { + node->input_main_loops_per_call = c - 1; + return last_time_stamp; + } + } + + /* Speculatively prefetch next frames. */ + if (node->n_next_nodes > 0) + { + nf = vec_elt_at_index (nm->next_frames, node->next_frame_index); + CLIB_PREFETCH (nf, 4 * sizeof (nf[0]), WRITE); + } + + vm->cpu_time_last_node_dispatch = last_time_stamp; + + if (1 /* || vm->cpu_index == node->cpu_index */ ) + { + vlib_main_t *stat_vm; + + stat_vm = /* vlib_mains ? vlib_mains[0] : */ vm; + + vlib_elog_main_loop_event (vm, node->node_index, + last_time_stamp, + frame ? frame->n_vectors : 0, + /* is_after */ 0); + + /* + * Turn this on if you run into + * "bad monkey" contexts, and you want to know exactly + * which nodes they've visited... See ixge.c... + */ + if (VLIB_BUFFER_TRACE_TRAJECTORY && frame) + { + int i; + int log_index; + u32 *from; + from = vlib_frame_vector_args (frame); + for (i = 0; i < frame->n_vectors; i++) + { + vlib_buffer_t *b = vlib_get_buffer (vm, from[i]); + ASSERT (b->pre_data[0] < 32); + log_index = b->pre_data[0]++ + 1; + b->pre_data[log_index] = node->node_index; + } + n = node->function (vm, node, frame); + } + else + n = node->function (vm, node, frame); + + t = clib_cpu_time_now (); + + vlib_elog_main_loop_event (vm, node->node_index, t, n, /* is_after */ + 1); + + vm->main_loop_vectors_processed += n; + vm->main_loop_nodes_processed += n > 0; + + v = vlib_node_runtime_update_stats (stat_vm, node, + /* n_calls */ 1, + /* n_vectors */ n, + /* n_clocks */ t - last_time_stamp); + + /* When in interrupt mode and vector rate crosses threshold switch to + polling mode. */ + if ((DPDK == 0 && dispatch_state == VLIB_NODE_STATE_INTERRUPT) + || (DPDK == 0 && dispatch_state == VLIB_NODE_STATE_POLLING + && (node->flags + & VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE))) + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "%s vector length %d, switching to %s",.format_args = + "T4i4t4",.n_enum_strings = 2,.enum_strings = + { + "interrupt", "polling",},}; + struct + { + u32 node_name, vector_length, is_polling; + } *ed; + + if (dispatch_state == VLIB_NODE_STATE_INTERRUPT + && v >= nm->polling_threshold_vector_length) + { + vlib_node_t *n = vlib_get_node (vm, node->node_index); + n->state = VLIB_NODE_STATE_POLLING; + node->state = VLIB_NODE_STATE_POLLING; + ASSERT (! + (node->flags & + VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE)); + node->flags &= + ~VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE; + node->flags |= + VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE; + nm->input_node_counts_by_state[VLIB_NODE_STATE_INTERRUPT] -= 1; + nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] += 1; + + ed = ELOG_DATA (&vm->elog_main, e); + ed->node_name = n->name_elog_string; + ed->vector_length = v; + ed->is_polling = 1; + } + else if (dispatch_state == VLIB_NODE_STATE_POLLING + && v <= nm->interrupt_threshold_vector_length) + { + vlib_node_t *n = vlib_get_node (vm, node->node_index); + if (node->flags & + VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE) + { + /* Switch to interrupt mode after dispatch in polling one more time. + This allows driver to re-enable interrupts. */ + n->state = VLIB_NODE_STATE_INTERRUPT; + node->state = VLIB_NODE_STATE_INTERRUPT; + node->flags &= + ~VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE; + nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] -= + 1; + nm->input_node_counts_by_state[VLIB_NODE_STATE_INTERRUPT] += + 1; + + } + else + { + node->flags |= + VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE; + ed = ELOG_DATA (&vm->elog_main, e); + ed->node_name = n->name_elog_string; + ed->vector_length = v; + ed->is_polling = 0; + } + } + } + } + + return t; +} + +/* static */ u64 +dispatch_pending_node (vlib_main_t * vm, + vlib_pending_frame_t * p, u64 last_time_stamp) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_frame_t *f; + vlib_next_frame_t *nf, nf_dummy; + vlib_node_runtime_t *n; + u32 restore_frame_index; + + n = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL], + p->node_runtime_index); + + f = vlib_get_frame (vm, p->frame_index); + if (p->next_frame_index == VLIB_PENDING_FRAME_NO_NEXT_FRAME) + { + /* No next frame: so use dummy on stack. */ + nf = &nf_dummy; + nf->flags = f->flags & VLIB_NODE_FLAG_TRACE; + nf->frame_index = ~p->frame_index; + } + else + nf = vec_elt_at_index (nm->next_frames, p->next_frame_index); + + ASSERT (f->flags & VLIB_FRAME_IS_ALLOCATED); + + /* Force allocation of new frame while current frame is being + dispatched. */ + restore_frame_index = ~0; + if (nf->frame_index == p->frame_index) + { + nf->frame_index = ~0; + nf->flags &= ~VLIB_FRAME_IS_ALLOCATED; + if (!(n->flags & VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH)) + restore_frame_index = p->frame_index; + } + + /* Frame must be pending. */ + ASSERT (f->flags & VLIB_FRAME_PENDING); + ASSERT (f->n_vectors > 0); + + /* Copy trace flag from next frame to node. + Trace flag indicates that at least one vector in the dispatched + frame is traced. */ + n->flags &= ~VLIB_NODE_FLAG_TRACE; + n->flags |= (nf->flags & VLIB_FRAME_TRACE) ? VLIB_NODE_FLAG_TRACE : 0; + nf->flags &= ~VLIB_FRAME_TRACE; + + last_time_stamp = dispatch_node (vm, n, + VLIB_NODE_TYPE_INTERNAL, + VLIB_NODE_STATE_POLLING, + f, last_time_stamp); + + f->flags &= ~VLIB_FRAME_PENDING; + + /* Frame is ready to be used again, so restore it. */ + if (restore_frame_index != ~0) + { + /* we musn't restore a frame that is flagged to be freed. This shouldn't + happen since frames to be freed post dispatch are those used + when the to-node frame becomes full i.e. they form a sort of queue of + frames to a single node. If we get here then the to-node frame and the + pending frame *were* the same, and so we removed the to-node frame. + Therefore this frame is no longer part of the queue for that node + and hence it cannot be it's overspill. + */ + ASSERT (!(f->flags & VLIB_FRAME_FREE_AFTER_DISPATCH)); + + /* p->next_frame_index can change during node dispatch if node + function decides to change graph hook up. */ + nf = vec_elt_at_index (nm->next_frames, p->next_frame_index); + nf->flags |= VLIB_FRAME_IS_ALLOCATED; + + if (~0 == nf->frame_index) + { + /* no new frame has been assigned to this node, use the saved one */ + nf->frame_index = restore_frame_index; + f->n_vectors = 0; + } + else + { + /* The node has gained a frame, implying packets from the current frame + were re-queued to this same node. we don't need the saved one + anymore */ + vlib_frame_free (vm, n, f); + } + } + else + { + if (f->flags & VLIB_FRAME_FREE_AFTER_DISPATCH) + { + ASSERT (!(n->flags & VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH)); + vlib_frame_free (vm, n, f); + } + } + + return last_time_stamp; +} + +always_inline uword +vlib_process_stack_is_valid (vlib_process_t * p) +{ + return p->stack[0] == VLIB_PROCESS_STACK_MAGIC; +} + +typedef struct +{ + vlib_main_t *vm; + vlib_process_t *process; + vlib_frame_t *frame; +} vlib_process_bootstrap_args_t; + +/* Called in process stack. */ +static uword +vlib_process_bootstrap (uword _a) +{ + vlib_process_bootstrap_args_t *a; + vlib_main_t *vm; + vlib_node_runtime_t *node; + vlib_frame_t *f; + vlib_process_t *p; + uword n; + + a = uword_to_pointer (_a, vlib_process_bootstrap_args_t *); + + vm = a->vm; + p = a->process; + f = a->frame; + node = &p->node_runtime; + + n = node->function (vm, node, f); + + ASSERT (vlib_process_stack_is_valid (p)); + + clib_longjmp (&p->return_longjmp, n); + + return n; +} + +/* Called in main stack. */ +static_always_inline uword +vlib_process_startup (vlib_main_t * vm, vlib_process_t * p, vlib_frame_t * f) +{ + vlib_process_bootstrap_args_t a; + uword r; + + a.vm = vm; + a.process = p; + a.frame = f; + + r = clib_setjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_RETURN); + if (r == VLIB_PROCESS_RETURN_LONGJMP_RETURN) + r = clib_calljmp (vlib_process_bootstrap, pointer_to_uword (&a), + (void *) p->stack + (1 << p->log2_n_stack_bytes)); + + return r; +} + +static_always_inline uword +vlib_process_resume (vlib_process_t * p) +{ + uword r; + p->flags &= ~(VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK + | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT + | VLIB_PROCESS_RESUME_PENDING); + r = clib_setjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_RETURN); + if (r == VLIB_PROCESS_RETURN_LONGJMP_RETURN) + clib_longjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_RESUME); + return r; +} + +static u64 +dispatch_process (vlib_main_t * vm, + vlib_process_t * p, vlib_frame_t * f, u64 last_time_stamp) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_runtime_t *node_runtime = &p->node_runtime; + vlib_node_t *node = vlib_get_node (vm, node_runtime->node_index); + u64 t; + uword n_vectors, is_suspend; + + if (node->state != VLIB_NODE_STATE_POLLING + || (p->flags & (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK + | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT))) + return last_time_stamp; + + p->flags |= VLIB_PROCESS_IS_RUNNING; + + t = last_time_stamp; + vlib_elog_main_loop_event (vm, node_runtime->node_index, t, + f ? f->n_vectors : 0, /* is_after */ 0); + + /* Save away current process for suspend. */ + nm->current_process_index = node->runtime_index; + + n_vectors = vlib_process_startup (vm, p, f); + + nm->current_process_index = ~0; + + ASSERT (n_vectors != VLIB_PROCESS_RETURN_LONGJMP_RETURN); + is_suspend = n_vectors == VLIB_PROCESS_RETURN_LONGJMP_SUSPEND; + if (is_suspend) + { + vlib_pending_frame_t *pf; + + n_vectors = 0; + pool_get (nm->suspended_process_frames, pf); + pf->node_runtime_index = node->runtime_index; + pf->frame_index = f ? vlib_frame_index (vm, f) : ~0; + pf->next_frame_index = ~0; + + p->n_suspends += 1; + p->suspended_process_frame_index = pf - nm->suspended_process_frames; + + if (p->flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK) + timing_wheel_insert (&nm->timing_wheel, p->resume_cpu_time, + vlib_timing_wheel_data_set_suspended_process + (node->runtime_index)); + } + else + p->flags &= ~VLIB_PROCESS_IS_RUNNING; + + t = clib_cpu_time_now (); + + vlib_elog_main_loop_event (vm, node_runtime->node_index, t, is_suspend, + /* is_after */ 1); + + vlib_process_update_stats (vm, p, + /* n_calls */ !is_suspend, + /* n_vectors */ n_vectors, + /* n_clocks */ t - last_time_stamp); + + return t; +} + +void +vlib_start_process (vlib_main_t * vm, uword process_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p = vec_elt (nm->processes, process_index); + dispatch_process (vm, p, /* frame */ 0, /* cpu_time_now */ 0); +} + +static u64 +dispatch_suspended_process (vlib_main_t * vm, + uword process_index, u64 last_time_stamp) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_runtime_t *node_runtime; + vlib_node_t *node; + vlib_frame_t *f; + vlib_process_t *p; + vlib_pending_frame_t *pf; + u64 t, n_vectors, is_suspend; + + t = last_time_stamp; + + p = vec_elt (nm->processes, process_index); + if (PREDICT_FALSE (!(p->flags & VLIB_PROCESS_IS_RUNNING))) + return last_time_stamp; + + ASSERT (p->flags & (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK + | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT)); + + pf = + pool_elt_at_index (nm->suspended_process_frames, + p->suspended_process_frame_index); + + node_runtime = &p->node_runtime; + node = vlib_get_node (vm, node_runtime->node_index); + f = pf->frame_index != ~0 ? vlib_get_frame (vm, pf->frame_index) : 0; + + vlib_elog_main_loop_event (vm, node_runtime->node_index, t, + f ? f->n_vectors : 0, /* is_after */ 0); + + /* Save away current process for suspend. */ + nm->current_process_index = node->runtime_index; + + n_vectors = vlib_process_resume (p); + t = clib_cpu_time_now (); + + nm->current_process_index = ~0; + + is_suspend = n_vectors == VLIB_PROCESS_RETURN_LONGJMP_SUSPEND; + if (is_suspend) + { + /* Suspend it again. */ + n_vectors = 0; + p->n_suspends += 1; + if (p->flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK) + timing_wheel_insert (&nm->timing_wheel, p->resume_cpu_time, + vlib_timing_wheel_data_set_suspended_process + (node->runtime_index)); + } + else + { + p->flags &= ~VLIB_PROCESS_IS_RUNNING; + p->suspended_process_frame_index = ~0; + pool_put (nm->suspended_process_frames, pf); + } + + t = clib_cpu_time_now (); + vlib_elog_main_loop_event (vm, node_runtime->node_index, t, !is_suspend, + /* is_after */ 1); + + vlib_process_update_stats (vm, p, + /* n_calls */ !is_suspend, + /* n_vectors */ n_vectors, + /* n_clocks */ t - last_time_stamp); + + return t; +} + +static void +vlib_main_loop (vlib_main_t * vm) +{ + vlib_node_main_t *nm = &vm->node_main; + uword i; + u64 cpu_time_now; + + /* Initialize pending node vector. */ + vec_resize (nm->pending_frames, 32); + _vec_len (nm->pending_frames) = 0; + + /* Mark time of main loop start. */ + cpu_time_now = vm->clib_time.last_cpu_time; + vm->cpu_time_main_loop_start = cpu_time_now; + + /* Arrange for first level of timing wheel to cover times we care + most about. */ + nm->timing_wheel.min_sched_time = 10e-6; + nm->timing_wheel.max_sched_time = 10e-3; + timing_wheel_init (&nm->timing_wheel, + cpu_time_now, vm->clib_time.clocks_per_second); + + /* Pre-allocate expired nodes. */ + vec_alloc (nm->data_from_advancing_timing_wheel, 32); + vec_alloc (nm->pending_interrupt_node_runtime_indices, 32); + + if (!nm->polling_threshold_vector_length) + nm->polling_threshold_vector_length = 10; + if (!nm->interrupt_threshold_vector_length) + nm->interrupt_threshold_vector_length = 5; + + nm->current_process_index = ~0; + + /* Start all processes. */ + { + uword i; + for (i = 0; i < vec_len (nm->processes); i++) + cpu_time_now = + dispatch_process (vm, nm->processes[i], /* frame */ 0, cpu_time_now); + } + + while (1) + { + vlib_node_runtime_t *n; + + /* Process pre-input nodes. */ + vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_PRE_INPUT]) + cpu_time_now = dispatch_node (vm, n, + VLIB_NODE_TYPE_PRE_INPUT, + VLIB_NODE_STATE_POLLING, + /* frame */ 0, + cpu_time_now); + + /* Next process input nodes. */ + vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]) + cpu_time_now = dispatch_node (vm, n, + VLIB_NODE_TYPE_INPUT, + VLIB_NODE_STATE_POLLING, + /* frame */ 0, + cpu_time_now); + + if (PREDICT_TRUE (vm->queue_signal_pending == 0)) + vm->queue_signal_callback (vm); + + /* Next handle interrupts. */ + { + uword l = _vec_len (nm->pending_interrupt_node_runtime_indices); + uword i; + if (l > 0) + { + _vec_len (nm->pending_interrupt_node_runtime_indices) = 0; + for (i = 0; i < l; i++) + { + n = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT], + nm-> + pending_interrupt_node_runtime_indices + [i]); + cpu_time_now = + dispatch_node (vm, n, VLIB_NODE_TYPE_INPUT, + VLIB_NODE_STATE_INTERRUPT, + /* frame */ 0, + cpu_time_now); + } + } + } + + /* Check if process nodes have expired from timing wheel. */ + nm->data_from_advancing_timing_wheel + = timing_wheel_advance (&nm->timing_wheel, cpu_time_now, + nm->data_from_advancing_timing_wheel, + &nm->cpu_time_next_process_ready); + + ASSERT (nm->data_from_advancing_timing_wheel != 0); + if (PREDICT_FALSE (_vec_len (nm->data_from_advancing_timing_wheel) > 0)) + { + uword i; + + processes_timing_wheel_data: + for (i = 0; i < _vec_len (nm->data_from_advancing_timing_wheel); + i++) + { + u32 d = nm->data_from_advancing_timing_wheel[i]; + u32 di = vlib_timing_wheel_data_get_index (d); + + if (vlib_timing_wheel_data_is_timed_event (d)) + { + vlib_signal_timed_event_data_t *te = + pool_elt_at_index (nm->signal_timed_event_data_pool, di); + vlib_node_t *n = vlib_get_node (vm, te->process_node_index); + vlib_process_t *p = + vec_elt (nm->processes, n->runtime_index); + void *data; + data = + vlib_process_signal_event_helper (nm, n, p, + te->event_type_index, + te->n_data_elts, + te->n_data_elt_bytes); + if (te->n_data_bytes < sizeof (te->inline_event_data)) + clib_memcpy (data, te->inline_event_data, + te->n_data_bytes); + else + { + clib_memcpy (data, te->event_data_as_vector, + te->n_data_bytes); + vec_free (te->event_data_as_vector); + } + pool_put (nm->signal_timed_event_data_pool, te); + } + else + { + cpu_time_now = clib_cpu_time_now (); + cpu_time_now = + dispatch_suspended_process (vm, di, cpu_time_now); + } + } + + /* Reset vector. */ + _vec_len (nm->data_from_advancing_timing_wheel) = 0; + } + + /* Input nodes may have added work to the pending vector. + Process pending vector until there is nothing left. + All pending vectors will be processed from input -> output. */ + for (i = 0; i < _vec_len (nm->pending_frames); i++) + cpu_time_now = dispatch_pending_node (vm, nm->pending_frames + i, + cpu_time_now); + /* Reset pending vector for next iteration. */ + _vec_len (nm->pending_frames) = 0; + + /* Pending internal nodes may resume processes. */ + if (_vec_len (nm->data_from_advancing_timing_wheel) > 0) + goto processes_timing_wheel_data; + + vlib_increment_main_loop_counter (vm); + + /* Record time stamp in case there are no enabled nodes and above + calls do not update time stamp. */ + cpu_time_now = clib_cpu_time_now (); + } +} + +vlib_main_t vlib_global_main; + +static clib_error_t * +vlib_main_configure (vlib_main_t * vm, unformat_input_t * input) +{ + int turn_on_mem_trace = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "memory-trace")) + turn_on_mem_trace = 1; + + else if (unformat (input, "elog-events %d", + &vm->elog_main.event_ring_size)) + ; + else + return unformat_parse_error (input); + } + + unformat_free (input); + + /* Enable memory trace as early as possible. */ + if (turn_on_mem_trace) + clib_mem_trace (1); + + return 0; +} + +VLIB_EARLY_CONFIG_FUNCTION (vlib_main_configure, "vlib"); + +static void +dummy_queue_signal_callback (vlib_main_t * vm) +{ +} + +/* Main function. */ +int +vlib_main (vlib_main_t * volatile vm, unformat_input_t * input) +{ + clib_error_t *volatile error; + + vm->queue_signal_callback = dummy_queue_signal_callback; + + clib_time_init (&vm->clib_time); + + /* Turn on event log. */ + if (!vm->elog_main.event_ring_size) + vm->elog_main.event_ring_size = 128 << 10; + elog_init (&vm->elog_main, vm->elog_main.event_ring_size); + elog_enable_disable (&vm->elog_main, 1); + + /* Default name. */ + if (!vm->name) + vm->name = "VLIB"; + + vec_validate (vm->buffer_main, 0); + + if ((error = vlib_thread_init (vm))) + { + clib_error_report (error); + goto done; + } + + /* Register static nodes so that init functions may use them. */ + vlib_register_all_static_nodes (vm); + + /* Set seed for random number generator. + Allow user to specify seed to make random sequence deterministic. */ + if (!unformat (input, "seed %wd", &vm->random_seed)) + vm->random_seed = clib_cpu_time_now (); + clib_random_buffer_init (&vm->random_buffer, vm->random_seed); + + /* Initialize node graph. */ + if ((error = vlib_node_main_init (vm))) + { + /* Arrange for graph hook up error to not be fatal when debugging. */ + if (CLIB_DEBUG > 0) + clib_error_report (error); + else + goto done; + } + + /* See unix/main.c; most likely already set up */ + if (vm->init_functions_called == 0) + vm->init_functions_called = hash_create (0, /* value bytes */ 0); + if ((error = vlib_call_all_init_functions (vm))) + goto done; + + /* Create default buffer free list. */ + vlib_buffer_get_or_create_free_list (vm, + VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, + "default"); + + switch (clib_setjmp (&vm->main_loop_exit, VLIB_MAIN_LOOP_EXIT_NONE)) + { + case VLIB_MAIN_LOOP_EXIT_NONE: + vm->main_loop_exit_set = 1; + break; + + case VLIB_MAIN_LOOP_EXIT_CLI: + goto done; + + default: + error = vm->main_loop_error; + goto done; + } + + if ((error = vlib_call_all_config_functions (vm, input, 0 /* is_early */ ))) + goto done; + + /* Call all main loop enter functions. */ + { + clib_error_t *sub_error; + sub_error = vlib_call_all_main_loop_enter_functions (vm); + if (sub_error) + clib_error_report (sub_error); + } + + vlib_main_loop (vm); + +done: + /* Call all exit functions. */ + { + clib_error_t *sub_error; + sub_error = vlib_call_all_main_loop_exit_functions (vm); + if (sub_error) + clib_error_report (sub_error); + } + + if (error) + clib_error_report (error); + + return 0; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/main.h b/src/vlib/main.h new file mode 100644 index 00000000..d9ac1445 --- /dev/null +++ b/src/vlib/main.h @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * main.h: VLIB main data structure + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_main_h +#define included_vlib_main_h + +#include +#include +#include +#include +#include +#include + +#include + + +/* By default turn off node/error event logging. + Override with -DVLIB_ELOG_MAIN_LOOP */ +#ifndef VLIB_ELOG_MAIN_LOOP +#define VLIB_ELOG_MAIN_LOOP 0 +#endif + +typedef struct vlib_main_t +{ + /* Instruction level timing state. */ + clib_time_t clib_time; + + /* Time stamp of last node dispatch. */ + u64 cpu_time_last_node_dispatch; + + /* Time stamp when main loop was entered (time 0). */ + u64 cpu_time_main_loop_start; + + /* Incremented once for each main loop. */ + u32 main_loop_count; + + /* Count of vectors processed this main loop. */ + u32 main_loop_vectors_processed; + u32 main_loop_nodes_processed; + + /* Circular buffer of input node vector counts. + Indexed by low bits of + (main_loop_count >> VLIB_LOG2_INPUT_VECTORS_PER_MAIN_LOOP). */ + u32 vector_counts_per_main_loop[2]; + u32 node_counts_per_main_loop[2]; + + /* Every so often we switch to the next counter. */ +#define VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE 7 + + /* Jump target to exit main loop with given code. */ + u32 main_loop_exit_set; + clib_longjmp_t main_loop_exit; +#define VLIB_MAIN_LOOP_EXIT_NONE 0 +#define VLIB_MAIN_LOOP_EXIT_PANIC 1 + /* Exit via CLI. */ +#define VLIB_MAIN_LOOP_EXIT_CLI 2 + + /* Error marker to use when exiting main loop. */ + clib_error_t *main_loop_error; + + /* Name for e.g. syslog. */ + char *name; + + /* Start and size of CLIB heap. */ + void *heap_base; + uword heap_size; + + vlib_buffer_main_t *buffer_main; + + vlib_physmem_main_t physmem_main; + + /* Allocate/free buffer memory for DMA transfers, descriptor rings, etc. + buffer memory is guaranteed to be cache-aligned. */ + void *(*os_physmem_alloc_aligned) (vlib_physmem_main_t * pm, + uword n_bytes, uword alignment); + void (*os_physmem_free) (void *x); + + /* Node graph main structure. */ + vlib_node_main_t node_main; + + /* Command line interface. */ + vlib_cli_main_t cli_main; + + /* Packet trace buffer. */ + vlib_trace_main_t trace_main; + + /* Error handling. */ + vlib_error_main_t error_main; + + /* Punt packets to underlying operating system for when fast switching + code does not know what to do. */ + void (*os_punt_frame) (struct vlib_main_t * vm, + struct vlib_node_runtime_t * node, + vlib_frame_t * frame); + + /* Multicast distribution. Set to zero for MC disabled. */ + mc_main_t *mc_main; + + /* Stream index to use for distribution when MC is enabled. */ + u32 mc_stream_index; + + vlib_one_time_waiting_process_t *procs_waiting_for_mc_stream_join; + + /* Event logger. */ + elog_main_t elog_main; + + /* Node call and return event types. */ + elog_event_type_t *node_call_elog_event_types; + elog_event_type_t *node_return_elog_event_types; + + elog_event_type_t *error_elog_event_types; + + /* Seed for random number generator. */ + uword random_seed; + + /* Buffer of random data for various uses. */ + clib_random_buffer_t random_buffer; + + /* Hash table to record which init functions have been called. */ + uword *init_functions_called; + + /* to compare with node runtime */ + u32 cpu_index; + + void **mbuf_alloc_list; + + /* List of init functions to call, setup by constructors */ + _vlib_init_function_list_elt_t *init_function_registrations; + _vlib_init_function_list_elt_t *main_loop_enter_function_registrations; + _vlib_init_function_list_elt_t *main_loop_exit_function_registrations; + _vlib_init_function_list_elt_t *api_init_function_registrations; + vlib_config_function_runtime_t *config_function_registrations; + mc_serialize_msg_t *mc_msg_registrations; /* mc_main is a pointer... */ + + /* control-plane API queue signal pending, length indication */ + volatile u32 queue_signal_pending; + volatile u32 api_queue_nonempty; + void (*queue_signal_callback) (struct vlib_main_t *); + u8 **argv; +} vlib_main_t; + +/* Global main structure. */ +extern vlib_main_t vlib_global_main; + +always_inline f64 +vlib_time_now (vlib_main_t * vm) +{ + return clib_time_now (&vm->clib_time); +} + +always_inline f64 +vlib_time_now_ticks (vlib_main_t * vm, u64 n) +{ + return clib_time_now_internal (&vm->clib_time, n); +} + +/* Busy wait for specified time. */ +always_inline void +vlib_time_wait (vlib_main_t * vm, f64 wait) +{ + f64 t = vlib_time_now (vm); + f64 limit = t + wait; + while (t < limit) + t = vlib_time_now (vm); +} + +/* Time a piece of code. */ +#define vlib_time_code(vm,body) \ +do { \ + f64 _t[2]; \ + _t[0] = vlib_time_now (vm); \ + do { body; } while (0); \ + _t[1] = vlib_time_now (vm); \ + clib_warning ("%.7e", _t[1] - _t[0]); \ +} while (0) + +#define vlib_wait_with_timeout(vm,suspend_time,timeout_time,test) \ +({ \ + uword __vlib_wait_with_timeout = 0; \ + f64 __vlib_wait_time = 0; \ + while (! (__vlib_wait_with_timeout = (test)) \ + && __vlib_wait_time < (timeout_time)) \ + { \ + vlib_process_suspend (vm, suspend_time); \ + __vlib_wait_time += suspend_time; \ + } \ + __vlib_wait_with_timeout; \ +}) + +always_inline void +vlib_panic_with_error (vlib_main_t * vm, clib_error_t * error) +{ + vm->main_loop_error = error; + clib_longjmp (&vm->main_loop_exit, VLIB_MAIN_LOOP_EXIT_PANIC); +} + +#define vlib_panic_with_msg(vm,args...) \ + vlib_panic_with_error (vm, clib_error_return (0, args)) + +always_inline void +vlib_panic (vlib_main_t * vm) +{ + vlib_panic_with_error (vm, 0); +} + +always_inline u32 +vlib_vector_input_stats_index (vlib_main_t * vm, word delta) +{ + u32 i; + i = vm->main_loop_count >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE; + ASSERT (is_pow2 (ARRAY_LEN (vm->vector_counts_per_main_loop))); + return (i + delta) & (ARRAY_LEN (vm->vector_counts_per_main_loop) - 1); +} + +/* Estimate input rate based on previous + 2^VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE + samples. */ +always_inline u32 +vlib_last_vectors_per_main_loop (vlib_main_t * vm) +{ + u32 i = vlib_vector_input_stats_index (vm, -1); + u32 n = vm->vector_counts_per_main_loop[i]; + return n >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE; +} + +/* Total ave vector count per iteration of main loop. */ +always_inline f64 +vlib_last_vectors_per_main_loop_as_f64 (vlib_main_t * vm) +{ + u32 i = vlib_vector_input_stats_index (vm, -1); + u32 v = vm->vector_counts_per_main_loop[i]; + return (f64) v / (f64) (1 << VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE); +} + +/* Total ave vectors/node count per iteration of main loop. */ +always_inline f64 +vlib_last_vector_length_per_node (vlib_main_t * vm) +{ + u32 i = vlib_vector_input_stats_index (vm, -1); + u32 v = vm->vector_counts_per_main_loop[i]; + u32 n = vm->node_counts_per_main_loop[i]; + return n == 0 ? 0 : (f64) v / (f64) n; +} + +extern u32 wraps; + +always_inline void +vlib_increment_main_loop_counter (vlib_main_t * vm) +{ + u32 i, c, n, v, is_wrap; + + c = vm->main_loop_count++; + + is_wrap = (c & pow2_mask (VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE)) == 0; + + if (is_wrap) + wraps++; + + i = vlib_vector_input_stats_index (vm, /* delta */ is_wrap); + + v = is_wrap ? 0 : vm->vector_counts_per_main_loop[i]; + n = is_wrap ? 0 : vm->node_counts_per_main_loop[i]; + + v += vm->main_loop_vectors_processed; + n += vm->main_loop_nodes_processed; + vm->main_loop_vectors_processed = 0; + vm->main_loop_nodes_processed = 0; + vm->vector_counts_per_main_loop[i] = v; + vm->node_counts_per_main_loop[i] = n; +} + +always_inline void vlib_set_queue_signal_callback + (vlib_main_t * vm, void (*fp) (vlib_main_t *)) +{ + vm->queue_signal_callback = fp; +} + +/* Main routine. */ +int vlib_main (vlib_main_t * vm, unformat_input_t * input); + +/* Thread stacks, for os_get_cpu_number */ +extern u8 **vlib_thread_stacks; + +/* Number of thread stacks that the application needs */ +u32 vlib_app_num_thread_stacks_needed (void) __attribute__ ((weak)); + +extern void vlib_node_sync_stats (vlib_main_t * vm, vlib_node_t * n); + +#endif /* included_vlib_main_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/mc.c b/src/vlib/mc.c new file mode 100644 index 00000000..8fde0913 --- /dev/null +++ b/src/vlib/mc.c @@ -0,0 +1,2609 @@ +/* + * mc.c: vlib reliable sequenced multicast distributed applications + * + * Copyright (c) 2010 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +/* + * 1 to enable msg id training wheels, which are useful for tracking + * down catchup and/or partitioned network problems + */ +#define MSG_ID_DEBUG 0 + +static format_function_t format_mc_stream_state; + +static u32 +elog_id_for_peer_id (mc_main_t * m, u64 peer_id) +{ + uword *p, r; + mhash_t *h = &m->elog_id_by_peer_id; + + if (!m->elog_id_by_peer_id.hash) + mhash_init (h, sizeof (uword), sizeof (mc_peer_id_t)); + + p = mhash_get (h, &peer_id); + if (p) + return p[0]; + r = elog_string (m->elog_main, "%U", m->transport.format_peer_id, peer_id); + mhash_set (h, &peer_id, r, /* old_value */ 0); + return r; +} + +static u32 +elog_id_for_msg_name (mc_main_t * m, char *msg_name) +{ + uword *p, r; + uword *h = m->elog_id_by_msg_name; + u8 *name_copy; + + if (!h) + h = m->elog_id_by_msg_name = hash_create_string (0, sizeof (uword)); + + p = hash_get_mem (h, msg_name); + if (p) + return p[0]; + r = elog_string (m->elog_main, "%s", msg_name); + + name_copy = format (0, "%s%c", msg_name, 0); + + hash_set_mem (h, name_copy, r); + m->elog_id_by_msg_name = h; + + return r; +} + +static void +elog_tx_msg (mc_main_t * m, u32 stream_id, u32 local_sequence, + u32 retry_count) +{ + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "tx-msg: stream %d local seq %d attempt %d", + .format_args = "i4i4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 stream_id, local_sequence, retry_count; + } *ed; + ed = ELOG_DATA (m->elog_main, e); + ed->stream_id = stream_id; + ed->local_sequence = local_sequence; + ed->retry_count = retry_count; + } +} + +/* + * seq_cmp + * correctly compare two unsigned sequence numbers. + * This function works so long as x and y are within 2**(n-1) of each + * other, where n = bits(x, y). + * + * Magic decoder ring: + * seq_cmp == 0 => x and y are equal + * seq_cmp < 0 => x is "in the past" with respect to y + * seq_cmp > 0 => x is "in the future" with respect to y + */ +always_inline i32 +mc_seq_cmp (u32 x, u32 y) +{ + return (i32) x - (i32) y; +} + +void * +mc_get_vlib_buffer (vlib_main_t * vm, u32 n_bytes, u32 * bi_return) +{ + u32 n_alloc, bi; + vlib_buffer_t *b; + + n_alloc = vlib_buffer_alloc (vm, &bi, 1); + ASSERT (n_alloc == 1); + + b = vlib_get_buffer (vm, bi); + b->current_length = n_bytes; + *bi_return = bi; + return (void *) b->data; +} + +static void +delete_peer_with_index (mc_main_t * mcm, mc_stream_t * s, + uword index, int notify_application) +{ + mc_stream_peer_t *p = pool_elt_at_index (s->peers, index); + ASSERT (p != 0); + if (s->config.peer_died && notify_application) + s->config.peer_died (mcm, s, p->id); + + s->all_peer_bitmap = clib_bitmap_andnoti (s->all_peer_bitmap, p - s->peers); + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "delete peer %s from all_peer_bitmap", + .format_args = "T4", + }; + /* *INDENT-ON* */ + struct + { + u32 peer; + } *ed = 0; + + ed = ELOG_DATA (mcm->elog_main, e); + ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64); + } + /* Do not delete the pool / hash table entries, or we lose sequence number state */ +} + +static mc_stream_peer_t * +get_or_create_peer_with_id (mc_main_t * mcm, + mc_stream_t * s, mc_peer_id_t id, int *created) +{ + uword *q = mhash_get (&s->peer_index_by_id, &id); + mc_stream_peer_t *p; + + if (q) + { + p = pool_elt_at_index (s->peers, q[0]); + goto done; + } + + pool_get (s->peers, p); + memset (p, 0, sizeof (p[0])); + p->id = id; + p->last_sequence_received = ~0; + mhash_set (&s->peer_index_by_id, &id, p - s->peers, /* old_value */ 0); + if (created) + *created = 1; + +done: + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "get_or_create %s peer %s stream %d seq %d", + .format_args = "t4T4i4i4", + .n_enum_strings = 2, + .enum_strings = { + "old", "new", + }, + }; + /* *INDENT-ON* */ + struct + { + u32 is_new, peer, stream_index, rx_sequence; + } *ed = 0; + + ed = ELOG_DATA (mcm->elog_main, e); + ed->is_new = q ? 0 : 1; + ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64); + ed->stream_index = s->index; + ed->rx_sequence = p->last_sequence_received; + } + /* $$$$ Enable or reenable this peer */ + s->all_peer_bitmap = clib_bitmap_ori (s->all_peer_bitmap, p - s->peers); + return p; +} + +static void +maybe_send_window_open_event (vlib_main_t * vm, mc_stream_t * stream) +{ + vlib_one_time_waiting_process_t *p; + + if (pool_elts (stream->retry_pool) >= stream->config.window_size) + return; + + vec_foreach (p, stream->procs_waiting_for_open_window) + vlib_signal_one_time_waiting_process (vm, p); + + if (stream->procs_waiting_for_open_window) + _vec_len (stream->procs_waiting_for_open_window) = 0; +} + +static void +mc_retry_free (mc_main_t * mcm, mc_stream_t * s, mc_retry_t * r) +{ + mc_retry_t record, *retp; + + if (r->unacked_by_peer_bitmap) + _vec_len (r->unacked_by_peer_bitmap) = 0; + + if (clib_fifo_elts (s->retired_fifo) >= 2 * s->config.window_size) + { + clib_fifo_sub1 (s->retired_fifo, record); + vlib_buffer_free_one (mcm->vlib_main, record.buffer_index); + } + + clib_fifo_add2 (s->retired_fifo, retp); + + retp->buffer_index = r->buffer_index; + retp->local_sequence = r->local_sequence; + + r->buffer_index = ~0; /* poison buffer index in this retry */ +} + +static void +mc_resend_retired (mc_main_t * mcm, mc_stream_t * s, u32 local_sequence) +{ + mc_retry_t *retry; + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "resend-retired: search for local seq %d", + .format_args = "i4", + }; + /* *INDENT-ON* */ + struct + { + u32 local_sequence; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->local_sequence = local_sequence; + } + + /* *INDENT-OFF* */ + clib_fifo_foreach (retry, s->retired_fifo, + ({ + if (retry->local_sequence == local_sequence) + { + elog_tx_msg (mcm, s->index, retry-> local_sequence, -13); + mcm->transport.tx_buffer (mcm->transport.opaque, + MC_TRANSPORT_USER_REQUEST_TO_RELAY, + retry->buffer_index); + return; + } + })); + /* *INDENT-ON* */ + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "resend-retired: FAILED search for local seq %d", + .format_args = "i4", + }; + /* *INDENT-ON* */ + struct + { + u32 local_sequence; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->local_sequence = local_sequence; + } +} + +static uword * +delete_retry_fifo_elt (mc_main_t * mcm, + mc_stream_t * stream, + mc_retry_t * r, uword * dead_peer_bitmap) +{ + mc_stream_peer_t *p; + + /* *INDENT-OFF* */ + pool_foreach (p, stream->peers, ({ + uword pi = p - stream->peers; + uword is_alive = 0 == clib_bitmap_get (r->unacked_by_peer_bitmap, pi); + + if (! is_alive) + dead_peer_bitmap = clib_bitmap_ori (dead_peer_bitmap, pi); + + if (MC_EVENT_LOGGING > 0) + { + ELOG_TYPE_DECLARE (e) = { + .format = "delete_retry_fifo_elt: peer %s is %s", + .format_args = "T4t4", + .n_enum_strings = 2, + .enum_strings = { "alive", "dead", }, + }; + struct { u32 peer, is_alive; } * ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64); + ed->is_alive = is_alive; + } + })); + /* *INDENT-ON* */ + + hash_unset (stream->retry_index_by_local_sequence, r->local_sequence); + mc_retry_free (mcm, stream, r); + + return dead_peer_bitmap; +} + +always_inline mc_retry_t * +prev_retry (mc_stream_t * s, mc_retry_t * r) +{ + return (r->prev_index != ~0 + ? pool_elt_at_index (s->retry_pool, r->prev_index) : 0); +} + +always_inline mc_retry_t * +next_retry (mc_stream_t * s, mc_retry_t * r) +{ + return (r->next_index != ~0 + ? pool_elt_at_index (s->retry_pool, r->next_index) : 0); +} + +always_inline void +remove_retry_from_pool (mc_stream_t * s, mc_retry_t * r) +{ + mc_retry_t *p = prev_retry (s, r); + mc_retry_t *n = next_retry (s, r); + + if (p) + p->next_index = r->next_index; + else + s->retry_head_index = r->next_index; + if (n) + n->prev_index = r->prev_index; + else + s->retry_tail_index = r->prev_index; + + pool_put_index (s->retry_pool, r - s->retry_pool); +} + +static void +check_retry (mc_main_t * mcm, mc_stream_t * s) +{ + mc_retry_t *r; + vlib_main_t *vm = mcm->vlib_main; + f64 now = vlib_time_now (vm); + uword *dead_peer_bitmap = 0; + u32 ri, ri_next; + + for (ri = s->retry_head_index; ri != ~0; ri = ri_next) + { + r = pool_elt_at_index (s->retry_pool, ri); + ri_next = r->next_index; + + if (now < r->sent_at + s->config.retry_interval) + continue; + + r->n_retries += 1; + if (r->n_retries > s->config.retry_limit) + { + dead_peer_bitmap = + delete_retry_fifo_elt (mcm, s, r, dead_peer_bitmap); + remove_retry_from_pool (s, r); + } + else + { + if (MC_EVENT_LOGGING > 0) + { + mc_stream_peer_t *p; + + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "resend local seq %d attempt %d", + .format_args = "i4i4", + }; + /* *INDENT-ON* */ + + /* *INDENT-OFF* */ + pool_foreach (p, s->peers, ({ + if (clib_bitmap_get (r->unacked_by_peer_bitmap, p - s->peers)) + { + ELOG_TYPE_DECLARE (ev) = { + .format = "resend: needed by peer %s local seq %d", + .format_args = "T4i4", + }; + struct { u32 peer, rx_sequence; } * ed; + ed = ELOG_DATA (mcm->elog_main, ev); + ed->peer = elog_id_for_peer_id (mcm, p->id.as_u64); + ed->rx_sequence = r->local_sequence; + } + })); + /* *INDENT-ON* */ + + struct + { + u32 sequence; + u32 trail; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->sequence = r->local_sequence; + ed->trail = r->n_retries; + } + + r->sent_at = vlib_time_now (vm); + s->stats.n_retries += 1; + + elog_tx_msg (mcm, s->index, r->local_sequence, r->n_retries); + + mcm->transport.tx_buffer + (mcm->transport.opaque, + MC_TRANSPORT_USER_REQUEST_TO_RELAY, r->buffer_index); + } + } + + maybe_send_window_open_event (mcm->vlib_main, s); + + /* Delete any dead peers we've found. */ + if (!clib_bitmap_is_zero (dead_peer_bitmap)) + { + uword i; + + /* *INDENT-OFF* */ + clib_bitmap_foreach (i, dead_peer_bitmap, ({ + delete_peer_with_index (mcm, s, i, /* notify_application */ 1); + + /* Delete any references to just deleted peer in retry pool. */ + pool_foreach (r, s->retry_pool, ({ + r->unacked_by_peer_bitmap = + clib_bitmap_andnoti (r->unacked_by_peer_bitmap, i); + })); + })); +/* *INDENT-ON* */ + clib_bitmap_free (dead_peer_bitmap); + } +} + +always_inline mc_main_t * +mc_node_get_main (vlib_node_runtime_t * node) +{ + mc_main_t **p = (void *) node->runtime_data; + return p[0]; +} + +static uword +mc_retry_process (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * f) +{ + mc_main_t *mcm = mc_node_get_main (node); + mc_stream_t *s; + + while (1) + { + vlib_process_suspend (vm, 1.0); + vec_foreach (s, mcm->stream_vector) + { + if (s->state != MC_STREAM_STATE_invalid) + check_retry (mcm, s); + } + } + return 0; /* not likely */ +} + +static void +send_join_or_leave_request (mc_main_t * mcm, u32 stream_index, u32 is_join) +{ + vlib_main_t *vm = mcm->vlib_main; + mc_msg_join_or_leave_request_t *mp; + u32 bi; + + mp = mc_get_vlib_buffer (vm, sizeof (mp[0]), &bi); + memset (mp, 0, sizeof (*mp)); + mp->type = MC_MSG_TYPE_join_or_leave_request; + mp->peer_id = mcm->transport.our_ack_peer_id; + mp->stream_index = stream_index; + mp->is_join = is_join; + + mc_byte_swap_msg_join_or_leave_request (mp); + + /* + * These msgs are unnumbered, unordered so send on the from-relay + * channel. + */ + mcm->transport.tx_buffer (mcm->transport.opaque, MC_TRANSPORT_JOIN, bi); +} + +static uword +mc_join_ager_process (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * f) +{ + mc_main_t *mcm = mc_node_get_main (node); + + while (1) + { + if (mcm->joins_in_progress) + { + mc_stream_t *s; + vlib_one_time_waiting_process_t *p; + f64 now = vlib_time_now (vm); + + vec_foreach (s, mcm->stream_vector) + { + if (s->state != MC_STREAM_STATE_join_in_progress) + continue; + + if (now > s->join_timeout) + { + s->state = MC_STREAM_STATE_ready; + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "stream %d join timeout", + }; + /* *INDENT-ON* */ + ELOG (mcm->elog_main, e, s->index); + } + /* Make sure that this app instance exists as a stream peer, + or we may answer a catchup request with a NULL + all_peer_bitmap... */ + (void) get_or_create_peer_with_id + (mcm, s, mcm->transport.our_ack_peer_id, /* created */ 0); + + vec_foreach (p, s->procs_waiting_for_join_done) + vlib_signal_one_time_waiting_process (vm, p); + if (s->procs_waiting_for_join_done) + _vec_len (s->procs_waiting_for_join_done) = 0; + + mcm->joins_in_progress--; + ASSERT (mcm->joins_in_progress >= 0); + } + else + { + /* Resent join request which may have been lost. */ + send_join_or_leave_request (mcm, s->index, 1 /* is_join */ ); + + /* We're *not* alone, retry for as long as it takes */ + if (mcm->relay_state == MC_RELAY_STATE_SLAVE) + s->join_timeout = vlib_time_now (vm) + 2.0; + + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "stream %d resend join request", + }; + /* *INDENT-ON* */ + ELOG (mcm->elog_main, e, s->index); + } + } + } + } + + vlib_process_suspend (vm, .5); + } + + return 0; /* not likely */ +} + +static void +serialize_mc_register_stream_name (serialize_main_t * m, va_list * va) +{ + char *name = va_arg (*va, char *); + serialize_cstring (m, name); +} + +static void +elog_stream_name (char *buf, int n_buf_bytes, char *v) +{ + clib_memcpy (buf, v, clib_min (n_buf_bytes - 1, vec_len (v))); + buf[n_buf_bytes - 1] = 0; +} + +static void +unserialize_mc_register_stream_name (serialize_main_t * m, va_list * va) +{ + mc_main_t *mcm = va_arg (*va, mc_main_t *); + char *name; + mc_stream_t *s; + uword *p; + + unserialize_cstring (m, &name); + + if ((p = hash_get_mem (mcm->stream_index_by_name, name))) + { + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "stream index %d already named %s", + .format_args = "i4s16", + }; + /* *INDENT-ON* */ + struct + { + u32 stream_index; + char name[16]; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->stream_index = p[0]; + elog_stream_name (ed->name, sizeof (ed->name), name); + } + + vec_free (name); + return; + } + + vec_add2 (mcm->stream_vector, s, 1); + mc_stream_init (s); + s->state = MC_STREAM_STATE_name_known; + s->index = s - mcm->stream_vector; + s->config.name = name; + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "stream index %d named %s", + .format_args = "i4s16", + }; + /* *INDENT-ON* */ + struct + { + u32 stream_index; + char name[16]; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->stream_index = s->index; + elog_stream_name (ed->name, sizeof (ed->name), name); + } + + hash_set_mem (mcm->stream_index_by_name, name, s->index); + + p = hash_get (mcm->procs_waiting_for_stream_name_by_name, name); + if (p) + { + vlib_one_time_waiting_process_t *wp, **w; + w = pool_elt_at_index (mcm->procs_waiting_for_stream_name_pool, p[0]); + vec_foreach (wp, w[0]) + vlib_signal_one_time_waiting_process (mcm->vlib_main, wp); + pool_put (mcm->procs_waiting_for_stream_name_pool, w); + hash_unset_mem (mcm->procs_waiting_for_stream_name_by_name, name); + } +} + +/* *INDENT-OFF* */ +MC_SERIALIZE_MSG (mc_register_stream_name_msg, static) = +{ + .name = "mc_register_stream_name", + .serialize = serialize_mc_register_stream_name, + .unserialize = unserialize_mc_register_stream_name, +}; +/* *INDENT-ON* */ + +void +mc_rx_buffer_unserialize (mc_main_t * mcm, + mc_stream_t * stream, + mc_peer_id_t peer_id, u32 buffer_index) +{ + return mc_unserialize (mcm, stream, buffer_index); +} + +static u8 * +mc_internal_catchup_snapshot (mc_main_t * mcm, + u8 * data_vector, + u32 last_global_sequence_processed) +{ + serialize_main_t m; + + /* Append serialized data to data vector. */ + serialize_open_vector (&m, data_vector); + m.stream.current_buffer_index = vec_len (data_vector); + + serialize (&m, serialize_mc_main, mcm); + return serialize_close_vector (&m); +} + +static void +mc_internal_catchup (mc_main_t * mcm, u8 * data, u32 n_data_bytes) +{ + serialize_main_t s; + + unserialize_open_data (&s, data, n_data_bytes); + + unserialize (&s, unserialize_mc_main, mcm); +} + +/* Overridden from the application layer, not actually used here */ +void mc_stream_join_process_hold (void) __attribute__ ((weak)); +void +mc_stream_join_process_hold (void) +{ +} + +static u32 +mc_stream_join_helper (mc_main_t * mcm, + mc_stream_config_t * config, u32 is_internal) +{ + mc_stream_t *s; + vlib_main_t *vm = mcm->vlib_main; + + s = 0; + if (!is_internal) + { + uword *p; + + /* Already have a stream with given name? */ + if ((s = mc_stream_by_name (mcm, config->name))) + { + /* Already joined and ready? */ + if (s->state == MC_STREAM_STATE_ready) + return s->index; + } + + /* First join MC internal stream. */ + if (!mcm->stream_vector + || (mcm->stream_vector[MC_STREAM_INDEX_INTERNAL].state + == MC_STREAM_STATE_invalid)) + { + static mc_stream_config_t c = { + .name = "mc-internal", + .rx_buffer = mc_rx_buffer_unserialize, + .catchup = mc_internal_catchup, + .catchup_snapshot = mc_internal_catchup_snapshot, + }; + + c.save_snapshot = config->save_snapshot; + + mc_stream_join_helper (mcm, &c, /* is_internal */ 1); + } + + /* If stream is still unknown register this name and wait for + sequenced message to name stream. This way all peers agree + on stream name to index mappings. */ + s = mc_stream_by_name (mcm, config->name); + if (!s) + { + vlib_one_time_waiting_process_t *wp, **w; + u8 *name_copy = format (0, "%s", config->name); + + mc_serialize_stream (mcm, + MC_STREAM_INDEX_INTERNAL, + &mc_register_stream_name_msg, config->name); + + /* Wait for this stream to be named. */ + p = + hash_get_mem (mcm->procs_waiting_for_stream_name_by_name, + name_copy); + if (p) + w = + pool_elt_at_index (mcm->procs_waiting_for_stream_name_pool, + p[0]); + else + { + pool_get (mcm->procs_waiting_for_stream_name_pool, w); + if (!mcm->procs_waiting_for_stream_name_by_name) + mcm->procs_waiting_for_stream_name_by_name = hash_create_string ( /* elts */ 0, /* value size */ + sizeof + (uword)); + hash_set_mem (mcm->procs_waiting_for_stream_name_by_name, + name_copy, + w - mcm->procs_waiting_for_stream_name_pool); + w[0] = 0; + } + + vec_add2 (w[0], wp, 1); + vlib_current_process_wait_for_one_time_event (vm, wp); + vec_free (name_copy); + } + + /* Name should be known now. */ + s = mc_stream_by_name (mcm, config->name); + ASSERT (s != 0); + ASSERT (s->state == MC_STREAM_STATE_name_known); + } + + if (!s) + { + vec_add2 (mcm->stream_vector, s, 1); + mc_stream_init (s); + s->index = s - mcm->stream_vector; + } + + { + /* Save name since we could have already used it as hash key. */ + char *name_save = s->config.name; + + s->config = config[0]; + + if (name_save) + s->config.name = name_save; + } + + if (s->config.window_size == 0) + s->config.window_size = 8; + + if (s->config.retry_interval == 0.0) + s->config.retry_interval = 1.0; + + /* Sanity. */ + ASSERT (s->config.retry_interval < 30); + + if (s->config.retry_limit == 0) + s->config.retry_limit = 7; + + s->state = MC_STREAM_STATE_join_in_progress; + if (!s->peer_index_by_id.hash) + mhash_init (&s->peer_index_by_id, sizeof (uword), sizeof (mc_peer_id_t)); + + /* If we don't hear from someone in 5 seconds, we're alone */ + s->join_timeout = vlib_time_now (vm) + 5.0; + mcm->joins_in_progress++; + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "stream index %d join request %s", + .format_args = "i4s16", + }; + /* *INDENT-ON* */ + struct + { + u32 stream_index; + char name[16]; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->stream_index = s->index; + elog_stream_name (ed->name, sizeof (ed->name), s->config.name); + } + + send_join_or_leave_request (mcm, s->index, 1 /* join */ ); + + vlib_current_process_wait_for_one_time_event_vector + (vm, &s->procs_waiting_for_join_done); + + if (MC_EVENT_LOGGING) + { + ELOG_TYPE (e, "join complete stream %d"); + ELOG (mcm->elog_main, e, s->index); + } + + return s->index; +} + +u32 +mc_stream_join (mc_main_t * mcm, mc_stream_config_t * config) +{ + return mc_stream_join_helper (mcm, config, /* is_internal */ 0); +} + +void +mc_stream_leave (mc_main_t * mcm, u32 stream_index) +{ + mc_stream_t *s = mc_stream_by_index (mcm, stream_index); + + if (!s) + return; + + if (MC_EVENT_LOGGING) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "leave-stream: %d",.format_args = "i4", + }; + /* *INDENT-ON* */ + struct + { + u32 index; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->index = stream_index; + } + + send_join_or_leave_request (mcm, stream_index, 0 /* is_join */ ); + mc_stream_free (s); + s->state = MC_STREAM_STATE_name_known; +} + +void +mc_msg_join_or_leave_request_handler (mc_main_t * mcm, + mc_msg_join_or_leave_request_t * req, + u32 buffer_index) +{ + mc_stream_t *s; + mc_msg_join_reply_t *rep; + u32 bi; + + mc_byte_swap_msg_join_or_leave_request (req); + + s = mc_stream_by_index (mcm, req->stream_index); + if (!s || s->state != MC_STREAM_STATE_ready) + return; + + /* If the peer is joining, create it */ + if (req->is_join) + { + mc_stream_t *this_s; + + /* We're not in a position to catch up a peer until all + stream joins are complete. */ + if (0) + { + /* XXX This is hard to test so we've. */ + vec_foreach (this_s, mcm->stream_vector) + { + if (this_s->state != MC_STREAM_STATE_ready + && this_s->state != MC_STREAM_STATE_name_known) + return; + } + } + else if (mcm->joins_in_progress > 0) + return; + + (void) get_or_create_peer_with_id (mcm, s, req->peer_id, + /* created */ 0); + + rep = mc_get_vlib_buffer (mcm->vlib_main, sizeof (rep[0]), &bi); + memset (rep, 0, sizeof (rep[0])); + rep->type = MC_MSG_TYPE_join_reply; + rep->stream_index = req->stream_index; + + mc_byte_swap_msg_join_reply (rep); + /* These two are already in network byte order... */ + rep->peer_id = mcm->transport.our_ack_peer_id; + rep->catchup_peer_id = mcm->transport.our_catchup_peer_id; + + mcm->transport.tx_buffer (mcm->transport.opaque, MC_TRANSPORT_JOIN, bi); + } + else + { + if (s->config.peer_died) + s->config.peer_died (mcm, s, req->peer_id); + } +} + +void +mc_msg_join_reply_handler (mc_main_t * mcm, + mc_msg_join_reply_t * mp, u32 buffer_index) +{ + mc_stream_t *s; + + mc_byte_swap_msg_join_reply (mp); + + s = mc_stream_by_index (mcm, mp->stream_index); + + if (!s || s->state != MC_STREAM_STATE_join_in_progress) + return; + + /* Switch to catchup state; next join reply + for this stream will be ignored. */ + s->state = MC_STREAM_STATE_catchup; + + mcm->joins_in_progress--; + mcm->transport.catchup_request_fun (mcm->transport.opaque, + mp->stream_index, mp->catchup_peer_id); +} + +void +mc_wait_for_stream_ready (mc_main_t * m, char *stream_name) +{ + mc_stream_t *s; + + while (1) + { + s = mc_stream_by_name (m, stream_name); + if (s) + break; + vlib_process_suspend (m->vlib_main, .1); + } + + /* It's OK to send a message in catchup and ready states. */ + if (s->state == MC_STREAM_STATE_catchup + || s->state == MC_STREAM_STATE_ready) + return; + + /* Otherwise we are waiting for a join to finish. */ + vlib_current_process_wait_for_one_time_event_vector + (m->vlib_main, &s->procs_waiting_for_join_done); +} + +u32 +mc_stream_send (mc_main_t * mcm, u32 stream_index, u32 buffer_index) +{ + mc_stream_t *s = mc_stream_by_index (mcm, stream_index); + vlib_main_t *vm = mcm->vlib_main; + mc_retry_t *r; + mc_msg_user_request_t *mp; + vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index); + u32 ri; + + if (!s) + return 0; + + if (s->state != MC_STREAM_STATE_ready) + vlib_current_process_wait_for_one_time_event_vector + (vm, &s->procs_waiting_for_join_done); + + while (pool_elts (s->retry_pool) >= s->config.window_size) + { + vlib_current_process_wait_for_one_time_event_vector + (vm, &s->procs_waiting_for_open_window); + } + + pool_get (s->retry_pool, r); + ri = r - s->retry_pool; + + r->prev_index = s->retry_tail_index; + r->next_index = ~0; + s->retry_tail_index = ri; + + if (r->prev_index == ~0) + s->retry_head_index = ri; + else + { + mc_retry_t *p = pool_elt_at_index (s->retry_pool, r->prev_index); + p->next_index = ri; + } + + vlib_buffer_advance (b, -sizeof (mp[0])); + mp = vlib_buffer_get_current (b); + + mp->peer_id = mcm->transport.our_ack_peer_id; + /* mp->transport.global_sequence set by relay agent. */ + mp->global_sequence = 0xdeadbeef; + mp->stream_index = s->index; + mp->local_sequence = s->our_local_sequence++; + mp->n_data_bytes = + vlib_buffer_index_length_in_chain (vm, buffer_index) - sizeof (mp[0]); + + r->buffer_index = buffer_index; + r->local_sequence = mp->local_sequence; + r->sent_at = vlib_time_now (vm); + r->n_retries = 0; + + /* Retry will be freed when all currently known peers have acked. */ + vec_validate (r->unacked_by_peer_bitmap, vec_len (s->all_peer_bitmap) - 1); + vec_copy (r->unacked_by_peer_bitmap, s->all_peer_bitmap); + + hash_set (s->retry_index_by_local_sequence, r->local_sequence, + r - s->retry_pool); + + elog_tx_msg (mcm, s->index, mp->local_sequence, r->n_retries); + + mc_byte_swap_msg_user_request (mp); + + mcm->transport.tx_buffer (mcm->transport.opaque, + MC_TRANSPORT_USER_REQUEST_TO_RELAY, buffer_index); + + s->user_requests_sent++; + + /* return amount of window remaining */ + return s->config.window_size - pool_elts (s->retry_pool); +} + +void +mc_msg_user_request_handler (mc_main_t * mcm, mc_msg_user_request_t * mp, + u32 buffer_index) +{ + vlib_main_t *vm = mcm->vlib_main; + mc_stream_t *s; + mc_stream_peer_t *peer; + i32 seq_cmp_result; + static int once = 0; + + mc_byte_swap_msg_user_request (mp); + + s = mc_stream_by_index (mcm, mp->stream_index); + + /* Not signed up for this stream? Turf-o-matic */ + if (!s || s->state != MC_STREAM_STATE_ready) + { + vlib_buffer_free_one (vm, buffer_index); + return; + } + + /* Find peer, including ourselves. */ + peer = get_or_create_peer_with_id (mcm, s, mp->peer_id, + /* created */ 0); + + seq_cmp_result = mc_seq_cmp (mp->local_sequence, + peer->last_sequence_received + 1); + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "rx-msg: peer %s stream %d rx seq %d seq_cmp %d", + .format_args = "T4i4i4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 peer, stream_index, rx_sequence; + i32 seq_cmp_result; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64); + ed->stream_index = mp->stream_index; + ed->rx_sequence = mp->local_sequence; + ed->seq_cmp_result = seq_cmp_result; + } + + if (0 && mp->stream_index == 1 && once == 0) + { + once = 1; + ELOG_TYPE (e, "FAKE lost msg on stream 1"); + ELOG (mcm->elog_main, e, 0); + return; + } + + peer->last_sequence_received += seq_cmp_result == 0; + s->user_requests_received++; + + if (seq_cmp_result > 0) + peer->stats.n_msgs_from_future += 1; + + /* Send ack even if msg from future */ + if (1) + { + mc_msg_user_ack_t *rp; + u32 bi; + + rp = mc_get_vlib_buffer (vm, sizeof (rp[0]), &bi); + rp->peer_id = mcm->transport.our_ack_peer_id; + rp->stream_index = s->index; + rp->local_sequence = mp->local_sequence; + rp->seq_cmp_result = seq_cmp_result; + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "tx-ack: stream %d local seq %d", + .format_args = "i4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 stream_index; + u32 local_sequence; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->stream_index = rp->stream_index; + ed->local_sequence = rp->local_sequence; + } + + mc_byte_swap_msg_user_ack (rp); + + mcm->transport.tx_ack (mcm->transport.opaque, mp->peer_id, bi); + /* Msg from past? If so, free the buffer... */ + if (seq_cmp_result < 0) + { + vlib_buffer_free_one (vm, buffer_index); + peer->stats.n_msgs_from_past += 1; + } + } + + if (seq_cmp_result == 0) + { + vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index); + switch (s->state) + { + case MC_STREAM_STATE_ready: + vlib_buffer_advance (b, sizeof (mp[0])); + s->config.rx_buffer (mcm, s, mp->peer_id, buffer_index); + + /* Stream vector can change address via rx callback for mc-internal + stream. */ + s = mc_stream_by_index (mcm, mp->stream_index); + ASSERT (s != 0); + s->last_global_sequence_processed = mp->global_sequence; + break; + + case MC_STREAM_STATE_catchup: + clib_fifo_add1 (s->catchup_fifo, buffer_index); + break; + + default: + clib_warning ("stream in unknown state %U", + format_mc_stream_state, s->state); + break; + } + } +} + +void +mc_msg_user_ack_handler (mc_main_t * mcm, mc_msg_user_ack_t * mp, + u32 buffer_index) +{ + vlib_main_t *vm = mcm->vlib_main; + uword *p; + mc_stream_t *s; + mc_stream_peer_t *peer; + mc_retry_t *r; + int peer_created = 0; + + mc_byte_swap_msg_user_ack (mp); + + s = mc_stream_by_index (mcm, mp->stream_index); + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "rx-ack: local seq %d peer %s seq_cmp_result %d", + .format_args = "i4T4i4", + }; + /* *INDENT-ON* */ + + struct + { + u32 local_sequence; + u32 peer; + i32 seq_cmp_result; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->local_sequence = mp->local_sequence; + ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64); + ed->seq_cmp_result = mp->seq_cmp_result; + } + + /* Unknown stream? */ + if (!s) + return; + + /* Find the peer which just ack'ed. */ + peer = get_or_create_peer_with_id (mcm, s, mp->peer_id, + /* created */ &peer_created); + + /* + * Peer reports message from the future. If it's not in the retry + * fifo, look for a retired message. + */ + if (mp->seq_cmp_result > 0) + { + p = hash_get (s->retry_index_by_local_sequence, mp->local_sequence - + mp->seq_cmp_result); + if (p == 0) + mc_resend_retired (mcm, s, mp->local_sequence - mp->seq_cmp_result); + + /* Normal retry should fix it... */ + return; + } + + /* + * Pointer to the indicated retry fifo entry. + * Worth hashing because we could use a window size of 100 or 1000. + */ + p = hash_get (s->retry_index_by_local_sequence, mp->local_sequence); + + /* + * Is this a duplicate ACK, received after we've retired the + * fifo entry. This can happen when learning about new + * peers. + */ + if (p == 0) + { + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "ack: for seq %d from peer %s no fifo elt", + .format_args = "i4T4", + }; + /* *INDENT-ON* */ + + struct + { + u32 seq; + u32 peer; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->seq = mp->local_sequence; + ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64); + } + + return; + } + + r = pool_elt_at_index (s->retry_pool, p[0]); + + /* Make sure that this new peer ACKs our msgs from now on */ + if (peer_created) + { + mc_retry_t *later_retry = next_retry (s, r); + + while (later_retry) + { + later_retry->unacked_by_peer_bitmap = + clib_bitmap_ori (later_retry->unacked_by_peer_bitmap, + peer - s->peers); + later_retry = next_retry (s, later_retry); + } + } + + ASSERT (mp->local_sequence == r->local_sequence); + + /* If we weren't expecting to hear from this peer */ + if (!peer_created && + !clib_bitmap_get (r->unacked_by_peer_bitmap, peer - s->peers)) + { + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "dup-ack: for seq %d from peer %s", + .format_args = "i4T4", + }; + /* *INDENT-ON* */ + struct + { + u32 seq; + u32 peer; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->seq = r->local_sequence; + ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64); + } + if (!clib_bitmap_is_zero (r->unacked_by_peer_bitmap)) + return; + } + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "ack: for seq %d from peer %s", + .format_args = "i4T4", + }; + /* *INDENT-ON* */ + struct + { + u32 seq; + u32 peer; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->seq = mp->local_sequence; + ed->peer = elog_id_for_peer_id (mcm, peer->id.as_u64); + } + + r->unacked_by_peer_bitmap = + clib_bitmap_andnoti (r->unacked_by_peer_bitmap, peer - s->peers); + + /* Not all clients have ack'ed */ + if (!clib_bitmap_is_zero (r->unacked_by_peer_bitmap)) + { + return; + } + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "ack: retire fifo elt loc seq %d after %d acks", + .format_args = "i4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 seq; + u32 npeers; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->seq = r->local_sequence; + ed->npeers = pool_elts (s->peers); + } + + hash_unset (s->retry_index_by_local_sequence, mp->local_sequence); + mc_retry_free (mcm, s, r); + remove_retry_from_pool (s, r); + maybe_send_window_open_event (vm, s); +} + +#define EVENT_MC_SEND_CATCHUP_DATA 0 + +static uword +mc_catchup_process (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * f) +{ + mc_main_t *mcm = mc_node_get_main (node); + uword *event_data = 0; + mc_catchup_process_arg_t *args; + int i; + + while (1) + { + if (event_data) + _vec_len (event_data) = 0; + vlib_process_wait_for_event_with_type (vm, &event_data, + EVENT_MC_SEND_CATCHUP_DATA); + + for (i = 0; i < vec_len (event_data); i++) + { + args = pool_elt_at_index (mcm->catchup_process_args, event_data[i]); + + mcm->transport.catchup_send_fun (mcm->transport.opaque, + args->catchup_opaque, + args->catchup_snapshot); + + /* Send function will free snapshot data vector. */ + pool_put (mcm->catchup_process_args, args); + } + } + + return 0; /* not likely */ +} + +static void +serialize_mc_stream (serialize_main_t * m, va_list * va) +{ + mc_stream_t *s = va_arg (*va, mc_stream_t *); + mc_stream_peer_t *p; + + serialize_integer (m, pool_elts (s->peers), sizeof (u32)); + /* *INDENT-OFF* */ + pool_foreach (p, s->peers, ({ + u8 * x = serialize_get (m, sizeof (p->id)); + clib_memcpy (x, p->id.as_u8, sizeof (p->id)); + serialize_integer (m, p->last_sequence_received, + sizeof (p->last_sequence_received)); + })); +/* *INDENT-ON* */ + serialize_bitmap (m, s->all_peer_bitmap); +} + +void +unserialize_mc_stream (serialize_main_t * m, va_list * va) +{ + mc_stream_t *s = va_arg (*va, mc_stream_t *); + u32 i, n_peers; + mc_stream_peer_t *p; + + unserialize_integer (m, &n_peers, sizeof (u32)); + mhash_init (&s->peer_index_by_id, sizeof (uword), sizeof (mc_peer_id_t)); + for (i = 0; i < n_peers; i++) + { + u8 *x; + pool_get (s->peers, p); + x = unserialize_get (m, sizeof (p->id)); + clib_memcpy (p->id.as_u8, x, sizeof (p->id)); + unserialize_integer (m, &p->last_sequence_received, + sizeof (p->last_sequence_received)); + mhash_set (&s->peer_index_by_id, &p->id, p - s->peers, /* old_value */ + 0); + } + s->all_peer_bitmap = unserialize_bitmap (m); + + /* This is really bad. */ + if (!s->all_peer_bitmap) + clib_warning ("BUG: stream %s all_peer_bitmap NULL", s->config.name); +} + +void +mc_msg_catchup_request_handler (mc_main_t * mcm, + mc_msg_catchup_request_t * req, + u32 catchup_opaque) +{ + vlib_main_t *vm = mcm->vlib_main; + mc_stream_t *s; + mc_catchup_process_arg_t *args; + + mc_byte_swap_msg_catchup_request (req); + + s = mc_stream_by_index (mcm, req->stream_index); + if (!s || s->state != MC_STREAM_STATE_ready) + return; + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "catchup-request: from %s stream %d", + .format_args = "T4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 peer, stream; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->peer = elog_id_for_peer_id (mcm, req->peer_id.as_u64); + ed->stream = req->stream_index; + } + + /* + * The application has to snapshoot its data structures right + * here, right now. If we process any messages after + * noting the last global sequence we've processed, the client + * won't be able to accurately reconstruct our data structures. + * + * Once the data structures are e.g. vec_dup()'ed, we + * send the resulting messages from a separate process, to + * make sure that we don't cause a bunch of message retransmissions + */ + pool_get (mcm->catchup_process_args, args); + + args->stream_index = s - mcm->stream_vector; + args->catchup_opaque = catchup_opaque; + args->catchup_snapshot = 0; + + /* Construct catchup reply and snapshot state for stream to send as + catchup reply payload. */ + { + mc_msg_catchup_reply_t *rep; + serialize_main_t m; + + vec_resize (args->catchup_snapshot, sizeof (rep[0])); + + rep = (void *) args->catchup_snapshot; + + rep->peer_id = req->peer_id; + rep->stream_index = req->stream_index; + rep->last_global_sequence_included = s->last_global_sequence_processed; + + /* Setup for serialize to append to catchup snapshot. */ + serialize_open_vector (&m, args->catchup_snapshot); + m.stream.current_buffer_index = vec_len (m.stream.buffer); + + serialize (&m, serialize_mc_stream, s); + + args->catchup_snapshot = serialize_close_vector (&m); + + /* Actually copy internal state */ + args->catchup_snapshot = s->config.catchup_snapshot + (mcm, args->catchup_snapshot, rep->last_global_sequence_included); + + rep = (void *) args->catchup_snapshot; + rep->n_data_bytes = vec_len (args->catchup_snapshot) - sizeof (rep[0]); + + mc_byte_swap_msg_catchup_reply (rep); + } + + /* now go send it... */ + vlib_process_signal_event (vm, mcm->catchup_process, + EVENT_MC_SEND_CATCHUP_DATA, + args - mcm->catchup_process_args); +} + +#define EVENT_MC_UNSERIALIZE_BUFFER 0 +#define EVENT_MC_UNSERIALIZE_CATCHUP 1 + +void +mc_msg_catchup_reply_handler (mc_main_t * mcm, mc_msg_catchup_reply_t * mp, + u32 catchup_opaque) +{ + vlib_process_signal_event (mcm->vlib_main, + mcm->unserialize_process, + EVENT_MC_UNSERIALIZE_CATCHUP, + pointer_to_uword (mp)); +} + +static void +perform_catchup (mc_main_t * mcm, mc_msg_catchup_reply_t * mp) +{ + mc_stream_t *s; + i32 seq_cmp_result; + + mc_byte_swap_msg_catchup_reply (mp); + + s = mc_stream_by_index (mcm, mp->stream_index); + + /* Never heard of this stream or already caught up. */ + if (!s || s->state == MC_STREAM_STATE_ready) + return; + + { + serialize_main_t m; + mc_stream_peer_t *p; + u32 n_stream_bytes; + + /* For offline sim replay: save the entire catchup snapshot... */ + if (s->config.save_snapshot) + s->config.save_snapshot (mcm, /* is_catchup */ 1, mp->data, + mp->n_data_bytes); + + unserialize_open_data (&m, mp->data, mp->n_data_bytes); + unserialize (&m, unserialize_mc_stream, s); + + /* Make sure we start numbering our messages as expected */ + /* *INDENT-OFF* */ + pool_foreach (p, s->peers, ({ + if (p->id.as_u64 == mcm->transport.our_ack_peer_id.as_u64) + s->our_local_sequence = p->last_sequence_received + 1; + })); +/* *INDENT-ON* */ + + n_stream_bytes = m.stream.current_buffer_index; + + /* No need to unserialize close; nothing to free. */ + + /* After serialized stream is user's catchup data. */ + s->config.catchup (mcm, mp->data + n_stream_bytes, + mp->n_data_bytes - n_stream_bytes); + } + + /* Vector could have been moved by catchup. + This can only happen for mc-internal stream. */ + s = mc_stream_by_index (mcm, mp->stream_index); + + s->last_global_sequence_processed = mp->last_global_sequence_included; + + while (clib_fifo_elts (s->catchup_fifo)) + { + mc_msg_user_request_t *gp; + u32 bi; + vlib_buffer_t *b; + + clib_fifo_sub1 (s->catchup_fifo, bi); + + b = vlib_get_buffer (mcm->vlib_main, bi); + gp = vlib_buffer_get_current (b); + + /* Make sure we're replaying "new" news */ + seq_cmp_result = mc_seq_cmp (gp->global_sequence, + mp->last_global_sequence_included); + + if (seq_cmp_result > 0) + { + vlib_buffer_advance (b, sizeof (gp[0])); + s->config.rx_buffer (mcm, s, gp->peer_id, bi); + s->last_global_sequence_processed = gp->global_sequence; + + if (MC_EVENT_LOGGING) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "catchup replay local sequence 0x%x", + .format_args = "i4", + }; + /* *INDENT-ON* */ + struct + { + u32 local_sequence; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->local_sequence = gp->local_sequence; + } + } + else + { + if (MC_EVENT_LOGGING) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (t) = + { + .format = "catchup discard local sequence 0x%x", + .format_args = "i4", + }; + /* *INDENT-ON* */ + struct + { + u32 local_sequence; + } *ed; + ed = ELOG_DATA (mcm->elog_main, t); + ed->local_sequence = gp->local_sequence; + } + + vlib_buffer_free_one (mcm->vlib_main, bi); + } + } + + s->state = MC_STREAM_STATE_ready; + + /* Now that we are caught up wake up joining process. */ + { + vlib_one_time_waiting_process_t *wp; + vec_foreach (wp, s->procs_waiting_for_join_done) + vlib_signal_one_time_waiting_process (mcm->vlib_main, wp); + if (s->procs_waiting_for_join_done) + _vec_len (s->procs_waiting_for_join_done) = 0; + } +} + +static void +this_node_maybe_master (mc_main_t * mcm) +{ + vlib_main_t *vm = mcm->vlib_main; + mc_msg_master_assert_t *mp; + uword event_type; + int timeouts = 0; + int is_master = mcm->relay_state == MC_RELAY_STATE_MASTER; + clib_error_t *error; + f64 now, time_last_master_assert = -1; + u32 bi; + + while (1) + { + if (!mcm->we_can_be_relay_master) + { + mcm->relay_state = MC_RELAY_STATE_SLAVE; + if (MC_EVENT_LOGGING) + { + ELOG_TYPE (e, "become slave (config)"); + ELOG (mcm->elog_main, e, 0); + } + return; + } + + now = vlib_time_now (vm); + if (now >= time_last_master_assert + 1) + { + time_last_master_assert = now; + mp = mc_get_vlib_buffer (mcm->vlib_main, sizeof (mp[0]), &bi); + + mp->peer_id = mcm->transport.our_ack_peer_id; + mp->global_sequence = mcm->relay_global_sequence; + + /* + * these messages clog the event log, set MC_EVENT_LOGGING higher + * if you want them + */ + if (MC_EVENT_LOGGING > 1) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "tx-massert: peer %s global seq %u", + .format_args = "T4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 peer, global_sequence; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->peer = elog_id_for_peer_id (mcm, mp->peer_id.as_u64); + ed->global_sequence = mp->global_sequence; + } + + mc_byte_swap_msg_master_assert (mp); + + error = + mcm->transport.tx_buffer (mcm->transport.opaque, + MC_TRANSPORT_MASTERSHIP, bi); + if (error) + clib_error_report (error); + } + + vlib_process_wait_for_event_or_clock (vm, 1.0); + event_type = vlib_process_get_events (vm, /* no event data */ 0); + + switch (event_type) + { + case ~0: + if (!is_master && timeouts++ > 2) + { + mcm->relay_state = MC_RELAY_STATE_MASTER; + mcm->relay_master_peer_id = + mcm->transport.our_ack_peer_id.as_u64; + if (MC_EVENT_LOGGING) + { + ELOG_TYPE (e, "become master (was maybe_master)"); + ELOG (mcm->elog_main, e, 0); + } + return; + } + break; + + case MC_RELAY_STATE_SLAVE: + mcm->relay_state = MC_RELAY_STATE_SLAVE; + if (MC_EVENT_LOGGING && mcm->relay_state != MC_RELAY_STATE_SLAVE) + { + ELOG_TYPE (e, "become slave (was maybe_master)"); + ELOG (mcm->elog_main, e, 0); + } + return; + } + } +} + +static void +this_node_slave (mc_main_t * mcm) +{ + vlib_main_t *vm = mcm->vlib_main; + uword event_type; + int timeouts = 0; + + if (MC_EVENT_LOGGING) + { + ELOG_TYPE (e, "become slave"); + ELOG (mcm->elog_main, e, 0); + } + + while (1) + { + vlib_process_wait_for_event_or_clock (vm, 1.0); + event_type = vlib_process_get_events (vm, /* no event data */ 0); + + switch (event_type) + { + case ~0: + if (timeouts++ > 2) + { + mcm->relay_state = MC_RELAY_STATE_NEGOTIATE; + mcm->relay_master_peer_id = ~0ULL; + if (MC_EVENT_LOGGING) + { + ELOG_TYPE (e, "timeouts; negoitate mastership"); + ELOG (mcm->elog_main, e, 0); + } + return; + } + break; + + case MC_RELAY_STATE_SLAVE: + mcm->relay_state = MC_RELAY_STATE_SLAVE; + timeouts = 0; + break; + } + } +} + +static uword +mc_mastership_process (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * f) +{ + mc_main_t *mcm = mc_node_get_main (node); + + while (1) + { + switch (mcm->relay_state) + { + case MC_RELAY_STATE_NEGOTIATE: + case MC_RELAY_STATE_MASTER: + this_node_maybe_master (mcm); + break; + + case MC_RELAY_STATE_SLAVE: + this_node_slave (mcm); + break; + } + } + return 0; /* not likely */ +} + +void +mc_enable_disable_mastership (mc_main_t * mcm, int we_can_be_master) +{ + if (we_can_be_master != mcm->we_can_be_relay_master) + { + mcm->we_can_be_relay_master = we_can_be_master; + vlib_process_signal_event (mcm->vlib_main, + mcm->mastership_process, + MC_RELAY_STATE_NEGOTIATE, 0); + } +} + +void +mc_msg_master_assert_handler (mc_main_t * mcm, mc_msg_master_assert_t * mp, + u32 buffer_index) +{ + mc_peer_id_t his_peer_id, our_peer_id; + i32 seq_cmp_result; + u8 signal_slave = 0; + u8 update_global_sequence = 0; + + mc_byte_swap_msg_master_assert (mp); + + his_peer_id = mp->peer_id; + our_peer_id = mcm->transport.our_ack_peer_id; + + /* compare the incoming global sequence with ours */ + seq_cmp_result = mc_seq_cmp (mp->global_sequence, + mcm->relay_global_sequence); + + /* If the sender has a lower peer id and the sender's sequence >= + our global sequence, we become a slave. Otherwise we are master. */ + if (mc_peer_id_compare (his_peer_id, our_peer_id) < 0 + && seq_cmp_result >= 0) + { + vlib_process_signal_event (mcm->vlib_main, + mcm->mastership_process, + MC_RELAY_STATE_SLAVE, 0); + signal_slave = 1; + } + + /* Update our global sequence. */ + if (seq_cmp_result > 0) + { + mcm->relay_global_sequence = mp->global_sequence; + update_global_sequence = 1; + } + + { + uword *q = mhash_get (&mcm->mastership_peer_index_by_id, &his_peer_id); + mc_mastership_peer_t *p; + + if (q) + p = vec_elt_at_index (mcm->mastership_peers, q[0]); + else + { + vec_add2 (mcm->mastership_peers, p, 1); + p->peer_id = his_peer_id; + mhash_set (&mcm->mastership_peer_index_by_id, &p->peer_id, + p - mcm->mastership_peers, + /* old_value */ 0); + } + p->time_last_master_assert_received = vlib_time_now (mcm->vlib_main); + } + + /* + * these messages clog the event log, set MC_EVENT_LOGGING higher + * if you want them. + */ + if (MC_EVENT_LOGGING > 1) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "rx-massert: peer %s global seq %u upd %d slave %d", + .format_args = "T4i4i1i1", + }; + /* *INDENT-ON* */ + + struct + { + u32 peer; + u32 global_sequence; + u8 update_sequence; + u8 slave; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->peer = elog_id_for_peer_id (mcm, his_peer_id.as_u64); + ed->global_sequence = mp->global_sequence; + ed->update_sequence = update_global_sequence; + ed->slave = signal_slave; + } +} + +static void +mc_serialize_init (mc_main_t * mcm) +{ + mc_serialize_msg_t *m; + vlib_main_t *vm = vlib_get_main (); + + mcm->global_msg_index_by_name + = hash_create_string ( /* elts */ 0, sizeof (uword)); + + m = vm->mc_msg_registrations; + + while (m) + { + m->global_index = vec_len (mcm->global_msgs); + hash_set_mem (mcm->global_msg_index_by_name, m->name, m->global_index); + vec_add1 (mcm->global_msgs, m); + m = m->next_registration; + } +} + +clib_error_t * +mc_serialize_va (mc_main_t * mc, + u32 stream_index, + u32 multiple_messages_per_vlib_buffer, + mc_serialize_msg_t * msg, va_list * va) +{ + mc_stream_t *s; + clib_error_t *error; + serialize_main_t *m = &mc->serialize_mains[VLIB_TX]; + vlib_serialize_buffer_main_t *sbm = &mc->serialize_buffer_mains[VLIB_TX]; + u32 bi, n_before, n_after, n_total, n_this_msg; + u32 si, gi; + + if (!sbm->vlib_main) + { + sbm->tx.max_n_data_bytes_per_chain = 4096; + sbm->tx.free_list_index = VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX; + } + + if (sbm->first_buffer == 0) + serialize_open_vlib_buffer (m, mc->vlib_main, sbm); + + n_before = serialize_vlib_buffer_n_bytes (m); + + s = mc_stream_by_index (mc, stream_index); + gi = msg->global_index; + ASSERT (msg == vec_elt (mc->global_msgs, gi)); + + si = ~0; + if (gi < vec_len (s->stream_msg_index_by_global_index)) + si = s->stream_msg_index_by_global_index[gi]; + + serialize_likely_small_unsigned_integer (m, si); + + /* For first time message is sent, use name to identify message. */ + if (si == ~0 || MSG_ID_DEBUG) + serialize_cstring (m, msg->name); + + if (MSG_ID_DEBUG && MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "serialize-msg: %s index %d", + .format_args = "T4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 c[2]; + } *ed; + ed = ELOG_DATA (mc->elog_main, e); + ed->c[0] = elog_id_for_msg_name (mc, msg->name); + ed->c[1] = si; + } + + error = va_serialize (m, va); + + n_after = serialize_vlib_buffer_n_bytes (m); + n_this_msg = n_after - n_before; + n_total = n_after + sizeof (mc_msg_user_request_t); + + /* For max message size ignore first message where string name is sent. */ + if (si != ~0) + msg->max_n_bytes_serialized = + clib_max (msg->max_n_bytes_serialized, n_this_msg); + + if (!multiple_messages_per_vlib_buffer + || si == ~0 + || n_total + msg->max_n_bytes_serialized > + mc->transport.max_packet_size) + { + bi = serialize_close_vlib_buffer (m); + sbm->first_buffer = 0; + if (!error) + mc_stream_send (mc, stream_index, bi); + else if (bi != ~0) + vlib_buffer_free_one (mc->vlib_main, bi); + } + + return error; +} + +clib_error_t * +mc_serialize_internal (mc_main_t * mc, + u32 stream_index, + u32 multiple_messages_per_vlib_buffer, + mc_serialize_msg_t * msg, ...) +{ + vlib_main_t *vm = mc->vlib_main; + va_list va; + clib_error_t *error; + + if (stream_index == ~0) + { + if (vm->mc_main && vm->mc_stream_index == ~0) + vlib_current_process_wait_for_one_time_event_vector + (vm, &vm->procs_waiting_for_mc_stream_join); + stream_index = vm->mc_stream_index; + } + + va_start (va, msg); + error = mc_serialize_va (mc, stream_index, + multiple_messages_per_vlib_buffer, msg, &va); + va_end (va); + return error; +} + +uword +mc_unserialize_message (mc_main_t * mcm, + mc_stream_t * s, serialize_main_t * m) +{ + mc_serialize_stream_msg_t *sm; + u32 gi, si; + + si = unserialize_likely_small_unsigned_integer (m); + + if (!(si == ~0 || MSG_ID_DEBUG)) + { + sm = vec_elt_at_index (s->stream_msgs, si); + gi = sm->global_index; + } + else + { + char *name; + + unserialize_cstring (m, &name); + + if (MSG_ID_DEBUG && MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "unserialize-msg: %s rx index %d", + .format_args = "T4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 c[2]; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->c[0] = elog_id_for_msg_name (mcm, name); + ed->c[1] = si; + } + + { + uword *p = hash_get_mem (mcm->global_msg_index_by_name, name); + gi = p ? p[0] : ~0; + } + + /* Unknown message? */ + if (gi == ~0) + { + vec_free (name); + goto done; + } + + vec_validate_init_empty (s->stream_msg_index_by_global_index, gi, ~0); + si = s->stream_msg_index_by_global_index[gi]; + + /* Stream local index unknown? Create it. */ + if (si == ~0) + { + vec_add2 (s->stream_msgs, sm, 1); + + si = sm - s->stream_msgs; + sm->global_index = gi; + s->stream_msg_index_by_global_index[gi] = si; + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "msg-bind: stream %d %s to index %d", + .format_args = "i4T4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 c[3]; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->c[0] = s->index; + ed->c[1] = elog_id_for_msg_name (mcm, name); + ed->c[2] = si; + } + } + else + { + sm = vec_elt_at_index (s->stream_msgs, si); + if (gi != sm->global_index && MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "msg-id-ERROR: %s index %d expected %d", + .format_args = "T4i4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 c[3]; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->c[0] = elog_id_for_msg_name (mcm, name); + ed->c[1] = si; + ed->c[2] = ~0; + if (sm->global_index < + vec_len (s->stream_msg_index_by_global_index)) + ed->c[2] = + s->stream_msg_index_by_global_index[sm->global_index]; + } + } + + vec_free (name); + } + + if (gi != ~0) + { + mc_serialize_msg_t *msg; + msg = vec_elt (mcm->global_msgs, gi); + unserialize (m, msg->unserialize, mcm); + } + +done: + return gi != ~0; +} + +void +mc_unserialize_internal (mc_main_t * mcm, u32 stream_and_buffer_index) +{ + vlib_main_t *vm = mcm->vlib_main; + serialize_main_t *m = &mcm->serialize_mains[VLIB_RX]; + vlib_serialize_buffer_main_t *sbm = &mcm->serialize_buffer_mains[VLIB_RX]; + mc_stream_and_buffer_t *sb; + mc_stream_t *stream; + u32 buffer_index; + + sb = + pool_elt_at_index (mcm->mc_unserialize_stream_and_buffers, + stream_and_buffer_index); + buffer_index = sb->buffer_index; + stream = vec_elt_at_index (mcm->stream_vector, sb->stream_index); + pool_put (mcm->mc_unserialize_stream_and_buffers, sb); + + if (stream->config.save_snapshot) + { + u32 n_bytes = vlib_buffer_index_length_in_chain (vm, buffer_index); + static u8 *contents; + vec_reset_length (contents); + vec_validate (contents, n_bytes - 1); + vlib_buffer_contents (vm, buffer_index, contents); + stream->config.save_snapshot (mcm, /* is_catchup */ 0, contents, + n_bytes); + } + + ASSERT (vlib_in_process_context (vm)); + + unserialize_open_vlib_buffer (m, vm, sbm); + + clib_fifo_add1 (sbm->rx.buffer_fifo, buffer_index); + + while (unserialize_vlib_buffer_n_bytes (m) > 0) + mc_unserialize_message (mcm, stream, m); + + /* Frees buffer. */ + unserialize_close_vlib_buffer (m); +} + +void +mc_unserialize (mc_main_t * mcm, mc_stream_t * s, u32 buffer_index) +{ + vlib_main_t *vm = mcm->vlib_main; + mc_stream_and_buffer_t *sb; + pool_get (mcm->mc_unserialize_stream_and_buffers, sb); + sb->stream_index = s->index; + sb->buffer_index = buffer_index; + vlib_process_signal_event (vm, mcm->unserialize_process, + EVENT_MC_UNSERIALIZE_BUFFER, + sb - mcm->mc_unserialize_stream_and_buffers); +} + +static uword +mc_unserialize_process (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * f) +{ + mc_main_t *mcm = mc_node_get_main (node); + uword event_type, *event_data = 0; + int i; + + while (1) + { + if (event_data) + _vec_len (event_data) = 0; + + vlib_process_wait_for_event (vm); + event_type = vlib_process_get_events (vm, &event_data); + switch (event_type) + { + case EVENT_MC_UNSERIALIZE_BUFFER: + for (i = 0; i < vec_len (event_data); i++) + mc_unserialize_internal (mcm, event_data[i]); + break; + + case EVENT_MC_UNSERIALIZE_CATCHUP: + for (i = 0; i < vec_len (event_data); i++) + { + u8 *mp = uword_to_pointer (event_data[i], u8 *); + perform_catchup (mcm, (void *) mp); + vec_free (mp); + } + break; + + default: + break; + } + } + + return 0; /* not likely */ +} + +void +serialize_mc_main (serialize_main_t * m, va_list * va) +{ + mc_main_t *mcm = va_arg (*va, mc_main_t *); + mc_stream_t *s; + mc_serialize_stream_msg_t *sm; + mc_serialize_msg_t *msg; + + serialize_integer (m, vec_len (mcm->stream_vector), sizeof (u32)); + vec_foreach (s, mcm->stream_vector) + { + /* Stream name. */ + serialize_cstring (m, s->config.name); + + /* Serialize global names for all sent messages. */ + serialize_integer (m, vec_len (s->stream_msgs), sizeof (u32)); + vec_foreach (sm, s->stream_msgs) + { + msg = vec_elt (mcm->global_msgs, sm->global_index); + serialize_cstring (m, msg->name); + } + } +} + +void +unserialize_mc_main (serialize_main_t * m, va_list * va) +{ + mc_main_t *mcm = va_arg (*va, mc_main_t *); + u32 i, n_streams, n_stream_msgs; + char *name; + mc_stream_t *s; + mc_serialize_stream_msg_t *sm; + + unserialize_integer (m, &n_streams, sizeof (u32)); + for (i = 0; i < n_streams; i++) + { + unserialize_cstring (m, &name); + if (i != MC_STREAM_INDEX_INTERNAL && !mc_stream_by_name (mcm, name)) + { + vec_validate (mcm->stream_vector, i); + s = vec_elt_at_index (mcm->stream_vector, i); + mc_stream_init (s); + s->index = s - mcm->stream_vector; + s->config.name = name; + s->state = MC_STREAM_STATE_name_known; + hash_set_mem (mcm->stream_index_by_name, s->config.name, s->index); + } + else + vec_free (name); + + s = vec_elt_at_index (mcm->stream_vector, i); + + vec_free (s->stream_msgs); + vec_free (s->stream_msg_index_by_global_index); + + unserialize_integer (m, &n_stream_msgs, sizeof (u32)); + vec_resize (s->stream_msgs, n_stream_msgs); + vec_foreach (sm, s->stream_msgs) + { + uword *p; + u32 si, gi; + + unserialize_cstring (m, &name); + p = hash_get (mcm->global_msg_index_by_name, name); + gi = p ? p[0] : ~0; + si = sm - s->stream_msgs; + + if (MC_EVENT_LOGGING > 0) + { + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "catchup-bind: %s to %d global index %d stream %d", + .format_args = "T4i4i4i4", + }; + /* *INDENT-ON* */ + + struct + { + u32 c[4]; + } *ed; + ed = ELOG_DATA (mcm->elog_main, e); + ed->c[0] = elog_id_for_msg_name (mcm, name); + ed->c[1] = si; + ed->c[2] = gi; + ed->c[3] = s->index; + } + + vec_free (name); + + sm->global_index = gi; + if (gi != ~0) + { + vec_validate_init_empty (s->stream_msg_index_by_global_index, + gi, ~0); + s->stream_msg_index_by_global_index[gi] = si; + } + } + } +} + +void +mc_main_init (mc_main_t * mcm, char *tag) +{ + vlib_main_t *vm = vlib_get_main (); + + mcm->vlib_main = vm; + mcm->elog_main = &vm->elog_main; + + mcm->relay_master_peer_id = ~0ULL; + mcm->relay_state = MC_RELAY_STATE_NEGOTIATE; + + mcm->stream_index_by_name + = hash_create_string ( /* elts */ 0, /* value size */ sizeof (uword)); + + { + vlib_node_registration_t r; + + memset (&r, 0, sizeof (r)); + + r.type = VLIB_NODE_TYPE_PROCESS; + + /* Point runtime data to main instance. */ + r.runtime_data = &mcm; + r.runtime_data_bytes = sizeof (&mcm); + + r.name = (char *) format (0, "mc-mastership-%s", tag); + r.function = mc_mastership_process; + mcm->mastership_process = vlib_register_node (vm, &r); + + r.name = (char *) format (0, "mc-join-ager-%s", tag); + r.function = mc_join_ager_process; + mcm->join_ager_process = vlib_register_node (vm, &r); + + r.name = (char *) format (0, "mc-retry-%s", tag); + r.function = mc_retry_process; + mcm->retry_process = vlib_register_node (vm, &r); + + r.name = (char *) format (0, "mc-catchup-%s", tag); + r.function = mc_catchup_process; + mcm->catchup_process = vlib_register_node (vm, &r); + + r.name = (char *) format (0, "mc-unserialize-%s", tag); + r.function = mc_unserialize_process; + mcm->unserialize_process = vlib_register_node (vm, &r); + } + + if (MC_EVENT_LOGGING > 0) + mhash_init (&mcm->elog_id_by_peer_id, sizeof (uword), + sizeof (mc_peer_id_t)); + + mhash_init (&mcm->mastership_peer_index_by_id, sizeof (uword), + sizeof (mc_peer_id_t)); + mc_serialize_init (mcm); +} + +static u8 * +format_mc_relay_state (u8 * s, va_list * args) +{ + mc_relay_state_t state = va_arg (*args, mc_relay_state_t); + char *t = 0; + switch (state) + { + case MC_RELAY_STATE_NEGOTIATE: + t = "negotiate"; + break; + case MC_RELAY_STATE_MASTER: + t = "master"; + break; + case MC_RELAY_STATE_SLAVE: + t = "slave"; + break; + default: + return format (s, "unknown 0x%x", state); + } + + return format (s, "%s", t); +} + +static u8 * +format_mc_stream_state (u8 * s, va_list * args) +{ + mc_stream_state_t state = va_arg (*args, mc_stream_state_t); + char *t = 0; + switch (state) + { +#define _(f) case MC_STREAM_STATE_##f: t = #f; break; + foreach_mc_stream_state +#undef _ + default: + return format (s, "unknown 0x%x", state); + } + + return format (s, "%s", t); +} + +static int +mc_peer_comp (void *a1, void *a2) +{ + mc_stream_peer_t *p1 = a1; + mc_stream_peer_t *p2 = a2; + + return mc_peer_id_compare (p1->id, p2->id); +} + +u8 * +format_mc_main (u8 * s, va_list * args) +{ + mc_main_t *mcm = va_arg (*args, mc_main_t *); + mc_stream_t *t; + mc_stream_peer_t *p, *ps; + uword indent = format_get_indent (s); + + s = format (s, "MC state %U, %d streams joined, global sequence 0x%x", + format_mc_relay_state, mcm->relay_state, + vec_len (mcm->stream_vector), mcm->relay_global_sequence); + + { + mc_mastership_peer_t *mp; + f64 now = vlib_time_now (mcm->vlib_main); + s = format (s, "\n%UMost recent mastership peers:", + format_white_space, indent + 2); + vec_foreach (mp, mcm->mastership_peers) + { + s = format (s, "\n%U%-30U%.4e", + format_white_space, indent + 4, + mcm->transport.format_peer_id, mp->peer_id, + now - mp->time_last_master_assert_received); + } + } + + vec_foreach (t, mcm->stream_vector) + { + s = format (s, "\n%Ustream `%s' index %d", + format_white_space, indent + 2, t->config.name, t->index); + + s = format (s, "\n%Ustate %U", + format_white_space, indent + 4, + format_mc_stream_state, t->state); + + s = + format (s, + "\n%Uretries: interval %.0f sec, limit %d, pool elts %d, %Ld sent", + format_white_space, indent + 4, t->config.retry_interval, + t->config.retry_limit, pool_elts (t->retry_pool), + t->stats.n_retries - t->stats_last_clear.n_retries); + + s = format (s, "\n%U%Ld/%Ld user requests sent/received", + format_white_space, indent + 4, + t->user_requests_sent, t->user_requests_received); + + s = format (s, "\n%U%d peers, local/global sequence 0x%x/0x%x", + format_white_space, indent + 4, + pool_elts (t->peers), + t->our_local_sequence, t->last_global_sequence_processed); + + ps = 0; + /* *INDENT-OFF* */ + pool_foreach (p, t->peers, + ({ + if (clib_bitmap_get (t->all_peer_bitmap, p - t->peers)) + vec_add1 (ps, p[0]); + })); + /* *INDENT-ON* */ + vec_sort_with_function (ps, mc_peer_comp); + s = format (s, "\n%U%=30s%10s%16s%16s", + format_white_space, indent + 6, + "Peer", "Last seq", "Retries", "Future"); + + vec_foreach (p, ps) + { + s = format (s, "\n%U%-30U0x%08x%16Ld%16Ld%s", + format_white_space, indent + 6, + mcm->transport.format_peer_id, p->id.as_u64, + p->last_sequence_received, + p->stats.n_msgs_from_past - + p->stats_last_clear.n_msgs_from_past, + p->stats.n_msgs_from_future - + p->stats_last_clear.n_msgs_from_future, + (mcm->transport.our_ack_peer_id.as_u64 == + p->id.as_u64 ? " (self)" : "")); + } + vec_free (ps); + } + + return s; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/mc.h b/src/vlib/mc.h new file mode 100644 index 00000000..dc95b0e9 --- /dev/null +++ b/src/vlib/mc.h @@ -0,0 +1,687 @@ +/* + * mc.h: vlib reliable sequenced multicast distributed applications + * + * Copyright (c) 2010 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_vlib_mc_h +#define included_vlib_mc_h + +#include +#include +#include +#include + +#ifndef MC_EVENT_LOGGING +#define MC_EVENT_LOGGING 1 +#endif + +always_inline uword +mc_need_byte_swap (void) +{ + return CLIB_ARCH_IS_LITTLE_ENDIAN; +} + +/* + * Used to uniquely identify hosts. + * For IP4 this would be ip4_address plus tcp/udp port. + */ +typedef union +{ + u8 as_u8[8]; + u64 as_u64; +} mc_peer_id_t; + +always_inline mc_peer_id_t +mc_byte_swap_peer_id (mc_peer_id_t i) +{ + /* Peer id is already in network byte order. */ + return i; +} + +always_inline int +mc_peer_id_compare (mc_peer_id_t a, mc_peer_id_t b) +{ + return memcmp (a.as_u8, b.as_u8, sizeof (a.as_u8)); +} + +/* Assert mastership. Lowest peer_id amount all peers wins mastership. + Only sent/received over mastership channel (MC_TRANSPORT_MASTERSHIP). + So, we don't need a message opcode. */ +typedef CLIB_PACKED (struct + { + /* Peer id asserting mastership. */ + mc_peer_id_t peer_id; + /* Global sequence number asserted. */ + u32 global_sequence;}) mc_msg_master_assert_t; + +always_inline void +mc_byte_swap_msg_master_assert (mc_msg_master_assert_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->global_sequence = clib_byte_swap_u32 (r->global_sequence); + } +} + +#define foreach_mc_msg_type \ + _ (master_assert) \ + _ (join_or_leave_request) \ + _ (join_reply) \ + _ (user_request) \ + _ (user_ack) \ + _ (catchup_request) \ + _ (catchup_reply) + +typedef enum +{ +#define _(f) MC_MSG_TYPE_##f, + foreach_mc_msg_type +#undef _ +} mc_relay_msg_type_t; + +/* Request to join a given stream. Multicast over MC_TRANSPORT_JOIN. */ +typedef CLIB_PACKED (struct + { +mc_peer_id_t peer_id; mc_relay_msg_type_t type:32; + /* MC_MSG_TYPE_join_or_leave_request */ + /* Stream to join or leave. */ + u32 stream_index; + /* join = 1, leave = 0 */ + u8 is_join;}) mc_msg_join_or_leave_request_t; + +always_inline void +mc_byte_swap_msg_join_or_leave_request (mc_msg_join_or_leave_request_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->type = clib_byte_swap_u32 (r->type); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + } +} + +/* Join reply. Multicast over MC_TRANSPORT_JOIN. */ +typedef CLIB_PACKED (struct + { +mc_peer_id_t peer_id; mc_relay_msg_type_t type:32; + /* MC_MSG_TYPE_join_reply */ + u32 stream_index; + /* Peer ID to contact to catchup with this stream. */ + mc_peer_id_t catchup_peer_id;}) mc_msg_join_reply_t; + +always_inline void +mc_byte_swap_msg_join_reply (mc_msg_join_reply_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->type = clib_byte_swap_u32 (r->type); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + r->catchup_peer_id = mc_byte_swap_peer_id (r->catchup_peer_id); + } +} + +/* Generic (application) request. Multicast over MC_TRANSPORT_USER_REQUEST_TO_RELAY and then + relayed by relay master after filling in global sequence number. */ +typedef CLIB_PACKED (struct + { + mc_peer_id_t peer_id; u32 stream_index; + /* Global sequence number as filled in by relay master. */ + u32 global_sequence; + /* Local sequence number as filled in by peer sending message. */ + u32 local_sequence; + /* Size of request data. */ + u32 n_data_bytes; + /* Opaque request data. */ + u8 data[0];}) mc_msg_user_request_t; + +always_inline void +mc_byte_swap_msg_user_request (mc_msg_user_request_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + r->global_sequence = clib_byte_swap_u32 (r->global_sequence); + r->local_sequence = clib_byte_swap_u32 (r->local_sequence); + r->n_data_bytes = clib_byte_swap_u32 (r->n_data_bytes); + } +} + +/* Sent unicast over ACK channel. */ +typedef CLIB_PACKED (struct + { + mc_peer_id_t peer_id; + u32 global_sequence; u32 stream_index; + u32 local_sequence; + i32 seq_cmp_result;}) mc_msg_user_ack_t; + +always_inline void +mc_byte_swap_msg_user_ack (mc_msg_user_ack_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + r->global_sequence = clib_byte_swap_u32 (r->global_sequence); + r->local_sequence = clib_byte_swap_u32 (r->local_sequence); + r->seq_cmp_result = clib_byte_swap_i32 (r->seq_cmp_result); + } +} + +/* Sent/received unicast over catchup channel (e.g. using TCP). */ +typedef CLIB_PACKED (struct + { + mc_peer_id_t peer_id; + u32 stream_index;}) mc_msg_catchup_request_t; + +always_inline void +mc_byte_swap_msg_catchup_request (mc_msg_catchup_request_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + } +} + +/* Sent/received unicast over catchup channel. */ +typedef CLIB_PACKED (struct + { + mc_peer_id_t peer_id; u32 stream_index; + /* Last global sequence number included in catchup data. */ + u32 last_global_sequence_included; + /* Size of catchup data. */ + u32 n_data_bytes; + /* Catchup data. */ + u8 data[0];}) mc_msg_catchup_reply_t; + +always_inline void +mc_byte_swap_msg_catchup_reply (mc_msg_catchup_reply_t * r) +{ + if (mc_need_byte_swap ()) + { + r->peer_id = mc_byte_swap_peer_id (r->peer_id); + r->stream_index = clib_byte_swap_u32 (r->stream_index); + r->last_global_sequence_included = + clib_byte_swap_u32 (r->last_global_sequence_included); + r->n_data_bytes = clib_byte_swap_u32 (r->n_data_bytes); + } +} + +typedef struct _mc_serialize_msg +{ + /* Name for this type. */ + char *name; + + /* Functions to serialize/unserialize data. */ + serialize_function_t *serialize; + serialize_function_t *unserialize; + + /* Maximum message size in bytes when serialized. + If zero then this will be set to the largest sent message. */ + u32 max_n_bytes_serialized; + + /* Opaque to use for first argument to serialize/unserialize function. */ + u32 opaque; + + /* Index in global message vector. */ + u32 global_index; + + /* Registration list */ + struct _mc_serialize_msg *next_registration; +} mc_serialize_msg_t; + +typedef struct +{ + /* Index into global message vector. */ + u32 global_index; +} mc_serialize_stream_msg_t; + +#define MC_SERIALIZE_MSG(x,...) \ + __VA_ARGS__ mc_serialize_msg_t x; \ +static void __mc_serialize_msg_registration_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __mc_serialize_msg_registration_##x (void) \ +{ \ + vlib_main_t * vm = vlib_get_main(); \ + x.next_registration = vm->mc_msg_registrations; \ + vm->mc_msg_registrations = &x; \ +} \ +__VA_ARGS__ mc_serialize_msg_t x + +typedef enum +{ + MC_TRANSPORT_MASTERSHIP, + MC_TRANSPORT_JOIN, + MC_TRANSPORT_USER_REQUEST_TO_RELAY, + MC_TRANSPORT_USER_REQUEST_FROM_RELAY, + MC_N_TRANSPORT_TYPE, +} mc_transport_type_t; + +typedef struct +{ + clib_error_t *(*tx_buffer) (void *opaque, mc_transport_type_t type, + u32 buffer_index); + + clib_error_t *(*tx_ack) (void *opaque, mc_peer_id_t peer_id, + u32 buffer_index); + + /* Returns catchup opaque. */ + uword (*catchup_request_fun) (void *opaque, u32 stream_index, + mc_peer_id_t catchup_peer_id); + + void (*catchup_send_fun) (void *opaque, uword catchup_opaque, + u8 * data_vector); + + /* Opaque passed to callbacks. */ + void *opaque; + + mc_peer_id_t our_ack_peer_id; + mc_peer_id_t our_catchup_peer_id; + + /* Max packet size (MTU) for this transport. + For IP this is interface MTU less IP + UDP header size. */ + u32 max_packet_size; + + format_function_t *format_peer_id; +} mc_transport_t; + +typedef struct +{ + /* Count of messages received from this peer from the past/future + (with seq_cmp != 0). */ + u64 n_msgs_from_past; + u64 n_msgs_from_future; +} mc_stream_peer_stats_t; + +typedef struct +{ + /* ID of this peer. */ + mc_peer_id_t id; + + /* The last sequence we received from this peer. */ + u32 last_sequence_received; + + mc_stream_peer_stats_t stats, stats_last_clear; +} mc_stream_peer_t; + +typedef struct +{ + u32 buffer_index; + + /* Cached copy of local sequence number from buffer. */ + u32 local_sequence; + + /* Number of times this buffer has been sent (retried). */ + u32 n_retries; + + /* Previous/next retries in doubly-linked list. */ + u32 prev_index, next_index; + + /* Bitmap of all peers which have acked this msg */ + uword *unacked_by_peer_bitmap; + + /* Message send or resend time */ + f64 sent_at; +} mc_retry_t; + +typedef struct +{ + /* Number of retries sent for this stream. */ + u64 n_retries; +} mc_stream_stats_t; + +struct mc_main_t; +struct mc_stream_t; + +typedef struct +{ + /* Stream name. */ + char *name; + + /* Number of outstanding messages. */ + u32 window_size; + + /* Retry interval, in seconds */ + f64 retry_interval; + + /* Retry limit */ + u32 retry_limit; + + /* User rx buffer callback */ + void (*rx_buffer) (struct mc_main_t * mc_main, + struct mc_stream_t * stream, + mc_peer_id_t peer_id, u32 buffer_index); + + /* User callback to create a snapshot */ + u8 *(*catchup_snapshot) (struct mc_main_t * mc_main, + u8 * snapshot_vector, + u32 last_global_sequence_included); + + /* User callback to replay a snapshot */ + void (*catchup) (struct mc_main_t * mc_main, + u8 * snapshot_data, u32 n_snapshot_data_bytes); + + /* Callback to save a snapshot for offline replay */ + void (*save_snapshot) (struct mc_main_t * mc_main, + u32 is_catchup, + u8 * snapshot_data, u32 n_snapshot_data_bytes); + + /* Called when a peer dies */ + void (*peer_died) (struct mc_main_t * mc_main, + struct mc_stream_t * stream, mc_peer_id_t peer_id); +} mc_stream_config_t; + +#define foreach_mc_stream_state \ + _ (invalid) \ + _ (name_known) \ + _ (join_in_progress) \ + _ (catchup) \ + _ (ready) + +typedef enum +{ +#define _(f) MC_STREAM_STATE_##f, + foreach_mc_stream_state +#undef _ +} mc_stream_state_t; + +typedef struct mc_stream_t +{ + mc_stream_config_t config; + + mc_stream_state_t state; + + /* Index in stream pool. */ + u32 index; + + /* Stream index 0 is always for MC internal use. */ +#define MC_STREAM_INDEX_INTERNAL 0 + + mc_retry_t *retry_pool; + + /* Head and tail index of retry pool. */ + u32 retry_head_index, retry_tail_index; + + /* + * Country club for recently retired messages + * If the set of peers is expanding and a new peer + * misses a message, we can easily retire the FIFO + * element before we even know about the new peer + */ + mc_retry_t *retired_fifo; + + /* Hash mapping local sequence to retry pool index. */ + uword *retry_index_by_local_sequence; + + /* catch-up fifo of VLIB buffer indices. + start recording when catching up. */ + u32 *catchup_fifo; + + mc_stream_stats_t stats, stats_last_clear; + + /* Peer pool. */ + mc_stream_peer_t *peers; + + /* Bitmap with ones for all peers in peer pool. */ + uword *all_peer_bitmap; + + /* Map of 64 bit id to index in stream pool. */ + mhash_t peer_index_by_id; + + /* Timeout, in case we're alone in the world */ + f64 join_timeout; + + vlib_one_time_waiting_process_t *procs_waiting_for_join_done; + + vlib_one_time_waiting_process_t *procs_waiting_for_open_window; + + /* Next sequence number to use */ + u32 our_local_sequence; + + /* + * Last global sequence we processed. + * When supplying catchup data, we need to tell + * the client precisely where to start replaying + */ + u32 last_global_sequence_processed; + + /* Vector of unique messages we've sent on this stream. */ + mc_serialize_stream_msg_t *stream_msgs; + + /* Vector global message index into per stream message index. */ + u32 *stream_msg_index_by_global_index; + + /* Hashed by message name. */ + uword *stream_msg_index_by_name; + + u64 user_requests_sent; + u64 user_requests_received; +} mc_stream_t; + +always_inline void +mc_stream_free (mc_stream_t * s) +{ + pool_free (s->retry_pool); + hash_free (s->retry_index_by_local_sequence); + clib_fifo_free (s->catchup_fifo); + pool_free (s->peers); + mhash_free (&s->peer_index_by_id); + vec_free (s->procs_waiting_for_join_done); + vec_free (s->procs_waiting_for_open_window); +} + +always_inline void +mc_stream_init (mc_stream_t * s) +{ + memset (s, 0, sizeof (s[0])); + s->retry_head_index = s->retry_tail_index = ~0; +} + +typedef struct +{ + u32 stream_index; + u32 catchup_opaque; + u8 *catchup_snapshot; +} mc_catchup_process_arg_t; + +typedef enum +{ + MC_RELAY_STATE_NEGOTIATE, + MC_RELAY_STATE_MASTER, + MC_RELAY_STATE_SLAVE, +} mc_relay_state_t; + +typedef struct +{ + mc_peer_id_t peer_id; + + f64 time_last_master_assert_received; +} mc_mastership_peer_t; + +typedef struct +{ + u32 stream_index; + u32 buffer_index; +} mc_stream_and_buffer_t; + +typedef struct mc_main_t +{ + mc_relay_state_t relay_state; + + /* Mastership */ + u32 we_can_be_relay_master; + + u64 relay_master_peer_id; + + mc_mastership_peer_t *mastership_peers; + + /* Map of 64 bit id to index in stream pool. */ + mhash_t mastership_peer_index_by_id; + + /* The transport we're using. */ + mc_transport_t transport; + + /* Last-used global sequence number. */ + u32 relay_global_sequence; + + /* Vector of streams. */ + mc_stream_t *stream_vector; + + /* Hash table mapping stream name to pool index. */ + uword *stream_index_by_name; + + uword *procs_waiting_for_stream_name_by_name; + + vlib_one_time_waiting_process_t **procs_waiting_for_stream_name_pool; + + int joins_in_progress; + + mc_catchup_process_arg_t *catchup_process_args; + + /* Node indices for mastership, join ager, + retry and catchup processes. */ + u32 mastership_process; + u32 join_ager_process; + u32 retry_process; + u32 catchup_process; + u32 unserialize_process; + + /* Global vector of messages. */ + mc_serialize_msg_t **global_msgs; + + /* Hash table mapping message name to index. */ + uword *global_msg_index_by_name; + + /* Shared serialize/unserialize main. */ + serialize_main_t serialize_mains[VLIB_N_RX_TX]; + + vlib_serialize_buffer_main_t serialize_buffer_mains[VLIB_N_RX_TX]; + + /* Convenience variables */ + struct vlib_main_t *vlib_main; + elog_main_t *elog_main; + + /* Maps 64 bit peer id to elog string table offset for this formatted peer id. */ + mhash_t elog_id_by_peer_id; + + uword *elog_id_by_msg_name; + + /* For mc_unserialize. */ + mc_stream_and_buffer_t *mc_unserialize_stream_and_buffers; +} mc_main_t; + +always_inline mc_stream_t * +mc_stream_by_name (mc_main_t * m, char *name) +{ + uword *p = hash_get (m->stream_index_by_name, name); + return p ? vec_elt_at_index (m->stream_vector, p[0]) : 0; +} + +always_inline mc_stream_t * +mc_stream_by_index (mc_main_t * m, u32 i) +{ + return i < vec_len (m->stream_vector) ? m->stream_vector + i : 0; +} + +always_inline void +mc_clear_stream_stats (mc_main_t * m) +{ + mc_stream_t *s; + mc_stream_peer_t *p; + vec_foreach (s, m->stream_vector) + { + s->stats_last_clear = s->stats; + /* *INDENT-OFF* */ + pool_foreach (p, s->peers, ({ + p->stats_last_clear = p->stats; + })); + /* *INDENT-ON* */ + } +} + +/* Declare all message handlers. */ +#define _(f) void mc_msg_##f##_handler (mc_main_t * mcm, mc_msg_##f##_t * msg, u32 buffer_index); +foreach_mc_msg_type +#undef _ + u32 mc_stream_join (mc_main_t * mcm, mc_stream_config_t *); + +void mc_stream_leave (mc_main_t * mcm, u32 stream_index); + +void mc_wait_for_stream_ready (mc_main_t * m, char *stream_name); + +u32 mc_stream_send (mc_main_t * mcm, u32 stream_index, u32 buffer_index); + +void mc_main_init (mc_main_t * mcm, char *tag); + +void mc_enable_disable_mastership (mc_main_t * mcm, int we_can_be_master); + +void *mc_get_vlib_buffer (struct vlib_main_t *vm, u32 n_bytes, + u32 * bi_return); + +format_function_t format_mc_main; + +clib_error_t *mc_serialize_internal (mc_main_t * mc, + u32 stream_index, + u32 multiple_messages_per_vlib_buffer, + mc_serialize_msg_t * msg, ...); + +clib_error_t *mc_serialize_va (mc_main_t * mc, + u32 stream_index, + u32 multiple_messages_per_vlib_buffer, + mc_serialize_msg_t * msg, va_list * va); + +#define mc_serialize_stream(mc,si,msg,args...) \ + mc_serialize_internal((mc),(si),(0),(msg),(msg)->serialize,args) + +#define mc_serialize(mc,msg,args...) \ + mc_serialize_internal((mc),(~0),(0),(msg),(msg)->serialize,args) + +#define mc_serialize2(mc,add,msg,args...) \ + mc_serialize_internal((mc),(~0),(add),(msg),(msg)->serialize,args) + +void mc_unserialize (mc_main_t * mcm, mc_stream_t * s, u32 buffer_index); +uword mc_unserialize_message (mc_main_t * mcm, mc_stream_t * s, + serialize_main_t * m); + +serialize_function_t serialize_mc_main, unserialize_mc_main; + +always_inline uword +mc_max_message_size_in_bytes (mc_main_t * mcm) +{ + return mcm->transport.max_packet_size - sizeof (mc_msg_user_request_t); +} + +always_inline word +mc_serialize_n_bytes_left (mc_main_t * mcm, serialize_main_t * m) +{ + return mc_max_message_size_in_bytes (mcm) - + serialize_vlib_buffer_n_bytes (m); +} + +void unserialize_mc_stream (serialize_main_t * m, va_list * va); +void mc_stream_join_process_hold (void); + +#endif /* included_vlib_mc_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/node.c b/src/vlib/node.c new file mode 100644 index 00000000..c419a13a --- /dev/null +++ b/src/vlib/node.c @@ -0,0 +1,631 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * node.c: VLIB processing nodes + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include + +/* Query node given name. */ +vlib_node_t * +vlib_get_node_by_name (vlib_main_t * vm, u8 * name) +{ + vlib_node_main_t *nm = &vm->node_main; + uword *p; + u8 *key = name; + if (!clib_mem_is_heap_object (key)) + key = format (0, "%s", key); + p = hash_get (nm->node_by_name, key); + if (key != name) + vec_free (key); + return p ? vec_elt (nm->nodes, p[0]) : 0; +} + +static void +node_set_elog_name (vlib_main_t * vm, uword node_index) +{ + vlib_node_t *n = vlib_get_node (vm, node_index); + elog_event_type_t *t; + + t = vec_elt_at_index (vm->node_call_elog_event_types, node_index); + vec_free (t->format); + t->format = (char *) format (0, "%v-call: %%d%c", n->name, 0); + + t = vec_elt_at_index (vm->node_return_elog_event_types, node_index); + vec_free (t->format); + t->format = (char *) format (0, "%v-return: %%d%c", n->name, 0); + + n->name_elog_string = elog_string (&vm->elog_main, "%v%c", n->name, 0); +} + +void +vlib_node_rename (vlib_main_t * vm, u32 node_index, char *fmt, ...) +{ + va_list va; + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n = vlib_get_node (vm, node_index); + + va_start (va, fmt); + hash_unset (nm->node_by_name, n->name); + vec_free (n->name); + n->name = va_format (0, fmt, &va); + va_end (va); + hash_set (nm->node_by_name, n->name, n->index); + + node_set_elog_name (vm, node_index); +} + +static void +vlib_node_runtime_update (vlib_main_t * vm, u32 node_index, u32 next_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_runtime_t *r, *s; + vlib_node_t *node, *next_node; + vlib_next_frame_t *nf; + vlib_pending_frame_t *pf; + i32 i, j, n_insert; + + ASSERT (os_get_cpu_number () == 0); + + vlib_worker_thread_barrier_sync (vm); + + node = vec_elt (nm->nodes, node_index); + r = vlib_node_get_runtime (vm, node_index); + + n_insert = vec_len (node->next_nodes) - r->n_next_nodes; + if (n_insert > 0) + { + i = r->next_frame_index + r->n_next_nodes; + vec_insert (nm->next_frames, n_insert, i); + + /* Initialize newly inserted next frames. */ + for (j = 0; j < n_insert; j++) + vlib_next_frame_init (nm->next_frames + i + j); + + /* Relocate other next frames at higher indices. */ + for (j = 0; j < vec_len (nm->nodes); j++) + { + s = vlib_node_get_runtime (vm, j); + if (j != node_index && s->next_frame_index >= i) + s->next_frame_index += n_insert; + } + + /* Pending frames may need to be relocated also. */ + vec_foreach (pf, nm->pending_frames) + { + if (pf->next_frame_index != VLIB_PENDING_FRAME_NO_NEXT_FRAME + && pf->next_frame_index >= i) + pf->next_frame_index += n_insert; + } + /* *INDENT-OFF* */ + pool_foreach (pf, nm->suspended_process_frames, ({ + if (pf->next_frame_index != ~0 && pf->next_frame_index >= i) + pf->next_frame_index += n_insert; + })); + /* *INDENT-ON* */ + + r->n_next_nodes = vec_len (node->next_nodes); + } + + /* Set frame's node runtime index. */ + next_node = vlib_get_node (vm, node->next_nodes[next_index]); + nf = nm->next_frames + r->next_frame_index + next_index; + nf->node_runtime_index = next_node->runtime_index; + + vlib_worker_thread_node_runtime_update (); + + vlib_worker_thread_barrier_release (vm); +} + +/* Add next node to given node in given slot. */ +uword +vlib_node_add_next_with_slot (vlib_main_t * vm, + uword node_index, + uword next_node_index, uword slot) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *node, *next; + uword *p; + + node = vec_elt (nm->nodes, node_index); + next = vec_elt (nm->nodes, next_node_index); + + /* Runtime has to be initialized. */ + ASSERT (nm->flags & VLIB_NODE_MAIN_RUNTIME_STARTED); + + if ((p = hash_get (node->next_slot_by_node, next_node_index))) + { + /* Next already exists: slot must match. */ + if (slot != ~0) + ASSERT (slot == p[0]); + return p[0]; + } + + if (slot == ~0) + slot = vec_len (node->next_nodes); + + vec_validate_init_empty (node->next_nodes, slot, ~0); + vec_validate (node->n_vectors_by_next_node, slot); + + node->next_nodes[slot] = next_node_index; + hash_set (node->next_slot_by_node, next_node_index, slot); + + vlib_node_runtime_update (vm, node_index, slot); + + next->prev_node_bitmap = clib_bitmap_ori (next->prev_node_bitmap, + node_index); + + /* Siblings all get same node structure. */ + { + uword sib_node_index, sib_slot; + vlib_node_t *sib_node; + /* *INDENT-OFF* */ + clib_bitmap_foreach (sib_node_index, node->sibling_bitmap, ({ + sib_node = vec_elt (nm->nodes, sib_node_index); + if (sib_node != node) + { + sib_slot = vlib_node_add_next_with_slot (vm, sib_node_index, next_node_index, slot); + ASSERT (sib_slot == slot); + } + })); + /* *INDENT-ON* */ + } + + return slot; +} + +/* Add named next node to given node in given slot. */ +uword +vlib_node_add_named_next_with_slot (vlib_main_t * vm, + uword node, char *name, uword slot) +{ + vlib_node_main_t *nm; + vlib_node_t *n, *n_next; + + nm = &vm->node_main; + n = vlib_get_node (vm, node); + + n_next = vlib_get_node_by_name (vm, (u8 *) name); + if (!n_next) + { + if (nm->flags & VLIB_NODE_MAIN_RUNTIME_STARTED) + return ~0; + + if (slot == ~0) + slot = clib_max (vec_len (n->next_node_names), + vec_len (n->next_nodes)); + vec_validate (n->next_node_names, slot); + n->next_node_names[slot] = name; + return slot; + } + + return vlib_node_add_next_with_slot (vm, node, n_next->index, slot); +} + +static void +node_elog_init (vlib_main_t * vm, uword ni) +{ + elog_event_type_t t; + + memset (&t, 0, sizeof (t)); + + /* 2 event types for this node: one when node function is called. + One when it returns. */ + vec_validate (vm->node_call_elog_event_types, ni); + vm->node_call_elog_event_types[ni] = t; + + vec_validate (vm->node_return_elog_event_types, ni); + vm->node_return_elog_event_types[ni] = t; + + node_set_elog_name (vm, ni); +} + +#ifdef CLIB_UNIX +#define STACK_ALIGN (clib_mem_get_page_size()) +#else +#define STACK_ALIGN CLIB_CACHE_LINE_BYTES +#endif + +static void +register_node (vlib_main_t * vm, vlib_node_registration_t * r) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n; + u32 page_size = clib_mem_get_page_size (); + int i; + + if (CLIB_DEBUG > 0) + { + /* Default (0) type should match INTERNAL. */ + vlib_node_t zero = { 0 }; + ASSERT (VLIB_NODE_TYPE_INTERNAL == zero.type); + } + + ASSERT (r->function != 0); + + n = clib_mem_alloc_no_fail (sizeof (n[0])); + memset (n, 0, sizeof (n[0])); + n->index = vec_len (nm->nodes); + + vec_add1 (nm->nodes, n); + + /* Name is always a vector so it can be formatted with %v. */ + if (clib_mem_is_heap_object (vec_header (r->name, 0))) + n->name = vec_dup ((u8 *) r->name); + else + n->name = format (0, "%s", r->name); + + if (!nm->node_by_name) + nm->node_by_name = hash_create_vec ( /* size */ 32, + sizeof (n->name[0]), sizeof (uword)); + + /* Node names must be unique. */ + { + vlib_node_t *o = vlib_get_node_by_name (vm, n->name); + if (o) + clib_error ("more than one node named `%v'", n->name); + } + + hash_set (nm->node_by_name, n->name, n->index); + + r->index = n->index; /* save index in registration */ + n->function = r->function; + + /* Node index of next sibling will be filled in by vlib_node_main_init. */ + n->sibling_of = r->sibling_of; + if (r->sibling_of && r->n_next_nodes > 0) + clib_error ("sibling node should not have any next nodes `%v'", n->name); + + if (r->type == VLIB_NODE_TYPE_INTERNAL) + ASSERT (r->vector_size > 0); + +#define _(f) n->f = r->f + + _(type); + _(flags); + _(state); + _(scalar_size); + _(vector_size); + _(format_buffer); + _(unformat_buffer); + _(format_trace); + _(validate_frame); + + /* Register error counters. */ + vlib_register_errors (vm, n->index, r->n_errors, r->error_strings); + node_elog_init (vm, n->index); + + _(runtime_data_bytes); + if (r->runtime_data_bytes > 0) + { + vec_resize (n->runtime_data, r->runtime_data_bytes); + if (r->runtime_data) + clib_memcpy (n->runtime_data, r->runtime_data, r->runtime_data_bytes); + } + + vec_resize (n->next_node_names, r->n_next_nodes); + for (i = 0; i < r->n_next_nodes; i++) + n->next_node_names[i] = r->next_nodes[i]; + + vec_validate_init_empty (n->next_nodes, r->n_next_nodes - 1, ~0); + vec_validate (n->n_vectors_by_next_node, r->n_next_nodes - 1); + + n->owner_node_index = n->owner_next_index = ~0; + + /* Initialize node runtime. */ + { + vlib_node_runtime_t *rt; + u32 i; + + if (n->type == VLIB_NODE_TYPE_PROCESS) + { + vlib_process_t *p; + uword log2_n_stack_bytes; + + log2_n_stack_bytes = clib_max (r->process_log2_n_stack_bytes, 15); + +#ifdef CLIB_UNIX + /* + * Bump the stack size if running over a kernel with a large page size, + * and the stack isn't any too big to begin with. Otherwise, we'll + * trip over the stack guard page for sure. + */ + if ((page_size > (4 << 10)) && log2_n_stack_bytes < 19) + { + if ((1 << log2_n_stack_bytes) <= page_size) + log2_n_stack_bytes = min_log2 (page_size) + 1; + else + log2_n_stack_bytes++; + } +#endif + + p = clib_mem_alloc_aligned_at_offset + (sizeof (p[0]) + (1 << log2_n_stack_bytes), + STACK_ALIGN, STRUCT_OFFSET_OF (vlib_process_t, stack), + 0 /* no, don't call os_out_of_memory */ ); + if (p == 0) + clib_panic ("failed to allocate process stack (%d bytes)", + 1 << log2_n_stack_bytes); + + memset (p, 0, sizeof (p[0])); + p->log2_n_stack_bytes = log2_n_stack_bytes; + + /* Process node's runtime index is really index into process + pointer vector. */ + n->runtime_index = vec_len (nm->processes); + + vec_add1 (nm->processes, p); + + /* Paint first stack word with magic number so we can at least + detect process stack overruns. */ + p->stack[0] = VLIB_PROCESS_STACK_MAGIC; + + /* Node runtime is stored inside of process. */ + rt = &p->node_runtime; + +#ifdef CLIB_UNIX + /* + * Disallow writes to the bottom page of the stack, to + * catch stack overflows. + */ + if (mprotect (p->stack, page_size, PROT_READ) < 0) + clib_unix_warning ("process stack"); +#endif + + } + else + { + vec_add2_aligned (nm->nodes_by_type[n->type], rt, 1, + /* align */ CLIB_CACHE_LINE_BYTES); + n->runtime_index = rt - nm->nodes_by_type[n->type]; + } + + if (n->type == VLIB_NODE_TYPE_INPUT) + nm->input_node_counts_by_state[n->state] += 1; + + rt->function = n->function; + rt->flags = n->flags; + rt->state = n->state; + rt->node_index = n->index; + + rt->n_next_nodes = r->n_next_nodes; + rt->next_frame_index = vec_len (nm->next_frames); + + vec_resize (nm->next_frames, rt->n_next_nodes); + for (i = 0; i < rt->n_next_nodes; i++) + vlib_next_frame_init (nm->next_frames + rt->next_frame_index + i); + + vec_resize (rt->errors, r->n_errors); + for (i = 0; i < vec_len (rt->errors); i++) + rt->errors[i] = vlib_error_set (n->index, i); + + STATIC_ASSERT_SIZEOF (vlib_node_runtime_t, 128); + ASSERT (vec_len (n->runtime_data) <= + sizeof (vlib_node_runtime_t) - + STRUCT_OFFSET_OF (vlib_node_runtime_t, runtime_data)); + + if (vec_len (n->runtime_data) > 0) + clib_memcpy (rt->runtime_data, n->runtime_data, + vec_len (n->runtime_data)); + + vec_free (n->runtime_data); + } +} + +/* Register new packet processing node. */ +u32 +vlib_register_node (vlib_main_t * vm, vlib_node_registration_t * r) +{ + register_node (vm, r); + return r->index; +} + +static uword +null_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + u16 n_vectors = frame->n_vectors; + + vlib_node_increment_counter (vm, node->node_index, 0, n_vectors); + vlib_buffer_free (vm, vlib_frame_args (frame), n_vectors); + vlib_frame_free (vm, node, frame); + + return n_vectors; +} + +void +vlib_register_all_static_nodes (vlib_main_t * vm) +{ + vlib_node_registration_t *r; + + static char *null_node_error_strings[] = { + "blackholed packets", + }; + + static vlib_node_registration_t null_node_reg = { + .function = null_node_fn, + .vector_size = sizeof (u32), + .name = "null-node", + .n_errors = 1, + .error_strings = null_node_error_strings, + }; + + /* make sure that node index 0 is not used by + real node */ + register_node (vm, &null_node_reg); + + r = vm->node_main.node_registrations; + while (r) + { + register_node (vm, r); + r = r->next_registration; + } +} + +clib_error_t * +vlib_node_main_init (vlib_main_t * vm) +{ + vlib_node_main_t *nm = &vm->node_main; + clib_error_t *error = 0; + vlib_node_t *n; + uword ni; + + nm->flags |= VLIB_NODE_MAIN_RUNTIME_STARTED; + + /* Generate sibling relationships */ + { + vlib_node_t *n, *sib; + uword si; + + for (ni = 0; ni < vec_len (nm->nodes); ni++) + { + n = vec_elt (nm->nodes, ni); + + if (!n->sibling_of) + continue; + + sib = vlib_get_node_by_name (vm, (u8 *) n->sibling_of); + if (!sib) + { + error = clib_error_create ("sibling `%s' not found for node `%v'", + n->sibling_of, n->name); + goto done; + } + + /* *INDENT-OFF* */ + clib_bitmap_foreach (si, sib->sibling_bitmap, ({ + vlib_node_t * m = vec_elt (nm->nodes, si); + + /* Connect all of sibling's siblings to us. */ + m->sibling_bitmap = clib_bitmap_ori (m->sibling_bitmap, n->index); + + /* Connect us to all of sibling's siblings. */ + n->sibling_bitmap = clib_bitmap_ori (n->sibling_bitmap, si); + })); + /* *INDENT-ON* */ + + /* Connect sibling to us. */ + sib->sibling_bitmap = clib_bitmap_ori (sib->sibling_bitmap, n->index); + + /* Connect us to sibling. */ + n->sibling_bitmap = clib_bitmap_ori (n->sibling_bitmap, sib->index); + } + } + + /* Resolve next names into next indices. */ + for (ni = 0; ni < vec_len (nm->nodes); ni++) + { + uword i; + + n = vec_elt (nm->nodes, ni); + + for (i = 0; i < vec_len (n->next_node_names); i++) + { + char *a = n->next_node_names[i]; + + if (!a) + continue; + + if (~0 == vlib_node_add_named_next_with_slot (vm, n->index, a, i)) + { + error = clib_error_create + ("node `%v' refers to unknown node `%s'", n->name, a); + goto done; + } + } + + vec_free (n->next_node_names); + } + + /* Set previous node pointers. */ + for (ni = 0; ni < vec_len (nm->nodes); ni++) + { + vlib_node_t *n_next; + uword i; + + n = vec_elt (nm->nodes, ni); + + for (i = 0; i < vec_len (n->next_nodes); i++) + { + if (n->next_nodes[i] >= vec_len (nm->nodes)) + continue; + + n_next = vec_elt (nm->nodes, n->next_nodes[i]); + n_next->prev_node_bitmap = + clib_bitmap_ori (n_next->prev_node_bitmap, n->index); + } + } + + { + vlib_next_frame_t *nf; + vlib_node_runtime_t *r; + vlib_node_t *next; + uword i; + + vec_foreach (r, nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]) + { + if (r->n_next_nodes == 0) + continue; + + n = vlib_get_node (vm, r->node_index); + nf = vec_elt_at_index (nm->next_frames, r->next_frame_index); + + for (i = 0; i < vec_len (n->next_nodes); i++) + { + next = vlib_get_node (vm, n->next_nodes[i]); + + /* Validate node runtime indices are correctly initialized. */ + ASSERT (nf[i].node_runtime_index == next->runtime_index); + + nf[i].flags = 0; + if (next->flags & VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH) + nf[i].flags |= VLIB_FRAME_NO_FREE_AFTER_DISPATCH; + } + } + } + +done: + return error; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/node.h b/src/vlib/node.h new file mode 100644 index 00000000..b624e9d6 --- /dev/null +++ b/src/vlib/node.h @@ -0,0 +1,725 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * node.h: VLIB processing nodes + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_node_h +#define included_vlib_node_h + +#include +#include +#include +#include /* for vlib_trace_filter_t */ + +/* Forward declaration. */ +struct vlib_node_runtime_t; +struct vlib_frame_t; + +/* Internal nodes (including output nodes) move data from node to + node (or out of the graph for output nodes). */ +typedef uword (vlib_node_function_t) (struct vlib_main_t * vm, + struct vlib_node_runtime_t * node, + struct vlib_frame_t * frame); + +typedef enum +{ + /* An internal node on the call graph (could be output). */ + VLIB_NODE_TYPE_INTERNAL, + + /* Nodes which input data into the processing graph. + Input nodes are called for each iteration of main loop. */ + VLIB_NODE_TYPE_INPUT, + + /* Nodes to be called before all input nodes. + Used, for example, to clean out driver TX rings before + processing input. */ + VLIB_NODE_TYPE_PRE_INPUT, + + /* "Process" nodes which can be suspended and later resumed. */ + VLIB_NODE_TYPE_PROCESS, + + VLIB_N_NODE_TYPE, +} vlib_node_type_t; + +typedef struct _vlib_node_registration +{ + /* Vector processing function for this node. */ + vlib_node_function_t *function; + + /* Node name. */ + char *name; + + /* Name of sibling (if applicable). */ + char *sibling_of; + + /* Node index filled in by registration. */ + u32 index; + + /* Type of this node. */ + vlib_node_type_t type; + + /* Error strings indexed by error code for this node. */ + char **error_strings; + + /* Buffer format/unformat for this node. */ + format_function_t *format_buffer; + unformat_function_t *unformat_buffer; + + /* Trace format/unformat for this node. */ + format_function_t *format_trace; + unformat_function_t *unformat_trace; + + /* Function to validate incoming frames. */ + u8 *(*validate_frame) (struct vlib_main_t * vm, + struct vlib_node_runtime_t *, + struct vlib_frame_t * f); + + /* Per-node runtime data. */ + void *runtime_data; + + /* Process stack size. */ + u16 process_log2_n_stack_bytes; + + /* Number of bytes of per-node run time data. */ + u8 runtime_data_bytes; + + /* State for input nodes. */ + u8 state; + + /* Node flags. */ + u16 flags; + + /* Size of scalar and vector arguments in bytes. */ + u16 scalar_size, vector_size; + + /* Number of error codes used by this node. */ + u16 n_errors; + + /* Number of next node names that follow. */ + u16 n_next_nodes; + + /* Constructor link-list, don't ask... */ + struct _vlib_node_registration *next_registration; + + /* Names of next nodes which this node feeds into. */ + char *next_nodes[]; + +} vlib_node_registration_t; + +#define VLIB_REGISTER_NODE(x,...) \ + __VA_ARGS__ vlib_node_registration_t x; \ +static void __vlib_add_node_registration_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_add_node_registration_##x (void) \ +{ \ + vlib_main_t * vm = vlib_get_main(); \ + x.next_registration = vm->node_main.node_registrations; \ + vm->node_main.node_registrations = &x; \ +} \ +__VA_ARGS__ vlib_node_registration_t x + +#if CLIB_DEBUG > 0 +#define VLIB_NODE_FUNCTION_CLONE_TEMPLATE(arch, fn) +#define VLIB_NODE_FUNCTION_MULTIARCH_CLONE(fn) +#define VLIB_NODE_FUNCTION_MULTIARCH(node, fn) +#else +#define VLIB_NODE_FUNCTION_CLONE_TEMPLATE(arch, fn, tgt) \ + uword \ + __attribute__ ((flatten)) \ + __attribute__ ((target (tgt))) \ + CLIB_CPU_OPTIMIZED \ + fn ## _ ## arch ( struct vlib_main_t * vm, \ + struct vlib_node_runtime_t * node, \ + struct vlib_frame_t * frame) \ + { return fn (vm, node, frame); } + +#define VLIB_NODE_FUNCTION_MULTIARCH_CLONE(fn) \ + foreach_march_variant(VLIB_NODE_FUNCTION_CLONE_TEMPLATE, fn) + +#define VLIB_NODE_FUNCTION_MULTIARCH(node, fn) \ + VLIB_NODE_FUNCTION_MULTIARCH_CLONE(fn) \ + CLIB_MULTIARCH_SELECT_FN(fn, static inline) \ + static void __attribute__((__constructor__)) \ + __vlib_node_function_multiarch_select_##node (void) \ + { node.function = fn ## _multiarch_select(); } +#endif + +always_inline vlib_node_registration_t * +vlib_node_next_registered (vlib_node_registration_t * c) +{ + c = + clib_elf_section_data_next (c, + c->n_next_nodes * sizeof (c->next_nodes[0])); + return c; +} + +typedef struct +{ + /* Total calls, clock ticks and vector elements processed for this node. */ + u64 calls, vectors, clocks, suspends; + u64 max_clock; + u64 max_clock_n; +} vlib_node_stats_t; + +#define foreach_vlib_node_state \ + /* Input node is called each iteration of main loop. \ + This is the default (zero). */ \ + _ (POLLING) \ + /* Input node is called when device signals an interrupt. */ \ + _ (INTERRUPT) \ + /* Input node is never called. */ \ + _ (DISABLED) + +typedef enum +{ +#define _(f) VLIB_NODE_STATE_##f, + foreach_vlib_node_state +#undef _ + VLIB_N_NODE_STATE, +} vlib_node_state_t; + +typedef struct vlib_node_t +{ + /* Vector processing function for this node. */ + vlib_node_function_t *function; + + /* Node name. */ + u8 *name; + + /* Node name index in elog string table. */ + u32 name_elog_string; + + /* Total statistics for this node. */ + vlib_node_stats_t stats_total; + + /* Saved values as of last clear (or zero if never cleared). + Current values are always stats_total - stats_last_clear. */ + vlib_node_stats_t stats_last_clear; + + /* Type of this node. */ + vlib_node_type_t type; + + /* Node index. */ + u32 index; + + /* Index of corresponding node runtime. */ + u32 runtime_index; + + /* Runtime data for this node. */ + void *runtime_data; + + /* Node flags. */ + u16 flags; + + /* Processing function keeps frame. Tells node dispatching code not + to free frame after dispatch is done. */ +#define VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH (1 << 0) + + /* Node counts as output/drop/punt node for stats purposes. */ +#define VLIB_NODE_FLAG_IS_OUTPUT (1 << 1) +#define VLIB_NODE_FLAG_IS_DROP (1 << 2) +#define VLIB_NODE_FLAG_IS_PUNT (1 << 3) +#define VLIB_NODE_FLAG_IS_HANDOFF (1 << 4) + + /* Set if current node runtime has traced vectors. */ +#define VLIB_NODE_FLAG_TRACE (1 << 5) + +#define VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE (1 << 6) +#define VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE (1 << 7) + + /* State for input nodes. */ + u8 state; + + /* Number of bytes of run time data. */ + u8 runtime_data_bytes; + + /* Number of error codes used by this node. */ + u16 n_errors; + + /* Size of scalar and vector arguments in bytes. */ + u16 scalar_size, vector_size; + + /* Handle/index in error heap for this node. */ + u32 error_heap_handle; + u32 error_heap_index; + + /* Error strings indexed by error code for this node. */ + char **error_strings; + + /* Vector of next node names. + Only used before next_nodes array is initialized. */ + char **next_node_names; + + /* Next node indices for this node. */ + u32 *next_nodes; + + /* Name of node that we are sibling of. */ + char *sibling_of; + + /* Bitmap of all of this node's siblings. */ + uword *sibling_bitmap; + + /* Total number of vectors sent to each next node. */ + u64 *n_vectors_by_next_node; + + /* Hash table mapping next node index into slot in + next_nodes vector. Quickly determines whether this node + is connected to given next node and, if so, with which slot. */ + uword *next_slot_by_node; + + /* Bitmap of node indices which feed this node. */ + uword *prev_node_bitmap; + + /* Node/next-index which own enqueue rights with to this node. */ + u32 owner_node_index, owner_next_index; + + /* Buffer format/unformat for this node. */ + format_function_t *format_buffer; + unformat_function_t *unformat_buffer; + + /* Trace buffer format/unformat for this node. */ + format_function_t *format_trace; + + /* Function to validate incoming frames. */ + u8 *(*validate_frame) (struct vlib_main_t * vm, + struct vlib_node_runtime_t *, + struct vlib_frame_t * f); + /* for pretty-printing, not typically valid */ + u8 *state_string; +} vlib_node_t; + +#define VLIB_INVALID_NODE_INDEX ((u32) ~0) + +/* Max number of vector elements to process at once per node. */ +#define VLIB_FRAME_SIZE 256 +#define VLIB_FRAME_ALIGN VLIB_MAX_CPUS + +/* Calling frame (think stack frame) for a node. */ +typedef struct vlib_frame_t +{ + /* Frame flags. */ + u16 flags; + + /* Number of scalar bytes in arguments. */ + u8 scalar_size; + + /* Number of bytes per vector argument. */ + u8 vector_size; + + /* Number of vector elements currently in frame. */ + u16 n_vectors; + + /* Owner cpuid / heap id */ + u16 cpu_index; + + /* Scalar and vector arguments to next node. */ + u8 arguments[0]; +} vlib_frame_t; + +typedef struct +{ + /* Frame index. */ + u32 frame_index; + + /* Node runtime for this next. */ + u32 node_runtime_index; + + /* Next frame flags. */ + u32 flags; + + /* Reflects node frame-used flag for this next. */ +#define VLIB_FRAME_NO_FREE_AFTER_DISPATCH \ + VLIB_NODE_FLAG_FRAME_NO_FREE_AFTER_DISPATCH + + /* This next frame owns enqueue to node + corresponding to node_runtime_index. */ +#define VLIB_FRAME_OWNER (1 << 15) + + /* Set when frame has been allocated for this next. */ +#define VLIB_FRAME_IS_ALLOCATED VLIB_NODE_FLAG_IS_OUTPUT + + /* Set when frame has been added to pending vector. */ +#define VLIB_FRAME_PENDING VLIB_NODE_FLAG_IS_DROP + + /* Set when frame is to be freed after dispatch. */ +#define VLIB_FRAME_FREE_AFTER_DISPATCH VLIB_NODE_FLAG_IS_PUNT + + /* Set when frame has traced packets. */ +#define VLIB_FRAME_TRACE VLIB_NODE_FLAG_TRACE + + /* Number of vectors enqueue to this next since last overflow. */ + u32 vectors_since_last_overflow; +} vlib_next_frame_t; + +always_inline void +vlib_next_frame_init (vlib_next_frame_t * nf) +{ + memset (nf, 0, sizeof (nf[0])); + nf->frame_index = ~0; + nf->node_runtime_index = ~0; +} + +/* A frame pending dispatch by main loop. */ +typedef struct +{ + /* Node and runtime for this frame. */ + u32 node_runtime_index; + + /* Frame index (in the heap). */ + u32 frame_index; + + /* Start of next frames for this node. */ + u32 next_frame_index; + + /* Special value for next_frame_index when there is no next frame. */ +#define VLIB_PENDING_FRAME_NO_NEXT_FRAME ((u32) ~0) +} vlib_pending_frame_t; + +typedef struct vlib_node_runtime_t +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + /* Node function to call. */ + vlib_node_function_t *function; + + /* Vector of errors for this node. */ + vlib_error_t *errors; + + /* Number of clock cycles. */ + u32 clocks_since_last_overflow; + + /* Maximum clock cycle for an invocation. */ + u32 max_clock; + + /* Number of vectors in the recorded max_clock. */ + u32 max_clock_n; + + /* Number of calls. */ + u32 calls_since_last_overflow; + + /* Number of vector elements processed by this node. */ + u32 vectors_since_last_overflow; + + /* Start of next frames for this node. */ + u32 next_frame_index; + + /* Node index. */ + u32 node_index; + + /* For input nodes: decremented on each main loop interation until it reaches zero + and function is called. Allows some input nodes to be called + more than others. */ + u32 input_main_loops_per_call; + + /* Saved main loop counter of last dispatch of this node. */ + u32 main_loop_count_last_dispatch; + + u32 main_loop_vector_stats[2]; + + /* Copy of main node flags. */ + u16 flags; + + /* Input node state. */ + u16 state; + + u16 n_next_nodes; + + /* Next frame index that vector arguments were last enqueued to + last time this node ran. Set to zero before first run + of this node. */ + u16 cached_next_index; + + /* CPU this node runs on */ + u16 cpu_index; + + /* Function dependent node-runtime. */ + u8 runtime_data[0]; +} +vlib_node_runtime_t; + +typedef struct +{ + /* Number of allocated frames for this scalar/vector size. */ + u32 n_alloc_frames; + + /* Vector of free frame indices for this scalar/vector size. */ + u32 *free_frame_indices; +} vlib_frame_size_t; + +typedef struct +{ + /* Users opaque value for event type. */ + uword opaque; +} vlib_process_event_type_t; + +typedef struct +{ + /* Node runtime for this process. */ + vlib_node_runtime_t node_runtime; + + /* Where to longjmp when process is done. */ + clib_longjmp_t return_longjmp; + +#define VLIB_PROCESS_RETURN_LONGJMP_RETURN ((uword) ~0 - 0) +#define VLIB_PROCESS_RETURN_LONGJMP_SUSPEND ((uword) ~0 - 1) + + /* Where to longjmp to resume node after suspend. */ + clib_longjmp_t resume_longjmp; +#define VLIB_PROCESS_RESUME_LONGJMP_SUSPEND 0 +#define VLIB_PROCESS_RESUME_LONGJMP_RESUME 1 + + u16 flags; +#define VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK (1 << 0) +#define VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT (1 << 1) + /* Set to indicate that this process has been added to resume vector. */ +#define VLIB_PROCESS_RESUME_PENDING (1 << 2) + + /* Process function is currently running. */ +#define VLIB_PROCESS_IS_RUNNING (1 << 3) + + /* Size of process stack. */ + u16 log2_n_stack_bytes; + + u32 suspended_process_frame_index; + + /* Number of times this process was suspended. */ + u32 n_suspends; + + /* Vectors of pending event data indexed by event type index. */ + void **pending_event_data_by_type_index; + + /* Bitmap of event type-indices with non-empty vectors. */ + uword *non_empty_event_type_bitmap; + + /* Bitmap of event type-indices which are one time events. */ + uword *one_time_event_type_bitmap; + + /* Type is opaque pointer -- typically a pointer to an event handler + function. Hash table to map opaque to a type index. */ + uword *event_type_index_by_type_opaque; + + /* Pool of currently valid event types. */ + vlib_process_event_type_t *event_type_pool; + + /* When suspending saves cpu cycle counter when process is to be resumed. */ + u64 resume_cpu_time; + + /* Default output function and its argument for any CLI outputs + within the process. */ + vlib_cli_output_function_t *output_function; + uword output_function_arg; + +#ifdef CLIB_UNIX + /* Pad to a multiple of the page size so we can mprotect process stacks */ +#define PAGE_SIZE_MULTIPLE 0x1000 +#define ALIGN_ON_MULTIPLE_PAGE_BOUNDARY_FOR_MPROTECT __attribute__ ((aligned (PAGE_SIZE_MULTIPLE))) +#else +#define ALIGN_ON_MULTIPLE_PAGE_BOUNDARY_FOR_MPROTECT +#endif + + /* Process stack. Starts here and extends 2^log2_n_stack_bytes + bytes. */ + +#define VLIB_PROCESS_STACK_MAGIC (0xdead7ead) + u32 stack[0] ALIGN_ON_MULTIPLE_PAGE_BOUNDARY_FOR_MPROTECT; +} vlib_process_t __attribute__ ((aligned (CLIB_CACHE_LINE_BYTES))); + +#ifdef CLIB_UNIX + /* Ensure that the stack is aligned on the multiple of the page size */ +typedef char + assert_process_stack_must_be_aligned_exactly_to_page_size_multiple[(sizeof + (vlib_process_t) + - + PAGE_SIZE_MULTIPLE) + == + 0 ? 0 : + -1]; +#endif + +typedef struct +{ + u32 node_index; + + u32 one_time_event; +} vlib_one_time_waiting_process_t; + +typedef struct +{ + u16 n_data_elts; + + u16 n_data_elt_bytes; + + /* n_data_elts * n_data_elt_bytes */ + u32 n_data_bytes; + + /* Process node & event type to be used to signal event. */ + u32 process_node_index; + + u32 event_type_index; + + union + { + u8 inline_event_data[64 - 3 * sizeof (u32) - 2 * sizeof (u16)]; + + /* Vector of event data used only when data does not fit inline. */ + u8 *event_data_as_vector; + }; +} +vlib_signal_timed_event_data_t; + +always_inline uword +vlib_timing_wheel_data_is_timed_event (u32 d) +{ + return d & 1; +} + +always_inline u32 +vlib_timing_wheel_data_set_suspended_process (u32 i) +{ + return 0 + 2 * i; +} + +always_inline u32 +vlib_timing_wheel_data_set_timed_event (u32 i) +{ + return 1 + 2 * i; +} + +always_inline uword +vlib_timing_wheel_data_get_index (u32 d) +{ + return d / 2; +} + +typedef struct +{ + /* Public nodes. */ + vlib_node_t **nodes; + + /* Node index hashed by node name. */ + uword *node_by_name; + + u32 flags; +#define VLIB_NODE_MAIN_RUNTIME_STARTED (1 << 0) + + /* Nodes segregated by type for cache locality. + Does not apply to nodes of type VLIB_NODE_TYPE_INTERNAL. */ + vlib_node_runtime_t *nodes_by_type[VLIB_N_NODE_TYPE]; + + /* Node runtime indices for input nodes with pending interrupts. */ + u32 *pending_interrupt_node_runtime_indices; + + /* Input nodes are switched from/to interrupt to/from polling mode + when average vector length goes above/below polling/interrupt + thresholds. */ + u32 polling_threshold_vector_length; + u32 interrupt_threshold_vector_length; + + /* Vector of next frames. */ + vlib_next_frame_t *next_frames; + + /* Vector of internal node's frames waiting to be called. */ + vlib_pending_frame_t *pending_frames; + + /* Timing wheel for scheduling time-based node dispatch. */ + timing_wheel_t timing_wheel; + + vlib_signal_timed_event_data_t *signal_timed_event_data_pool; + + /* Opaque data vector added via timing_wheel_advance. */ + u32 *data_from_advancing_timing_wheel; + + /* CPU time of next process to be ready on timing wheel. */ + u64 cpu_time_next_process_ready; + + /* Vector of process nodes. + One for each node of type VLIB_NODE_TYPE_PROCESS. */ + vlib_process_t **processes; + + /* Current running process or ~0 if no process running. */ + u32 current_process_index; + + /* Pool of pending process frames. */ + vlib_pending_frame_t *suspended_process_frames; + + /* Vector of event data vectors pending recycle. */ + void **recycled_event_data_vectors; + + /* Current counts of nodes in each state. */ + u32 input_node_counts_by_state[VLIB_N_NODE_STATE]; + + /* Hash of (scalar_size,vector_size) to frame_sizes index. */ + uword *frame_size_hash; + + /* Per-size frame allocation information. */ + vlib_frame_size_t *frame_sizes; + + /* Time of last node runtime stats clear. */ + f64 time_last_runtime_stats_clear; + + /* Node registrations added by constructors */ + vlib_node_registration_t *node_registrations; +} vlib_node_main_t; + + +#define FRAME_QUEUE_MAX_NELTS 32 +typedef struct +{ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + u64 head; + u64 head_hint; + u64 tail; + u32 n_in_use; + u32 nelts; + u32 written; + u32 threshold; + i32 n_vectors[FRAME_QUEUE_MAX_NELTS]; +} frame_queue_trace_t; + +typedef struct +{ + u64 count[FRAME_QUEUE_MAX_NELTS]; +} frame_queue_nelt_counter_t; + +#endif /* included_vlib_node_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/node_cli.c b/src/vlib/node_cli.c new file mode 100644 index 00000000..05d0f0b5 --- /dev/null +++ b/src/vlib/node_cli.c @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * node_cli.c: node CLI + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include + +static int +node_cmp (void *a1, void *a2) +{ + vlib_node_t **n1 = a1; + vlib_node_t **n2 = a2; + + return vec_cmp (n1[0]->name, n2[0]->name); +} + +static clib_error_t * +show_node_graph (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n; + u32 node_index; + + vlib_cli_output (vm, "%U\n", format_vlib_node_graph, nm, 0); + + if (unformat (input, "%U", unformat_vlib_node, vm, &node_index)) + { + n = vlib_get_node (vm, node_index); + vlib_cli_output (vm, "%U\n", format_vlib_node_graph, nm, n); + } + else + { + vlib_node_t **nodes = vec_dup (nm->nodes); + uword i; + + vec_sort_with_function (nodes, node_cmp); + + for (i = 0; i < vec_len (nodes); i++) + vlib_cli_output (vm, "%U\n\n", format_vlib_node_graph, nm, nodes[i]); + + vec_free (nodes); + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_node_graph_command, static) = { + .path = "show vlib graph", + .short_help = "Show packet processing node graph", + .function = show_node_graph, +}; +/* *INDENT-ON* */ + +static u8 * +format_vlib_node_stats (u8 * s, va_list * va) +{ + vlib_main_t *vm = va_arg (*va, vlib_main_t *); + vlib_node_t *n = va_arg (*va, vlib_node_t *); + int max = va_arg (*va, int); + f64 v; + char *state; + u8 *ns; + u8 *misc_info = 0; + u64 c, p, l, d; + f64 x; + f64 maxc, maxcn; + u32 maxn; + uword indent; + + if (!n) + { + if (max) + return format (s, + "%=30s%=17s%=16s%=16s%=16s%=16s", + "Name", "Max Node Clocks", "Vectors at Max", + "Max Clocks", "Avg Clocks", "Avg Vectors/Call"); + else + return format (s, + "%=30s%=12s%=16s%=16s%=16s%=16s%=16s", + "Name", "State", "Calls", "Vectors", "Suspends", + "Clocks", "Vectors/Call"); + } + + indent = format_get_indent (s); + + l = n->stats_total.clocks - n->stats_last_clear.clocks; + c = n->stats_total.calls - n->stats_last_clear.calls; + p = n->stats_total.vectors - n->stats_last_clear.vectors; + d = n->stats_total.suspends - n->stats_last_clear.suspends; + maxc = (f64) n->stats_total.max_clock; + maxn = n->stats_total.max_clock_n; + if (n->stats_total.max_clock_n) + maxcn = (f64) n->stats_total.max_clock / (f64) maxn; + else + maxcn = 0.0; + + /* Clocks per packet, per call or per suspend. */ + x = 0; + if (p > 0) + x = (f64) l / (f64) p; + else if (c > 0) + x = (f64) l / (f64) c; + else if (d > 0) + x = (f64) l / (f64) d; + + if (c > 0) + v = (double) p / (double) c; + else + v = 0; + + state = "active"; + if (n->type == VLIB_NODE_TYPE_PROCESS) + { + vlib_process_t *p = vlib_get_process_from_node (vm, n); + + /* Show processes with events pending. This helps spot bugs where events are not + being handled. */ + if (!clib_bitmap_is_zero (p->non_empty_event_type_bitmap)) + misc_info = format (misc_info, "events pending, "); + + switch (p->flags & (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK + | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT)) + { + default: + if (!(p->flags & VLIB_PROCESS_IS_RUNNING)) + state = "done"; + break; + + case VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK: + state = "time wait"; + break; + + case VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT: + state = "event wait"; + break; + + case (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK): + state = + "any wait"; + break; + } + } + else if (n->type != VLIB_NODE_TYPE_INTERNAL) + { + state = "polling"; + if (n->state == VLIB_NODE_STATE_DISABLED) + state = "disabled"; + else if (n->state == VLIB_NODE_STATE_INTERRUPT) + state = "interrupt wait"; + } + + ns = n->name; + + if (max) + s = format (s, "%-30v%=17.2e%=16d%=16.2e%=16.2e%=16.2e", + ns, maxc, maxn, maxcn, x, v); + else + s = format (s, "%-30v%=12s%16Ld%16Ld%16Ld%16.2e%16.2f", ns, state, + c, p, d, x, v); + + if (ns != n->name) + vec_free (ns); + + if (misc_info) + { + s = format (s, "\n%U%v", format_white_space, indent + 4, misc_info); + vec_free (misc_info); + } + + return s; +} + +static clib_error_t * +show_node_runtime (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n; + f64 time_now; + u32 node_index; + vlib_node_t ***node_dups = 0; + f64 *vectors_per_main_loop = 0; + f64 *last_vector_length_per_node = 0; + + time_now = vlib_time_now (vm); + + if (unformat (input, "%U", unformat_vlib_node, vm, &node_index)) + { + n = vlib_get_node (vm, node_index); + vlib_node_sync_stats (vm, n); + vlib_cli_output (vm, "%U\n", format_vlib_node_stats, vm, 0, 0); + vlib_cli_output (vm, "%U\n", format_vlib_node_stats, vm, n, 0); + } + else + { + vlib_node_t **nodes; + uword i, j; + f64 dt; + u64 n_input, n_output, n_drop, n_punt; + u64 n_internal_vectors, n_internal_calls; + u64 n_clocks, l, v, c, d; + int brief = 1; + int max = 0; + vlib_main_t **stat_vms = 0, *stat_vm; + + /* Suppress nodes with zero calls since last clear */ + if (unformat (input, "brief") || unformat (input, "b")) + brief = 1; + if (unformat (input, "verbose") || unformat (input, "v")) + brief = 0; + if (unformat (input, "max") || unformat (input, "m")) + max = 1; + + if (vec_len (vlib_mains) == 0) + vec_add1 (stat_vms, vm); + else + { + for (i = 0; i < vec_len (vlib_mains); i++) + { + stat_vm = vlib_mains[i]; + if (stat_vm) + vec_add1 (stat_vms, stat_vm); + } + } + + /* + * Barrier sync across stats scraping. + * Otherwise, the counts will be grossly inaccurate. + */ + vlib_worker_thread_barrier_sync (vm); + + for (j = 0; j < vec_len (stat_vms); j++) + { + stat_vm = stat_vms[j]; + nm = &stat_vm->node_main; + + for (i = 0; i < vec_len (nm->nodes); i++) + { + n = nm->nodes[i]; + vlib_node_sync_stats (stat_vm, n); + } + + nodes = vec_dup (nm->nodes); + + vec_add1 (node_dups, nodes); + vec_add1 (vectors_per_main_loop, + vlib_last_vectors_per_main_loop_as_f64 (stat_vm)); + vec_add1 (last_vector_length_per_node, + vlib_last_vector_length_per_node (stat_vm)); + } + vlib_worker_thread_barrier_release (vm); + + + for (j = 0; j < vec_len (stat_vms); j++) + { + stat_vm = stat_vms[j]; + nodes = node_dups[j]; + + vec_sort_with_function (nodes, node_cmp); + + n_input = n_output = n_drop = n_punt = n_clocks = 0; + n_internal_vectors = n_internal_calls = 0; + for (i = 0; i < vec_len (nodes); i++) + { + n = nodes[i]; + + l = n->stats_total.clocks - n->stats_last_clear.clocks; + n_clocks += l; + + v = n->stats_total.vectors - n->stats_last_clear.vectors; + c = n->stats_total.calls - n->stats_last_clear.calls; + + switch (n->type) + { + default: + continue; + + case VLIB_NODE_TYPE_INTERNAL: + n_output += (n->flags & VLIB_NODE_FLAG_IS_OUTPUT) ? v : 0; + n_drop += (n->flags & VLIB_NODE_FLAG_IS_DROP) ? v : 0; + n_punt += (n->flags & VLIB_NODE_FLAG_IS_PUNT) ? v : 0; + if (!(n->flags & VLIB_NODE_FLAG_IS_OUTPUT)) + { + n_internal_vectors += v; + n_internal_calls += c; + } + if (n->flags & VLIB_NODE_FLAG_IS_HANDOFF) + n_input += v; + break; + + case VLIB_NODE_TYPE_INPUT: + n_input += v; + break; + } + } + + if (vec_len (vlib_mains)) + { + vlib_worker_thread_t *w = vlib_worker_threads + j; + if (j > 0) + vlib_cli_output (vm, "---------------"); + + if (w->lcore_id > -1) + vlib_cli_output (vm, "Thread %d %s (lcore %u)", j, w->name, + w->lcore_id); + else + vlib_cli_output (vm, "Thread %d %s", j, w->name); + } + + dt = time_now - nm->time_last_runtime_stats_clear; + vlib_cli_output + (vm, + "Time %.1f, average vectors/node %.2f, last %d main loops %.2f per node %.2f" + "\n vector rates in %.4e, out %.4e, drop %.4e, punt %.4e", + dt, + (n_internal_calls > 0 + ? (f64) n_internal_vectors / (f64) n_internal_calls + : 0), + 1 << VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE, + vectors_per_main_loop[j], + last_vector_length_per_node[j], + (f64) n_input / dt, + (f64) n_output / dt, (f64) n_drop / dt, (f64) n_punt / dt); + + vlib_cli_output (vm, "%U", format_vlib_node_stats, stat_vm, 0, max); + for (i = 0; i < vec_len (nodes); i++) + { + c = + nodes[i]->stats_total.calls - + nodes[i]->stats_last_clear.calls; + d = + nodes[i]->stats_total.suspends - + nodes[i]->stats_last_clear.suspends; + if (c || d || !brief) + { + vlib_cli_output (vm, "%U", format_vlib_node_stats, stat_vm, + nodes[i], max); + } + } + vec_free (nodes); + } + vec_free (stat_vms); + vec_free (node_dups); + vec_free (vectors_per_main_loop); + vec_free (last_vector_length_per_node); + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_node_runtime_command, static) = { + .path = "show runtime", + .short_help = "Show packet processing runtime", + .function = show_node_runtime, + .is_mp_safe = 1, +}; +/* *INDENT-ON* */ + +static clib_error_t * +clear_node_runtime (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_node_main_t *nm; + vlib_node_t *n; + int i, j; + vlib_main_t **stat_vms = 0, *stat_vm; + vlib_node_runtime_t *r; + + if (vec_len (vlib_mains) == 0) + vec_add1 (stat_vms, vm); + else + { + for (i = 0; i < vec_len (vlib_mains); i++) + { + stat_vm = vlib_mains[i]; + if (stat_vm) + vec_add1 (stat_vms, stat_vm); + } + } + + vlib_worker_thread_barrier_sync (vm); + + for (j = 0; j < vec_len (stat_vms); j++) + { + stat_vm = stat_vms[j]; + nm = &stat_vm->node_main; + + for (i = 0; i < vec_len (nm->nodes); i++) + { + n = nm->nodes[i]; + vlib_node_sync_stats (stat_vm, n); + n->stats_last_clear = n->stats_total; + + r = vlib_node_get_runtime (stat_vm, n->index); + r->max_clock = 0; + } + /* Note: input/output rates computed using vlib_global_main */ + nm->time_last_runtime_stats_clear = vlib_time_now (vm); + } + + vlib_worker_thread_barrier_release (vm); + + vec_free (stat_vms); + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (clear_node_runtime_command, static) = { + .path = "clear runtime", + .short_help = "Clear packet processing runtime statistics", + .function = clear_node_runtime, +}; +/* *INDENT-ON* */ + +/* Dummy function to get us linked in. */ +void +vlib_node_cli_reference (void) +{ +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/node_format.c b/src/vlib/node_format.c new file mode 100644 index 00000000..e9dde40f --- /dev/null +++ b/src/vlib/node_format.c @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * node_format.c: node formatting + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include + +u8 * +format_vlib_node_graph (u8 * s, va_list * va) +{ + vlib_node_main_t *nm = va_arg (*va, vlib_node_main_t *); + vlib_node_t *n = va_arg (*va, vlib_node_t *); + int i, j; + uword indent; + typedef struct + { + u32 next_node; + u32 next_slot; + u32 prev_node; + } tmp_t; + tmp_t *tmps = 0; + tmp_t empty = {.next_node = ~0,.prev_node = ~0 }; + + if (!n) + return format (s, "%=26s%=26s%=26s", "Name", "Next", "Previous"); + + s = format (s, "%-26v", n->name); + + indent = format_get_indent (s); + + for (i = j = 0; i < vec_len (n->next_nodes); i++) + { + if (n->next_nodes[i] == VLIB_INVALID_NODE_INDEX) + continue; + vec_validate_init_empty (tmps, j, empty); + tmps[j].next_node = n->next_nodes[i]; + tmps[j].next_slot = i; + j++; + } + + j = 0; + /* *INDENT-OFF* */ + clib_bitmap_foreach (i, n->prev_node_bitmap, ({ + vec_validate_init_empty (tmps, j, empty); + tmps[j].prev_node = i; + j++; + })); + /* *INDENT-ON* */ + + for (i = 0; i < vec_len (tmps); i++) + { + if (i > 0) + s = format (s, "\n%U", format_white_space, indent); + + if (tmps[i].next_node != ~0) + { + vlib_node_t *x; + u8 *t = 0; + + x = vec_elt (nm->nodes, tmps[i].next_node); + t = format (t, "%v [%d]", x->name, tmps[i].next_slot); + s = format (s, "%=26v", t); + vec_free (t); + } + else + s = format (s, "%26s", ""); + + if (tmps[i].prev_node != ~0) + { + vlib_node_t *x; + x = vec_elt (nm->nodes, tmps[i].prev_node); + s = format (s, "%=26v", x->name); + } + } + + vec_free (tmps); + + return s; +} + +u8 * +format_vlib_node_and_next (u8 * s, va_list * va) +{ + vlib_main_t *vm = va_arg (*va, vlib_main_t *); + vlib_node_t *n = va_arg (*va, vlib_node_t *); + u32 next_index = va_arg (*va, u32); + vlib_node_t *n_next; + u32 *ni; + + ni = vec_elt_at_index (n->next_nodes, next_index); + n_next = vlib_get_node (vm, ni[0]); + return format (s, "%v -> %v", n->name, n_next->name); +} + +u8 * +format_vlib_node_name (u8 * s, va_list * va) +{ + vlib_main_t *vm = va_arg (*va, vlib_main_t *); + u32 node_index = va_arg (*va, u32); + vlib_node_t *n = vlib_get_node (vm, node_index); + + return format (s, "%v", n->name); +} + +u8 * +format_vlib_next_node_name (u8 * s, va_list * va) +{ + vlib_main_t *vm = va_arg (*va, vlib_main_t *); + u32 node_index = va_arg (*va, u32); + u32 next_index = va_arg (*va, u32); + vlib_node_t *next = vlib_get_next_node (vm, node_index, next_index); + return format (s, "%v", next->name); +} + +/* Parse node name -> node index. */ +uword +unformat_vlib_node (unformat_input_t * input, va_list * args) +{ + vlib_main_t *vm = va_arg (*args, vlib_main_t *); + u32 *result = va_arg (*args, u32 *); + + return unformat_user (input, unformat_hash_vec_string, + vm->node_main.node_by_name, result); +} + +u8 * +format_vlib_time (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + f64 time = va_arg (*va, f64); + return format (s, "%12.4f", time); +} + +u8 * +format_vlib_cpu_time (u8 * s, va_list * va) +{ + vlib_main_t *vm = va_arg (*va, vlib_main_t *); + u64 cpu_time = va_arg (*va, u64); + f64 dt; + + dt = + (cpu_time - + vm->clib_time.init_cpu_time) * vm->clib_time.seconds_per_clock; + return format (s, "%U", format_vlib_time, vm, dt); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h new file mode 100644 index 00000000..21167396 --- /dev/null +++ b/src/vlib/node_funcs.h @@ -0,0 +1,1130 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * node_funcs.h: processing nodes global functions/inlines + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** \file + vlib node functions +*/ + + +#ifndef included_vlib_node_funcs_h +#define included_vlib_node_funcs_h + +#include + +/** \brief Get vlib node by index. + @warning This function will ASSERT if @c i is out of range. + @param vm vlib_main_t pointer, varies by thread + @param i node index. + @return pointer to the requested vlib_node_t. +*/ + +always_inline vlib_node_t * +vlib_get_node (vlib_main_t * vm, u32 i) +{ + return vec_elt (vm->node_main.nodes, i); +} + +/** \brief Get vlib node by graph arc (next) index. + @param vm vlib_main_t pointer, varies by thread + @param node_index index of original node + @param next_index graph arc index + @return pointer to the vlib_node_t at the end of the indicated arc +*/ + +always_inline vlib_node_t * +vlib_get_next_node (vlib_main_t * vm, u32 node_index, u32 next_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n; + + n = vec_elt (nm->nodes, node_index); + ASSERT (next_index < vec_len (n->next_nodes)); + return vlib_get_node (vm, n->next_nodes[next_index]); +} + +/** \brief Get node runtime by node index. + @param vm vlib_main_t pointer, varies by thread + @param node_index index of node + @return pointer to the indicated vlib_node_runtime_t +*/ + +always_inline vlib_node_runtime_t * +vlib_node_get_runtime (vlib_main_t * vm, u32 node_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n = vec_elt (nm->nodes, node_index); + vlib_process_t *p; + if (n->type != VLIB_NODE_TYPE_PROCESS) + return vec_elt_at_index (nm->nodes_by_type[n->type], n->runtime_index); + else + { + p = vec_elt (nm->processes, n->runtime_index); + return &p->node_runtime; + } +} + +/** \brief Get node runtime private data by node index. + @param vm vlib_main_t pointer, varies by thread + @param node_index index of the node + @return pointer to the indicated vlib_node_runtime_t private data +*/ + +always_inline void * +vlib_node_get_runtime_data (vlib_main_t * vm, u32 node_index) +{ + vlib_node_runtime_t *r = vlib_node_get_runtime (vm, node_index); + return r->runtime_data; +} + +/** \brief Set node runtime private data. + @param vm vlib_main_t pointer, varies by thread + @param node_index index of the node + @param runtime_data arbitrary runtime private data + @param n_runtime_data_bytes size of runtime private data +*/ + +always_inline void +vlib_node_set_runtime_data (vlib_main_t * vm, u32 node_index, + void *runtime_data, u32 n_runtime_data_bytes) +{ + vlib_node_t *n = vlib_get_node (vm, node_index); + vlib_node_runtime_t *r = vlib_node_get_runtime (vm, node_index); + + n->runtime_data_bytes = n_runtime_data_bytes; + vec_free (n->runtime_data); + vec_add (n->runtime_data, runtime_data, n_runtime_data_bytes); + + ASSERT (vec_len (n->runtime_data) <= sizeof (vlib_node_runtime_t) - + STRUCT_OFFSET_OF (vlib_node_runtime_t, runtime_data)); + + if (vec_len (n->runtime_data) > 0) + clib_memcpy (r->runtime_data, n->runtime_data, vec_len (n->runtime_data)); +} + +/** \brief Set node dispatch state. + @param vm vlib_main_t pointer, varies by thread + @param node_index index of the node + @param new_state new state for node, see vlib_node_state_t +*/ +always_inline void +vlib_node_set_state (vlib_main_t * vm, u32 node_index, + vlib_node_state_t new_state) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n; + vlib_node_runtime_t *r; + + n = vec_elt (nm->nodes, node_index); + if (n->type == VLIB_NODE_TYPE_PROCESS) + { + vlib_process_t *p = vec_elt (nm->processes, n->runtime_index); + r = &p->node_runtime; + + /* When disabling make sure flags are cleared. */ + p->flags &= ~(VLIB_PROCESS_RESUME_PENDING + | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK + | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT); + } + else + r = vec_elt_at_index (nm->nodes_by_type[n->type], n->runtime_index); + + ASSERT (new_state < VLIB_N_NODE_STATE); + + if (n->type == VLIB_NODE_TYPE_INPUT) + { + ASSERT (nm->input_node_counts_by_state[n->state] > 0); + nm->input_node_counts_by_state[n->state] -= 1; + nm->input_node_counts_by_state[new_state] += 1; + } + + n->state = new_state; + r->state = new_state; +} + +always_inline void +vlib_node_set_interrupt_pending (vlib_main_t * vm, u32 node_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n = vec_elt (nm->nodes, node_index); + ASSERT (n->type == VLIB_NODE_TYPE_INPUT); + vec_add1 (nm->pending_interrupt_node_runtime_indices, n->runtime_index); +} + +always_inline vlib_process_t * +vlib_get_process_from_node (vlib_main_t * vm, vlib_node_t * node) +{ + vlib_node_main_t *nm = &vm->node_main; + ASSERT (node->type == VLIB_NODE_TYPE_PROCESS); + return vec_elt (nm->processes, node->runtime_index); +} + +/* Fetches frame with given handle. */ +always_inline vlib_frame_t * +vlib_get_frame_no_check (vlib_main_t * vm, uword frame_index) +{ + vlib_frame_t *f; + u32 cpu_index = frame_index & VLIB_CPU_MASK; + u32 offset = frame_index & VLIB_OFFSET_MASK; + vm = vlib_mains ? vlib_mains[cpu_index] : vm; + f = vm->heap_base + offset; + return f; +} + +always_inline u32 +vlib_frame_index_no_check (vlib_main_t * vm, vlib_frame_t * f) +{ + u32 i; + + ASSERT (((uword) f & VLIB_CPU_MASK) == 0); + + vm = vlib_mains ? vlib_mains[f->cpu_index] : vm; + + i = ((u8 *) f - (u8 *) vm->heap_base); + return i | f->cpu_index; +} + +always_inline vlib_frame_t * +vlib_get_frame (vlib_main_t * vm, uword frame_index) +{ + vlib_frame_t *f = vlib_get_frame_no_check (vm, frame_index); + ASSERT (f->flags & VLIB_FRAME_IS_ALLOCATED); + return f; +} + +always_inline u32 +vlib_frame_index (vlib_main_t * vm, vlib_frame_t * f) +{ + uword i = vlib_frame_index_no_check (vm, f); + ASSERT (vlib_get_frame (vm, i) == f); + return i; +} + +/* Byte alignment for vector arguments. */ +#define VLIB_FRAME_VECTOR_ALIGN (1 << 4) + +always_inline u32 +vlib_frame_vector_byte_offset (u32 scalar_size) +{ + return round_pow2 (sizeof (vlib_frame_t) + scalar_size, + VLIB_FRAME_VECTOR_ALIGN); +} + +/** \brief Get pointer to frame vector data. + @param f vlib_frame_t pointer + @return pointer to first vector element in frame +*/ +always_inline void * +vlib_frame_vector_args (vlib_frame_t * f) +{ + return (void *) f + vlib_frame_vector_byte_offset (f->scalar_size); +} + +/** \brief Get pointer to frame scalar data. + + @warning This is almost certainly not the function you wish to call. + See @ref vlib_frame_vector_args instead. + + @param f vlib_frame_t pointer + + @return arbitrary node scalar data + + @sa vlib_frame_vector_args +*/ +always_inline void * +vlib_frame_args (vlib_frame_t * f) +{ + return vlib_frame_vector_args (f) - f->scalar_size; +} + +always_inline vlib_next_frame_t * +vlib_node_runtime_get_next_frame (vlib_main_t * vm, + vlib_node_runtime_t * n, u32 next_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_next_frame_t *nf; + + ASSERT (next_index < n->n_next_nodes); + nf = vec_elt_at_index (nm->next_frames, n->next_frame_index + next_index); + + if (CLIB_DEBUG > 0) + { + vlib_node_t *node, *next; + node = vec_elt (nm->nodes, n->node_index); + next = vec_elt (nm->nodes, node->next_nodes[next_index]); + ASSERT (nf->node_runtime_index == next->runtime_index); + } + + return nf; +} + +/** \brief Get pointer to frame by (@c node_index, @c next_index). + + @warning This is not a function that you should call directly. + See @ref vlib_get_next_frame instead. + + @param vm vlib_main_t pointer, varies by thread + @param node_index index of the node + @param next_index graph arc index + + @return pointer to the requested vlib_next_frame_t + + @sa vlib_get_next_frame +*/ + +always_inline vlib_next_frame_t * +vlib_node_get_next_frame (vlib_main_t * vm, u32 node_index, u32 next_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n; + vlib_node_runtime_t *r; + + n = vec_elt (nm->nodes, node_index); + r = vec_elt_at_index (nm->nodes_by_type[n->type], n->runtime_index); + return vlib_node_runtime_get_next_frame (vm, r, next_index); +} + +vlib_frame_t *vlib_get_next_frame_internal (vlib_main_t * vm, + vlib_node_runtime_t * node, + u32 next_index, + u32 alloc_new_frame); + +#define vlib_get_next_frame_macro(vm,node,next_index,vectors,n_vectors_left,alloc_new_frame) \ +do { \ + vlib_frame_t * _f \ + = vlib_get_next_frame_internal ((vm), (node), (next_index), \ + (alloc_new_frame)); \ + u32 _n = _f->n_vectors; \ + (vectors) = vlib_frame_vector_args (_f) + _n * sizeof ((vectors)[0]); \ + (n_vectors_left) = VLIB_FRAME_SIZE - _n; \ +} while (0) + + +/** \brief Get pointer to next frame vector data by + (@c vlib_node_runtime_t, @c next_index). + Standard single/dual loop boilerplate element. + @attention This is a MACRO, with SIDE EFFECTS. + + @param vm vlib_main_t pointer, varies by thread + @param node current node vlib_node_runtime_t pointer + @param next_index requested graph arc index + + @return @c vectors -- pointer to next available vector slot + @return @c n_vectors_left -- number of vector slots available +*/ +#define vlib_get_next_frame(vm,node,next_index,vectors,n_vectors_left) \ + vlib_get_next_frame_macro (vm, node, next_index, \ + vectors, n_vectors_left, \ + /* alloc new frame */ 0) + +#define vlib_get_new_next_frame(vm,node,next_index,vectors,n_vectors_left) \ + vlib_get_next_frame_macro (vm, node, next_index, \ + vectors, n_vectors_left, \ + /* alloc new frame */ 1) + +/** \brief Release pointer to next frame vector data. + Standard single/dual loop boilerplate element. + @param vm vlib_main_t pointer, varies by thread + @param r current node vlib_node_runtime_t pointer + @param next_index graph arc index + @param n_packets_left number of slots still available in vector +*/ +void +vlib_put_next_frame (vlib_main_t * vm, + vlib_node_runtime_t * r, + u32 next_index, u32 n_packets_left); + +/* Combination get plus put. Returns vector argument just added. */ +#define vlib_set_next_frame(vm,node,next_index,v) \ +({ \ + uword _n_left; \ + vlib_get_next_frame ((vm), (node), (next_index), (v), _n_left); \ + ASSERT (_n_left > 0); \ + vlib_put_next_frame ((vm), (node), (next_index), _n_left - 1); \ + (v); \ +}) + +always_inline void +vlib_set_next_frame_buffer (vlib_main_t * vm, + vlib_node_runtime_t * node, + u32 next_index, u32 buffer_index) +{ + u32 *p; + p = vlib_set_next_frame (vm, node, next_index, p); + p[0] = buffer_index; +} + +vlib_frame_t *vlib_get_frame_to_node (vlib_main_t * vm, u32 to_node_index); +void vlib_put_frame_to_node (vlib_main_t * vm, u32 to_node_index, + vlib_frame_t * f); + +always_inline vlib_process_t * +vlib_get_current_process (vlib_main_t * vm) +{ + vlib_node_main_t *nm = &vm->node_main; + return vec_elt (nm->processes, nm->current_process_index); +} + +always_inline uword +vlib_in_process_context (vlib_main_t * vm) +{ + return vm->node_main.current_process_index != ~0; +} + +always_inline uword +vlib_current_process (vlib_main_t * vm) +{ + return vlib_get_current_process (vm)->node_runtime.node_index; +} + +/** Returns TRUE if a process suspend time is less than 1us + @param dt - remaining poll time in seconds + @returns 1 if dt < 1e-6, 0 otherwise +*/ +always_inline uword +vlib_process_suspend_time_is_zero (f64 dt) +{ + return dt < 1e-6; +} + +/** Suspend a vlib cooperative multi-tasking thread for a period of time + @param vm - vlib_main_t * + @param dt - suspend interval in seconds + @returns VLIB_PROCESS_RESUME_LONGJMP_RESUME, routinely ignored +*/ + +always_inline uword +vlib_process_suspend (vlib_main_t * vm, f64 dt) +{ + uword r; + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p = vec_elt (nm->processes, nm->current_process_index); + u64 dt_cpu = dt * vm->clib_time.clocks_per_second; + + if (vlib_process_suspend_time_is_zero (dt)) + return VLIB_PROCESS_RESUME_LONGJMP_RESUME; + + p->flags |= VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK; + r = clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND); + if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND) + { + p->resume_cpu_time = clib_cpu_time_now () + dt_cpu; + clib_longjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_SUSPEND); + } + + return r; +} + +always_inline void +vlib_process_free_event_type (vlib_process_t * p, uword t, + uword is_one_time_event) +{ + ASSERT (!pool_is_free_index (p->event_type_pool, t)); + pool_put_index (p->event_type_pool, t); + if (is_one_time_event) + p->one_time_event_type_bitmap = + clib_bitmap_andnoti (p->one_time_event_type_bitmap, t); +} + +always_inline void +vlib_process_maybe_free_event_type (vlib_process_t * p, uword t) +{ + ASSERT (!pool_is_free_index (p->event_type_pool, t)); + if (clib_bitmap_get (p->one_time_event_type_bitmap, t)) + vlib_process_free_event_type (p, t, /* is_one_time_event */ 1); +} + +always_inline void * +vlib_process_get_event_data (vlib_main_t * vm, + uword * return_event_type_opaque) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p; + vlib_process_event_type_t *et; + uword t, l; + void *event_data_vector; + + p = vec_elt (nm->processes, nm->current_process_index); + + /* Find first type with events ready. + Return invalid type when there's nothing there. */ + t = clib_bitmap_first_set (p->non_empty_event_type_bitmap); + if (t == ~0) + return 0; + + p->non_empty_event_type_bitmap = + clib_bitmap_andnoti (p->non_empty_event_type_bitmap, t); + + l = _vec_len (p->pending_event_data_by_type_index[t]); + ASSERT (l > 0); + event_data_vector = p->pending_event_data_by_type_index[t]; + p->pending_event_data_by_type_index[t] = 0; + + et = pool_elt_at_index (p->event_type_pool, t); + + /* Return user's opaque value and possibly index. */ + *return_event_type_opaque = et->opaque; + + vlib_process_maybe_free_event_type (p, t); + + return event_data_vector; +} + +/* Return event data vector for later reuse. We reuse event data to avoid + repeatedly allocating event vectors in cases where we care about speed. */ +always_inline void +vlib_process_put_event_data (vlib_main_t * vm, void *event_data) +{ + vlib_node_main_t *nm = &vm->node_main; + vec_add1 (nm->recycled_event_data_vectors, event_data); +} + +/** Return the first event type which has occurred and a vector of per-event + data of that type, or a timeout indication + + @param vm - vlib_main_t pointer + @param data_vector - pointer to a (uword *) vector to receive event data + @returns either an event type and a vector of per-event instance data, + or ~0 to indicate a timeout. +*/ + +always_inline uword +vlib_process_get_events (vlib_main_t * vm, uword ** data_vector) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p; + vlib_process_event_type_t *et; + uword r, t, l; + + p = vec_elt (nm->processes, nm->current_process_index); + + /* Find first type with events ready. + Return invalid type when there's nothing there. */ + t = clib_bitmap_first_set (p->non_empty_event_type_bitmap); + if (t == ~0) + return t; + + p->non_empty_event_type_bitmap = + clib_bitmap_andnoti (p->non_empty_event_type_bitmap, t); + + l = _vec_len (p->pending_event_data_by_type_index[t]); + if (data_vector) + vec_add (*data_vector, p->pending_event_data_by_type_index[t], l); + _vec_len (p->pending_event_data_by_type_index[t]) = 0; + + et = pool_elt_at_index (p->event_type_pool, t); + + /* Return user's opaque value. */ + r = et->opaque; + + vlib_process_maybe_free_event_type (p, t); + + return r; +} + +always_inline uword +vlib_process_get_events_helper (vlib_process_t * p, uword t, + uword ** data_vector) +{ + uword l; + + p->non_empty_event_type_bitmap = + clib_bitmap_andnoti (p->non_empty_event_type_bitmap, t); + + l = _vec_len (p->pending_event_data_by_type_index[t]); + if (data_vector) + vec_add (*data_vector, p->pending_event_data_by_type_index[t], l); + _vec_len (p->pending_event_data_by_type_index[t]) = 0; + + vlib_process_maybe_free_event_type (p, t); + + return l; +} + +/* As above but query as specified type of event. Returns number of + events found. */ +always_inline uword +vlib_process_get_events_with_type (vlib_main_t * vm, uword ** data_vector, + uword with_type_opaque) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p; + uword t, *h; + + p = vec_elt (nm->processes, nm->current_process_index); + h = hash_get (p->event_type_index_by_type_opaque, with_type_opaque); + if (!h) + /* This can happen when an event has not yet been + signaled with given opaque type. */ + return 0; + + t = h[0]; + if (!clib_bitmap_get (p->non_empty_event_type_bitmap, t)) + return 0; + + return vlib_process_get_events_helper (p, t, data_vector); +} + +always_inline uword * +vlib_process_wait_for_event (vlib_main_t * vm) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p; + uword r; + + p = vec_elt (nm->processes, nm->current_process_index); + if (clib_bitmap_is_zero (p->non_empty_event_type_bitmap)) + { + p->flags |= VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT; + r = + clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND); + if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND) + clib_longjmp (&p->return_longjmp, + VLIB_PROCESS_RETURN_LONGJMP_SUSPEND); + } + + return p->non_empty_event_type_bitmap; +} + +always_inline uword +vlib_process_wait_for_one_time_event (vlib_main_t * vm, + uword ** data_vector, + uword with_type_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p; + uword r; + + p = vec_elt (nm->processes, nm->current_process_index); + ASSERT (!pool_is_free_index (p->event_type_pool, with_type_index)); + while (!clib_bitmap_get (p->non_empty_event_type_bitmap, with_type_index)) + { + p->flags |= VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT; + r = + clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND); + if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND) + clib_longjmp (&p->return_longjmp, + VLIB_PROCESS_RETURN_LONGJMP_SUSPEND); + } + + return vlib_process_get_events_helper (p, with_type_index, data_vector); +} + +always_inline uword +vlib_process_wait_for_event_with_type (vlib_main_t * vm, + uword ** data_vector, + uword with_type_opaque) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p; + uword r, *h; + + p = vec_elt (nm->processes, nm->current_process_index); + h = hash_get (p->event_type_index_by_type_opaque, with_type_opaque); + while (!h || !clib_bitmap_get (p->non_empty_event_type_bitmap, h[0])) + { + p->flags |= VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT; + r = + clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND); + if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND) + clib_longjmp (&p->return_longjmp, + VLIB_PROCESS_RETURN_LONGJMP_SUSPEND); + + /* See if unknown event type has been signaled now. */ + if (!h) + h = hash_get (p->event_type_index_by_type_opaque, with_type_opaque); + } + + return vlib_process_get_events_helper (p, h[0], data_vector); +} + +/** Suspend a cooperative multi-tasking thread + Waits for an event, or for the indicated number of seconds to elapse + @param vm - vlib_main_t pointer + @param dt - timeout, in seconds. + @returns the remaining time interval +*/ + +always_inline f64 +vlib_process_wait_for_event_or_clock (vlib_main_t * vm, f64 dt) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_process_t *p; + f64 wakeup_time; + uword r; + + p = vec_elt (nm->processes, nm->current_process_index); + + if (vlib_process_suspend_time_is_zero (dt) + || !clib_bitmap_is_zero (p->non_empty_event_type_bitmap)) + return dt; + + wakeup_time = vlib_time_now (vm) + dt; + + /* Suspend waiting for both clock and event to occur. */ + p->flags |= (VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT + | VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK); + + r = clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND); + if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND) + { + p->resume_cpu_time = (clib_cpu_time_now () + + (dt * vm->clib_time.clocks_per_second)); + clib_longjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_SUSPEND); + } + + /* Return amount of time still left to sleep. + If <= 0 then we've been waken up by the clock (and not an event). */ + return wakeup_time - vlib_time_now (vm); +} + +always_inline vlib_process_event_type_t * +vlib_process_new_event_type (vlib_process_t * p, uword with_type_opaque) +{ + vlib_process_event_type_t *et; + pool_get (p->event_type_pool, et); + et->opaque = with_type_opaque; + return et; +} + +always_inline uword +vlib_process_create_one_time_event (vlib_main_t * vm, uword node_index, + uword with_type_opaque) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n = vlib_get_node (vm, node_index); + vlib_process_t *p = vec_elt (nm->processes, n->runtime_index); + vlib_process_event_type_t *et; + uword t; + + et = vlib_process_new_event_type (p, with_type_opaque); + t = et - p->event_type_pool; + p->one_time_event_type_bitmap = + clib_bitmap_ori (p->one_time_event_type_bitmap, t); + return t; +} + +always_inline void +vlib_process_delete_one_time_event (vlib_main_t * vm, uword node_index, + uword t) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n = vlib_get_node (vm, node_index); + vlib_process_t *p = vec_elt (nm->processes, n->runtime_index); + + ASSERT (clib_bitmap_get (p->one_time_event_type_bitmap, t)); + vlib_process_free_event_type (p, t, /* is_one_time_event */ 1); +} + +always_inline void * +vlib_process_signal_event_helper (vlib_node_main_t * nm, + vlib_node_t * n, + vlib_process_t * p, + uword t, + uword n_data_elts, uword n_data_elt_bytes) +{ + uword p_flags, add_to_pending, delete_from_wheel; + void *data_to_be_written_by_caller; + + ASSERT (!pool_is_free_index (p->event_type_pool, t)); + + vec_validate (p->pending_event_data_by_type_index, t); + + /* Resize data vector and return caller's data to be written. */ + { + void *data_vec = p->pending_event_data_by_type_index[t]; + uword l; + + if (!data_vec && vec_len (nm->recycled_event_data_vectors)) + { + data_vec = vec_pop (nm->recycled_event_data_vectors); + _vec_len (data_vec) = 0; + } + + l = vec_len (data_vec); + + data_vec = _vec_resize (data_vec, + /* length_increment */ n_data_elts, + /* total size after increment */ + (l + n_data_elts) * n_data_elt_bytes, + /* header_bytes */ 0, /* data_align */ 0); + + p->pending_event_data_by_type_index[t] = data_vec; + data_to_be_written_by_caller = data_vec + l * n_data_elt_bytes; + } + + p->non_empty_event_type_bitmap = + clib_bitmap_ori (p->non_empty_event_type_bitmap, t); + + p_flags = p->flags; + + /* Event was already signalled? */ + add_to_pending = (p_flags & VLIB_PROCESS_RESUME_PENDING) == 0; + + /* Process will resume when suspend time elapses? */ + delete_from_wheel = 0; + if (p_flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK) + { + /* Waiting for both event and clock? */ + if (p_flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_EVENT) + delete_from_wheel = 1; + else + /* Waiting only for clock. Event will be queue and may be + handled when timer expires. */ + add_to_pending = 0; + } + + /* Never add current process to pending vector since current process is + already running. */ + add_to_pending &= nm->current_process_index != n->runtime_index; + + if (add_to_pending) + { + u32 x = vlib_timing_wheel_data_set_suspended_process (n->runtime_index); + p->flags = p_flags | VLIB_PROCESS_RESUME_PENDING; + vec_add1 (nm->data_from_advancing_timing_wheel, x); + if (delete_from_wheel) + timing_wheel_delete (&nm->timing_wheel, x); + } + + return data_to_be_written_by_caller; +} + +always_inline void * +vlib_process_signal_event_data (vlib_main_t * vm, + uword node_index, + uword type_opaque, + uword n_data_elts, uword n_data_elt_bytes) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n = vlib_get_node (vm, node_index); + vlib_process_t *p = vec_elt (nm->processes, n->runtime_index); + uword *h, t; + + h = hash_get (p->event_type_index_by_type_opaque, type_opaque); + if (!h) + { + vlib_process_event_type_t *et = + vlib_process_new_event_type (p, type_opaque); + t = et - p->event_type_pool; + hash_set (p->event_type_index_by_type_opaque, type_opaque, t); + } + else + t = h[0]; + + return vlib_process_signal_event_helper (nm, n, p, t, n_data_elts, + n_data_elt_bytes); +} + +always_inline void * +vlib_process_signal_event_at_time (vlib_main_t * vm, + f64 dt, + uword node_index, + uword type_opaque, + uword n_data_elts, uword n_data_elt_bytes) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n = vlib_get_node (vm, node_index); + vlib_process_t *p = vec_elt (nm->processes, n->runtime_index); + uword *h, t; + + h = hash_get (p->event_type_index_by_type_opaque, type_opaque); + if (!h) + { + vlib_process_event_type_t *et = + vlib_process_new_event_type (p, type_opaque); + t = et - p->event_type_pool; + hash_set (p->event_type_index_by_type_opaque, type_opaque, t); + } + else + t = h[0]; + + if (vlib_process_suspend_time_is_zero (dt)) + return vlib_process_signal_event_helper (nm, n, p, t, n_data_elts, + n_data_elt_bytes); + else + { + vlib_signal_timed_event_data_t *te; + u64 dt_cpu = dt * vm->clib_time.clocks_per_second; + + pool_get_aligned (nm->signal_timed_event_data_pool, te, sizeof (te[0])); + + te->n_data_elts = n_data_elts; + te->n_data_elt_bytes = n_data_elt_bytes; + te->n_data_bytes = n_data_elts * n_data_elt_bytes; + + /* Assert that structure fields are big enough. */ + ASSERT (te->n_data_elts == n_data_elts); + ASSERT (te->n_data_elt_bytes == n_data_elt_bytes); + ASSERT (te->n_data_bytes == n_data_elts * n_data_elt_bytes); + + te->process_node_index = n->runtime_index; + te->event_type_index = t; + + timing_wheel_insert (&nm->timing_wheel, clib_cpu_time_now () + dt_cpu, + vlib_timing_wheel_data_set_timed_event (te - + nm-> + signal_timed_event_data_pool)); + + /* Inline data big enough to hold event? */ + if (te->n_data_bytes < sizeof (te->inline_event_data)) + return te->inline_event_data; + else + { + te->event_data_as_vector = 0; + vec_resize (te->event_data_as_vector, te->n_data_bytes); + return te->event_data_as_vector; + } + } +} + +always_inline void * +vlib_process_signal_one_time_event_data (vlib_main_t * vm, + uword node_index, + uword type_index, + uword n_data_elts, + uword n_data_elt_bytes) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n = vlib_get_node (vm, node_index); + vlib_process_t *p = vec_elt (nm->processes, n->runtime_index); + return vlib_process_signal_event_helper (nm, n, p, type_index, n_data_elts, + n_data_elt_bytes); +} + +always_inline void +vlib_process_signal_event (vlib_main_t * vm, + uword node_index, uword type_opaque, uword data) +{ + uword *d = vlib_process_signal_event_data (vm, node_index, type_opaque, + 1 /* elts */ , sizeof (uword)); + d[0] = data; +} + +always_inline void +vlib_process_signal_event_pointer (vlib_main_t * vm, + uword node_index, + uword type_opaque, void *data) +{ + void **d = vlib_process_signal_event_data (vm, node_index, type_opaque, + 1 /* elts */ , sizeof (data)); + d[0] = data; +} + +always_inline void +vlib_process_signal_one_time_event (vlib_main_t * vm, + uword node_index, + uword type_index, uword data) +{ + uword *d = + vlib_process_signal_one_time_event_data (vm, node_index, type_index, + 1 /* elts */ , sizeof (uword)); + d[0] = data; +} + +always_inline void +vlib_signal_one_time_waiting_process (vlib_main_t * vm, + vlib_one_time_waiting_process_t * p) +{ + vlib_process_signal_one_time_event (vm, p->node_index, p->one_time_event, + /* data */ ~0); + memset (p, ~0, sizeof (p[0])); +} + +always_inline void +vlib_signal_one_time_waiting_process_vector (vlib_main_t * vm, + vlib_one_time_waiting_process_t + ** wps) +{ + vlib_one_time_waiting_process_t *wp; + vec_foreach (wp, *wps) vlib_signal_one_time_waiting_process (vm, wp); + vec_free (*wps); +} + +always_inline void +vlib_current_process_wait_for_one_time_event (vlib_main_t * vm, + vlib_one_time_waiting_process_t + * p) +{ + p->node_index = vlib_current_process (vm); + p->one_time_event = vlib_process_create_one_time_event (vm, p->node_index, /* type opaque */ + ~0); + vlib_process_wait_for_one_time_event (vm, + /* don't care about data */ 0, + p->one_time_event); +} + +always_inline void +vlib_current_process_wait_for_one_time_event_vector (vlib_main_t * vm, + vlib_one_time_waiting_process_t + ** wps) +{ + vlib_one_time_waiting_process_t *wp; + vec_add2 (*wps, wp, 1); + vlib_current_process_wait_for_one_time_event (vm, wp); +} + +always_inline u32 +vlib_node_runtime_update_main_loop_vector_stats (vlib_main_t * vm, + vlib_node_runtime_t * node, + uword n_vectors) +{ + u32 i, d, vi0, vi1; + u32 i0, i1; + + ASSERT (is_pow2 (ARRAY_LEN (node->main_loop_vector_stats))); + i = ((vm->main_loop_count >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE) + & (ARRAY_LEN (node->main_loop_vector_stats) - 1)); + i0 = i ^ 0; + i1 = i ^ 1; + d = ((vm->main_loop_count >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE) + - + (node->main_loop_count_last_dispatch >> + VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE)); + vi0 = node->main_loop_vector_stats[i0]; + vi1 = node->main_loop_vector_stats[i1]; + vi0 = d == 0 ? vi0 : 0; + vi1 = d <= 1 ? vi1 : 0; + vi0 += n_vectors; + node->main_loop_vector_stats[i0] = vi0; + node->main_loop_vector_stats[i1] = vi1; + node->main_loop_count_last_dispatch = vm->main_loop_count; + /* Return previous counter. */ + return node->main_loop_vector_stats[i1]; +} + +always_inline f64 +vlib_node_vectors_per_main_loop_as_float (vlib_main_t * vm, u32 node_index) +{ + vlib_node_runtime_t *rt = vlib_node_get_runtime (vm, node_index); + u32 v; + + v = vlib_node_runtime_update_main_loop_vector_stats (vm, rt, /* n_vectors */ + 0); + return (f64) v / (1 << VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE); +} + +always_inline u32 +vlib_node_vectors_per_main_loop_as_integer (vlib_main_t * vm, u32 node_index) +{ + vlib_node_runtime_t *rt = vlib_node_get_runtime (vm, node_index); + u32 v; + + v = vlib_node_runtime_update_main_loop_vector_stats (vm, rt, /* n_vectors */ + 0); + return v >> VLIB_LOG2_MAIN_LOOPS_PER_STATS_UPDATE; +} + +void +vlib_frame_free (vlib_main_t * vm, vlib_node_runtime_t * r, vlib_frame_t * f); + +/* Add next node to given node in given slot. */ +uword +vlib_node_add_next_with_slot (vlib_main_t * vm, + uword node, uword next_node, uword slot); + +/* As above but adds to end of node's next vector. */ +always_inline uword +vlib_node_add_next (vlib_main_t * vm, uword node, uword next_node) +{ + return vlib_node_add_next_with_slot (vm, node, next_node, ~0); +} + +/* Add next node to given node in given slot. */ +uword +vlib_node_add_named_next_with_slot (vlib_main_t * vm, + uword node, char *next_name, uword slot); + +/* As above but adds to end of node's next vector. */ +always_inline uword +vlib_node_add_named_next (vlib_main_t * vm, uword node, char *name) +{ + return vlib_node_add_named_next_with_slot (vm, node, name, ~0); +} + +/* Query node given name. */ +vlib_node_t *vlib_get_node_by_name (vlib_main_t * vm, u8 * name); + +/* Rename a node. */ +void vlib_node_rename (vlib_main_t * vm, u32 node_index, char *fmt, ...); + +/* Register new packet processing node. Nodes can be registered + dynamically via this call or statically via the VLIB_REGISTER_NODE + macro. */ +u32 vlib_register_node (vlib_main_t * vm, vlib_node_registration_t * r); + +/* Register all static nodes registered via VLIB_REGISTER_NODE. */ +void vlib_register_all_static_nodes (vlib_main_t * vm); + +/* Start a process. */ +void vlib_start_process (vlib_main_t * vm, uword process_index); + +/* Sync up runtime and main node stats. */ +void vlib_node_sync_stats (vlib_main_t * vm, vlib_node_t * n); + +/* Node graph initialization function. */ +clib_error_t *vlib_node_main_init (vlib_main_t * vm); + +format_function_t format_vlib_node_graph; +format_function_t format_vlib_node_name; +format_function_t format_vlib_next_node_name; +format_function_t format_vlib_node_and_next; +format_function_t format_vlib_cpu_time; +format_function_t format_vlib_time; +/* Parse node name -> node index. */ +unformat_function_t unformat_vlib_node; + +always_inline void +vlib_node_increment_counter (vlib_main_t * vm, u32 node_index, + u32 counter_index, u64 increment) +{ + vlib_node_t *n = vlib_get_node (vm, node_index); + vlib_error_main_t *em = &vm->error_main; + u32 node_counter_base_index = n->error_heap_index; + em->counters[node_counter_base_index + counter_index] += increment; +} + +#endif /* included_vlib_node_funcs_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/parse.c b/src/vlib/parse.c new file mode 100644 index 00000000..1c4500ce --- /dev/null +++ b/src/vlib/parse.c @@ -0,0 +1,1007 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#define PARSE_DEBUG 0 + +u16 word_type_index, number_type_index, eof_type_index, rule_eof_type_index, + plus_type_index, minus_type_index, star_type_index, slash_type_index, + lpar_type_index, rpar_type_index; + +u8 * +format_vlib_parse_value (u8 * s, va_list * args) +{ + vlib_parse_main_t *pm = va_arg (*args, vlib_parse_main_t *); + vlib_parse_type_t *type; + vlib_parse_value_t *v; + u16 type_index; + + s = format (s, "%d items:\n", vec_len (pm->parse_value)); + vec_foreach (v, pm->parse_value) + { + type_index = v->type; + type = pool_elt_at_index (pm->parse_types, type_index); + if (type->format_value) + s = format (s, "[%d]: %U\n", v - pm->parse_value, + type->format_value, v); + else + s = format (s, "[%d]: (nofun)\n", v - pm->parse_value); + } + return s; +} + +static u8 * +format_vlib_parse_match (u8 * s, va_list * args) +{ + vlib_parse_match_t m = va_arg (*args, vlib_parse_match_t); + char *t = 0; + switch (m) + { +#define _(a) case VLIB_PARSE_##a: t = #a; break; + foreach_parse_match_type +#undef _ + default: + t = 0; + break; + } + + if (t) + return format (s, "%s", t); + else + return format (s, "unknown 0x%x", m); +} + +static u8 * +format_vlib_parse_item (u8 * s, va_list * args) +{ + vlib_parse_main_t *pm = va_arg (*args, vlib_parse_main_t *); + vlib_parse_item_t *item = va_arg (*args, vlib_parse_item_t *); + vlib_parse_type_t *type = pool_elt_at_index (pm->parse_types, item->type); + + if (item->type == word_type_index) + s = format (s, "%s", item->value.as_pointer); + else + s = format (s, "<%s>", type->name); + return s; +} + +static u8 * +format_vlib_parse_graph (u8 * s, va_list * args) +{ + vlib_parse_main_t *pm = va_arg (*args, vlib_parse_main_t *); + vlib_parse_graph_t *node = va_arg (*args, vlib_parse_graph_t *); + vlib_parse_item_t *item; + vlib_parse_type_t *type; + + /* $$$ hash table */ + /* *INDENT-OFF* */ + pool_foreach (type, pm->parse_types, + ({ + if (type->rule_index == node - pm->parse_graph) + s = format (s, "\n<%s>\n", type->name); + })); +/* *INDENT-ON* */ + + if (pm->root_index == (node - pm->parse_graph)) + s = format (s, "\n\n"); + + item = pool_elt_at_index (pm->parse_items, node->item); + + s = format (s, "[%d] %U ", node - pm->parse_graph, + format_vlib_parse_item, pm, item); + + if (node->peer == (u32) ~ 0) + s = format (s, "peer nil "); + else + s = format (s, "peer %4u ", node->peer); + + if (node->deeper == (u32) ~ 0) + s = format (s, "deeper nil "); + else + s = format (s, "deeper %4u ", node->deeper); + + return s; +} + +void +dump_parse_graph (void) +{ + vlib_parse_main_t *pm = &vlib_parse_main; + vlib_parse_graph_t *node; + + /* *INDENT-OFF* */ + pool_foreach (node, pm->parse_graph, ({ + fformat(stdout, "%U\n", format_vlib_parse_graph, pm, node); + })); +/* *INDENT-ON* */ +} + +always_inline void +parse_cleanup_value (vlib_parse_main_t * pm, vlib_parse_value_t * pv) +{ + vlib_parse_type_t *type = pool_elt_at_index (pm->parse_types, pv->type); + if (type->value_cleanup_function) + type->value_cleanup_function (pv); +} + +static void +parse_reset (vlib_parse_main_t * pm, u8 * input) +{ + vlib_lex_token_t *t; + vlib_parse_value_t *pv; + + vlib_lex_reset (pm->lex_main, input); + + vec_foreach (t, pm->tokens) vlib_lex_cleanup_token (t); + + vec_foreach (pv, pm->parse_value) parse_cleanup_value (pm, pv); + + _vec_len (pm->parse_value) = 0; + _vec_len (pm->tokens) = 0; + pm->current_token_index = 0; +} + +static void +parse_help (vlib_parse_main_t * pm, u32 index) +{ + vlib_parse_graph_t *node; + vlib_parse_item_t *item; + vlib_parse_type_t *type; + vlib_main_t *vm = pm->vlib_main; + u8 *help_input; + int i; + + help_input = vec_dup (pm->lex_main->input_vector); + + for (i = vec_len (help_input) - 1; i >= 0; i--) + if (help_input[i] == '?') + { + help_input[i] = 0; + _vec_len (help_input) = i; + break; + } + + for (i = vec_len (help_input) - 1; i >= 0; i--) + { + if (help_input[i] != ' ' && help_input[i] != '\t') + break; + help_input[i] = 0; + break; + } + _vec_len (help_input) = i + 1; + + while (index != (u32) ~ 0) + { + node = pool_elt_at_index (pm->parse_graph, index); + item = pool_elt_at_index (pm->parse_items, node->item); + type = pool_elt_at_index (pm->parse_types, item->type); + + if (item->type == eof_type_index && vec_len (pm->match_items) == 0) + /* do nothing */ ; + else if (item->type == word_type_index) + vlib_cli_output (vm, "%s %s\n", help_input, item->value.as_pointer); + else + vlib_cli_output (vm, "%s <%s>\n", help_input, type->name); + index = node->peer; + } + vec_free (help_input); +} + +static vlib_parse_match_t +parse_eval_internal (vlib_parse_main_t * pm, u32 index) +{ + vlib_parse_graph_t *node; + vlib_parse_item_t *item; + vlib_parse_type_t *type; + vlib_parse_value_t value, *pv; + vlib_parse_match_t rv; + u32 *partial_matches = 0; + vlib_lex_token_t *t; + u32 save_token_index = (u32) ~ 0, save_match_items = 0; + int had_value = 0; + + if (pm->current_token_index >= vec_len (pm->tokens)) + return VLIB_PARSE_MATCH_FAIL; + + /* current token */ + t = vec_elt_at_index (pm->tokens, pm->current_token_index); + + /* Help ? */ + if (PREDICT_FALSE (t->token == VLIB_LEX_qmark)) + { + parse_help (pm, index); + _vec_len (pm->match_items) = 0; + return VLIB_PARSE_MATCH_DONE; + } + + /* Across all peers at this level of the parse graph */ + while (index != (u32) ~ 0) + { + node = pool_elt_at_index (pm->parse_graph, index); + item = pool_elt_at_index (pm->parse_items, node->item); + type = pool_elt_at_index (pm->parse_types, item->type); + + /* + * Save the token index. We may have to back up several + * trie plies. Type-specific match functions can consume + * multiple tokens, and they may not be optimally careful + */ + save_token_index = pm->current_token_index; + save_match_items = vec_len (pm->match_items); + vec_add1 (pm->match_items, node->item); + + if (PARSE_DEBUG > 1) + clib_warning ("Try to match token %U against node %d", + format_vlib_lex_token, pm->lex_main, t, index); + + /* Call the type-specific match function */ + rv = type->match_function (pm, type, t, &value); + + if (PARSE_DEBUG > 1) + clib_warning ("returned %U", format_vlib_parse_match, rv); + + switch (rv) + { + case VLIB_PARSE_MATCH_VALUE: + /* + * Matched, and returned a value to append to the + * set of args passed to the action function + */ + value.type = item->type; + vec_add1 (pm->parse_value, value); + had_value = 1; + /* fallthrough */ + + case VLIB_PARSE_MATCH_FULL: + unambiguous_partial_match: + /* Consume the matched token */ + pm->current_token_index++; + + /* continue matching along this path */ + rv = parse_eval_internal (pm, node->deeper); + + /* this is not the right path */ + if (rv == VLIB_PARSE_MATCH_FAIL) + { + if (had_value) + { + /* Delete the value */ + value = pm->parse_value[vec_len (pm->parse_value) - 1]; + parse_cleanup_value (pm, &value); + _vec_len (pm->parse_value) -= 1; + } + /* Continue with the next sibling */ + pm->current_token_index = save_token_index; + _vec_len (pm->match_items) = save_match_items; + index = node->peer; + break; + } + return rv; + + case VLIB_PARSE_MATCH_PARTIAL: + /* Partial (substring) match, remember it but keep going */ + vec_add1 (partial_matches, node - pm->parse_graph); + index = node->peer; + break; + + case VLIB_PARSE_MATCH_FAIL: + /* Continue with the next sibling */ + index = node->peer; + _vec_len (pm->match_items) = save_match_items; + break; + + case VLIB_PARSE_MATCH_DONE: + /* Parse complete, invoke the action function */ + if (PARSE_DEBUG > 0) + clib_warning ("parse_value: %U", format_vlib_parse_value, pm); + + { + vlib_parse_eval_function_t *f = item->value.as_pointer; + if (f) + rv = f (pm, item, pm->parse_value); + } + + vec_foreach (pv, pm->parse_value) parse_cleanup_value (pm, pv); + _vec_len (pm->parse_value) = 0; + _vec_len (pm->match_items) = 0; + return rv; + + case VLIB_PARSE_MATCH_AMBIGUOUS: + case VLIB_PARSE_MATCH_EVAL_FAIL: + case VLIB_PARSE_MATCH_RULE: + _vec_len (pm->match_items) = save_match_items; + return rv; + } + } + + /* + * Out of siblings. If we have exactly one partial match + * we win + */ + if (vec_len (partial_matches) == 1) + { + index = partial_matches[0]; + node = pool_elt_at_index (pm->parse_graph, index); + vec_free (partial_matches); + goto unambiguous_partial_match; + } + + /* Ordinary loser */ + rv = VLIB_PARSE_MATCH_FAIL; + + /* Ambiguous loser */ + if (vec_len (partial_matches) > 1) + { + vec_free (partial_matches); + rv = VLIB_PARSE_MATCH_AMBIGUOUS; + } + + _vec_len (pm->match_items) = save_match_items; + return rv; +} + +vlib_parse_match_t +rule_match (vlib_parse_main_t * pm, vlib_parse_type_t * type, + vlib_lex_token_t * t, vlib_parse_value_t * valuep) +{ + vlib_parse_match_t rv; + static int recursion_level; + + if (PARSE_DEBUG > 1) + clib_warning ("[%d]: try to match type %s graph index %d", + recursion_level, type->name, type->rule_index); + recursion_level++; + rv = parse_eval_internal (pm, type->rule_index); + recursion_level--; + + /* Break the recusive unwind here... */ + if (rv == VLIB_PARSE_MATCH_RULE) + { + if (PARSE_DEBUG > 1) + clib_warning ("[%d]: type %s matched", recursion_level, type->name); + + return VLIB_PARSE_MATCH_FULL; + } + else + { + if (PARSE_DEBUG > 1) + clib_warning ("[%d]: type %s returns %U", recursion_level, type->name, + format_vlib_parse_match, rv); + } + return rv; +} + +static int +parse_eval (vlib_parse_main_t * pm, u8 * input) +{ + vlib_lex_token_t *t; + + parse_reset (pm, input); + + /* Tokenize the entire input vector */ + do + { + vec_add2 (pm->tokens, t, 1); + vlib_lex_get_token (pm->lex_main, t); + } + while (t->token != VLIB_LEX_eof); + + /* Feed it to the parser */ + return parse_eval_internal (pm, pm->root_index); +} + +/* Temporary vlib stub */ +vlib_parse_match_t +vlib_parse_eval (u8 * input) +{ + return parse_eval (&vlib_parse_main, input); +} + +u16 +parse_type_find_or_create (vlib_parse_main_t * pm, vlib_parse_type_t * t) +{ + uword *p; + vlib_parse_type_t *n; + u8 *name_copy; + + p = hash_get_mem (pm->parse_type_by_name_hash, t->name); + if (p) + return p[0]; + + pool_get (pm->parse_types, n); + *n = *t; + n->rule_index = (u32) ~ 0; + + name_copy = format (0, "%s%c", n->name, 0); + + hash_set_mem (pm->parse_type_by_name_hash, name_copy, n - pm->parse_types); + return n - pm->parse_types; +} + +u16 +parse_type_find_by_name (vlib_parse_main_t * pm, char *name) +{ + uword *p; + + p = hash_get_mem (pm->parse_type_by_name_hash, name); + if (p) + return p[0]; + + return (u16) ~ 0; +} + +u32 +parse_item_find_or_create (vlib_parse_main_t * pm, vlib_parse_item_t * item) +{ + uword *p; + vlib_parse_item_t *i; + + /* Exact match the entire item */ + p = mhash_get (&pm->parse_item_hash, item); + if (p) + return p[0]; + + pool_get (pm->parse_items, i); + *i = *item; + + mhash_set (&pm->parse_item_hash, i, i - pm->parse_items, 0); + return i - pm->parse_items; +} + +static void +parse_type_and_graph_init (vlib_parse_main_t * pm) +{ + u32 eof_index; + vlib_parse_type_t type; + vlib_parse_item_t item; + + memset (&type, 0, sizeof (type)); + +#define foreach_token_type \ + _ (eof) \ + _ (rule_eof) \ + _ (word) \ + _ (number) \ + _ (plus) \ + _ (minus) \ + _ (star) \ + _ (slash) \ + _ (lpar) \ + _ (rpar) + +#define _(a) a##_type_index = parse_type_find_by_name (pm, #a); + foreach_token_type +#undef _ + memset (&item, 0, sizeof (item)); + item.type = eof_type_index; + + eof_index = parse_item_find_or_create (pm, &item); + pm->root_index = (u32) ~ 0; + +#if 0 + pool_get (pm->parse_graph, g); + memset (g, 0xff, sizeof (*g)); + g->item = eof_index; + pm->root_index = 0; +#endif +} + + + +static void +tokenize (vlib_parse_main_t * pm, parse_registration_t * pr) +{ + vlib_lex_token_t *t; + pm->register_input = format (pm->register_input, + "%s%c", pr->initializer, 0); + + parse_reset (pm, pm->register_input); + + do + { + vec_add2 (pm->tokens, t, 1); + vlib_lex_get_token (pm->lex_main, t); + } + while (t->token != VLIB_LEX_eof); + _vec_len (pm->register_input) = 0; +} + +static int +is_typed_rule (vlib_parse_main_t * pm) +{ + vlib_lex_token_t *t = vec_elt_at_index (pm->tokens, 0); + + /* = blah blah blah */ + if (vec_len (pm->tokens) >= 4 + && t[0].token == VLIB_LEX_lt + && t[1].token == VLIB_LEX_word + && t[2].token == VLIB_LEX_gt && t[3].token == VLIB_LEX_equals) + return 1; + return 0; +} + +static int +token_matches_graph_node (vlib_parse_main_t * pm, + vlib_lex_token_t * t, + vlib_parse_graph_t * node, + vlib_parse_item_t * item, + vlib_parse_type_t * type, u32 * token_increment) +{ + /* EOFs don't match */ + if (t->token == VLIB_LEX_eof) + return 0; + + /* New chain element is a word */ + if (t->token == VLIB_LEX_word) + { + /* but the item in hand is not a word */ + if (item->type != word_type_index) + return 0; + + /* Or it's not this particular word */ + if (strcmp (t->value.as_pointer, item->value.as_pointer)) + return 0; + *token_increment = 1; + return 1; + } + /* New chain element is a type-name: < TYPE-NAME > */ + if (t->token == VLIB_LEX_lt) + { + u16 token_type_index; + + /* < TYPE > */ + if (t[1].token != VLIB_LEX_word || t[2].token != VLIB_LEX_gt) + { + clib_warning (0, "broken type name in '%s'", pm->register_input); + return 0; + } + + token_type_index = parse_type_find_by_name (pm, t[1].value.as_pointer); + if (token_type_index == (u16) ~ 0) + { + clib_warning (0, "unknown type '%s'", t[1].value.as_pointer); + return 0; + } + + /* Its a known type but does not match. */ + if (item->type != token_type_index) + return 0; + + *token_increment = 3; + return 1; + } + clib_warning ("BUG: t->token = %d", t->token); + return 0; +} + +u32 +generate_subgraph_from_tokens (vlib_parse_main_t * pm, + vlib_lex_token_t * t, + u32 * new_subgraph_depth, + parse_registration_t * pr, int not_a_rule) +{ + vlib_parse_graph_t *g, *last_g; + vlib_parse_item_t new_item; + u32 rv = (u32) ~ 0, new_item_index, last_index = (u32) ~ 0; + u16 token_type_index; + u32 depth = 0; + + while (t < pm->tokens + vec_len (pm->tokens)) + { + memset (&new_item, 0, sizeof (new_item)); + + if (t->token == VLIB_LEX_word) + { + new_item.type = word_type_index; + new_item.value.as_pointer = vec_dup ((u8 *) t->value.as_pointer); + new_item_index = parse_item_find_or_create (pm, &new_item); + t++; + } + else if (t->token == VLIB_LEX_lt) + { + if (t[1].token != VLIB_LEX_word || t[2].token != VLIB_LEX_gt) + { + clib_warning ("broken type name in '%s'", pm->register_input); + goto screwed; + } + token_type_index = parse_type_find_by_name (pm, + t[1].value.as_pointer); + if (token_type_index == (u16) ~ 0) + { + clib_warning ("unknown type 2 '%s'", t[1].value.as_pointer); + goto screwed; + } + + new_item.type = token_type_index; + new_item.value.as_pointer = 0; + new_item_index = parse_item_find_or_create (pm, &new_item); + t += 3; /* skip < and > */ + } + else if (t->token == VLIB_LEX_eof) + { + screwed: + new_item.type = not_a_rule ? eof_type_index : rule_eof_type_index; + new_item.value.as_pointer = pr->eof_match; + new_item_index = parse_item_find_or_create (pm, &new_item); + t++; + } + else + { + clib_warning ("unexpected token %U index %d in '%s'", + format_vlib_lex_token, pm->lex_main, t, + t - pm->tokens, pm->register_input); + goto screwed; + } + + pool_get (pm->parse_graph, g); + memset (g, 0xff, sizeof (*g)); + g->item = new_item_index; + depth++; + + if (rv == (u32) ~ 0) + { + rv = g - pm->parse_graph; + last_index = rv; + } + else + { + last_g = pool_elt_at_index (pm->parse_graph, last_index); + last_index = last_g->deeper = g - pm->parse_graph; + } + } + *new_subgraph_depth = depth; + return rv; +} + +static u32 +measure_depth (vlib_parse_main_t * pm, u32 index) +{ + vlib_parse_graph_t *node; + vlib_parse_item_t *item; + u32 max = 0; + u32 depth; + + if (index == (u32) ~ 0) + return 0; + + node = pool_elt_at_index (pm->parse_graph, index); + item = pool_elt_at_index (pm->parse_items, node->item); + + if (item->type == eof_type_index) + return 1; + + while (index != (u32) ~ 0) + { + node = pool_elt_at_index (pm->parse_graph, index); + depth = measure_depth (pm, node->deeper); + if (max < depth) + max = depth; + index = node->peer; + } + + return max + 1; +} + +static void +add_subgraph_to_graph (vlib_parse_main_t * pm, + u32 last_matching_index, + u32 graph_root_index, + u32 new_subgraph_index, u32 new_subgraph_depth) +{ + vlib_parse_graph_t *parent_node; + int new_subgraph_longest = 1; + u32 current_peer_index; + u32 current_depth; + vlib_parse_graph_t *current_peer = 0; + vlib_parse_graph_t *new_subgraph_node = + pool_elt_at_index (pm->parse_graph, new_subgraph_index); + + /* + * Case 1: top-level peer. Splice into the top-level + * peer chain according to rule depth + */ + if (last_matching_index == (u32) ~ 0) + { + u32 index = graph_root_index; + while (1) + { + current_peer = pool_elt_at_index (pm->parse_graph, index); + current_depth = measure_depth (pm, index); + if (current_depth < new_subgraph_depth + || current_peer->peer == (u32) ~ 0) + break; + index = current_peer->peer; + } + new_subgraph_node->peer = current_peer->peer; + current_peer->peer = new_subgraph_index; + return; + } + + parent_node = pool_elt_at_index (pm->parse_graph, last_matching_index); + current_peer_index = parent_node->deeper; + + while (current_peer_index != (u32) ~ 0) + { + current_peer = pool_elt_at_index (pm->parse_graph, current_peer_index); + current_depth = measure_depth (pm, current_peer_index); + if (current_depth < new_subgraph_depth) + break; + new_subgraph_longest = 0; + current_peer_index = current_peer->peer; + } + + ASSERT (current_peer); + + if (new_subgraph_longest) + { + new_subgraph_node->peer = parent_node->deeper; + parent_node->deeper = new_subgraph_index; + } + else + { + new_subgraph_node->peer = current_peer->peer; + current_peer->peer = new_subgraph_index; + } +} + +static clib_error_t * +parse_register_one (vlib_parse_main_t * pm, parse_registration_t * pr) +{ + u32 graph_root_index; + u16 subgraph_type_index = (u16) ~ 0; + vlib_parse_type_t *subgraph_type = 0; + vlib_lex_token_t *t; + vlib_parse_graph_t *node; + u32 node_index, last_index, token_increment, new_subgraph_index; + u32 new_subgraph_depth, last_matching_index; + vlib_parse_item_t *item; + vlib_parse_type_t *type; + + int use_main_graph = 1; + + tokenize (pm, pr); + + /* A typed rule? */ + if (is_typed_rule (pm)) + { + /* Get the type and its current subgraph root, if any */ + t = vec_elt_at_index (pm->tokens, 1); + subgraph_type_index = parse_type_find_by_name (pm, t->value.as_pointer); + if (subgraph_type_index == (u16) ~ 0) + return clib_error_return (0, "undeclared type '%s'", + t->value.as_pointer); + subgraph_type = + pool_elt_at_index (pm->parse_types, subgraph_type_index); + graph_root_index = subgraph_type->rule_index; + /* Skip "mytype> = */ + t += 3; + use_main_graph = 0; + } + else + { + /* top-level graph */ + graph_root_index = pm->root_index; + t = vec_elt_at_index (pm->tokens, 0); + } + + last_matching_index = (u32) ~ 0; + last_index = node_index = graph_root_index; + + /* Find the first token which isn't already being parsed */ + while (t < pm->tokens + vec_len (pm->tokens) && node_index != (u32) ~ 0) + { + node = pool_elt_at_index (pm->parse_graph, node_index); + item = pool_elt_at_index (pm->parse_items, node->item); + type = pool_elt_at_index (pm->parse_types, item->type); + last_index = node_index; + + if (token_matches_graph_node + (pm, t, node, item, type, &token_increment)) + { + t += token_increment; + last_matching_index = node_index; + node_index = node->deeper; + } + else + node_index = node->peer; + } + + new_subgraph_index = + generate_subgraph_from_tokens (pm, t, &new_subgraph_depth, pr, + use_main_graph); + + /* trivial cases: first graph node or first type rule */ + if (graph_root_index == (u32) ~ 0) + { + if (use_main_graph) + pm->root_index = new_subgraph_index; + else + subgraph_type->rule_index = new_subgraph_index; + return 0; + } + + add_subgraph_to_graph (pm, last_matching_index, graph_root_index, + new_subgraph_index, new_subgraph_depth); + return 0; +} + +static clib_error_t * +parse_register (vlib_main_t * vm, + parse_registration_t * lo, + parse_registration_t * hi, vlib_parse_main_t * pm) +{ + parse_registration_t *pr; + + for (pr = lo; pr < hi; pr = vlib_elf_section_data_next (pr, 0)) + vec_add1 (pm->parse_registrations, pr); + + return 0; +} + +static clib_error_t * +parse_register_one_type (vlib_parse_main_t * pm, vlib_parse_type_t * rp) +{ + (void) parse_type_find_or_create (pm, (vlib_parse_type_t *) rp); + return 0; +} + +static clib_error_t * +parse_type_register (vlib_main_t * vm, + vlib_parse_type_t * lo, + vlib_parse_type_t * hi, vlib_parse_main_t * pm) +{ + clib_error_t *error = 0; + vlib_parse_type_t *ptr; + + for (ptr = lo; ptr < hi; ptr = vlib_elf_section_data_next (ptr, 0)) + { + error = parse_register_one_type (pm, ptr); + if (error) + goto done; + } + +done: + return error; +} + +clib_error_t *vlib_stdlex_init (vlib_main_t * vm) __attribute__ ((weak)); +clib_error_t * +vlib_stdlex_init (vlib_main_t * vm) +{ + (void) vlib_lex_add_table ("ignore_everything"); + return 0; +} + +static int +compute_rule_length (parse_registration_t * r) +{ + int length, i; + vlib_parse_main_t *pm = &vlib_parse_main; + + if (r->rule_length) + return r->rule_length; + + length = 0; + + tokenize (pm, r); + length = vec_len (pm->tokens); + + /* Account for " = " in " = bar" etc. */ + if (is_typed_rule (pm)) + length -= 2; + + for (i = 0; i < vec_len (pm->tokens); i++) + { + switch (pm->tokens[i].token) + { + case VLIB_LEX_lt: + case VLIB_LEX_gt: + length -= 1; + + default: + break; + } + } + + ASSERT (length > 0); + r->rule_length = length; + return length; +} + +static int +rule_length_compare (parse_registration_t * r1, parse_registration_t * r2) +{ + compute_rule_length (r1); + compute_rule_length (r2); + /* Descending sort */ + return r2->rule_length - r1->rule_length; +} + + +static clib_error_t * +parse_init (vlib_main_t * vm) +{ + vlib_parse_main_t *pm = &vlib_parse_main; + vlib_lex_main_t *lm = &vlib_lex_main; + vlib_elf_section_bounds_t *b, *bounds; + clib_error_t *error = 0; + parse_registration_t *rule; + int i; + + if ((error = vlib_call_init_function (vm, lex_onetime_init))) + return error; + + if ((error = vlib_stdlex_init (vm))) + return error; + + if ((error = vlib_call_init_function (vm, parse_builtin_init))) + return error; + + pm->vlib_main = vm; + pm->lex_main = lm; + + mhash_init (&pm->parse_item_hash, sizeof (u32), sizeof (vlib_parse_item_t)); + pm->parse_type_by_name_hash = hash_create_string (0, sizeof (u32)); + + vec_validate (pm->parse_value, 16); + vec_validate (pm->tokens, 16); + vec_validate (pm->register_input, 32); + vec_validate (pm->match_items, 16); + + _vec_len (pm->parse_value) = 0; + _vec_len (pm->tokens) = 0; + _vec_len (pm->register_input) = 0; + _vec_len (pm->match_items) = 0; + + bounds = vlib_get_elf_section_bounds (vm, "parse_type_registrations"); + vec_foreach (b, bounds) + { + error = parse_type_register (vm, b->lo, b->hi, pm); + if (error) + break; + } + vec_free (bounds); + + parse_type_and_graph_init (pm); + + bounds = vlib_get_elf_section_bounds (vm, "parse_registrations"); + vec_foreach (b, bounds) + { + error = parse_register (vm, b->lo, b->hi, pm); + if (error) + break; + } + vec_free (bounds); + + vec_sort_with_function (pm->parse_registrations, rule_length_compare); + + for (i = 0; i < vec_len (pm->parse_registrations); i++) + { + rule = pm->parse_registrations[i]; + parse_register_one (pm, rule); + } + + return error; +} + +VLIB_INIT_FUNCTION (parse_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/parse.h b/src/vlib/parse.h new file mode 100644 index 00000000..036e7447 --- /dev/null +++ b/src/vlib/parse.h @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_vlib_parse_h +#define included_vlib_parse_h + +#include +#include +#include + +typedef struct +{ + /* Word aligned value. */ + union + { + u8 as_u8[32 - 1 * sizeof (u16)]; + void *as_pointer; + uword as_uword; + word as_word; + u64 as_u64; + } value; + + /* 16 bit type at end so that 30 bytes of value are aligned. */ + u16 type; +} __attribute ((packed)) + vlib_parse_value_t; + +/* Instance of a type. */ + typedef struct + { + u32 + type; + + u32 + origin; + + u32 + help_index; + + union + { + void * + as_pointer; + uword + as_uword; + } value; + } vlib_parse_item_t; + + typedef struct + { + /* Index of item for this node. */ + u32 + item; + + /* Graph index of peer (sibling) node (linked list of peers). */ + u32 + peer; + + /* Graph index of deeper (child) node (linked list of children). */ + u32 + deeper; + } vlib_parse_graph_t; + +#define foreach_parse_match_type \ + _(MATCH_DONE) \ + _(MATCH_RULE) \ + _(MATCH_FAIL) \ + _(MATCH_FULL) \ + _(MATCH_VALUE) \ + _(MATCH_PARTIAL) \ + _(MATCH_AMBIGUOUS) \ + _(MATCH_EVAL_FAIL) + + typedef enum + { +#define _(a) VLIB_PARSE_##a, + foreach_parse_match_type +#undef _ + } vlib_parse_match_t; + + struct vlib_parse_type; + struct vlib_parse_main; + + typedef + vlib_parse_match_t (vlib_parse_match_function_t) + (struct vlib_parse_main *, + struct vlib_parse_type *, vlib_lex_token_t *, vlib_parse_value_t *); + typedef void (vlib_parse_value_cleanup_function_t) (vlib_parse_value_t + *); + + typedef struct vlib_parse_type + { + /* Type name. */ + char * + name; + + vlib_parse_match_function_t * + match_function; + + vlib_parse_value_cleanup_function_t * + value_cleanup_function; + + format_function_t * + format_value; + + u32 + rule_index; + } vlib_parse_type_t; + + typedef struct + { + char * + initializer; + void * + eof_match; + int + rule_length; + } parse_registration_t; + + typedef struct vlib_parse_main + { + /* (type, origin, help, value) tuples */ + vlib_parse_item_t * + parse_items; + mhash_t + parse_item_hash; + + /* (item, peer, deeper) tuples */ + vlib_parse_graph_t * + parse_graph; + u32 + root_index; + + u8 * + register_input; + + /* parser types */ + vlib_parse_type_t * + parse_types; + uword * + parse_type_by_name_hash; + + /* Vector of MATCH_VALUEs */ + vlib_parse_value_t * + parse_value; + u32 * + match_items; + + /* Parse registrations */ + parse_registration_t ** + parse_registrations; + + /* Token vector */ + vlib_lex_token_t * + tokens; + u32 + current_token_index; + + vlib_lex_main_t * + lex_main; + vlib_main_t * + vlib_main; + } vlib_parse_main_t; + + vlib_parse_main_t + vlib_parse_main; + + typedef + vlib_parse_match_t (vlib_parse_eval_function_t) + (vlib_parse_main_t *, vlib_parse_item_t *, vlib_parse_value_t *); + +vlib_parse_match_t +vlib_parse_eval (u8 * input); + + format_function_t format_vlib_parse_value; + +/* FIXME need these to be global? */ + vlib_parse_match_function_t rule_match, eof_match, word_match, + number_match; + +#define _PARSE_REGISTRATION_DATA(x) \ +VLIB_ELF_SECTION_DATA(x##_registration,parse_registration_t,parse_registrations) + +#define PARSE_INIT(x, s, e) \ +static _PARSE_REGISTRATION_DATA(x) = { \ + .initializer = s, \ + .eof_match = e, \ +}; + +#define _PARSE_TYPE_REGISTRATION_DATA(x) \ +VLIB_ELF_SECTION_DATA(x##_type_registration,vlib_parse_type_t, \ +parse_type_registrations) + +#define PARSE_TYPE_INIT(n, m, c, f) \ +static _PARSE_TYPE_REGISTRATION_DATA(n) = { \ + .name = #n, \ + .match_function = m, \ + .value_cleanup_function = c, \ + .format_value = f, \ +}; + +#endif /* included_vlib_parse_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/parse_builtin.c b/src/vlib/parse_builtin.c new file mode 100644 index 00000000..0ce716b5 --- /dev/null +++ b/src/vlib/parse_builtin.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +always_inline void * +parse_last_match_value (vlib_parse_main_t * pm) +{ + vlib_parse_item_t *i; + i = pool_elt_at_index (pm->parse_items, + vec_elt (pm->match_items, + vec_len (pm->match_items) - 1)); + return i->value.as_pointer; +} + +vlib_parse_match_t +eof_match (vlib_parse_main_t * pm, vlib_parse_type_t * type, + vlib_lex_token_t * t, vlib_parse_value_t * valuep) +{ + return t->token == + VLIB_LEX_eof ? VLIB_PARSE_MATCH_DONE : VLIB_PARSE_MATCH_FAIL; +} + +PARSE_TYPE_INIT (eof, eof_match, 0 /* cleanup value */ , + 0 /* format value */ ); + +vlib_parse_match_t +rule_eof_match (vlib_parse_main_t * pm, vlib_parse_type_t * type, + vlib_lex_token_t * t, vlib_parse_value_t * valuep) +{ + vlib_parse_match_function_t *fp = parse_last_match_value (pm); + pm->current_token_index--; + return fp ? fp (pm, type, t, valuep) : VLIB_PARSE_MATCH_RULE; +} + +PARSE_TYPE_INIT (rule_eof, rule_eof_match, 0, 0); + +vlib_parse_match_t +word_match (vlib_parse_main_t * pm, vlib_parse_type_t * type, + vlib_lex_token_t * t, vlib_parse_value_t * valuep) +{ + u8 *tv, *iv; + int i; + + if (t->token != VLIB_LEX_word) + return VLIB_PARSE_MATCH_FAIL; + + tv = t->value.as_pointer; + iv = parse_last_match_value (pm); + + for (i = 0; tv[i]; i++) + { + if (tv[i] != iv[i]) + return VLIB_PARSE_MATCH_FAIL; + } + + return iv[i] == 0 ? VLIB_PARSE_MATCH_FULL : VLIB_PARSE_MATCH_PARTIAL; +} + +PARSE_TYPE_INIT (word, word_match, 0 /* clnup value */ , + 0 /* format value */ ); + +vlib_parse_match_t +number_match (vlib_parse_main_t * pm, vlib_parse_type_t * type, + vlib_lex_token_t * t, vlib_parse_value_t * valuep) +{ + if (t->token == VLIB_LEX_number) + { + valuep->value.as_uword = t->value.as_uword; + return VLIB_PARSE_MATCH_VALUE; + } + return VLIB_PARSE_MATCH_FAIL; +} + +static u8 * +format_value_number (u8 * s, va_list * args) +{ + vlib_parse_value_t *v = va_arg (*args, vlib_parse_value_t *); + uword a = v->value.as_uword; + + if (BITS (uword) == 64) + s = format (s, "%lld(0x%llx)", a, a); + else + s = format (s, "%ld(0x%lx)", a, a); + return s; +} + +PARSE_TYPE_INIT (number, number_match, 0 /* cln value */ , + format_value_number /* fmt value */ ); + + +#define foreach_vanilla_lex_match_function \ + _(plus) \ + _(minus) \ + _(star) \ + _(slash) \ + _(lpar) \ + _(rpar) + +#define LEX_MATCH_DEBUG 0 + +#define _(name) \ +vlib_parse_match_t name##_match (vlib_parse_main_t *pm, \ + vlib_parse_type_t *type, \ + vlib_lex_token_t *t, \ + vlib_parse_value_t *valuep) \ +{ \ + if (LEX_MATCH_DEBUG > 0) \ + clib_warning ("against %U returns %s", \ + format_vlib_lex_token, pm->lex_main, t, \ + (t->token == VLIB_LEX_##name) \ + ? "VLIB_PARSE_MATCH_FULL" : \ + "VLIB_PARSE_MATCH_FAIL"); \ + if (t->token == VLIB_LEX_##name) \ + return VLIB_PARSE_MATCH_FULL; \ + return VLIB_PARSE_MATCH_FAIL; \ +} \ + \ +PARSE_TYPE_INIT (name, name##_match, 0 /* cln value */, \ + 0 /* fmt val */); + +foreach_vanilla_lex_match_function +#undef _ +/* So we're linked in. */ +static clib_error_t * +parse_builtin_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (parse_builtin_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/pci/linux_pci.c b/src/vlib/pci/linux_pci.c new file mode 100644 index 00000000..f9ee47ac --- /dev/null +++ b/src/vlib/pci/linux_pci.c @@ -0,0 +1,642 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * pci.c: Linux user space PCI bus management. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct +{ + /* /sys/bus/pci/devices/... directory name for this device. */ + u8 *dev_dir_name; + + /* Resource file descriptors. */ + int *resource_fds; + + /* File descriptor for config space read/write. */ + int config_fd; + + /* File descriptor for /dev/uio%d */ + int uio_fd; + + /* Minor device for uio device. */ + u32 uio_minor; + + /* Index given by unix_file_add. */ + u32 unix_file_index; + +} linux_pci_device_t; + +/* Pool of PCI devices. */ +typedef struct +{ + vlib_main_t *vlib_main; + linux_pci_device_t *linux_pci_devices; +} linux_pci_main_t; + +extern linux_pci_main_t linux_pci_main; + +/* Call to allocate/initialize the pci subsystem. + This is not an init function so that users can explicitly enable + pci only when it's needed. */ +clib_error_t *pci_bus_init (vlib_main_t * vm); + +clib_error_t *vlib_pci_bind_to_uio (vlib_pci_device_t * d, + char *uio_driver_name); + +linux_pci_main_t linux_pci_main; + +clib_error_t * +vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name) +{ + clib_error_t *error = 0; + u8 *s = 0; + DIR *dir = 0; + struct dirent *e; + int fd; + u8 *dev_dir_name = format (0, "/sys/bus/pci/devices/%U", + format_vlib_pci_addr, &d->bus_address); + + /* if uio sub-directory exists, we are fine, device is + already bound to UIO driver */ + s = format (s, "%v/uio%c", dev_dir_name, 0); + if (access ((char *) s, F_OK) == 0) + goto done; + vec_reset_length (s); + + /* walk trough all linux interfaces and if interface belonging to + this device is founf check if interface is admin up */ + dir = opendir ("/sys/class/net"); + s = format (s, "%U%c", format_vlib_pci_addr, &d->bus_address, 0); + + if (!dir) + { + error = clib_error_return (0, "Skipping PCI device %U: failed to " + "read /sys/class/net", + format_vlib_pci_addr, &d->bus_address); + goto done; + } + + fd = socket (PF_INET, SOCK_DGRAM, 0); + if (fd < 0) + { + error = clib_error_return_unix (0, "socket"); + goto done; + } + + while ((e = readdir (dir))) + { + struct ifreq ifr; + struct ethtool_drvinfo drvinfo; + + if (e->d_name[0] == '.') /* skip . and .. */ + continue; + + memset (&ifr, 0, sizeof ifr); + memset (&drvinfo, 0, sizeof drvinfo); + ifr.ifr_data = (char *) &drvinfo; + strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1); + drvinfo.cmd = ETHTOOL_GDRVINFO; + if (ioctl (fd, SIOCETHTOOL, &ifr) < 0) + { + /* Some interfaces (eg "lo") don't support this ioctl */ + if ((errno != ENOTSUP) && (errno != ENODEV)) + clib_unix_warning ("ioctl fetch intf %s bus info error", + e->d_name); + continue; + } + + if (strcmp ((char *) s, drvinfo.bus_info)) + continue; + + memset (&ifr, 0, sizeof (ifr)); + strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1); + if (ioctl (fd, SIOCGIFFLAGS, &ifr) < 0) + { + error = clib_error_return_unix (0, "ioctl fetch intf %s flags", + e->d_name); + close (fd); + goto done; + } + + if (ifr.ifr_flags & IFF_UP) + { + error = clib_error_return (0, "Skipping PCI device %U as host " + "interface %s is up", + format_vlib_pci_addr, &d->bus_address, + e->d_name); + close (fd); + goto done; + } + } + + close (fd); + vec_reset_length (s); + + s = format (s, "%v/driver/unbind%c", dev_dir_name, 0); + vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); + vec_reset_length (s); + + s = format (s, "/sys/bus/pci/drivers/%s/new_id%c", uio_driver_name, 0); + vlib_sysfs_write ((char *) s, "0x%04x 0x%04x", d->vendor_id, d->device_id); + vec_reset_length (s); + + s = format (s, "/sys/bus/pci/drivers/%s/bind%c", uio_driver_name, 0); + vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); + +done: + closedir (dir); + vec_free (s); + vec_free (dev_dir_name); + return error; +} + + +static clib_error_t * +scan_uio_dir (void *arg, u8 * path_name, u8 * file_name) +{ + linux_pci_device_t *l = arg; + unformat_input_t input; + + unformat_init_string (&input, (char *) file_name, vec_len (file_name)); + + if (!unformat (&input, "uio%d", &l->uio_minor)) + abort (); + + unformat_free (&input); + return 0; +} + +static clib_error_t * +linux_pci_uio_read_ready (unix_file_t * uf) +{ + vlib_pci_main_t *pm = &pci_main; + vlib_pci_device_t *d; + int __attribute__ ((unused)) rv; + + u32 icount; + rv = read (uf->file_descriptor, &icount, 4); + + d = pool_elt_at_index (pm->pci_devs, uf->private_data); + + if (d->interrupt_handler) + d->interrupt_handler (d); + + vlib_pci_intr_enable (d); + + return /* no error */ 0; +} + +static clib_error_t * +linux_pci_uio_error_ready (unix_file_t * uf) +{ + u32 error_index = (u32) uf->private_data; + + return clib_error_return (0, "pci device %d: error", error_index); +} + +static void +add_device (vlib_pci_device_t * dev, linux_pci_device_t * pdev) +{ + vlib_pci_main_t *pm = &pci_main; + linux_pci_main_t *lpm = &linux_pci_main; + linux_pci_device_t *l; + + pool_get (lpm->linux_pci_devices, l); + l[0] = pdev[0]; + + l->dev_dir_name = vec_dup (l->dev_dir_name); + + dev->os_handle = l - lpm->linux_pci_devices; + + { + u8 *uio_dir = format (0, "%s/uio", l->dev_dir_name); + foreach_directory_file ((char *) uio_dir, scan_uio_dir, l, /* scan_dirs */ + 1); + vec_free (uio_dir); + } + + { + char *uio_name = (char *) format (0, "/dev/uio%d%c", l->uio_minor, 0); + l->uio_fd = open (uio_name, O_RDWR); + if (l->uio_fd < 0) + clib_unix_error ("open `%s'", uio_name); + vec_free (uio_name); + } + + { + unix_file_t template = { 0 }; + unix_main_t *um = &unix_main; + + template.read_function = linux_pci_uio_read_ready; + template.file_descriptor = l->uio_fd; + template.error_function = linux_pci_uio_error_ready; + template.private_data = dev - pm->pci_devs; + + l->unix_file_index = unix_file_add (um, &template); + } +} + +static void +linux_pci_device_free (linux_pci_device_t * l) +{ + int i; + for (i = 0; i < vec_len (l->resource_fds); i++) + if (l->resource_fds[i] > 0) + close (l->resource_fds[i]); + if (l->config_fd > 0) + close (l->config_fd); + if (l->uio_fd > 0) + close (l->uio_fd); + vec_free (l->resource_fds); + vec_free (l->dev_dir_name); +} + +/* Configuration space read/write. */ +clib_error_t * +vlib_pci_read_write_config (vlib_pci_device_t * dev, + vlib_read_or_write_t read_or_write, + uword address, void *data, u32 n_bytes) +{ + linux_pci_main_t *lpm = &linux_pci_main; + linux_pci_device_t *p; + int n; + + p = pool_elt_at_index (lpm->linux_pci_devices, dev->os_handle); + + if (read_or_write == VLIB_READ) + n = pread (p->config_fd, data, n_bytes, address); + else + n = pwrite (p->config_fd, data, n_bytes, address); + + if (n != n_bytes) + return clib_error_return_unix (0, "%s", + read_or_write == VLIB_READ + ? "read" : "write"); + + return 0; +} + +static clib_error_t * +os_map_pci_resource_internal (uword os_handle, + u32 resource, u8 * addr, void **result) +{ + linux_pci_main_t *pm = &linux_pci_main; + linux_pci_device_t *p; + struct stat stat_buf; + u8 *file_name; + int fd; + clib_error_t *error; + int flags = MAP_SHARED; + + error = 0; + p = pool_elt_at_index (pm->linux_pci_devices, os_handle); + + file_name = format (0, "%v/resource%d%c", p->dev_dir_name, resource, 0); + fd = open ((char *) file_name, O_RDWR); + if (fd < 0) + { + error = clib_error_return_unix (0, "open `%s'", file_name); + goto done; + } + + if (fstat (fd, &stat_buf) < 0) + { + error = clib_error_return_unix (0, "fstat `%s'", file_name); + goto done; + } + + vec_validate (p->resource_fds, resource); + p->resource_fds[resource] = fd; + if (addr != 0) + flags |= MAP_FIXED; + + *result = mmap (addr, + /* size */ stat_buf.st_size, + PROT_READ | PROT_WRITE, flags, + /* file */ fd, + /* offset */ 0); + if (*result == (void *) -1) + { + error = clib_error_return_unix (0, "mmap `%s'", file_name); + goto done; + } + +done: + if (error) + { + if (fd >= 0) + close (fd); + } + vec_free (file_name); + return error; +} + +clib_error_t * +vlib_pci_map_resource (vlib_pci_device_t * dev, u32 resource, void **result) +{ + return (os_map_pci_resource_internal + (dev->os_handle, resource, 0 /* addr */ , + result)); +} + +clib_error_t * +vlib_pci_map_resource_fixed (vlib_pci_device_t * dev, + u32 resource, u8 * addr, void **result) +{ + return (os_map_pci_resource_internal + (dev->os_handle, resource, addr, result)); +} + +void +vlib_pci_free_device (vlib_pci_device_t * dev) +{ + linux_pci_main_t *pm = &linux_pci_main; + linux_pci_device_t *l; + + l = pool_elt_at_index (pm->linux_pci_devices, dev->os_handle); + linux_pci_device_free (l); + pool_put (pm->linux_pci_devices, l); +} + +pci_device_registration_t * __attribute__ ((unused)) +pci_device_next_registered (pci_device_registration_t * r) +{ + uword i; + + /* Null vendor id marks end of initialized list. */ + for (i = 0; r->supported_devices[i].vendor_id != 0; i++) + ; + + return clib_elf_section_data_next (r, i * sizeof (r->supported_devices[0])); +} + +static clib_error_t * +init_device_from_registered (vlib_main_t * vm, + vlib_pci_device_t * dev, + linux_pci_device_t * pdev) +{ + vlib_pci_main_t *pm = &pci_main; + pci_device_registration_t *r; + pci_device_id_t *i; + clib_error_t *error; + + r = pm->pci_device_registrations; + + while (r) + { + for (i = r->supported_devices; i->vendor_id != 0; i++) + if (i->vendor_id == dev->vendor_id && i->device_id == dev->device_id) + { + error = vlib_pci_bind_to_uio (dev, "uio_pci_generic"); + if (error) + { + clib_error_report (error); + continue; + } + + add_device (dev, pdev); + dev->interrupt_handler = r->interrupt_handler; + return r->init_function (vm, dev); + } + r = r->next_registration; + } + /* No driver, close the PCI config-space FD */ + close (pdev->config_fd); + return 0; +} + +static clib_error_t * +init_device (vlib_main_t * vm, + vlib_pci_device_t * dev, linux_pci_device_t * pdev) +{ + return init_device_from_registered (vm, dev, pdev); +} + +static clib_error_t * +scan_device (void *arg, u8 * dev_dir_name, u8 * ignored) +{ + vlib_main_t *vm = arg; + vlib_pci_main_t *pm = &pci_main; + int fd; + u8 *f; + clib_error_t *error = 0; + vlib_pci_device_t *dev; + linux_pci_device_t pdev = { 0 }; + u32 tmp; + + f = format (0, "%v/config%c", dev_dir_name, 0); + fd = open ((char *) f, O_RDWR); + + /* Try read-only access if write fails. */ + if (fd < 0) + fd = open ((char *) f, O_RDONLY); + + if (fd < 0) + { + error = clib_error_return_unix (0, "open `%s'", f); + goto done; + } + + pool_get (pm->pci_devs, dev); + + /* You can only read more that 64 bytes of config space as root; so we try to + read the full space but fall back to just the first 64 bytes. */ + if (read (fd, &dev->config_data, sizeof (dev->config_data)) != + sizeof (dev->config_data) + && read (fd, &dev->config0, + sizeof (dev->config0)) != sizeof (dev->config0)) + { + pool_put (pm->pci_devs, dev); + error = clib_error_return_unix (0, "read `%s'", f); + close (fd); + goto done; + } + + { + static pci_config_header_t all_ones; + if (all_ones.vendor_id == 0) + memset (&all_ones, ~0, sizeof (all_ones)); + + if (!memcmp (&dev->config0.header, &all_ones, sizeof (all_ones))) + { + pool_put (pm->pci_devs, dev); + error = clib_error_return (0, "invalid PCI config for `%s'", f); + close (fd); + goto done; + } + } + + if (dev->config0.header.header_type == 0) + pci_config_type0_little_to_host (&dev->config0); + else + pci_config_type1_little_to_host (&dev->config1); + + /* Parse bus, dev, function from directory name. */ + { + unformat_input_t input; + + unformat_init_string (&input, (char *) dev_dir_name, + vec_len (dev_dir_name)); + + if (!unformat (&input, "/sys/bus/pci/devices/%U", + unformat_vlib_pci_addr, &dev->bus_address)) + abort (); + + unformat_free (&input); + + } + + + pdev.config_fd = fd; + pdev.dev_dir_name = dev_dir_name; + + hash_set (pm->pci_dev_index_by_pci_addr, dev->bus_address.as_u32, + dev - pm->pci_devs); + + error = init_device (vm, dev, &pdev); + + vec_reset_length (f); + f = format (f, "%v/vpd%c", dev_dir_name, 0); + fd = open ((char *) f, O_RDONLY); + if (fd >= 0) + { + while (1) + { + u8 tag[3]; + u8 *data = 0; + int len; + + if (read (fd, &tag, 3) != 3) + break; + + if (tag[0] != 0x82 && tag[0] != 0x90 && tag[0] != 0x91) + break; + + len = (tag[2] << 8) | tag[1]; + vec_validate (data, len); + + if (read (fd, data, len) != len) + { + vec_free (data); + break; + } + if (tag[0] == 0x82) + dev->product_name = data; + else if (tag[0] == 0x90) + dev->vpd_r = data; + else if (tag[0] == 0x91) + dev->vpd_w = data; + + data = 0; + } + close (fd); + } + + vec_reset_length (f); + f = format (f, "%v/driver%c", dev_dir_name, 0); + dev->driver_name = vlib_sysfs_link_to_name ((char *) f); + + dev->numa_node = -1; + vec_reset_length (f); + f = format (f, "%v/numa_node%c", dev_dir_name, 0); + vlib_sysfs_read ((char *) f, "%u", &dev->numa_node); + + vec_reset_length (f); + f = format (f, "%v/class%c", dev_dir_name, 0); + vlib_sysfs_read ((char *) f, "0x%x", &tmp); + dev->device_class = tmp >> 8; + + vec_reset_length (f); + f = format (f, "%v/vendor%c", dev_dir_name, 0); + vlib_sysfs_read ((char *) f, "0x%x", &tmp); + dev->vendor_id = tmp; + + vec_reset_length (f); + f = format (f, "%v/device%c", dev_dir_name, 0); + vlib_sysfs_read ((char *) f, "0x%x", &tmp); + dev->device_id = tmp; + +done: + vec_free (f); + return error; +} + +clib_error_t * +linux_pci_init (vlib_main_t * vm) +{ + vlib_pci_main_t *pm = &pci_main; + clib_error_t *error; + + pm->vlib_main = vm; + + if ((error = vlib_call_init_function (vm, unix_input_init))) + return error; + + ASSERT (sizeof (vlib_pci_addr_t) == sizeof (u32)); + pm->pci_dev_index_by_pci_addr = hash_create (0, sizeof (uword)); + + error = foreach_directory_file ("/sys/bus/pci/devices", scan_device, vm, + /* scan_dirs */ 0); + + /* Complain and continue. might not be root, etc. */ + if (error) + clib_error_report (error); + + return error; +} + +VLIB_INIT_FUNCTION (linux_pci_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/pci/pci.c b/src/vlib/pci/pci.c new file mode 100644 index 00000000..7100064d --- /dev/null +++ b/src/vlib/pci/pci.c @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * pci.c: Linux user space PCI bus management. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +vlib_pci_main_t pci_main; + +vlib_pci_device_t * +vlib_get_pci_device (vlib_pci_addr_t * addr) +{ + vlib_pci_main_t *pm = &pci_main; + uword *p; + p = hash_get (pm->pci_dev_index_by_pci_addr, addr->as_u32); + + if (p == 0) + return 0; + + return vec_elt_at_index (pm->pci_devs, p[0]); +} + +static clib_error_t * +show_pci_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_pci_main_t *pm = &pci_main; + vlib_pci_device_t *d; + int show_all = 0; + u8 *s = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "all")) + show_all = 1; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + vlib_cli_output (vm, "%-13s%-5s%-12s%-13s%-16s%-32s%s", + "Address", "Sock", "VID:PID", "Link Speed", "Driver", + "Product Name", "Vital Product Data"); + + /* *INDENT-OFF* */ + pool_foreach (d, pm->pci_devs, ({ + + if (d->device_class != PCI_CLASS_NETWORK_ETHERNET && !show_all) + continue; + + vec_reset_length (s); + + if (d->numa_node >= 0) + s = format (s, " %d", d->numa_node); + + vlib_cli_output (vm, "%-13U%-5v%04x:%04x %-13U%-16s%-32v%U", + format_vlib_pci_addr, &d->bus_address, s, + d->vendor_id, d->device_id, + format_vlib_pci_link_speed, d, + d->driver_name ? (char *) d->driver_name : "", + d->product_name, + format_vlib_pci_vpd, d->vpd_r, 0); + })); +/* *INDENT-ON* */ + + vec_free (s); + return 0; +} + +uword +unformat_vlib_pci_addr (unformat_input_t * input, va_list * args) +{ + vlib_pci_addr_t *addr = va_arg (*args, vlib_pci_addr_t *); + u32 x[4]; + + if (!unformat (input, "%x:%x:%x.%x", &x[0], &x[1], &x[2], &x[3])) + return 0; + + addr->domain = x[0]; + addr->bus = x[1]; + addr->slot = x[2]; + addr->function = x[3]; + + return 1; +} + +u8 * +format_vlib_pci_addr (u8 * s, va_list * va) +{ + vlib_pci_addr_t *addr = va_arg (*va, vlib_pci_addr_t *); + return format (s, "%04x:%02x:%02x.%x", addr->domain, addr->bus, + addr->slot, addr->function); +} + +u8 * +format_vlib_pci_handle (u8 * s, va_list * va) +{ + vlib_pci_addr_t *addr = va_arg (*va, vlib_pci_addr_t *); + return format (s, "%x/%x/%x", addr->bus, addr->slot, addr->function); +} + +u8 * +format_vlib_pci_link_speed (u8 * s, va_list * va) +{ + vlib_pci_device_t *d = va_arg (*va, vlib_pci_device_t *); + pcie_config_regs_t *r = + pci_config_find_capability (&d->config0, PCI_CAP_ID_PCIE); + int width; + + if (!r) + return format (s, "unknown"); + + width = (r->link_status >> 4) & 0x3f; + + if ((r->link_status & 0xf) == 1) + return format (s, "2.5 GT/s x%u", width); + if ((r->link_status & 0xf) == 2) + return format (s, "5.0 GT/s x%u", width); + if ((r->link_status & 0xf) == 3) + return format (s, "8.0 GT/s x%u", width); + return format (s, "unknown"); +} + +u8 * +format_vlib_pci_vpd (u8 * s, va_list * args) +{ + u8 *data = va_arg (*args, u8 *); + u8 *id = va_arg (*args, u8 *); + uword indent = format_get_indent (s); + char *string_types[] = { "PN", "EC", "SN", "MN", 0 }; + uword p = 0; + int first_line = 1; + + if (vec_len (data) < 3) + return s; + + while (p + 3 < vec_len (data)) + { + + if (data[p] == 0 && data[p + 1] == 0) + return s; + + if (p + data[p + 2] > vec_len (data)) + return s; + + if (id == 0) + { + int is_string = 0; + char **c = string_types; + + while (c[0]) + { + if (*(u16 *) & data[p] == *(u16 *) c[0]) + is_string = 1; + c++; + } + + if (data[p + 2]) + { + if (!first_line) + s = format (s, "\n%U", format_white_space, indent); + else + { + first_line = 0; + s = format (s, " "); + } + + s = format (s, "%c%c: ", data[p], data[p + 1]); + if (is_string) + vec_add (s, data + p + 3, data[p + 2]); + else + { + int i; + const int max_bytes = 8; + s = format (s, "0x"); + for (i = 0; i < clib_min (data[p + 2], max_bytes); i++) + s = format (s, " %02x", data[p + 3 + i]); + + if (data[p + 2] > max_bytes) + s = format (s, " ..."); + } + } + } + else if (*(u16 *) & data[p] == *(u16 *) id) + { + vec_add (s, data + p + 3, data[p + 2]); + return s; + } + + p += 3 + data[p + 2]; + } + + return s; +} + + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_pci_command, static) = { + .path = "show pci", + .short_help = "show pci [all]", + .function = show_pci_fn, +}; +/* *INDENT-ON* */ + +clib_error_t * +pci_bus_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (pci_bus_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/pci/pci.h b/src/vlib/pci/pci.h new file mode 100644 index 00000000..811a6ff2 --- /dev/null +++ b/src/vlib/pci/pci.h @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * pci.h: PCI definitions. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_pci_h +#define included_vlib_pci_h + +#include +#include + +typedef CLIB_PACKED (union + { + struct + { +u16 domain; u8 bus; u8 slot: 5; u8 function:3;}; + u32 as_u32;}) vlib_pci_addr_t; + +typedef struct vlib_pci_device +{ + /* Operating system handle for this device. */ + uword os_handle; + + vlib_pci_addr_t bus_address; + + /* First 64 bytes of configuration space. */ + union + { + pci_config_type0_regs_t config0; + pci_config_type1_regs_t config1; + u8 config_data[256]; + }; + + /* Interrupt handler */ + void (*interrupt_handler) (struct vlib_pci_device * dev); + + /* Driver name */ + u8 *driver_name; + + /* Numa Node */ + int numa_node; + + /* Device data */ + u16 device_class; + u16 vendor_id; + u16 device_id; + + /* Vital Product Data */ + u8 *product_name; + u8 *vpd_r; + u8 *vpd_w; + + /* Private data */ + uword private_data; + +} vlib_pci_device_t; + +typedef struct +{ + u16 vendor_id, device_id; +} pci_device_id_t; + +typedef struct _pci_device_registration +{ + /* Driver init function. */ + clib_error_t *(*init_function) (vlib_main_t * vm, vlib_pci_device_t * dev); + + /* Interrupt handler */ + void (*interrupt_handler) (vlib_pci_device_t * dev); + + /* List of registrations */ + struct _pci_device_registration *next_registration; + + /* Vendor/device ids supported by this driver. */ + pci_device_id_t supported_devices[]; +} pci_device_registration_t; + +/* Pool of PCI devices. */ +typedef struct +{ + vlib_main_t *vlib_main; + vlib_pci_device_t *pci_devs; + pci_device_registration_t *pci_device_registrations; + uword *pci_dev_index_by_pci_addr; +} vlib_pci_main_t; + +extern vlib_pci_main_t pci_main; + +#define PCI_REGISTER_DEVICE(x,...) \ + __VA_ARGS__ pci_device_registration_t x; \ +static void __vlib_add_pci_device_registration_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_add_pci_device_registration_##x (void) \ +{ \ + vlib_pci_main_t * pm = &pci_main; \ + x.next_registration = pm->pci_device_registrations; \ + pm->pci_device_registrations = &x; \ +} \ +__VA_ARGS__ pci_device_registration_t x + +clib_error_t *vlib_pci_bind_to_uio (vlib_pci_device_t * d, + char *uio_driver_name); + +/* Configuration space read/write. */ +clib_error_t *vlib_pci_read_write_config (vlib_pci_device_t * dev, + vlib_read_or_write_t read_or_write, + uword address, + void *data, u32 n_bytes); + +#define _(t) \ +static inline clib_error_t * \ +vlib_pci_read_config_##t (vlib_pci_device_t * dev, \ + uword address, t * data) \ +{ \ + return vlib_pci_read_write_config (dev, VLIB_READ,address, data, \ + sizeof (data[0])); \ +} + +_(u32); +_(u16); +_(u8); + +#undef _ + +#define _(t) \ +static inline clib_error_t * \ +vlib_pci_write_config_##t (vlib_pci_device_t * dev, uword address, \ + t * data) \ +{ \ + return vlib_pci_read_write_config (dev, VLIB_WRITE, \ + address, data, sizeof (data[0])); \ +} + +_(u32); +_(u16); +_(u8); + +#undef _ + +static inline clib_error_t * +vlib_pci_intr_enable (vlib_pci_device_t * dev) +{ + u16 command; + clib_error_t *err; + + err = vlib_pci_read_config_u16 (dev, 4, &command); + + if (err) + return err; + + command &= ~PCI_COMMAND_INTX_DISABLE; + + return vlib_pci_write_config_u16 (dev, 4, &command); +} + +static inline clib_error_t * +vlib_pci_intr_disable (vlib_pci_device_t * dev) +{ + u16 command; + clib_error_t *err; + + err = vlib_pci_read_config_u16 (dev, 4, &command); + + if (err) + return err; + + command |= PCI_COMMAND_INTX_DISABLE; + + return vlib_pci_write_config_u16 (dev, 4, &command); +} + +static inline clib_error_t * +vlib_pci_bus_master_enable (vlib_pci_device_t * dev) +{ + clib_error_t *err; + u16 command; + + /* Set bus master enable (BME) */ + err = vlib_pci_read_config_u16 (dev, 4, &command); + + if (err) + return err; + + if (!(command & PCI_COMMAND_BUS_MASTER)) + return 0; + + command |= PCI_COMMAND_BUS_MASTER; + + return vlib_pci_write_config_u16 (dev, 4, &command); +} + +clib_error_t *vlib_pci_map_resource (vlib_pci_device_t * dev, u32 resource, + void **result); + +clib_error_t *vlib_pci_map_resource_fixed (vlib_pci_device_t * dev, + u32 resource, u8 * addr, + void **result); + +vlib_pci_device_t *vlib_get_pci_device (vlib_pci_addr_t * addr); +/* Free's device. */ +void vlib_pci_free_device (vlib_pci_device_t * dev); + +unformat_function_t unformat_vlib_pci_addr; +format_function_t format_vlib_pci_addr; +format_function_t format_vlib_pci_handle; +format_function_t format_vlib_pci_link_speed; +format_function_t format_vlib_pci_vpd; + +#endif /* included_vlib_pci_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/pci/pci_config.h b/src/vlib/pci/pci_config.h new file mode 100644 index 00000000..92e56af6 --- /dev/null +++ b/src/vlib/pci/pci_config.h @@ -0,0 +1,731 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * pci.h: PCI definitions. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_pci_config_h +#define included_vlib_pci_config_h + +#include +#include + +typedef enum +{ + PCI_CLASS_NOT_DEFINED = 0x0000, + PCI_CLASS_NOT_DEFINED_VGA = 0x0001, + + PCI_CLASS_STORAGE_SCSI = 0x0100, + PCI_CLASS_STORAGE_IDE = 0x0101, + PCI_CLASS_STORAGE_FLOPPY = 0x0102, + PCI_CLASS_STORAGE_IPI = 0x0103, + PCI_CLASS_STORAGE_RAID = 0x0104, + PCI_CLASS_STORAGE_OTHER = 0x0180, + PCI_CLASS_STORAGE = 0x0100, + + PCI_CLASS_NETWORK_ETHERNET = 0x0200, + PCI_CLASS_NETWORK_TOKEN_RING = 0x0201, + PCI_CLASS_NETWORK_FDDI = 0x0202, + PCI_CLASS_NETWORK_ATM = 0x0203, + PCI_CLASS_NETWORK_OTHER = 0x0280, + PCI_CLASS_NETWORK = 0x0200, + + PCI_CLASS_DISPLAY_VGA = 0x0300, + PCI_CLASS_DISPLAY_XGA = 0x0301, + PCI_CLASS_DISPLAY_3D = 0x0302, + PCI_CLASS_DISPLAY_OTHER = 0x0380, + PCI_CLASS_DISPLAY = 0x0300, + + PCI_CLASS_MULTIMEDIA_VIDEO = 0x0400, + PCI_CLASS_MULTIMEDIA_AUDIO = 0x0401, + PCI_CLASS_MULTIMEDIA_PHONE = 0x0402, + PCI_CLASS_MULTIMEDIA_OTHER = 0x0480, + PCI_CLASS_MULTIMEDIA = 0x0400, + + PCI_CLASS_MEMORY_RAM = 0x0500, + PCI_CLASS_MEMORY_FLASH = 0x0501, + PCI_CLASS_MEMORY_OTHER = 0x0580, + PCI_CLASS_MEMORY = 0x0500, + + PCI_CLASS_BRIDGE_HOST = 0x0600, + PCI_CLASS_BRIDGE_ISA = 0x0601, + PCI_CLASS_BRIDGE_EISA = 0x0602, + PCI_CLASS_BRIDGE_MC = 0x0603, + PCI_CLASS_BRIDGE_PCI = 0x0604, + PCI_CLASS_BRIDGE_PCMCIA = 0x0605, + PCI_CLASS_BRIDGE_NUBUS = 0x0606, + PCI_CLASS_BRIDGE_CARDBUS = 0x0607, + PCI_CLASS_BRIDGE_RACEWAY = 0x0608, + PCI_CLASS_BRIDGE_OTHER = 0x0680, + PCI_CLASS_BRIDGE = 0x0600, + + PCI_CLASS_COMMUNICATION_SERIAL = 0x0700, + PCI_CLASS_COMMUNICATION_PARALLEL = 0x0701, + PCI_CLASS_COMMUNICATION_MULTISERIAL = 0x0702, + PCI_CLASS_COMMUNICATION_MODEM = 0x0703, + PCI_CLASS_COMMUNICATION_OTHER = 0x0780, + PCI_CLASS_COMMUNICATION = 0x0700, + + PCI_CLASS_SYSTEM_PIC = 0x0800, + PCI_CLASS_SYSTEM_DMA = 0x0801, + PCI_CLASS_SYSTEM_TIMER = 0x0802, + PCI_CLASS_SYSTEM_RTC = 0x0803, + PCI_CLASS_SYSTEM_PCI_HOTPLUG = 0x0804, + PCI_CLASS_SYSTEM_OTHER = 0x0880, + PCI_CLASS_SYSTEM = 0x0800, + + PCI_CLASS_INPUT_KEYBOARD = 0x0900, + PCI_CLASS_INPUT_PEN = 0x0901, + PCI_CLASS_INPUT_MOUSE = 0x0902, + PCI_CLASS_INPUT_SCANNER = 0x0903, + PCI_CLASS_INPUT_GAMEPORT = 0x0904, + PCI_CLASS_INPUT_OTHER = 0x0980, + PCI_CLASS_INPUT = 0x0900, + + PCI_CLASS_DOCKING_GENERIC = 0x0a00, + PCI_CLASS_DOCKING_OTHER = 0x0a80, + PCI_CLASS_DOCKING = 0x0a00, + + PCI_CLASS_PROCESSOR_386 = 0x0b00, + PCI_CLASS_PROCESSOR_486 = 0x0b01, + PCI_CLASS_PROCESSOR_PENTIUM = 0x0b02, + PCI_CLASS_PROCESSOR_ALPHA = 0x0b10, + PCI_CLASS_PROCESSOR_POWERPC = 0x0b20, + PCI_CLASS_PROCESSOR_MIPS = 0x0b30, + PCI_CLASS_PROCESSOR_CO = 0x0b40, + PCI_CLASS_PROCESSOR = 0x0b00, + + PCI_CLASS_SERIAL_FIREWIRE = 0x0c00, + PCI_CLASS_SERIAL_ACCESS = 0x0c01, + PCI_CLASS_SERIAL_SSA = 0x0c02, + PCI_CLASS_SERIAL_USB = 0x0c03, + PCI_CLASS_SERIAL_FIBER = 0x0c04, + PCI_CLASS_SERIAL_SMBUS = 0x0c05, + PCI_CLASS_SERIAL = 0x0c00, + + PCI_CLASS_INTELLIGENT_I2O = 0x0e00, + PCI_CLASS_INTELLIGENT = 0x0e00, + + PCI_CLASS_SATELLITE_TV = 0x0f00, + PCI_CLASS_SATELLITE_AUDIO = 0x0f01, + PCI_CLASS_SATELLITE_VOICE = 0x0f03, + PCI_CLASS_SATELLITE_DATA = 0x0f04, + PCI_CLASS_SATELLITE = 0x0f00, + + PCI_CLASS_CRYPT_NETWORK = 0x1000, + PCI_CLASS_CRYPT_ENTERTAINMENT = 0x1001, + PCI_CLASS_CRYPT_OTHER = 0x1080, + PCI_CLASS_CRYPT = 0x1000, + + PCI_CLASS_SP_DPIO = 0x1100, + PCI_CLASS_SP_OTHER = 0x1180, + PCI_CLASS_SP = 0x1100, +} pci_device_class_t; + +static inline pci_device_class_t +pci_device_class_base (pci_device_class_t c) +{ + return c & ~0xff; +} + +/* + * Under PCI, each device has 256 bytes of configuration address space, + * of which the first 64 bytes are standardized as follows: + */ +typedef struct +{ + u16 vendor_id; + u16 device_id; + + u16 command; +#define PCI_COMMAND_IO (1 << 0) /* Enable response in I/O space */ +#define PCI_COMMAND_MEMORY (1 << 1) /* Enable response in Memory space */ +#define PCI_COMMAND_BUS_MASTER (1 << 2) /* Enable bus mastering */ +#define PCI_COMMAND_SPECIAL (1 << 3) /* Enable response to special cycles */ +#define PCI_COMMAND_WRITE_INVALIDATE (1 << 4) /* Use memory write and invalidate */ +#define PCI_COMMAND_VGA_PALETTE_SNOOP (1 << 5) +#define PCI_COMMAND_PARITY (1 << 6) +#define PCI_COMMAND_WAIT (1 << 7) /* Enable address/data stepping */ +#define PCI_COMMAND_SERR (1 << 8) /* Enable SERR */ +#define PCI_COMMAND_BACK_TO_BACK_WRITE (1 << 9) +#define PCI_COMMAND_INTX_DISABLE (1 << 10) /* INTx Emulation Disable */ + + u16 status; +#define PCI_STATUS_INTX_PENDING (1 << 3) +#define PCI_STATUS_CAPABILITY_LIST (1 << 4) +#define PCI_STATUS_66MHZ (1 << 5) /* Support 66 Mhz PCI 2.1 bus */ +#define PCI_STATUS_UDF (1 << 6) /* Support User Definable Features (obsolete) */ +#define PCI_STATUS_BACK_TO_BACK_WRITE (1 << 7) /* Accept fast-back to back */ +#define PCI_STATUS_PARITY_ERROR (1 << 8) /* Detected parity error */ +#define PCI_STATUS_DEVSEL_GET(x) ((x >> 9) & 3) /* DEVSEL timing */ +#define PCI_STATUS_DEVSEL_FAST (0 << 9) +#define PCI_STATUS_DEVSEL_MEDIUM (1 << 9) +#define PCI_STATUS_DEVSEL_SLOW (2 << 9) +#define PCI_STATUS_SIG_TARGET_ABORT (1 << 11) /* Set on target abort */ +#define PCI_STATUS_REC_TARGET_ABORT (1 << 12) /* Master ack of " */ +#define PCI_STATUS_REC_MASTER_ABORT (1 << 13) /* Set on master abort */ +#define PCI_STATUS_SIG_SYSTEM_ERROR (1 << 14) /* Set when we drive SERR */ +#define PCI_STATUS_DETECTED_PARITY_ERROR (1 << 15) + + u8 revision_id; + u8 programming_interface_class; /* Reg. Level Programming Interface */ + + pci_device_class_t device_class:16; + + u8 cache_size; + u8 latency_timer; + + u8 header_type; +#define PCI_HEADER_TYPE_NORMAL 0 +#define PCI_HEADER_TYPE_BRIDGE 1 +#define PCI_HEADER_TYPE_CARDBUS 2 + + u8 bist; +#define PCI_BIST_CODE_MASK 0x0f /* Return result */ +#define PCI_BIST_START 0x40 /* 1 to start BIST, 2 secs or less */ +#define PCI_BIST_CAPABLE 0x80 /* 1 if BIST capable */ +} pci_config_header_t; + +/* Byte swap config header. */ +always_inline void +pci_config_header_little_to_host (pci_config_header_t * r) +{ + if (!CLIB_ARCH_IS_BIG_ENDIAN) + return; +#define _(f,t) r->f = clib_byte_swap_##t (r->f) + _(vendor_id, u16); + _(device_id, u16); + _(command, u16); + _(status, u16); + _(device_class, u16); +#undef _ +} + +/* Header type 0 (normal devices) */ +typedef struct +{ + pci_config_header_t header; + + /* + * Base addresses specify locations in memory or I/O space. + * Decoded size can be determined by writing a value of + * 0xffffffff to the register, and reading it back. Only + * 1 bits are decoded. + */ + u32 base_address[6]; + + u16 cardbus_cis; + + u16 subsystem_vendor_id; + u16 subsystem_id; + + u32 rom_address; +#define PCI_ROM_ADDRESS 0x30 /* Bits 31..11 are address, 10..1 reserved */ +#define PCI_ROM_ADDRESS_ENABLE 0x01 +#define PCI_ROM_ADDRESS_MASK (~0x7ffUL) + + u8 first_capability_offset; + CLIB_PAD_FROM_TO (0x35, 0x3c); + + u8 interrupt_line; + u8 interrupt_pin; + u8 min_grant; + u8 max_latency; + + u8 capability_data[0]; +} pci_config_type0_regs_t; + +always_inline void +pci_config_type0_little_to_host (pci_config_type0_regs_t * r) +{ + int i; + if (!CLIB_ARCH_IS_BIG_ENDIAN) + return; + pci_config_header_little_to_host (&r->header); +#define _(f,t) r->f = clib_byte_swap_##t (r->f) + for (i = 0; i < ARRAY_LEN (r->base_address); i++) + _(base_address[i], u32); + _(cardbus_cis, u16); + _(subsystem_vendor_id, u16); + _(subsystem_id, u16); + _(rom_address, u32); +#undef _ +} + +/* Header type 1 (PCI-to-PCI bridges) */ +typedef struct +{ + pci_config_header_t header; + + u32 base_address[2]; + + /* Primary/secondary bus number. */ + u8 primary_bus; + u8 secondary_bus; + + /* Highest bus number behind the bridge */ + u8 subordinate_bus; + + u8 secondary_bus_latency_timer; + + /* I/O range behind bridge. */ + u8 io_base, io_limit; + + /* Secondary status register, only bit 14 used */ + u16 secondary_status; + + /* Memory range behind bridge in units of 64k bytes. */ + u16 memory_base, memory_limit; +#define PCI_MEMORY_RANGE_TYPE_MASK 0x0fUL +#define PCI_MEMORY_RANGE_MASK (~0x0fUL) + + u16 prefetchable_memory_base, prefetchable_memory_limit; +#define PCI_PREF_RANGE_TYPE_MASK 0x0fUL +#define PCI_PREF_RANGE_TYPE_32 0x00 +#define PCI_PREF_RANGE_TYPE_64 0x01 +#define PCI_PREF_RANGE_MASK (~0x0fUL) + + u32 prefetchable_memory_base_upper_32bits; + u32 prefetchable_memory_limit_upper_32bits; + u16 io_base_upper_16bits; + u16 io_limit_upper_16bits; + + /* Same as for type 0. */ + u8 capability_list_offset; + CLIB_PAD_FROM_TO (0x35, 0x37); + + u32 rom_address; + CLIB_PAD_FROM_TO (0x3c, 0x3e); + + u16 bridge_control; +#define PCI_BRIDGE_CTL_PARITY 0x01 /* Enable parity detection on secondary interface */ +#define PCI_BRIDGE_CTL_SERR 0x02 /* The same for SERR forwarding */ +#define PCI_BRIDGE_CTL_NO_ISA 0x04 /* Disable bridging of ISA ports */ +#define PCI_BRIDGE_CTL_VGA 0x08 /* Forward VGA addresses */ +#define PCI_BRIDGE_CTL_MASTER_ABORT 0x20 /* Report master aborts */ +#define PCI_BRIDGE_CTL_BUS_RESET 0x40 /* Secondary bus reset */ +#define PCI_BRIDGE_CTL_FAST_BACK 0x80 /* Fast Back2Back enabled on secondary interface */ + + u8 capability_data[0]; +} pci_config_type1_regs_t; + +always_inline void +pci_config_type1_little_to_host (pci_config_type1_regs_t * r) +{ + int i; + if (!CLIB_ARCH_IS_BIG_ENDIAN) + return; + pci_config_header_little_to_host (&r->header); +#define _(f,t) r->f = clib_byte_swap_##t (r->f) + for (i = 0; i < ARRAY_LEN (r->base_address); i++) + _(base_address[i], u32); + _(secondary_status, u16); + _(memory_base, u16); + _(memory_limit, u16); + _(prefetchable_memory_base, u16); + _(prefetchable_memory_limit, u16); + _(prefetchable_memory_base_upper_32bits, u32); + _(prefetchable_memory_limit_upper_32bits, u32); + _(io_base_upper_16bits, u16); + _(io_limit_upper_16bits, u16); + _(rom_address, u32); + _(bridge_control, u16); +#undef _ +} + +/* Capabilities. */ +typedef enum pci_capability_type +{ + /* Power Management */ + PCI_CAP_ID_PM = 1, + + /* Accelerated Graphics Port */ + PCI_CAP_ID_AGP = 2, + + /* Vital Product Data */ + PCI_CAP_ID_VPD = 3, + + /* Slot Identification */ + PCI_CAP_ID_SLOTID = 4, + + /* Message Signalled Interrupts */ + PCI_CAP_ID_MSI = 5, + + /* CompactPCI HotSwap */ + PCI_CAP_ID_CHSWP = 6, + + /* PCI-X */ + PCI_CAP_ID_PCIX = 7, + + /* Hypertransport. */ + PCI_CAP_ID_HYPERTRANSPORT = 8, + + /* PCI Standard Hot-Plug Controller */ + PCI_CAP_ID_SHPC = 0xc, + + /* PCI Express */ + PCI_CAP_ID_PCIE = 0x10, + + /* MSI-X */ + PCI_CAP_ID_MSIX = 0x11, +} pci_capability_type_t; + +/* Common header for capabilities. */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + enum pci_capability_type type:8; + u8 next_offset;}) pci_capability_regs_t; +/* *INDENT-ON* */ + +always_inline void * +pci_config_find_capability (pci_config_type0_regs_t * t, int cap_type) +{ + pci_capability_regs_t *c; + u32 next_offset; + u32 ttl = 48; + + if (!(t->header.status & PCI_STATUS_CAPABILITY_LIST)) + return 0; + + next_offset = t->first_capability_offset; + while (ttl-- && next_offset >= 0x40) + { + c = (void *) t + (next_offset & ~3); + if ((u8) c->type == 0xff) + break; + if (c->type == cap_type) + return c; + next_offset = c->next_offset; + } + return 0; +} + +/* Power Management Registers */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u16 capabilities; +#define PCI_PM_CAP_VER_MASK 0x0007 /* Version */ +#define PCI_PM_CAP_PME_CLOCK 0x0008 /* PME clock required */ +#define PCI_PM_CAP_RESERVED 0x0010 /* Reserved field */ +#define PCI_PM_CAP_DSI 0x0020 /* Device specific initialization */ +#define PCI_PM_CAP_AUX_POWER 0x01C0 /* Auxilliary power support mask */ +#define PCI_PM_CAP_D1 0x0200 /* D1 power state support */ +#define PCI_PM_CAP_D2 0x0400 /* D2 power state support */ +#define PCI_PM_CAP_PME 0x0800 /* PME pin supported */ +#define PCI_PM_CAP_PME_MASK 0xF800 /* PME Mask of all supported states */ +#define PCI_PM_CAP_PME_D0 0x0800 /* PME# from D0 */ +#define PCI_PM_CAP_PME_D1 0x1000 /* PME# from D1 */ +#define PCI_PM_CAP_PME_D2 0x2000 /* PME# from D2 */ +#define PCI_PM_CAP_PME_D3 0x4000 /* PME# from D3 (hot) */ +#define PCI_PM_CAP_PME_D3cold 0x8000 /* PME# from D3 (cold) */ + u16 control; +#define PCI_PM_CTRL_STATE_MASK 0x0003 /* Current power state (D0 to D3) */ +#define PCI_PM_CTRL_PME_ENABLE 0x0100 /* PME pin enable */ +#define PCI_PM_CTRL_DATA_SEL_MASK 0x1e00 /* Data select (??) */ +#define PCI_PM_CTRL_DATA_SCALE_MASK 0x6000 /* Data scale (??) */ +#define PCI_PM_CTRL_PME_STATUS 0x8000 /* PME pin status */ + u8 extensions; +#define PCI_PM_PPB_B2_B3 0x40 /* Stop clock when in D3hot (??) */ +#define PCI_PM_BPCC_ENABLE 0x80 /* Bus power/clock control enable (??) */ + u8 data;}) pci_power_management_regs_t; +/* *INDENT-ON* */ + +/* AGP registers */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u8 version; + u8 rest_of_capability_flags; u32 status; u32 command; + /* Command & status common bits. */ +#define PCI_AGP_RQ_MASK 0xff000000 /* Maximum number of requests - 1 */ +#define PCI_AGP_SBA 0x0200 /* Sideband addressing supported */ +#define PCI_AGP_64BIT 0x0020 /* 64-bit addressing supported */ +#define PCI_AGP_ALLOW_TRANSACTIONS 0x0100 /* Allow processing of AGP transactions */ +#define PCI_AGP_FW 0x0010 /* FW transfers supported/forced */ +#define PCI_AGP_RATE4 0x0004 /* 4x transfer rate supported */ +#define PCI_AGP_RATE2 0x0002 /* 2x transfer rate supported */ +#define PCI_AGP_RATE1 0x0001 /* 1x transfer rate supported */ + }) pci_agp_regs_t; +/* *INDENT-ON* */ + +/* Vital Product Data */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u16 address; +#define PCI_VPD_ADDR_MASK 0x7fff /* Address mask */ +#define PCI_VPD_ADDR_F 0x8000 /* Write 0, 1 indicates completion */ + u32 data;}) pci_vpd_regs_t; +/* *INDENT-ON* */ + +/* Slot Identification */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u8 esr; +#define PCI_SID_ESR_NSLOTS 0x1f /* Number of expansion slots available */ +#define PCI_SID_ESR_FIC 0x20 /* First In Chassis Flag */ + u8 chassis;}) pci_sid_regs_t; +/* *INDENT-ON* */ + +/* Message Signalled Interrupts registers */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u16 flags; +#define PCI_MSI_FLAGS_ENABLE (1 << 0) /* MSI feature enabled */ +#define PCI_MSI_FLAGS_GET_MAX_QUEUE_SIZE(x) ((x >> 1) & 0x7) +#define PCI_MSI_FLAGS_MAX_QUEUE_SIZE(x) (((x) & 0x7) << 1) +#define PCI_MSI_FLAGS_GET_QUEUE_SIZE(x) ((x >> 4) & 0x7) +#define PCI_MSI_FLAGS_QUEUE_SIZE(x) (((x) & 0x7) << 4) +#define PCI_MSI_FLAGS_64BIT (1 << 7) /* 64-bit addresses allowed */ +#define PCI_MSI_FLAGS_MASKBIT (1 << 8) /* 64-bit mask bits allowed */ + u32 address; u32 data; u32 mask_bits;}) pci_msi32_regs_t; +/* *INDENT-ON* */ + +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u16 flags; + u32 address[2]; + u32 data; u32 mask_bits;}) pci_msi64_regs_t; +/* *INDENT-ON* */ + +/* CompactPCI Hotswap Register */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u16 control_status; +#define PCI_CHSWP_DHA 0x01 /* Device Hiding Arm */ +#define PCI_CHSWP_EIM 0x02 /* ENUM# Signal Mask */ +#define PCI_CHSWP_PIE 0x04 /* Pending Insert or Extract */ +#define PCI_CHSWP_LOO 0x08 /* LED On / Off */ +#define PCI_CHSWP_PI 0x30 /* Programming Interface */ +#define PCI_CHSWP_EXT 0x40 /* ENUM# status - extraction */ +#define PCI_CHSWP_INS 0x80 /* ENUM# status - insertion */ + }) pci_chswp_regs_t; +/* *INDENT-ON* */ + +/* PCIX registers */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u16 command; +#define PCIX_CMD_DPERR_E 0x0001 /* Data Parity Error Recovery Enable */ +#define PCIX_CMD_ERO 0x0002 /* Enable Relaxed Ordering */ +#define PCIX_CMD_MAX_READ 0x000c /* Max Memory Read Byte Count */ +#define PCIX_CMD_MAX_SPLIT 0x0070 /* Max Outstanding Split Transactions */ +#define PCIX_CMD_VERSION(x) (((x) >> 12) & 3) /* Version */ + u32 status; +#define PCIX_STATUS_DEVFN 0x000000ff /* A copy of devfn */ +#define PCIX_STATUS_BUS 0x0000ff00 /* A copy of bus nr */ +#define PCIX_STATUS_64BIT 0x00010000 /* 64-bit device */ +#define PCIX_STATUS_133MHZ 0x00020000 /* 133 MHz capable */ +#define PCIX_STATUS_SPL_DISC 0x00040000 /* Split Completion Discarded */ +#define PCIX_STATUS_UNX_SPL 0x00080000 /* Unexpected Split Completion */ +#define PCIX_STATUS_COMPLEX 0x00100000 /* Device Complexity */ +#define PCIX_STATUS_MAX_READ 0x00600000 /* Designed Max Memory Read Count */ +#define PCIX_STATUS_MAX_SPLIT 0x03800000 /* Designed Max Outstanding Split Transactions */ +#define PCIX_STATUS_MAX_CUM 0x1c000000 /* Designed Max Cumulative Read Size */ +#define PCIX_STATUS_SPL_ERR 0x20000000 /* Rcvd Split Completion Error Msg */ +#define PCIX_STATUS_266MHZ 0x40000000 /* 266 MHz capable */ +#define PCIX_STATUS_533MHZ 0x80000000 /* 533 MHz capable */ + }) pcix_config_regs_t; +/* *INDENT-ON* */ + +static inline int +pcie_size_to_code (int bytes) +{ + ASSERT (is_pow2 (bytes)); + ASSERT (bytes <= 4096); + return min_log2 (bytes) - 7; +} + +static inline int +pcie_code_to_size (int code) +{ + int size = 1 << (code + 7); + ASSERT (size <= 4096); + return size; +} + +/* PCI Express capability registers */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pci_capability_regs_t header; u16 pcie_capabilities; +#define PCIE_CAP_VERSION(x) (((x) >> 0) & 0xf) +#define PCIE_CAP_DEVICE_TYPE(x) (((x) >> 4) & 0xf) +#define PCIE_DEVICE_TYPE_ENDPOINT 0 +#define PCIE_DEVICE_TYPE_LEGACY_ENDPOINT 1 +#define PCIE_DEVICE_TYPE_ROOT_PORT 4 + /* Upstream/downstream port of PCI Express switch. */ +#define PCIE_DEVICE_TYPE_SWITCH_UPSTREAM 5 +#define PCIE_DEVICE_TYPE_SWITCH_DOWNSTREAM 6 +#define PCIE_DEVICE_TYPE_PCIE_TO_PCI_BRIDGE 7 +#define PCIE_DEVICE_TYPE_PCI_TO_PCIE_BRIDGE 8 + /* Root complex integrated endpoint. */ +#define PCIE_DEVICE_TYPE_ROOT_COMPLEX_ENDPOINT 9 +#define PCIE_DEVICE_TYPE_ROOT_COMPLEX_EVENT_COLLECTOR 10 +#define PCIE_CAP_SLOW_IMPLEMENTED (1 << 8) +#define PCIE_CAP_MSI_IRQ(x) (((x) >> 9) & 0x1f) + u32 dev_capabilities; +#define PCIE_DEVCAP_MAX_PAYLOAD(x) (128 << (((x) >> 0) & 0x7)) +#define PCIE_DEVCAP_PHANTOM_BITS(x) (((x) >> 3) & 0x3) +#define PCIE_DEVCAP_EXTENTED_TAG (1 << 5) +#define PCIE_DEVCAP_L0S 0x1c0 /* L0s Acceptable Latency */ +#define PCIE_DEVCAP_L1 0xe00 /* L1 Acceptable Latency */ +#define PCIE_DEVCAP_ATN_BUT 0x1000 /* Attention Button Present */ +#define PCIE_DEVCAP_ATN_IND 0x2000 /* Attention Indicator Present */ +#define PCIE_DEVCAP_PWR_IND 0x4000 /* Power Indicator Present */ +#define PCIE_DEVCAP_PWR_VAL 0x3fc0000 /* Slot Power Limit Value */ +#define PCIE_DEVCAP_PWR_SCL 0xc000000 /* Slot Power Limit Scale */ + u16 dev_control; +#define PCIE_CTRL_CERE 0x0001 /* Correctable Error Reporting En. */ +#define PCIE_CTRL_NFERE 0x0002 /* Non-Fatal Error Reporting Enable */ +#define PCIE_CTRL_FERE 0x0004 /* Fatal Error Reporting Enable */ +#define PCIE_CTRL_URRE 0x0008 /* Unsupported Request Reporting En. */ +#define PCIE_CTRL_RELAX_EN 0x0010 /* Enable relaxed ordering */ +#define PCIE_CTRL_MAX_PAYLOAD(n) (((n) & 7) << 5) +#define PCIE_CTRL_EXT_TAG 0x0100 /* Extended Tag Field Enable */ +#define PCIE_CTRL_PHANTOM 0x0200 /* Phantom Functions Enable */ +#define PCIE_CTRL_AUX_PME 0x0400 /* Auxiliary Power PM Enable */ +#define PCIE_CTRL_NOSNOOP_EN 0x0800 /* Enable No Snoop */ +#define PCIE_CTRL_MAX_READ_REQUEST(n) (((n) & 7) << 12) + u16 dev_status; +#define PCIE_DEVSTA_AUXPD 0x10 /* AUX Power Detected */ +#define PCIE_DEVSTA_TRPND 0x20 /* Transactions Pending */ + u32 link_capabilities; u16 link_control; u16 link_status; + u32 slot_capabilities; + u16 slot_control; u16 slot_status; u16 root_control; +#define PCIE_RTCTL_SECEE 0x01 /* System Error on Correctable Error */ +#define PCIE_RTCTL_SENFEE 0x02 /* System Error on Non-Fatal Error */ +#define PCIE_RTCTL_SEFEE 0x04 /* System Error on Fatal Error */ +#define PCIE_RTCTL_PMEIE 0x08 /* PME Interrupt Enable */ +#define PCIE_RTCTL_CRSSVE 0x10 /* CRS Software Visibility Enable */ + u16 root_capabilities; + u32 root_status; + u32 dev_capabilities2; + u16 dev_control2; + u16 dev_status2; + u32 link_capabilities2; + u16 link_control2; + u16 link_status2; + u32 slot_capabilities2; u16 slot_control2; + u16 slot_status2;}) pcie_config_regs_t; +/* *INDENT-ON* */ + +/* PCI express extended capabilities. */ +typedef enum pcie_capability_type +{ + PCIE_CAP_ADVANCED_ERROR = 1, + PCIE_CAP_VC = 2, + PCIE_CAP_DSN = 3, + PCIE_CAP_PWR = 4, +} pcie_capability_type_t; + +/* Common header for capabilities. */ +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { +enum pcie_capability_type type:16; u16 version: 4; u16 next_capability:12;}) + /* *INDENT-ON* */ +pcie_capability_regs_t; + +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct + { + pcie_capability_regs_t header; u32 uncorrectable_status; +#define PCIE_ERROR_UNC_LINK_TRAINING (1 << 0) +#define PCIE_ERROR_UNC_DATA_LINK_PROTOCOL (1 << 4) +#define PCIE_ERROR_UNC_SURPRISE_DOWN (1 << 5) +#define PCIE_ERROR_UNC_POISONED_TLP (1 << 12) +#define PCIE_ERROR_UNC_FLOW_CONTROL (1 << 13) +#define PCIE_ERROR_UNC_COMPLETION_TIMEOUT (1 << 14) +#define PCIE_ERROR_UNC_COMPLETER_ABORT (1 << 15) +#define PCIE_ERROR_UNC_UNEXPECTED_COMPLETION (1 << 16) +#define PCIE_ERROR_UNC_RX_OVERFLOW (1 << 17) +#define PCIE_ERROR_UNC_MALFORMED_TLP (1 << 18) +#define PCIE_ERROR_UNC_CRC_ERROR (1 << 19) +#define PCIE_ERROR_UNC_UNSUPPORTED_REQUEST (1 << 20) + u32 uncorrectable_mask; + u32 uncorrectable_severity; u32 correctable_status; +#define PCIE_ERROR_COR_RX_ERROR (1 << 0) +#define PCIE_ERROR_COR_BAD_TLP (1 << 6) +#define PCIE_ERROR_COR_BAD_DLLP (1 << 7) +#define PCIE_ERROR_COR_REPLAY_ROLLOVER (1 << 8) +#define PCIE_ERROR_COR_REPLAY_TIMER (1 << 12) +#define PCIE_ERROR_COR_ADVISORY (1 << 13) + u32 correctable_mask; + u32 control; + u32 log[4]; + u32 root_command; + u32 root_status; u16 correctable_error_source; + u16 error_source;}) pcie_advanced_error_regs_t; +/* *INDENT-ON* */ + +/* Virtual Channel */ +#define PCI_VC_PORT_REG1 4 +#define PCI_VC_PORT_REG2 8 +#define PCI_VC_PORT_CTRL 12 +#define PCI_VC_PORT_STATUS 14 +#define PCI_VC_RES_CAP 16 +#define PCI_VC_RES_CTRL 20 +#define PCI_VC_RES_STATUS 26 + +/* Power Budgeting */ +#define PCI_PWR_DSR 4 /* Data Select Register */ +#define PCI_PWR_DATA 8 /* Data Register */ +#define PCI_PWR_DATA_BASE(x) ((x) & 0xff) /* Base Power */ +#define PCI_PWR_DATA_SCALE(x) (((x) >> 8) & 3) /* Data Scale */ +#define PCI_PWR_DATA_PM_SUB(x) (((x) >> 10) & 7) /* PM Sub State */ +#define PCI_PWR_DATA_PM_STATE(x) (((x) >> 13) & 3) /* PM State */ +#define PCI_PWR_DATA_TYPE(x) (((x) >> 15) & 7) /* Type */ +#define PCI_PWR_DATA_RAIL(x) (((x) >> 18) & 7) /* Power Rail */ +#define PCI_PWR_CAP 12 /* Capability */ +#define PCI_PWR_CAP_BUDGET(x) ((x) & 1) /* Included in system budget */ + +#endif /* included_vlib_pci_config_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/physmem.h b/src/vlib/physmem.h new file mode 100644 index 00000000..9e7d52a6 --- /dev/null +++ b/src/vlib/physmem.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * physmem.h: virtual <-> physical memory mapping for VLIB buffers + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_physmem_h +#define included_vlib_physmem_h + +typedef struct +{ + uword start, end, size; +} vlib_physmem_region_t; + +typedef struct +{ + vlib_physmem_region_t virtual; + + uword log2_n_bytes_per_page; + + /* 1 << log2_n_bytes_per_page - 1. */ + uword page_mask; + + u64 *page_table; + + /* is fake physmem */ + u8 is_fake; +} vlib_physmem_main_t; + +always_inline u64 +vlib_physmem_offset_to_physical (vlib_physmem_main_t * pm, uword o) +{ + uword page_index = o >> pm->log2_n_bytes_per_page; + ASSERT (o < pm->virtual.size); + ASSERT (pm->page_table[page_index] != 0); + return (vec_elt (pm->page_table, page_index) + (o & pm->page_mask)); +} + +always_inline int +vlib_physmem_is_virtual (vlib_physmem_main_t * pm, uword p) +{ + return p >= pm->virtual.start && p < pm->virtual.end; +} + +always_inline uword +vlib_physmem_offset_of (vlib_physmem_main_t * pm, void *p) +{ + uword a = pointer_to_uword (p); + uword o; + + ASSERT (vlib_physmem_is_virtual (pm, a)); + o = a - pm->virtual.start; + + /* Offset must fit in 32 bits. */ + ASSERT ((uword) o == a - pm->virtual.start); + + return o; +} + +always_inline void * +vlib_physmem_at_offset (vlib_physmem_main_t * pm, uword offset) +{ + ASSERT (offset < pm->virtual.size); + return uword_to_pointer (pm->virtual.start + offset, void *); +} + +#endif /* included_vlib_physmem_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/threads.c b/src/vlib/threads.c new file mode 100644 index 00000000..c5e58bc0 --- /dev/null +++ b/src/vlib/threads.c @@ -0,0 +1,1492 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#define _GNU_SOURCE + +#include +#include +#include +#include + +#include +#include + + +#if DPDK==1 +#include +#include +#include +#include +#include +#endif +DECLARE_CJ_GLOBAL_LOG; + +#define FRAME_QUEUE_NELTS 32 + + +#if DPDK==1 +/* + * Weak definitions of DPDK symbols used in this file. + * Needed for linking test programs without DPDK libs. + */ +unsigned __thread __attribute__ ((weak)) RTE_PER_LCORE (_lcore_id); +struct lcore_config __attribute__ ((weak)) lcore_config[]; +unsigned __attribute__ ((weak)) rte_socket_id (); +int __attribute__ ((weak)) rte_eal_remote_launch (); +#endif +u32 +vl (void *p) +{ + return vec_len (p); +} + +vlib_worker_thread_t *vlib_worker_threads; +vlib_thread_main_t vlib_thread_main; + +uword +os_get_cpu_number (void) +{ + void *sp; + uword n; + u32 len; + + len = vec_len (vlib_thread_stacks); + if (len == 0) + return 0; + + /* Get any old stack address. */ + sp = &sp; + + n = ((uword) sp - (uword) vlib_thread_stacks[0]) + >> VLIB_LOG2_THREAD_STACK_SIZE; + + /* "processes" have their own stacks, and they always run in thread 0 */ + n = n >= len ? 0 : n; + + return n; +} + +uword +os_get_ncpus (void) +{ + u32 len; + + len = vec_len (vlib_thread_stacks); + if (len == 0) + return 1; + else + return len; +} + +void +vlib_set_thread_name (char *name) +{ + int pthread_setname_np (pthread_t __target_thread, const char *__name); + int rv; + pthread_t thread = pthread_self (); + + if (thread) + { + rv = pthread_setname_np (thread, name); + if (rv) + clib_warning ("pthread_setname_np returned %d", rv); + } +} + +static int +sort_registrations_by_no_clone (void *a0, void *a1) +{ + vlib_thread_registration_t **tr0 = a0; + vlib_thread_registration_t **tr1 = a1; + + return ((i32) ((*tr0)->no_data_structure_clone) + - ((i32) ((*tr1)->no_data_structure_clone))); +} + +static uword * +vlib_sysfs_list_to_bitmap (char *filename) +{ + FILE *fp; + uword *r = 0; + + fp = fopen (filename, "r"); + + if (fp != NULL) + { + u8 *buffer = 0; + vec_validate (buffer, 256 - 1); + if (fgets ((char *) buffer, 256, fp)) + { + unformat_input_t in; + unformat_init_string (&in, (char *) buffer, + strlen ((char *) buffer)); + if (unformat (&in, "%U", unformat_bitmap_list, &r) != 1) + clib_warning ("unformat_bitmap_list failed"); + unformat_free (&in); + } + vec_free (buffer); + fclose (fp); + } + return r; +} + + +/* Called early in the init sequence */ + +clib_error_t * +vlib_thread_init (vlib_main_t * vm) +{ + vlib_thread_main_t *tm = &vlib_thread_main; + vlib_worker_thread_t *w; + vlib_thread_registration_t *tr; + u32 n_vlib_mains = 1; + u32 first_index = 1; + u32 i; + uword *avail_cpu; + + /* get bitmaps of active cpu cores and sockets */ + tm->cpu_core_bitmap = + vlib_sysfs_list_to_bitmap ("/sys/devices/system/cpu/online"); + tm->cpu_socket_bitmap = + vlib_sysfs_list_to_bitmap ("/sys/devices/system/node/online"); + + avail_cpu = clib_bitmap_dup (tm->cpu_core_bitmap); + + /* skip cores */ + for (i = 0; i < tm->skip_cores; i++) + { + uword c = clib_bitmap_first_set (avail_cpu); + if (c == ~0) + return clib_error_return (0, "no available cpus to skip"); + + avail_cpu = clib_bitmap_set (avail_cpu, c, 0); + } + + /* grab cpu for main thread */ + if (!tm->main_lcore) + { + tm->main_lcore = clib_bitmap_first_set (avail_cpu); + if (tm->main_lcore == (u8) ~ 0) + return clib_error_return (0, "no available cpus to be used for the" + " main thread"); + } + else + { + if (clib_bitmap_get (avail_cpu, tm->main_lcore) == 0) + return clib_error_return (0, "cpu %u is not available to be used" + " for the main thread", tm->main_lcore); + } + avail_cpu = clib_bitmap_set (avail_cpu, tm->main_lcore, 0); + + /* assume that there is socket 0 only if there is no data from sysfs */ + if (!tm->cpu_socket_bitmap) + tm->cpu_socket_bitmap = clib_bitmap_set (0, 0, 1); + + /* pin main thread to main_lcore */ +#if DPDK==0 + { + cpu_set_t cpuset; + CPU_ZERO (&cpuset); + CPU_SET (tm->main_lcore, &cpuset); + pthread_setaffinity_np (pthread_self (), sizeof (cpu_set_t), &cpuset); + } +#endif + + /* as many threads as stacks... */ + vec_validate_aligned (vlib_worker_threads, vec_len (vlib_thread_stacks) - 1, + CLIB_CACHE_LINE_BYTES); + + /* Preallocate thread 0 */ + _vec_len (vlib_worker_threads) = 1; + w = vlib_worker_threads; + w->thread_mheap = clib_mem_get_heap (); + w->thread_stack = vlib_thread_stacks[0]; + w->lcore_id = tm->main_lcore; + w->lwp = syscall (SYS_gettid); + w->thread_id = pthread_self (); + tm->n_vlib_mains = 1; + + if (tm->sched_policy != ~0) + { + struct sched_param sched_param; + if (!sched_getparam (w->lwp, &sched_param)) + { + if (tm->sched_priority != ~0) + sched_param.sched_priority = tm->sched_priority; + sched_setscheduler (w->lwp, tm->sched_policy, &sched_param); + } + } + + /* assign threads to cores and set n_vlib_mains */ + tr = tm->next; + + while (tr) + { + vec_add1 (tm->registrations, tr); + tr = tr->next; + } + + vec_sort_with_function (tm->registrations, sort_registrations_by_no_clone); + + for (i = 0; i < vec_len (tm->registrations); i++) + { + int j; + tr = tm->registrations[i]; + tr->first_index = first_index; + first_index += tr->count; + n_vlib_mains += (tr->no_data_structure_clone == 0) ? tr->count : 0; + + /* construct coremask */ + if (tr->use_pthreads || !tr->count) + continue; + + if (tr->coremask) + { + uword c; + /* *INDENT-OFF* */ + clib_bitmap_foreach (c, tr->coremask, ({ + if (clib_bitmap_get(avail_cpu, c) == 0) + return clib_error_return (0, "cpu %u is not available to be used" + " for the '%s' thread",c, tr->name); + + avail_cpu = clib_bitmap_set(avail_cpu, c, 0); + })); +/* *INDENT-ON* */ + + } + else + { + for (j = 0; j < tr->count; j++) + { + uword c = clib_bitmap_first_set (avail_cpu); + if (c == ~0) + return clib_error_return (0, + "no available cpus to be used for" + " the '%s' thread", tr->name); + + avail_cpu = clib_bitmap_set (avail_cpu, c, 0); + tr->coremask = clib_bitmap_set (tr->coremask, c, 1); + } + } + } + + clib_bitmap_free (avail_cpu); + + tm->n_vlib_mains = n_vlib_mains; + + vec_validate_aligned (vlib_worker_threads, first_index - 1, + CLIB_CACHE_LINE_BYTES); + + return 0; +} + +vlib_worker_thread_t * +vlib_alloc_thread (vlib_main_t * vm) +{ + vlib_worker_thread_t *w; + + if (vec_len (vlib_worker_threads) >= vec_len (vlib_thread_stacks)) + { + clib_warning ("out of worker threads... Quitting..."); + exit (1); + } + vec_add2 (vlib_worker_threads, w, 1); + w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; + return w; +} + +vlib_frame_queue_t * +vlib_frame_queue_alloc (int nelts) +{ + vlib_frame_queue_t *fq; + + fq = clib_mem_alloc_aligned (sizeof (*fq), CLIB_CACHE_LINE_BYTES); + memset (fq, 0, sizeof (*fq)); + fq->nelts = nelts; + fq->vector_threshold = 128; // packets + vec_validate_aligned (fq->elts, nelts - 1, CLIB_CACHE_LINE_BYTES); + + if (1) + { + if (((uword) & fq->tail) & (CLIB_CACHE_LINE_BYTES - 1)) + fformat (stderr, "WARNING: fq->tail unaligned\n"); + if (((uword) & fq->head) & (CLIB_CACHE_LINE_BYTES - 1)) + fformat (stderr, "WARNING: fq->head unaligned\n"); + if (((uword) fq->elts) & (CLIB_CACHE_LINE_BYTES - 1)) + fformat (stderr, "WARNING: fq->elts unaligned\n"); + + if (sizeof (fq->elts[0]) % CLIB_CACHE_LINE_BYTES) + fformat (stderr, "WARNING: fq->elts[0] size %d\n", + sizeof (fq->elts[0])); + if (nelts & (nelts - 1)) + { + fformat (stderr, "FATAL: nelts MUST be a power of 2\n"); + abort (); + } + } + + return (fq); +} + +void vl_msg_api_handler_no_free (void *) __attribute__ ((weak)); +void +vl_msg_api_handler_no_free (void *v) +{ +} + +/* Turned off, save as reference material... */ +#if 0 +static inline int +vlib_frame_queue_dequeue_internal (int thread_id, + vlib_main_t * vm, vlib_node_main_t * nm) +{ + vlib_frame_queue_t *fq = vlib_frame_queues[thread_id]; + vlib_frame_queue_elt_t *elt; + vlib_frame_t *f; + vlib_pending_frame_t *p; + vlib_node_runtime_t *r; + u32 node_runtime_index; + int msg_type; + u64 before; + int processed = 0; + + ASSERT (vm == vlib_mains[thread_id]); + + while (1) + { + if (fq->head == fq->tail) + return processed; + + elt = fq->elts + ((fq->head + 1) & (fq->nelts - 1)); + + if (!elt->valid) + return processed; + + before = clib_cpu_time_now (); + + f = elt->frame; + node_runtime_index = elt->node_runtime_index; + msg_type = elt->msg_type; + + switch (msg_type) + { + case VLIB_FRAME_QUEUE_ELT_FREE_BUFFERS: + vlib_buffer_free (vm, vlib_frame_vector_args (f), f->n_vectors); + /* note fallthrough... */ + case VLIB_FRAME_QUEUE_ELT_FREE_FRAME: + r = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL], + node_runtime_index); + vlib_frame_free (vm, r, f); + break; + case VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME: + vec_add2 (vm->node_main.pending_frames, p, 1); + f->flags |= (VLIB_FRAME_PENDING | VLIB_FRAME_FREE_AFTER_DISPATCH); + p->node_runtime_index = elt->node_runtime_index; + p->frame_index = vlib_frame_index (vm, f); + p->next_frame_index = VLIB_PENDING_FRAME_NO_NEXT_FRAME; + fq->dequeue_vectors += (u64) f->n_vectors; + break; + case VLIB_FRAME_QUEUE_ELT_API_MSG: + vl_msg_api_handler_no_free (f); + break; + default: + clib_warning ("bogus frame queue message, type %d", msg_type); + break; + } + elt->valid = 0; + fq->dequeues++; + fq->dequeue_ticks += clib_cpu_time_now () - before; + CLIB_MEMORY_BARRIER (); + fq->head++; + processed++; + } + ASSERT (0); + return processed; +} + +int +vlib_frame_queue_dequeue (int thread_id, + vlib_main_t * vm, vlib_node_main_t * nm) +{ + return vlib_frame_queue_dequeue_internal (thread_id, vm, nm); +} + +int +vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index, + u32 frame_queue_index, vlib_frame_t * frame, + vlib_frame_queue_msg_type_t type) +{ + vlib_frame_queue_t *fq = vlib_frame_queues[frame_queue_index]; + vlib_frame_queue_elt_t *elt; + u32 save_count; + u64 new_tail; + u64 before = clib_cpu_time_now (); + + ASSERT (fq); + + new_tail = __sync_add_and_fetch (&fq->tail, 1); + + /* Wait until a ring slot is available */ + while (new_tail >= fq->head + fq->nelts) + { + f64 b4 = vlib_time_now_ticks (vm, before); + vlib_worker_thread_barrier_check (vm, b4); + /* Bad idea. Dequeue -> enqueue -> dequeue -> trouble */ + // vlib_frame_queue_dequeue (vm->cpu_index, vm, nm); + } + + elt = fq->elts + (new_tail & (fq->nelts - 1)); + + /* this would be very bad... */ + while (elt->valid) + { + } + + /* Once we enqueue the frame, frame->n_vectors is owned elsewhere... */ + save_count = frame->n_vectors; + + elt->frame = frame; + elt->node_runtime_index = node_runtime_index; + elt->msg_type = type; + CLIB_MEMORY_BARRIER (); + elt->valid = 1; + + return save_count; +} +#endif /* 0 */ + +/* To be called by vlib worker threads upon startup */ +void +vlib_worker_thread_init (vlib_worker_thread_t * w) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + + /* + * Note: disabling signals in worker threads as follows + * prevents the api post-mortem dump scheme from working + * { + * sigset_t s; + * sigfillset (&s); + * pthread_sigmask (SIG_SETMASK, &s, 0); + * } + */ + + clib_mem_set_heap (w->thread_mheap); + + if (vec_len (tm->thread_prefix) && w->registration->short_name) + { + w->name = format (0, "%v_%s_%d%c", tm->thread_prefix, + w->registration->short_name, w->instance_id, '\0'); + vlib_set_thread_name ((char *) w->name); + } + + if (!w->registration->use_pthreads) + { + + /* Initial barrier sync, for both worker and i/o threads */ + clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, 1); + + while (*vlib_worker_threads->wait_at_barrier) + ; + + clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, -1); + } +} + +void * +vlib_worker_thread_bootstrap_fn (void *arg) +{ + void *rv; + vlib_worker_thread_t *w = arg; + + w->lwp = syscall (SYS_gettid); + w->thread_id = pthread_self (); + + rv = (void *) clib_calljmp + ((uword (*)(uword)) w->thread_function, + (uword) arg, w->thread_stack + VLIB_THREAD_STACK_SIZE); + /* NOTREACHED, we hope */ + return rv; +} + +static int +vlib_launch_thread (void *fp, vlib_worker_thread_t * w, unsigned lcore_id) +{ + void *(*fp_arg) (void *) = fp; + + w->lcore_id = lcore_id; +#if DPDK==1 + if (!w->registration->use_pthreads) + if (rte_eal_remote_launch) /* do we have dpdk linked */ + return rte_eal_remote_launch (fp, (void *) w, lcore_id); + else + return -1; + else +#endif + { + int ret; + pthread_t worker; + cpu_set_t cpuset; + CPU_ZERO (&cpuset); + CPU_SET (lcore_id, &cpuset); + + ret = pthread_create (&worker, NULL /* attr */ , fp_arg, (void *) w); + if (ret == 0) + return pthread_setaffinity_np (worker, sizeof (cpu_set_t), &cpuset); + else + return ret; + } +} + +static clib_error_t * +start_workers (vlib_main_t * vm) +{ + int i, j; + vlib_worker_thread_t *w; + vlib_main_t *vm_clone; + void *oldheap; + vlib_thread_main_t *tm = &vlib_thread_main; + vlib_thread_registration_t *tr; + vlib_node_runtime_t *rt; + u32 n_vlib_mains = tm->n_vlib_mains; + u32 worker_thread_index; + u8 *main_heap = clib_mem_get_per_cpu_heap (); + mheap_t *main_heap_header = mheap_header (main_heap); + + vec_reset_length (vlib_worker_threads); + + /* Set up the main thread */ + vec_add2_aligned (vlib_worker_threads, w, 1, CLIB_CACHE_LINE_BYTES); + w->elog_track.name = "main thread"; + elog_track_register (&vm->elog_main, &w->elog_track); + + if (vec_len (tm->thread_prefix)) + { + w->name = format (0, "%v_main%c", tm->thread_prefix, '\0'); + vlib_set_thread_name ((char *) w->name); + } + + /* + * Truth of the matter: we always use at least two + * threads. So, make the main heap thread-safe + * and make the event log thread-safe. + */ + main_heap_header->flags |= MHEAP_FLAG_THREAD_SAFE; + vm->elog_main.lock = + clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES); + vm->elog_main.lock[0] = 0; + + if (n_vlib_mains > 1) + { + vec_validate (vlib_mains, tm->n_vlib_mains - 1); + _vec_len (vlib_mains) = 0; + vec_add1 (vlib_mains, vm); + + vlib_worker_threads->wait_at_barrier = + clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES); + vlib_worker_threads->workers_at_barrier = + clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES); + + /* Ask for an initial barrier sync */ + *vlib_worker_threads->workers_at_barrier = 0; + *vlib_worker_threads->wait_at_barrier = 1; + + worker_thread_index = 1; + + for (i = 0; i < vec_len (tm->registrations); i++) + { + vlib_node_main_t *nm, *nm_clone; + vlib_buffer_main_t *bm_clone; + vlib_buffer_free_list_t *fl_clone, *fl_orig; + vlib_buffer_free_list_t *orig_freelist_pool; + int k; + + tr = tm->registrations[i]; + + if (tr->count == 0) + continue; + + for (k = 0; k < tr->count; k++) + { + vec_add2 (vlib_worker_threads, w, 1); + if (tr->mheap_size) + w->thread_mheap = + mheap_alloc (0 /* use VM */ , tr->mheap_size); + else + w->thread_mheap = main_heap; + w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; + w->thread_function = tr->function; + w->thread_function_arg = w; + w->instance_id = k; + w->registration = tr; + + w->elog_track.name = + (char *) format (0, "%s %d", tr->name, k + 1); + vec_add1 (w->elog_track.name, 0); + elog_track_register (&vm->elog_main, &w->elog_track); + + if (tr->no_data_structure_clone) + continue; + + /* Fork vlib_global_main et al. Look for bugs here */ + oldheap = clib_mem_set_heap (w->thread_mheap); + + vm_clone = clib_mem_alloc (sizeof (*vm_clone)); + clib_memcpy (vm_clone, vlib_mains[0], sizeof (*vm_clone)); + + vm_clone->cpu_index = worker_thread_index; + vm_clone->heap_base = w->thread_mheap; + vm_clone->mbuf_alloc_list = 0; + memset (&vm_clone->random_buffer, 0, + sizeof (vm_clone->random_buffer)); + + nm = &vlib_mains[0]->node_main; + nm_clone = &vm_clone->node_main; + /* fork next frames array, preserving node runtime indices */ + nm_clone->next_frames = vec_dup (nm->next_frames); + for (j = 0; j < vec_len (nm_clone->next_frames); j++) + { + vlib_next_frame_t *nf = &nm_clone->next_frames[j]; + u32 save_node_runtime_index; + u32 save_flags; + + save_node_runtime_index = nf->node_runtime_index; + save_flags = nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH; + vlib_next_frame_init (nf); + nf->node_runtime_index = save_node_runtime_index; + nf->flags = save_flags; + } + + /* fork the frame dispatch queue */ + nm_clone->pending_frames = 0; + vec_validate (nm_clone->pending_frames, 10); /* $$$$$?????? */ + _vec_len (nm_clone->pending_frames) = 0; + + /* fork nodes */ + nm_clone->nodes = 0; + for (j = 0; j < vec_len (nm->nodes); j++) + { + vlib_node_t *n; + n = clib_mem_alloc_no_fail (sizeof (*n)); + clib_memcpy (n, nm->nodes[j], sizeof (*n)); + /* none of the copied nodes have enqueue rights given out */ + n->owner_node_index = VLIB_INVALID_NODE_INDEX; + memset (&n->stats_total, 0, sizeof (n->stats_total)); + memset (&n->stats_last_clear, 0, + sizeof (n->stats_last_clear)); + vec_add1 (nm_clone->nodes, n); + } + nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] = + vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]); + + nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] = + vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]); + vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) + rt->cpu_index = vm_clone->cpu_index; + + nm_clone->processes = vec_dup (nm->processes); + + /* zap the (per worker) frame freelists, etc */ + nm_clone->frame_sizes = 0; + nm_clone->frame_size_hash = 0; + + /* Packet trace buffers are guaranteed to be empty, nothing to do here */ + + clib_mem_set_heap (oldheap); + vec_add1 (vlib_mains, vm_clone); + + vm_clone->error_main.counters = + vec_dup (vlib_mains[0]->error_main.counters); + vm_clone->error_main.counters_last_clear = + vec_dup (vlib_mains[0]->error_main.counters_last_clear); + + /* Fork the vlib_buffer_main_t free lists, etc. */ + bm_clone = vec_dup (vm_clone->buffer_main); + vm_clone->buffer_main = bm_clone; + + orig_freelist_pool = bm_clone->buffer_free_list_pool; + bm_clone->buffer_free_list_pool = 0; + + /* *INDENT-OFF* */ + pool_foreach (fl_orig, orig_freelist_pool, + ({ + pool_get_aligned (bm_clone->buffer_free_list_pool, + fl_clone, CLIB_CACHE_LINE_BYTES); + ASSERT (fl_orig - orig_freelist_pool + == fl_clone - bm_clone->buffer_free_list_pool); + + fl_clone[0] = fl_orig[0]; + fl_clone->aligned_buffers = 0; + fl_clone->unaligned_buffers = 0; + fl_clone->n_alloc = 0; + })); +/* *INDENT-ON* */ + + worker_thread_index++; + } + } + } + else + { + /* only have non-data-structure copy threads to create... */ + for (i = 0; i < vec_len (tm->registrations); i++) + { + tr = tm->registrations[i]; + + for (j = 0; j < tr->count; j++) + { + vec_add2 (vlib_worker_threads, w, 1); + if (tr->mheap_size) + w->thread_mheap = + mheap_alloc (0 /* use VM */ , tr->mheap_size); + else + w->thread_mheap = main_heap; + w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; + w->thread_function = tr->function; + w->thread_function_arg = w; + w->instance_id = j; + w->elog_track.name = + (char *) format (0, "%s %d", tr->name, j + 1); + w->registration = tr; + vec_add1 (w->elog_track.name, 0); + elog_track_register (&vm->elog_main, &w->elog_track); + } + } + } + + worker_thread_index = 1; + + for (i = 0; i < vec_len (tm->registrations); i++) + { + int j; + + tr = tm->registrations[i]; + + if (tr->use_pthreads || tm->use_pthreads) + { + for (j = 0; j < tr->count; j++) + { + w = vlib_worker_threads + worker_thread_index++; + if (vlib_launch_thread (vlib_worker_thread_bootstrap_fn, w, 0) < + 0) + clib_warning ("Couldn't start '%s' pthread ", tr->name); + } + } + else + { + uword c; + /* *INDENT-OFF* */ + clib_bitmap_foreach (c, tr->coremask, ({ + w = vlib_worker_threads + worker_thread_index++; + if (vlib_launch_thread (vlib_worker_thread_bootstrap_fn, w, c) < 0) + clib_warning ("Couldn't start DPDK lcore %d", c); + + })); +/* *INDENT-ON* */ + } + } + vlib_worker_thread_barrier_sync (vm); + vlib_worker_thread_barrier_release (vm); + return 0; +} + +VLIB_MAIN_LOOP_ENTER_FUNCTION (start_workers); + +void +vlib_worker_thread_node_runtime_update (void) +{ + int i, j; + vlib_worker_thread_t *w; + vlib_main_t *vm; + vlib_node_main_t *nm, *nm_clone; + vlib_node_t **old_nodes_clone; + vlib_main_t *vm_clone; + vlib_node_runtime_t *rt, *old_rt; + void *oldheap; + never_inline void + vlib_node_runtime_sync_stats (vlib_main_t * vm, + vlib_node_runtime_t * r, + uword n_calls, + uword n_vectors, uword n_clocks); + + ASSERT (os_get_cpu_number () == 0); + + if (vec_len (vlib_mains) == 0) + return; + + vm = vlib_mains[0]; + nm = &vm->node_main; + + ASSERT (os_get_cpu_number () == 0); + ASSERT (*vlib_worker_threads->wait_at_barrier == 1); + + /* + * Scrape all runtime stats, so we don't lose node runtime(s) with + * pending counts, or throw away worker / io thread counts. + */ + for (j = 0; j < vec_len (nm->nodes); j++) + { + vlib_node_t *n; + n = nm->nodes[j]; + vlib_node_sync_stats (vm, n); + } + + for (i = 1; i < vec_len (vlib_mains); i++) + { + vlib_node_t *n; + + vm_clone = vlib_mains[i]; + nm_clone = &vm_clone->node_main; + + for (j = 0; j < vec_len (nm_clone->nodes); j++) + { + n = nm_clone->nodes[j]; + + rt = vlib_node_get_runtime (vm_clone, n->index); + vlib_node_runtime_sync_stats (vm_clone, rt, 0, 0, 0); + } + } + + for (i = 1; i < vec_len (vlib_mains); i++) + { + vlib_node_runtime_t *rt; + w = vlib_worker_threads + i; + oldheap = clib_mem_set_heap (w->thread_mheap); + + vm_clone = vlib_mains[i]; + + /* Re-clone error heap */ + u64 *old_counters = vm_clone->error_main.counters; + u64 *old_counters_all_clear = vm_clone->error_main.counters_last_clear; + clib_memcpy (&vm_clone->error_main, &vm->error_main, + sizeof (vm->error_main)); + j = vec_len (vm->error_main.counters) - 1; + vec_validate_aligned (old_counters, j, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (old_counters_all_clear, j, CLIB_CACHE_LINE_BYTES); + vm_clone->error_main.counters = old_counters; + vm_clone->error_main.counters_last_clear = old_counters_all_clear; + + nm_clone = &vm_clone->node_main; + vec_free (nm_clone->next_frames); + nm_clone->next_frames = vec_dup (nm->next_frames); + + for (j = 0; j < vec_len (nm_clone->next_frames); j++) + { + vlib_next_frame_t *nf = &nm_clone->next_frames[j]; + u32 save_node_runtime_index; + u32 save_flags; + + save_node_runtime_index = nf->node_runtime_index; + save_flags = nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH; + vlib_next_frame_init (nf); + nf->node_runtime_index = save_node_runtime_index; + nf->flags = save_flags; + } + + old_nodes_clone = nm_clone->nodes; + nm_clone->nodes = 0; + + /* re-fork nodes */ + for (j = 0; j < vec_len (nm->nodes); j++) + { + vlib_node_t *old_n_clone; + vlib_node_t *new_n, *new_n_clone; + + new_n = nm->nodes[j]; + old_n_clone = old_nodes_clone[j]; + + new_n_clone = clib_mem_alloc_no_fail (sizeof (*new_n_clone)); + clib_memcpy (new_n_clone, new_n, sizeof (*new_n)); + /* none of the copied nodes have enqueue rights given out */ + new_n_clone->owner_node_index = VLIB_INVALID_NODE_INDEX; + + if (j >= vec_len (old_nodes_clone)) + { + /* new node, set to zero */ + memset (&new_n_clone->stats_total, 0, + sizeof (new_n_clone->stats_total)); + memset (&new_n_clone->stats_last_clear, 0, + sizeof (new_n_clone->stats_last_clear)); + } + else + { + /* Copy stats if the old data is valid */ + clib_memcpy (&new_n_clone->stats_total, + &old_n_clone->stats_total, + sizeof (new_n_clone->stats_total)); + clib_memcpy (&new_n_clone->stats_last_clear, + &old_n_clone->stats_last_clear, + sizeof (new_n_clone->stats_last_clear)); + + /* keep previous node state */ + new_n_clone->state = old_n_clone->state; + } + vec_add1 (nm_clone->nodes, new_n_clone); + } + /* Free the old node clone */ + for (j = 0; j < vec_len (old_nodes_clone); j++) + clib_mem_free (old_nodes_clone[j]); + vec_free (old_nodes_clone); + + vec_free (nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]); + + nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] = + vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]); + + /* clone input node runtime */ + old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]; + + nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] = + vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]); + + vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) + { + rt->cpu_index = vm_clone->cpu_index; + } + + for (j = 0; j < vec_len (old_rt); j++) + { + rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index); + rt->state = old_rt[j].state; + } + + vec_free (old_rt); + + nm_clone->processes = vec_dup (nm->processes); + + clib_mem_set_heap (oldheap); + + // vnet_main_fork_fixup (i); + } +} + +u32 +unformat_sched_policy (unformat_input_t * input, va_list * args) +{ + u32 *r = va_arg (*args, u32 *); + + if (0); +#define _(v,f,s) else if (unformat (input, s)) *r = SCHED_POLICY_##f; + foreach_sched_policy +#undef _ + else + return 0; + return 1; +} + +static clib_error_t * +cpu_config (vlib_main_t * vm, unformat_input_t * input) +{ + vlib_thread_registration_t *tr; + uword *p; + vlib_thread_main_t *tm = &vlib_thread_main; + u8 *name; + u64 coremask; + uword *bitmap; + u32 count; + + tm->thread_registrations_by_name = hash_create_string (0, sizeof (uword)); + + tm->n_thread_stacks = 1; /* account for main thread */ + tm->sched_policy = ~0; + tm->sched_priority = ~0; + + tr = tm->next; + + while (tr) + { + hash_set_mem (tm->thread_registrations_by_name, tr->name, (uword) tr); + tr = tr->next; + } + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "use-pthreads")) + tm->use_pthreads = 1; + else if (unformat (input, "thread-prefix %v", &tm->thread_prefix)) + ; + else if (unformat (input, "main-core %u", &tm->main_lcore)) + ; + else if (unformat (input, "skip-cores %u", &tm->skip_cores)) + ; + else if (unformat (input, "coremask-%s %llx", &name, &coremask)) + { + p = hash_get_mem (tm->thread_registrations_by_name, name); + if (p == 0) + return clib_error_return (0, "no such thread type '%s'", name); + + tr = (vlib_thread_registration_t *) p[0]; + + if (tr->use_pthreads) + return clib_error_return (0, + "coremask cannot be set for '%s' threads", + name); + + tr->coremask = clib_bitmap_set_multiple + (tr->coremask, 0, coremask, BITS (coremask)); + tr->count = clib_bitmap_count_set_bits (tr->coremask); + } + else if (unformat (input, "corelist-%s %U", &name, unformat_bitmap_list, + &bitmap)) + { + p = hash_get_mem (tm->thread_registrations_by_name, name); + if (p == 0) + return clib_error_return (0, "no such thread type '%s'", name); + + tr = (vlib_thread_registration_t *) p[0]; + + if (tr->use_pthreads) + return clib_error_return (0, + "corelist cannot be set for '%s' threads", + name); + + tr->coremask = bitmap; + tr->count = clib_bitmap_count_set_bits (tr->coremask); + } + else + if (unformat + (input, "scheduler-policy %U", unformat_sched_policy, + &tm->sched_policy)) + ; + else if (unformat (input, "scheduler-priority %u", &tm->sched_priority)) + ; + else if (unformat (input, "%s %u", &name, &count)) + { + p = hash_get_mem (tm->thread_registrations_by_name, name); + if (p == 0) + return clib_error_return (0, "no such thread type 3 '%s'", name); + + tr = (vlib_thread_registration_t *) p[0]; + if (tr->fixed_count) + return clib_error_return + (0, "number of %s threads not configurable", tr->name); + tr->count = count; + } + else + break; + } + + if (tm->sched_priority != ~0) + { + if (tm->sched_policy == SCHED_FIFO || tm->sched_policy == SCHED_RR) + { + u32 prio_max = sched_get_priority_max (tm->sched_policy); + u32 prio_min = sched_get_priority_min (tm->sched_policy); + if (tm->sched_priority > prio_max) + tm->sched_priority = prio_max; + if (tm->sched_priority < prio_min) + tm->sched_priority = prio_min; + } + else + { + return clib_error_return + (0, + "scheduling priority (%d) is not allowed for `normal` scheduling policy", + tm->sched_priority); + } + } + tr = tm->next; + + if (!tm->thread_prefix) + tm->thread_prefix = format (0, "vpp"); + + while (tr) + { + tm->n_thread_stacks += tr->count; + tm->n_pthreads += tr->count * tr->use_pthreads; + tm->n_eal_threads += tr->count * (tr->use_pthreads == 0); + tr = tr->next; + } + + return 0; +} + +VLIB_EARLY_CONFIG_FUNCTION (cpu_config, "cpu"); + +#if !defined (__x86_64__) && !defined (__aarch64__) && !defined (__powerpc64__) && !defined(__arm__) +void +__sync_fetch_and_add_8 (void) +{ + fformat (stderr, "%s called\n", __FUNCTION__); + abort (); +} + +void +__sync_add_and_fetch_8 (void) +{ + fformat (stderr, "%s called\n", __FUNCTION__); + abort (); +} +#endif + +void vnet_main_fixup (vlib_fork_fixup_t which) __attribute__ ((weak)); +void +vnet_main_fixup (vlib_fork_fixup_t which) +{ +} + +void +vlib_worker_thread_fork_fixup (vlib_fork_fixup_t which) +{ + vlib_main_t *vm = vlib_get_main (); + + if (vlib_mains == 0) + return; + + ASSERT (os_get_cpu_number () == 0); + vlib_worker_thread_barrier_sync (vm); + + switch (which) + { + case VLIB_WORKER_THREAD_FORK_FIXUP_NEW_SW_IF_INDEX: + vnet_main_fixup (VLIB_WORKER_THREAD_FORK_FIXUP_NEW_SW_IF_INDEX); + break; + + default: + ASSERT (0); + } + vlib_worker_thread_barrier_release (vm); +} + +void +vlib_worker_thread_barrier_sync (vlib_main_t * vm) +{ + f64 deadline; + u32 count; + + if (!vlib_mains) + return; + + count = vec_len (vlib_mains) - 1; + + /* Tolerate recursive calls */ + if (++vlib_worker_threads[0].recursion_level > 1) + return; + + vlib_worker_threads[0].barrier_sync_count++; + + ASSERT (os_get_cpu_number () == 0); + + deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT; + + *vlib_worker_threads->wait_at_barrier = 1; + while (*vlib_worker_threads->workers_at_barrier != count) + { + if (vlib_time_now (vm) > deadline) + { + fformat (stderr, "%s: worker thread deadlock\n", __FUNCTION__); + os_panic (); + } + } +} + +void +vlib_worker_thread_barrier_release (vlib_main_t * vm) +{ + f64 deadline; + + if (!vlib_mains) + return; + + if (--vlib_worker_threads[0].recursion_level > 0) + return; + + deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT; + + *vlib_worker_threads->wait_at_barrier = 0; + + while (*vlib_worker_threads->workers_at_barrier > 0) + { + if (vlib_time_now (vm) > deadline) + { + fformat (stderr, "%s: worker thread deadlock\n", __FUNCTION__); + os_panic (); + } + } +} + +/* + * Check the frame queue to see if any frames are available. + * If so, pull the packets off the frames and put them to + * the handoff node. + */ +static inline int +vlib_frame_queue_dequeue_internal (vlib_main_t * vm, + vlib_frame_queue_main_t * fqm) +{ + u32 thread_id = vm->cpu_index; + vlib_frame_queue_t *fq = fqm->vlib_frame_queues[thread_id]; + vlib_frame_queue_elt_t *elt; + u32 *from, *to; + vlib_frame_t *f; + int msg_type; + int processed = 0; + u32 n_left_to_node; + u32 vectors = 0; + + ASSERT (fq); + ASSERT (vm == vlib_mains[thread_id]); + + if (PREDICT_FALSE (fqm->node_index == ~0)) + return 0; + /* + * Gather trace data for frame queues + */ + if (PREDICT_FALSE (fq->trace)) + { + frame_queue_trace_t *fqt; + frame_queue_nelt_counter_t *fqh; + u32 elix; + + fqt = &fqm->frame_queue_traces[thread_id]; + + fqt->nelts = fq->nelts; + fqt->head = fq->head; + fqt->head_hint = fq->head_hint; + fqt->tail = fq->tail; + fqt->threshold = fq->vector_threshold; + fqt->n_in_use = fqt->tail - fqt->head; + if (fqt->n_in_use >= fqt->nelts) + { + // if beyond max then use max + fqt->n_in_use = fqt->nelts - 1; + } + + /* Record the number of elements in use in the histogram */ + fqh = &fqm->frame_queue_histogram[thread_id]; + fqh->count[fqt->n_in_use]++; + + /* Record a snapshot of the elements in use */ + for (elix = 0; elix < fqt->nelts; elix++) + { + elt = fq->elts + ((fq->head + 1 + elix) & (fq->nelts - 1)); + if (1 || elt->valid) + { + fqt->n_vectors[elix] = elt->n_vectors; + } + } + fqt->written = 1; + } + + while (1) + { + if (fq->head == fq->tail) + { + fq->head_hint = fq->head; + return processed; + } + + elt = fq->elts + ((fq->head + 1) & (fq->nelts - 1)); + + if (!elt->valid) + { + fq->head_hint = fq->head; + return processed; + } + + from = elt->buffer_index; + msg_type = elt->msg_type; + + ASSERT (msg_type == VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME); + ASSERT (elt->n_vectors <= VLIB_FRAME_SIZE); + + f = vlib_get_frame_to_node (vm, fqm->node_index); + + to = vlib_frame_vector_args (f); + + n_left_to_node = elt->n_vectors; + + while (n_left_to_node >= 4) + { + to[0] = from[0]; + to[1] = from[1]; + to[2] = from[2]; + to[3] = from[3]; + to += 4; + from += 4; + n_left_to_node -= 4; + } + + while (n_left_to_node > 0) + { + to[0] = from[0]; + to++; + from++; + n_left_to_node--; + } + + vectors += elt->n_vectors; + f->n_vectors = elt->n_vectors; + vlib_put_frame_to_node (vm, fqm->node_index, f); + + elt->valid = 0; + elt->n_vectors = 0; + elt->msg_type = 0xfefefefe; + CLIB_MEMORY_BARRIER (); + fq->head++; + processed++; + + /* + * Limit the number of packets pushed into the graph + */ + if (vectors >= fq->vector_threshold) + { + fq->head_hint = fq->head; + return processed; + } + } + ASSERT (0); + return processed; +} + +static_always_inline void +vlib_worker_thread_internal (vlib_main_t * vm) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); + u64 cpu_time_now = clib_cpu_time_now (); + vlib_frame_queue_main_t *fqm; + + vec_alloc (nm->pending_interrupt_node_runtime_indices, 32); + + while (1) + { + vlib_worker_thread_barrier_check (); + + vec_foreach (fqm, tm->frame_queue_mains) + vlib_frame_queue_dequeue_internal (vm, fqm); + + vlib_node_runtime_t *n; + vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]) + { + cpu_time_now = dispatch_node (vm, n, VLIB_NODE_TYPE_INPUT, + VLIB_NODE_STATE_POLLING, /* frame */ 0, + cpu_time_now); + } + + /* Next handle interrupts. */ + { + uword l = _vec_len (nm->pending_interrupt_node_runtime_indices); + uword i; + if (l > 0) + { + _vec_len (nm->pending_interrupt_node_runtime_indices) = 0; + for (i = 0; i < l; i++) + { + n = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT], + nm-> + pending_interrupt_node_runtime_indices + [i]); + cpu_time_now = + dispatch_node (vm, n, VLIB_NODE_TYPE_INPUT, + VLIB_NODE_STATE_INTERRUPT, + /* frame */ 0, + cpu_time_now); + } + } + } + + if (_vec_len (nm->pending_frames)) + { + int i; + cpu_time_now = clib_cpu_time_now (); + for (i = 0; i < _vec_len (nm->pending_frames); i++) + { + vlib_pending_frame_t *p; + + p = nm->pending_frames + i; + + cpu_time_now = dispatch_pending_node (vm, p, cpu_time_now); + } + _vec_len (nm->pending_frames) = 0; + } + vlib_increment_main_loop_counter (vm); + + /* Record time stamp in case there are no enabled nodes and above + calls do not update time stamp. */ + cpu_time_now = clib_cpu_time_now (); + } +} + +void +vlib_worker_thread_fn (void *arg) +{ + vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg; + vlib_main_t *vm = vlib_get_main (); + + ASSERT (vm->cpu_index == os_get_cpu_number ()); + + vlib_worker_thread_init (w); + clib_time_init (&vm->clib_time); + clib_mem_set_heap (w->thread_mheap); + +#if DPDK > 0 + /* Wait until the dpdk init sequence is complete */ + vlib_thread_main_t *tm = vlib_get_thread_main (); + while (tm->worker_thread_release == 0) + vlib_worker_thread_barrier_check (); +#endif + + vlib_worker_thread_internal (vm); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_THREAD (worker_thread_reg, static) = { + .name = "workers", + .short_name = "wk", + .function = vlib_worker_thread_fn, +}; +/* *INDENT-ON* */ + +u32 +vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + vlib_frame_queue_main_t *fqm; + vlib_frame_queue_t *fq; + int i; + + if (frame_queue_nelts == 0) + frame_queue_nelts = FRAME_QUEUE_NELTS; + + vec_add2 (tm->frame_queue_mains, fqm, 1); + + fqm->node_index = node_index; + + vec_validate (fqm->vlib_frame_queues, tm->n_vlib_mains - 1); + _vec_len (fqm->vlib_frame_queues) = 0; + for (i = 0; i < tm->n_vlib_mains; i++) + { + fq = vlib_frame_queue_alloc (frame_queue_nelts); + vec_add1 (fqm->vlib_frame_queues, fq); + } + + return (fqm - tm->frame_queue_mains); +} + +clib_error_t * +threads_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (threads_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/threads.h b/src/vlib/threads.h new file mode 100644 index 00000000..34ab5be8 --- /dev/null +++ b/src/vlib/threads.h @@ -0,0 +1,470 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_vlib_threads_h +#define included_vlib_threads_h + +#include +#include + +extern vlib_main_t **vlib_mains; + +void vlib_set_thread_name (char *name); + +/* arg is actually a vlib__thread_t * */ +typedef void (vlib_thread_function_t) (void *arg); + +typedef struct vlib_thread_registration_ +{ + /* constructor generated list of thread registrations */ + struct vlib_thread_registration_ *next; + + /* config parameters */ + char *name; + char *short_name; + vlib_thread_function_t *function; + uword mheap_size; + int fixed_count; + u32 count; + int no_data_structure_clone; + u32 frame_queue_nelts; + + /* All threads of this type run on pthreads */ + int use_pthreads; + u32 first_index; + uword *coremask; +} vlib_thread_registration_t; + +/* + * Frames have their cpu / vlib_main_t index in the low-order N bits + * Make VLIB_MAX_CPUS a power-of-two, please... + */ + +#ifndef VLIB_MAX_CPUS +#define VLIB_MAX_CPUS 256 +#endif + +#if VLIB_MAX_CPUS > CLIB_MAX_MHEAPS +#error Please increase number of per-cpu mheaps +#endif + +#define VLIB_CPU_MASK (VLIB_MAX_CPUS - 1) /* 0x3f, max */ +#define VLIB_OFFSET_MASK (~VLIB_CPU_MASK) + +#define VLIB_LOG2_THREAD_STACK_SIZE (20) +#define VLIB_THREAD_STACK_SIZE (1< 0 +/* long barrier timeout, for gdb... */ +#define BARRIER_SYNC_TIMEOUT (600.1) +#else +#define BARRIER_SYNC_TIMEOUT (1.0) +#endif + +void vlib_worker_thread_barrier_sync (vlib_main_t * vm); +void vlib_worker_thread_barrier_release (vlib_main_t * vm); + +always_inline void +vlib_smp_unsafe_warning (void) +{ + if (CLIB_DEBUG > 0) + { + if (os_get_cpu_number ()) + fformat (stderr, "%s: SMP unsafe warning...\n", __FUNCTION__); + } +} + +typedef enum +{ + VLIB_WORKER_THREAD_FORK_FIXUP_ILLEGAL = 0, + VLIB_WORKER_THREAD_FORK_FIXUP_NEW_SW_IF_INDEX, +} vlib_fork_fixup_t; + +void vlib_worker_thread_fork_fixup (vlib_fork_fixup_t which); + +static inline void +vlib_worker_thread_barrier_check (void) +{ + if (PREDICT_FALSE (*vlib_worker_threads->wait_at_barrier)) + { + clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, 1); + while (*vlib_worker_threads->wait_at_barrier) + ; + clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, -1); + } +} + +#define foreach_vlib_main(body) \ +do { \ + vlib_main_t ** __vlib_mains = 0, *this_vlib_main; \ + int ii; \ + \ + if (vec_len (vlib_mains) == 0) \ + vec_add1 (__vlib_mains, &vlib_global_main); \ + else \ + { \ + for (ii = 0; ii < vec_len (vlib_mains); ii++) \ + { \ + this_vlib_main = vlib_mains[ii]; \ + if (this_vlib_main) \ + vec_add1 (__vlib_mains, this_vlib_main); \ + } \ + } \ + \ + for (ii = 0; ii < vec_len (__vlib_mains); ii++) \ + { \ + this_vlib_main = __vlib_mains[ii]; \ + /* body uses this_vlib_main... */ \ + (body); \ + } \ + vec_free (__vlib_mains); \ +} while (0); + +#define foreach_sched_policy \ + _(SCHED_OTHER, OTHER, "other") \ + _(SCHED_BATCH, BATCH, "batch") \ + _(SCHED_IDLE, IDLE, "idle") \ + _(SCHED_FIFO, FIFO, "fifo") \ + _(SCHED_RR, RR, "rr") + +typedef enum +{ +#define _(v,f,s) SCHED_POLICY_##f = v, + foreach_sched_policy +#undef _ + SCHED_POLICY_N, +} sched_policy_t; + +typedef struct +{ + /* Link list of registrations, built by constructors */ + vlib_thread_registration_t *next; + + /* Vector of registrations, w/ non-data-structure clones at the top */ + vlib_thread_registration_t **registrations; + + uword *thread_registrations_by_name; + + vlib_worker_thread_t *worker_threads; + + /* + * Launch all threads as pthreads, + * not eal_rte_launch (strict affinity) threads + */ + int use_pthreads; + + /* Number of vlib_main / vnet_main clones */ + u32 n_vlib_mains; + + /* Number of thread stacks to create */ + u32 n_thread_stacks; + + /* Number of pthreads */ + u32 n_pthreads; + + /* Number of DPDK eal threads */ + u32 n_eal_threads; + + /* Number of cores to skip, must match the core mask */ + u32 skip_cores; + + /* Thread prefix name */ + u8 *thread_prefix; + + /* main thread lcore */ + u8 main_lcore; + + /* Bitmap of available CPU cores */ + uword *cpu_core_bitmap; + + /* Bitmap of available CPU sockets (NUMA nodes) */ + uword *cpu_socket_bitmap; + + /* Worker handoff queues */ + vlib_frame_queue_main_t *frame_queue_mains; + + /* worker thread initialization barrier */ + volatile u32 worker_thread_release; + + /* scheduling policy */ + u32 sched_policy; + + /* scheduling policy priority */ + u32 sched_priority; + +} vlib_thread_main_t; + +extern vlib_thread_main_t vlib_thread_main; + +#define VLIB_REGISTER_THREAD(x,...) \ + __VA_ARGS__ vlib_thread_registration_t x; \ +static void __vlib_add_thread_registration_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_add_thread_registration_##x (void) \ +{ \ + vlib_thread_main_t * tm = &vlib_thread_main; \ + x.next = tm->next; \ + tm->next = &x; \ +} \ +__VA_ARGS__ vlib_thread_registration_t x + +always_inline u32 +vlib_num_workers () +{ + return vlib_thread_main.n_vlib_mains - 1; +} + +always_inline u32 +vlib_get_worker_cpu_index (u32 worker_index) +{ + return worker_index + 1; +} + +always_inline u32 +vlib_get_worker_index (u32 cpu_index) +{ + return cpu_index - 1; +} + +always_inline u32 +vlib_get_current_worker_index () +{ + return os_get_cpu_number () - 1; +} + +always_inline vlib_main_t * +vlib_get_worker_vlib_main (u32 worker_index) +{ + vlib_main_t *vm; + vlib_thread_main_t *tm = &vlib_thread_main; + ASSERT (worker_index < tm->n_vlib_mains - 1); + vm = vlib_mains[worker_index + 1]; + ASSERT (vm); + return vm; +} + +static inline void +vlib_put_frame_queue_elt (vlib_frame_queue_elt_t * hf) +{ + CLIB_MEMORY_BARRIER (); + hf->valid = 1; +} + +static inline vlib_frame_queue_elt_t * +vlib_get_frame_queue_elt (u32 frame_queue_index, u32 index) +{ + vlib_frame_queue_t *fq; + vlib_frame_queue_elt_t *elt; + vlib_thread_main_t *tm = &vlib_thread_main; + vlib_frame_queue_main_t *fqm = + vec_elt_at_index (tm->frame_queue_mains, frame_queue_index); + u64 new_tail; + + fq = fqm->vlib_frame_queues[index]; + ASSERT (fq); + + new_tail = __sync_add_and_fetch (&fq->tail, 1); + + /* Wait until a ring slot is available */ + while (new_tail >= fq->head_hint + fq->nelts) + vlib_worker_thread_barrier_check (); + + elt = fq->elts + (new_tail & (fq->nelts - 1)); + + /* this would be very bad... */ + while (elt->valid) + ; + + elt->msg_type = VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME; + elt->last_n_vectors = elt->n_vectors = 0; + + return elt; +} + +static inline vlib_frame_queue_t * +is_vlib_frame_queue_congested (u32 frame_queue_index, + u32 index, + u32 queue_hi_thresh, + vlib_frame_queue_t ** + handoff_queue_by_worker_index) +{ + vlib_frame_queue_t *fq; + vlib_thread_main_t *tm = &vlib_thread_main; + vlib_frame_queue_main_t *fqm = + vec_elt_at_index (tm->frame_queue_mains, frame_queue_index); + + fq = handoff_queue_by_worker_index[index]; + if (fq != (vlib_frame_queue_t *) (~0)) + return fq; + + fq = fqm->vlib_frame_queues[index]; + ASSERT (fq); + + if (PREDICT_FALSE (fq->tail >= (fq->head_hint + queue_hi_thresh))) + { + /* a valid entry in the array will indicate the queue has reached + * the specified threshold and is congested + */ + handoff_queue_by_worker_index[index] = fq; + fq->enqueue_full_events++; + return fq; + } + + return NULL; +} + +static inline vlib_frame_queue_elt_t * +vlib_get_worker_handoff_queue_elt (u32 frame_queue_index, + u32 vlib_worker_index, + vlib_frame_queue_elt_t ** + handoff_queue_elt_by_worker_index) +{ + vlib_frame_queue_elt_t *elt; + + if (handoff_queue_elt_by_worker_index[vlib_worker_index]) + return handoff_queue_elt_by_worker_index[vlib_worker_index]; + + elt = vlib_get_frame_queue_elt (frame_queue_index, vlib_worker_index); + + handoff_queue_elt_by_worker_index[vlib_worker_index] = elt; + + return elt; +} + +#endif /* included_vlib_threads_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/threads_cli.c b/src/vlib/threads_cli.c new file mode 100644 index 00000000..ee632279 --- /dev/null +++ b/src/vlib/threads_cli.c @@ -0,0 +1,579 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#define _GNU_SOURCE + +#include +#include + +#include +#include + +#if DPDK==1 +#include +#include +#include +#include +#include +#endif + +static u8 * +format_sched_policy_and_priority (u8 * s, va_list * args) +{ + long i = va_arg (*args, long); + struct sched_param sched_param; + u8 *t = 0; + + switch (sched_getscheduler (i)) + { +#define _(v,f,str) case SCHED_POLICY_##f: t = (u8 *) str; break; + foreach_sched_policy +#undef _ + } + if (sched_getparam (i, &sched_param) == 0) + return format (s, "%s (%d)", t, sched_param.sched_priority); + else + return format (s, "%s (n/a)", t); +} + +static clib_error_t * +show_threads_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_worker_thread_t *w; + int i; + + vlib_cli_output (vm, "%-7s%-20s%-12s%-8s%-25s%-7s%-7s%-7s%-10s", + "ID", "Name", "Type", "LWP", "Sched Policy (Priority)", + "lcore", "Core", "Socket", "State"); + +#if !defined(__powerpc64__) + for (i = 0; i < vec_len (vlib_worker_threads); i++) + { + w = vlib_worker_threads + i; + u8 *line = NULL; + + line = format (line, "%-7d%-20s%-12s%-8d", + i, + w->name ? w->name : (u8 *) "", + w->registration ? w->registration->name : "", w->lwp); + + line = format (line, "%-25U", format_sched_policy_and_priority, w->lwp); + + int lcore = -1; + cpu_set_t cpuset; + CPU_ZERO (&cpuset); + int ret = -1; + + ret = + pthread_getaffinity_np (w->thread_id, sizeof (cpu_set_t), &cpuset); + if (!ret) + { + int c; + for (c = 0; c < CPU_SETSIZE; c++) + if (CPU_ISSET (c, &cpuset)) + { + if (lcore > -1) + { + lcore = -2; + break; + } + lcore = c; + } + } + else + { + lcore = w->lcore_id; + } + + if (lcore > -1) + { + const char *sys_cpu_path = "/sys/devices/system/cpu/cpu"; + int socket_id = -1; + int core_id = -1; + u8 *p = 0; + + p = format (p, "%s%u/topology/core_id%c", sys_cpu_path, lcore, 0); + vlib_sysfs_read ((char *) p, "%d", &core_id); + + vec_reset_length (p); + p = + format (p, + "%s%u/topology/physical_package_id%c", + sys_cpu_path, lcore, 0); + vlib_sysfs_read ((char *) p, "%d", &socket_id); + vec_free (p); + + line = format (line, "%-7u%-7u%-7u%", lcore, core_id, socket_id); +#if DPDK==1 + ASSERT (lcore <= RTE_MAX_LCORE); + switch (lcore_config[lcore].state) + { + case WAIT: + line = format (line, "wait"); + break; + case RUNNING: + line = format (line, "running"); + break; + case FINISHED: + line = format (line, "finished"); + break; + default: + line = format (line, "unknown"); + } +#endif + } + else + { + line = + format (line, "%-7s%-7s%-7s%", (lcore == -2) ? "M" : "n/a", "n/a", + "n/a"); + } + + vlib_cli_output (vm, "%v", line); + vec_free (line); + } +#endif + + return 0; +} + + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_threads_command, static) = { + .path = "show threads", + .short_help = "Show threads", + .function = show_threads_fn, +}; +/* *INDENT-ON* */ + +/* + * Trigger threads to grab frame queue trace data + */ +static clib_error_t * +trace_frame_queue (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + clib_error_t *error = NULL; + frame_queue_trace_t *fqt; + frame_queue_nelt_counter_t *fqh; + vlib_thread_main_t *tm = vlib_get_thread_main (); + vlib_frame_queue_main_t *fqm; + u32 num_fq; + u32 fqix; + u32 enable = 2; + u32 index = ~(u32) 0; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "on")) + enable = 1; + else if (unformat (line_input, "off")) + enable = 0; + else if (unformat (line_input, "index %u"), &index) + ; + else + return clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + } + + unformat_free (line_input); + + if (enable > 1) + return clib_error_return (0, "expecting on or off"); + + if (vec_len (tm->frame_queue_mains) == 0) + return clib_error_return (0, "no worker handoffs exist"); + + if (index > vec_len (tm->frame_queue_mains) - 1) + return clib_error_return (0, + "expecting valid worker handoff queue index"); + + fqm = vec_elt_at_index (tm->frame_queue_mains, index); + + num_fq = vec_len (fqm->vlib_frame_queues); + if (num_fq == 0) + { + vlib_cli_output (vm, "No frame queues exist\n"); + return error; + } + + // Allocate storage for trace if necessary + vec_validate_aligned (fqm->frame_queue_traces, num_fq - 1, + CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (fqm->frame_queue_histogram, num_fq - 1, + CLIB_CACHE_LINE_BYTES); + + for (fqix = 0; fqix < num_fq; fqix++) + { + fqt = &fqm->frame_queue_traces[fqix]; + fqh = &fqm->frame_queue_histogram[fqix]; + + memset (fqt->n_vectors, 0xff, sizeof (fqt->n_vectors)); + fqt->written = 0; + memset (fqh, 0, sizeof (*fqh)); + fqm->vlib_frame_queues[fqix]->trace = enable; + } + return error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_trace_frame_queue,static) = { + .path = "trace frame-queue", + .short_help = "trace frame-queue (on|off)", + .function = trace_frame_queue, + .is_mp_safe = 1, +}; +/* *INDENT-ON* */ + + +/* + * Adding two counters and compute percent of total + * Round up, e.g. 0.000001 => 1% + */ +static u32 +compute_percent (u64 * two_counters, u64 total) +{ + if (total == 0) + { + return 0; + } + else + { + return (((two_counters[0] + two_counters[1]) * 100) + + (total - 1)) / total; + } +} + +/* + * Display frame queue trace data gathered by threads. + */ +static clib_error_t * +show_frame_queue_internal (vlib_main_t * vm, + vlib_frame_queue_main_t * fqm, u32 histogram) +{ + clib_error_t *error = NULL; + frame_queue_trace_t *fqt; + frame_queue_nelt_counter_t *fqh; + u32 num_fq; + u32 fqix; + + num_fq = vec_len (fqm->frame_queue_traces); + if (num_fq == 0) + { + vlib_cli_output (vm, "No trace data for frame queues\n"); + return error; + } + + if (histogram) + { + vlib_cli_output (vm, "0-1 2-3 4-5 6-7 8-9 10-11 12-13 14-15 " + "16-17 18-19 20-21 22-23 24-25 26-27 28-29 30-31\n"); + } + + for (fqix = 0; fqix < num_fq; fqix++) + { + fqt = &(fqm->frame_queue_traces[fqix]); + + vlib_cli_output (vm, "Thread %d %v\n", fqix, + vlib_worker_threads[fqix].name); + + if (fqt->written == 0) + { + vlib_cli_output (vm, " no trace data\n"); + continue; + } + + if (histogram) + { + fqh = &(fqm->frame_queue_histogram[fqix]); + u32 nelt; + u64 total = 0; + + for (nelt = 0; nelt < FRAME_QUEUE_MAX_NELTS; nelt++) + { + total += fqh->count[nelt]; + } + + /* + * Print in pairs to condense the output. + * Allow entries with 0 counts to be clearly identified, by rounding up. + * Any non-zero value will be displayed as at least one percent. This + * also means the sum of percentages can be > 100, but that is fine. The + * histogram is counted from the last time "trace frame on" was issued. + */ + vlib_cli_output (vm, + "%3d%% %3d%% %3d%% %3d%% %3d%% %3d%% %3d%% %3d%% " + "%3d%% %3d%% %3d%% %3d%% %3d%% %3d%% %3d%% %3d%%\n", + compute_percent (&fqh->count[0], total), + compute_percent (&fqh->count[2], total), + compute_percent (&fqh->count[4], total), + compute_percent (&fqh->count[6], total), + compute_percent (&fqh->count[8], total), + compute_percent (&fqh->count[10], total), + compute_percent (&fqh->count[12], total), + compute_percent (&fqh->count[14], total), + compute_percent (&fqh->count[16], total), + compute_percent (&fqh->count[18], total), + compute_percent (&fqh->count[20], total), + compute_percent (&fqh->count[22], total), + compute_percent (&fqh->count[24], total), + compute_percent (&fqh->count[26], total), + compute_percent (&fqh->count[28], total), + compute_percent (&fqh->count[30], total)); + } + else + { + vlib_cli_output (vm, + " vector-threshold %d ring size %d in use %d\n", + fqt->threshold, fqt->nelts, fqt->n_in_use); + vlib_cli_output (vm, " head %12d head_hint %12d tail %12d\n", + fqt->head, fqt->head_hint, fqt->tail); + vlib_cli_output (vm, + " %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d\n", + fqt->n_vectors[0], fqt->n_vectors[1], + fqt->n_vectors[2], fqt->n_vectors[3], + fqt->n_vectors[4], fqt->n_vectors[5], + fqt->n_vectors[6], fqt->n_vectors[7], + fqt->n_vectors[8], fqt->n_vectors[9], + fqt->n_vectors[10], fqt->n_vectors[11], + fqt->n_vectors[12], fqt->n_vectors[13], + fqt->n_vectors[14], fqt->n_vectors[15]); + + if (fqt->nelts > 16) + { + vlib_cli_output (vm, + " %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d %3d\n", + fqt->n_vectors[16], fqt->n_vectors[17], + fqt->n_vectors[18], fqt->n_vectors[19], + fqt->n_vectors[20], fqt->n_vectors[21], + fqt->n_vectors[22], fqt->n_vectors[23], + fqt->n_vectors[24], fqt->n_vectors[25], + fqt->n_vectors[26], fqt->n_vectors[27], + fqt->n_vectors[28], fqt->n_vectors[29], + fqt->n_vectors[30], fqt->n_vectors[31]); + } + } + + } + return error; +} + +static clib_error_t * +show_frame_queue_trace (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + vlib_frame_queue_main_t *fqm; + clib_error_t *error; + + vec_foreach (fqm, tm->frame_queue_mains) + { + vlib_cli_output (vm, "Worker handoff queue index %u (next node '%U'):", + fqm - tm->frame_queue_mains, + format_vlib_node_name, vm, fqm->node_index); + error = show_frame_queue_internal (vm, fqm, 0); + if (error) + return error; + } + return 0; +} + +static clib_error_t * +show_frame_queue_histogram (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + vlib_frame_queue_main_t *fqm; + clib_error_t *error; + + vec_foreach (fqm, tm->frame_queue_mains) + { + vlib_cli_output (vm, "Worker handoff queue index %u (next node '%U'):", + fqm - tm->frame_queue_mains, + format_vlib_node_name, vm, fqm->node_index); + error = show_frame_queue_internal (vm, fqm, 1); + if (error) + return error; + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_show_frame_queue_trace,static) = { + .path = "show frame-queue", + .short_help = "show frame-queue trace", + .function = show_frame_queue_trace, +}; +/* *INDENT-ON* */ + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_show_frame_queue_histogram,static) = { + .path = "show frame-queue histogram", + .short_help = "show frame-queue histogram", + .function = show_frame_queue_histogram, +}; +/* *INDENT-ON* */ + + +/* + * Modify the number of elements on the frame_queues + */ +static clib_error_t * +test_frame_queue_nelts (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + vlib_thread_main_t *tm = vlib_get_thread_main (); + vlib_frame_queue_main_t *fqm; + clib_error_t *error = NULL; + u32 num_fq; + u32 fqix; + u32 nelts = 0; + u32 index = ~(u32) 0; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "nelts %u", &nelts)) + ; + else if (unformat (line_input, "index %u", &index)) + ; + else + return clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + } + + unformat_free (line_input); + + if (index > vec_len (tm->frame_queue_mains) - 1) + return clib_error_return (0, + "expecting valid worker handoff queue index"); + + fqm = vec_elt_at_index (tm->frame_queue_mains, index); + + if ((nelts != 4) && (nelts != 8) && (nelts != 16) && (nelts != 32)) + { + return clib_error_return (0, "expecting 4,8,16,32"); + } + + num_fq = vec_len (fqm->vlib_frame_queues); + if (num_fq == 0) + { + vlib_cli_output (vm, "No frame queues exist\n"); + return error; + } + + for (fqix = 0; fqix < num_fq; fqix++) + { + fqm->vlib_frame_queues[fqix]->nelts = nelts; + } + + return error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_test_frame_queue_nelts,static) = { + .path = "test frame-queue nelts", + .short_help = "test frame-queue nelts (4,8,16,32)", + .function = test_frame_queue_nelts, +}; +/* *INDENT-ON* */ + + +/* + * Modify the max number of packets pulled off the frame queues + */ +static clib_error_t * +test_frame_queue_threshold (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + vlib_thread_main_t *tm = vlib_get_thread_main (); + vlib_frame_queue_main_t *fqm; + clib_error_t *error = NULL; + u32 num_fq; + u32 fqix; + u32 threshold = ~(u32) 0; + u32 index = ~(u32) 0; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "threshold %u", &threshold)) + ; + else if (unformat (line_input, "index %u", &index)) + ; + else + return clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + } + + unformat_free (line_input); + + if (index > vec_len (tm->frame_queue_mains) - 1) + return clib_error_return (0, + "expecting valid worker handoff queue index"); + + fqm = vec_elt_at_index (tm->frame_queue_mains, index); + + + if (threshold == ~(u32) 0) + { + vlib_cli_output (vm, "expecting threshold value\n"); + return error; + } + + if (threshold == 0) + threshold = ~0; + + num_fq = vec_len (fqm->vlib_frame_queues); + if (num_fq == 0) + { + vlib_cli_output (vm, "No frame queues exist\n"); + return error; + } + + for (fqix = 0; fqix < num_fq; fqix++) + { + fqm->vlib_frame_queues[fqix]->vector_threshold = threshold; + } + + return error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_test_frame_queue_threshold,static) = { + .path = "test frame-queue threshold", + .short_help = "test frame-queue threshold N (0=no limit)", + .function = test_frame_queue_threshold, +}; +/* *INDENT-ON* */ + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/trace.c b/src/vlib/trace.c new file mode 100644 index 00000000..dcdb837f --- /dev/null +++ b/src/vlib/trace.c @@ -0,0 +1,545 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * trace.c: VLIB trace buffer. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include + +/* Helper function for nodes which only trace buffer data. */ +void +vlib_trace_frame_buffers_only (vlib_main_t * vm, + vlib_node_runtime_t * node, + u32 * buffers, + uword n_buffers, + uword next_buffer_stride, + uword n_buffer_data_bytes_in_trace) +{ + u32 n_left, *from; + + n_left = n_buffers; + from = buffers; + + while (n_left >= 4) + { + u32 bi0, bi1; + vlib_buffer_t *b0, *b1; + u8 *t0, *t1; + + /* Prefetch next iteration. */ + vlib_prefetch_buffer_with_index (vm, from[2], LOAD); + vlib_prefetch_buffer_with_index (vm, from[3], LOAD); + + bi0 = from[0]; + bi1 = from[1]; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + t0 = vlib_add_trace (vm, node, b0, n_buffer_data_bytes_in_trace); + clib_memcpy (t0, b0->data + b0->current_data, + n_buffer_data_bytes_in_trace); + } + if (b1->flags & VLIB_BUFFER_IS_TRACED) + { + t1 = vlib_add_trace (vm, node, b1, n_buffer_data_bytes_in_trace); + clib_memcpy (t1, b1->data + b1->current_data, + n_buffer_data_bytes_in_trace); + } + from += 2; + n_left -= 2; + } + + while (n_left >= 1) + { + u32 bi0; + vlib_buffer_t *b0; + u8 *t0; + + bi0 = from[0]; + + b0 = vlib_get_buffer (vm, bi0); + + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + t0 = vlib_add_trace (vm, node, b0, n_buffer_data_bytes_in_trace); + clib_memcpy (t0, b0->data + b0->current_data, + n_buffer_data_bytes_in_trace); + } + from += 1; + n_left -= 1; + } +} + +/* Free up all trace buffer memory. */ +always_inline void +clear_trace_buffer (void) +{ + int i; + vlib_trace_main_t *tm; + + /* *INDENT-OFF* */ + foreach_vlib_main ( + ({ + void *mainheap; + + tm = &this_vlib_main->trace_main; + mainheap = clib_mem_set_heap (this_vlib_main->heap_base); + + tm->trace_active_hint = 0; + + for (i = 0; i < vec_len (tm->trace_buffer_pool); i++) + if (! pool_is_free_index (tm->trace_buffer_pool, i)) + vec_free (tm->trace_buffer_pool[i]); + pool_free (tm->trace_buffer_pool); + clib_mem_set_heap (mainheap); + })); + /* *INDENT-ON* */ +} + +static u8 * +format_vlib_trace (u8 * s, va_list * va) +{ + vlib_main_t *vm = va_arg (*va, vlib_main_t *); + vlib_trace_header_t *h = va_arg (*va, vlib_trace_header_t *); + vlib_trace_header_t *e = vec_end (h); + vlib_node_t *node, *prev_node; + clib_time_t *ct = &vm->clib_time; + f64 t; + + prev_node = 0; + while (h < e) + { + node = vlib_get_node (vm, h->node_index); + + if (node != prev_node) + { + t = + (h->time - vm->cpu_time_main_loop_start) * ct->seconds_per_clock; + s = + format (s, "\n%U: %v", format_time_interval, "h:m:s:u", t, + node->name); + } + prev_node = node; + + if (node->format_trace) + s = format (s, "\n %U", node->format_trace, vm, node, h->data); + else + s = format (s, "\n %U", node->format_buffer, h->data); + + h = vlib_trace_header_next (h); + } + + return s; +} + +/* Root of all trace cli commands. */ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (trace_cli_command,static) = { + .path = "trace", + .short_help = "Packet tracer commands", +}; +/* *INDENT-ON* */ + +static int +trace_cmp (void *a1, void *a2) +{ + vlib_trace_header_t **t1 = a1; + vlib_trace_header_t **t2 = a2; + i64 dt = t1[0]->time - t2[0]->time; + return dt < 0 ? -1 : (dt > 0 ? +1 : 0); +} + +/* + * Return 1 if this packet passes the trace filter, or 0 otherwise + */ +u32 +filter_accept (vlib_trace_main_t * tm, vlib_trace_header_t * h) +{ + vlib_trace_header_t *e = vec_end (h); + + if (tm->filter_flag == 0) + return 1; + + if (tm->filter_flag == FILTER_FLAG_INCLUDE) + { + while (h < e) + { + if (h->node_index == tm->filter_node_index) + return 1; + h = vlib_trace_header_next (h); + } + return 0; + } + else /* FILTER_FLAG_EXCLUDE */ + { + while (h < e) + { + if (h->node_index == tm->filter_node_index) + return 0; + h = vlib_trace_header_next (h); + } + return 1; + } + + return 0; +} + +/* + * Remove traces from the trace buffer pool that don't pass the filter + */ +void +trace_apply_filter (vlib_main_t * vm) +{ + vlib_trace_main_t *tm = &vm->trace_main; + vlib_trace_header_t **h; + vlib_trace_header_t ***traces_to_remove = 0; + u32 index; + u32 trace_index; + u32 n_accepted; + + u32 accept; + + if (tm->filter_flag == FILTER_FLAG_NONE) + return; + + /* + * Ideally we would retain the first N traces that pass the filter instead + * of any N traces. + */ + n_accepted = 0; + /* *INDENT-OFF* */ + pool_foreach (h, tm->trace_buffer_pool, + ({ + accept = filter_accept(tm, h[0]); + + if ((n_accepted == tm->filter_count) || !accept) + vec_add1 (traces_to_remove, h); + else + n_accepted++; + })); + /* *INDENT-ON* */ + + /* remove all traces that we don't want to keep */ + for (index = 0; index < vec_len (traces_to_remove); index++) + { + trace_index = traces_to_remove[index] - tm->trace_buffer_pool; + _vec_len (tm->trace_buffer_pool[trace_index]) = 0; + pool_put_index (tm->trace_buffer_pool, trace_index); + } + + vec_free (traces_to_remove); +} + +static clib_error_t * +cli_show_trace_buffer (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_trace_main_t *tm; + vlib_trace_header_t **h, **traces; + u32 i, index = 0; + char *fmt; + u8 *s = 0; + u32 max; + + /* + * By default display only this many traces. To display more, explicitly + * specify a max. This prevents unexpectedly huge outputs. + */ + max = 50; + while (unformat_check_input (input) != (uword) UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "max %d", &max)) + ; + else + return clib_error_create ("expected 'max COUNT', got `%U'", + format_unformat_error, input); + } + + + /* Get active traces from pool. */ + + /* *INDENT-OFF* */ + foreach_vlib_main ( + ({ + void *mainheap; + + fmt = "------------------- Start of thread %d %s -------------------\n"; + s = format (s, fmt, index, vlib_worker_threads[index].name); + + tm = &this_vlib_main->trace_main; + + mainheap = clib_mem_set_heap (this_vlib_main->heap_base); + + trace_apply_filter(this_vlib_main); + + traces = 0; + pool_foreach (h, tm->trace_buffer_pool, + ({ + vec_add1 (traces, h[0]); + })); + + if (vec_len (traces) == 0) + { + clib_mem_set_heap (mainheap); + s = format (s, "No packets in trace buffer\n"); + goto done; + } + + /* Sort them by increasing time. */ + vec_sort_with_function (traces, trace_cmp); + + for (i = 0; i < vec_len (traces); i++) + { + if (i == max) + { + vlib_cli_output (vm, "Limiting display to %d packets." + " To display more specify max.", max); + goto done; + } + + clib_mem_set_heap (mainheap); + + s = format (s, "Packet %d\n%U\n\n", i + 1, + format_vlib_trace, vm, traces[i]); + + mainheap = clib_mem_set_heap (this_vlib_main->heap_base); + } + + done: + vec_free (traces); + clib_mem_set_heap (mainheap); + + index++; + })); + /* *INDENT-ON* */ + + vlib_cli_output (vm, "%v", s); + vec_free (s); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_trace_cli,static) = { + .path = "show trace", + .short_help = "Show trace buffer [max COUNT]", + .function = cli_show_trace_buffer, +}; +/* *INDENT-ON* */ + +static clib_error_t * +cli_add_trace_buffer (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + vlib_trace_main_t *tm; + vlib_trace_node_t *tn; + u32 node_index, add; + u8 verbose = 0; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != (uword) UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "%U %d", + unformat_vlib_node, vm, &node_index, &add)) + ; + else if (unformat (line_input, "verbose")) + verbose = 1; + else + return clib_error_create ("expected NODE COUNT, got `%U'", + format_unformat_error, line_input); + } + + /* *INDENT-OFF* */ + foreach_vlib_main (( + { + void *oldheap; + tm = &this_vlib_main->trace_main; + tm->trace_active_hint = 1; + tm->verbose = verbose; + oldheap = + clib_mem_set_heap (this_vlib_main->heap_base); + vec_validate (tm->nodes, node_index); + tn = tm->nodes + node_index; + tn->limit += add; clib_mem_set_heap (oldheap); + })); + /* *INDENT-ON* */ + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (add_trace_cli,static) = { + .path = "trace add", + .short_help = "Trace given number of packets", + .function = cli_add_trace_buffer, +}; +/* *INDENT-ON* */ + + +/* + * Configure a filter for packet traces. + * + * This supplements the packet trace feature so that only packets matching + * the filter are included in the trace. Currently the only filter is to + * keep packets that include a certain node in the trace or exclude a certain + * node in the trace. + * + * The count of traced packets in the "trace add" command is still used to + * create a certain number of traces. The "trace filter" command specifies + * how many of those packets should be retained in the trace. + * + * For example, 1Mpps of traffic is arriving and one of those packets is being + * dropped. To capture the trace for only that dropped packet, you can do: + * trace filter include error-drop 1 + * trace add dpdk-input 1000000 + * + * show trace + * + * Note that the filter could be implemented by capturing all traces and just + * reducing traces displayed by the "show trace" function. But that would + * require a lot of memory for storing the traces, making that infeasible. + * + * To remove traces from the trace pool that do not include a certain node + * requires that the trace be "complete" before applying the filter. To + * accomplish this, the trace pool is filtered upon each iteraction of the + * main vlib loop. Doing so keeps the number of allocated traces down to a + * reasonably low number. This requires that tracing for a buffer is not + * performed after the vlib main loop interation completes. i.e. you can't + * save away a buffer temporarily then inject it back into the graph and + * expect that the trace_index is still valid (such as a traffic manager might + * do). A new trace buffer should be allocated for those types of packets. + * + * The filter can be extended to support multiple nodes and other match + * criteria (e.g. input sw_if_index, mac address) but for now just checks if + * a specified node is in the trace or not in the trace. + */ +static clib_error_t * +cli_filter_trace (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_trace_main_t *tm = &vm->trace_main; + u32 filter_node_index; + u32 filter_flag; + u32 filter_count; + void *mainheap; + + if (unformat (input, "include %U %d", + unformat_vlib_node, vm, &filter_node_index, &filter_count)) + { + filter_flag = FILTER_FLAG_INCLUDE; + } + else if (unformat (input, "exclude %U %d", + unformat_vlib_node, vm, &filter_node_index, + &filter_count)) + { + filter_flag = FILTER_FLAG_EXCLUDE; + } + else if (unformat (input, "none")) + { + filter_flag = FILTER_FLAG_NONE; + filter_node_index = 0; + filter_count = 0; + } + else + return + clib_error_create + ("expected 'include NODE COUNT' or 'exclude NODE COUNT' or 'none', got `%U'", + format_unformat_error, input); + + /* *INDENT-OFF* */ + foreach_vlib_main ( + ({ + tm = &this_vlib_main->trace_main; + tm->filter_node_index = filter_node_index; + tm->filter_flag = filter_flag; + tm->filter_count = filter_count; + + /* + * Clear the trace limits to stop any in-progress tracing + * Prevents runaway trace allocations when the filter changes (or is removed) + */ + mainheap = clib_mem_set_heap (this_vlib_main->heap_base); + vec_free (tm->nodes); + clib_mem_set_heap (mainheap); + })); + /* *INDENT-ON* */ + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (filter_trace_cli,static) = { + .path = "trace filter", + .short_help = "filter trace output - include NODE COUNT | exclude NODE COUNT | none", + .function = cli_filter_trace, +}; +/* *INDENT-ON* */ + +static clib_error_t * +cli_clear_trace_buffer (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + clear_trace_buffer (); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (clear_trace_cli,static) = { + .path = "clear trace", + .short_help = "Clear trace buffer and free memory", + .function = cli_clear_trace_buffer, +}; +/* *INDENT-ON* */ + +/* Dummy function to get us linked in. */ +void +vlib_trace_cli_reference (void) +{ +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/trace.h b/src/vlib/trace.h new file mode 100644 index 00000000..fc0fc5c8 --- /dev/null +++ b/src/vlib/trace.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * trace.h: VLIB trace buffer. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_trace_h +#define included_vlib_trace_h + +#include + +typedef struct +{ + /* CPU time stamp trace was made. */ + u64 time; + + /* Node which generated this trace. */ + u32 node_index; + + /* Number of data words in this trace. */ + u32 n_data; + + /* Trace data follows. */ + u8 data[0]; +} vlib_trace_header_t; + +typedef struct +{ + /* Current number of traces in buffer. */ + u32 count; + + /* Max. number of traces to be added to buffer. */ + u32 limit; +} vlib_trace_node_t; + +typedef struct +{ + /* Pool of trace buffers. */ + vlib_trace_header_t **trace_buffer_pool; + + u32 last_main_loop_count; + u32 filter_node_index; + u32 filter_flag; +#define FILTER_FLAG_NONE 0 +#define FILTER_FLAG_INCLUDE 1 +#define FILTER_FLAG_EXCLUDE 2 + u32 filter_count; + + /* set on trace add, cleared on clear trace */ + u32 trace_active_hint; + + /* Per node trace counts. */ + vlib_trace_node_t *nodes; + + /* verbosity */ + int verbose; +} vlib_trace_main_t; + +#endif /* included_vlib_trace_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/trace_funcs.h b/src/vlib/trace_funcs.h new file mode 100644 index 00000000..5280eae9 --- /dev/null +++ b/src/vlib/trace_funcs.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * trace_funcs.h: VLIB trace buffer. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_trace_funcs_h +#define included_vlib_trace_funcs_h + +always_inline void +vlib_validate_trace (vlib_trace_main_t * tm, vlib_buffer_t * b) +{ + /* + * this assert seems right, but goes off constantly. + * disabling it appears to make the pain go away + */ + ASSERT (1 || b->flags & VLIB_BUFFER_IS_TRACED); + ASSERT (!pool_is_free_index (tm->trace_buffer_pool, b->trace_index)); +} + +always_inline void * +vlib_add_trace (vlib_main_t * vm, + vlib_node_runtime_t * r, vlib_buffer_t * b, u32 n_data_bytes) +{ + vlib_trace_main_t *tm = &vm->trace_main; + vlib_trace_header_t *h; + u32 n_data_words; + + vlib_validate_trace (tm, b); + + n_data_bytes = round_pow2 (n_data_bytes, sizeof (h[0])); + n_data_words = n_data_bytes / sizeof (h[0]); + vec_add2_aligned (tm->trace_buffer_pool[b->trace_index], h, + 1 + n_data_words, sizeof (h[0])); + + h->time = vm->cpu_time_last_node_dispatch; + h->n_data = n_data_words; + h->node_index = r->node_index; + + return h->data; +} + +always_inline vlib_trace_header_t * +vlib_trace_header_next (vlib_trace_header_t * h) +{ + return h + 1 + h->n_data; +} + +always_inline void +vlib_free_trace (vlib_main_t * vm, vlib_buffer_t * b) +{ + vlib_trace_main_t *tm = &vm->trace_main; + vlib_validate_trace (tm, b); + _vec_len (tm->trace_buffer_pool[b->trace_index]) = 0; + pool_put_index (tm->trace_buffer_pool, b->trace_index); +} + +always_inline void +vlib_trace_next_frame (vlib_main_t * vm, + vlib_node_runtime_t * r, u32 next_index) +{ + vlib_next_frame_t *nf; + nf = vlib_node_runtime_get_next_frame (vm, r, next_index); + nf->flags |= VLIB_FRAME_TRACE; +} + +void trace_apply_filter (vlib_main_t * vm); + +/* Mark buffer as traced and allocate trace buffer. */ +always_inline void +vlib_trace_buffer (vlib_main_t * vm, + vlib_node_runtime_t * r, + u32 next_index, vlib_buffer_t * b, int follow_chain) +{ + vlib_trace_main_t *tm = &vm->trace_main; + vlib_trace_header_t **h; + + /* + * Apply filter to existing traces to keep number of allocated traces low. + * Performed each time around the main loop. + */ + if (tm->last_main_loop_count != vm->main_loop_count) + { + tm->last_main_loop_count = vm->main_loop_count; + trace_apply_filter (vm); + } + + vlib_trace_next_frame (vm, r, next_index); + + pool_get (tm->trace_buffer_pool, h); + + do + { + b->flags |= VLIB_BUFFER_IS_TRACED; + b->trace_index = h - tm->trace_buffer_pool; + } + while (follow_chain && (b = vlib_get_next_buffer (vm, b))); +} + +always_inline void +vlib_buffer_copy_trace_flag (vlib_main_t * vm, vlib_buffer_t * b, + u32 bi_target) +{ + vlib_buffer_t *b_target = vlib_get_buffer (vm, bi_target); + b_target->flags |= b->flags & VLIB_BUFFER_IS_TRACED; + b_target->trace_index = b->trace_index; +} + +always_inline u32 +vlib_get_trace_count (vlib_main_t * vm, vlib_node_runtime_t * rt) +{ + vlib_trace_main_t *tm = &vm->trace_main; + vlib_trace_node_t *tn; + int n; + + if (rt->node_index >= vec_len (tm->nodes)) + return 0; + tn = tm->nodes + rt->node_index; + n = tn->limit - tn->count; + ASSERT (n >= 0); + + return n; +} + +always_inline void +vlib_set_trace_count (vlib_main_t * vm, vlib_node_runtime_t * rt, u32 count) +{ + vlib_trace_main_t *tm = &vm->trace_main; + vlib_trace_node_t *tn = vec_elt_at_index (tm->nodes, rt->node_index); + + ASSERT (count <= tn->limit); + tn->count = tn->limit - count; +} + +/* Helper function for nodes which only trace buffer data. */ +void +vlib_trace_frame_buffers_only (vlib_main_t * vm, + vlib_node_runtime_t * node, + u32 * buffers, + uword n_buffers, + uword next_buffer_stride, + uword n_buffer_data_bytes_in_trace); + +#endif /* included_vlib_trace_funcs_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/cj.c b/src/vlib/unix/cj.c new file mode 100644 index 00000000..33ba163a --- /dev/null +++ b/src/vlib/unix/cj.c @@ -0,0 +1,271 @@ +/* + *------------------------------------------------------------------ + * cj.c + * + * Copyright (c) 2013 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +/** + * @file + * Circular joournal diagnostic mechanism. + * + * The @c cj thread-safe circular log buffer scheme is occasionally useful + * when chasing bugs. Calls to it should not be checked in. + */ +/*? %%clicmd:group_label Circular Journal %% ?*/ +/*? %%syscfg:group_label Circular Journal %% ?*/ + +#include +#include + +#include + +cj_main_t cj_main; + +void +cj_log (u32 type, void *data0, void *data1) +{ + u64 new_tail; + cj_main_t *cjm = &cj_main; + cj_record_t *r; + + if (cjm->enable == 0) + return; + + new_tail = __sync_add_and_fetch (&cjm->tail, 1); + + r = (cj_record_t *) & (cjm->records[new_tail & (cjm->num_records - 1)]); + r->time = vlib_time_now (cjm->vlib_main); + r->cpu = os_get_cpu_number (); + r->type = type; + r->data[0] = pointer_to_uword (data0); + r->data[1] = pointer_to_uword (data1); +} + +void +cj_stop (void) +{ + cj_main_t *cjm = &cj_main; + + cjm->enable = 0; +} + + +clib_error_t * +cj_init (vlib_main_t * vm) +{ + cj_main_t *cjm = &cj_main; + + cjm->vlib_main = vm; + return 0; +} + +VLIB_INIT_FUNCTION (cj_init); + +static clib_error_t * +cj_config (vlib_main_t * vm, unformat_input_t * input) +{ + cj_main_t *cjm = &cj_main; + int matched = 0; + int enable = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "records %d", &cjm->num_records)) + matched = 1; + else if (unformat (input, "on")) + enable = 1; + else + return clib_error_return (0, "cj_config: unknown input '%U'", + format_unformat_error, input); + } + + if (matched == 0) + return 0; + + cjm->num_records = max_pow2 (cjm->num_records); + vec_validate (cjm->records, cjm->num_records - 1); + memset (cjm->records, 0xff, cjm->num_records * sizeof (cj_record_t)); + cjm->tail = ~0; + cjm->enable = enable; + + return 0; +} + +/*? + * Configure the circular journal diagnostic mechanism. This is only useful + * if you, the deveoper, have written code to make use of the circular + * journal. + * + * @cfgcmd{records, <number>} + * Configure the number of records to allocate for the circular journal. + * + * @cfgcmd{on} + * Enable the collection of records in the circular journal at the + * earliest opportunity. +?*/ +VLIB_CONFIG_FUNCTION (cj_config, "cj"); + +void +cj_enable_disable (int is_enable) +{ + cj_main_t *cjm = &cj_main; + + if (cjm->num_records) + cjm->enable = is_enable; + else + vlib_cli_output (cjm->vlib_main, "CJ not configured..."); +} + +static inline void +cj_dump_one_record (cj_record_t * r) +{ + fprintf (stderr, "[%d]: %10.6f T%02d %llx %llx\n", + r->cpu, r->time, r->type, (long long unsigned int) r->data[0], + (long long unsigned int) r->data[1]); +} + +static void +cj_dump_internal (u8 filter0_enable, u64 filter0, + u8 filter1_enable, u64 filter1) +{ + cj_main_t *cjm = &cj_main; + cj_record_t *r; + u32 i, index; + + if (cjm->num_records == 0) + { + fprintf (stderr, "CJ not configured...\n"); + return; + } + + if (cjm->tail == (u64) ~ 0) + { + fprintf (stderr, "No data collected...\n"); + return; + } + + /* Has the trace wrapped? */ + index = (cjm->tail + 1) & (cjm->num_records - 1); + r = &(cjm->records[index]); + + if (r->cpu != (u32) ~ 0) + { + /* Yes, dump from tail + 1 to the end */ + for (i = index; i < cjm->num_records; i++) + { + if (filter0_enable && (r->data[0] != filter0)) + goto skip; + if (filter1_enable && (r->data[1] != filter1)) + goto skip; + cj_dump_one_record (r); + skip: + r++; + } + } + /* dump from the beginning through the final tail */ + r = cjm->records; + for (i = 0; i <= cjm->tail; i++) + { + if (filter0_enable && (r->data[0] != filter0)) + goto skip2; + if (filter1_enable && (r->data[1] != filter1)) + goto skip2; + cj_dump_one_record (r); + skip2: + r++; + } +} + +void +cj_dump (void) +{ + cj_dump_internal (0, 0, 0, 0); +} + +void +cj_dump_filter_data0 (u64 filter0) +{ + cj_dump_internal (1 /* enable f0 */ , filter0, 0, 0); +} + +void +cj_dump_filter_data1 (u64 filter1) +{ + cj_dump_internal (0, 0, 1 /* enable f1 */ , filter1); +} + +void +cj_dump_filter_data12 (u64 filter0, u64 filter1) +{ + cj_dump_internal (1, filter0, 1, filter1); +} + +static clib_error_t * +cj_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + int is_enable = -1; + int is_dump = -1; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "enable") || unformat (input, "on")) + is_enable = 1; + else if (unformat (input, "disable") || unformat (input, "off")) + is_enable = 0; + else if (unformat (input, "dump")) + is_dump = 1; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (is_enable >= 0) + cj_enable_disable (is_enable); + + if (is_dump > 0) + cj_dump (); + + return 0; +} + +/*? + * Enable, disable the collection of diagnostic data into a + * circular journal or dump the circular journal diagnostic data. + * This is only useful if you, the deveoper, have written code to make + * use of the circular journal. + * + * When dumping the data it is formatted and sent to @c stderr of the + * VPP process; when running VPP in unix interactive mode + * this is typically the same place as the Debug CLI. +?*/ + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cj_command,static) = { + .path = "cj", + .short_help = "cj ", + .function = cj_command_fn, +}; +/* *INDENT-ON* */ + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/cj.h b/src/vlib/unix/cj.h new file mode 100644 index 00000000..67626afe --- /dev/null +++ b/src/vlib/unix/cj.h @@ -0,0 +1,79 @@ +/* + *------------------------------------------------------------------ + * cj.h + * + * Copyright (c) 2013 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ + +#ifndef __included_cj_h__ +#define __included_cj_h__ + +typedef struct +{ + f64 time; + u32 cpu; + u32 type; + u64 data[2]; +} cj_record_t; + +typedef struct +{ + volatile u64 tail; + cj_record_t *records; + u32 num_records; + volatile u32 enable; + + vlib_main_t *vlib_main; +} cj_main_t; + +void cj_log (u32 type, void *data0, void *data1); + +/* + * Supply in application main, so we can log from any library... + * Declare a weak reference in the library, off you go. + */ + +#define DECLARE_CJ_GLOBAL_LOG \ +void cj_global_log (unsigned type, void * data0, void * data1) \ + __attribute__ ((weak)); \ + \ +unsigned __cj_type; \ +void * __cj_data0; \ +void * __cj_data1; \ + \ +void \ +cj_global_log (unsigned type, void * data0, void * data1) \ +{ \ + __cj_type = type; \ + __cj_data0 = data0; \ + __cj_data1 = data1; \ +} + +#define CJ_GLOBAL_LOG_PROTOTYPE +void +cj_global_log (unsigned type, void *data0, void *data1) +__attribute__ ((weak)); + +void cj_stop (void); + +#endif /* __included_cj_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c new file mode 100644 index 00000000..69fca6ec --- /dev/null +++ b/src/vlib/unix/cli.c @@ -0,0 +1,2989 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * cli.c: Unix stdin/socket CLI. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +/** + * @file + * @brief Unix stdin/socket command line interface. + * Provides a command line interface so humans can interact with VPP. + * This is predominantly a debugging and testing mechanism. + */ +/*? %%clicmd:group_label Command line session %% ?*/ +/*? %%syscfg:group_label Command line session %% ?*/ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/** ANSI escape code. */ +#define ESC "\x1b" + +/** ANSI Control Sequence Introducer. */ +#define CSI ESC "[" + +/** ANSI clear screen. */ +#define ANSI_CLEAR CSI "2J" CSI "1;1H" +/** ANSI reset color settings. */ +#define ANSI_RESET CSI "0m" +/** ANSI Start bold text. */ +#define ANSI_BOLD CSI "1m" +/** ANSI Stop bold text. */ +#define ANSI_DIM CSI "2m" +/** ANSI Start dark red text. */ +#define ANSI_DRED ANSI_DIM CSI "31m" +/** ANSI Start bright red text. */ +#define ANSI_BRED ANSI_BOLD CSI "31m" +/** ANSI clear line cursor is on. */ +#define ANSI_CLEARLINE CSI "2K" +/** ANSI scroll screen down one line. */ +#define ANSI_SCROLLDN CSI "1T" +/** ANSI save cursor position. */ +#define ANSI_SAVECURSOR CSI "s" +/** ANSI restore cursor position if previously saved. */ +#define ANSI_RESTCURSOR CSI "u" + +/** Maximum depth into a byte stream from which to compile a Telnet + * protocol message. This is a saftey measure. */ +#define UNIX_CLI_MAX_DEPTH_TELNET 24 + +/** Unix standard in */ +#define UNIX_CLI_STDIN_FD 0 + + +/** A CLI banner line. */ +typedef struct +{ + u8 *line; /**< The line to print. */ + u32 length; /**< The length of the line without terminating NUL. */ +} unix_cli_banner_t; + +#define _(a) { .line = (u8 *)(a), .length = sizeof(a) - 1 } +/** Plain welcome banner. */ +static unix_cli_banner_t unix_cli_banner[] = { + _(" _______ _ _ _____ ___ \n"), + _(" __/ __/ _ \\ (_)__ | | / / _ \\/ _ \\\n"), + _(" _/ _// // / / / _ \\ | |/ / ___/ ___/\n"), + _(" /_/ /____(_)_/\\___/ |___/_/ /_/ \n"), + _("\n") +}; + +/** ANSI color welcome banner. */ +static unix_cli_banner_t unix_cli_banner_color[] = { + _(ANSI_BRED " _______ _ " ANSI_RESET " _ _____ ___ \n"), + _(ANSI_BRED " __/ __/ _ \\ (_)__ " ANSI_RESET " | | / / _ \\/ _ \\\n"), + _(ANSI_BRED " _/ _// // / / / _ \\" ANSI_RESET " | |/ / ___/ ___/\n"), + _(ANSI_BRED " /_/ /____(_)_/\\___/" ANSI_RESET " |___/_/ /_/ \n"), + _("\n") +}; + +#undef _ + +/** Pager line index */ +typedef struct +{ + /** Index into pager_vector */ + u32 line; + + /** Offset of the string in the line */ + u32 offset; + + /** Length of the string in the line */ + u32 length; +} unix_cli_pager_index_t; + + +/** Unix CLI session. */ +typedef struct +{ + /** The file index held by unix.c */ + u32 unix_file_index; + + /** Vector of output pending write to file descriptor. */ + u8 *output_vector; + + /** Vector of input saved by Unix input node to be processed by + CLI process. */ + u8 *input_vector; + + /** This session has command history. */ + u8 has_history; + /** Array of vectors of commands in the history. */ + u8 **command_history; + /** The command currently pointed at by the history cursor. */ + u8 *current_command; + /** How far from the end of the history array the user has browsed. */ + i32 excursion; + + /** Maximum number of history entries this session will store. */ + u32 history_limit; + + /** Current command line counter */ + u32 command_number; + + /** The string being searched for in the history. */ + u8 *search_key; + /** If non-zero then the CLI is searching in the history array. + * - @c -1 means search backwards. + * - @c 1 means search forwards. + */ + int search_mode; + + /** Position of the insert cursor on the current input line */ + u32 cursor; + + /** Line mode or char mode */ + u8 line_mode; + + /** Set if the CRLF mode wants CR + LF */ + u8 crlf_mode; + + /** Can we do ANSI output? */ + u8 ansi_capable; + + /** Has the session started? */ + u8 started; + + /** Disable the pager? */ + u8 no_pager; + + /** Pager buffer */ + u8 **pager_vector; + + /** Index of line fragments in the pager buffer */ + unix_cli_pager_index_t *pager_index; + + /** Line number of top of page */ + u32 pager_start; + + /** Terminal width */ + u32 width; + + /** Terminal height */ + u32 height; + + /** Process node identifier */ + u32 process_node_index; +} unix_cli_file_t; + +/** Resets the pager buffer and other data. + * @param f The CLI session whose pager needs to be reset. + */ +always_inline void +unix_cli_pager_reset (unix_cli_file_t * f) +{ + u8 **p; + + f->pager_start = 0; + + vec_free (f->pager_index); + f->pager_index = 0; + + vec_foreach (p, f->pager_vector) + { + vec_free (*p); + } + vec_free (f->pager_vector); + f->pager_vector = 0; +} + +/** Release storage used by a CLI session. + * @param f The CLI session whose storage needs to be released. + */ +always_inline void +unix_cli_file_free (unix_cli_file_t * f) +{ + vec_free (f->output_vector); + vec_free (f->input_vector); + unix_cli_pager_reset (f); +} + +/** CLI actions */ +typedef enum +{ + UNIX_CLI_PARSE_ACTION_NOACTION = 0, /**< No action */ + UNIX_CLI_PARSE_ACTION_CRLF, /**< Carriage return, newline or enter */ + UNIX_CLI_PARSE_ACTION_TAB, /**< Tab key */ + UNIX_CLI_PARSE_ACTION_ERASE, /**< Erase cursor left */ + UNIX_CLI_PARSE_ACTION_ERASERIGHT, /**< Erase cursor right */ + UNIX_CLI_PARSE_ACTION_UP, /**< Up arrow */ + UNIX_CLI_PARSE_ACTION_DOWN, /**< Down arrow */ + UNIX_CLI_PARSE_ACTION_LEFT, /**< Left arrow */ + UNIX_CLI_PARSE_ACTION_RIGHT, /**< Right arrow */ + UNIX_CLI_PARSE_ACTION_HOME, /**< Home key (jump to start of line) */ + UNIX_CLI_PARSE_ACTION_END, /**< End key (jump to end of line) */ + UNIX_CLI_PARSE_ACTION_WORDLEFT, /**< Jump cursor to start of left word */ + UNIX_CLI_PARSE_ACTION_WORDRIGHT, /**< Jump cursor to start of right word */ + UNIX_CLI_PARSE_ACTION_ERASELINELEFT, /**< Erase line to left of cursor */ + UNIX_CLI_PARSE_ACTION_ERASELINERIGHT, /**< Erase line to right & including cursor */ + UNIX_CLI_PARSE_ACTION_CLEAR, /**< Clear the terminal */ + UNIX_CLI_PARSE_ACTION_REVSEARCH, /**< Search backwards in command history */ + UNIX_CLI_PARSE_ACTION_FWDSEARCH, /**< Search forwards in command history */ + UNIX_CLI_PARSE_ACTION_YANK, /**< Undo last erase action */ + UNIX_CLI_PARSE_ACTION_TELNETIAC, /**< Telnet control code */ + + UNIX_CLI_PARSE_ACTION_PAGER_CRLF, /**< Enter pressed (CR, CRLF, LF, etc) */ + UNIX_CLI_PARSE_ACTION_PAGER_QUIT, /**< Exit the pager session */ + UNIX_CLI_PARSE_ACTION_PAGER_NEXT, /**< Scroll to next page */ + UNIX_CLI_PARSE_ACTION_PAGER_DN, /**< Scroll to next line */ + UNIX_CLI_PARSE_ACTION_PAGER_UP, /**< Scroll to previous line */ + UNIX_CLI_PARSE_ACTION_PAGER_TOP, /**< Scroll to first line */ + UNIX_CLI_PARSE_ACTION_PAGER_BOTTOM, /**< Scroll to last line */ + UNIX_CLI_PARSE_ACTION_PAGER_PGDN, /**< Scroll to next page */ + UNIX_CLI_PARSE_ACTION_PAGER_PGUP, /**< Scroll to previous page */ + UNIX_CLI_PARSE_ACTION_PAGER_REDRAW, /**< Clear and redraw the page on the terminal */ + UNIX_CLI_PARSE_ACTION_PAGER_SEARCH, /**< Search the pager buffer */ + + UNIX_CLI_PARSE_ACTION_PARTIALMATCH, /**< Action parser found a partial match */ + UNIX_CLI_PARSE_ACTION_NOMATCH /**< Action parser did not find any match */ +} unix_cli_parse_action_t; + +/** @brief Mapping of input buffer strings to action values. + * @note This won't work as a hash since we need to be able to do + * partial matches on the string. + */ +typedef struct +{ + u8 *input; /**< Input string to match. */ + u32 len; /**< Length of input without final NUL. */ + unix_cli_parse_action_t action; /**< Action to take when matched. */ +} unix_cli_parse_actions_t; + +/** @brief Given a capital ASCII letter character return a @c NUL terminated + * string with the control code for that letter. + * + * @param c An ASCII character. + * @return A @c NUL terminated string of type @c u8[]. + * + * @par Example + * @c CTL('A') returns { 0x01, 0x00 } as a @c u8[]. + */ +#define CTL(c) (u8[]){ (c) - '@', 0 } + +#define _(a,b) { .input = (u8 *)(a), .len = sizeof(a) - 1, .action = (b) } +/** + * Patterns to match on a CLI input stream. + * @showinitializer + */ +static unix_cli_parse_actions_t unix_cli_parse_strings[] = { + /* Line handling */ + _("\r\n", UNIX_CLI_PARSE_ACTION_CRLF), /* Must be before '\r' */ + _("\n", UNIX_CLI_PARSE_ACTION_CRLF), + _("\r\0", UNIX_CLI_PARSE_ACTION_CRLF), /* Telnet does this */ + _("\r", UNIX_CLI_PARSE_ACTION_CRLF), + + /* Unix shell control codes */ + _(CTL ('B'), UNIX_CLI_PARSE_ACTION_LEFT), + _(CTL ('F'), UNIX_CLI_PARSE_ACTION_RIGHT), + _(CTL ('P'), UNIX_CLI_PARSE_ACTION_UP), + _(CTL ('N'), UNIX_CLI_PARSE_ACTION_DOWN), + _(CTL ('A'), UNIX_CLI_PARSE_ACTION_HOME), + _(CTL ('E'), UNIX_CLI_PARSE_ACTION_END), + _(CTL ('D'), UNIX_CLI_PARSE_ACTION_ERASERIGHT), + _(CTL ('U'), UNIX_CLI_PARSE_ACTION_ERASELINELEFT), + _(CTL ('K'), UNIX_CLI_PARSE_ACTION_ERASELINERIGHT), + _(CTL ('Y'), UNIX_CLI_PARSE_ACTION_YANK), + _(CTL ('L'), UNIX_CLI_PARSE_ACTION_CLEAR), + _(ESC "b", UNIX_CLI_PARSE_ACTION_WORDLEFT), /* Alt-B */ + _(ESC "f", UNIX_CLI_PARSE_ACTION_WORDRIGHT), /* Alt-F */ + _("\b", UNIX_CLI_PARSE_ACTION_ERASE), /* ^H */ + _("\x7f", UNIX_CLI_PARSE_ACTION_ERASE), /* Backspace */ + _("\t", UNIX_CLI_PARSE_ACTION_TAB), /* ^I */ + + /* VT100 Normal mode - Broadest support */ + _(CSI "A", UNIX_CLI_PARSE_ACTION_UP), + _(CSI "B", UNIX_CLI_PARSE_ACTION_DOWN), + _(CSI "C", UNIX_CLI_PARSE_ACTION_RIGHT), + _(CSI "D", UNIX_CLI_PARSE_ACTION_LEFT), + _(CSI "H", UNIX_CLI_PARSE_ACTION_HOME), + _(CSI "F", UNIX_CLI_PARSE_ACTION_END), + _(CSI "3~", UNIX_CLI_PARSE_ACTION_ERASERIGHT), /* Delete */ + _(CSI "1;5D", UNIX_CLI_PARSE_ACTION_WORDLEFT), /* C-Left */ + _(CSI "1;5C", UNIX_CLI_PARSE_ACTION_WORDRIGHT), /* C-Right */ + + /* VT100 Application mode - Some Gnome Terminal functions use these */ + _(ESC "OA", UNIX_CLI_PARSE_ACTION_UP), + _(ESC "OB", UNIX_CLI_PARSE_ACTION_DOWN), + _(ESC "OC", UNIX_CLI_PARSE_ACTION_RIGHT), + _(ESC "OD", UNIX_CLI_PARSE_ACTION_LEFT), + _(ESC "OH", UNIX_CLI_PARSE_ACTION_HOME), + _(ESC "OF", UNIX_CLI_PARSE_ACTION_END), + + /* ANSI X3.41-1974 - sent by Microsoft Telnet and PuTTY */ + _(CSI "1~", UNIX_CLI_PARSE_ACTION_HOME), + _(CSI "4~", UNIX_CLI_PARSE_ACTION_END), + + /* Emacs-ish history search */ + _(CTL ('S'), UNIX_CLI_PARSE_ACTION_FWDSEARCH), + _(CTL ('R'), UNIX_CLI_PARSE_ACTION_REVSEARCH), + + /* Other protocol things */ + _("\xff", UNIX_CLI_PARSE_ACTION_TELNETIAC), /* IAC */ + _("\0", UNIX_CLI_PARSE_ACTION_NOACTION), /* NUL */ + _(NULL, UNIX_CLI_PARSE_ACTION_NOMATCH) +}; + +/** + * Patterns to match when a CLI session is in the pager. + * @showinitializer + */ +static unix_cli_parse_actions_t unix_cli_parse_pager[] = { + /* Line handling */ + _("\r\n", UNIX_CLI_PARSE_ACTION_PAGER_CRLF), /* Must be before '\r' */ + _("\n", UNIX_CLI_PARSE_ACTION_PAGER_CRLF), + _("\r\0", UNIX_CLI_PARSE_ACTION_PAGER_CRLF), /* Telnet does this */ + _("\r", UNIX_CLI_PARSE_ACTION_PAGER_CRLF), + + /* Pager commands */ + _(" ", UNIX_CLI_PARSE_ACTION_PAGER_NEXT), + _("q", UNIX_CLI_PARSE_ACTION_PAGER_QUIT), + _(CTL ('L'), UNIX_CLI_PARSE_ACTION_PAGER_REDRAW), + _(CTL ('R'), UNIX_CLI_PARSE_ACTION_PAGER_REDRAW), + _("/", UNIX_CLI_PARSE_ACTION_PAGER_SEARCH), + + /* VT100 */ + _(CSI "A", UNIX_CLI_PARSE_ACTION_PAGER_UP), + _(CSI "B", UNIX_CLI_PARSE_ACTION_PAGER_DN), + _(CSI "H", UNIX_CLI_PARSE_ACTION_PAGER_TOP), + _(CSI "F", UNIX_CLI_PARSE_ACTION_PAGER_BOTTOM), + + /* VT100 Application mode */ + _(ESC "OA", UNIX_CLI_PARSE_ACTION_PAGER_UP), + _(ESC "OB", UNIX_CLI_PARSE_ACTION_PAGER_DN), + _(ESC "OH", UNIX_CLI_PARSE_ACTION_PAGER_TOP), + _(ESC "OF", UNIX_CLI_PARSE_ACTION_PAGER_BOTTOM), + + /* ANSI X3.41-1974 */ + _(CSI "1~", UNIX_CLI_PARSE_ACTION_PAGER_TOP), + _(CSI "4~", UNIX_CLI_PARSE_ACTION_PAGER_BOTTOM), + _(CSI "5~", UNIX_CLI_PARSE_ACTION_PAGER_PGUP), + _(CSI "6~", UNIX_CLI_PARSE_ACTION_PAGER_PGDN), + + /* Other protocol things */ + _("\xff", UNIX_CLI_PARSE_ACTION_TELNETIAC), /* IAC */ + _("\0", UNIX_CLI_PARSE_ACTION_NOACTION), /* NUL */ + _(NULL, UNIX_CLI_PARSE_ACTION_NOMATCH) +}; + +#undef _ + +/** CLI session events. */ +typedef enum +{ + UNIX_CLI_PROCESS_EVENT_READ_READY, /**< A file descriptor has data to be read. */ + UNIX_CLI_PROCESS_EVENT_QUIT, /**< A CLI session wants to close. */ +} unix_cli_process_event_type_t; + +/** CLI global state. */ +typedef struct +{ + /** Prompt string for CLI. */ + u8 *cli_prompt; + + /** Vec pool of CLI sessions. */ + unix_cli_file_t *cli_file_pool; + + /** Vec pool of unused session indices. */ + u32 *unused_cli_process_node_indices; + + /** The session index of the stdin cli */ + u32 stdin_cli_file_index; + + /** File pool index of current input. */ + u32 current_input_file_index; +} unix_cli_main_t; + +/** CLI global state */ +static unix_cli_main_t unix_cli_main; + +/** + * @brief Search for a byte sequence in the action list. + * + * Searches the @ref unix_cli_parse_actions_t list in @a a for a match with + * the bytes in @a input of maximum length @a ilen bytes. + * When a match is made @a *matched indicates how many bytes were matched. + * Returns a value from the enum @ref unix_cli_parse_action_t to indicate + * whether no match was found, a partial match was found or a complete + * match was found and what action, if any, should be taken. + * + * @param[in] a Actions list to search within. + * @param[in] input String fragment to search for. + * @param[in] ilen Length of the string in 'input'. + * @param[out] matched Pointer to an integer that will contain the number + * of bytes matched when a complete match is found. + * + * @return Action from @ref unix_cli_parse_action_t that the string fragment + * matches. + * @ref UNIX_CLI_PARSE_ACTION_PARTIALMATCH is returned when the + * whole input string matches the start of at least one action. + * @ref UNIX_CLI_PARSE_ACTION_NOMATCH is returned when there is no + * match at all. + */ +static unix_cli_parse_action_t +unix_cli_match_action (unix_cli_parse_actions_t * a, + u8 * input, u32 ilen, i32 * matched) +{ + u8 partial = 0; + + while (a->input) + { + if (ilen >= a->len) + { + /* see if the start of the input buffer exactly matches the current + * action string. */ + if (memcmp (input, a->input, a->len) == 0) + { + *matched = a->len; + return a->action; + } + } + else + { + /* if the first ilen characters match, flag this as a partial - + * meaning keep collecting bytes in case of a future match */ + if (memcmp (input, a->input, ilen) == 0) + partial = 1; + } + + /* check next action */ + a++; + } + + return partial ? + UNIX_CLI_PARSE_ACTION_PARTIALMATCH : UNIX_CLI_PARSE_ACTION_NOMATCH; +} + + +/** Add bytes to the output vector and then flagg the I/O system that bytes + * are available to be sent. + */ +static void +unix_cli_add_pending_output (unix_file_t * uf, + unix_cli_file_t * cf, + u8 * buffer, uword buffer_bytes) +{ + unix_main_t *um = &unix_main; + + vec_add (cf->output_vector, buffer, buffer_bytes); + if (vec_len (cf->output_vector) > 0) + { + int skip_update = 0 != (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); + uf->flags |= UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + if (!skip_update) + um->file_update (uf, UNIX_FILE_UPDATE_MODIFY); + } +} + +/** Delete all bytes from the output vector and flag the I/O system + * that no more bytes are available to be sent. + */ +static void +unix_cli_del_pending_output (unix_file_t * uf, + unix_cli_file_t * cf, uword n_bytes) +{ + unix_main_t *um = &unix_main; + + vec_delete (cf->output_vector, n_bytes, 0); + if (vec_len (cf->output_vector) <= 0) + { + int skip_update = 0 == (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); + uf->flags &= ~UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + if (!skip_update) + um->file_update (uf, UNIX_FILE_UPDATE_MODIFY); + } +} + +/** @brief A bit like strchr with a buffer length limit. + * Search a buffer for the first instance of a character up to the limit of + * the buffer length. If found then return the position of that character. + * + * The key departure from strchr is that if the character is not found then + * return the buffer length. + * + * @param chr The byte value to search for. + * @param str The buffer in which to search for the value. + * @param len The depth into the buffer to search. + * + * @return The index of the first occurence of \c chr. If \c chr is not + * found then \c len instead. + */ +always_inline word +unix_vlib_findchr (u8 chr, u8 * str, word len) +{ + word i = 0; + for (i = 0; i < len; i++, str++) + { + if (*str == chr) + return i; + } + return len; +} + +/** @brief Send a buffer to the CLI stream if possible, enqueue it otherwise. + * Attempts to write given buffer to the file descriptor of the given + * Unix CLI session. If that session already has data in the output buffer + * or if the write attempt tells us to try again later then the given buffer + * is appended to the pending output buffer instead. + * + * This is typically called only from \c unix_vlib_cli_output_cooked since + * that is where CRLF handling occurs or from places where we explicitly do + * not want cooked handling. + * + * @param cf Unix CLI session of the desired stream to write to. + * @param uf The Unix file structure of the desired stream to write to. + * @param buffer Pointer to the buffer that needs to be written. + * @param buffer_bytes The number of bytes from \c buffer to write. + */ +static void +unix_vlib_cli_output_raw (unix_cli_file_t * cf, + unix_file_t * uf, u8 * buffer, uword buffer_bytes) +{ + int n = 0; + + if (vec_len (cf->output_vector) == 0) + n = write (uf->file_descriptor, buffer, buffer_bytes); + + if (n < 0 && errno != EAGAIN) + { + clib_unix_warning ("write"); + } + else if ((word) n < (word) buffer_bytes) + { + /* We got EAGAIN or we already have stuff in the buffer; + * queue up whatever didn't get sent for later. */ + if (n < 0) + n = 0; + unix_cli_add_pending_output (uf, cf, buffer + n, buffer_bytes - n); + } +} + +/** @brief Process a buffer for CRLF handling before outputting it to the CLI. + * + * @param cf Unix CLI session of the desired stream to write to. + * @param uf The Unix file structure of the desired stream to write to. + * @param buffer Pointer to the buffer that needs to be written. + * @param buffer_bytes The number of bytes from \c buffer to write. + */ +static void +unix_vlib_cli_output_cooked (unix_cli_file_t * cf, + unix_file_t * uf, + u8 * buffer, uword buffer_bytes) +{ + word end = 0, start = 0; + + while (end < buffer_bytes) + { + if (cf->crlf_mode) + { + /* iterate the line on \n's so we can insert a \r before it */ + end = unix_vlib_findchr ('\n', + buffer + start, + buffer_bytes - start) + start; + } + else + { + /* otherwise just send the whole buffer */ + end = buffer_bytes; + } + + unix_vlib_cli_output_raw (cf, uf, buffer + start, end - start); + + if (cf->crlf_mode) + { + if (end < buffer_bytes) + { + unix_vlib_cli_output_raw (cf, uf, (u8 *) "\r\n", 2); + end++; /* skip the \n that we already sent */ + } + start = end; + } + } +} + +/** @brief Output the CLI prompt */ +static void +unix_cli_cli_prompt (unix_cli_file_t * cf, unix_file_t * uf) +{ + unix_cli_main_t *cm = &unix_cli_main; + + unix_vlib_cli_output_raw (cf, uf, cm->cli_prompt, vec_len (cm->cli_prompt)); +} + +/** @brief Output a pager prompt and show number of buffered lines */ +static void +unix_cli_pager_prompt (unix_cli_file_t * cf, unix_file_t * uf) +{ + u8 *prompt; + u32 h; + + h = cf->pager_start + (cf->height - 1); + if (h > vec_len (cf->pager_index)) + h = vec_len (cf->pager_index); + + prompt = format (0, "\r%s-- more -- (%d-%d/%d)%s", + cf->ansi_capable ? ANSI_BOLD : "", + cf->pager_start + 1, + h, + vec_len (cf->pager_index), + cf->ansi_capable ? ANSI_RESET : ""); + + unix_vlib_cli_output_cooked (cf, uf, prompt, vec_len (prompt)); + + vec_free (prompt); +} + +/** @brief Output a pager "skipping" message */ +static void +unix_cli_pager_message (unix_cli_file_t * cf, unix_file_t * uf, + char *message, char *postfix) +{ + u8 *prompt; + + prompt = format (0, "\r%s-- %s --%s%s", + cf->ansi_capable ? ANSI_BOLD : "", + message, cf->ansi_capable ? ANSI_RESET : "", postfix); + + unix_vlib_cli_output_cooked (cf, uf, prompt, vec_len (prompt)); + + vec_free (prompt); +} + +/** @brief Erase the printed pager prompt */ +static void +unix_cli_pager_prompt_erase (unix_cli_file_t * cf, unix_file_t * uf) +{ + if (cf->ansi_capable) + { + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\r", 1); + unix_vlib_cli_output_cooked (cf, uf, + (u8 *) ANSI_CLEARLINE, + sizeof (ANSI_CLEARLINE) - 1); + } + else + { + int i; + + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\r", 1); + for (i = 0; i < cf->width - 1; i++) + unix_vlib_cli_output_cooked (cf, uf, (u8 *) " ", 1); + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\r", 1); + } +} + +/** @brief Uses an ANSI escape sequence to move the cursor */ +static void +unix_cli_ansi_cursor (unix_cli_file_t * cf, unix_file_t * uf, u16 x, u16 y) +{ + u8 *str; + + str = format (0, "%s%d;%dH", CSI, y, x); + + unix_vlib_cli_output_cooked (cf, uf, str, vec_len (str)); + + vec_free (str); +} + +/** Redraw the currently displayed page of text. + * @param cf CLI session to redraw the pager buffer of. + * @param uf Unix file of the CLI session. + */ +static void +unix_cli_pager_redraw (unix_cli_file_t * cf, unix_file_t * uf) +{ + unix_cli_pager_index_t *pi = NULL; + u8 *line = NULL; + word i; + + /* No active pager? Do nothing. */ + if (!vec_len (cf->pager_index)) + return; + + if (cf->ansi_capable) + { + /* If we have ANSI, send the clear screen sequence */ + unix_vlib_cli_output_cooked (cf, uf, + (u8 *) ANSI_CLEAR, + sizeof (ANSI_CLEAR) - 1); + } + else + { + /* Otherwise make sure we're on a blank line */ + unix_cli_pager_prompt_erase (cf, uf); + } + + /* (Re-)send the current page of content */ + for (i = 0; i < cf->height - 1 && + i + cf->pager_start < vec_len (cf->pager_index); i++) + { + pi = &cf->pager_index[cf->pager_start + i]; + line = cf->pager_vector[pi->line] + pi->offset; + + unix_vlib_cli_output_cooked (cf, uf, line, pi->length); + } + /* if the last line didn't end in newline, add a newline */ + if (pi && line[pi->length - 1] != '\n') + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1); + + unix_cli_pager_prompt (cf, uf); +} + +/** @brief Process and add a line to the pager index. + * In normal operation this function will take the given character string + * found in @c line and with length @c len_or_index and iterates the over the + * contents, adding each line of text discovered within it to the + * pager index. Lines are identified by newlines ("\\n") and by + * strings longer than the width of the terminal. + * + * If instead @c line is @c NULL then @c len_or_index is taken to mean the + * index of an existing line in the pager buffer; this simply means that the + * input line does not need to be cloned since we alreayd have it. This is + * typical if we are reindexing the pager buffer. + * + * @param cf The CLI session whose pager we are adding to. + * @param line The string of text to be indexed into the pager buffer. + * If @c line is @c NULL then the mode of operation + * changes slightly; see the description above. + * @param len_or_index If @c line is a pointer to a string then this parameter + * indicates the length of that string; Otherwise this + * value provides the index in the pager buffer of an + * existing string to be indexed. + */ +static void +unix_cli_pager_add_line (unix_cli_file_t * cf, u8 * line, word len_or_index) +{ + u8 *p; + word i, j, k; + word line_index, len; + u32 width = cf->width; + unix_cli_pager_index_t *pi; + + if (line == NULL) + { + /* Use a line already in the pager buffer */ + line_index = len_or_index; + p = cf->pager_vector[line_index]; + len = vec_len (p); + } + else + { + len = len_or_index; + /* Add a copy of the raw string to the pager buffer */ + p = vec_new (u8, len); + clib_memcpy (p, line, len); + + /* store in pager buffer */ + line_index = vec_len (cf->pager_vector); + vec_add1 (cf->pager_vector, p); + } + + i = 0; + while (i < len) + { + /* Find the next line, or run to terminal width, or run to EOL */ + int l = len - i; + j = unix_vlib_findchr ((u8) '\n', p, l < width ? l : width); + + if (j < l && p[j] == '\n') /* incl \n */ + j++; + + /* Add the line to the index */ + k = vec_len (cf->pager_index); + vec_validate (cf->pager_index, k); + pi = &cf->pager_index[k]; + + pi->line = line_index; + pi->offset = i; + pi->length = j; + + i += j; + p += j; + } +} + +/** @brief Reindex entire pager buffer. + * Resets the current pager index and then re-adds the lines in the pager + * buffer to the index. + * + * Additionally this function attempts to retain the current page start + * line offset by searching for the same top-of-screen line in the new index. + * + * @param cf The CLI session whose pager buffer should be reindexed. + */ +static void +unix_cli_pager_reindex (unix_cli_file_t * cf) +{ + word i, old_line, old_offset; + unix_cli_pager_index_t *pi; + + /* If there is nothing in the pager buffer then make sure the index + * is empty and move on. + */ + if (cf->pager_vector == 0) + { + vec_reset_length (cf->pager_index); + return; + } + + /* Retain a pointer to the current page start line so we can + * find it later + */ + pi = &cf->pager_index[cf->pager_start]; + old_line = pi->line; + old_offset = pi->offset; + + /* Re-add the buffered lines to the index */ + vec_reset_length (cf->pager_index); + vec_foreach_index (i, cf->pager_vector) + { + unix_cli_pager_add_line (cf, NULL, i); + } + + /* Attempt to re-locate the previously stored page start line */ + vec_foreach_index (i, cf->pager_index) + { + pi = &cf->pager_index[i]; + + if (pi->line == old_line && + (pi->offset <= old_offset || pi->offset + pi->length > old_offset)) + { + /* Found it! */ + cf->pager_start = i; + break; + } + } + + /* In case the start line was not found (rare), ensure the pager start + * index is within bounds + */ + if (cf->pager_start >= vec_len (cf->pager_index)) + { + if (!cf->height || vec_len (cf->pager_index) < (cf->height - 1)) + cf->pager_start = 0; + else + cf->pager_start = vec_len (cf->pager_index) - (cf->height - 1); + } +} + +/** VLIB CLI output function. + * + * If the terminal has a pager configured then this function takes care + * of collating output into the pager buffer; ensuring only the first page + * is displayed and any lines in excess of the first page are buffered. + * + * If the maximum number of index lines in the buffer is exceeded then the + * pager is cancelled and the contents of the current buffer are sent to the + * terminal. + * + * If there is no pager configured then the output is sent directly to the + * terminal. + * + * @param cli_file_index Index of the CLI session where this output is + * directed. + * @param buffer String of printabe bytes to be output. + * @param buffer_bytes The number of bytes in @c buffer to be output. + */ +static void +unix_vlib_cli_output (uword cli_file_index, u8 * buffer, uword buffer_bytes) +{ + unix_main_t *um = &unix_main; + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf; + unix_file_t *uf; + + cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); + uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + + if (cf->no_pager || um->cli_pager_buffer_limit == 0 || cf->height == 0) + { + unix_vlib_cli_output_cooked (cf, uf, buffer, buffer_bytes); + } + else + { + word row = vec_len (cf->pager_index); + u8 *line; + unix_cli_pager_index_t *pi; + + /* Index and add the output lines to the pager buffer. */ + unix_cli_pager_add_line (cf, buffer, buffer_bytes); + + /* Now iterate what was added to display the lines. + * If we reach the bottom of the page, display a prompt. + */ + while (row < vec_len (cf->pager_index)) + { + if (row < cf->height - 1) + { + /* output this line */ + pi = &cf->pager_index[row]; + line = cf->pager_vector[pi->line] + pi->offset; + unix_vlib_cli_output_cooked (cf, uf, line, pi->length); + + /* if the last line didn't end in newline, and we're at the + * bottom of the page, add a newline */ + if (line[pi->length - 1] != '\n' && row == cf->height - 2) + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1); + } + else + { + /* Display the pager prompt every 10 lines */ + if (!(row % 10)) + unix_cli_pager_prompt (cf, uf); + } + row++; + } + + /* Check if we went over the pager buffer limit */ + if (vec_len (cf->pager_index) > um->cli_pager_buffer_limit) + { + /* Stop using the pager for the remainder of this CLI command */ + cf->no_pager = 2; + + /* If we likely printed the prompt, erase it */ + if (vec_len (cf->pager_index) > cf->height - 1) + unix_cli_pager_prompt_erase (cf, uf); + + /* Dump out the contents of the buffer */ + for (row = cf->pager_start + (cf->height - 1); + row < vec_len (cf->pager_index); row++) + { + pi = &cf->pager_index[row]; + line = cf->pager_vector[pi->line] + pi->offset; + unix_vlib_cli_output_cooked (cf, uf, line, pi->length); + } + + unix_cli_pager_reset (cf); + } + } +} + +/** Identify whether a terminal type is ANSI capable. + * + * Compares the string given in @c term with a list of terminal types known + * to support ANSI escape sequences. + * + * This list contains, for example, @c xterm, @c screen and @c ansi. + * + * @param term A string with a terminal type in it. + * @param len The length of the string in @c term. + * + * @return @c 1 if the terminal type is recognized as supporting ANSI + * terminal sequences; @c 0 otherwise. + */ +static u8 +unix_cli_terminal_type (u8 * term, uword len) +{ + /* This may later be better done as a hash of some sort. */ +#define _(a) do { \ + if (strncasecmp(a, (char *)term, (size_t)len) == 0) return 1; \ + } while(0) + + _("xterm"); + _("xterm-color"); + _("xterm-256color"); /* iTerm on Mac */ + _("screen"); + _("ansi"); /* Microsoft Telnet */ +#undef _ + + return 0; +} + +/** @brief Emit initial welcome banner and prompt on a connection. */ +static void +unix_cli_file_welcome (unix_cli_main_t * cm, unix_cli_file_t * cf) +{ + unix_main_t *um = &unix_main; + unix_file_t *uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + unix_cli_banner_t *banner; + int i, len; + + /* + * Put the first bytes directly into the buffer so that further output is + * queued until everything is ready. (oterwise initial prompt can appear + * mid way through VPP initialization) + */ + unix_cli_add_pending_output (uf, cf, (u8 *) "\r", 1); + + if (!um->cli_no_banner) + { + if (cf->ansi_capable) + { + banner = unix_cli_banner_color; + len = ARRAY_LEN (unix_cli_banner_color); + } + else + { + banner = unix_cli_banner; + len = ARRAY_LEN (unix_cli_banner); + } + + for (i = 0; i < len; i++) + { + unix_vlib_cli_output_cooked (cf, uf, + banner[i].line, banner[i].length); + } + } + + /* Prompt. */ + unix_cli_cli_prompt (cf, uf); + + cf->started = 1; +} + +/** @brief A failsafe triggered on a timer to ensure we send the prompt + * to telnet sessions that fail to negotiate the terminal type. */ +static void +unix_cli_file_welcome_timer (any arg, f64 delay) +{ + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf; + (void) delay; + + /* Check the connection didn't close already */ + if (pool_is_free_index (cm->cli_file_pool, (uword) arg)) + return; + + cf = pool_elt_at_index (cm->cli_file_pool, (uword) arg); + + if (!cf->started) + unix_cli_file_welcome (cm, cf); +} + +/** @brief A mostly no-op Telnet state machine. + * Process Telnet command bytes in a way that ensures we're mostly + * transparent to the Telnet protocol. That is, it's mostly a no-op. + * + * @return -1 if we need more bytes, otherwise a positive integer number of + * bytes to consume from the input_vector, not including the initial + * IAC byte. + */ +static i32 +unix_cli_process_telnet (unix_main_t * um, + unix_cli_file_t * cf, + unix_file_t * uf, u8 * input_vector, uword len) +{ + /* Input_vector starts at IAC byte. + * See if we have a complete message; if not, return -1 so we wait for more. + * if we have a complete message, consume those bytes from the vector. + */ + i32 consume = 0; + + if (len == 1) + return -1; /* want more bytes */ + + switch (input_vector[1]) + { + case IAC: + /* two IAC's in a row means to pass through 0xff. + * since that makes no sense here, just consume it. + */ + consume = 1; + break; + + case WILL: + case WONT: + case DO: + case DONT: + /* Expect 3 bytes */ + if (vec_len (input_vector) < 3) + return -1; /* want more bytes */ + + consume = 2; + break; + + case SB: + { + /* Sub option - search ahead for IAC SE to end it */ + i32 i; + for (i = 3; i < len && i < UNIX_CLI_MAX_DEPTH_TELNET; i++) + { + if (input_vector[i - 1] == IAC && input_vector[i] == SE) + { + /* We have a complete message; see if we care about it */ + switch (input_vector[2]) + { + case TELOPT_TTYPE: + if (input_vector[3] != 0) + break; + /* See if the terminal type is ANSI capable */ + cf->ansi_capable = + unix_cli_terminal_type (input_vector + 4, i - 5); + /* If session not started, we can release the pause */ + if (!cf->started) + /* Send the welcome banner and initial prompt */ + unix_cli_file_welcome (&unix_cli_main, cf); + break; + + case TELOPT_NAWS: + /* Window size */ + if (i != 8) /* check message is correct size */ + break; + cf->width = + clib_net_to_host_u16 (*((u16 *) (input_vector + 3))); + cf->height = + clib_net_to_host_u16 (*((u16 *) (input_vector + 5))); + /* reindex pager buffer */ + unix_cli_pager_reindex (cf); + /* redraw page */ + unix_cli_pager_redraw (cf, uf); + break; + + default: + break; + } + /* Consume it all */ + consume = i; + break; + } + } + + if (i == UNIX_CLI_MAX_DEPTH_TELNET) + consume = 1; /* hit max search depth, advance one byte */ + + if (consume == 0) + return -1; /* want more bytes */ + + break; + } + + case GA: + case EL: + case EC: + case AO: + case IP: + case BREAK: + case DM: + case NOP: + case SE: + case EOR: + case ABORT: + case SUSP: + case xEOF: + /* Simple one-byte messages */ + consume = 1; + break; + + case AYT: + /* Are You There - trigger a visible response */ + consume = 1; + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "fd.io VPP\n", 10); + break; + + default: + /* Unknown command! Eat the IAC byte */ + break; + } + + return consume; +} + +/** @brief Process actionable input. + * Based on the \c action process the input; this typically involves + * searching the command history or editing the current command line. + */ +static int +unix_cli_line_process_one (unix_cli_main_t * cm, + unix_main_t * um, + unix_cli_file_t * cf, + unix_file_t * uf, + u8 input, unix_cli_parse_action_t action) +{ + u8 *prev; + int j, delta; + + switch (action) + { + case UNIX_CLI_PARSE_ACTION_NOACTION: + break; + + case UNIX_CLI_PARSE_ACTION_REVSEARCH: + case UNIX_CLI_PARSE_ACTION_FWDSEARCH: + if (!cf->has_history || !cf->history_limit) + break; + if (cf->search_mode == 0) + { + /* Erase the current command (if any) */ + for (j = 0; j < (vec_len (cf->current_command)); j++) + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b \b", 3); + + vec_reset_length (cf->search_key); + vec_reset_length (cf->current_command); + if (action == UNIX_CLI_PARSE_ACTION_REVSEARCH) + cf->search_mode = -1; + else + cf->search_mode = 1; + cf->cursor = 0; + } + else + { + if (action == UNIX_CLI_PARSE_ACTION_REVSEARCH) + cf->search_mode = -1; + else + cf->search_mode = 1; + + cf->excursion += cf->search_mode; + goto search_again; + } + break; + + case UNIX_CLI_PARSE_ACTION_ERASELINELEFT: + /* Erase the command from the cursor to the start */ + + /* Shimmy forwards to the new end of line position */ + delta = vec_len (cf->current_command) - cf->cursor; + for (j = cf->cursor; j > delta; j--) + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1); + /* Zap from here to the end of what is currently displayed */ + for (; j < (vec_len (cf->current_command)); j++) + unix_vlib_cli_output_cooked (cf, uf, (u8 *) " ", 1); + /* Get back to the start of the line */ + for (j = 0; j < (vec_len (cf->current_command)); j++) + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1); + + j = vec_len (cf->current_command) - cf->cursor; + memmove (cf->current_command, cf->current_command + cf->cursor, j); + _vec_len (cf->current_command) = j; + + /* Print the new contents */ + unix_vlib_cli_output_cooked (cf, uf, cf->current_command, j); + /* Shimmy back to the start */ + for (j = 0; j < (vec_len (cf->current_command)); j++) + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1); + cf->cursor = 0; + + cf->search_mode = 0; + break; + + case UNIX_CLI_PARSE_ACTION_ERASELINERIGHT: + /* Erase the command from the cursor to the end */ + + /* Zap from cursor to end of what is currently displayed */ + for (j = cf->cursor; j < (vec_len (cf->current_command)); j++) + unix_vlib_cli_output_cooked (cf, uf, (u8 *) " ", 1); + /* Get back to where we were */ + for (j = cf->cursor; j < (vec_len (cf->current_command)); j++) + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1); + + /* Truncate the line at the cursor */ + _vec_len (cf->current_command) = cf->cursor; + + cf->search_mode = 0; + break; + + case UNIX_CLI_PARSE_ACTION_LEFT: + if (cf->cursor > 0) + { + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1); + cf->cursor--; + } + + cf->search_mode = 0; + break; + + case UNIX_CLI_PARSE_ACTION_RIGHT: + if (cf->cursor < vec_len (cf->current_command)) + { + /* have to emit the character under the cursor */ + unix_vlib_cli_output_cooked (cf, uf, + cf->current_command + cf->cursor, 1); + cf->cursor++; + } + + cf->search_mode = 0; + break; + + case UNIX_CLI_PARSE_ACTION_UP: + case UNIX_CLI_PARSE_ACTION_DOWN: + if (!cf->has_history || !cf->history_limit) + break; + cf->search_mode = 0; + /* Erase the command */ + for (j = cf->cursor; j < (vec_len (cf->current_command)); j++) + unix_vlib_cli_output_cooked (cf, uf, (u8 *) " ", 1); + for (j = 0; j < (vec_len (cf->current_command)); j++) + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b \b", 3); + vec_reset_length (cf->current_command); + if (vec_len (cf->command_history)) + { + if (action == UNIX_CLI_PARSE_ACTION_UP) + delta = -1; + else + delta = 1; + + cf->excursion += delta; + + if (cf->excursion == vec_len (cf->command_history)) + { + /* down-arrowed to last entry - want a blank line */ + _vec_len (cf->current_command) = 0; + } + else if (cf->excursion < 0) + { + /* up-arrowed over the start to the end, want a blank line */ + cf->excursion = vec_len (cf->command_history); + _vec_len (cf->current_command) = 0; + } + else + { + if (cf->excursion > (i32) vec_len (cf->command_history) - 1) + /* down-arrowed past end - wrap to start */ + cf->excursion = 0; + + /* Print the command at the current position */ + prev = cf->command_history[cf->excursion]; + vec_validate (cf->current_command, vec_len (prev) - 1); + + clib_memcpy (cf->current_command, prev, vec_len (prev)); + _vec_len (cf->current_command) = vec_len (prev); + unix_vlib_cli_output_cooked (cf, uf, cf->current_command, + vec_len (cf->current_command)); + } + cf->cursor = vec_len (cf->current_command); + + break; + } + break; + + case UNIX_CLI_PARSE_ACTION_HOME: + if (vec_len (cf->current_command) && cf->cursor > 0) + { + while (cf->cursor) + { + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1); + cf->cursor--; + } + } + + cf->search_mode = 0; + break; + + case UNIX_CLI_PARSE_ACTION_END: + if (vec_len (cf->current_command) && + cf->cursor < vec_len (cf->current_command)) + { + unix_vlib_cli_output_cooked (cf, uf, + cf->current_command + cf->cursor, + vec_len (cf->current_command) - + cf->cursor); + cf->cursor = vec_len (cf->current_command); + } + + cf->search_mode = 0; + break; + + case UNIX_CLI_PARSE_ACTION_WORDLEFT: + if (vec_len (cf->current_command) && cf->cursor > 0) + { + j = cf->cursor; + + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1); + j--; + + while (j && isspace (cf->current_command[j])) + { + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1); + j--; + } + while (j && !isspace (cf->current_command[j])) + { + if (isspace (cf->current_command[j - 1])) + break; + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1); + j--; + } + + cf->cursor = j; + } + + cf->search_mode = 0; + break; + + case UNIX_CLI_PARSE_ACTION_WORDRIGHT: + if (vec_len (cf->current_command) && + cf->cursor < vec_len (cf->current_command)) + { + int e = vec_len (cf->current_command); + j = cf->cursor; + while (j < e && !isspace (cf->current_command[j])) + j++; + while (j < e && isspace (cf->current_command[j])) + j++; + unix_vlib_cli_output_cooked (cf, uf, + cf->current_command + cf->cursor, + j - cf->cursor); + cf->cursor = j; + } + + cf->search_mode = 0; + break; + + + case UNIX_CLI_PARSE_ACTION_ERASE: + if (vec_len (cf->current_command)) + { + if (cf->cursor == vec_len (cf->current_command)) + { + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b \b", 3); + _vec_len (cf->current_command)--; + cf->cursor--; + } + else if (cf->cursor > 0) + { + /* shift everything at & to the right of the cursor left by 1 */ + j = vec_len (cf->current_command) - cf->cursor; + memmove (cf->current_command + cf->cursor - 1, + cf->current_command + cf->cursor, j); + _vec_len (cf->current_command)--; + cf->cursor--; + /* redraw the rest of the line */ + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1); + unix_vlib_cli_output_cooked (cf, uf, + cf->current_command + cf->cursor, + j); + unix_vlib_cli_output_cooked (cf, uf, (u8 *) " \b\b", 3); + /* and shift the terminal cursor back where it should be */ + while (--j) + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1); + } + } + cf->search_mode = 0; + cf->excursion = 0; + vec_reset_length (cf->search_key); + break; + + case UNIX_CLI_PARSE_ACTION_ERASERIGHT: + if (vec_len (cf->current_command)) + { + if (cf->cursor < vec_len (cf->current_command)) + { + /* shift everything to the right of the cursor left by 1 */ + j = vec_len (cf->current_command) - cf->cursor - 1; + memmove (cf->current_command + cf->cursor, + cf->current_command + cf->cursor + 1, j); + _vec_len (cf->current_command)--; + /* redraw the rest of the line */ + unix_vlib_cli_output_cooked (cf, uf, + cf->current_command + cf->cursor, + j); + unix_vlib_cli_output_cooked (cf, uf, (u8 *) " \b", 2); + /* and shift the terminal cursor back where it should be */ + if (j) + { + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1); + while (--j) + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1); + } + } + } + else if (input == 'D' - '@') + { + /* ^D with no command entered = quit */ + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "quit\n", 5); + vlib_process_signal_event (um->vlib_main, + vlib_current_process (um->vlib_main), + UNIX_CLI_PROCESS_EVENT_QUIT, + cf - cm->cli_file_pool); + } + cf->search_mode = 0; + cf->excursion = 0; + vec_reset_length (cf->search_key); + break; + + case UNIX_CLI_PARSE_ACTION_CLEAR: + /* If we're in ANSI mode, clear the screen. + * Then redraw the prompt and any existing command input, then put + * the cursor back where it was in that line. + */ + if (cf->ansi_capable) + unix_vlib_cli_output_cooked (cf, uf, + (u8 *) ANSI_CLEAR, + sizeof (ANSI_CLEAR) - 1); + else + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1); + + unix_vlib_cli_output_raw (cf, uf, + cm->cli_prompt, vec_len (cm->cli_prompt)); + unix_vlib_cli_output_raw (cf, uf, + cf->current_command, + vec_len (cf->current_command)); + for (j = cf->cursor; j < vec_len (cf->current_command); j++) + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b", 1); + + break; + + case UNIX_CLI_PARSE_ACTION_TAB: + case UNIX_CLI_PARSE_ACTION_YANK: + /* TODO */ + break; + + + case UNIX_CLI_PARSE_ACTION_PAGER_QUIT: + pager_quit: + unix_cli_pager_prompt_erase (cf, uf); + unix_cli_pager_reset (cf); + unix_cli_cli_prompt (cf, uf); + break; + + case UNIX_CLI_PARSE_ACTION_PAGER_NEXT: + case UNIX_CLI_PARSE_ACTION_PAGER_PGDN: + /* show next page of the buffer */ + if (cf->height + cf->pager_start < vec_len (cf->pager_index)) + { + u8 *line = NULL; + unix_cli_pager_index_t *pi = NULL; + + int m = cf->pager_start + (cf->height - 1); + unix_cli_pager_prompt_erase (cf, uf); + for (j = m; + j < vec_len (cf->pager_index) && cf->pager_start < m; + j++, cf->pager_start++) + { + pi = &cf->pager_index[j]; + line = cf->pager_vector[pi->line] + pi->offset; + unix_vlib_cli_output_cooked (cf, uf, line, pi->length); + } + /* if the last line didn't end in newline, add a newline */ + if (pi && line[pi->length - 1] != '\n') + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1); + unix_cli_pager_prompt (cf, uf); + } + else + { + if (action == UNIX_CLI_PARSE_ACTION_PAGER_NEXT) + /* no more in buffer, exit, but only if it was */ + goto pager_quit; + } + break; + + case UNIX_CLI_PARSE_ACTION_PAGER_DN: + case UNIX_CLI_PARSE_ACTION_PAGER_CRLF: + /* display the next line of the buffer */ + if (cf->pager_start < vec_len (cf->pager_index) - (cf->height - 1)) + { + u8 *line; + unix_cli_pager_index_t *pi; + + unix_cli_pager_prompt_erase (cf, uf); + pi = &cf->pager_index[cf->pager_start + (cf->height - 1)]; + line = cf->pager_vector[pi->line] + pi->offset; + unix_vlib_cli_output_cooked (cf, uf, line, pi->length); + cf->pager_start++; + /* if the last line didn't end in newline, add a newline */ + if (line[pi->length - 1] != '\n') + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1); + unix_cli_pager_prompt (cf, uf); + } + else + { + if (action == UNIX_CLI_PARSE_ACTION_PAGER_CRLF) + /* no more in buffer, exit, but only if it was */ + goto pager_quit; + } + + break; + + case UNIX_CLI_PARSE_ACTION_PAGER_UP: + /* scroll the page back one line */ + if (cf->pager_start > 0) + { + u8 *line = NULL; + unix_cli_pager_index_t *pi = NULL; + + cf->pager_start--; + if (cf->ansi_capable) + { + pi = &cf->pager_index[cf->pager_start]; + line = cf->pager_vector[pi->line] + pi->offset; + unix_cli_pager_prompt_erase (cf, uf); + unix_vlib_cli_output_cooked (cf, uf, (u8 *) ANSI_SCROLLDN, + sizeof (ANSI_SCROLLDN) - 1); + unix_vlib_cli_output_cooked (cf, uf, (u8 *) ANSI_SAVECURSOR, + sizeof (ANSI_SAVECURSOR) - 1); + unix_cli_ansi_cursor (cf, uf, 1, 1); + unix_vlib_cli_output_cooked (cf, uf, (u8 *) ANSI_CLEARLINE, + sizeof (ANSI_CLEARLINE) - 1); + unix_vlib_cli_output_cooked (cf, uf, line, pi->length); + unix_vlib_cli_output_cooked (cf, uf, (u8 *) ANSI_RESTCURSOR, + sizeof (ANSI_RESTCURSOR) - 1); + unix_cli_pager_prompt_erase (cf, uf); + unix_cli_pager_prompt (cf, uf); + } + else + { + int m = cf->pager_start + (cf->height - 1); + unix_cli_pager_prompt_erase (cf, uf); + for (j = cf->pager_start; + j < vec_len (cf->pager_index) && j < m; j++) + { + pi = &cf->pager_index[j]; + line = cf->pager_vector[pi->line] + pi->offset; + unix_vlib_cli_output_cooked (cf, uf, line, pi->length); + } + /* if the last line didn't end in newline, add a newline */ + if (pi && line[pi->length - 1] != '\n') + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1); + unix_cli_pager_prompt (cf, uf); + } + } + break; + + case UNIX_CLI_PARSE_ACTION_PAGER_TOP: + /* back to the first page of the buffer */ + if (cf->pager_start > 0) + { + u8 *line = NULL; + unix_cli_pager_index_t *pi = NULL; + + cf->pager_start = 0; + int m = cf->pager_start + (cf->height - 1); + unix_cli_pager_prompt_erase (cf, uf); + for (j = cf->pager_start; j < vec_len (cf->pager_index) && j < m; + j++) + { + pi = &cf->pager_index[j]; + line = cf->pager_vector[pi->line] + pi->offset; + unix_vlib_cli_output_cooked (cf, uf, line, pi->length); + } + /* if the last line didn't end in newline, add a newline */ + if (pi && line[pi->length - 1] != '\n') + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1); + unix_cli_pager_prompt (cf, uf); + } + break; + + case UNIX_CLI_PARSE_ACTION_PAGER_BOTTOM: + /* skip to the last page of the buffer */ + if (cf->pager_start < vec_len (cf->pager_index) - (cf->height - 1)) + { + u8 *line = NULL; + unix_cli_pager_index_t *pi = NULL; + + cf->pager_start = vec_len (cf->pager_index) - (cf->height - 1); + unix_cli_pager_prompt_erase (cf, uf); + unix_cli_pager_message (cf, uf, "skipping", "\n"); + for (j = cf->pager_start; j < vec_len (cf->pager_index); j++) + { + pi = &cf->pager_index[j]; + line = cf->pager_vector[pi->line] + pi->offset; + unix_vlib_cli_output_cooked (cf, uf, line, pi->length); + } + /* if the last line didn't end in newline, add a newline */ + if (pi && line[pi->length - 1] != '\n') + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1); + unix_cli_pager_prompt (cf, uf); + } + break; + + case UNIX_CLI_PARSE_ACTION_PAGER_PGUP: + /* wander back one page in the buffer */ + if (cf->pager_start > 0) + { + u8 *line = NULL; + unix_cli_pager_index_t *pi = NULL; + int m; + + if (cf->pager_start >= cf->height) + cf->pager_start -= cf->height - 1; + else + cf->pager_start = 0; + m = cf->pager_start + cf->height - 1; + unix_cli_pager_prompt_erase (cf, uf); + for (j = cf->pager_start; j < vec_len (cf->pager_index) && j < m; + j++) + { + pi = &cf->pager_index[j]; + line = cf->pager_vector[pi->line] + pi->offset; + unix_vlib_cli_output_cooked (cf, uf, line, pi->length); + } + /* if the last line didn't end in newline, add a newline */ + if (pi && line[pi->length - 1] != '\n') + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1); + unix_cli_pager_prompt (cf, uf); + } + break; + + case UNIX_CLI_PARSE_ACTION_PAGER_REDRAW: + /* Redraw the current pager screen */ + unix_cli_pager_redraw (cf, uf); + break; + + case UNIX_CLI_PARSE_ACTION_PAGER_SEARCH: + /* search forwards in the buffer */ + break; + + + case UNIX_CLI_PARSE_ACTION_CRLF: + crlf: + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1); + + if (cf->has_history && cf->history_limit) + { + if (cf->command_history + && vec_len (cf->command_history) >= cf->history_limit) + { + vec_free (cf->command_history[0]); + vec_delete (cf->command_history, 1, 0); + } + /* Don't add blank lines to the cmd history */ + if (vec_len (cf->current_command)) + { + /* Don't duplicate the previous command */ + j = vec_len (cf->command_history); + if (j == 0 || + (vec_len (cf->current_command) != + vec_len (cf->command_history[j - 1]) + || memcmp (cf->current_command, cf->command_history[j - 1], + vec_len (cf->current_command)) != 0)) + { + /* copy the command to the history */ + u8 *c = 0; + vec_append (c, cf->current_command); + vec_add1 (cf->command_history, c); + cf->command_number++; + } + } + cf->excursion = vec_len (cf->command_history); + } + + cf->search_mode = 0; + vec_reset_length (cf->search_key); + cf->cursor = 0; + + return 0; + + case UNIX_CLI_PARSE_ACTION_PARTIALMATCH: + case UNIX_CLI_PARSE_ACTION_NOMATCH: + if (vec_len (cf->pager_index)) + { + /* no-op for now */ + } + else if (cf->has_history && cf->search_mode && isprint (input)) + { + int k, limit, offset; + u8 *item; + + vec_add1 (cf->search_key, input); + + search_again: + for (j = 0; j < vec_len (cf->command_history); j++) + { + if (cf->excursion > (i32) vec_len (cf->command_history) - 1) + cf->excursion = 0; + else if (cf->excursion < 0) + cf->excursion = vec_len (cf->command_history) - 1; + + item = cf->command_history[cf->excursion]; + + limit = (vec_len (cf->search_key) > vec_len (item)) ? + vec_len (item) : vec_len (cf->search_key); + + for (offset = 0; offset <= vec_len (item) - limit; offset++) + { + for (k = 0; k < limit; k++) + { + if (item[k + offset] != cf->search_key[k]) + goto next_offset; + } + goto found_at_offset; + + next_offset: + ; + } + goto next; + + found_at_offset: + for (j = 0; j < vec_len (cf->current_command); j++) + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\b \b", 3); + + vec_validate (cf->current_command, vec_len (item) - 1); + clib_memcpy (cf->current_command, item, vec_len (item)); + _vec_len (cf->current_command) = vec_len (item); + + unix_vlib_cli_output_cooked (cf, uf, cf->current_command, + vec_len (cf->current_command)); + cf->cursor = vec_len (cf->current_command); + goto found; + + next: + cf->excursion += cf->search_mode; + } + + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\nNo match...", 12); + vec_reset_length (cf->search_key); + vec_reset_length (cf->current_command); + cf->search_mode = 0; + cf->cursor = 0; + goto crlf; + } + else if (isprint (input)) /* skip any errant control codes */ + { + if (cf->cursor == vec_len (cf->current_command)) + { + /* Append to end */ + vec_add1 (cf->current_command, input); + cf->cursor++; + + /* Echo the character back to the client */ + unix_vlib_cli_output_raw (cf, uf, &input, 1); + } + else + { + /* Insert at cursor: resize +1 byte, move everything over */ + j = vec_len (cf->current_command) - cf->cursor; + vec_add1 (cf->current_command, (u8) 'A'); + memmove (cf->current_command + cf->cursor + 1, + cf->current_command + cf->cursor, j); + cf->current_command[cf->cursor] = input; + /* Redraw the line */ + j++; + unix_vlib_cli_output_raw (cf, uf, + cf->current_command + cf->cursor, j); + /* Put terminal cursor back */ + while (--j) + unix_vlib_cli_output_raw (cf, uf, (u8 *) "\b", 1); + cf->cursor++; + } + } + else + { + /* no-op - not printable or otherwise not actionable */ + } + + found: + + break; + + case UNIX_CLI_PARSE_ACTION_TELNETIAC: + break; + } + return 1; +} + +/** @brief Process input bytes on a stream to provide line editing and + * command history in the CLI. */ +static int +unix_cli_line_edit (unix_cli_main_t * cm, + unix_main_t * um, unix_cli_file_t * cf) +{ + unix_file_t *uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + int i; + + for (i = 0; i < vec_len (cf->input_vector); i++) + { + unix_cli_parse_action_t action; + i32 matched = 0; + unix_cli_parse_actions_t *a; + + /* If we're in the pager mode, search the pager actions */ + a = + vec_len (cf->pager_index) ? unix_cli_parse_pager : + unix_cli_parse_strings; + + /* See if the input buffer is some sort of control code */ + action = unix_cli_match_action (a, &cf->input_vector[i], + vec_len (cf->input_vector) - i, + &matched); + + switch (action) + { + case UNIX_CLI_PARSE_ACTION_PARTIALMATCH: + if (i) + { + /* There was a partial match which means we need more bytes + * than the input buffer currently has. + * Since the bytes before here have been processed, shift + * the remaining contents to the start of the input buffer. + */ + vec_delete (cf->input_vector, i, 0); + } + return 1; /* wait for more */ + + case UNIX_CLI_PARSE_ACTION_TELNETIAC: + /* process telnet options */ + matched = unix_cli_process_telnet (um, cf, uf, + cf->input_vector + i, + vec_len (cf->input_vector) - i); + if (matched < 0) + { + if (i) + { + /* There was a partial match which means we need more bytes + * than the input buffer currently has. + * Since the bytes before here have been processed, shift + * the remaining contents to the start of the input buffer. + */ + vec_delete (cf->input_vector, i, 0); + } + return 1; /* wait for more */ + } + break; + + default: + /* process the action */ + if (!unix_cli_line_process_one (cm, um, cf, uf, + cf->input_vector[i], action)) + { + /* CRLF found. Consume the bytes from the input_vector */ + vec_delete (cf->input_vector, i + matched, 0); + /* And tell our caller to execute cf->input_command */ + return 0; + } + } + + i += matched; + } + + vec_reset_length (cf->input_vector); + return 1; +} + +/** @brief Process input to a CLI session. */ +static void +unix_cli_process_input (unix_cli_main_t * cm, uword cli_file_index) +{ + unix_main_t *um = &unix_main; + unix_file_t *uf; + unix_cli_file_t *cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); + unformat_input_t input; + int vlib_parse_eval (u8 *); + +more: + /* Try vlibplex first. Someday... */ + if (0 && vlib_parse_eval (cf->input_vector) == 0) + goto done; + + if (cf->line_mode) + { + /* just treat whatever we got as a complete line of input */ + cf->current_command = cf->input_vector; + } + else + { + /* Line edit, echo, etc. */ + if (unix_cli_line_edit (cm, um, cf)) + /* want more input */ + return; + } + + if (um->log_fd) + { + static u8 *lv; + vec_reset_length (lv); + lv = format (lv, "%U[%d]: %v", + format_timeval, 0 /* current bat-time */ , + 0 /* current bat-format */ , + cli_file_index, cf->input_vector); + { + int rv __attribute__ ((unused)) = + write (um->log_fd, lv, vec_len (lv)); + } + } + + /* Copy our input command to a new string */ + unformat_init_vector (&input, cf->current_command); + + /* Remove leading white space from input. */ + (void) unformat (&input, ""); + + cm->current_input_file_index = cli_file_index; + cf->pager_start = 0; /* start a new pager session */ + + if (unformat_check_input (&input) != UNFORMAT_END_OF_INPUT) + vlib_cli_input (um->vlib_main, &input, unix_vlib_cli_output, + cli_file_index); + + /* Zero buffer since otherwise unformat_free will call vec_free on it. */ + input.buffer = 0; + + unformat_free (&input); + + /* Re-fetch pointer since pool may have moved. */ + cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); + uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + +done: + /* reset vector; we'll re-use it later */ + if (cf->line_mode) + vec_reset_length (cf->input_vector); + else + vec_reset_length (cf->current_command); + + if (cf->no_pager == 2) + { + /* Pager was programmatically disabled */ + unix_cli_pager_message (cf, uf, "pager buffer overflowed", "\n"); + cf->no_pager = um->cli_no_pager; + } + + if (vec_len (cf->pager_index) == 0 + || vec_len (cf->pager_index) < cf->height) + { + /* There was no need for the pager */ + unix_cli_pager_reset (cf); + + /* Prompt. */ + unix_cli_cli_prompt (cf, uf); + } + else + { + /* Display the pager prompt */ + unix_cli_pager_prompt (cf, uf); + } + + /* Any residual data in the input vector? */ + if (vec_len (cf->input_vector)) + goto more; +} + +/** Destroy a CLI session. + * @note If we destroy the @c stdin session this additionally signals + * the shutdown of VPP. + */ +static void +unix_cli_kill (unix_cli_main_t * cm, uword cli_file_index) +{ + unix_main_t *um = &unix_main; + unix_cli_file_t *cf; + unix_file_t *uf; + int i; + + cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); + uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + + /* Quit/EOF on stdin means quit program. */ + if (uf->file_descriptor == UNIX_CLI_STDIN_FD) + clib_longjmp (&um->vlib_main->main_loop_exit, VLIB_MAIN_LOOP_EXIT_CLI); + + vec_free (cf->current_command); + vec_free (cf->search_key); + + for (i = 0; i < vec_len (cf->command_history); i++) + vec_free (cf->command_history[i]); + + vec_free (cf->command_history); + + unix_file_del (um, uf); + + unix_cli_file_free (cf); + pool_put (cm->cli_file_pool, cf); +} + +/** Handle system events. */ +static uword +unix_cli_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + unix_cli_main_t *cm = &unix_cli_main; + uword i, *data = 0; + + while (1) + { + unix_cli_process_event_type_t event_type; + vlib_process_wait_for_event (vm); + event_type = vlib_process_get_events (vm, &data); + + switch (event_type) + { + case UNIX_CLI_PROCESS_EVENT_READ_READY: + for (i = 0; i < vec_len (data); i++) + unix_cli_process_input (cm, data[i]); + break; + + case UNIX_CLI_PROCESS_EVENT_QUIT: + /* Kill this process. */ + for (i = 0; i < vec_len (data); i++) + unix_cli_kill (cm, data[i]); + goto done; + } + + if (data) + _vec_len (data) = 0; + } + +done: + vec_free (data); + + vlib_node_set_state (vm, rt->node_index, VLIB_NODE_STATE_DISABLED); + + /* Add node index so we can re-use this process later. */ + vec_add1 (cm->unused_cli_process_node_indices, rt->node_index); + + return 0; +} + +/** Called when a CLI session file descriptor can be written to without + * blocking. */ +static clib_error_t * +unix_cli_write_ready (unix_file_t * uf) +{ + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf; + int n; + + cf = pool_elt_at_index (cm->cli_file_pool, uf->private_data); + + /* Flush output vector. */ + n = write (uf->file_descriptor, + cf->output_vector, vec_len (cf->output_vector)); + + if (n < 0 && errno != EAGAIN) + return clib_error_return_unix (0, "write"); + + else if (n > 0) + unix_cli_del_pending_output (uf, cf, n); + + return /* no error */ 0; +} + +/** Called when a CLI session file descriptor has data to be read. */ +static clib_error_t * +unix_cli_read_ready (unix_file_t * uf) +{ + unix_main_t *um = &unix_main; + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf; + uword l; + int n, n_read, n_try; + + cf = pool_elt_at_index (cm->cli_file_pool, uf->private_data); + + n = n_try = 4096; + while (n == n_try) + { + l = vec_len (cf->input_vector); + vec_resize (cf->input_vector, l + n_try); + + n = read (uf->file_descriptor, cf->input_vector + l, n_try); + + /* Error? */ + if (n < 0 && errno != EAGAIN) + return clib_error_return_unix (0, "read"); + + n_read = n < 0 ? 0 : n; + _vec_len (cf->input_vector) = l + n_read; + } + + if (!(n < 0)) + vlib_process_signal_event (um->vlib_main, + cf->process_node_index, + (n_read == 0 + ? UNIX_CLI_PROCESS_EVENT_QUIT + : UNIX_CLI_PROCESS_EVENT_READ_READY), + /* event data */ uf->private_data); + + return /* no error */ 0; +} + +/** Store a new CLI session. + * @param name The name of the session. + * @param fd The file descriptor for the session I/O. + * @return The session ID. + */ +static u32 +unix_cli_file_add (unix_cli_main_t * cm, char *name, int fd) +{ + unix_main_t *um = &unix_main; + unix_cli_file_t *cf; + unix_file_t template = { 0 }; + vlib_main_t *vm = um->vlib_main; + vlib_node_t *n; + + name = (char *) format (0, "unix-cli-%s", name); + + if (vec_len (cm->unused_cli_process_node_indices) > 0) + { + uword l = vec_len (cm->unused_cli_process_node_indices); + + /* Find node and give it new name. */ + n = vlib_get_node (vm, cm->unused_cli_process_node_indices[l - 1]); + vec_free (n->name); + n->name = (u8 *) name; + + vlib_node_set_state (vm, n->index, VLIB_NODE_STATE_POLLING); + + _vec_len (cm->unused_cli_process_node_indices) = l - 1; + } + else + { + static vlib_node_registration_t r = { + .function = unix_cli_process, + .type = VLIB_NODE_TYPE_PROCESS, + .process_log2_n_stack_bytes = 16, + }; + + r.name = name; + vlib_register_node (vm, &r); + vec_free (name); + + n = vlib_get_node (vm, r.index); + } + + pool_get (cm->cli_file_pool, cf); + memset (cf, 0, sizeof (*cf)); + + template.read_function = unix_cli_read_ready; + template.write_function = unix_cli_write_ready; + template.file_descriptor = fd; + template.private_data = cf - cm->cli_file_pool; + + cf->process_node_index = n->index; + cf->unix_file_index = unix_file_add (um, &template); + cf->output_vector = 0; + cf->input_vector = 0; + + vlib_start_process (vm, n->runtime_index); + + vlib_process_t *p = vlib_get_process_from_node (vm, n); + p->output_function = unix_vlib_cli_output; + p->output_function_arg = cf - cm->cli_file_pool; + + return cf - cm->cli_file_pool; +} + +/** Telnet listening socket has a new connection. */ +static clib_error_t * +unix_cli_listen_read_ready (unix_file_t * uf) +{ + unix_main_t *um = &unix_main; + unix_cli_main_t *cm = &unix_cli_main; + clib_socket_t *s = &um->cli_listen_socket; + clib_socket_t client; + char *client_name; + clib_error_t *error; + unix_cli_file_t *cf; + u32 cf_index; + + error = clib_socket_accept (s, &client); + if (error) + return error; + + client_name = (char *) format (0, "%U%c", format_sockaddr, &client.peer, 0); + + cf_index = unix_cli_file_add (cm, client_name, client.fd); + cf = pool_elt_at_index (cm->cli_file_pool, cf_index); + + /* No longer need CLIB version of socket. */ + clib_socket_free (&client); + + vec_free (client_name); + + /* if we're supposed to run telnet session in character mode (default) */ + if (um->cli_line_mode == 0) + { + /* + * Set telnet client character mode, echo on, suppress "go-ahead". + * Technically these should be negotiated, but this works. + */ + u8 charmode_option[] = { + IAC, WONT, TELOPT_LINEMODE, /* server will do char-by-char */ + IAC, DONT, TELOPT_LINEMODE, /* client should do char-by-char */ + IAC, WILL, TELOPT_SGA, /* server willl supress GA */ + IAC, DO, TELOPT_SGA, /* client should supress Go Ahead */ + IAC, WILL, TELOPT_ECHO, /* server will do echo */ + IAC, DONT, TELOPT_ECHO, /* client should not echo */ + IAC, DO, TELOPT_TTYPE, /* client should tell us its term type */ + IAC, SB, TELOPT_TTYPE, 1, IAC, SE, /* now tell me ttype */ + IAC, DO, TELOPT_NAWS, /* client should tell us its window sz */ + IAC, SB, TELOPT_NAWS, 1, IAC, SE, /* now tell me window size */ + }; + + /* Enable history on this CLI */ + cf->history_limit = um->cli_history_limit; + cf->has_history = cf->history_limit != 0; + + /* Make sure this session is in line mode */ + cf->line_mode = 0; + + /* We need CRLF */ + cf->crlf_mode = 1; + + /* Setup the pager */ + cf->no_pager = um->cli_no_pager; + + uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + + /* Send the telnet options */ + unix_vlib_cli_output_raw (cf, uf, charmode_option, + ARRAY_LEN (charmode_option)); + + /* In case the client doesn't negotiate terminal type, use + * a timer to kick off the initial prompt. */ + timer_call (unix_cli_file_welcome_timer, cf_index, 1); + } + + return error; +} + +/** The system terminal has informed us that the window size + * has changed. + */ +static void +unix_cli_resize_interrupt (int signum) +{ + unix_main_t *um = &unix_main; + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf = pool_elt_at_index (cm->cli_file_pool, + cm->stdin_cli_file_index); + unix_file_t *uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + struct winsize ws; + (void) signum; + + /* Terminal resized, fetch the new size */ + if (ioctl (UNIX_CLI_STDIN_FD, TIOCGWINSZ, &ws) < 0) + { + /* "Should never happen..." */ + clib_unix_warning ("TIOCGWINSZ"); + /* We can't trust ws.XXX... */ + return; + } + cf->width = ws.ws_col; + cf->height = ws.ws_row; + + /* Reindex the pager buffer */ + unix_cli_pager_reindex (cf); + + /* Redraw the page */ + unix_cli_pager_redraw (cf, uf); +} + +/** Handle configuration directives in the @em unix section. */ +static clib_error_t * +unix_cli_config (vlib_main_t * vm, unformat_input_t * input) +{ + unix_main_t *um = &unix_main; + unix_cli_main_t *cm = &unix_cli_main; + int flags; + clib_error_t *error = 0; + unix_cli_file_t *cf; + u32 cf_index; + struct termios tio; + struct sigaction sa; + struct winsize ws; + u8 *term; + + /* We depend on unix flags being set. */ + if ((error = vlib_call_config_function (vm, unix_config))) + return error; + + if (um->flags & UNIX_FLAG_INTERACTIVE) + { + /* Set stdin to be non-blocking. */ + if ((flags = fcntl (UNIX_CLI_STDIN_FD, F_GETFL, 0)) < 0) + flags = 0; + (void) fcntl (UNIX_CLI_STDIN_FD, F_SETFL, flags | O_NONBLOCK); + + cf_index = unix_cli_file_add (cm, "stdin", UNIX_CLI_STDIN_FD); + cf = pool_elt_at_index (cm->cli_file_pool, cf_index); + cm->stdin_cli_file_index = cf_index; + + /* If stdin is a tty and we are using chacracter mode, enable + * history on the CLI and set the tty line discipline accordingly. */ + if (isatty (UNIX_CLI_STDIN_FD) && um->cli_line_mode == 0) + { + /* Capture terminal resize events */ + memset (&sa, 0, sizeof (sa)); + sa.sa_handler = unix_cli_resize_interrupt; + if (sigaction (SIGWINCH, &sa, 0) < 0) + clib_panic ("sigaction"); + + /* Retrieve the current terminal size */ + ioctl (UNIX_CLI_STDIN_FD, TIOCGWINSZ, &ws); + cf->width = ws.ws_col; + cf->height = ws.ws_row; + + if (cf->width == 0 || cf->height == 0) + /* We have a tty, but no size. Stick to line mode. */ + goto notty; + + /* Setup the history */ + cf->history_limit = um->cli_history_limit; + cf->has_history = cf->history_limit != 0; + + /* Setup the pager */ + cf->no_pager = um->cli_no_pager; + + /* We're going to be in char by char mode */ + cf->line_mode = 0; + + /* Save the original tty state so we can restore it later */ + tcgetattr (UNIX_CLI_STDIN_FD, &um->tio_stdin); + um->tio_isset = 1; + + /* Tweak the tty settings */ + tio = um->tio_stdin; + /* echo off, canonical mode off, ext'd input processing off */ + tio.c_lflag &= ~(ECHO | ICANON | IEXTEN); + tio.c_cc[VMIN] = 1; /* 1 byte at a time */ + tio.c_cc[VTIME] = 0; /* no timer */ + tcsetattr (UNIX_CLI_STDIN_FD, TCSAFLUSH, &tio); + + /* See if we can do ANSI/VT100 output */ + term = (u8 *) getenv ("TERM"); + if (term != NULL) + cf->ansi_capable = unix_cli_terminal_type (term, + strlen ((char *) + term)); + } + else + { + notty: + /* No tty, so make sure these things are off */ + cf->no_pager = 1; + cf->history_limit = 0; + cf->has_history = 0; + cf->line_mode = 1; + } + + /* Send banner and initial prompt */ + unix_cli_file_welcome (cm, cf); + } + + /* If we have socket config, LISTEN, otherwise, don't */ + clib_socket_t *s = &um->cli_listen_socket; + if (s->config && s->config[0] != 0) + { + /* CLI listen. */ + unix_file_t template = { 0 }; + + s->flags = SOCKET_IS_SERVER; /* listen, don't connect */ + error = clib_socket_init (s); + + if (error) + return error; + + template.read_function = unix_cli_listen_read_ready; + template.file_descriptor = s->fd; + + unix_file_add (um, &template); + } + + /* Set CLI prompt. */ + if (!cm->cli_prompt) + cm->cli_prompt = format (0, "VLIB: "); + + return 0; +} + +/*? + * This module has no configurable parameters. +?*/ +VLIB_CONFIG_FUNCTION (unix_cli_config, "unix-cli"); + +/** Called when VPP is shutting down, this restores the system + * terminal state if previously saved. + */ +static clib_error_t * +unix_cli_exit (vlib_main_t * vm) +{ + unix_main_t *um = &unix_main; + + /* If stdin is a tty and we saved the tty state, reset the tty state */ + if (isatty (UNIX_CLI_STDIN_FD) && um->tio_isset) + tcsetattr (UNIX_CLI_STDIN_FD, TCSAFLUSH, &um->tio_stdin); + + return 0; +} + +VLIB_MAIN_LOOP_EXIT_FUNCTION (unix_cli_exit); + +/** Set the CLI prompt. + * @param prompt The C string to set the prompt to. + * @note This setting is global; it impacts all current + * and future CLI sessions. + */ +void +vlib_unix_cli_set_prompt (char *prompt) +{ + char *fmt = (prompt[strlen (prompt) - 1] == ' ') ? "%s" : "%s "; + unix_cli_main_t *cm = &unix_cli_main; + if (cm->cli_prompt) + vec_free (cm->cli_prompt); + cm->cli_prompt = format (0, fmt, prompt); +} + +/** CLI command to quit the terminal session. + * @note If this is a stdin session then this will + * shutdown VPP also. + */ +static clib_error_t * +unix_cli_quit (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + unix_cli_main_t *cm = &unix_cli_main; + + vlib_process_signal_event (vm, + vlib_current_process (vm), + UNIX_CLI_PROCESS_EVENT_QUIT, + cm->current_input_file_index); + return 0; +} + +/*? + * Terminates the current CLI session. + * + * If VPP is running in @em interactive mode and this is the console session + * (that is, the session on @c stdin) then this will also terminate VPP. +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (unix_cli_quit_command, static) = { + .path = "quit", + .short_help = "Exit CLI", + .function = unix_cli_quit, +}; +/* *INDENT-ON* */ + +/** CLI command to execute a VPP command script. */ +static clib_error_t * +unix_cli_exec (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + char *file_name; + int fd; + unformat_input_t sub_input; + clib_error_t *error; + + file_name = 0; + fd = -1; + error = 0; + + if (!unformat (input, "%s", &file_name)) + { + error = clib_error_return (0, "expecting file name, got `%U'", + format_unformat_error, input); + goto done; + } + + fd = open (file_name, O_RDONLY); + if (fd < 0) + { + error = clib_error_return_unix (0, "failed to open `%s'", file_name); + goto done; + } + + /* Make sure its a regular file. */ + { + struct stat s; + + if (fstat (fd, &s) < 0) + { + error = clib_error_return_unix (0, "failed to stat `%s'", file_name); + goto done; + } + + if (!(S_ISREG (s.st_mode) || S_ISLNK (s.st_mode))) + { + error = clib_error_return (0, "not a regular file `%s'", file_name); + goto done; + } + } + + unformat_init_unix_file (&sub_input, fd); + + vlib_cli_input (vm, &sub_input, 0, 0); + unformat_free (&sub_input); + +done: + if (fd > 0) + close (fd); + vec_free (file_name); + + return error; +} + +/*? + * Executes a sequence of CLI commands which are read from a file. + * + * If a command is unrecognised or otherwise invalid then the usual CLI + * feedback will be generated, however execution of subsequent commands + * from the file will continue. +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_exec, static) = { + .path = "exec", + .short_help = "Execute commands from file", + .function = unix_cli_exec, + .is_mp_safe = 1, +}; +/* *INDENT-ON* */ + +/** CLI command to show various unix error statistics. */ +static clib_error_t * +unix_show_errors (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + unix_main_t *um = &unix_main; + clib_error_t *error = 0; + int i, n_errors_to_show; + unix_error_history_t *unix_errors = 0; + + n_errors_to_show = 1 << 30; + + if (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (!unformat (input, "%d", &n_errors_to_show)) + { + error = + clib_error_return (0, + "expecting integer number of errors to show, got `%U'", + format_unformat_error, input); + goto done; + } + } + + n_errors_to_show = + clib_min (ARRAY_LEN (um->error_history), n_errors_to_show); + + i = + um->error_history_index > + 0 ? um->error_history_index - 1 : ARRAY_LEN (um->error_history) - 1; + + while (n_errors_to_show > 0) + { + unix_error_history_t *eh = um->error_history + i; + + if (!eh->error) + break; + + vec_add1 (unix_errors, eh[0]); + n_errors_to_show -= 1; + if (i == 0) + i = ARRAY_LEN (um->error_history) - 1; + else + i--; + } + + if (vec_len (unix_errors) == 0) + vlib_cli_output (vm, "no Unix errors so far"); + else + { + vlib_cli_output (vm, "%Ld total errors seen", um->n_total_errors); + for (i = vec_len (unix_errors) - 1; i >= 0; i--) + { + unix_error_history_t *eh = vec_elt_at_index (unix_errors, i); + vlib_cli_output (vm, "%U: %U", + format_time_interval, "h:m:s:u", eh->time, + format_clib_error, eh->error); + } + vlib_cli_output (vm, "%U: time now", + format_time_interval, "h:m:s:u", vlib_time_now (vm)); + } + +done: + vec_free (unix_errors); + return error; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_unix_show_errors, static) = { + .path = "show unix-errors", + .short_help = "Show Unix system call error history", + .function = unix_show_errors, +}; +/* *INDENT-ON* */ + +/** CLI command to show session command history. */ +static clib_error_t * +unix_cli_show_history (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf; + int i, j; + + cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index); + + if (cf->has_history && cf->history_limit) + { + i = 1 + cf->command_number - vec_len (cf->command_history); + for (j = 0; j < vec_len (cf->command_history); j++) + vlib_cli_output (vm, "%d %v\n", i + j, cf->command_history[j]); + } + else + { + vlib_cli_output (vm, "History not enabled.\n"); + } + + return 0; +} + +/*? + * Displays the command history for the current session, if any. +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_unix_cli_show_history, static) = { + .path = "history", + .short_help = "Show current session command history", + .function = unix_cli_show_history, +}; +/* *INDENT-ON* */ + +/** CLI command to show terminal status. */ +static clib_error_t * +unix_cli_show_terminal (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + unix_main_t *um = &unix_main; + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf; + vlib_node_t *n; + + cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index); + n = vlib_get_node (vm, cf->process_node_index); + + vlib_cli_output (vm, "Terminal name: %v\n", n->name); + vlib_cli_output (vm, "Terminal mode: %s\n", cf->line_mode ? + "line-by-line" : "char-by-char"); + vlib_cli_output (vm, "Terminal width: %d\n", cf->width); + vlib_cli_output (vm, "Terminal height: %d\n", cf->height); + vlib_cli_output (vm, "ANSI capable: %s\n", + cf->ansi_capable ? "yes" : "no"); + vlib_cli_output (vm, "History enabled: %s%s\n", + cf->has_history ? "yes" : "no", !cf->has_history + || cf->history_limit ? "" : + " (disabled by history limit)"); + if (cf->has_history) + vlib_cli_output (vm, "History limit: %d\n", cf->history_limit); + vlib_cli_output (vm, "Pager enabled: %s%s%s\n", + cf->no_pager ? "no" : "yes", + cf->no_pager + || cf->height ? "" : " (disabled by terminal height)", + cf->no_pager + || um->cli_pager_buffer_limit ? "" : + " (disabled by buffer limit)"); + if (!cf->no_pager) + vlib_cli_output (vm, "Pager limit: %d\n", um->cli_pager_buffer_limit); + vlib_cli_output (vm, "CRLF mode: %s\n", + cf->crlf_mode ? "CR+LF" : "LF"); + + return 0; +} + +/*? + * Displays various information about the state of the current terminal + * session. + * + * @cliexpar + * @cliexstart{show terminal} + * Terminal name: unix-cli-stdin + * Terminal mode: char-by-char + * Terminal width: 123 + * Terminal height: 48 + * ANSI capable: yes + * History enabled: yes + * History limit: 50 + * Pager enabled: yes + * Pager limit: 100000 + * CRLF mode: LF + * @cliexend +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_unix_cli_show_terminal, static) = { + .path = "show terminal", + .short_help = "Show current session terminal settings", + .function = unix_cli_show_terminal, +}; +/* *INDENT-ON* */ + +/** CLI command to set terminal pager settings. */ +static clib_error_t * +unix_cli_set_terminal_pager (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unix_main_t *um = &unix_main; + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf; + unformat_input_t _line_input, *line_input = &_line_input; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index); + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "on")) + cf->no_pager = 0; + else if (unformat (line_input, "off")) + cf->no_pager = 1; + else if (unformat (line_input, "limit %u", &um->cli_pager_buffer_limit)) + vlib_cli_output (vm, + "Pager limit set to %u lines; note, this is global.\n", + um->cli_pager_buffer_limit); + else + return clib_error_return (0, "unknown parameter: `%U`", + format_unformat_error, line_input); + } + + unformat_free (line_input); + + return 0; +} + +/*? + * Enables or disables the terminal pager for this session. Generally + * this defaults to enabled. + * + * Additionally allows the pager buffer size to be set; though note that + * this value is set globally and not per session. +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_unix_cli_set_terminal_pager, static) = { + .path = "set terminal pager", + .short_help = "set terminal pager [on|off] [limit ]", + .function = unix_cli_set_terminal_pager, +}; +/* *INDENT-ON* */ + +/** CLI command to set terminal history settings. */ +static clib_error_t * +unix_cli_set_terminal_history (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf; + unformat_input_t _line_input, *line_input = &_line_input; + u32 limit; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index); + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "on")) + cf->has_history = 1; + else if (unformat (line_input, "off")) + cf->has_history = 0; + else if (unformat (line_input, "limit %u", &cf->history_limit)) + ; + else + return clib_error_return (0, "unknown parameter: `%U`", + format_unformat_error, line_input); + + /* If we reduced history size, or turned it off, purge the history */ + limit = cf->has_history ? cf->history_limit : 0; + + while (cf->command_history && vec_len (cf->command_history) >= limit) + { + vec_free (cf->command_history[0]); + vec_delete (cf->command_history, 1, 0); + } + } + + unformat_free (line_input); + + return 0; +} + +/*? + * Enables or disables the command history function of the current + * terminal. Generally this defaults to enabled. + * + * This command also allows the maximum size of the history buffer for + * this session to be altered. +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_unix_cli_set_terminal_history, static) = { + .path = "set terminal history", + .short_help = "set terminal history [on|off] [limit ]", + .function = unix_cli_set_terminal_history, +}; +/* *INDENT-ON* */ + +/** CLI command to set terminal ANSI settings. */ +static clib_error_t * +unix_cli_set_terminal_ansi (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf; + + cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index); + + if (unformat (input, "on")) + cf->ansi_capable = 1; + else if (unformat (input, "off")) + cf->ansi_capable = 0; + else + return clib_error_return (0, "unknown parameter: `%U`", + format_unformat_error, input); + + return 0; +} + +/*? + * Enables or disables the use of ANSI control sequences by this terminal. + * The default will vary based on terminal detection at the start of the + * session. + * + * ANSI control sequences are used in a small number of places to provide, + * for example, color text output and to control the cursor in the pager. +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_unix_cli_set_terminal_ansi, static) = { + .path = "set terminal ansi", + .short_help = "set terminal ansi [on|off]", + .function = unix_cli_set_terminal_ansi, +}; +/* *INDENT-ON* */ + +static clib_error_t * +unix_cli_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (unix_cli_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/dir.dox b/src/vlib/unix/dir.dox new file mode 100644 index 00000000..1380fa56 --- /dev/null +++ b/src/vlib/unix/dir.dox @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2016 Comcast Cable Communications Management, LLC. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Doxygen directory documentation */ + +/** +@dir +@brief VLIB Unix interface + +VLIB application library Unix interface layer. + +*/ +/*? %%clicmd:group_label Unix Interface %% ?*/ +/*? %%syscfg:group_label Unix Interface %% ?*/ + diff --git a/src/vlib/unix/input.c b/src/vlib/unix/input.c new file mode 100644 index 00000000..07096ed2 --- /dev/null +++ b/src/vlib/unix/input.c @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * input.c: Unix file input + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include + +/* FIXME autoconf */ +#define HAVE_LINUX_EPOLL + +#ifdef HAVE_LINUX_EPOLL + +#include + +typedef struct +{ + int epoll_fd; + struct epoll_event *epoll_events; + + /* Statistics. */ + u64 epoll_files_ready; + u64 epoll_waits; +} linux_epoll_main_t; + +static linux_epoll_main_t linux_epoll_main; + +static void +linux_epoll_file_update (unix_file_t * f, unix_file_update_type_t update_type) +{ + unix_main_t *um = &unix_main; + linux_epoll_main_t *em = &linux_epoll_main; + struct epoll_event e; + + memset (&e, 0, sizeof (e)); + + e.events = EPOLLIN; + if (f->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE) + e.events |= EPOLLOUT; + if (f->flags & UNIX_FILE_EVENT_EDGE_TRIGGERED) + e.events |= EPOLLET; + e.data.u32 = f - um->file_pool; + + if (epoll_ctl (em->epoll_fd, + (update_type == UNIX_FILE_UPDATE_ADD + ? EPOLL_CTL_ADD + : (update_type == UNIX_FILE_UPDATE_MODIFY + ? EPOLL_CTL_MOD + : EPOLL_CTL_DEL)), f->file_descriptor, &e) < 0) + clib_warning ("epoll_ctl"); +} + +static uword +linux_epoll_input (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + unix_main_t *um = &unix_main; + linux_epoll_main_t *em = &linux_epoll_main; + struct epoll_event *e; + int n_fds_ready; + + { + vlib_node_main_t *nm = &vm->node_main; + u64 t = nm->cpu_time_next_process_ready; + f64 timeout; + int timeout_ms, max_timeout_ms = 10; + f64 vector_rate = vlib_last_vectors_per_main_loop (vm); + + if (t == ~0ULL) + { + timeout = 10e-3; + timeout_ms = max_timeout_ms; + } + else + { + timeout = + (((i64) t - (i64) clib_cpu_time_now ()) + * vm->clib_time.seconds_per_clock) + /* subtract off some slop time */ - 50e-6; + + if (timeout < 1e3) + { + /* We have event happenning in less than 1 ms so + don't allow epoll to wait */ + timeout_ms = 0; + } + else + { + timeout_ms = timeout * 1e3; + + /* Must be between 1 and 10 ms. */ + timeout_ms = clib_max (1, timeout_ms); + timeout_ms = clib_min (max_timeout_ms, timeout_ms); + } + } + + /* If we still have input nodes polling (e.g. vnet packet generator) + don't sleep. */ + if (nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] > 0) + timeout_ms = 0; + + /* + * When busy: don't wait & only epoll for input + * every 1024 times through main loop. + */ + if (vector_rate > 1 || vm->api_queue_nonempty) + { + timeout_ms = 0; + node->input_main_loops_per_call = 1024; + } + else + /* We're not busy; go to sleep for a while. */ + node->input_main_loops_per_call = 0; + + /* Allow any signal to wakeup our sleep. */ + { + static sigset_t unblock_all_signals; + n_fds_ready = epoll_pwait (em->epoll_fd, + em->epoll_events, + vec_len (em->epoll_events), + timeout_ms, &unblock_all_signals); + + /* This kludge is necessary to run over absurdly old kernels */ + if (n_fds_ready < 0 && errno == ENOSYS) + { + n_fds_ready = epoll_wait (em->epoll_fd, + em->epoll_events, + vec_len (em->epoll_events), timeout_ms); + } + } + } + + if (n_fds_ready < 0) + { + if (unix_error_is_fatal (errno)) + vlib_panic_with_error (vm, clib_error_return_unix (0, "epoll_wait")); + + /* non fatal error (e.g. EINTR). */ + return 0; + } + + em->epoll_waits += 1; + em->epoll_files_ready += n_fds_ready; + + for (e = em->epoll_events; e < em->epoll_events + n_fds_ready; e++) + { + u32 i = e->data.u32; + unix_file_t *f = pool_elt_at_index (um->file_pool, i); + clib_error_t *errors[4]; + int n_errors = 0; + + if (PREDICT_TRUE (!(e->events & EPOLLERR))) + { + if (e->events & EPOLLIN) + { + errors[n_errors] = f->read_function (f); + n_errors += errors[n_errors] != 0; + } + if (e->events & EPOLLOUT) + { + errors[n_errors] = f->write_function (f); + n_errors += errors[n_errors] != 0; + } + } + else + { + if (f->error_function) + { + errors[n_errors] = f->error_function (f); + n_errors += errors[n_errors] != 0; + } + else + close (f->file_descriptor); + } + + ASSERT (n_errors < ARRAY_LEN (errors)); + for (i = 0; i < n_errors; i++) + { + unix_save_error (um, errors[i]); + } + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (linux_epoll_input_node,static) = { + .function = linux_epoll_input, + .type = VLIB_NODE_TYPE_PRE_INPUT, + .name = "unix-epoll-input", +}; +/* *INDENT-ON* */ + +clib_error_t * +linux_epoll_input_init (vlib_main_t * vm) +{ + linux_epoll_main_t *em = &linux_epoll_main; + unix_main_t *um = &unix_main; + + /* Allocate some events. */ + vec_resize (em->epoll_events, VLIB_FRAME_SIZE); + + em->epoll_fd = epoll_create (vec_len (em->epoll_events)); + if (em->epoll_fd < 0) + return clib_error_return_unix (0, "epoll_create"); + + um->file_update = linux_epoll_file_update; + + return 0; +} + +VLIB_INIT_FUNCTION (linux_epoll_input_init); + +#endif /* HAVE_LINUX_EPOLL */ + +static clib_error_t * +unix_input_init (vlib_main_t * vm) +{ + return vlib_call_init_function (vm, linux_epoll_input_init); +} + +VLIB_INIT_FUNCTION (unix_input_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c new file mode 100644 index 00000000..562778e0 --- /dev/null +++ b/src/vlib/unix/main.c @@ -0,0 +1,557 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * main.c: Unix main routine + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/** Default CLI pager limit is not configured in startup.conf */ +#define UNIX_CLI_DEFAULT_PAGER_LIMIT 100000 + +/** Default CLI history depth if not configured in startup.conf */ +#define UNIX_CLI_DEFAULT_HISTORY 50 + + +unix_main_t unix_main; + +static clib_error_t * +unix_main_init (vlib_main_t * vm) +{ + unix_main_t *um = &unix_main; + um->vlib_main = vm; + return vlib_call_init_function (vm, unix_input_init); +} + +VLIB_INIT_FUNCTION (unix_main_init); + +static void +unix_signal_handler (int signum, siginfo_t * si, ucontext_t * uc) +{ + uword fatal; + u8 *msg = 0; + + msg = format (msg, "received signal %U, PC %U", + format_signal, signum, format_ucontext_pc, uc); + + if (signum == SIGSEGV) + msg = format (msg, ", faulting address %p", si->si_addr); + + switch (signum) + { + /* these (caught) signals cause the application to exit */ + case SIGTERM: + if (unix_main.vlib_main->main_loop_exit_set) + { + syslog (LOG_ERR | LOG_DAEMON, "received SIGTERM, exiting..."); + + clib_longjmp (&unix_main.vlib_main->main_loop_exit, + VLIB_MAIN_LOOP_EXIT_CLI); + } + /* fall through */ + case SIGQUIT: + case SIGINT: + case SIGILL: + case SIGBUS: + case SIGSEGV: + case SIGHUP: + case SIGFPE: + fatal = 1; + break; + + /* by default, print a message and continue */ + default: + fatal = 0; + break; + } + + /* Null terminate. */ + vec_add1 (msg, 0); + + if (fatal) + { + syslog (LOG_ERR | LOG_DAEMON, "%s", msg); + os_exit (1); + } + else + clib_warning ("%s", msg); + + vec_free (msg); +} + +static clib_error_t * +setup_signal_handlers (unix_main_t * um) +{ + uword i; + struct sigaction sa; + + for (i = 1; i < 32; i++) + { + memset (&sa, 0, sizeof (sa)); + sa.sa_sigaction = (void *) unix_signal_handler; + sa.sa_flags = SA_SIGINFO; + + switch (i) + { + /* these signals take the default action */ + case SIGABRT: + case SIGKILL: + case SIGSTOP: + case SIGUSR1: + case SIGUSR2: + continue; + + /* ignore SIGPIPE, SIGCHLD */ + case SIGPIPE: + case SIGCHLD: + sa.sa_sigaction = (void *) SIG_IGN; + break; + + /* catch and handle all other signals */ + default: + break; + } + + if (sigaction (i, &sa, 0) < 0) + return clib_error_return_unix (0, "sigaction %U", format_signal, i); + } + + return 0; +} + +static void +unix_error_handler (void *arg, u8 * msg, int msg_len) +{ + unix_main_t *um = arg; + + /* Echo to stderr when interactive. */ + if (um->flags & UNIX_FLAG_INTERACTIVE) + { + CLIB_UNUSED (int r) = write (2, msg, msg_len); + } + else + { + char save = msg[msg_len - 1]; + + /* Null Terminate. */ + msg[msg_len - 1] = 0; + + syslog (LOG_ERR | LOG_DAEMON, "%s", msg); + + msg[msg_len - 1] = save; + } +} + +void +vlib_unix_error_report (vlib_main_t * vm, clib_error_t * error) +{ + unix_main_t *um = &unix_main; + + if (um->flags & UNIX_FLAG_INTERACTIVE || error == 0) + return; + + { + char save; + u8 *msg; + u32 msg_len; + + msg = error->what; + msg_len = vec_len (msg); + + /* Null Terminate. */ + save = msg[msg_len - 1]; + msg[msg_len - 1] = 0; + + syslog (LOG_ERR | LOG_DAEMON, "%s", msg); + + msg[msg_len - 1] = save; + } +} + +static uword +startup_config_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + unix_main_t *um = &unix_main; + u8 *buf = 0; + uword l, n = 1; + + vlib_process_suspend (vm, 2.0); + + while (um->unix_config_complete == 0) + vlib_process_suspend (vm, 0.1); + + if (um->startup_config_filename) + { + unformat_input_t sub_input; + int fd; + struct stat s; + char *fn = (char *) um->startup_config_filename; + + fd = open (fn, O_RDONLY); + if (fd < 0) + { + clib_warning ("failed to open `%s'", fn); + return 0; + } + + if (fstat (fd, &s) < 0) + { + clib_warning ("failed to stat `%s'", fn); + bail: + close (fd); + return 0; + } + + if (!(S_ISREG (s.st_mode) || S_ISLNK (s.st_mode))) + { + clib_warning ("not a regular file: `%s'", fn); + goto bail; + } + + while (n > 0) + { + l = vec_len (buf); + vec_resize (buf, 4096); + n = read (fd, buf + l, 4096); + if (n > 0) + { + _vec_len (buf) = l + n; + if (n < 4096) + break; + } + else + break; + } + if (um->log_fd && vec_len (buf)) + { + u8 *lv = 0; + lv = format (lv, "%U: ***** Startup Config *****\n%v", + format_timeval, 0 /* current bat-time */ , + 0 /* current bat-format */ , + buf); + { + int rv __attribute__ ((unused)) = + write (um->log_fd, lv, vec_len (lv)); + } + vec_reset_length (lv); + lv = format (lv, "%U: ***** End Startup Config *****\n", + format_timeval, 0 /* current bat-time */ , + 0 /* current bat-format */ ); + { + int rv __attribute__ ((unused)) = + write (um->log_fd, lv, vec_len (lv)); + } + vec_free (lv); + } + + if (vec_len (buf)) + { + unformat_init_vector (&sub_input, buf); + vlib_cli_input (vm, &sub_input, 0, 0); + /* frees buf for us */ + unformat_free (&sub_input); + } + close (fd); + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (startup_config_node,static) = { + .function = startup_config_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "startup-config-process", +}; +/* *INDENT-ON* */ + +static clib_error_t * +unix_config (vlib_main_t * vm, unformat_input_t * input) +{ + unix_main_t *um = &unix_main; + clib_error_t *error = 0; + + /* Defaults */ + um->cli_pager_buffer_limit = UNIX_CLI_DEFAULT_PAGER_LIMIT; + um->cli_history_limit = UNIX_CLI_DEFAULT_HISTORY; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + char *cli_prompt; + if (unformat (input, "interactive")) + um->flags |= UNIX_FLAG_INTERACTIVE; + else if (unformat (input, "nodaemon")) + um->flags |= UNIX_FLAG_NODAEMON; + else if (unformat (input, "cli-prompt %s", &cli_prompt)) + vlib_unix_cli_set_prompt (cli_prompt); + else + if (unformat (input, "cli-listen %s", &um->cli_listen_socket.config)) + ; + else if (unformat (input, "cli-line-mode")) + um->cli_line_mode = 1; + else if (unformat (input, "cli-no-banner")) + um->cli_no_banner = 1; + else if (unformat (input, "cli-no-pager")) + um->cli_no_pager = 1; + else if (unformat (input, "cli-pager-buffer-limit %d", + &um->cli_pager_buffer_limit)) + ; + else + if (unformat (input, "cli-history-limit %d", &um->cli_history_limit)) + ; + else if (unformat (input, "full-coredump")) + { + int fd; + + fd = open ("/proc/self/coredump_filter", O_WRONLY); + if (fd >= 0) + { + if (write (fd, "0x6f\n", 5) != 5) + clib_unix_warning ("coredump filter write failed!"); + close (fd); + } + else + clib_unix_warning ("couldn't open /proc/self/coredump_filter"); + } + else if (unformat (input, "startup-config %s", + &um->startup_config_filename)) + ; + else if (unformat (input, "exec %s", &um->startup_config_filename)) + ; + else if (unformat (input, "log %s", &um->log_filename)) + { + um->log_fd = open ((char *) um->log_filename, + O_CREAT | O_WRONLY | O_APPEND, 0644); + if (um->log_fd < 0) + { + clib_warning ("couldn't open log '%s'\n", um->log_filename); + um->log_fd = 0; + } + else + { + u8 *lv = 0; + lv = format (0, "%U: ***** Start: PID %d *****\n", + format_timeval, 0 /* current bat-time */ , + 0 /* current bat-format */ , + getpid ()); + { + int rv __attribute__ ((unused)) = + write (um->log_fd, lv, vec_len (lv)); + } + vec_free (lv); + } + } + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (!(um->flags & UNIX_FLAG_INTERACTIVE)) + { + error = setup_signal_handlers (um); + if (error) + return error; + + openlog (vm->name, LOG_CONS | LOG_PERROR | LOG_PID, LOG_DAEMON); + clib_error_register_handler (unix_error_handler, um); + + if (!(um->flags & UNIX_FLAG_NODAEMON) && daemon ( /* chdir to / */ 0, + /* stdin/stdout/stderr -> /dev/null */ + 0) < 0) + clib_error_return (0, "daemon () fails"); + } + um->unix_config_complete = 1; + + return 0; +} + +/* unix { ... } configuration. */ +/*? + * + * @cfgcmd{interactive} + * Attach CLI to stdin/out and provide a debugging command line interface. + * Implies @c nodaemon. + * + * @cfgcmd{nodaemon} + * Do not fork or background the VPP process. Typically used when invoking + * VPP applications from a process monitor. + * + * @cfgcmd{exec, <filename>} + * @par startup-config <filename> + * Read startup operational configuration from @c filename. + * The contents of the file will be performed as though entered at the CLI. + * The two keywords are aliases for the same function; if both are specified, + * only the last will have an effect. + * + * @cfgcmd{log, <filename>} + * Logs the startup configuration and all subsequent CLI commands in + * @c filename. + * Very useful in situations where folks don't remember or can't be bothered + * to include CLI commands in bug reports. + * + * @cfgcmd{full-coredump} + * Ask the Linux kernel to dump all memory-mapped address regions, instead + * of just text+data+bss. + * + * @cfgcmd{cli-listen, <address:port>} + * Bind the CLI to listen at the address and port given. @clocalhost + * on TCP port @c 5002, given as cli-listen localhost:5002, + * is typical. + * + * @cfgcmd{cli-line-mode} + * Disable character-by-character I/O on stdin. Useful when combined with, + * for example, emacs M-x gud-gdb. + * + * @cfgcmd{cli-prompt, <string>} + * Configure the CLI prompt to be @c string. + * + * @cfgcmd{cli-history-limit, <nn>} + * Limit commmand history to @c nn lines. A value of @c 0 + * disables command history. Default value: @c 50 + * + * @cfgcmd{cli-no-banner} + * Disable the login banner on stdin and Telnet connections. + * + * @cfgcmd{cli-no-pager} + * Disable the output pager. + * + * @cfgcmd{cli-pager-buffer-limit, <nn>} + * Limit pager buffer to @c nn lines of output. + * A value of @c 0 disables the pager. Default value: @c 100000 +?*/ +VLIB_CONFIG_FUNCTION (unix_config, "unix"); + +static clib_error_t * +unix_exit (vlib_main_t * vm) +{ + /* Close syslog connection. */ + closelog (); + return 0; +} + +VLIB_MAIN_LOOP_EXIT_FUNCTION (unix_exit); + +u8 **vlib_thread_stacks; + +static uword +thread0 (uword arg) +{ + vlib_main_t *vm = (vlib_main_t *) arg; + unformat_input_t input; + int i; + + unformat_init_command_line (&input, (char **) vm->argv); + i = vlib_main (vm, &input); + unformat_free (&input); + + return i; +} + +int +vlib_unix_main (int argc, char *argv[]) +{ + vlib_main_t *vm = &vlib_global_main; /* one and only time for this! */ + vlib_thread_main_t *tm = &vlib_thread_main; + unformat_input_t input; + u8 *thread_stacks; + clib_error_t *e; + int i; + + vm->argv = (u8 **) argv; + vm->name = argv[0]; + vm->heap_base = clib_mem_get_heap (); + ASSERT (vm->heap_base); + + i = vlib_plugin_early_init (vm); + if (i) + return i; + + unformat_init_command_line (&input, (char **) vm->argv); + if (vm->init_functions_called == 0) + vm->init_functions_called = hash_create (0, /* value bytes */ 0); + e = vlib_call_all_config_functions (vm, &input, 1 /* early */ ); + if (e != 0) + { + clib_error_report (e); + return 1; + } + unformat_free (&input); + + /* + * allocate n x VLIB_THREAD_STACK_SIZE stacks, aligned to a + * VLIB_THREAD_STACK_SIZE boundary + * See also: os_get_cpu_number() in vlib/vlib/threads.c + */ + thread_stacks = clib_mem_alloc_aligned + ((uword) tm->n_thread_stacks * VLIB_THREAD_STACK_SIZE, + VLIB_THREAD_STACK_SIZE); + + vec_validate (vlib_thread_stacks, tm->n_thread_stacks - 1); + for (i = 0; i < vec_len (vlib_thread_stacks); i++) + { + vlib_thread_stacks[i] = thread_stacks; + + /* + * Disallow writes to the bottom page of the stack, to + * catch stack overflows. + */ + if (mprotect (thread_stacks, clib_mem_get_page_size (), PROT_READ) < 0) + clib_unix_warning ("thread stack"); + + thread_stacks += VLIB_THREAD_STACK_SIZE; + } + + i = clib_calljmp (thread0, (uword) vm, + (void *) (vlib_thread_stacks[0] + + VLIB_THREAD_STACK_SIZE)); + return i; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/mc_socket.c b/src/vlib/unix/mc_socket.c new file mode 100644 index 00000000..9c12ad3b --- /dev/null +++ b/src/vlib/unix/mc_socket.c @@ -0,0 +1,1049 @@ +/* + * mc_socket.c: socket based multicast for vlib mc + * + * Copyright (c) 2010 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include /* for FIONBIO */ +#include /* for TCP_NODELAY */ +#include /* for struct ifreq */ + +static u8 * +format_socket_peer_id (u8 * s, va_list * args) +{ + u64 peer_id_as_u64 = va_arg (*args, u64); + mc_peer_id_t peer_id; + peer_id.as_u64 = peer_id_as_u64; + u32 a = mc_socket_peer_id_get_address (peer_id); + u32 p = mc_socket_peer_id_get_port (peer_id); + + s = format (s, "%U:%04x", format_network_address, AF_INET, &a, ntohs (p)); + + return s; +} + +typedef void (mc_msg_handler_t) (mc_main_t * mcm, void *msg, + u32 buffer_index); + +always_inline void +msg_handler (mc_main_t * mcm, + u32 buffer_index, u32 handler_frees_buffer, void *_h) +{ + vlib_main_t *vm = mcm->vlib_main; + mc_msg_handler_t *h = _h; + vlib_buffer_t *b = vlib_get_buffer (vm, buffer_index); + void *the_msg = vlib_buffer_get_current (b); + + h (mcm, the_msg, buffer_index); + if (!handler_frees_buffer) + vlib_buffer_free_one (vm, buffer_index); +} + +static uword +append_buffer_index_to_iovec (vlib_main_t * vm, + u32 buffer_index, struct iovec **iovs_return) +{ + struct iovec *i; + vlib_buffer_t *b; + u32 bi = buffer_index; + u32 l = 0; + + while (1) + { + b = vlib_get_buffer (vm, bi); + vec_add2 (*iovs_return, i, 1); + i->iov_base = vlib_buffer_get_current (b); + i->iov_len = b->current_length; + l += i->iov_len; + if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) + break; + bi = b->next_buffer; + } + + return l; +} + +static clib_error_t * +sendmsg_helper (mc_socket_main_t * msm, + int socket, struct sockaddr_in *tx_addr, u32 buffer_index) +{ + vlib_main_t *vm = msm->mc_main.vlib_main; + struct msghdr h; + word n_bytes, n_bytes_tx, n_retries; + + memset (&h, 0, sizeof (h)); + h.msg_name = tx_addr; + h.msg_namelen = sizeof (tx_addr[0]); + + if (msm->iovecs) + _vec_len (msm->iovecs) = 0; + + n_bytes = append_buffer_index_to_iovec (vm, buffer_index, &msm->iovecs); + ASSERT (n_bytes <= msm->mc_main.transport.max_packet_size); + if (n_bytes > msm->mc_main.transport.max_packet_size) + clib_error ("sending packet larger than interace MTU %d bytes", n_bytes); + + h.msg_iov = msm->iovecs; + h.msg_iovlen = vec_len (msm->iovecs); + + n_retries = 0; + while ((n_bytes_tx = sendmsg (socket, &h, /* flags */ 0)) != n_bytes + && errno == EAGAIN) + n_retries++; + if (n_bytes_tx != n_bytes) + { + clib_unix_warning ("sendmsg"); + return 0; + } + if (n_retries) + { + ELOG_TYPE_DECLARE (e) = + { + .format = "sendmsg-helper: %d retries",.format_args = "i4",}; + struct + { + u32 retries; + } *ed = 0; + + ed = ELOG_DATA (&vm->elog_main, e); + ed->retries = n_retries; + } + return 0; +} + +static clib_error_t * +tx_buffer (void *transport, mc_transport_type_t type, u32 buffer_index) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) transport; + vlib_main_t *vm = msm->mc_main.vlib_main; + mc_multicast_socket_t *ms = &msm->multicast_sockets[type]; + clib_error_t *error; + error = sendmsg_helper (msm, ms->socket, &ms->tx_addr, buffer_index); + if (type != MC_TRANSPORT_USER_REQUEST_TO_RELAY) + vlib_buffer_free_one (vm, buffer_index); + return error; +} + +static clib_error_t * +tx_ack (void *transport, mc_peer_id_t dest_peer_id, u32 buffer_index) +{ + struct sockaddr_in tx_addr; + mc_socket_main_t *msm = (mc_socket_main_t *) transport; + vlib_main_t *vm = msm->mc_main.vlib_main; + clib_error_t *error; + + memset (&tx_addr, 0, sizeof (tx_addr)); + tx_addr.sin_family = AF_INET; + tx_addr.sin_addr.s_addr = mc_socket_peer_id_get_address (dest_peer_id); + tx_addr.sin_port = mc_socket_peer_id_get_port (dest_peer_id); + + error = sendmsg_helper (msm, msm->ack_socket, &tx_addr, buffer_index); + vlib_buffer_free_one (vm, buffer_index); + return error; +} + +static clib_error_t * +recvmsg_helper (mc_socket_main_t * msm, + int socket, + struct sockaddr_in *rx_addr, + u32 * buffer_index, u32 drop_message) +{ + vlib_main_t *vm = msm->mc_main.vlib_main; + vlib_buffer_t *b; + uword n_left, n_alloc, n_mtu, i, i_rx; + const uword buffer_size = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES; + word n_bytes_left; + + /* Make sure we have at least a MTU worth of buffers. */ + n_mtu = msm->rx_mtu_n_buffers; + n_left = vec_len (msm->rx_buffers); + if (n_left < n_mtu) + { + uword max_alloc = 8 * n_mtu; + vec_validate (msm->rx_buffers, max_alloc - 1); + n_alloc = + vlib_buffer_alloc (vm, msm->rx_buffers + n_left, max_alloc - n_left); + _vec_len (msm->rx_buffers) = n_left + n_alloc; + } + + ASSERT (vec_len (msm->rx_buffers) >= n_mtu); + vec_validate (msm->iovecs, n_mtu - 1); + + /* Allocate RX buffers from end of rx_buffers. + Turn them into iovecs to pass to readv. */ + i_rx = vec_len (msm->rx_buffers) - 1; + for (i = 0; i < n_mtu; i++) + { + b = vlib_get_buffer (vm, msm->rx_buffers[i_rx - i]); + msm->iovecs[i].iov_base = b->data; + msm->iovecs[i].iov_len = buffer_size; + } + _vec_len (msm->iovecs) = n_mtu; + + { + struct msghdr h; + + memset (&h, 0, sizeof (h)); + if (rx_addr) + { + h.msg_name = rx_addr; + h.msg_namelen = sizeof (rx_addr[0]); + } + h.msg_iov = msm->iovecs; + h.msg_iovlen = vec_len (msm->iovecs); + + n_bytes_left = recvmsg (socket, &h, 0); + if (n_bytes_left < 0) + return clib_error_return_unix (0, "recvmsg"); + } + + if (drop_message) + { + *buffer_index = ~0; + return 0; + } + + *buffer_index = msm->rx_buffers[i_rx]; + while (1) + { + b = vlib_get_buffer (vm, msm->rx_buffers[i_rx]); + + b->flags = 0; + b->current_data = 0; + b->current_length = + n_bytes_left < buffer_size ? n_bytes_left : buffer_size; + + n_bytes_left -= buffer_size; + + if (n_bytes_left <= 0) + break; + + i_rx--; + b->flags |= VLIB_BUFFER_NEXT_PRESENT; + b->next_buffer = msm->rx_buffers[i_rx]; + } + + _vec_len (msm->rx_buffers) = i_rx; + + return 0 /* no error */ ; +} + +static clib_error_t * +mastership_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + mc_main_t *mcm = &msm->mc_main; + mc_multicast_socket_t *ms = + &msm->multicast_sockets[MC_TRANSPORT_MASTERSHIP]; + clib_error_t *error; + u32 bi; + + error = recvmsg_helper (msm, ms->socket, /* rx_addr */ 0, &bi, /* drop_message */ + 0); + if (!error) + msg_handler (mcm, bi, + /* handler_frees_buffer */ 0, + mc_msg_master_assert_handler); + + return error; +} + +static clib_error_t * +to_relay_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + mc_main_t *mcm = &msm->mc_main; + vlib_main_t *vm = msm->mc_main.vlib_main; + mc_multicast_socket_t *ms_to_relay = + &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_TO_RELAY]; + mc_multicast_socket_t *ms_from_relay = + &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY]; + clib_error_t *error; + u32 bi; + u32 is_master = mcm->relay_state == MC_RELAY_STATE_MASTER; + + /* Not the ordering master? Turf the msg */ + error = recvmsg_helper (msm, ms_to_relay->socket, /* rx_addr */ 0, &bi, + /* drop_message */ !is_master); + + /* If we are the master, number and rebroadcast the msg. */ + if (!error && is_master) + { + vlib_buffer_t *b = vlib_get_buffer (vm, bi); + mc_msg_user_request_t *mp = vlib_buffer_get_current (b); + mp->global_sequence = clib_host_to_net_u32 (mcm->relay_global_sequence); + mcm->relay_global_sequence++; + error = + sendmsg_helper (msm, ms_from_relay->socket, &ms_from_relay->tx_addr, + bi); + vlib_buffer_free_one (vm, bi); + } + + return error; +} + +static clib_error_t * +from_relay_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + mc_main_t *mcm = &msm->mc_main; + mc_multicast_socket_t *ms = + &msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY]; + clib_error_t *error; + u32 bi; + + error = recvmsg_helper (msm, ms->socket, /* rx_addr */ 0, &bi, /* drop_message */ + 0); + if (!error) + { + msg_handler (mcm, bi, /* handler_frees_buffer */ 1, + mc_msg_user_request_handler); + } + return error; +} + +static clib_error_t * +join_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + mc_main_t *mcm = &msm->mc_main; + vlib_main_t *vm = mcm->vlib_main; + mc_multicast_socket_t *ms = &msm->multicast_sockets[MC_TRANSPORT_JOIN]; + clib_error_t *error; + u32 bi; + + error = recvmsg_helper (msm, ms->socket, /* rx_addr */ 0, &bi, /* drop_message */ + 0); + if (!error) + { + vlib_buffer_t *b = vlib_get_buffer (vm, bi); + mc_msg_join_or_leave_request_t *mp = vlib_buffer_get_current (b); + + switch (clib_host_to_net_u32 (mp->type)) + { + case MC_MSG_TYPE_join_or_leave_request: + msg_handler (mcm, bi, /* handler_frees_buffer */ 0, + mc_msg_join_or_leave_request_handler); + break; + + case MC_MSG_TYPE_join_reply: + msg_handler (mcm, bi, /* handler_frees_buffer */ 0, + mc_msg_join_reply_handler); + break; + + default: + ASSERT (0); + break; + } + } + return error; +} + +static clib_error_t * +ack_socket_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + mc_main_t *mcm = &msm->mc_main; + clib_error_t *error; + u32 bi; + + error = recvmsg_helper (msm, msm->ack_socket, /* rx_addr */ 0, &bi, + /* drop_message */ 0); + if (!error) + msg_handler (mcm, bi, /* handler_frees_buffer */ 0, + mc_msg_user_ack_handler); + return error; +} + +static void +catchup_cleanup (mc_socket_main_t * msm, + mc_socket_catchup_t * c, unix_main_t * um, unix_file_t * uf) +{ + hash_unset (msm->catchup_index_by_file_descriptor, uf->file_descriptor); + unix_file_del (um, uf); + vec_free (c->input_vector); + vec_free (c->output_vector); + pool_put (msm->catchups, c); +} + +static mc_socket_catchup_t * +find_catchup_from_file_descriptor (mc_socket_main_t * msm, + int file_descriptor) +{ + uword *p = + hash_get (msm->catchup_index_by_file_descriptor, file_descriptor); + return p ? pool_elt_at_index (msm->catchups, p[0]) : 0; +} + +static clib_error_t * +catchup_socket_read_ready (unix_file_t * uf, int is_server) +{ + unix_main_t *um = &unix_main; + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + mc_main_t *mcm = &msm->mc_main; + mc_socket_catchup_t *c = + find_catchup_from_file_descriptor (msm, uf->file_descriptor); + word l, n, is_eof; + + l = vec_len (c->input_vector); + vec_resize (c->input_vector, 4096); + n = + read (uf->file_descriptor, c->input_vector + l, + vec_len (c->input_vector) - l); + is_eof = n == 0; + + if (n < 0) + { + if (errno == EAGAIN) + n = 0; + else + { + catchup_cleanup (msm, c, um, uf); + return clib_error_return_unix (0, "read"); + } + } + + _vec_len (c->input_vector) = l + n; + + if (is_eof && vec_len (c->input_vector) > 0) + { + if (is_server) + { + mc_msg_catchup_request_handler (mcm, (void *) c->input_vector, + c - msm->catchups); + _vec_len (c->input_vector) = 0; + } + else + { + mc_msg_catchup_reply_handler (mcm, (void *) c->input_vector, + c - msm->catchups); + c->input_vector = 0; /* reply handler is responsible for freeing vector */ + catchup_cleanup (msm, c, um, uf); + } + } + + return 0 /* no error */ ; +} + +static clib_error_t * +catchup_server_read_ready (unix_file_t * uf) +{ + return catchup_socket_read_ready (uf, /* is_server */ 1); +} + +static clib_error_t * +catchup_client_read_ready (unix_file_t * uf) +{ + if (MC_EVENT_LOGGING) + { + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + vlib_main_t *vm = msm->mc_main.vlib_main; + + ELOG_TYPE (e, "catchup_client_read_ready"); + ELOG (&vm->elog_main, e, 0); + } + return catchup_socket_read_ready (uf, /* is_server */ 0); +} + +static clib_error_t * +catchup_socket_write_ready (unix_file_t * uf, int is_server) +{ + unix_main_t *um = &unix_main; + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + mc_socket_catchup_t *c = + find_catchup_from_file_descriptor (msm, uf->file_descriptor); + clib_error_t *error = 0; + int n; + + if (c->connect_in_progress) + { + u32 len, value; + + c->connect_in_progress = 0; + len = sizeof (value); + if (getsockopt (c->socket, SOL_SOCKET, SO_ERROR, &value, &len) < 0) + { + error = clib_error_return_unix (0, "getsockopt SO_ERROR"); + goto error_quit; + } + if (value != 0) + { + error = + clib_error_return_code (0, value, CLIB_ERROR_ERRNO_VALID, + "connect fails"); + goto error_quit; + } + } + + while (1) + { + u32 n_this_write; + + n_this_write = + clib_min (vec_len (c->output_vector) - c->output_vector_n_written, + msm->rx_mtu_n_bytes - + 64 /* ip + tcp + option allowance */ ); + + if (n_this_write <= 0) + break; + + do + { + n = write (uf->file_descriptor, + c->output_vector + c->output_vector_n_written, + n_this_write); + } + while (n < 0 && errno == EAGAIN); + + if (n < 0) + { + error = clib_error_return_unix (0, "write"); + goto error_quit; + } + c->output_vector_n_written += n; + } + + if (c->output_vector_n_written >= vec_len (c->output_vector)) + { + if (!is_server) + { + uf->flags &= ~UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + unix_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY); + /* Send EOF to other side. */ + shutdown (uf->file_descriptor, SHUT_WR); + return error; + } + else + { + error_quit: + catchup_cleanup (msm, c, um, uf); + } + } + return error; +} + +static clib_error_t * +catchup_server_write_ready (unix_file_t * uf) +{ + return catchup_socket_write_ready (uf, /* is_server */ 1); +} + +static clib_error_t * +catchup_client_write_ready (unix_file_t * uf) +{ + return catchup_socket_write_ready (uf, /* is_server */ 0); +} + +static clib_error_t * +catchup_socket_error_ready (unix_file_t * uf) +{ + unix_main_t *um = &unix_main; + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + mc_socket_catchup_t *c = + find_catchup_from_file_descriptor (msm, uf->file_descriptor); + catchup_cleanup (msm, c, um, uf); + return clib_error_return (0, "error"); +} + +static clib_error_t * +catchup_listen_read_ready (unix_file_t * uf) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; + struct sockaddr_in client_addr; + int client_len; + mc_socket_catchup_t *c; + unix_file_t template = { 0 }; + + pool_get (msm->catchups, c); + memset (c, 0, sizeof (c[0])); + + client_len = sizeof (client_addr); + + /* Acquires the non-blocking attrib from the server socket. */ + c->socket = accept (uf->file_descriptor, + (struct sockaddr *) &client_addr, + (socklen_t *) & client_len); + + if (c->socket < 0) + { + pool_put (msm->catchups, c); + return clib_error_return_unix (0, "accept"); + } + + if (MC_EVENT_LOGGING) + { + mc_main_t *mcm = &msm->mc_main; + vlib_main_t *vm = mcm->vlib_main; + + ELOG_TYPE_DECLARE (e) = + { + .format = "catchup accepted from 0x%lx",.format_args = "i4",}; + struct + { + u32 addr; + } *ed = 0; + + ed = ELOG_DATA (&vm->elog_main, e); + ed->addr = ntohl (client_addr.sin_addr.s_addr); + } + + /* Disable the Nagle algorithm, ship catchup pkts immediately */ + { + int one = 1; + if ((setsockopt (c->socket, IPPROTO_TCP, + TCP_NODELAY, (void *) &one, sizeof (one))) < 0) + { + clib_unix_warning ("catchup socket: set TCP_NODELAY"); + } + } + + template.read_function = catchup_server_read_ready; + template.write_function = catchup_server_write_ready; + template.error_function = catchup_socket_error_ready; + template.file_descriptor = c->socket; + template.private_data = pointer_to_uword (msm); + c->unix_file_index = unix_file_add (&unix_main, &template); + hash_set (msm->catchup_index_by_file_descriptor, c->socket, + c - msm->catchups); + + return 0; +} + +/* Return and bind to an unused port. */ +static word +find_and_bind_to_free_port (word sock, word port) +{ + for (; port < 1 << 16; port++) + { + struct sockaddr_in a; + + memset (&a, 0, sizeof (a)); /* Warnings be gone */ + + a.sin_family = PF_INET; + a.sin_addr.s_addr = INADDR_ANY; + a.sin_port = htons (port); + + if (bind (sock, (struct sockaddr *) &a, sizeof (a)) >= 0) + break; + } + + return port < 1 << 16 ? port : -1; +} + +static clib_error_t * +setup_mutlicast_socket (mc_socket_main_t * msm, + mc_multicast_socket_t * ms, + char *type, uword udp_port) +{ + int one = 1; + struct ip_mreq mcast_req; + + if (!msm->multicast_ttl) + msm->multicast_ttl = 1; + + /* mastership (multicast) TX socket */ + if ((ms->socket = socket (PF_INET, SOCK_DGRAM, IPPROTO_UDP)) < 0) + return clib_error_return_unix (0, "%s socket", type); + + { + u8 ttl = msm->multicast_ttl; + + if ((setsockopt (ms->socket, IPPROTO_IP, + IP_MULTICAST_TTL, (void *) &ttl, sizeof (ttl))) < 0) + return clib_error_return_unix (0, "%s set multicast ttl", type); + } + + if (setsockopt (ms->socket, SOL_SOCKET, SO_REUSEADDR, &one, sizeof (one)) < + 0) + return clib_error_return_unix (0, "%s setsockopt SO_REUSEADDR", type); + + memset (&ms->tx_addr, 0, sizeof (ms->tx_addr)); + ms->tx_addr.sin_family = AF_INET; + ms->tx_addr.sin_addr.s_addr = + htonl (msm->multicast_tx_ip4_address_host_byte_order); + ms->tx_addr.sin_port = htons (udp_port); + + if (bind (ms->socket, (struct sockaddr *) &ms->tx_addr, + sizeof (ms->tx_addr)) < 0) + return clib_error_return_unix (0, "%s bind", type); + + memset (&mcast_req, 0, sizeof (mcast_req)); + mcast_req.imr_multiaddr.s_addr = + htonl (msm->multicast_tx_ip4_address_host_byte_order); + mcast_req.imr_interface.s_addr = msm->if_ip4_address_net_byte_order; + + if ((setsockopt (ms->socket, IPPROTO_IP, + IP_ADD_MEMBERSHIP, (void *) &mcast_req, + sizeof (mcast_req))) < 0) + return clib_error_return_unix (0, "%s IP_ADD_MEMBERSHIP setsockopt", + type); + + if (ioctl (ms->socket, FIONBIO, &one) < 0) + return clib_error_return_unix (0, "%s set FIONBIO", type); + + /* FIXME remove this when we support tx_ready. */ + { + u32 len = 1 << 20; + socklen_t sl = sizeof (len); + if (setsockopt (ms->socket, SOL_SOCKET, SO_SNDBUF, &len, sl) < 0) + clib_unix_error ("setsockopt"); + } + + return 0; +} + +static clib_error_t * +socket_setup (mc_socket_main_t * msm) +{ + int one = 1; + clib_error_t *error; + u32 port; + + if (!msm->base_multicast_udp_port_host_byte_order) + msm->base_multicast_udp_port_host_byte_order = + 0xffff - ((MC_N_TRANSPORT_TYPE + 2 /* ack socket, catchup socket */ ) + - 1); + + port = msm->base_multicast_udp_port_host_byte_order; + + error = setup_mutlicast_socket (msm, + &msm->multicast_sockets + [MC_TRANSPORT_MASTERSHIP], "mastership", + port++); + if (error) + return error; + + error = setup_mutlicast_socket (msm, + &msm->multicast_sockets[MC_TRANSPORT_JOIN], + "join", port++); + if (error) + return error; + + error = setup_mutlicast_socket (msm, + &msm->multicast_sockets + [MC_TRANSPORT_USER_REQUEST_TO_RELAY], + "to relay", port++); + if (error) + return error; + + error = setup_mutlicast_socket (msm, + &msm->multicast_sockets + [MC_TRANSPORT_USER_REQUEST_FROM_RELAY], + "from relay", port++); + if (error) + return error; + + /* ACK rx socket */ + msm->ack_socket = socket (PF_INET, SOCK_DGRAM, IPPROTO_UDP); + if (msm->ack_socket < 0) + return clib_error_return_unix (0, "ack socket"); + + msm->ack_udp_port = find_and_bind_to_free_port (msm->ack_socket, port++); + + if (ioctl (msm->ack_socket, FIONBIO, &one) < 0) + return clib_error_return_unix (0, "ack socket FIONBIO"); + + msm->catchup_server_socket = socket (AF_INET, SOCK_STREAM, 0); + if (msm->catchup_server_socket < 0) + return clib_error_return_unix (0, "catchup server socket"); + + msm->catchup_tcp_port = + find_and_bind_to_free_port (msm->catchup_server_socket, port++); + + if (ioctl (msm->catchup_server_socket, FIONBIO, &one) < 0) + return clib_error_return_unix (0, "catchup server socket FIONBIO"); + + if (listen (msm->catchup_server_socket, 5) < 0) + return clib_error_return_unix (0, "catchup server socket listen"); + + /* epoll setup for multicast mastership socket */ + { + unix_file_t template = { 0 }; + + template.read_function = mastership_socket_read_ready; + template.file_descriptor = + msm->multicast_sockets[MC_TRANSPORT_MASTERSHIP].socket; + template.private_data = (uword) msm; + unix_file_add (&unix_main, &template); + + /* epoll setup for multicast to_relay socket */ + template.read_function = to_relay_socket_read_ready; + template.file_descriptor = + msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_TO_RELAY].socket; + template.private_data = (uword) msm; + unix_file_add (&unix_main, &template); + + /* epoll setup for multicast from_relay socket */ + template.read_function = from_relay_socket_read_ready; + template.file_descriptor = + msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY].socket; + template.private_data = (uword) msm; + unix_file_add (&unix_main, &template); + + template.read_function = join_socket_read_ready; + template.file_descriptor = + msm->multicast_sockets[MC_TRANSPORT_JOIN].socket; + template.private_data = (uword) msm; + unix_file_add (&unix_main, &template); + + /* epoll setup for ack rx socket */ + template.read_function = ack_socket_read_ready; + template.file_descriptor = msm->ack_socket; + template.private_data = (uword) msm; + unix_file_add (&unix_main, &template); + + /* epoll setup for TCP catchup server */ + template.read_function = catchup_listen_read_ready; + template.file_descriptor = msm->catchup_server_socket; + template.private_data = (uword) msm; + unix_file_add (&unix_main, &template); + } + + return 0; +} + +static void * +catchup_add_pending_output (mc_socket_catchup_t * c, uword n_bytes, + u8 * set_output_vector) +{ + unix_file_t *uf = pool_elt_at_index (unix_main.file_pool, + c->unix_file_index); + u8 *result = 0; + + if (set_output_vector) + c->output_vector = set_output_vector; + else + vec_add2 (c->output_vector, result, n_bytes); + if (vec_len (c->output_vector) > 0) + { + int skip_update = 0 != (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); + uf->flags |= UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + if (!skip_update) + unix_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY); + } + return result; +} + +static uword +catchup_request_fun (void *transport_main, + u32 stream_index, mc_peer_id_t catchup_peer_id) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) transport_main; + mc_main_t *mcm = &msm->mc_main; + vlib_main_t *vm = mcm->vlib_main; + mc_socket_catchup_t *c; + struct sockaddr_in addr; + unix_main_t *um = &unix_main; + int one = 1; + + pool_get (msm->catchups, c); + memset (c, 0, sizeof (*c)); + + c->socket = socket (AF_INET, SOCK_STREAM, 0); + if (c->socket < 0) + { + clib_unix_warning ("socket"); + return 0; + } + + if (ioctl (c->socket, FIONBIO, &one) < 0) + { + clib_unix_warning ("FIONBIO"); + return 0; + } + + memset (&addr, 0, sizeof (addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = mc_socket_peer_id_get_address (catchup_peer_id); + addr.sin_port = mc_socket_peer_id_get_port (catchup_peer_id); + + c->connect_in_progress = 1; + + if (MC_EVENT_LOGGING) + { + ELOG_TYPE_DECLARE (e) = + { + .format = "connecting to peer 0x%Lx",.format_args = "i8",}; + struct + { + u64 peer; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->peer = catchup_peer_id.as_u64; + } + + if (connect (c->socket, (const void *) &addr, sizeof (addr)) + < 0 && errno != EINPROGRESS) + { + clib_unix_warning ("connect to %U fails", + format_socket_peer_id, catchup_peer_id); + return 0; + } + + { + unix_file_t template = { 0 }; + + template.read_function = catchup_client_read_ready; + template.write_function = catchup_client_write_ready; + template.error_function = catchup_socket_error_ready; + template.file_descriptor = c->socket; + template.private_data = (uword) msm; + c->unix_file_index = unix_file_add (um, &template); + + hash_set (msm->catchup_index_by_file_descriptor, c->socket, + c - msm->catchups); + } + + { + mc_msg_catchup_request_t *mp; + mp = catchup_add_pending_output (c, sizeof (mp[0]), /* set_output_vector */ + 0); + mp->peer_id = msm->mc_main.transport.our_catchup_peer_id; + mp->stream_index = stream_index; + mc_byte_swap_msg_catchup_request (mp); + } + + return c - msm->catchups; +} + +static void +catchup_send_fun (void *transport_main, uword opaque, u8 * data) +{ + mc_socket_main_t *msm = (mc_socket_main_t *) transport_main; + mc_socket_catchup_t *c = pool_elt_at_index (msm->catchups, opaque); + catchup_add_pending_output (c, 0, data); +} + +static int +find_interface_ip4_address (char *if_name, u32 * ip4_address, u32 * mtu) +{ + int fd; + struct ifreq ifr; + struct sockaddr_in *sa; + + /* Dig up our IP address */ + fd = socket (PF_INET, AF_INET, 0); + if (fd < 0) + { + clib_unix_error ("socket"); + return -1; + } + + ifr.ifr_addr.sa_family = AF_INET; + strncpy (ifr.ifr_name, if_name, sizeof (ifr.ifr_name) - 1); + if (ioctl (fd, SIOCGIFADDR, &ifr) < 0) + { + clib_unix_error ("ioctl(SIOCFIGADDR)"); + close (fd); + return -1; + } + + sa = (void *) &ifr.ifr_addr; + clib_memcpy (ip4_address, &sa->sin_addr.s_addr, sizeof (ip4_address[0])); + + if (ioctl (fd, SIOCGIFMTU, &ifr) < 0) + { + close (fd); + return -1; + } + if (mtu) + *mtu = ifr.ifr_mtu - ( /* IP4 header */ 20 + /* UDP header */ 8); + + close (fd); + + return 0; +} + +clib_error_t * +mc_socket_main_init (mc_socket_main_t * msm, char **intfc_probe_list, + int n_intfcs_to_probe) +{ + clib_error_t *error; + mc_main_t *mcm; + u32 mtu; + + mcm = &msm->mc_main; + + /* 239.255.0.7 */ + if (!msm->multicast_tx_ip4_address_host_byte_order) + msm->multicast_tx_ip4_address_host_byte_order = 0xefff0007; + + { + u32 i, a, win; + + win = 0; + if (msm->multicast_interface_name) + { + win = + !find_interface_ip4_address (msm->multicast_interface_name, &a, + &mtu); + } + else + { + for (i = 0; i < n_intfcs_to_probe; i++) + if (!find_interface_ip4_address (intfc_probe_list[i], &a, &mtu)) + { + win = 1; + msm->multicast_interface_name = intfc_probe_list[i]; + break; + } + } + + if (!win) + return clib_error_return (0, "can't find interface ip4 address"); + + msm->if_ip4_address_net_byte_order = a; + } + + msm->rx_mtu_n_bytes = mtu; + msm->rx_mtu_n_buffers = + msm->rx_mtu_n_bytes / VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES; + msm->rx_mtu_n_buffers += + (msm->rx_mtu_n_bytes % VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES) != 0; + + error = socket_setup (msm); + if (error) + return error; + + mcm->transport.our_ack_peer_id = + mc_socket_set_peer_id (msm->if_ip4_address_net_byte_order, + msm->ack_udp_port); + + mcm->transport.our_catchup_peer_id = + mc_socket_set_peer_id (msm->if_ip4_address_net_byte_order, + msm->catchup_tcp_port); + + mcm->transport.tx_buffer = tx_buffer; + mcm->transport.tx_ack = tx_ack; + mcm->transport.catchup_request_fun = catchup_request_fun; + mcm->transport.catchup_send_fun = catchup_send_fun; + mcm->transport.format_peer_id = format_socket_peer_id; + mcm->transport.opaque = msm; + mcm->transport.max_packet_size = mtu; + + mc_main_init (mcm, "socket"); + + return error; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/mc_socket.h b/src/vlib/unix/mc_socket.h new file mode 100644 index 00000000..273c9ad4 --- /dev/null +++ b/src/vlib/unix/mc_socket.h @@ -0,0 +1,137 @@ +/* + * mc_socket.h: socket based multicast for vlib mc + * + * Copyright (c) 2010 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __included_mc_socket_h__ +#define __included_mc_socket_h__ + +#include +#include + +typedef struct +{ + int socket; + struct sockaddr_in tx_addr; +} mc_multicast_socket_t; + +/* TCP catchup socket */ +typedef struct +{ + int socket; + u32 unix_file_index; + + u8 *input_vector; + u8 *output_vector; + u32 output_vector_n_written; + + u32 connect_in_progress; +} mc_socket_catchup_t; + +typedef struct mc_socket_main_t +{ + mc_main_t mc_main; + + /* Multicast mastership/to-relay/from-relay sockets. */ + mc_multicast_socket_t multicast_sockets[MC_N_TRANSPORT_TYPE]; + + /* Unicast UDP ack sockets */ + int ack_socket; + + /* TCP catchup server socket */ + int catchup_server_socket; + + /* Pool of stream-private catchup sockets */ + mc_socket_catchup_t *catchups; + + uword *catchup_index_by_file_descriptor; + + u32 rx_mtu_n_bytes; + + /* Receive MTU in bytes and VLIB buffers. */ + u32 rx_mtu_n_buffers; + + /* Vector of RX VLIB buffers. */ + u32 *rx_buffers; + /* Vector of scatter/gather descriptors for sending/receiving VLIB buffers + via kernel. */ + struct iovec *iovecs; + + /* IP address of interface to use for multicast. */ + u32 if_ip4_address_net_byte_order; + + u32 ack_udp_port; + u32 catchup_tcp_port; + + /* Interface on which to listen for multicasts. */ + char *multicast_interface_name; + + /* Multicast address to use (e.g. 0xefff0000). + Host byte order. */ + u32 multicast_tx_ip4_address_host_byte_order; + + /* TTL to use for multicasts. */ + u32 multicast_ttl; + + /* Multicast ports for mastership, joins, etc. will be chosen + starting at the given port in host byte order. + A total of MC_N_TRANSPORT_TYPE ports will be used. */ + u32 base_multicast_udp_port_host_byte_order; +} mc_socket_main_t; + +always_inline u32 +mc_socket_peer_id_get_address (mc_peer_id_t i) +{ + u32 a = ((i.as_u8[0] << 24) + | (i.as_u8[1] << 16) | (i.as_u8[2] << 8) | (i.as_u8[3] << 0)); + return clib_host_to_net_u32 (a); +} + +always_inline u32 +mc_socket_peer_id_get_port (mc_peer_id_t i) +{ + return clib_host_to_net_u16 ((i.as_u8[4] << 8) | i.as_u8[5]); +} + +static_always_inline mc_peer_id_t +mc_socket_set_peer_id (u32 address_net_byte_order, u32 port_host_byte_order) +{ + mc_peer_id_t i; + u32 a = ntohl (address_net_byte_order); + u32 p = port_host_byte_order; + i.as_u8[0] = (a >> 24) & 0xff; + i.as_u8[1] = (a >> 16) & 0xff; + i.as_u8[2] = (a >> 8) & 0xff; + i.as_u8[3] = (a >> 0) & 0xff; + i.as_u8[4] = (p >> 8) & 0xff; + i.as_u8[5] = (p >> 0) & 0xff; + i.as_u8[6] = 0; + i.as_u8[7] = 0; + return i; +} + +clib_error_t *mc_socket_main_init (mc_socket_main_t * msm, + char **intfc_probe_list, + int n_intfcs_to_probe); +#endif /* __included_mc_socket_h__ */ + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c new file mode 100644 index 00000000..80ab7b9d --- /dev/null +++ b/src/vlib/unix/physmem.c @@ -0,0 +1,470 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * physmem.c: Unix physical memory + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include + +static physmem_main_t physmem_main; + +static void * +unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes, + uword alignment) +{ + physmem_main_t *pm = &physmem_main; + uword lo_offset, hi_offset; + uword *to_free = 0; + +#if DPDK > 0 + clib_warning ("unsafe alloc!"); +#endif + + /* IO memory is always at least cache aligned. */ + alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES); + + while (1) + { + mheap_get_aligned (pm->heap, n_bytes, + /* align */ alignment, + /* align offset */ 0, + &lo_offset); + + /* Allocation failed? */ + if (lo_offset == ~0) + break; + + /* Make sure allocation does not span DMA physical chunk boundary. */ + hi_offset = lo_offset + n_bytes - 1; + + if ((lo_offset >> vpm->log2_n_bytes_per_page) == + (hi_offset >> vpm->log2_n_bytes_per_page)) + break; + + /* Allocation would span chunk boundary, queue it to be freed as soon as + we find suitable chunk. */ + vec_add1 (to_free, lo_offset); + } + + if (to_free != 0) + { + uword i; + for (i = 0; i < vec_len (to_free); i++) + mheap_put (pm->heap, to_free[i]); + vec_free (to_free); + } + + return lo_offset != ~0 ? pm->heap + lo_offset : 0; +} + +static void +unix_physmem_free (void *x) +{ + physmem_main_t *pm = &physmem_main; + + /* Return object to region's heap. */ + mheap_put (pm->heap, x - pm->heap); +} + +static void +htlb_shutdown (void) +{ + physmem_main_t *pm = &physmem_main; + + if (!pm->shmid) + return; + shmctl (pm->shmid, IPC_RMID, 0); + pm->shmid = 0; +} + +/* try to use huge TLB pgs if possible */ +static int +htlb_init (vlib_main_t * vm) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + physmem_main_t *pm = &physmem_main; + u64 hugepagesize, pagesize; + u64 pfn, seek_loc; + u64 cur, physaddr, ptbits; + int fd, i; + + pm->shmid = shmget (11 /* key, my amp goes to 11 */ , pm->mem_size, + IPC_CREAT | SHM_HUGETLB | SHM_R | SHM_W); + if (pm->shmid < 0) + { + clib_unix_warning ("shmget"); + return 0; + } + + pm->mem = shmat (pm->shmid, NULL, 0 /* flags */ ); + if (pm->mem == 0) + { + shmctl (pm->shmid, IPC_RMID, 0); + return 0; + } + + memset (pm->mem, 0, pm->mem_size); + + /* $$$ get page size info from /proc/meminfo */ + hugepagesize = 2 << 20; + pagesize = 4 << 10; + vpm->log2_n_bytes_per_page = min_log2 (hugepagesize); + vec_resize (vpm->page_table, pm->mem_size / hugepagesize); + + vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page); + vpm->virtual.start = pointer_to_uword (pm->mem); + vpm->virtual.size = pm->mem_size; + vpm->virtual.end = vpm->virtual.start + vpm->virtual.size; + + fd = open ("/proc/self/pagemap", O_RDONLY); + + if (fd < 0) + { + (void) shmdt (pm->mem); + return 0; + } + + pm->heap = mheap_alloc_with_flags (pm->mem, pm->mem_size, + /* Don't want mheap mmap/munmap with IO memory. */ + MHEAP_FLAG_DISABLE_VM); + + cur = pointer_to_uword (pm->mem); + i = 0; + + while (cur < pointer_to_uword (pm->mem) + pm->mem_size) + { + pfn = (u64) cur / pagesize; + seek_loc = pfn * sizeof (u64); + if (lseek (fd, seek_loc, SEEK_SET) != seek_loc) + { + clib_unix_warning ("lseek to 0x%llx", seek_loc); + shmctl (pm->shmid, IPC_RMID, 0); + close (fd); + return 0; + } + if (read (fd, &ptbits, sizeof (ptbits)) != (sizeof (ptbits))) + { + clib_unix_warning ("read ptbits"); + shmctl (pm->shmid, IPC_RMID, 0); + close (fd); + return 0; + } + + /* bits 0-54 are the physical page number */ + physaddr = (ptbits & 0x7fffffffffffffULL) * pagesize; + if (CLIB_DEBUG > 1) + fformat (stderr, "pm: virtual 0x%llx physical 0x%llx\n", + cur, physaddr); + vpm->page_table[i++] = physaddr; + + cur += hugepagesize; + } + close (fd); + atexit (htlb_shutdown); + return 1; +} + +int vlib_app_physmem_init (vlib_main_t * vm, + physmem_main_t * pm, int) __attribute__ ((weak)); +int +vlib_app_physmem_init (vlib_main_t * vm, physmem_main_t * pm, int x) +{ + return 0; +} + +clib_error_t * +unix_physmem_init (vlib_main_t * vm, int physical_memory_required) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + physmem_main_t *pm = &physmem_main; + clib_error_t *error = 0; + + /* Avoid multiple calls. */ + if (vm->os_physmem_alloc_aligned) + return error; + + vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; + vm->os_physmem_free = unix_physmem_free; + pm->mem = MAP_FAILED; + + if (pm->mem_size == 0) + pm->mem_size = 16 << 20; + + /* OK, Mr. App, you tell us */ + if (vlib_app_physmem_init (vm, pm, physical_memory_required)) + return 0; + + if (!pm->no_hugepages && htlb_init (vm)) + { + fformat (stderr, "%s: use huge pages\n", __FUNCTION__); + return 0; + } + + pm->mem = + mmap (0, pm->mem_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (pm->mem == MAP_FAILED) + { + error = clib_error_return_unix (0, "mmap"); + goto done; + } + + pm->heap = mheap_alloc (pm->mem, pm->mem_size); + + /* Identity map with a single page. */ + vpm->log2_n_bytes_per_page = min_log2 (pm->mem_size); + vec_add1 (vpm->page_table, pointer_to_uword (pm->mem)); + + vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page); + vpm->virtual.start = pointer_to_uword (pm->mem); + vpm->virtual.size = pm->mem_size; + vpm->virtual.end = vpm->virtual.start + vpm->virtual.size; + vpm->is_fake = 1; + + fformat (stderr, "%s: use fake dma pages\n", __FUNCTION__); + +done: + if (error) + { + if (pm->mem != MAP_FAILED) + munmap (pm->mem, pm->mem_size); + } + return error; +} + +static clib_error_t * +show_physmem (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ +#if DPDK > 0 + vlib_cli_output (vm, "Not supported with DPDK drivers."); +#else + physmem_main_t *pm = &physmem_main; + + if (pm->heap) + vlib_cli_output (vm, "%U", format_mheap, pm->heap, /* verbose */ 1); + else + vlib_cli_output (vm, "No physmem allocated."); +#endif + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_physmem_command, static) = { + .path = "show physmem", + .short_help = "Show physical memory allocation", + .function = show_physmem, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_affinity (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + cpu_set_t set; + cpu_set_t *setp = &set; + int i, rv; + u8 *s = 0; + int first_set_bit_in_run = -1; + int last_set_bit_in_run = -1; + int output_done = 0; + + rv = sched_getaffinity (0 /* pid, 0 = this proc */ , + sizeof (*setp), setp); + if (rv < 0) + { + vlib_cli_output (vm, "Couldn't get affinity mask: %s\n", + strerror (errno)); + return 0; + } + + for (i = 0; i < 64; i++) + { + if (CPU_ISSET (i, setp)) + { + if (first_set_bit_in_run == -1) + { + first_set_bit_in_run = i; + last_set_bit_in_run = i; + if (output_done) + s = format (s, ","); + s = format (s, "%d-", i); + output_done = 1; + } + else + { + if (i == (last_set_bit_in_run + 1)) + last_set_bit_in_run = i; + } + } + else + { + if (first_set_bit_in_run != -1) + { + if (first_set_bit_in_run == (i - 1)) + { + _vec_len (s) -= 2 + ((first_set_bit_in_run / 10)); + } + s = format (s, "%d", last_set_bit_in_run); + first_set_bit_in_run = -1; + last_set_bit_in_run = -1; + } + } + } + + if (first_set_bit_in_run != -1) + s = format (s, "%d", first_set_bit_in_run); + + vlib_cli_output (vm, "Process runs on: %v", s); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_affinity_command, static) = { + .path = "show affinity", + .short_help = "Show process cpu affinity", + .function = show_affinity, +}; +/* *INDENT-ON* */ + +static clib_error_t * +set_affinity (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + cpu_set_t set; + cpu_set_t *setp = &set; + int i, rv; + int another_round; + u32 first, last; + + memset (setp, 0, sizeof (*setp)); + + do + { + another_round = 0; + if (unformat (input, "%d-%d,", &first, &last)) + { + if (first > 64 || last > 64) + { + barf1: + vlib_cli_output (vm, "range %d-%d invalid", first, last); + return 0; + } + + for (i = first; i <= last; i++) + CPU_SET (i, setp); + another_round = 1; + } + else if (unformat (input, "%d-%d", &first, &last)) + { + if (first > 64 || last > 64) + goto barf1; + + for (i = first; i <= last; i++) + CPU_SET (i, setp); + } + else if (unformat (input, "%d,", &first)) + { + if (first > 64) + { + barf2: + vlib_cli_output (vm, "cpu %d invalid", first); + return 0; + } + CPU_SET (first, setp); + another_round = 1; + } + else if (unformat (input, "%d", &first)) + { + if (first > 64) + goto barf2; + + CPU_SET (first, setp); + } + } + while (another_round); + + rv = sched_setaffinity (0 /* pid, 0 = this proc */ , + sizeof (*setp), setp); + + if (rv < 0) + { + vlib_cli_output (vm, "Couldn't get affinity mask: %s\n", + strerror (errno)); + return 0; + } + return show_affinity (vm, input, cmd); +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (set_affinity_command, static) = { + .path = "set affinity", + .short_help = "Set process cpu affinity", + .function = set_affinity, +}; +/* *INDENT-ON* */ + +static clib_error_t * +vlib_physmem_configure (vlib_main_t * vm, unformat_input_t * input) +{ + physmem_main_t *pm = &physmem_main; + u32 size_in_mb; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "no-huge") || unformat (input, "no-huge-pages")) + pm->no_hugepages = 1; + + else if (unformat (input, "size-in-mb %d", &size_in_mb) || + unformat (input, "size %d", &size_in_mb)) + pm->mem_size = size_in_mb << 20; + else + return unformat_parse_error (input); + } + + unformat_free (input); + return 0; +} + +VLIB_EARLY_CONFIG_FUNCTION (vlib_physmem_configure, "physmem"); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/physmem.h b/src/vlib/unix/physmem.h new file mode 100644 index 00000000..5519a7d6 --- /dev/null +++ b/src/vlib/unix/physmem.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_physmem_h__ +#define __included_physmem_h__ + +/* Manage I/O physical memory. */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include + +#include +#include + +#include /* for open */ +#include /* for flock */ +#include +#include +#include +#include +#include +#include + +typedef struct +{ + /* Virtual memory via mmaped. */ + void *mem; + + /* Size in bytes. */ + uword mem_size; + + /* Heap allocated out of virtual memory. */ + void *heap; + + /* huge TLB segment id */ + int shmid; + + /* should we try to use htlb ? */ + int no_hugepages; + +} physmem_main_t; + +#endif /* __included_physmem_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/plugin.c b/src/vlib/unix/plugin.c new file mode 100644 index 00000000..b3d5be02 --- /dev/null +++ b/src/vlib/unix/plugin.c @@ -0,0 +1,260 @@ +/* + * plugin.c: plugin handling + * + * Copyright (c) 2011 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +plugin_main_t vlib_plugin_main; + +void +vlib_set_get_handoff_structure_cb (void *cb) +{ + plugin_main_t *pm = &vlib_plugin_main; + pm->handoff_structure_get_cb = cb; +} + +static void * +vnet_get_handoff_structure (void) +{ + void *(*fp) (void); + + fp = vlib_plugin_main.handoff_structure_get_cb; + if (fp == 0) + return 0; + else + return (*fp) (); +} + +static int +load_one_plugin (plugin_main_t * pm, plugin_info_t * pi, int from_early_init) +{ + void *handle, *register_handle; + clib_error_t *(*fp) (vlib_main_t *, void *, int); + clib_error_t *error; + void *handoff_structure; + + handle = dlopen ((char *) pi->name, RTLD_LAZY); + + /* + * Note: this can happen if the plugin has an undefined symbol reference, + * so print a warning. Otherwise, the poor slob won't know what happened. + * Ask me how I know that... + */ + if (handle == 0) + { + clib_warning ("%s", dlerror ()); + return -1; + } + + pi->handle = handle; + + + register_handle = dlsym (pi->handle, "vlib_plugin_register"); + if (register_handle == 0) + { + dlclose (handle); + clib_warning ("Plugin missing vlib_plugin_register: %s\n", + (char *) pi->name); + return 1; + } + + fp = register_handle; + + handoff_structure = vnet_get_handoff_structure (); + + if (handoff_structure == 0) + error = clib_error_return (0, "handoff structure callback returned 0"); + else + error = (*fp) (pm->vlib_main, handoff_structure, from_early_init); + + if (error) + { + clib_error_report (error); + dlclose (handle); + return 1; + } + + clib_warning ("Loaded plugin: %s", pi->name); + + return 0; +} + +static u8 ** +split_plugin_path (plugin_main_t * pm) +{ + int i; + u8 **rv = 0; + u8 *path = pm->plugin_path; + u8 *this = 0; + + for (i = 0; i < vec_len (pm->plugin_path); i++) + { + if (path[i] != ':') + { + vec_add1 (this, path[i]); + continue; + } + vec_add1 (this, 0); + vec_add1 (rv, this); + this = 0; + } + if (this) + { + vec_add1 (this, 0); + vec_add1 (rv, this); + } + return rv; +} + +int +vlib_load_new_plugins (plugin_main_t * pm, int from_early_init) +{ + DIR *dp; + struct dirent *entry; + struct stat statb; + uword *p; + plugin_info_t *pi; + u8 **plugin_path; + int i; + + plugin_path = split_plugin_path (pm); + + for (i = 0; i < vec_len (plugin_path); i++) + { + dp = opendir ((char *) plugin_path[i]); + + if (dp == 0) + continue; + + while ((entry = readdir (dp))) + { + u8 *plugin_name; + + if (pm->plugin_name_filter) + { + int j; + for (j = 0; j < vec_len (pm->plugin_name_filter); j++) + if (entry->d_name[j] != pm->plugin_name_filter[j]) + goto next; + } + + plugin_name = format (0, "%s/%s%c", plugin_path[i], + entry->d_name, 0); + + /* Only accept .so */ + char *ext = strrchr ((const char *) plugin_name, '.'); + /* unreadable */ + if (!ext || (strcmp (ext, ".so") != 0) || + stat ((char *) plugin_name, &statb) < 0) + { + ignore: + vec_free (plugin_name); + continue; + } + + /* a dir or other things which aren't plugins */ + if (!S_ISREG (statb.st_mode)) + goto ignore; + + p = hash_get_mem (pm->plugin_by_name_hash, plugin_name); + if (p == 0) + { + vec_add2 (pm->plugin_info, pi, 1); + pi->name = plugin_name; + pi->file_info = statb; + + if (load_one_plugin (pm, pi, from_early_init)) + { + vec_free (plugin_name); + _vec_len (pm->plugin_info) = vec_len (pm->plugin_info) - 1; + continue; + } + memset (pi, 0, sizeof (*pi)); + hash_set_mem (pm->plugin_by_name_hash, plugin_name, + pi - pm->plugin_info); + } + next: + ; + } + closedir (dp); + vec_free (plugin_path[i]); + } + vec_free (plugin_path); + return 0; +} + +char *vlib_plugin_path __attribute__ ((weak)); +char *vlib_plugin_path = ""; +char *vlib_plugin_name_filter __attribute__ ((weak)); +char *vlib_plugin_name_filter = 0; + +int +vlib_plugin_early_init (vlib_main_t * vm) +{ + plugin_main_t *pm = &vlib_plugin_main; + + pm->plugin_path = format (0, "%s%c", vlib_plugin_path, 0); + + clib_warning ("plugin path %s", pm->plugin_path); + + if (vlib_plugin_name_filter) + pm->plugin_name_filter = format (0, "%s%c", vlib_plugin_name_filter, 0); + + pm->plugin_by_name_hash = hash_create_string (0, sizeof (uword)); + pm->vlib_main = vm; + + return vlib_load_new_plugins (pm, 1 /* from_early_init */ ); +} + +static clib_error_t * +vlib_plugins_show_cmd_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + plugin_main_t *pm = &vlib_plugin_main; + u8 *s = 0; + u8 *key = 0; + uword *value = 0; + int index = 1; + + s = format (s, " Plugin path is: %s\n", pm->plugin_path); + if (vlib_plugin_name_filter) + s = format (s, " Plugin filter: %s\n", vlib_plugin_name_filter); + + s = format (s, " Plugins loaded: \n"); + hash_foreach_mem (key, value, pm->plugin_by_name_hash, + { + if (key != 0) + s = format (s, " %d.%s\n", index, key); index++;} + ); + + vlib_cli_output (vm, "%v", s); + vec_free (s); + return 0; +} + +VLIB_CLI_COMMAND (plugins_show_cmd, static) = +{ +.path = "show plugins",.short_help = "show loaded plugins",.function = + vlib_plugins_show_cmd_fn,}; +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/plugin.h b/src/vlib/unix/plugin.h new file mode 100644 index 00000000..c17053bd --- /dev/null +++ b/src/vlib/unix/plugin.h @@ -0,0 +1,98 @@ +/* + * plugin.h: plugin handling + * + * Copyright (c) 2011 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __included_plugin_h__ +#define __included_plugin_h__ + +#include +#include +#include +#include +#include + +/* + * vlib plugin scheme + * + * Almost anything which can be made to work in a vlib unix + * application will also work in a vlib plugin. + * + * The elf-section magic which registers static objects + * works so long as plugins are preset when the vlib unix process + * starts. But wait: there's more... + * + * If an application calls vlib_load_new_plugins() -- possibly after + * changing vlib_plugin_main.plugin_path / vlib_plugin_main.plugin_name_filter, + * -- new plugins will be loaded. That, in turn, allows considerable + * flexibility in terms of adding feature code or fixing bugs without + * requiring the data-plane process to restart. + * + * When the plugin mechanism loads a plugin, it uses dlsym to locate + * and call the plugin's function vlib_plugin_register() if it exists. + * A plugin which expects to be loaded after the vlib application + * starts uses this callback to modify the application. If vlib_plugin_register + * returns non-zero, the plugin mechanism dlclose()'s the plugin. + * + * Applications control the plugin search path and name filter by + * declaring the variables vlib_plugin_path and vlib_plugin_name_filter. + * libvlib_unix.la supplies weak references for these symbols which + * effectively disable the scheme. In order for the elf-section magic to + * work, static plugins must be loaded at the earliest possible moment. + * + * An application can change these parameters at any time and call + * vlib_load_new_plugins(). + */ + + + +typedef struct +{ + u8 *name; + struct stat file_info; + void *handle; +} plugin_info_t; + +typedef struct +{ + /* loaded plugin info */ + plugin_info_t *plugin_info; + uword *plugin_by_name_hash; + + /* path and name filter */ + u8 *plugin_path; + u8 *plugin_name_filter; + + /* handoff structure get callback */ + void *handoff_structure_get_cb; + + /* usual */ + vlib_main_t *vlib_main; +} plugin_main_t; + +extern plugin_main_t vlib_plugin_main; + +int vlib_plugin_early_init (vlib_main_t * vm); +int vlib_load_new_plugins (plugin_main_t * pm, int from_early_init); + +#endif /* __included_plugin_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h new file mode 100644 index 00000000..ea0d417b --- /dev/null +++ b/src/vlib/unix/unix.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * unix.h: Unix specific main state + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_unix_unix_h +#define included_unix_unix_h + +#include +#include + +struct unix_file; +typedef clib_error_t *(unix_file_function_t) (struct unix_file * f); + +typedef struct unix_file +{ + /* Unix file descriptor from open/socket. */ + u32 file_descriptor; + + u32 flags; +#define UNIX_FILE_DATA_AVAILABLE_TO_WRITE (1 << 0) +#define UNIX_FILE_EVENT_EDGE_TRIGGERED (1 << 1) + + /* Data available for function's use. */ + uword private_data; + + /* Functions to be called when read/write data becomes ready. */ + unix_file_function_t *read_function, *write_function, *error_function; +} unix_file_t; + +typedef struct +{ + f64 time; + clib_error_t *error; +} unix_error_history_t; + +typedef enum +{ + UNIX_FILE_UPDATE_ADD, + UNIX_FILE_UPDATE_MODIFY, + UNIX_FILE_UPDATE_DELETE, +} unix_file_update_type_t; + +typedef struct +{ + /* Back pointer to main structure. */ + vlib_main_t *vlib_main; + + u32 flags; + /* Run interactively or as daemon (background process). */ +#define UNIX_FLAG_INTERACTIVE (1 << 0) +#define UNIX_FLAG_NODAEMON (1 << 1) + + /* Pool of files to poll for input/output. */ + unix_file_t *file_pool; + + /* CLI listen socket. */ + clib_socket_t cli_listen_socket; + + void (*file_update) (unix_file_t * file, + unix_file_update_type_t update_type); + + /* Circular buffer of last unix errors. */ + unix_error_history_t error_history[128]; + u32 error_history_index; + u64 n_total_errors; + + /* startup-config filename */ + u8 *startup_config_filename; + + /* unix config complete */ + volatile int unix_config_complete; + + /* CLI log file. GIGO. */ + u8 *log_filename; + int log_fd; + + /* Don't put CLI connections into character mode */ + int cli_line_mode; + + /* Maximum amount of command line history to keep per session */ + u32 cli_history_limit; + + /* Suppress the welcome banner at CLI session start */ + int cli_no_banner; + + /* Maximum pager buffer size */ + u32 cli_pager_buffer_limit; + + /* Suppress the pager */ + int cli_no_pager; + + /* Store the original state of stdin when it's a tty */ + struct termios tio_stdin; + int tio_isset; +} unix_main_t; + +/* Global main structure. */ +extern unix_main_t unix_main; + +always_inline uword +unix_file_add (unix_main_t * um, unix_file_t * template) +{ + unix_file_t *f; + pool_get (um->file_pool, f); + f[0] = template[0]; + um->file_update (f, UNIX_FILE_UPDATE_ADD); + return f - um->file_pool; +} + +always_inline void +unix_file_del (unix_main_t * um, unix_file_t * f) +{ + um->file_update (f, UNIX_FILE_UPDATE_DELETE); + close (f->file_descriptor); + f->file_descriptor = ~0; + pool_put (um->file_pool, f); +} + +always_inline uword +unix_file_set_data_available_to_write (u32 unix_file_index, + uword is_available) +{ + unix_file_t *uf = pool_elt_at_index (unix_main.file_pool, unix_file_index); + uword was_available = (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); + if ((was_available != 0) != (is_available != 0)) + { + uf->flags ^= UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + unix_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY); + } + return was_available != 0; +} + +always_inline void +unix_save_error (unix_main_t * um, clib_error_t * error) +{ + unix_error_history_t *eh = um->error_history + um->error_history_index; + clib_error_free_vector (eh->error); + eh->error = error; + eh->time = vlib_time_now (um->vlib_main); + um->n_total_errors += 1; + if (++um->error_history_index >= ARRAY_LEN (um->error_history)) + um->error_history_index = 0; +} + +/* Main function for Unix VLIB. */ +int vlib_unix_main (int argc, char *argv[]); + +/* Call to allocate/initialize physical DMA memory subsystem. + This is not an init function so that users can explicitly enable/disable + physmem when its not needed. */ +clib_error_t *unix_physmem_init (vlib_main_t * vm, + int fail_if_physical_memory_not_present); + +static inline int +unix_physmem_is_fake (vlib_main_t * vm) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + return vpm->is_fake; +} + +/* Set prompt for CLI. */ +void vlib_unix_cli_set_prompt (char *prompt); + +static inline unix_main_t * +vlib_unix_get_main (void) +{ + return &unix_main; +} + +/* thread stack array; vec_len = max number of threads */ +extern u8 **vlib_thread_stacks; + +/* utils */ + +clib_error_t *vlib_sysfs_write (char *file_name, char *fmt, ...); + +clib_error_t *vlib_sysfs_read (char *file_name, char *fmt, ...); + +u8 *vlib_sysfs_link_to_name (char *link); + +int vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size); + +clib_error_t *foreach_directory_file (char *dir_name, + clib_error_t * (*f) (void *arg, + u8 * path_name, + u8 * file_name), + void *arg, int scan_dirs); + +#endif /* included_unix_unix_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/util.c b/src/vlib/unix/util.c new file mode 100644 index 00000000..edc3e591 --- /dev/null +++ b/src/vlib/unix/util.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * pci.c: Linux user space PCI bus management. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include + +#include +#include +#include +#include + +clib_error_t * +foreach_directory_file (char *dir_name, + clib_error_t * (*f) (void *arg, u8 * path_name, + u8 * file_name), void *arg, + int scan_dirs) +{ + DIR *d; + struct dirent *e; + clib_error_t *error = 0; + u8 *s, *t; + + d = opendir (dir_name); + if (!d) + { + if (errno == ENOENT) + return 0; + return clib_error_return_unix (0, "open `%s'", dir_name); + } + + s = t = 0; + while (1) + { + e = readdir (d); + if (!e) + break; + if (scan_dirs) + { + if (e->d_type == DT_DIR + && (!strcmp (e->d_name, ".") || !strcmp (e->d_name, ".."))) + continue; + } + else + { + if (e->d_type == DT_DIR) + continue; + } + + s = format (s, "%s/%s", dir_name, e->d_name); + t = format (t, "%s", e->d_name); + error = f (arg, s, t); + _vec_len (s) = 0; + _vec_len (t) = 0; + + if (error) + break; + } + + vec_free (s); + closedir (d); + + return error; +} + +clib_error_t * +vlib_sysfs_write (char *file_name, char *fmt, ...) +{ + u8 *s; + int fd; + clib_error_t *error = 0; + + fd = open (file_name, O_WRONLY); + if (fd < 0) + return clib_error_return_unix (0, "open `%s'", file_name); + + va_list va; + va_start (va, fmt); + s = va_format (0, fmt, &va); + va_end (va); + + if (write (fd, s, vec_len (s)) < 0) + error = clib_error_return_unix (0, "write `%s'", file_name); + + vec_free (s); + close (fd); + return error; +} + +clib_error_t * +vlib_sysfs_read (char *file_name, char *fmt, ...) +{ + unformat_input_t input; + u8 *s = 0; + int fd; + ssize_t sz; + uword result; + + fd = open (file_name, O_RDONLY); + if (fd < 0) + return clib_error_return_unix (0, "open `%s'", file_name); + + vec_validate (s, 4095); + + sz = read (fd, s, vec_len (s)); + if (sz < 0) + { + close (fd); + vec_free (s); + return clib_error_return_unix (0, "read `%s'", file_name); + } + + _vec_len (s) = sz; + unformat_init_vector (&input, s); + + va_list va; + va_start (va, fmt); + result = va_unformat (&input, fmt, &va); + va_end (va); + + vec_free (s); + close (fd); + + if (result == 0) + return clib_error_return (0, "unformat error"); + + return 0; +} + +u8 * +vlib_sysfs_link_to_name (char *link) +{ + char *p, buffer[64]; + unformat_input_t in; + u8 *s = 0; + int r; + + r = readlink (link, buffer, sizeof (buffer) - 1); + + if (r < 0) + return 0; + + buffer[r] = 0; + p = strrchr (buffer, '/'); + + if (!p) + return 0; + + unformat_init_string (&in, p + 1, strlen (p + 1)); + if (unformat (&in, "%s", &s) != 1) + clib_unix_warning ("no string?"); + unformat_free (&in); + + return s; +} + +int +vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size) +{ + struct stat sb; + u8 *p = 0; + int r = -1; + + p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); + + if (stat ((char *) p, &sb) == 0) + { + if (S_ISDIR (sb.st_mode) == 0) + goto done; + } + else if (numa_node == 0) + { + vec_reset_length (p); + p = format (p, "/sys/kernel/mm%c", 0); + if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) + goto done; + } + else + goto done; + + _vec_len (p) -= 1; + p = format (p, "/hugepages/hugepages-%ukB/free_hugepages%c", page_size, 0); + vlib_sysfs_read ((char *) p, "%d", &r); + +done: + vec_free (p); + return r; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/vlib.h b/src/vlib/vlib.h new file mode 100644 index 00000000..b146a49b --- /dev/null +++ b/src/vlib/vlib.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * vlib.h: top-level include file + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_h +#define included_vlib_h + +#include +#include + +/* Generic definitions. */ +#include + +/* Forward declarations of structs to avoid circular dependencies. */ +struct vlib_main_t; + +/* All includes in alphabetical order. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Main include depends on other vlib/ includes so we put it last. */ +#include + +/* Inline/extern function declarations. */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#endif /* included_vlib_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/vlib_process_doc.h b/src/vlib/vlib_process_doc.h new file mode 100644 index 00000000..a47c5e4b --- /dev/null +++ b/src/vlib/vlib_process_doc.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#error do not #include this file! + +/** \file + + Cooperative multi-tasking thread support. + + Vlib provides a lightweight cooperative multi-tasking thread + model. Context switching costs a setjmp/longjump pair. It's not + unreasonable to put vlib threads to sleep for 10us. + + The graph node scheduler invokes these processes in much the same + way as traditional vector-processing run-to-completion graph + nodes; plus-or-minus a setjmp/longjmp pair required to switch + stacks. Simply set the vlib_node_registration_t type field to + VLIB_NODE_TYPE_PROCESS. Process is a misnomer; these are threads. + + As of this writing, the default stack size is 2<<15; + 32kb. Initialize the node registration's + process_log2_n_stack_bytes member as needed. The graph node + dispatcher makes some effort to detect stack overrun. We map a + no-access page below each thread stack. + + Process node dispatch functions are expected to be while(1) { } + loops which suspend when not otherwise occupied, and which must + not run for unreasonably long periods of time. Unreasonably long + is an application-dependent concept. Over the years, we have + constructed frame-size sensitive control-plane nodes which will + use a much higher fraction of the available CPU bandwidth when the + frame size is low. Classic example: modifying forwarding + tables. So long as the table-builder leaves the forwarding tables + in a valid state, one can suspend the table builder to avoid + dropping packets as a result of control-plane activity. + + Process nodes can suspend for fixed amounts of time, or until another + entity signals an event, or both. See the example below. + + When running in VLIB process context, one must pay strict attention to + loop invariant issues. If one walks a data structure and calls a + function which may suspend, one had best know by construction that it + cannot change. Often, it s best to simply make a snapshot copy of a + data structure, walk the copy at leisure, then free the copy. + + Here's an example: + +
+    \#define EXAMPLE_POLL_PERIOD 10.0
+
+    static uword
+    example_process (vlib_main_t * vm, vlib_node_runtime_t * rt,
+                     vlib_frame_t * f)
+    {
+      f64 poll_time_remaining;
+      uword event_type, *event_data = 0;
+
+      poll_time_remaining = EXAMPLE_POLL_PERIOD;
+      while (1)
+        {
+          int i;
+
+           // Sleep until next periodic call due,
+           // or until we receive event(s)
+           //
+          poll_time_remaining =
+    	    vlib_process_wait_for_event_or_clock (vm, poll_time_remaining);
+
+          event_type = vlib_process_get_events (vm, &event_data);
+          switch (event_type)
+     	    {
+       	    case ~0:		// no events => timeout
+      	      break;
+
+            case EVENT1:
+    	      for (i = 0; i < vec_len (event_data); i++)
+    	        handle_event1 (mm, event_data[i]);
+    	      break;
+
+    	    case EVENT2:
+    	      for (i = 0; i < vec_len (event_data); i++)
+    	        handle_event2 (vm, event_data[i]);
+    	      break;
+
+              // ... and so forth for each event type
+
+            default:
+              // This should never happen...
+    	      clib_warning ("BUG: unhandled event type %d",
+                            event_type);
+    	      break;
+      	    }
+          vec_reset_length (event_data);
+
+          // Timer expired, call periodic function
+          if (vlib_process_suspend_time_is_zero (poll_time_remaining))
+    	    {
+    	      example_periodic (vm);
+    	      poll_time_remaining = EXAMPLE_POLL_PERIOD;
+    	    }
+        }
+      // NOTREACHED
+      return 0;
+    }
+
+    static VLIB_REGISTER_NODE (example_node) = {
+      .function = example_process,
+      .type = VLIB_NODE_TYPE_PROCESS,
+      .name = "example-process",
+    };
+    
+ + In this example, the VLIB process node waits for an event to + occur, or for 10 seconds to elapse. The code demuxes on the event + type, calling the appropriate handler function. + + Each call to vlib_process_get_events returns a vector of + per-event-type data passed to successive vlib_process_signal_event + calls; vec_len (event_data) >= 1. It is an error to process only + event_data[0]. + + Resetting the event_data vector-length to 0 by calling + vec_reset_length (event_data) - instead of calling vec_free (...) + - means that the event scheme doesn t burn cycles continuously + allocating and freeing the event data vector. This is a common + coding pattern, well worth using when appropriate. +*/ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ -- cgit 1.2.3-korg From 0eca04d84e39ca16c60bea91d77a1c458300c16c Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Tue, 3 Jan 2017 20:11:35 +0100 Subject: vlib: merge libvlib_unix.so into libvlib.so Change-Id: Iedbee427d838794c4b26db5bd07b8d27aad9fcd8 Signed-off-by: Damjan Marion --- src/vlib.am | 15 ++++++--------- src/vlib/unix/plugin.h | 2 +- src/vnet.am | 3 +-- src/vpp-api-test.am | 3 +-- src/vpp.am | 1 - 5 files changed, 9 insertions(+), 15 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib.am b/src/vlib.am index 2464076f..9c3b4d5b 100644 --- a/src/vlib.am +++ b/src/vlib.am @@ -12,14 +12,13 @@ # limitations under the License. -libvlibdir = ${libdir} -libvlib_LTLIBRARIES = libvlib.la -libvlib_la_LIBAD = libvppinfra.la +lib_LTLIBRARIES += libvlib.la +libvlib_la_LIBADD = libvppinfra.la -ldl -lpthread BUILT_SOURCES += vlib/config.h vlib/config.h: - echo "#define __PRE_DATA_SIZE" @PRE_DATA_SIZE@ > $@ + @echo "#define __PRE_DATA_SIZE" @PRE_DATA_SIZE@ > $@ libvlib_la_SOURCES = \ vlib/buffer.c \ @@ -69,9 +68,7 @@ nobase_include_HEADERS += \ vlib/trace.h \ vlib/vlib.h -libvlib_LTLIBRARIES += libvlib_unix.la - -libvlib_unix_la_SOURCES = \ +libvlib_la_SOURCES += \ vlib/unix/cj.c \ vlib/unix/cli.c \ vlib/unix/input.c \ @@ -88,7 +85,7 @@ nobase_include_HEADERS += \ vlib/unix/physmem.h \ vlib/unix/plugin.h \ vlib/unix/unix.h - + if !WITH_DPDK noinst_PROGRAMS += vlib_unix @@ -96,7 +93,7 @@ vlib_unix_SOURCES = \ examples/vlib/main_stub.c \ examples/vlib/mc_test.c -vlib_unix_LDADD = libvlib_unix.la libvlib.la \ +vlib_unix_LDADD = libvlib.la \ libvppinfra.la -lpthread -lm -ldl -lrt endif diff --git a/src/vlib/unix/plugin.h b/src/vlib/unix/plugin.h index c17053bd..f35c9c5d 100644 --- a/src/vlib/unix/plugin.h +++ b/src/vlib/unix/plugin.h @@ -48,7 +48,7 @@ * * Applications control the plugin search path and name filter by * declaring the variables vlib_plugin_path and vlib_plugin_name_filter. - * libvlib_unix.la supplies weak references for these symbols which + * libvlib.la supplies weak references for these symbols which * effectively disable the scheme. In order for the elf-section magic to * work, static plugins must be loaded at the earliest possible moment. * diff --git a/src/vnet.am b/src/vnet.am index 03233487..1baa1d25 100644 --- a/src/vnet.am +++ b/src/vnet.am @@ -440,7 +440,7 @@ test_map_SOURCES = \ test_map_CPPFLAGS = $(AM_CPPFLAGS) -DCLIB_DEBUG test_map_LDADD = libvnet.la libvppinfra.la libvlib.la \ - -lpthread -lvlibmemory -lvlibapi -lvlib_unix \ + -lpthread -lvlibmemory -lvlibapi \ -ldl -lsvm -lrt test_map_LDFLAGS = -static @@ -566,7 +566,6 @@ LDS = \ libvppinfra.la \ libvnet.la \ libvlib.la \ - libvlib_unix.la \ libsvm.la \ libsvmdb.la \ libvlibapi.la \ diff --git a/src/vpp-api-test.am b/src/vpp-api-test.am index fe25f6e1..66d223c0 100644 --- a/src/vpp-api-test.am +++ b/src/vpp-api-test.am @@ -33,8 +33,7 @@ vpp_json_test_SOURCES = \ vpp_api_test_LDFLAGS = $(DPDK_LD_FLAGS) vpp_api_test_LDADD = \ - libvlib.la \ - libvlib_unix.la \ + libvlib.la \ libvlibmemoryclient.la \ libsvm.la \ libvatplugin.la \ diff --git a/src/vpp.am b/src/vpp.am index 5128e8ac..77010e1f 100644 --- a/src/vpp.am +++ b/src/vpp.am @@ -93,7 +93,6 @@ vpp_plugin_configure: bin_vpp_LDADD = \ libvlibapi.la \ libvlibmemory.la \ - libvlib_unix.la \ libvlib.la \ libvnet.la \ libsvm.la \ -- cgit 1.2.3-korg From f935e336db1282c1ec67cf703db2d3cfaec32915 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Fri, 6 Jan 2017 14:33:05 +0100 Subject: plugin: add API to get pointer to symbol in different plugin Change-Id: Ic2fbbd8227d5d0c033e5d7b5f43b859a4889d2f3 Signed-off-by: Damjan Marion --- src/vlib/unix/plugin.c | 31 ++++++++++++++++++++++++------- src/vlib/unix/plugin.h | 2 ++ 2 files changed, 26 insertions(+), 7 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/unix/plugin.c b/src/vlib/unix/plugin.c index b3d5be02..987f7d05 100644 --- a/src/vlib/unix/plugin.c +++ b/src/vlib/unix/plugin.c @@ -40,6 +40,20 @@ vnet_get_handoff_structure (void) return (*fp) (); } +void * +vlib_get_plugin_symbol (char *plugin_name, char *symbol_name) +{ + plugin_main_t *pm = &vlib_plugin_main; + uword *p; + plugin_info_t *pi; + + if ((p = hash_get_mem (pm->plugin_by_name_hash, plugin_name)) == 0) + return 0; + + pi = vec_elt_at_index (pm->plugin_info, p[0]); + return dlsym (pi->handle, symbol_name); +} + static int load_one_plugin (plugin_main_t * pm, plugin_info_t * pi, int from_early_init) { @@ -48,7 +62,7 @@ load_one_plugin (plugin_main_t * pm, plugin_info_t * pi, int from_early_init) clib_error_t *error; void *handoff_structure; - handle = dlopen ((char *) pi->name, RTLD_LAZY); + handle = dlopen ((char *) pi->filename, RTLD_LAZY); /* * Note: this can happen if the plugin has an undefined symbol reference, @@ -144,6 +158,7 @@ vlib_load_new_plugins (plugin_main_t * pm, int from_early_init) while ((entry = readdir (dp))) { u8 *plugin_name; + u8 *filename; if (pm->plugin_name_filter) { @@ -153,17 +168,16 @@ vlib_load_new_plugins (plugin_main_t * pm, int from_early_init) goto next; } - plugin_name = format (0, "%s/%s%c", plugin_path[i], - entry->d_name, 0); + filename = format (0, "%s/%s%c", plugin_path[i], entry->d_name, 0); /* Only accept .so */ - char *ext = strrchr ((const char *) plugin_name, '.'); + char *ext = strrchr ((const char *) filename, '.'); /* unreadable */ if (!ext || (strcmp (ext, ".so") != 0) || - stat ((char *) plugin_name, &statb) < 0) + stat ((char *) filename, &statb) < 0) { ignore: - vec_free (plugin_name); + vec_free (filename); continue; } @@ -171,20 +185,23 @@ vlib_load_new_plugins (plugin_main_t * pm, int from_early_init) if (!S_ISREG (statb.st_mode)) goto ignore; + plugin_name = format (0, "%s%c", entry->d_name, 0); p = hash_get_mem (pm->plugin_by_name_hash, plugin_name); if (p == 0) { vec_add2 (pm->plugin_info, pi, 1); pi->name = plugin_name; + pi->filename = filename; pi->file_info = statb; if (load_one_plugin (pm, pi, from_early_init)) { vec_free (plugin_name); + vec_free (filename); _vec_len (pm->plugin_info) = vec_len (pm->plugin_info) - 1; + memset (pi, 0, sizeof (*pi)); continue; } - memset (pi, 0, sizeof (*pi)); hash_set_mem (pm->plugin_by_name_hash, plugin_name, pi - pm->plugin_info); } diff --git a/src/vlib/unix/plugin.h b/src/vlib/unix/plugin.h index f35c9c5d..1c74cdd2 100644 --- a/src/vlib/unix/plugin.h +++ b/src/vlib/unix/plugin.h @@ -61,6 +61,7 @@ typedef struct { u8 *name; + u8 *filename; struct stat file_info; void *handle; } plugin_info_t; @@ -86,6 +87,7 @@ extern plugin_main_t vlib_plugin_main; int vlib_plugin_early_init (vlib_main_t * vm); int vlib_load_new_plugins (plugin_main_t * pm, int from_early_init); +void *vlib_get_plugin_symbol (char *plugin_name, char *symbol_name); #endif /* __included_plugin_h__ */ -- cgit 1.2.3-korg From 878c609889dcdc58538d40d8b3f662320f88573d Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 4 Jan 2017 13:19:27 +0100 Subject: vlib: add buffer and thread callbacks Change-Id: I8e2e8f94a884ab2f9909d0c83ba00edd38cdab77 Signed-off-by: Damjan Marion --- src/plugins/flowperpkt/flowperpkt.c | 2 +- src/vlib.am | 1 + src/vlib/buffer.c | 736 +++--------------------------------- src/vlib/buffer.h | 45 ++- src/vlib/buffer_funcs.h | 82 ++-- src/vlib/buffer_serialize.c | 248 ++++++++++++ src/vlib/main.c | 7 +- src/vlib/threads.c | 112 +++--- src/vlib/threads.h | 17 +- src/vlib/threads_cli.c | 25 -- src/vlib/unix/physmem.c | 15 +- src/vnet.am | 2 + src/vnet/devices/dpdk/buffer.c | 729 +++++++++++++++++++++++++++++++++++ src/vnet/devices/dpdk/cli.c | 4 +- src/vnet/devices/dpdk/device.c | 7 +- src/vnet/devices/dpdk/dpdk.h | 3 + src/vnet/devices/dpdk/dpdk_priv.h | 3 + src/vnet/devices/dpdk/init.c | 6 +- src/vnet/devices/dpdk/thread.c | 85 +++++ src/vnet/sr/sr_replicate.c | 7 +- 20 files changed, 1304 insertions(+), 832 deletions(-) create mode 100644 src/vlib/buffer_serialize.c create mode 100644 src/vnet/devices/dpdk/buffer.c create mode 100644 src/vnet/devices/dpdk/thread.c (limited to 'src/vlib') diff --git a/src/plugins/flowperpkt/flowperpkt.c b/src/plugins/flowperpkt/flowperpkt.c index fb71d5b0..cc351599 100644 --- a/src/plugins/flowperpkt/flowperpkt.c +++ b/src/plugins/flowperpkt/flowperpkt.c @@ -643,7 +643,7 @@ flowperpkt_init (vlib_main_t * vm) vec_free (name); /* Decide how many worker threads we have */ - num_threads = 1 /* main thread */ + tm->n_eal_threads; + num_threads = 1 /* main thread */ + tm->n_threads; /* Allocate per worker thread vectors */ vec_validate (fm->ipv4_buffers_per_worker, num_threads - 1); diff --git a/src/vlib.am b/src/vlib.am index 0154d841..c21f88c4 100644 --- a/src/vlib.am +++ b/src/vlib.am @@ -23,6 +23,7 @@ vlib/config.h: libvlib_la_SOURCES = \ vlib/buffer.c \ + vlib/buffer_serialize.c \ vlib/cli.c \ vlib/cli.h \ vlib/config.h \ diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index 4bf6d125..0b0e6054 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -38,50 +38,13 @@ */ /** - * @cond (!DPDK) * @file * * Allocate/free network buffers. */ -#if DPDK > 0 -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#endif - #include -#if DPDK > 0 -#pragma weak rte_mem_virt2phy -#pragma weak rte_eal_has_hugepages -#pragma weak rte_socket_id -#pragma weak rte_pktmbuf_pool_create -#endif - uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, vlib_buffer_t * b_first) @@ -103,7 +66,6 @@ u8 * format_vlib_buffer (u8 * s, va_list * args) { vlib_buffer_t *b = va_arg (*args, vlib_buffer_t *); -#if DPDK > 0 uword indent = format_get_indent (s); s = format (s, "current data %d, length %d, free-list %d", @@ -126,18 +88,6 @@ format_vlib_buffer (u8 * s, va_list * args) format_white_space, indent, next_buffer, b->current_length); } -#else - - s = format (s, "current data %d, length %d, free-list %d", - b->current_data, b->current_length, b->free_list_index); - - if (b->flags & VLIB_BUFFER_IS_TRACED) - s = format (s, ", trace 0x%x", b->trace_index); - - if (b->flags & VLIB_BUFFER_NEXT_PRESENT) - s = format (s, ", next-buffer 0x%x", b->next_buffer); -#endif - return s; } @@ -153,7 +103,6 @@ format_vlib_buffer_and_data (u8 * s, va_list * args) return s; } -#if DPDK == 0 static u8 * format_vlib_buffer_known_state (u8 * s, va_list * args) { @@ -181,7 +130,6 @@ format_vlib_buffer_known_state (u8 * s, va_list * args) return format (s, "%s", t); } -#endif u8 * format_vlib_buffer_contents (u8 * s, va_list * va) @@ -200,7 +148,6 @@ format_vlib_buffer_contents (u8 * s, va_list * va) return s; } -#if DPDK == 0 static u8 * vlib_validate_buffer_helper (vlib_main_t * vm, u32 bi, @@ -217,11 +164,10 @@ vlib_validate_buffer_helper (vlib_main_t * vm, if ((signed) b->current_data < (signed) -VLIB_BUFFER_PRE_DATA_SIZE) return format (0, "current data %d before pre-data", b->current_data); -#if DPDK == 0 + if (b->current_data + b->current_length > fl->n_data_bytes) return format (0, "%d-%d beyond end of buffer %d", b->current_data, b->current_length, fl->n_data_bytes); -#endif if (follow_buffer_next && (b->flags & VLIB_BUFFER_NEXT_PRESENT)) { @@ -311,14 +257,12 @@ done: hash_free (hash); return result; } -#endif vlib_main_t **vlib_mains; -#if DPDK == 0 /* When dubugging validate that given buffers are either known allocated or known free. */ -static void +static void __attribute__ ((unused)) vlib_buffer_validate_alloc_free (vlib_main_t * vm, u32 * buffers, uword n_buffers, @@ -359,7 +303,6 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm, is_free ? VLIB_BUFFER_KNOWN_FREE : VLIB_BUFFER_KNOWN_ALLOCATED); } } -#endif #define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32)) @@ -463,7 +406,6 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, { vlib_buffer_main_t *bm = vm->buffer_main; vlib_buffer_free_list_t *f; -#if DPDK > 0 int i; ASSERT (os_get_cpu_number () == 0); @@ -519,47 +461,6 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, wf->unaligned_buffers = 0; wf->n_alloc = 0; } -#else - - if (!is_default && pool_elts (bm->buffer_free_list_pool) == 0) - { - u32 default_free_free_list_index; - - default_free_free_list_index = vlib_buffer_create_free_list_helper (vm, - /* default buffer size */ - VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, - /* is_public */ - 1, - /* is_default */ - 1, - (u8 - *) - "default"); - ASSERT (default_free_free_list_index == - VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); - - if (n_data_bytes == VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES && is_public) - return default_free_free_list_index; - } - - pool_get_aligned (bm->buffer_free_list_pool, f, CLIB_CACHE_LINE_BYTES); - - memset (f, 0, sizeof (f[0])); - f->index = f - bm->buffer_free_list_pool; - f->n_data_bytes = vlib_buffer_round_size (n_data_bytes); - f->min_n_buffers_each_physmem_alloc = 256; - f->name = clib_mem_is_heap_object (name) ? name : format (0, "%s", name); - - /* Setup free buffer template. */ - f->buffer_init_template.free_list_index = f->index; - - if (is_public) - { - uword *p = hash_get (bm->free_list_by_size, f->n_data_bytes); - if (!p) - hash_set (bm->free_list_by_size, f->n_data_bytes, f->index); - } -#endif return f->index; } @@ -609,50 +510,30 @@ static void del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) { u32 i; -#if DPDK > 0 - struct rte_mbuf *mb; - vlib_buffer_t *b; - - for (i = 0; i < vec_len (f->unaligned_buffers); i++) - { - b = vlib_get_buffer (vm, f->unaligned_buffers[i]); - mb = rte_mbuf_from_vlib_buffer (b); - ASSERT (rte_mbuf_refcnt_read (mb) == 1); - rte_pktmbuf_free (mb); - } - for (i = 0; i < vec_len (f->aligned_buffers); i++) - { - b = vlib_get_buffer (vm, f->aligned_buffers[i]); - mb = rte_mbuf_from_vlib_buffer (b); - ASSERT (rte_mbuf_refcnt_read (mb) == 1); - rte_pktmbuf_free (mb); - } - vec_free (f->name); -#else for (i = 0; i < vec_len (f->buffer_memory_allocated); i++) vm->os_physmem_free (f->buffer_memory_allocated[i]); vec_free (f->name); vec_free (f->buffer_memory_allocated); -#endif vec_free (f->unaligned_buffers); vec_free (f->aligned_buffers); } /* Add buffer free list. */ void -vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) +vlib_buffer_delete_free_list_internal (vlib_main_t * vm, u32 free_list_index) { vlib_buffer_main_t *bm = vm->buffer_main; vlib_buffer_free_list_t *f; u32 merge_index; -#if DPDK > 0 int i; ASSERT (os_get_cpu_number () == 0); f = vlib_buffer_get_free_list (vm, free_list_index); + ASSERT (vec_len (f->unaligned_buffers) + vec_len (f->aligned_buffers) == + f->n_alloc); merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); if (merge_index != ~0 && merge_index != free_list_index) { @@ -674,26 +555,6 @@ vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) memset (f, 0xab, sizeof (f[0])); pool_put (bm->buffer_free_list_pool, f); } -#else - - f = vlib_buffer_get_free_list (vm, free_list_index); - - ASSERT (vec_len (f->unaligned_buffers) + vec_len (f->aligned_buffers) == - f->n_alloc); - merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); - if (merge_index != ~0 && merge_index != free_list_index) - { - merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool, - merge_index), f); - } - - del_free_list (vm, f); - - /* Poison it. */ - memset (f, 0xab, sizeof (f[0])); - - pool_put (bm->buffer_free_list_pool, f); -#endif } /* Make sure free list has at least given number of free buffers. */ @@ -701,63 +562,6 @@ static uword fill_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * fl, uword min_free_buffers) { -#if DPDK > 0 - vlib_buffer_t *b; - int n, i; - u32 bi; - u32 n_remaining = 0, n_alloc = 0; - unsigned socket_id = rte_socket_id ? rte_socket_id () : 0; - struct rte_mempool *rmp = vm->buffer_main->pktmbuf_pools[socket_id]; - struct rte_mbuf *mb; - - /* Too early? */ - if (PREDICT_FALSE (rmp == 0)) - return 0; - - trim_aligned (fl); - - /* Already have enough free buffers on free list? */ - n = min_free_buffers - vec_len (fl->aligned_buffers); - if (n <= 0) - return min_free_buffers; - - /* Always allocate round number of buffers. */ - n = round_pow2 (n, BUFFERS_PER_COPY); - - /* Always allocate new buffers in reasonably large sized chunks. */ - n = clib_max (n, fl->min_n_buffers_each_physmem_alloc); - - vec_validate (vm->mbuf_alloc_list, n - 1); - - if (rte_mempool_get_bulk (rmp, vm->mbuf_alloc_list, n) < 0) - return 0; - - _vec_len (vm->mbuf_alloc_list) = n; - - for (i = 0; i < n; i++) - { - mb = vm->mbuf_alloc_list[i]; - - ASSERT (rte_mbuf_refcnt_read (mb) == 0); - rte_mbuf_refcnt_set (mb, 1); - - b = vlib_buffer_from_rte_mbuf (mb); - bi = vlib_get_buffer_index (vm, b); - - vec_add1_aligned (fl->aligned_buffers, bi, sizeof (vlib_copy_unit_t)); - n_alloc++; - n_remaining--; - - vlib_buffer_init_for_free_list (b, fl); - - if (fl->buffer_init_function) - fl->buffer_init_function (vm, fl, &bi, 1); - } - - fl->n_alloc += n; - - return n; -#else vlib_buffer_t *buffers, *b; int n, n_bytes, i; u32 *bi; @@ -824,7 +628,6 @@ fill_free_list (vlib_main_t * vm, fl->buffer_init_function (vm, fl, bi, n_this_chunk); } return n_alloc; -#endif } always_inline uword @@ -833,6 +636,7 @@ copy_alignment (u32 * x) return (pointer_to_uword (x) / sizeof (x[0])) % BUFFERS_PER_COPY; } + static u32 alloc_from_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * free_list, @@ -842,10 +646,6 @@ alloc_from_free_list (vlib_main_t * vm, uword u_len, n_left; uword n_unaligned_start, n_unaligned_end, n_filled; -#if DPDK == 0 - ASSERT (os_get_cpu_number () == 0); - -#endif n_left = n_alloc_buffers; dst = alloc_buffers; n_unaligned_start = ((BUFFERS_PER_COPY - copy_alignment (dst)) @@ -945,25 +745,21 @@ alloc_from_free_list (vlib_main_t * vm, else _vec_len (free_list->unaligned_buffers) = u_len; -#if DPDK == 0 /* Verify that buffers are known free. */ vlib_buffer_validate_alloc_free (vm, alloc_buffers, n_alloc_buffers, VLIB_BUFFER_KNOWN_FREE); -#endif return n_alloc_buffers; } + /* Allocate a given number of buffers into given array. Returns number actually allocated which will be either zero or number requested. */ -u32 -vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +static u32 +vlib_buffer_alloc_internal (vlib_main_t * vm, u32 * buffers, u32 n_buffers) { vlib_buffer_main_t *bm = vm->buffer_main; -#if DPDK == 0 - ASSERT (os_get_cpu_number () == 0); -#endif return alloc_from_free_list (vm, @@ -972,10 +768,10 @@ vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers) buffers, n_buffers); } -u32 -vlib_buffer_alloc_from_free_list (vlib_main_t * vm, - u32 * buffers, - u32 n_buffers, u32 free_list_index) +static u32 +vlib_buffer_alloc_from_free_list_internal (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, u32 free_list_index) { vlib_buffer_main_t *bm = vm->buffer_main; vlib_buffer_free_list_t *f; @@ -1016,81 +812,10 @@ vlib_set_buffer_free_callback (vlib_main_t * vm, void *fp) return rv; } -#if DPDK == 0 -void vnet_buffer_free_dpdk_mb (vlib_buffer_t * b) __attribute__ ((weak)); -void -vnet_buffer_free_dpdk_mb (vlib_buffer_t * b) -{ -} - -#endif static_always_inline void vlib_buffer_free_inline (vlib_main_t * vm, u32 * buffers, u32 n_buffers, u32 follow_buffer_next) { -#if DPDK > 0 - vlib_buffer_main_t *bm = vm->buffer_main; - vlib_buffer_free_list_t *fl; - u32 fi; - int i; - u32 (*cb) (vlib_main_t * vm, u32 * buffers, u32 n_buffers, - u32 follow_buffer_next); - - cb = bm->buffer_free_callback; - - if (PREDICT_FALSE (cb != 0)) - n_buffers = (*cb) (vm, buffers, n_buffers, follow_buffer_next); - - if (!n_buffers) - return; - - for (i = 0; i < n_buffers; i++) - { - vlib_buffer_t *b; - struct rte_mbuf *mb; - - b = vlib_get_buffer (vm, buffers[i]); - - fl = buffer_get_free_list (vm, b, &fi); - - /* The only current use of this callback: multicast recycle */ - if (PREDICT_FALSE (fl->buffers_added_to_freelist_function != 0)) - { - int j; - - add_buffer_to_free_list - (vm, fl, buffers[i], (b->flags & VLIB_BUFFER_RECYCLE) == 0); - - for (j = 0; j < vec_len (bm->announce_list); j++) - { - if (fl == bm->announce_list[j]) - goto already_announced; - } - vec_add1 (bm->announce_list, fl); - already_announced: - ; - } - else - { - if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_RECYCLE) == 0)) - { - mb = rte_mbuf_from_vlib_buffer (b); - ASSERT (rte_mbuf_refcnt_read (mb) == 1); - rte_pktmbuf_free (mb); - } - } - } - if (vec_len (bm->announce_list)) - { - vlib_buffer_free_list_t *fl; - for (i = 0; i < vec_len (bm->announce_list); i++) - { - fl = bm->announce_list[i]; - fl->buffers_added_to_freelist_function (vm, fl); - } - _vec_len (bm->announce_list) = 0; - } -#else vlib_buffer_main_t *bm = vm->buffer_main; vlib_buffer_free_list_t *fl; static u32 *next_to_free[2]; /* smp bad */ @@ -1315,26 +1040,25 @@ again: } _vec_len (announce_list) = 0; } -#endif } -void -vlib_buffer_free (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +static void +vlib_buffer_free_internal (vlib_main_t * vm, u32 * buffers, u32 n_buffers) { vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ 1); } -void -vlib_buffer_free_no_next (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +static void +vlib_buffer_free_no_next_internal (vlib_main_t * vm, u32 * buffers, + u32 n_buffers) { vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ 0); } -#if DPDK == 0 /* Copy template packet data into buffers as they are allocated. */ -static void +static void __attribute__ ((unused)) vlib_packet_template_buffer_init (vlib_main_t * vm, vlib_buffer_free_list_t * fl, u32 * buffers, u32 n_buffers) @@ -1352,7 +1076,6 @@ vlib_packet_template_buffer_init (vlib_main_t * vm, b->current_length); } } -#endif void vlib_packet_template_init (vlib_main_t * vm, @@ -1362,28 +1085,22 @@ vlib_packet_template_init (vlib_main_t * vm, uword min_n_buffers_each_physmem_alloc, char *fmt, ...) { -#if DPDK > 0 + vlib_buffer_main_t *bm = vm->buffer_main; va_list va; __attribute__ ((unused)) u8 *name; + vlib_buffer_free_list_t *fl; va_start (va, fmt); name = va_format (0, fmt, &va); va_end (va); - vlib_worker_thread_barrier_sync (vm); - memset (t, 0, sizeof (t[0])); - - vec_add (t->packet_data, packet_data, n_packet_data_bytes); + if (bm->cb.vlib_packet_template_init_cb) + bm->cb.vlib_packet_template_init_cb (vm, (void *) t, packet_data, + n_packet_data_bytes, + min_n_buffers_each_physmem_alloc, + name); - vlib_worker_thread_barrier_release (vm); -#else - vlib_buffer_free_list_t *fl; - va_list va; - u8 *name; - - va_start (va, fmt); - name = va_format (0, fmt, &va); - va_end (va); + vlib_worker_thread_barrier_sync (vm); memset (t, 0, sizeof (t[0])); @@ -1406,7 +1123,7 @@ vlib_packet_template_init (vlib_main_t * vm, fl->buffer_init_template.current_data = 0; fl->buffer_init_template.current_length = n_packet_data_bytes; fl->buffer_init_template.flags = 0; -#endif + vlib_worker_thread_barrier_release (vm); } void * @@ -1429,7 +1146,6 @@ vlib_packet_template_get_packet (vlib_main_t * vm, return b->data; } -#if DPDK == 0 void vlib_packet_template_get_packet_helper (vlib_main_t * vm, vlib_packet_template_t * t) @@ -1447,7 +1163,6 @@ vlib_packet_template_get_packet_helper (vlib_main_t * vm, _vec_len (t->free_buffers) = n_alloc; } -#endif /* Append given data to end of buffer, possibly allocating new buffers. */ u32 vlib_buffer_add_data (vlib_main_t * vm, @@ -1541,328 +1256,11 @@ vlib_buffer_chain_append_data_with_alloc (vlib_main_t * vm, return copied; } -#if DPDK > 0 -clib_error_t * -vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, - unsigned socket_id) -{ - vlib_buffer_main_t *bm = vm->buffer_main; - vlib_physmem_main_t *vpm = &vm->physmem_main; - struct rte_mempool *rmp; - int i; - - if (!rte_pktmbuf_pool_create) - return clib_error_return (0, "not linked with DPDK"); - - vec_validate_aligned (bm->pktmbuf_pools, socket_id, CLIB_CACHE_LINE_BYTES); - - /* pool already exists, nothing to do */ - if (bm->pktmbuf_pools[socket_id]) - return 0; - - u8 *pool_name = format (0, "mbuf_pool_socket%u%c", socket_id, 0); - - rmp = rte_pktmbuf_pool_create ((char *) pool_name, /* pool name */ - num_mbufs, /* number of mbufs */ - 512, /* cache size */ - VLIB_BUFFER_HDR_SIZE, /* priv size */ - VLIB_BUFFER_PRE_DATA_SIZE + VLIB_BUFFER_DATA_SIZE, /* dataroom size */ - socket_id); /* cpu socket */ - - if (rmp) - { - { - uword this_pool_end; - uword this_pool_start; - uword this_pool_size; - uword save_vpm_start, save_vpm_end, save_vpm_size; - struct rte_mempool_memhdr *memhdr; - - this_pool_start = ~0ULL; - this_pool_end = 0LL; - - STAILQ_FOREACH (memhdr, &rmp->mem_list, next) - { - if (((uword) (memhdr->addr + memhdr->len)) > this_pool_end) - this_pool_end = (uword) (memhdr->addr + memhdr->len); - if (((uword) memhdr->addr) < this_pool_start) - this_pool_start = (uword) (memhdr->addr); - } - ASSERT (this_pool_start < ~0ULL && this_pool_end > 0); - this_pool_size = this_pool_end - this_pool_start; - - if (CLIB_DEBUG > 1) - { - clib_warning ("%s: pool start %llx pool end %llx pool size %lld", - pool_name, this_pool_start, this_pool_end, - this_pool_size); - clib_warning - ("before: virtual.start %llx virtual.end %llx virtual.size %lld", - vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); - } - - save_vpm_start = vpm->virtual.start; - save_vpm_end = vpm->virtual.end; - save_vpm_size = vpm->virtual.size; - - if ((this_pool_start < vpm->virtual.start) || vpm->virtual.start == 0) - vpm->virtual.start = this_pool_start; - if (this_pool_end > vpm->virtual.end) - vpm->virtual.end = this_pool_end; - - vpm->virtual.size = vpm->virtual.end - vpm->virtual.start; - - if (CLIB_DEBUG > 1) - { - clib_warning - ("after: virtual.start %llx virtual.end %llx virtual.size %lld", - vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); - } - - /* check if fits into buffer index range */ - if ((u64) vpm->virtual.size > - ((u64) 1 << (32 + CLIB_LOG2_CACHE_LINE_BYTES))) - { - clib_warning ("physmem: virtual size out of range!"); - vpm->virtual.start = save_vpm_start; - vpm->virtual.end = save_vpm_end; - vpm->virtual.size = save_vpm_size; - rmp = 0; - } - } - if (rmp) - { - bm->pktmbuf_pools[socket_id] = rmp; - vec_free (pool_name); - return 0; - } - } - - vec_free (pool_name); - - /* no usable pool for this socket, try to use pool from another one */ - for (i = 0; i < vec_len (bm->pktmbuf_pools); i++) - { - if (bm->pktmbuf_pools[i]) - { - clib_warning - ("WARNING: Failed to allocate mempool for CPU socket %u. " - "Threads running on socket %u will use socket %u mempool.", - socket_id, socket_id, i); - bm->pktmbuf_pools[socket_id] = bm->pktmbuf_pools[i]; - return 0; - } - } - - return clib_error_return (0, "failed to allocate mempool on socket %u", - socket_id); -} -#endif - -static void -vlib_serialize_tx (serialize_main_header_t * m, serialize_stream_t * s) -{ - vlib_main_t *vm; - vlib_serialize_buffer_main_t *sm; - uword n, n_bytes_to_write; - vlib_buffer_t *last; - - n_bytes_to_write = s->current_buffer_index; - sm = - uword_to_pointer (s->data_function_opaque, - vlib_serialize_buffer_main_t *); - vm = sm->vlib_main; - - ASSERT (sm->tx.max_n_data_bytes_per_chain > 0); - if (serialize_stream_is_end_of_stream (s) - || sm->tx.n_total_data_bytes + n_bytes_to_write > - sm->tx.max_n_data_bytes_per_chain) - { - vlib_process_t *p = vlib_get_current_process (vm); - - last = vlib_get_buffer (vm, sm->last_buffer); - last->current_length = n_bytes_to_write; - - vlib_set_next_frame_buffer (vm, &p->node_runtime, sm->tx.next_index, - sm->first_buffer); - - sm->first_buffer = sm->last_buffer = ~0; - sm->tx.n_total_data_bytes = 0; - } - - else if (n_bytes_to_write == 0 && s->n_buffer_bytes == 0) - { - ASSERT (sm->first_buffer == ~0); - ASSERT (sm->last_buffer == ~0); - n = - vlib_buffer_alloc_from_free_list (vm, &sm->first_buffer, 1, - sm->tx.free_list_index); - if (n != 1) - serialize_error (m, - clib_error_create - ("vlib_buffer_alloc_from_free_list fails")); - sm->last_buffer = sm->first_buffer; - s->n_buffer_bytes = - vlib_buffer_free_list_buffer_size (vm, sm->tx.free_list_index); - } - - if (n_bytes_to_write > 0) - { - vlib_buffer_t *prev = vlib_get_buffer (vm, sm->last_buffer); - n = - vlib_buffer_alloc_from_free_list (vm, &sm->last_buffer, 1, - sm->tx.free_list_index); - if (n != 1) - serialize_error (m, - clib_error_create - ("vlib_buffer_alloc_from_free_list fails")); - sm->tx.n_total_data_bytes += n_bytes_to_write; - prev->current_length = n_bytes_to_write; - prev->next_buffer = sm->last_buffer; - prev->flags |= VLIB_BUFFER_NEXT_PRESENT; - } - - if (sm->last_buffer != ~0) - { - last = vlib_get_buffer (vm, sm->last_buffer); - s->buffer = vlib_buffer_get_current (last); - s->current_buffer_index = 0; - ASSERT (last->current_data == s->current_buffer_index); - } -} - -static void -vlib_serialize_rx (serialize_main_header_t * m, serialize_stream_t * s) -{ - vlib_main_t *vm; - vlib_serialize_buffer_main_t *sm; - vlib_buffer_t *last; - - sm = - uword_to_pointer (s->data_function_opaque, - vlib_serialize_buffer_main_t *); - vm = sm->vlib_main; - - if (serialize_stream_is_end_of_stream (s)) - return; - - if (sm->last_buffer != ~0) - { - last = vlib_get_buffer (vm, sm->last_buffer); - - if (last->flags & VLIB_BUFFER_NEXT_PRESENT) - sm->last_buffer = last->next_buffer; - else - { - vlib_buffer_free (vm, &sm->first_buffer, /* count */ 1); - sm->first_buffer = sm->last_buffer = ~0; - } - } - - if (sm->last_buffer == ~0) - { - while (clib_fifo_elts (sm->rx.buffer_fifo) == 0) - { - sm->rx.ready_one_time_event = - vlib_process_create_one_time_event (vm, vlib_current_process (vm), - ~0); - vlib_process_wait_for_one_time_event (vm, /* no event data */ 0, - sm->rx.ready_one_time_event); - } - - clib_fifo_sub1 (sm->rx.buffer_fifo, sm->first_buffer); - sm->last_buffer = sm->first_buffer; - } - - ASSERT (sm->last_buffer != ~0); - - last = vlib_get_buffer (vm, sm->last_buffer); - s->current_buffer_index = 0; - s->buffer = vlib_buffer_get_current (last); - s->n_buffer_bytes = last->current_length; -} - -static void -serialize_open_vlib_helper (serialize_main_t * m, - vlib_main_t * vm, - vlib_serialize_buffer_main_t * sm, uword is_read) -{ - /* Initialize serialize main but save overflow buffer for re-use between calls. */ - { - u8 *save = m->stream.overflow_buffer; - memset (m, 0, sizeof (m[0])); - m->stream.overflow_buffer = save; - if (save) - _vec_len (save) = 0; - } - - sm->first_buffer = sm->last_buffer = ~0; - if (is_read) - clib_fifo_reset (sm->rx.buffer_fifo); - else - sm->tx.n_total_data_bytes = 0; - sm->vlib_main = vm; - m->header.data_function = is_read ? vlib_serialize_rx : vlib_serialize_tx; - m->stream.data_function_opaque = pointer_to_uword (sm); -} - -void -serialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, - vlib_serialize_buffer_main_t * sm) -{ - serialize_open_vlib_helper (m, vm, sm, /* is_read */ 0); -} - -void -unserialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, - vlib_serialize_buffer_main_t * sm) -{ - serialize_open_vlib_helper (m, vm, sm, /* is_read */ 1); -} - -u32 -serialize_close_vlib_buffer (serialize_main_t * m) -{ - vlib_serialize_buffer_main_t *sm - = uword_to_pointer (m->stream.data_function_opaque, - vlib_serialize_buffer_main_t *); - vlib_buffer_t *last; - serialize_stream_t *s = &m->stream; - - last = vlib_get_buffer (sm->vlib_main, sm->last_buffer); - last->current_length = s->current_buffer_index; - - if (vec_len (s->overflow_buffer) > 0) - { - sm->last_buffer - = vlib_buffer_add_data (sm->vlib_main, sm->tx.free_list_index, - sm->last_buffer == ~0 ? 0 : sm->last_buffer, - s->overflow_buffer, - vec_len (s->overflow_buffer)); - _vec_len (s->overflow_buffer) = 0; - } - - return sm->first_buffer; -} - -void -unserialize_close_vlib_buffer (serialize_main_t * m) -{ - vlib_serialize_buffer_main_t *sm - = uword_to_pointer (m->stream.data_function_opaque, - vlib_serialize_buffer_main_t *); - if (sm->first_buffer != ~0) - vlib_buffer_free_one (sm->vlib_main, sm->first_buffer); - clib_fifo_reset (sm->rx.buffer_fifo); - if (m->stream.overflow_buffer) - _vec_len (m->stream.overflow_buffer) = 0; -} static u8 * format_vlib_buffer_free_list (u8 * s, va_list * va) { vlib_buffer_free_list_t *f = va_arg (*va, vlib_buffer_free_list_t *); -#if DPDK > 0 u32 threadnum = va_arg (*va, u32); uword bytes_alloc, bytes_free, n_free, size; @@ -1877,21 +1275,6 @@ format_vlib_buffer_free_list (u8 * s, va_list * va) bytes_free = size * n_free; s = format (s, "%7d%30s%12d%12d%=12U%=12U%=12d%=12d", threadnum, -#else - uword bytes_alloc, bytes_free, n_free, size; - - if (!f) - return format (s, "%=30s%=12s%=12s%=12s%=12s%=12s%=12s", - "Name", "Index", "Size", "Alloc", "Free", "#Alloc", - "#Free"); - - size = sizeof (vlib_buffer_t) + f->n_data_bytes; - n_free = vec_len (f->aligned_buffers) + vec_len (f->unaligned_buffers); - bytes_alloc = size * f->n_alloc; - bytes_free = size * n_free; - - s = format (s, "%30s%12d%12d%=12U%=12U%=12d%=12d", -#endif f->name, f->index, f->n_data_bytes, format_memory_size, bytes_alloc, format_memory_size, bytes_free, f->n_alloc, n_free); @@ -1903,7 +1286,6 @@ static clib_error_t * show_buffers (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { -#if DPDK > 0 vlib_buffer_main_t *bm; vlib_buffer_free_list_t *f; vlib_main_t *curr_vm; @@ -1926,18 +1308,6 @@ show_buffers (vlib_main_t * vm, } while (vm_index < vec_len (vlib_mains)); -#else - vlib_buffer_main_t *bm = vm->buffer_main; - vlib_buffer_free_list_t *f; - - vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, 0); - /* *INDENT-OFF* */ - pool_foreach (f, bm->buffer_free_list_pool, ({ - vlib_cli_output (vm, "%U", format_vlib_buffer_free_list, f); - })); -/* *INDENT-ON* */ - -#endif return 0; } @@ -1949,34 +1319,38 @@ VLIB_CLI_COMMAND (show_buffers_command, static) = { }; /* *INDENT-ON* */ -#if DPDK > 0 -#if CLIB_DEBUG > 0 - -u32 *vlib_buffer_state_validation_lock; -uword *vlib_buffer_state_validation_hash; -void *vlib_buffer_state_heap; - -static clib_error_t * -buffer_state_validation_init (vlib_main_t * vm) +void +vlib_buffer_cb_init (struct vlib_main_t *vm) { - void *oldheap; - - vlib_buffer_state_heap = mheap_alloc (0, 10 << 20); - - oldheap = clib_mem_set_heap (vlib_buffer_state_heap); + vlib_buffer_main_t *bm = vm->buffer_main; + bm->cb.vlib_buffer_alloc_cb = &vlib_buffer_alloc_internal; + bm->cb.vlib_buffer_alloc_from_free_list_cb = + &vlib_buffer_alloc_from_free_list_internal; + bm->cb.vlib_buffer_free_cb = &vlib_buffer_free_internal; + bm->cb.vlib_buffer_free_no_next_cb = &vlib_buffer_free_no_next_internal; + bm->cb.vlib_buffer_delete_free_list_cb = + &vlib_buffer_delete_free_list_internal; + bm->extern_buffer_mgmt = 0; +} - vlib_buffer_state_validation_hash = hash_create (0, sizeof (uword)); - vec_validate_aligned (vlib_buffer_state_validation_lock, 0, - CLIB_CACHE_LINE_BYTES); - clib_mem_set_heap (oldheap); +int +vlib_buffer_cb_register (struct vlib_main_t *vm, vlib_buffer_callbacks_t * cb) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + if (bm->extern_buffer_mgmt) + return -1; + +#define _(x) bm->cb.x = cb->x + _(vlib_buffer_alloc_cb); + _(vlib_buffer_alloc_from_free_list_cb); + _(vlib_buffer_free_cb); + _(vlib_buffer_free_no_next_cb); + _(vlib_buffer_delete_free_list_cb); +#undef _ + bm->extern_buffer_mgmt = 1; return 0; } -VLIB_INIT_FUNCTION (buffer_state_validation_init); -#endif -#endif - - /** @endcond */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 5f1e62f0..d270c08a 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -46,15 +46,9 @@ #include #include /* for vlib_error_t */ -#if DPDK > 0 -#include -#define VLIB_BUFFER_DATA_SIZE (2048) -#define VLIB_BUFFER_PRE_DATA_SIZE RTE_PKTMBUF_HEADROOM -#else #include /* for __PRE_DATA_SIZE */ -#define VLIB_BUFFER_DATA_SIZE (512) +#define VLIB_BUFFER_DATA_SIZE (2048) #define VLIB_BUFFER_PRE_DATA_SIZE __PRE_DATA_SIZE -#endif #if defined (CLIB_HAVE_VEC128) || defined (__aarch64__) typedef u8x16 vlib_copy_unit_t; @@ -296,6 +290,27 @@ typedef struct vlib_buffer_free_list_t uword buffer_init_function_opaque; } __attribute__ ((aligned (16))) vlib_buffer_free_list_t; +typedef struct +{ + u32 (*vlib_buffer_alloc_cb) (struct vlib_main_t * vm, u32 * buffers, + u32 n_buffers); + u32 (*vlib_buffer_alloc_from_free_list_cb) (struct vlib_main_t * vm, + u32 * buffers, u32 n_buffers, + u32 free_list_index); + void (*vlib_buffer_free_cb) (struct vlib_main_t * vm, u32 * buffers, + u32 n_buffers); + void (*vlib_buffer_free_no_next_cb) (struct vlib_main_t * vm, u32 * buffers, + u32 n_buffers); + void (*vlib_packet_template_init_cb) (struct vlib_main_t * vm, void *t, + void *packet_data, + uword n_packet_data_bytes, + uword + min_n_buffers_each_physmem_alloc, + u8 * name); + void (*vlib_buffer_delete_free_list_cb) (struct vlib_main_t * vm, + u32 free_list_index); +} vlib_buffer_callbacks_t; + typedef struct { /* Buffer free callback, for subversive activities */ @@ -323,12 +338,15 @@ typedef struct /* List of free-lists needing Blue Light Special announcements */ vlib_buffer_free_list_t **announce_list; - /* Vector of rte_mempools per socket */ -#if DPDK == 1 - struct rte_mempool **pktmbuf_pools; -#endif + /* Callbacks */ + vlib_buffer_callbacks_t cb; + int extern_buffer_mgmt; } vlib_buffer_main_t; +void vlib_buffer_cb_init (struct vlib_main_t *vm); +int vlib_buffer_cb_register (struct vlib_main_t *vm, + vlib_buffer_callbacks_t * cb); + typedef struct { struct vlib_main_t *vlib_main; @@ -385,11 +403,6 @@ serialize_vlib_buffer_n_bytes (serialize_main_t * m) vec_len (s->overflow_buffer); } -#if DPDK > 0 -#define rte_mbuf_from_vlib_buffer(x) (((struct rte_mbuf *)x) - 1) -#define vlib_buffer_from_rte_mbuf(x) ((vlib_buffer_t *)(x+1)) -#endif - /* */ diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 75716eca..15d93c16 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -195,8 +195,6 @@ do { \ } while (0) #endif -#if DPDK == 0 - typedef enum { /* Index is unknown. */ @@ -232,8 +230,6 @@ vlib_buffer_set_known_state (vlib_main_t * vm, u8 *vlib_validate_buffer (vlib_main_t * vm, u32 buffer_index, uword follow_chain); -#endif /* DPDK == 0 */ - clib_error_t *vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, unsigned socket_id); @@ -245,7 +241,15 @@ clib_error_t *vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, @return - (u32) number of buffers actually allocated, may be less than the number requested or zero */ -u32 vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers); +always_inline u32 +vlib_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + ASSERT (bm->cb.vlib_buffer_alloc_cb); + + return bm->cb.vlib_buffer_alloc_cb (vm, buffers, n_buffers); +} always_inline u32 vlib_buffer_round_size (u32 size) @@ -261,9 +265,18 @@ vlib_buffer_round_size (u32 size) @return - (u32) number of buffers actually allocated, may be less than the number requested or zero */ -u32 vlib_buffer_alloc_from_free_list (vlib_main_t * vm, - u32 * buffers, - u32 n_buffers, u32 free_list_index); +always_inline u32 +vlib_buffer_alloc_from_free_list (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + ASSERT (bm->cb.vlib_buffer_alloc_from_free_list_cb); + + return bm->cb.vlib_buffer_alloc_from_free_list_cb (vm, buffers, n_buffers, + free_list_index); +} /** \brief Free buffers Frees the entire buffer chain for each buffer @@ -273,11 +286,19 @@ u32 vlib_buffer_alloc_from_free_list (vlib_main_t * vm, @param n_buffers - (u32) number of buffers to free */ -void vlib_buffer_free (vlib_main_t * vm, - /* pointer to first buffer */ - u32 * buffers, - /* number of buffers to free */ - u32 n_buffers); +always_inline void +vlib_buffer_free (vlib_main_t * vm, + /* pointer to first buffer */ + u32 * buffers, + /* number of buffers to free */ + u32 n_buffers) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + ASSERT (bm->cb.vlib_buffer_free_cb); + + return bm->cb.vlib_buffer_free_cb (vm, buffers, n_buffers); +} /** \brief Free buffers, does not free the buffer chain for each buffer @@ -286,11 +307,19 @@ void vlib_buffer_free (vlib_main_t * vm, @param n_buffers - (u32) number of buffers to free */ -void vlib_buffer_free_no_next (vlib_main_t * vm, - /* pointer to first buffer */ - u32 * buffers, - /* number of buffers to free */ - u32 n_buffers); +always_inline void +vlib_buffer_free_no_next (vlib_main_t * vm, + /* pointer to first buffer */ + u32 * buffers, + /* number of buffers to free */ + u32 n_buffers) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + ASSERT (bm->cb.vlib_buffer_free_no_next_cb); + + return bm->cb.vlib_buffer_free_no_next_cb (vm, buffers, n_buffers); +} /** \brief Free one buffer Shorthand to free a single buffer chain. @@ -307,7 +336,15 @@ vlib_buffer_free_one (vlib_main_t * vm, u32 buffer_index) /* Add/delete buffer free lists. */ u32 vlib_buffer_create_free_list (vlib_main_t * vm, u32 n_data_bytes, char *fmt, ...); -void vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index); +always_inline void +vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + ASSERT (bm->cb.vlib_buffer_delete_free_list_cb); + + bm->cb.vlib_buffer_delete_free_list_cb (vm, free_list_index); +} /* Find already existing public free list with given size or create one. */ u32 vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes, @@ -453,11 +490,6 @@ vlib_buffer_copy (vlib_main_t * vm, vlib_buffer_t * b) return fd; } -/* - * vlib_buffer_chain_* functions provide a way to create long buffers. - * When DPDK is enabled, the 'hidden' DPDK header is taken care of transparently. - */ - /* Initializes the buffer as an empty packet with no chained buffers. */ always_inline void vlib_buffer_chain_init (vlib_buffer_t * first) @@ -537,8 +569,6 @@ typedef struct /* Vector of packet data. */ u8 *packet_data; - /* Note: the next three fields are unused if DPDK == 1 */ - /* Number of buffers to allocate in each call to physmem allocator. */ u32 min_n_buffers_each_physmem_alloc; diff --git a/src/vlib/buffer_serialize.c b/src/vlib/buffer_serialize.c new file mode 100644 index 00000000..96a5f0a0 --- /dev/null +++ b/src/vlib/buffer_serialize.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * buffer.c: allocate/free network buffers. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include + +static void +vlib_serialize_tx (serialize_main_header_t * m, serialize_stream_t * s) +{ + vlib_main_t *vm; + vlib_serialize_buffer_main_t *sm; + uword n, n_bytes_to_write; + vlib_buffer_t *last; + + n_bytes_to_write = s->current_buffer_index; + sm = + uword_to_pointer (s->data_function_opaque, + vlib_serialize_buffer_main_t *); + vm = sm->vlib_main; + + ASSERT (sm->tx.max_n_data_bytes_per_chain > 0); + if (serialize_stream_is_end_of_stream (s) + || sm->tx.n_total_data_bytes + n_bytes_to_write > + sm->tx.max_n_data_bytes_per_chain) + { + vlib_process_t *p = vlib_get_current_process (vm); + + last = vlib_get_buffer (vm, sm->last_buffer); + last->current_length = n_bytes_to_write; + + vlib_set_next_frame_buffer (vm, &p->node_runtime, sm->tx.next_index, + sm->first_buffer); + + sm->first_buffer = sm->last_buffer = ~0; + sm->tx.n_total_data_bytes = 0; + } + + else if (n_bytes_to_write == 0 && s->n_buffer_bytes == 0) + { + ASSERT (sm->first_buffer == ~0); + ASSERT (sm->last_buffer == ~0); + n = + vlib_buffer_alloc_from_free_list (vm, &sm->first_buffer, 1, + sm->tx.free_list_index); + if (n != 1) + serialize_error (m, + clib_error_create + ("vlib_buffer_alloc_from_free_list fails")); + sm->last_buffer = sm->first_buffer; + s->n_buffer_bytes = + vlib_buffer_free_list_buffer_size (vm, sm->tx.free_list_index); + } + + if (n_bytes_to_write > 0) + { + vlib_buffer_t *prev = vlib_get_buffer (vm, sm->last_buffer); + n = + vlib_buffer_alloc_from_free_list (vm, &sm->last_buffer, 1, + sm->tx.free_list_index); + if (n != 1) + serialize_error (m, + clib_error_create + ("vlib_buffer_alloc_from_free_list fails")); + sm->tx.n_total_data_bytes += n_bytes_to_write; + prev->current_length = n_bytes_to_write; + prev->next_buffer = sm->last_buffer; + prev->flags |= VLIB_BUFFER_NEXT_PRESENT; + } + + if (sm->last_buffer != ~0) + { + last = vlib_get_buffer (vm, sm->last_buffer); + s->buffer = vlib_buffer_get_current (last); + s->current_buffer_index = 0; + ASSERT (last->current_data == s->current_buffer_index); + } +} + +static void +vlib_serialize_rx (serialize_main_header_t * m, serialize_stream_t * s) +{ + vlib_main_t *vm; + vlib_serialize_buffer_main_t *sm; + vlib_buffer_t *last; + + sm = + uword_to_pointer (s->data_function_opaque, + vlib_serialize_buffer_main_t *); + vm = sm->vlib_main; + + if (serialize_stream_is_end_of_stream (s)) + return; + + if (sm->last_buffer != ~0) + { + last = vlib_get_buffer (vm, sm->last_buffer); + + if (last->flags & VLIB_BUFFER_NEXT_PRESENT) + sm->last_buffer = last->next_buffer; + else + { + vlib_buffer_free (vm, &sm->first_buffer, /* count */ 1); + sm->first_buffer = sm->last_buffer = ~0; + } + } + + if (sm->last_buffer == ~0) + { + while (clib_fifo_elts (sm->rx.buffer_fifo) == 0) + { + sm->rx.ready_one_time_event = + vlib_process_create_one_time_event (vm, vlib_current_process (vm), + ~0); + vlib_process_wait_for_one_time_event (vm, /* no event data */ 0, + sm->rx.ready_one_time_event); + } + + clib_fifo_sub1 (sm->rx.buffer_fifo, sm->first_buffer); + sm->last_buffer = sm->first_buffer; + } + + ASSERT (sm->last_buffer != ~0); + + last = vlib_get_buffer (vm, sm->last_buffer); + s->current_buffer_index = 0; + s->buffer = vlib_buffer_get_current (last); + s->n_buffer_bytes = last->current_length; +} + +static void +serialize_open_vlib_helper (serialize_main_t * m, + vlib_main_t * vm, + vlib_serialize_buffer_main_t * sm, uword is_read) +{ + /* Initialize serialize main but save overflow buffer for re-use between calls. */ + { + u8 *save = m->stream.overflow_buffer; + memset (m, 0, sizeof (m[0])); + m->stream.overflow_buffer = save; + if (save) + _vec_len (save) = 0; + } + + sm->first_buffer = sm->last_buffer = ~0; + if (is_read) + clib_fifo_reset (sm->rx.buffer_fifo); + else + sm->tx.n_total_data_bytes = 0; + sm->vlib_main = vm; + m->header.data_function = is_read ? vlib_serialize_rx : vlib_serialize_tx; + m->stream.data_function_opaque = pointer_to_uword (sm); +} + +void +serialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, + vlib_serialize_buffer_main_t * sm) +{ + serialize_open_vlib_helper (m, vm, sm, /* is_read */ 0); +} + +void +unserialize_open_vlib_buffer (serialize_main_t * m, vlib_main_t * vm, + vlib_serialize_buffer_main_t * sm) +{ + serialize_open_vlib_helper (m, vm, sm, /* is_read */ 1); +} + +u32 +serialize_close_vlib_buffer (serialize_main_t * m) +{ + vlib_serialize_buffer_main_t *sm + = uword_to_pointer (m->stream.data_function_opaque, + vlib_serialize_buffer_main_t *); + vlib_buffer_t *last; + serialize_stream_t *s = &m->stream; + + last = vlib_get_buffer (sm->vlib_main, sm->last_buffer); + last->current_length = s->current_buffer_index; + + if (vec_len (s->overflow_buffer) > 0) + { + sm->last_buffer + = vlib_buffer_add_data (sm->vlib_main, sm->tx.free_list_index, + sm->last_buffer == ~0 ? 0 : sm->last_buffer, + s->overflow_buffer, + vec_len (s->overflow_buffer)); + _vec_len (s->overflow_buffer) = 0; + } + + return sm->first_buffer; +} + +void +unserialize_close_vlib_buffer (serialize_main_t * m) +{ + vlib_serialize_buffer_main_t *sm + = uword_to_pointer (m->stream.data_function_opaque, + vlib_serialize_buffer_main_t *); + if (sm->first_buffer != ~0) + vlib_buffer_free_one (sm->vlib_main, sm->first_buffer); + clib_fifo_reset (sm->rx.buffer_fifo); + if (m->stream.overflow_buffer) + _vec_len (m->stream.overflow_buffer) = 0; +} + +/** @endcond */ +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/main.c b/src/vlib/main.c index 6c6cad98..09f34bbd 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -465,7 +465,7 @@ vlib_put_next_frame (vlib_main_t * vm, vlib_frame_t *f; u32 n_vectors_in_frame; - if (DPDK == 0 && CLIB_DEBUG > 0) + if (vm->buffer_main->extern_buffer_mgmt == 0 && CLIB_DEBUG > 0) vlib_put_next_frame_validate (vm, r, next_index, n_vectors_left); nf = vlib_node_runtime_get_next_frame (vm, r, next_index); @@ -1012,8 +1012,8 @@ dispatch_node (vlib_main_t * vm, /* When in interrupt mode and vector rate crosses threshold switch to polling mode. */ - if ((DPDK == 0 && dispatch_state == VLIB_NODE_STATE_INTERRUPT) - || (DPDK == 0 && dispatch_state == VLIB_NODE_STATE_POLLING + if ((dispatch_state == VLIB_NODE_STATE_INTERRUPT) + || (dispatch_state == VLIB_NODE_STATE_POLLING && (node->flags & VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE))) { @@ -1615,6 +1615,7 @@ vlib_main (vlib_main_t * volatile vm, unformat_input_t * input) vm->name = "VLIB"; vec_validate (vm->buffer_main, 0); + vlib_buffer_cb_init (vm); if ((error = vlib_thread_init (vm))) { diff --git a/src/vlib/threads.c b/src/vlib/threads.c index c5e58bc0..b3bbd30e 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -22,29 +22,10 @@ #include #include - -#if DPDK==1 -#include -#include -#include -#include -#include -#endif DECLARE_CJ_GLOBAL_LOG; #define FRAME_QUEUE_NELTS 32 - -#if DPDK==1 -/* - * Weak definitions of DPDK symbols used in this file. - * Needed for linking test programs without DPDK libs. - */ -unsigned __thread __attribute__ ((weak)) RTE_PER_LCORE (_lcore_id); -struct lcore_config __attribute__ ((weak)) lcore_config[]; -unsigned __attribute__ ((weak)) rte_socket_id (); -int __attribute__ ((weak)) rte_eal_remote_launch (); -#endif u32 vl (void *p) { @@ -194,14 +175,17 @@ vlib_thread_init (vlib_main_t * vm) tm->cpu_socket_bitmap = clib_bitmap_set (0, 0, 1); /* pin main thread to main_lcore */ -#if DPDK==0 - { - cpu_set_t cpuset; - CPU_ZERO (&cpuset); - CPU_SET (tm->main_lcore, &cpuset); - pthread_setaffinity_np (pthread_self (), sizeof (cpu_set_t), &cpuset); - } -#endif + if (tm->cb.vlib_thread_set_lcore_cb) + { + tm->cb.vlib_thread_set_lcore_cb (0, tm->main_lcore); + } + else + { + cpu_set_t cpuset; + CPU_ZERO (&cpuset); + CPU_SET (tm->main_lcore, &cpuset); + pthread_setaffinity_np (pthread_self (), sizeof (cpu_set_t), &cpuset); + } /* as many threads as stacks... */ vec_validate_aligned (vlib_worker_threads, vec_len (vlib_thread_stacks) - 1, @@ -520,32 +504,29 @@ vlib_worker_thread_bootstrap_fn (void *arg) return rv; } -static int -vlib_launch_thread (void *fp, vlib_worker_thread_t * w, unsigned lcore_id) +static clib_error_t * +vlib_launch_thread_int (void *fp, vlib_worker_thread_t * w, unsigned lcore_id) { + vlib_thread_main_t *tm = &vlib_thread_main; void *(*fp_arg) (void *) = fp; w->lcore_id = lcore_id; -#if DPDK==1 - if (!w->registration->use_pthreads) - if (rte_eal_remote_launch) /* do we have dpdk linked */ - return rte_eal_remote_launch (fp, (void *) w, lcore_id); - else - return -1; + if (tm->cb.vlib_launch_thread_cb && !w->registration->use_pthreads) + return tm->cb.vlib_launch_thread_cb (fp, (void *) w, lcore_id); else -#endif { - int ret; pthread_t worker; cpu_set_t cpuset; CPU_ZERO (&cpuset); CPU_SET (lcore_id, &cpuset); - ret = pthread_create (&worker, NULL /* attr */ , fp_arg, (void *) w); - if (ret == 0) - return pthread_setaffinity_np (worker, sizeof (cpu_set_t), &cpuset); - else - return ret; + if (pthread_create (&worker, NULL /* attr */ , fp_arg, (void *) w)) + return clib_error_return_unix (0, "pthread_create"); + + if (pthread_setaffinity_np (worker, sizeof (cpu_set_t), &cpuset)) + return clib_error_return_unix (0, "pthread_setaffinity_np"); + + return 0; } } @@ -769,6 +750,7 @@ start_workers (vlib_main_t * vm) for (i = 0; i < vec_len (tm->registrations); i++) { + clib_error_t *err; int j; tr = tm->registrations[i]; @@ -778,22 +760,24 @@ start_workers (vlib_main_t * vm) for (j = 0; j < tr->count; j++) { w = vlib_worker_threads + worker_thread_index++; - if (vlib_launch_thread (vlib_worker_thread_bootstrap_fn, w, 0) < - 0) - clib_warning ("Couldn't start '%s' pthread ", tr->name); + err = vlib_launch_thread_int (vlib_worker_thread_bootstrap_fn, + w, 0); + if (err) + clib_error_report (err); } } else { uword c; - /* *INDENT-OFF* */ - clib_bitmap_foreach (c, tr->coremask, ({ - w = vlib_worker_threads + worker_thread_index++; - if (vlib_launch_thread (vlib_worker_thread_bootstrap_fn, w, c) < 0) - clib_warning ("Couldn't start DPDK lcore %d", c); - - })); -/* *INDENT-ON* */ + /* *INDENT-OFF* */ + clib_bitmap_foreach (c, tr->coremask, ({ + w = vlib_worker_threads + worker_thread_index++; + err = vlib_launch_thread_int (vlib_worker_thread_bootstrap_fn, + w, c); + if (err) + clib_error_report (err); + })); + /* *INDENT-ON* */ } } vlib_worker_thread_barrier_sync (vm); @@ -1105,7 +1089,7 @@ cpu_config (vlib_main_t * vm, unformat_input_t * input) { tm->n_thread_stacks += tr->count; tm->n_pthreads += tr->count * tr->use_pthreads; - tm->n_eal_threads += tr->count * (tr->use_pthreads == 0); + tm->n_threads += tr->count * (tr->use_pthreads == 0); tr = tr->next; } @@ -1423,6 +1407,7 @@ void vlib_worker_thread_fn (void *arg) { vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg; + vlib_thread_main_t *tm = vlib_get_thread_main (); vlib_main_t *vm = vlib_get_main (); ASSERT (vm->cpu_index == os_get_cpu_number ()); @@ -1431,12 +1416,9 @@ vlib_worker_thread_fn (void *arg) clib_time_init (&vm->clib_time); clib_mem_set_heap (w->thread_mheap); -#if DPDK > 0 /* Wait until the dpdk init sequence is complete */ - vlib_thread_main_t *tm = vlib_get_thread_main (); - while (tm->worker_thread_release == 0) + while (tm->extern_thread_mgmt && tm->worker_thread_release == 0) vlib_worker_thread_barrier_check (); -#endif vlib_worker_thread_internal (vm); } @@ -1475,6 +1457,20 @@ vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts) return (fqm - tm->frame_queue_mains); } + +int +vlib_thread_cb_register (struct vlib_main_t *vm, vlib_thread_callbacks_t * cb) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + + if (tm->extern_thread_mgmt) + return -1; + + tm->cb.vlib_launch_thread_cb = cb->vlib_launch_thread_cb; + tm->extern_thread_mgmt = 1; + return 0; +} + clib_error_t * threads_init (vlib_main_t * vm) { diff --git a/src/vlib/threads.h b/src/vlib/threads.h index 34ab5be8..75a5a281 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -263,6 +263,13 @@ typedef enum SCHED_POLICY_N, } sched_policy_t; +typedef struct +{ + clib_error_t *(*vlib_launch_thread_cb) (void *fp, vlib_worker_thread_t * w, + unsigned lcore_id); + clib_error_t *(*vlib_thread_set_lcore_cb) (u32 thread, u16 lcore); +} vlib_thread_callbacks_t; + typedef struct { /* Link list of registrations, built by constructors */ @@ -290,8 +297,8 @@ typedef struct /* Number of pthreads */ u32 n_pthreads; - /* Number of DPDK eal threads */ - u32 n_eal_threads; + /* Number of threads */ + u32 n_threads; /* Number of cores to skip, must match the core mask */ u32 skip_cores; @@ -320,6 +327,9 @@ typedef struct /* scheduling policy priority */ u32 sched_priority; + /* callbacks */ + vlib_thread_callbacks_t cb; + int extern_thread_mgmt; } vlib_thread_main_t; extern vlib_thread_main_t vlib_thread_main; @@ -459,6 +469,9 @@ vlib_get_worker_handoff_queue_elt (u32 frame_queue_index, return elt; } +int vlib_thread_cb_register (struct vlib_main_t *vm, + vlib_thread_callbacks_t * cb); + #endif /* included_vlib_threads_h */ /* diff --git a/src/vlib/threads_cli.c b/src/vlib/threads_cli.c index ee632279..b64028c4 100644 --- a/src/vlib/threads_cli.c +++ b/src/vlib/threads_cli.c @@ -20,14 +20,6 @@ #include #include -#if DPDK==1 -#include -#include -#include -#include -#include -#endif - static u8 * format_sched_policy_and_priority (u8 * s, va_list * args) { @@ -116,23 +108,6 @@ show_threads_fn (vlib_main_t * vm, vec_free (p); line = format (line, "%-7u%-7u%-7u%", lcore, core_id, socket_id); -#if DPDK==1 - ASSERT (lcore <= RTE_MAX_LCORE); - switch (lcore_config[lcore].state) - { - case WAIT: - line = format (line, "wait"); - break; - case RUNNING: - line = format (line, "running"); - break; - case FINISHED: - line = format (line, "finished"); - break; - default: - line = format (line, "unknown"); - } -#endif } else { diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c index 80ab7b9d..8d10ad2e 100644 --- a/src/vlib/unix/physmem.c +++ b/src/vlib/unix/physmem.c @@ -45,13 +45,13 @@ static void * unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes, uword alignment) { + vlib_main_t *vm = vlib_get_main (); physmem_main_t *pm = &physmem_main; uword lo_offset, hi_offset; uword *to_free = 0; -#if DPDK > 0 - clib_warning ("unsafe alloc!"); -#endif + if (vm->buffer_main->extern_buffer_mgmt) + clib_warning ("unsafe alloc!"); /* IO memory is always at least cache aligned. */ alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES); @@ -269,16 +269,17 @@ static clib_error_t * show_physmem (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { -#if DPDK > 0 - vlib_cli_output (vm, "Not supported with DPDK drivers."); -#else physmem_main_t *pm = &physmem_main; + if (vm->buffer_main->extern_buffer_mgmt) + { + vlib_cli_output (vm, "Not supported with external buffer management."); + return 0; + } if (pm->heap) vlib_cli_output (vm, "%U", format_mheap, pm->heap, /* verbose */ 1); else vlib_cli_output (vm, "No physmem allocated."); -#endif return 0; } diff --git a/src/vnet.am b/src/vnet.am index 665a16ea..47c5eda7 100644 --- a/src/vnet.am +++ b/src/vnet.am @@ -761,11 +761,13 @@ nobase_include_HEADERS += \ ######################################## if WITH_DPDK libvnet_la_SOURCES += \ + vnet/devices/dpdk/buffer.c \ vnet/devices/dpdk/dpdk_priv.h \ vnet/devices/dpdk/device.c \ vnet/devices/dpdk/format.c \ vnet/devices/dpdk/init.c \ vnet/devices/dpdk/node.c \ + vnet/devices/dpdk/thread.c \ vnet/devices/dpdk/hqos.c \ vnet/devices/dpdk/cli.c \ vnet/devices/dpdk/dpdk_api.c diff --git a/src/vnet/devices/dpdk/buffer.c b/src/vnet/devices/dpdk/buffer.c new file mode 100644 index 00000000..214a9162 --- /dev/null +++ b/src/vnet/devices/dpdk/buffer.c @@ -0,0 +1,729 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * buffer.c: allocate/free network buffers. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @file + * + * Allocate/free network buffers. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + + +STATIC_ASSERT (VLIB_BUFFER_PRE_DATA_SIZE == RTE_PKTMBUF_HEADROOM, + "VLIB_BUFFER_PRE_DATA_SIZE must be equal to RTE_PKTMBUF_HEADROOM"); + +#define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32)) + +/* Make sure we have at least given number of unaligned buffers. */ +static void +fill_unaligned (vlib_main_t * vm, + vlib_buffer_free_list_t * free_list, + uword n_unaligned_buffers) +{ + word la = vec_len (free_list->aligned_buffers); + word lu = vec_len (free_list->unaligned_buffers); + + /* Aligned come in aligned copy-sized chunks. */ + ASSERT (la % BUFFERS_PER_COPY == 0); + + ASSERT (la >= n_unaligned_buffers); + + while (lu < n_unaligned_buffers) + { + /* Copy 4 buffers from end of aligned vector to unaligned vector. */ + vec_add (free_list->unaligned_buffers, + free_list->aligned_buffers + la - BUFFERS_PER_COPY, + BUFFERS_PER_COPY); + la -= BUFFERS_PER_COPY; + lu += BUFFERS_PER_COPY; + } + _vec_len (free_list->aligned_buffers) = la; +} + +/* After free aligned buffers may not contain even sized chunks. */ +static void +trim_aligned (vlib_buffer_free_list_t * f) +{ + uword l, n_trim; + + /* Add unaligned to aligned before trim. */ + l = vec_len (f->unaligned_buffers); + if (l > 0) + { + vec_add_aligned (f->aligned_buffers, f->unaligned_buffers, l, + /* align */ sizeof (vlib_copy_unit_t)); + + _vec_len (f->unaligned_buffers) = 0; + } + + /* Remove unaligned buffers from end of aligned vector and save for next trim. */ + l = vec_len (f->aligned_buffers); + n_trim = l % BUFFERS_PER_COPY; + if (n_trim) + { + /* Trim aligned -> unaligned. */ + vec_add (f->unaligned_buffers, f->aligned_buffers + l - n_trim, n_trim); + + /* Remove from aligned. */ + _vec_len (f->aligned_buffers) = l - n_trim; + } +} + +static void +merge_free_lists (vlib_buffer_free_list_t * dst, + vlib_buffer_free_list_t * src) +{ + uword l; + u32 *d; + + trim_aligned (src); + trim_aligned (dst); + + l = vec_len (src->aligned_buffers); + if (l > 0) + { + vec_add2_aligned (dst->aligned_buffers, d, l, + /* align */ sizeof (vlib_copy_unit_t)); + clib_memcpy (d, src->aligned_buffers, l * sizeof (d[0])); + vec_free (src->aligned_buffers); + } + + l = vec_len (src->unaligned_buffers); + if (l > 0) + { + vec_add (dst->unaligned_buffers, src->unaligned_buffers, l); + vec_free (src->unaligned_buffers); + } +} + +always_inline u32 +dpdk_buffer_get_free_list_with_size (vlib_main_t * vm, u32 size) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + size = vlib_buffer_round_size (size); + uword *p = hash_get (bm->free_list_by_size, size); + return p ? p[0] : ~0; +} + +static void +del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) +{ + u32 i; + struct rte_mbuf *mb; + vlib_buffer_t *b; + + for (i = 0; i < vec_len (f->unaligned_buffers); i++) + { + b = vlib_get_buffer (vm, f->unaligned_buffers[i]); + mb = rte_mbuf_from_vlib_buffer (b); + ASSERT (rte_mbuf_refcnt_read (mb) == 1); + rte_pktmbuf_free (mb); + } + for (i = 0; i < vec_len (f->aligned_buffers); i++) + { + b = vlib_get_buffer (vm, f->aligned_buffers[i]); + mb = rte_mbuf_from_vlib_buffer (b); + ASSERT (rte_mbuf_refcnt_read (mb) == 1); + rte_pktmbuf_free (mb); + } + vec_free (f->name); + vec_free (f->unaligned_buffers); + vec_free (f->aligned_buffers); +} + +/* Add buffer free list. */ +static void +dpdk_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; + u32 merge_index; + int i; + + ASSERT (os_get_cpu_number () == 0); + + f = vlib_buffer_get_free_list (vm, free_list_index); + + merge_index = dpdk_buffer_get_free_list_with_size (vm, f->n_data_bytes); + if (merge_index != ~0 && merge_index != free_list_index) + { + merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool, + merge_index), f); + } + + del_free_list (vm, f); + + /* Poison it. */ + memset (f, 0xab, sizeof (f[0])); + + pool_put (bm->buffer_free_list_pool, f); + + for (i = 1; i < vec_len (vlib_mains); i++) + { + bm = vlib_mains[i]->buffer_main; + f = vlib_buffer_get_free_list (vlib_mains[i], free_list_index);; + memset (f, 0xab, sizeof (f[0])); + pool_put (bm->buffer_free_list_pool, f); + } +} + +/* Make sure free list has at least given number of free buffers. */ +static uword +fill_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * fl, uword min_free_buffers) +{ + dpdk_main_t *dm = &dpdk_main; + vlib_buffer_t *b; + int n, i; + u32 bi; + u32 n_remaining = 0, n_alloc = 0; + unsigned socket_id = rte_socket_id (); + struct rte_mempool *rmp = dm->pktmbuf_pools[socket_id]; + struct rte_mbuf *mb; + + /* Too early? */ + if (PREDICT_FALSE (rmp == 0)) + return 0; + + trim_aligned (fl); + + /* Already have enough free buffers on free list? */ + n = min_free_buffers - vec_len (fl->aligned_buffers); + if (n <= 0) + return min_free_buffers; + + /* Always allocate round number of buffers. */ + n = round_pow2 (n, BUFFERS_PER_COPY); + + /* Always allocate new buffers in reasonably large sized chunks. */ + n = clib_max (n, fl->min_n_buffers_each_physmem_alloc); + + vec_validate (vm->mbuf_alloc_list, n - 1); + + if (rte_mempool_get_bulk (rmp, vm->mbuf_alloc_list, n) < 0) + return 0; + + _vec_len (vm->mbuf_alloc_list) = n; + + for (i = 0; i < n; i++) + { + mb = vm->mbuf_alloc_list[i]; + + ASSERT (rte_mbuf_refcnt_read (mb) == 0); + rte_mbuf_refcnt_set (mb, 1); + + b = vlib_buffer_from_rte_mbuf (mb); + bi = vlib_get_buffer_index (vm, b); + + vec_add1_aligned (fl->aligned_buffers, bi, sizeof (vlib_copy_unit_t)); + n_alloc++; + n_remaining--; + + vlib_buffer_init_for_free_list (b, fl); + + if (fl->buffer_init_function) + fl->buffer_init_function (vm, fl, &bi, 1); + } + + fl->n_alloc += n; + + return n; +} + +always_inline uword +copy_alignment (u32 * x) +{ + return (pointer_to_uword (x) / sizeof (x[0])) % BUFFERS_PER_COPY; +} + +static u32 +alloc_from_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * free_list, + u32 * alloc_buffers, u32 n_alloc_buffers) +{ + u32 *dst, *u_src; + uword u_len, n_left; + uword n_unaligned_start, n_unaligned_end, n_filled; + + n_left = n_alloc_buffers; + dst = alloc_buffers; + n_unaligned_start = ((BUFFERS_PER_COPY - copy_alignment (dst)) + & (BUFFERS_PER_COPY - 1)); + + n_filled = fill_free_list (vm, free_list, n_alloc_buffers); + if (n_filled == 0) + return 0; + + n_left = n_filled < n_left ? n_filled : n_left; + n_alloc_buffers = n_left; + + if (n_unaligned_start >= n_left) + { + n_unaligned_start = n_left; + n_unaligned_end = 0; + } + else + n_unaligned_end = copy_alignment (dst + n_alloc_buffers); + + fill_unaligned (vm, free_list, n_unaligned_start + n_unaligned_end); + + u_len = vec_len (free_list->unaligned_buffers); + u_src = free_list->unaligned_buffers + u_len - 1; + + if (n_unaligned_start) + { + uword n_copy = n_unaligned_start; + if (n_copy > n_left) + n_copy = n_left; + n_left -= n_copy; + + while (n_copy > 0) + { + *dst++ = *u_src--; + n_copy--; + u_len--; + } + + /* Now dst should be aligned. */ + if (n_left > 0) + ASSERT (pointer_to_uword (dst) % sizeof (vlib_copy_unit_t) == 0); + } + + /* Aligned copy. */ + { + vlib_copy_unit_t *d, *s; + uword n_copy; + + if (vec_len (free_list->aligned_buffers) < + ((n_left / BUFFERS_PER_COPY) * BUFFERS_PER_COPY)) + abort (); + + n_copy = n_left / BUFFERS_PER_COPY; + n_left = n_left % BUFFERS_PER_COPY; + + /* Remove buffers from aligned free list. */ + _vec_len (free_list->aligned_buffers) -= n_copy * BUFFERS_PER_COPY; + + s = (vlib_copy_unit_t *) vec_end (free_list->aligned_buffers); + d = (vlib_copy_unit_t *) dst; + + /* Fast path loop. */ + while (n_copy >= 4) + { + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + n_copy -= 4; + s += 4; + d += 4; + } + + while (n_copy >= 1) + { + d[0] = s[0]; + n_copy -= 1; + s += 1; + d += 1; + } + + dst = (void *) d; + } + + /* Unaligned copy. */ + ASSERT (n_unaligned_end == n_left); + while (n_left > 0) + { + *dst++ = *u_src--; + n_left--; + u_len--; + } + + if (!free_list->unaligned_buffers) + ASSERT (u_len == 0); + else + _vec_len (free_list->unaligned_buffers) = u_len; + + return n_alloc_buffers; +} + +/* Allocate a given number of buffers into given array. + Returns number actually allocated which will be either zero or + number requested. */ +u32 +dpdk_buffer_alloc (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + return alloc_from_free_list + (vm, + pool_elt_at_index (bm->buffer_free_list_pool, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX), + buffers, n_buffers); +} + + +u32 +dpdk_buffer_alloc_from_free_list (vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, u32 free_list_index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *f; + f = pool_elt_at_index (bm->buffer_free_list_pool, free_list_index); + return alloc_from_free_list (vm, f, buffers, n_buffers); +} + +always_inline void +add_buffer_to_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * f, + u32 buffer_index, u8 do_init) +{ + vlib_buffer_t *b; + b = vlib_get_buffer (vm, buffer_index); + if (PREDICT_TRUE (do_init)) + vlib_buffer_init_for_free_list (b, f); + vec_add1_aligned (f->aligned_buffers, buffer_index, + sizeof (vlib_copy_unit_t)); +} + +always_inline vlib_buffer_free_list_t * +buffer_get_free_list (vlib_main_t * vm, vlib_buffer_t * b, u32 * index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + u32 i; + + *index = i = b->free_list_index; + return pool_elt_at_index (bm->buffer_free_list_pool, i); +} + +static_always_inline void +vlib_buffer_free_inline (vlib_main_t * vm, + u32 * buffers, u32 n_buffers, u32 follow_buffer_next) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_free_list_t *fl; + u32 fi; + int i; + u32 (*cb) (vlib_main_t * vm, u32 * buffers, u32 n_buffers, + u32 follow_buffer_next); + + cb = bm->buffer_free_callback; + + if (PREDICT_FALSE (cb != 0)) + n_buffers = (*cb) (vm, buffers, n_buffers, follow_buffer_next); + + if (!n_buffers) + return; + + for (i = 0; i < n_buffers; i++) + { + vlib_buffer_t *b; + struct rte_mbuf *mb; + + b = vlib_get_buffer (vm, buffers[i]); + + fl = buffer_get_free_list (vm, b, &fi); + + /* The only current use of this callback: multicast recycle */ + if (PREDICT_FALSE (fl->buffers_added_to_freelist_function != 0)) + { + int j; + + add_buffer_to_free_list + (vm, fl, buffers[i], (b->flags & VLIB_BUFFER_RECYCLE) == 0); + + for (j = 0; j < vec_len (bm->announce_list); j++) + { + if (fl == bm->announce_list[j]) + goto already_announced; + } + vec_add1 (bm->announce_list, fl); + already_announced: + ; + } + else + { + if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_RECYCLE) == 0)) + { + mb = rte_mbuf_from_vlib_buffer (b); + ASSERT (rte_mbuf_refcnt_read (mb) == 1); + rte_pktmbuf_free (mb); + } + } + } + if (vec_len (bm->announce_list)) + { + vlib_buffer_free_list_t *fl; + for (i = 0; i < vec_len (bm->announce_list); i++) + { + fl = bm->announce_list[i]; + fl->buffers_added_to_freelist_function (vm, fl); + } + _vec_len (bm->announce_list) = 0; + } +} + +static void +dpdk_buffer_free (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ + 1); +} + +static void +dpdk_buffer_free_no_next (vlib_main_t * vm, u32 * buffers, u32 n_buffers) +{ + vlib_buffer_free_inline (vm, buffers, n_buffers, /* follow_buffer_next */ + 0); +} + +static void +dpdk_packet_template_init (vlib_main_t * vm, + void *vt, + void *packet_data, + uword n_packet_data_bytes, + uword min_n_buffers_each_physmem_alloc, u8 * name) +{ + vlib_packet_template_t *t = (vlib_packet_template_t *) vt; + + vlib_worker_thread_barrier_sync (vm); + memset (t, 0, sizeof (t[0])); + + vec_add (t->packet_data, packet_data, n_packet_data_bytes); + + vlib_worker_thread_barrier_release (vm); +} + +clib_error_t * +vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, + unsigned socket_id) +{ + dpdk_main_t *dm = &dpdk_main; + vlib_physmem_main_t *vpm = &vm->physmem_main; + struct rte_mempool *rmp; + int i; + + vec_validate_aligned (dm->pktmbuf_pools, socket_id, CLIB_CACHE_LINE_BYTES); + + /* pool already exists, nothing to do */ + if (dm->pktmbuf_pools[socket_id]) + return 0; + + u8 *pool_name = format (0, "mbuf_pool_socket%u%c", socket_id, 0); + + rmp = rte_pktmbuf_pool_create ((char *) pool_name, /* pool name */ + num_mbufs, /* number of mbufs */ + 512, /* cache size */ + VLIB_BUFFER_HDR_SIZE, /* priv size */ + VLIB_BUFFER_PRE_DATA_SIZE + VLIB_BUFFER_DATA_SIZE, /* dataroom size */ + socket_id); /* cpu socket */ + + if (rmp) + { + { + uword this_pool_end; + uword this_pool_start; + uword this_pool_size; + uword save_vpm_start, save_vpm_end, save_vpm_size; + struct rte_mempool_memhdr *memhdr; + + this_pool_start = ~0ULL; + this_pool_end = 0LL; + + STAILQ_FOREACH (memhdr, &rmp->mem_list, next) + { + if (((uword) (memhdr->addr + memhdr->len)) > this_pool_end) + this_pool_end = (uword) (memhdr->addr + memhdr->len); + if (((uword) memhdr->addr) < this_pool_start) + this_pool_start = (uword) (memhdr->addr); + } + ASSERT (this_pool_start < ~0ULL && this_pool_end > 0); + this_pool_size = this_pool_end - this_pool_start; + + if (CLIB_DEBUG > 1) + { + clib_warning ("%s: pool start %llx pool end %llx pool size %lld", + pool_name, this_pool_start, this_pool_end, + this_pool_size); + clib_warning + ("before: virtual.start %llx virtual.end %llx virtual.size %lld", + vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); + } + + save_vpm_start = vpm->virtual.start; + save_vpm_end = vpm->virtual.end; + save_vpm_size = vpm->virtual.size; + + if ((this_pool_start < vpm->virtual.start) || vpm->virtual.start == 0) + vpm->virtual.start = this_pool_start; + if (this_pool_end > vpm->virtual.end) + vpm->virtual.end = this_pool_end; + + vpm->virtual.size = vpm->virtual.end - vpm->virtual.start; + + if (CLIB_DEBUG > 1) + { + clib_warning + ("after: virtual.start %llx virtual.end %llx virtual.size %lld", + vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); + } + + /* check if fits into buffer index range */ + if ((u64) vpm->virtual.size > + ((u64) 1 << (32 + CLIB_LOG2_CACHE_LINE_BYTES))) + { + clib_warning ("physmem: virtual size out of range!"); + vpm->virtual.start = save_vpm_start; + vpm->virtual.end = save_vpm_end; + vpm->virtual.size = save_vpm_size; + rmp = 0; + } + } + if (rmp) + { + dm->pktmbuf_pools[socket_id] = rmp; + vec_free (pool_name); + return 0; + } + } + + vec_free (pool_name); + + /* no usable pool for this socket, try to use pool from another one */ + for (i = 0; i < vec_len (dm->pktmbuf_pools); i++) + { + if (dm->pktmbuf_pools[i]) + { + clib_warning + ("WARNING: Failed to allocate mempool for CPU socket %u. " + "Threads running on socket %u will use socket %u mempool.", + socket_id, socket_id, i); + dm->pktmbuf_pools[socket_id] = dm->pktmbuf_pools[i]; + return 0; + } + } + + return clib_error_return (0, "failed to allocate mempool on socket %u", + socket_id); +} + +#if CLIB_DEBUG > 0 + +u32 *vlib_buffer_state_validation_lock; +uword *vlib_buffer_state_validation_hash; +void *vlib_buffer_state_heap; + +static clib_error_t * +buffer_state_validation_init (vlib_main_t * vm) +{ + void *oldheap; + + vlib_buffer_state_heap = mheap_alloc (0, 10 << 20); + + oldheap = clib_mem_set_heap (vlib_buffer_state_heap); + + vlib_buffer_state_validation_hash = hash_create (0, sizeof (uword)); + vec_validate_aligned (vlib_buffer_state_validation_lock, 0, + CLIB_CACHE_LINE_BYTES); + clib_mem_set_heap (oldheap); + return 0; +} + +VLIB_INIT_FUNCTION (buffer_state_validation_init); +#endif + +static vlib_buffer_callbacks_t callbacks = { + .vlib_buffer_alloc_cb = &dpdk_buffer_alloc, + .vlib_buffer_alloc_from_free_list_cb = &dpdk_buffer_alloc_from_free_list, + .vlib_buffer_free_cb = &dpdk_buffer_free, + .vlib_buffer_free_no_next_cb = &dpdk_buffer_free_no_next, + .vlib_packet_template_init_cb = &dpdk_packet_template_init, + .vlib_buffer_delete_free_list_cb = &dpdk_buffer_delete_free_list, +}; + +static clib_error_t * +dpdk_buffer_init (vlib_main_t * vm) +{ + vlib_buffer_cb_register (vm, &callbacks); + return 0; +} + +VLIB_INIT_FUNCTION (dpdk_buffer_init); + +/** @endcond */ +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/devices/dpdk/cli.c b/src/vnet/devices/dpdk/cli.c index 538a00fd..22bd4b4f 100644 --- a/src/vnet/devices/dpdk/cli.c +++ b/src/vnet/devices/dpdk/cli.c @@ -164,9 +164,9 @@ show_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input, struct rte_mempool *rmp; int i; - for (i = 0; i < vec_len (vm->buffer_main->pktmbuf_pools); i++) + for (i = 0; i < vec_len (dpdk_main.pktmbuf_pools); i++) { - rmp = vm->buffer_main->pktmbuf_pools[i]; + rmp = dpdk_main.pktmbuf_pools[i]; if (rmp) { unsigned count = rte_mempool_avail_count (rmp); diff --git a/src/vnet/devices/dpdk/device.c b/src/vnet/devices/dpdk/device.c index b22fbf2e..0deab6aa 100644 --- a/src/vnet/devices/dpdk/device.c +++ b/src/vnet/devices/dpdk/device.c @@ -87,19 +87,18 @@ dpdk_set_mc_filter (vnet_hw_interface_t * hi, struct rte_mbuf * dpdk_replicate_packet_mb (vlib_buffer_t * b) { - vlib_main_t *vm = vlib_get_main (); - vlib_buffer_main_t *bm = vm->buffer_main; + dpdk_main_t *dm = &dpdk_main; struct rte_mbuf **mbufs = 0, *s, *d; u8 nb_segs; unsigned socket_id = rte_socket_id (); int i; - ASSERT (bm->pktmbuf_pools[socket_id]); + ASSERT (dm->pktmbuf_pools[socket_id]); s = rte_mbuf_from_vlib_buffer (b); nb_segs = s->nb_segs; vec_validate (mbufs, nb_segs - 1); - if (rte_pktmbuf_alloc_bulk (bm->pktmbuf_pools[socket_id], mbufs, nb_segs)) + if (rte_pktmbuf_alloc_bulk (dm->pktmbuf_pools[socket_id], mbufs, nb_segs)) { vec_free (mbufs); return 0; diff --git a/src/vnet/devices/dpdk/dpdk.h b/src/vnet/devices/dpdk/dpdk.h index e0436031..066ec6fa 100644 --- a/src/vnet/devices/dpdk/dpdk.h +++ b/src/vnet/devices/dpdk/dpdk.h @@ -425,6 +425,9 @@ typedef struct vlib_main_t *vlib_main; vnet_main_t *vnet_main; dpdk_config_main_t *conf; + + /* mempool */ + struct rte_mempool **pktmbuf_pools; } dpdk_main_t; dpdk_main_t dpdk_main; diff --git a/src/vnet/devices/dpdk/dpdk_priv.h b/src/vnet/devices/dpdk/dpdk_priv.h index 0c81dbc3..dd40ff48 100644 --- a/src/vnet/devices/dpdk/dpdk_priv.h +++ b/src/vnet/devices/dpdk/dpdk_priv.h @@ -13,6 +13,9 @@ * limitations under the License. */ +#define rte_mbuf_from_vlib_buffer(x) (((struct rte_mbuf *)x) - 1) +#define vlib_buffer_from_rte_mbuf(x) ((vlib_buffer_t *)(x+1)) + #define DPDK_NB_RX_DESC_DEFAULT 1024 #define DPDK_NB_TX_DESC_DEFAULT 1024 #define DPDK_NB_RX_DESC_VIRTIO 256 diff --git a/src/vnet/devices/dpdk/init.c b/src/vnet/devices/dpdk/init.c index 60689463..4c040d20 100755 --- a/src/vnet/devices/dpdk/init.c +++ b/src/vnet/devices/dpdk/init.c @@ -64,8 +64,6 @@ static struct rte_eth_conf port_conf_template = { clib_error_t * dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd) { - vlib_main_t *vm = vlib_get_main (); - vlib_buffer_main_t *bm = vm->buffer_main; int rv; int j; @@ -107,7 +105,7 @@ dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd) rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc, xd->cpu_socket, 0, - bm-> + dm-> pktmbuf_pools[xd->cpu_socket_id_by_queue [j]]); @@ -115,7 +113,7 @@ dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd) if (rv < 0) rv = rte_eth_rx_queue_setup (xd->device_index, j, xd->nb_rx_desc, SOCKET_ID_ANY, 0, - bm-> + dm-> pktmbuf_pools[xd->cpu_socket_id_by_queue [j]]); if (rv < 0) diff --git a/src/vnet/devices/dpdk/thread.c b/src/vnet/devices/dpdk/thread.c new file mode 100644 index 00000000..475dd142 --- /dev/null +++ b/src/vnet/devices/dpdk/thread.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static clib_error_t * +dpdk_launch_thread (void *fp, vlib_worker_thread_t * w, unsigned lcore_id) +{ + int r; + r = rte_eal_remote_launch (fp, (void *) w, lcore_id); + if (r) + return clib_error_return (0, "Failed to launch thread %u", lcore_id); + return 0; +} + +static clib_error_t * +dpdk_thread_set_lcore (u32 thread, u16 lcore) +{ + return 0; +} + +static vlib_thread_callbacks_t callbacks = { + .vlib_launch_thread_cb = &dpdk_launch_thread, + .vlib_thread_set_lcore_cb = &dpdk_thread_set_lcore, +}; + +static clib_error_t * +dpdk_thread_init (vlib_main_t * vm) +{ + vlib_thread_cb_register (vm, &callbacks); + return 0; +} + +VLIB_INIT_FUNCTION (dpdk_thread_init); + +/** @endcond */ +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/sr/sr_replicate.c b/src/vnet/sr/sr_replicate.c index 5f9de504..fa5a68c3 100644 --- a/src/vnet/sr/sr_replicate.c +++ b/src/vnet/sr/sr_replicate.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -142,6 +143,7 @@ static uword sr_replicate_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { + dpdk_main_t *dm = &dpdk_main; u32 n_left_from, *from, *to_next; sr_replicate_next_t next_index; int pkts_replicated = 0; @@ -149,7 +151,6 @@ sr_replicate_node_fn (vlib_main_t * vm, int no_buffer_drops = 0; vlib_buffer_free_list_t *fl; unsigned socket_id = rte_socket_id (); - vlib_buffer_main_t *bm = vm->buffer_main; from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -246,13 +247,13 @@ sr_replicate_node_fn (vlib_main_t * vm, vlib_buffer_t *clone0_c, *clone_b0; t0 = vec_elt_at_index (sm->tunnels, pol0->tunnel_indices[i]); - hdr_mb0 = rte_pktmbuf_alloc (bm->pktmbuf_pools[socket_id]); + hdr_mb0 = rte_pktmbuf_alloc (dm->pktmbuf_pools[socket_id]); if (i < (num_replicas - 1)) { /* Not the last tunnel to process */ clone0 = rte_pktmbuf_clone - (orig_mb0, bm->pktmbuf_pools[socket_id]); + (orig_mb0, dm->pktmbuf_pools[socket_id]); if (clone0 == 0) goto clone_fail; nb_seg = 0; -- cgit 1.2.3-korg From 8a6a3b2ddb2e2929c4697b31746b3f617886f157 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Tue, 17 Jan 2017 14:12:42 +0100 Subject: dpdk: remove duplicate code in buffers.c Change-Id: Idc17b4a32d40012556d5d8550942db0372ebf23d Signed-off-by: Damjan Marion --- src/vlib/buffer.c | 73 +++++++---------------- src/vlib/buffer_funcs.h | 48 +++++++++++++++ src/vnet/devices/dpdk/buffer.c | 131 +++-------------------------------------- 3 files changed, 78 insertions(+), 174 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index 0b0e6054..ea4960e2 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -307,10 +307,10 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm, #define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32)) /* Make sure we have at least given number of unaligned buffers. */ -static void -fill_unaligned (vlib_main_t * vm, - vlib_buffer_free_list_t * free_list, - uword n_unaligned_buffers) +void +vlib_buffer_free_list_fill_unaligned (vlib_main_t * vm, + vlib_buffer_free_list_t * free_list, + uword n_unaligned_buffers) { word la = vec_len (free_list->aligned_buffers); word lu = vec_len (free_list->unaligned_buffers); @@ -333,8 +333,8 @@ fill_unaligned (vlib_main_t * vm, } /* After free aligned buffers may not contain even sized chunks. */ -static void -trim_aligned (vlib_buffer_free_list_t * f) +void +vlib_buffer_free_list_trim_aligned (vlib_buffer_free_list_t * f) { uword l, n_trim; @@ -361,15 +361,15 @@ trim_aligned (vlib_buffer_free_list_t * f) } } -static void -merge_free_lists (vlib_buffer_free_list_t * dst, - vlib_buffer_free_list_t * src) +void +vlib_buffer_merge_free_lists (vlib_buffer_free_list_t * dst, + vlib_buffer_free_list_t * src) { uword l; u32 *d; - trim_aligned (src); - trim_aligned (dst); + vlib_buffer_free_list_trim_aligned (src); + vlib_buffer_free_list_trim_aligned (dst); l = vec_len (src->aligned_buffers); if (l > 0) @@ -388,16 +388,6 @@ merge_free_lists (vlib_buffer_free_list_t * dst, } } -always_inline u32 -vlib_buffer_get_free_list_with_size (vlib_main_t * vm, u32 size) -{ - vlib_buffer_main_t *bm = vm->buffer_main; - - size = vlib_buffer_round_size (size); - uword *p = hash_get (bm->free_list_by_size, size); - return p ? p[0] : ~0; -} - /* Add buffer free list. */ static u32 vlib_buffer_create_free_list_helper (vlib_main_t * vm, @@ -537,8 +527,9 @@ vlib_buffer_delete_free_list_internal (vlib_main_t * vm, u32 free_list_index) merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); if (merge_index != ~0 && merge_index != free_list_index) { - merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool, - merge_index), f); + vlib_buffer_merge_free_lists (pool_elt_at_index + (bm->buffer_free_list_pool, merge_index), + f); } del_free_list (vm, f); @@ -567,7 +558,7 @@ fill_free_list (vlib_main_t * vm, u32 *bi; u32 n_remaining, n_alloc, n_this_chunk; - trim_aligned (fl); + vlib_buffer_free_list_trim_aligned (fl); /* Already have enough free buffers on free list? */ n = min_free_buffers - vec_len (fl->aligned_buffers); @@ -666,7 +657,8 @@ alloc_from_free_list (vlib_main_t * vm, else n_unaligned_end = copy_alignment (dst + n_alloc_buffers); - fill_unaligned (vm, free_list, n_unaligned_start + n_unaligned_end); + vlib_buffer_free_list_fill_unaligned (vm, free_list, + n_unaligned_start + n_unaligned_end); u_len = vec_len (free_list->unaligned_buffers); u_src = free_list->unaligned_buffers + u_len - 1; @@ -779,29 +771,6 @@ vlib_buffer_alloc_from_free_list_internal (vlib_main_t * vm, return alloc_from_free_list (vm, f, buffers, n_buffers); } -always_inline void -add_buffer_to_free_list (vlib_main_t * vm, - vlib_buffer_free_list_t * f, - u32 buffer_index, u8 do_init) -{ - vlib_buffer_t *b; - b = vlib_get_buffer (vm, buffer_index); - if (PREDICT_TRUE (do_init)) - vlib_buffer_init_for_free_list (b, f); - vec_add1_aligned (f->aligned_buffers, buffer_index, - sizeof (vlib_copy_unit_t)); -} - -always_inline vlib_buffer_free_list_t * -buffer_get_free_list (vlib_main_t * vm, vlib_buffer_t * b, u32 * index) -{ - vlib_buffer_main_t *bm = vm->buffer_main; - u32 i; - - *index = i = b->free_list_index; - return pool_elt_at_index (bm->buffer_free_list_pool, i); -} - void * vlib_set_buffer_free_callback (vlib_main_t * vm, void *fp) { @@ -845,7 +814,7 @@ vlib_buffer_free_inline (vlib_main_t * vm, vlib_buffer_t *b0; b0 = vlib_get_buffer (vm, bi0); - fl = buffer_get_free_list (vm, b0, &fi); + fl = vlib_buffer_get_buffer_free_list (vm, b0, &fi); if (fl->buffers_added_to_freelist_function) vec_add1 (announce_list, fl); } @@ -926,7 +895,7 @@ again: fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); fl1 = pool_elt_at_index (bm->buffer_free_list_pool, fi1); - add_buffer_to_free_list (vm, fl0, bi0, free0); + vlib_buffer_add_to_free_list (vm, fl0, bi0, free0); if (PREDICT_FALSE (fl0->buffers_added_to_freelist_function != 0)) { int i; @@ -946,7 +915,7 @@ again: } no_fl1: - add_buffer_to_free_list (vm, fl1, bi1, free1); + vlib_buffer_add_to_free_list (vm, fl1, bi1, free1); /* Possibly change current free list. */ if (fi0 != fi && fi1 != fi) @@ -1003,7 +972,7 @@ again: fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); - add_buffer_to_free_list (vm, fl0, bi0, free0); + vlib_buffer_add_to_free_list (vm, fl0, bi0, free0); if (PREDICT_FALSE (fl0->buffers_added_to_freelist_function != 0)) { int i; diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 15d93c16..543a903c 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -350,6 +350,41 @@ vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) u32 vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes, char *fmt, ...); + +/* After free aligned buffers may not contain even sized chunks. */ +void vlib_buffer_free_list_trim_aligned (vlib_buffer_free_list_t * f); + +/* Merge two free lists */ +void vlib_buffer_merge_free_lists (vlib_buffer_free_list_t * dst, + vlib_buffer_free_list_t * src); + +/* Make sure we have at least given number of unaligned buffers. */ +void vlib_buffer_free_list_fill_unaligned (vlib_main_t * vm, + vlib_buffer_free_list_t * + free_list, + uword n_unaligned_buffers); + +always_inline u32 +vlib_buffer_get_free_list_with_size (vlib_main_t * vm, u32 size) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + size = vlib_buffer_round_size (size); + uword *p = hash_get (bm->free_list_by_size, size); + return p ? p[0] : ~0; +} + +always_inline vlib_buffer_free_list_t * +vlib_buffer_get_buffer_free_list (vlib_main_t * vm, vlib_buffer_t * b, + u32 * index) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + u32 i; + + *index = i = b->free_list_index; + return pool_elt_at_index (bm->buffer_free_list_pool, i); +} + always_inline vlib_buffer_free_list_t * vlib_buffer_get_free_list (vlib_main_t * vm, u32 free_list_index) { @@ -674,6 +709,19 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * _dst, ASSERT (dst->b.total_length_not_including_first_buffer == 0); } +always_inline void +vlib_buffer_add_to_free_list (vlib_main_t * vm, + vlib_buffer_free_list_t * f, + u32 buffer_index, u8 do_init) +{ + vlib_buffer_t *b; + b = vlib_get_buffer (vm, buffer_index); + if (PREDICT_TRUE (do_init)) + vlib_buffer_init_for_free_list (b, f); + vec_add1_aligned (f->aligned_buffers, buffer_index, + sizeof (vlib_copy_unit_t)); +} + always_inline void vlib_buffer_init_two_for_free_list (vlib_buffer_t * _dst0, vlib_buffer_t * _dst1, diff --git a/src/vnet/devices/dpdk/buffer.c b/src/vnet/devices/dpdk/buffer.c index 214a9162..038f46d9 100644 --- a/src/vnet/devices/dpdk/buffer.c +++ b/src/vnet/devices/dpdk/buffer.c @@ -81,98 +81,6 @@ STATIC_ASSERT (VLIB_BUFFER_PRE_DATA_SIZE == RTE_PKTMBUF_HEADROOM, #define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32)) -/* Make sure we have at least given number of unaligned buffers. */ -static void -fill_unaligned (vlib_main_t * vm, - vlib_buffer_free_list_t * free_list, - uword n_unaligned_buffers) -{ - word la = vec_len (free_list->aligned_buffers); - word lu = vec_len (free_list->unaligned_buffers); - - /* Aligned come in aligned copy-sized chunks. */ - ASSERT (la % BUFFERS_PER_COPY == 0); - - ASSERT (la >= n_unaligned_buffers); - - while (lu < n_unaligned_buffers) - { - /* Copy 4 buffers from end of aligned vector to unaligned vector. */ - vec_add (free_list->unaligned_buffers, - free_list->aligned_buffers + la - BUFFERS_PER_COPY, - BUFFERS_PER_COPY); - la -= BUFFERS_PER_COPY; - lu += BUFFERS_PER_COPY; - } - _vec_len (free_list->aligned_buffers) = la; -} - -/* After free aligned buffers may not contain even sized chunks. */ -static void -trim_aligned (vlib_buffer_free_list_t * f) -{ - uword l, n_trim; - - /* Add unaligned to aligned before trim. */ - l = vec_len (f->unaligned_buffers); - if (l > 0) - { - vec_add_aligned (f->aligned_buffers, f->unaligned_buffers, l, - /* align */ sizeof (vlib_copy_unit_t)); - - _vec_len (f->unaligned_buffers) = 0; - } - - /* Remove unaligned buffers from end of aligned vector and save for next trim. */ - l = vec_len (f->aligned_buffers); - n_trim = l % BUFFERS_PER_COPY; - if (n_trim) - { - /* Trim aligned -> unaligned. */ - vec_add (f->unaligned_buffers, f->aligned_buffers + l - n_trim, n_trim); - - /* Remove from aligned. */ - _vec_len (f->aligned_buffers) = l - n_trim; - } -} - -static void -merge_free_lists (vlib_buffer_free_list_t * dst, - vlib_buffer_free_list_t * src) -{ - uword l; - u32 *d; - - trim_aligned (src); - trim_aligned (dst); - - l = vec_len (src->aligned_buffers); - if (l > 0) - { - vec_add2_aligned (dst->aligned_buffers, d, l, - /* align */ sizeof (vlib_copy_unit_t)); - clib_memcpy (d, src->aligned_buffers, l * sizeof (d[0])); - vec_free (src->aligned_buffers); - } - - l = vec_len (src->unaligned_buffers); - if (l > 0) - { - vec_add (dst->unaligned_buffers, src->unaligned_buffers, l); - vec_free (src->unaligned_buffers); - } -} - -always_inline u32 -dpdk_buffer_get_free_list_with_size (vlib_main_t * vm, u32 size) -{ - vlib_buffer_main_t *bm = vm->buffer_main; - - size = vlib_buffer_round_size (size); - uword *p = hash_get (bm->free_list_by_size, size); - return p ? p[0] : ~0; -} - static void del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) { @@ -212,11 +120,12 @@ dpdk_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) f = vlib_buffer_get_free_list (vm, free_list_index); - merge_index = dpdk_buffer_get_free_list_with_size (vm, f->n_data_bytes); + merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); if (merge_index != ~0 && merge_index != free_list_index) { - merge_free_lists (pool_elt_at_index (bm->buffer_free_list_pool, - merge_index), f); + vlib_buffer_merge_free_lists (pool_elt_at_index + (bm->buffer_free_list_pool, merge_index), + f); } del_free_list (vm, f); @@ -253,7 +162,7 @@ fill_free_list (vlib_main_t * vm, if (PREDICT_FALSE (rmp == 0)) return 0; - trim_aligned (fl); + vlib_buffer_free_list_trim_aligned (fl); /* Already have enough free buffers on free list? */ n = min_free_buffers - vec_len (fl->aligned_buffers); @@ -333,7 +242,8 @@ alloc_from_free_list (vlib_main_t * vm, else n_unaligned_end = copy_alignment (dst + n_alloc_buffers); - fill_unaligned (vm, free_list, n_unaligned_start + n_unaligned_end); + vlib_buffer_free_list_fill_unaligned (vm, free_list, + n_unaligned_start + n_unaligned_end); u_len = vec_len (free_list->unaligned_buffers); u_src = free_list->unaligned_buffers + u_len - 1; @@ -442,29 +352,6 @@ dpdk_buffer_alloc_from_free_list (vlib_main_t * vm, return alloc_from_free_list (vm, f, buffers, n_buffers); } -always_inline void -add_buffer_to_free_list (vlib_main_t * vm, - vlib_buffer_free_list_t * f, - u32 buffer_index, u8 do_init) -{ - vlib_buffer_t *b; - b = vlib_get_buffer (vm, buffer_index); - if (PREDICT_TRUE (do_init)) - vlib_buffer_init_for_free_list (b, f); - vec_add1_aligned (f->aligned_buffers, buffer_index, - sizeof (vlib_copy_unit_t)); -} - -always_inline vlib_buffer_free_list_t * -buffer_get_free_list (vlib_main_t * vm, vlib_buffer_t * b, u32 * index) -{ - vlib_buffer_main_t *bm = vm->buffer_main; - u32 i; - - *index = i = b->free_list_index; - return pool_elt_at_index (bm->buffer_free_list_pool, i); -} - static_always_inline void vlib_buffer_free_inline (vlib_main_t * vm, u32 * buffers, u32 n_buffers, u32 follow_buffer_next) @@ -491,14 +378,14 @@ vlib_buffer_free_inline (vlib_main_t * vm, b = vlib_get_buffer (vm, buffers[i]); - fl = buffer_get_free_list (vm, b, &fi); + fl = vlib_buffer_get_buffer_free_list (vm, b, &fi); /* The only current use of this callback: multicast recycle */ if (PREDICT_FALSE (fl->buffers_added_to_freelist_function != 0)) { int j; - add_buffer_to_free_list + vlib_buffer_add_to_free_list (vm, fl, buffers[i], (b->flags & VLIB_BUFFER_RECYCLE) == 0); for (j = 0; j < vec_len (bm->announce_list); j++) -- cgit 1.2.3-korg From fe6bdfd84573cd8813a211f9094ee734f088ce16 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Fri, 20 Jan 2017 19:50:09 -0500 Subject: binary-api debug CLI works with plugins Change-Id: I81f33f5153d5afac94b66b5a8cb91da77463af79 Signed-off-by: Dave Barach --- src/examples/sample-plugin/sample/sample_test.c | 37 +--- src/plugins/acl/acl_test.c | 51 +----- src/plugins/flowperpkt/flowperpkt_test.c | 45 +---- .../export-vxlan-gpe/vxlan_gpe_ioam_export_test.c | 46 +---- src/plugins/ioam/export/ioam_export_test.c | 44 +---- src/plugins/ioam/lib-pot/pot_test.c | 48 +---- src/plugins/ioam/lib-trace/trace_test.c | 48 +---- src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_test.c | 56 +----- src/plugins/lb/lb_test.c | 35 +--- src/plugins/snat/snat_test.c | 57 +----- src/vat/api_format.c | 95 ++++------ src/vat/vat.h | 33 +--- src/vlib/unix/plugin.c | 9 +- src/vlibapi/api_helper_macros.h | 1 - src/vlibapi/vat_helper_macros.h | 76 ++++++++ src/vpp-api-test.am | 3 + src/vpp.am | 6 +- src/vpp/api/api_main.c | 39 ++++ src/vpp/api/plugin.c | 201 +++++++++++++++++++++ src/vpp/api/plugin.h | 61 +++++++ 20 files changed, 476 insertions(+), 515 deletions(-) create mode 100644 src/vlibapi/vat_helper_macros.h create mode 100644 src/vpp/api/plugin.c create mode 100644 src/vpp/api/plugin.h (limited to 'src/vlib') diff --git a/src/examples/sample-plugin/sample/sample_test.c b/src/examples/sample-plugin/sample/sample_test.c index dd1b0215..a47710ee 100644 --- a/src/examples/sample-plugin/sample/sample_test.c +++ b/src/examples/sample-plugin/sample/sample_test.c @@ -23,6 +23,7 @@ #include #include #include +#include uword unformat_sw_if_index (unformat_input_t * input, va_list * args); @@ -87,42 +88,6 @@ foreach_standard_reply_retval_handler; _(SAMPLE_MACSWAP_ENABLE_DISABLE_REPLY, sample_macswap_enable_disable_reply) -/* M: construct, but don't yet send a message */ - -#define M(T,t) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -#define M2(T,t,n) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)+(n)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -/* S: send a message */ -#define S (vl_msg_api_send_shmem (vam->vl_input_queue, (u8 *)&mp)) - -/* W: wait for results, with timeout */ -#define W \ -do { \ - timeout = vat_time_now (vam) + 1.0; \ - \ - while (vat_time_now (vam) < timeout) { \ - if (vam->result_ready == 1) { \ - return (vam->retval); \ - } \ - } \ - return -99; \ -} while(0); - static int api_sample_macswap_enable_disable (vat_main_t * vam) { sample_test_main_t * sm = &sample_test_main; diff --git a/src/plugins/acl/acl_test.c b/src/plugins/acl/acl_test.c index a0e413e1..5cacf716 100644 --- a/src/plugins/acl/acl_test.c +++ b/src/plugins/acl/acl_test.c @@ -25,6 +25,7 @@ #include #include #include +#include uword unformat_sw_if_index (unformat_input_t * input, va_list * args); @@ -259,42 +260,6 @@ _(MACIP_ACL_INTERFACE_ADD_DEL_REPLY, macip_acl_interface_add_del_reply) \ _(MACIP_ACL_INTERFACE_GET_REPLY, macip_acl_interface_get_reply) \ _(ACL_PLUGIN_GET_VERSION_REPLY, acl_plugin_get_version_reply) -/* M: construct, but don't yet send a message */ - -#define M(T,t) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -#define M2(T,t,n) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)+(n)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -/* S: send a message */ -#define S (vl_msg_api_send_shmem (vam->vl_input_queue, (u8 *)&mp)) - -/* W: wait for results, with timeout */ -#define W \ -do { \ - timeout = vat_time_now (vam) + 1.0; \ - \ - while (vat_time_now (vam) < timeout) { \ - if (vam->result_ready == 1) { \ - return (vam->retval); \ - } \ - } \ - return -99; \ -} while(0); - static int api_acl_plugin_get_version (vat_main_t * vam) { acl_test_main_t * sm = &acl_test_main; @@ -520,7 +485,6 @@ static int api_acl_add_replace (vat_main_t * vam) static int api_acl_del (vat_main_t * vam) { - acl_test_main_t * sm = &acl_test_main; unformat_input_t * i = vam->input; f64 timeout; vl_api_acl_del_t * mp; @@ -544,7 +508,6 @@ static int api_acl_del (vat_main_t * vam) static int api_macip_acl_del (vat_main_t * vam) { - acl_test_main_t * sm = &acl_test_main; unformat_input_t * i = vam->input; f64 timeout; vl_api_acl_del_t * mp; @@ -568,7 +531,6 @@ static int api_macip_acl_del (vat_main_t * vam) static int api_acl_interface_add_del (vat_main_t * vam) { - acl_test_main_t * sm = &acl_test_main; unformat_input_t * i = vam->input; f64 timeout; vl_api_acl_interface_add_del_t * mp; @@ -636,7 +598,6 @@ static int api_acl_interface_add_del (vat_main_t * vam) static int api_macip_acl_interface_add_del (vat_main_t * vam) { - acl_test_main_t * sm = &acl_test_main; unformat_input_t * i = vam->input; f64 timeout; vl_api_macip_acl_interface_add_del_t * mp; @@ -687,7 +648,6 @@ static int api_macip_acl_interface_add_del (vat_main_t * vam) static int api_acl_interface_set_acl_list (vat_main_t * vam) { - acl_test_main_t * sm = &acl_test_main; unformat_input_t * i = vam->input; f64 timeout; vl_api_acl_interface_set_acl_list_t * mp; @@ -746,7 +706,6 @@ static int api_acl_interface_set_acl_list (vat_main_t * vam) static int api_acl_interface_list_dump (vat_main_t * vam) { - acl_test_main_t * sm = &acl_test_main; unformat_input_t * i = vam->input; f64 timeout; u32 sw_if_index = ~0; @@ -775,7 +734,6 @@ static int api_acl_interface_list_dump (vat_main_t * vam) static int api_acl_dump (vat_main_t * vam) { - acl_test_main_t * sm = &acl_test_main; unformat_input_t * i = vam->input; f64 timeout; u32 acl_index = ~0; @@ -802,7 +760,6 @@ static int api_acl_dump (vat_main_t * vam) static int api_macip_acl_dump (vat_main_t * vam) { - acl_test_main_t * sm = &acl_test_main; unformat_input_t * i = vam->input; f64 timeout; u32 acl_index = ~0; @@ -978,8 +935,8 @@ _(macip_acl_interface_add_del, " | sw_if_index [add|del] acl msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); if (sm->msg_id_base != (u16) ~0) - vat_api_hookup (vam); + acl_vat_api_hookup (vam); vec_free(name); diff --git a/src/plugins/flowperpkt/flowperpkt_test.c b/src/plugins/flowperpkt/flowperpkt_test.c index 716818ff..9211ebe3 100644 --- a/src/plugins/flowperpkt/flowperpkt_test.c +++ b/src/plugins/flowperpkt/flowperpkt_test.c @@ -19,6 +19,7 @@ #include #include #include +#include /** * @file vpp_api_test plugin @@ -88,47 +89,9 @@ foreach_standard_reply_retval_handler; _(FLOWPERPKT_TX_INTERFACE_ADD_DEL_REPLY, \ flowperpkt_tx_interface_add_del_reply) - -/* M: construct, but don't yet send a message */ - -#define M(T,t) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -#define M2(T,t,n) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)+(n)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -/* S: send a message */ -#define S (vl_msg_api_send_shmem (vam->vl_input_queue, (u8 *)&mp)) - -/* W: wait for results, with timeout */ -#define W \ -do { \ - timeout = vat_time_now (vam) + 1.0; \ - \ - while (vat_time_now (vam) < timeout) { \ - if (vam->result_ready == 1) { \ - return (vam->retval); \ - } \ - } \ - return -99; \ -} while(0); - static int api_flowperpkt_tx_interface_add_del (vat_main_t * vam) { - flowperpkt_test_main_t *sm = &flowperpkt_test_main; unformat_input_t *i = vam->input; f64 timeout; int enable_disable = 1; @@ -177,8 +140,8 @@ api_flowperpkt_tx_interface_add_del (vat_main_t * vam) #define foreach_vpe_api_msg \ _(flowperpkt_tx_interface_add_del, " [disable]") -void -vat_api_hookup (vat_main_t * vam) +static void +flowperpkt_vat_api_hookup (vat_main_t * vam) { flowperpkt_test_main_t *sm = &flowperpkt_test_main; /* Hook up handlers for replies from the data plane plug-in */ @@ -218,7 +181,7 @@ vat_plugin_register (vat_main_t * vam) /* Don't attempt to hook up API messages if the data plane plugin is AWOL */ if (sm->msg_id_base != (u16) ~ 0) - vat_api_hookup (vam); + flowperpkt_vat_api_hookup (vam); vec_free (name); diff --git a/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_test.c b/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_test.c index 494263d9..5b641cc7 100644 --- a/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_test.c +++ b/src/plugins/ioam/export-vxlan-gpe/vxlan_gpe_ioam_export_test.c @@ -23,7 +23,7 @@ #include #include #include - +#include /* Declare message IDs */ #include @@ -86,47 +86,9 @@ foreach_standard_reply_retval_handler; #define foreach_vpe_api_reply_msg \ _(VXLAN_GPE_IOAM_EXPORT_ENABLE_DISABLE_REPLY, vxlan_gpe_ioam_export_enable_disable_reply) - -/* M: construct, but don't yet send a message */ - -#define M(T,t) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -#define M2(T,t,n) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)+(n)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -/* S: send a message */ -#define S (vl_msg_api_send_shmem (vam->vl_input_queue, (u8 *)&mp)) - -/* W: wait for results, with timeout */ -#define W \ -do { \ - timeout = vat_time_now (vam) + 1.0; \ - \ - while (vat_time_now (vam) < timeout) { \ - if (vam->result_ready == 1) { \ - return (vam->retval); \ - } \ - } \ - return -99; \ -} while(0); - static int api_vxlan_gpe_ioam_export_enable_disable (vat_main_t * vam) { - export_test_main_t *sm = &export_test_main; unformat_input_t *i = vam->input; f64 timeout; int is_disable = 0; @@ -160,8 +122,8 @@ api_vxlan_gpe_ioam_export_enable_disable (vat_main_t * vam) #define foreach_vpe_api_msg \ _(vxlan_gpe_ioam_export_enable_disable, " [disable]") -void -vat_api_hookup (vat_main_t * vam) +static void +vxlan_gpe_ioam_vat_api_hookup (vat_main_t * vam) { export_test_main_t *sm = &export_test_main; /* Hook up handlers for replies from the data plane plug-in */ @@ -199,7 +161,7 @@ vat_plugin_register (vat_main_t * vam) sm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); if (sm->msg_id_base != (u16) ~ 0) - vat_api_hookup (vam); + vxlan_gpe_ioam_vat_api_hookup (vam); vec_free (name); diff --git a/src/plugins/ioam/export/ioam_export_test.c b/src/plugins/ioam/export/ioam_export_test.c index f991fc0c..a4ec80d0 100644 --- a/src/plugins/ioam/export/ioam_export_test.c +++ b/src/plugins/ioam/export/ioam_export_test.c @@ -23,6 +23,7 @@ #include #include #include +#include /* Declare message IDs */ @@ -87,46 +88,9 @@ foreach_standard_reply_retval_handler; _(IOAM_EXPORT_IP6_ENABLE_DISABLE_REPLY, ioam_export_ip6_enable_disable_reply) -/* M: construct, but don't yet send a message */ - -#define M(T,t) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -#define M2(T,t,n) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)+(n)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -/* S: send a message */ -#define S (vl_msg_api_send_shmem (vam->vl_input_queue, (u8 *)&mp)) - -/* W: wait for results, with timeout */ -#define W \ -do { \ - timeout = vat_time_now (vam) + 1.0; \ - \ - while (vat_time_now (vam) < timeout) { \ - if (vam->result_ready == 1) { \ - return (vam->retval); \ - } \ - } \ - return -99; \ -} while(0); - static int api_ioam_export_ip6_enable_disable (vat_main_t * vam) { - export_test_main_t *sm = &export_test_main; unformat_input_t *i = vam->input; f64 timeout; int is_disable = 0; @@ -159,8 +123,8 @@ api_ioam_export_ip6_enable_disable (vat_main_t * vam) #define foreach_vpe_api_msg \ _(ioam_export_ip6_enable_disable, " [disable]") -void -vat_api_hookup (vat_main_t * vam) +static void +ioam_export_vat_api_hookup (vat_main_t * vam) { export_test_main_t *sm = &export_test_main; /* Hook up handlers for replies from the data plane plug-in */ @@ -198,7 +162,7 @@ vat_plugin_register (vat_main_t * vam) sm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); if (sm->msg_id_base != (u16) ~ 0) - vat_api_hookup (vam); + ioam_export_vat_api_hookup (vam); vec_free (name); diff --git a/src/plugins/ioam/lib-pot/pot_test.c b/src/plugins/ioam/lib-pot/pot_test.c index 2e870238..9f9d0c99 100644 --- a/src/plugins/ioam/lib-pot/pot_test.c +++ b/src/plugins/ioam/lib-pot/pot_test.c @@ -23,6 +23,7 @@ #include #include #include +#include /* Declare message IDs */ #include @@ -118,48 +119,9 @@ _(POT_PROFILE_ACTIVATE_REPLY, pot_profile_activate_reply) \ _(POT_PROFILE_DEL_REPLY, pot_profile_del_reply) \ _(POT_PROFILE_SHOW_CONFIG_DETAILS, pot_profile_show_config_details) - -/* M: construct, but don't yet send a message */ - -#define M(T,t) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -#define M2(T,t,n) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)+(n)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -/* S: send a message */ -#define S (vl_msg_api_send_shmem (vam->vl_input_queue, (u8 *)&mp)) - -/* W: wait for results, with timeout */ -#define W \ -do { \ - timeout = vat_time_now (vam) + 1.0; \ - \ - while (vat_time_now (vam) < timeout) { \ - if (vam->result_ready == 1) { \ - return (vam->retval); \ - } \ - } \ - return -99; \ -} while(0); - - static int api_pot_profile_add (vat_main_t *vam) { #define MAX_BITS 64 - pot_test_main_t * sm = &pot_test_main; unformat_input_t *input = vam->input; vl_api_pot_profile_add_t *mp; u8 *name = NULL; @@ -234,7 +196,6 @@ OUT: static int api_pot_profile_activate (vat_main_t *vam) { #define MAX_BITS 64 - pot_test_main_t * sm = &pot_test_main; unformat_input_t *input = vam->input; vl_api_pot_profile_activate_t *mp; u8 *name = NULL; @@ -275,7 +236,6 @@ OUT: static int api_pot_profile_del (vat_main_t *vam) { - pot_test_main_t * sm = &pot_test_main; vl_api_pot_profile_del_t *mp; f64 timeout; @@ -287,7 +247,6 @@ static int api_pot_profile_del (vat_main_t *vam) static int api_pot_profile_show_config_dump (vat_main_t *vam) { - pot_test_main_t * sm = &pot_test_main; unformat_input_t *input = vam->input; vl_api_pot_profile_show_config_dump_t *mp; f64 timeout; @@ -320,7 +279,8 @@ _(pot_profile_activate, "name id [0-1] ") \ _(pot_profile_del, "[id ]") \ _(pot_profile_show_config_dump, "id [0-1]") -void vat_api_hookup (vat_main_t *vam) +static void +pot_vat_api_hookup (vat_main_t *vam) { pot_test_main_t * sm = &pot_test_main; /* Hook up handlers for replies from the data plane plug-in */ @@ -357,7 +317,7 @@ clib_error_t * vat_plugin_register (vat_main_t *vam) sm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); if (sm->msg_id_base != (u16) ~0) - vat_api_hookup (vam); + pot_vat_api_hookup (vam); vec_free(name); diff --git a/src/plugins/ioam/lib-trace/trace_test.c b/src/plugins/ioam/lib-trace/trace_test.c index 111dd461..f4f1d4d5 100644 --- a/src/plugins/ioam/lib-trace/trace_test.c +++ b/src/plugins/ioam/lib-trace/trace_test.c @@ -23,6 +23,7 @@ #include #include #include +#include /* Declare message IDs */ #include @@ -115,48 +116,9 @@ _(TRACE_PROFILE_ADD_REPLY, trace_profile_add_reply) \ _(TRACE_PROFILE_DEL_REPLY, trace_profile_del_reply) \ _(TRACE_PROFILE_SHOW_CONFIG_REPLY, trace_profile_show_config_reply) - -/* M: construct, but don't yet send a message */ - -#define M(T,t) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -#define M2(T,t,n) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)+(n)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -/* S: send a message */ -#define S (vl_msg_api_send_shmem (vam->vl_input_queue, (u8 *)&mp)) - -/* W: wait for results, with timeout */ -#define W \ -do { \ - timeout = vat_time_now (vam) + 1.0; \ - \ - while (vat_time_now (vam) < timeout) { \ - if (vam->result_ready == 1) { \ - return (vam->retval); \ - } \ - } \ - return -99; \ -} while(0); - - static int api_trace_profile_add (vat_main_t * vam) { - trace_test_main_t *sm = &trace_test_main; unformat_input_t *input = vam->input; vl_api_trace_profile_add_t *mp; u8 trace_type = 0; @@ -204,7 +166,6 @@ api_trace_profile_add (vat_main_t * vam) static int api_trace_profile_del (vat_main_t * vam) { - trace_test_main_t *sm = &trace_test_main; vl_api_trace_profile_del_t *mp; f64 timeout; @@ -217,7 +178,6 @@ api_trace_profile_del (vat_main_t * vam) static int api_trace_profile_show_config (vat_main_t * vam) { - trace_test_main_t *sm = &trace_test_main; vl_api_trace_profile_show_config_t *mp; f64 timeout; M (TRACE_PROFILE_SHOW_CONFIG, trace_profile_show_config); @@ -237,8 +197,8 @@ _(trace_profile_del, "[id ]") \ _(trace_profile_show_config, "[id ]") -void -vat_api_hookup (vat_main_t * vam) +static void +ioam_trace_vat_api_hookup (vat_main_t * vam) { trace_test_main_t *sm = &trace_test_main; /* Hook up handlers for replies from the data plane plug-in */ @@ -276,7 +236,7 @@ vat_plugin_register (vat_main_t * vam) sm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); if (sm->msg_id_base != (u16) ~ 0) - vat_api_hookup (vam); + ioam_trace_vat_api_hookup (vam); vec_free (name); diff --git a/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_test.c b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_test.c index b5fee724..7c4088ee 100644 --- a/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_test.c +++ b/src/plugins/ioam/lib-vxlan-gpe/vxlan_gpe_test.c @@ -23,6 +23,7 @@ #include #include #include +#include /* Declare message IDs */ #include @@ -96,49 +97,9 @@ _(VXLAN_GPE_IOAM_VNI_DISABLE_REPLY, vxlan_gpe_ioam_vni_disable_reply) \ _(VXLAN_GPE_IOAM_TRANSIT_ENABLE_REPLY, vxlan_gpe_ioam_transit_enable_reply) \ _(VXLAN_GPE_IOAM_TRANSIT_DISABLE_REPLY, vxlan_gpe_ioam_transit_disable_reply) \ - -/* M: construct, but don't yet send a message */ - -#define M(T,t) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -#define M2(T,t,n) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)+(n)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -/* S: send a message */ -#define S (vl_msg_api_send_shmem (vam->vl_input_queue, (u8 *)&mp)) - -/* W: wait for results, with timeout */ -#define W \ -do { \ - timeout = vat_time_now (vam) + 1.0; \ - \ - while (vat_time_now (vam) < timeout) { \ - if (vam->result_ready == 1) { \ - return (vam->retval); \ - } \ - } \ - return -99; \ -} while(0); - - static int api_vxlan_gpe_ioam_enable (vat_main_t * vam) { - vxlan_gpe_test_main_t *sm = &vxlan_gpe_test_main; - unformat_input_t *input = vam->input; vl_api_vxlan_gpe_ioam_enable_t *mp; f64 timeout; @@ -179,7 +140,6 @@ api_vxlan_gpe_ioam_enable (vat_main_t * vam) static int api_vxlan_gpe_ioam_disable (vat_main_t * vam) { - vxlan_gpe_test_main_t *sm = &vxlan_gpe_test_main; vl_api_vxlan_gpe_ioam_disable_t *mp; f64 timeout; @@ -192,8 +152,6 @@ api_vxlan_gpe_ioam_disable (vat_main_t * vam) static int api_vxlan_gpe_ioam_vni_enable (vat_main_t * vam) { - vxlan_gpe_test_main_t *sm = &vxlan_gpe_test_main; - unformat_input_t *line_input = vam->input; vl_api_vxlan_gpe_ioam_vni_enable_t *mp; ip4_address_t local4, remote4; @@ -289,8 +247,6 @@ api_vxlan_gpe_ioam_vni_enable (vat_main_t * vam) static int api_vxlan_gpe_ioam_vni_disable (vat_main_t * vam) { - vxlan_gpe_test_main_t *sm = &vxlan_gpe_test_main; - unformat_input_t *line_input = vam->input; vl_api_vxlan_gpe_ioam_vni_disable_t *mp; ip4_address_t local4, remote4; @@ -386,8 +342,6 @@ api_vxlan_gpe_ioam_vni_disable (vat_main_t * vam) static int api_vxlan_gpe_ioam_transit_enable (vat_main_t * vam) { - vxlan_gpe_test_main_t *sm = &vxlan_gpe_test_main; - unformat_input_t *line_input = vam->input; vl_api_vxlan_gpe_ioam_transit_enable_t *mp; ip4_address_t local4; @@ -458,8 +412,6 @@ api_vxlan_gpe_ioam_transit_enable (vat_main_t * vam) static int api_vxlan_gpe_ioam_transit_disable (vat_main_t * vam) { - vxlan_gpe_test_main_t *sm = &vxlan_gpe_test_main; - unformat_input_t *line_input = vam->input; vl_api_vxlan_gpe_ioam_transit_disable_t *mp; ip4_address_t local4; @@ -545,8 +497,8 @@ _(vxlan_gpe_ioam_transit_disable, ""\ "dst-ip [outer-fib-index ]") \ -void -vat_api_hookup (vat_main_t * vam) +static void +vxlan_gpe_vat_api_hookup (vat_main_t * vam) { vxlan_gpe_test_main_t *sm = &vxlan_gpe_test_main; /* Hook up handlers for replies from the data plane plug-in */ @@ -584,7 +536,7 @@ vat_plugin_register (vat_main_t * vam) sm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); if (sm->msg_id_base != (u16) ~ 0) - vat_api_hookup (vam); + vxlan_gpe_vat_api_hookup (vam); vec_free (name); diff --git a/src/plugins/lb/lb_test.c b/src/plugins/lb/lb_test.c index 8c2eaa91..7e3519a8 100644 --- a/src/plugins/lb/lb_test.c +++ b/src/plugins/lb/lb_test.c @@ -19,6 +19,7 @@ #include #include #include +#include //TODO: Move that to vat/plugin_api.c ////////////////////////// @@ -127,35 +128,8 @@ foreach_standard_reply_retval_handler; _(LB_ADD_DEL_VIP_REPLY, lb_add_del_vip_reply) \ _(LB_ADD_DEL_AS_REPLY, lb_add_del_as_reply) -/* M: construct, but don't yet send a message */ -#define M(T,t) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)); \ - memcpy (mp, &mps, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + lbtm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -/* S: send a message */ -#define S (vl_msg_api_send_shmem (vam->vl_input_queue, (u8 *)&mp)) - -/* W: wait for results, with timeout */ -#define W \ -do { \ - timeout = vat_time_now (vam) + 1.0; \ - \ - while (vat_time_now (vam) < timeout) { \ - if (vam->result_ready == 1) { \ - return (vam->retval); \ - } \ - } \ - return -99; \ -} while(0); - static int api_lb_conf (vat_main_t * vam) { - lb_test_main_t *lbtm = &lb_test_main; unformat_input_t *i = vam->input; f64 timeout; vl_api_lb_conf_t mps, *mp; @@ -177,7 +151,6 @@ static int api_lb_conf (vat_main_t * vam) static int api_lb_add_del_vip (vat_main_t * vam) { - lb_test_main_t *lbtm = &lb_test_main; unformat_input_t * i = vam->input; f64 timeout; vl_api_lb_add_del_vip_t mps, *mp; @@ -215,7 +188,6 @@ static int api_lb_add_del_vip (vat_main_t * vam) static int api_lb_add_del_as (vat_main_t * vam) { - lb_test_main_t *lbtm = &lb_test_main; unformat_input_t * i = vam->input; f64 timeout; vl_api_lb_add_del_as_t mps, *mp; @@ -246,7 +218,8 @@ _(lb_conf, " [gre4|gre6] [del]") \ _(lb_add_del_as, "
[del]") -void vat_api_hookup (vat_main_t *vam) +static void +lb_vat_api_hookup (vat_main_t *vam) { lb_test_main_t * lbtm = &lb_test_main; /* Hook up handlers for replies from the data plane plug-in */ @@ -285,7 +258,7 @@ clib_error_t * vat_plugin_register (vat_main_t *vam) lbtm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); if (lbtm->msg_id_base != (u16) ~0) - vat_api_hookup (vam); + lb_vat_api_hookup (vam); vec_free(name); diff --git a/src/plugins/snat/snat_test.c b/src/plugins/snat/snat_test.c index 013d7d9b..b601d7d9 100644 --- a/src/plugins/snat/snat_test.c +++ b/src/plugins/snat/snat_test.c @@ -21,6 +21,7 @@ #include #include #include +#include uword unformat_sw_if_index (unformat_input_t * input, va_list * args); @@ -103,44 +104,8 @@ _(SNAT_INTERFACE_ADDR_DETAILS, snat_interface_addr_details) \ _(SNAT_IPFIX_ENABLE_DISABLE_REPLY, \ snat_ipfix_enable_disable_reply) -/* M: construct, but don't yet send a message */ -#define M(T,t) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -#define M2(T,t,n) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc(sizeof(*mp)+(n)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T + sm->msg_id_base); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -/* S: send a message */ -#define S (vl_msg_api_send_shmem (vam->vl_input_queue, (u8 *)&mp)) - -/* W: wait for results, with timeout */ -#define W \ -do { \ - timeout = vat_time_now (vam) + 1.0; \ - \ - while (vat_time_now (vam) < timeout) { \ - if (vam->result_ready == 1) { \ - return (vam->retval); \ - } \ - } \ - return -99; \ -} while(0); - static int api_snat_add_address_range (vat_main_t * vam) { - snat_test_main_t * sm = &snat_test_main; unformat_input_t * i = vam->input; f64 timeout; ip4_address_t start_addr, end_addr; @@ -200,7 +165,6 @@ static int api_snat_add_address_range (vat_main_t * vam) static int api_snat_interface_add_del_feature (vat_main_t * vam) { - snat_test_main_t * sm = &snat_test_main; unformat_input_t * i = vam->input; f64 timeout; vl_api_snat_interface_add_del_feature_t * mp; @@ -246,7 +210,6 @@ static int api_snat_interface_add_del_feature (vat_main_t * vam) static int api_snat_add_static_mapping(vat_main_t * vam) { - snat_test_main_t * sm = &snat_test_main; unformat_input_t * i = vam->input; f64 timeout; vl_api_snat_add_static_mapping_t * mp; @@ -338,7 +301,6 @@ static void vl_api_snat_static_mapping_details_t_handler static int api_snat_static_mapping_dump(vat_main_t * vam) { - snat_test_main_t * sm = &snat_test_main; f64 timeout; vl_api_snat_static_mapping_dump_t * mp; @@ -398,7 +360,6 @@ static void vl_api_snat_show_config_reply_t_handler static int api_snat_show_config(vat_main_t * vam) { - snat_test_main_t * sm = &snat_test_main; f64 timeout; vl_api_snat_show_config_t * mp; @@ -425,7 +386,6 @@ static void vl_api_snat_address_details_t_handler static int api_snat_address_dump(vat_main_t * vam) { - snat_test_main_t * sm = &snat_test_main; f64 timeout; vl_api_snat_address_dump_t * mp; @@ -460,7 +420,6 @@ static void vl_api_snat_interface_details_t_handler static int api_snat_interface_dump(vat_main_t * vam) { - snat_test_main_t * sm = &snat_test_main; f64 timeout; vl_api_snat_interface_dump_t * mp; @@ -485,7 +444,6 @@ static int api_snat_interface_dump(vat_main_t * vam) static int api_snat_set_workers (vat_main_t * vam) { - snat_test_main_t * sm = &snat_test_main; unformat_input_t * i = vam->input; f64 timeout; vl_api_snat_set_workers_t * mp; @@ -523,7 +481,6 @@ static void vl_api_snat_worker_details_t_handler static int api_snat_worker_dump(vat_main_t * vam) { - snat_test_main_t * sm = &snat_test_main; f64 timeout; vl_api_snat_worker_dump_t * mp; @@ -548,7 +505,6 @@ static int api_snat_worker_dump(vat_main_t * vam) static int api_snat_ipfix_enable_disable (vat_main_t * vam) { - snat_test_main_t * sm = &snat_test_main; unformat_input_t * i = vam->input; f64 timeout; vl_api_snat_add_del_interface_addr_t * mp; @@ -597,7 +553,6 @@ static void vl_api_snat_interface_addr_details_t_handler static int api_snat_interface_addr_dump(vat_main_t * vam) { - snat_test_main_t * sm = &snat_test_main; f64 timeout; vl_api_snat_interface_addr_dump_t * mp; @@ -622,7 +577,6 @@ static int api_snat_interface_addr_dump(vat_main_t * vam) static int api_snat_add_del_interface_addr (vat_main_t * vam) { - snat_test_main_t * sm = &snat_test_main; unformat_input_t * i = vam->input; f64 timeout; vl_api_snat_ipfix_enable_disable_t * mp; @@ -677,7 +631,8 @@ _(snat_interface_addr_dump, "") \ _(snat_ipfix_enable_disable, "[domain ] [src_port ] " \ "[disable]") -void vat_api_hookup (vat_main_t *vam) +static void +snat_vat_api_hookup (vat_main_t *vam) { snat_test_main_t * sm __attribute__((unused)) = &snat_test_main; /* Hook up handlers for replies from the data plane plug-in */ @@ -693,7 +648,9 @@ void vat_api_hookup (vat_main_t *vam) #undef _ /* API messages we can send */ -#define _(n,h) hash_set_mem (vam->function_by_name, #n, api_##n); +#define _(n,h) \ + hash_set_mem (vam->function_by_name, #n, api_##n); \ + clib_warning ("vam %llx add '%s' handler %llx", vam, #n, api_##n); foreach_vpe_api_msg; #undef _ @@ -715,7 +672,7 @@ clib_error_t * vat_plugin_register (vat_main_t *vam) sm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); if (sm->msg_id_base != (u16) ~0) - vat_api_hookup (vam); + snat_vat_api_hookup (vam); vec_free(name); diff --git a/src/vat/api_format.c b/src/vat/api_format.c index 1babaf40..653cf79f 100644 --- a/src/vat/api_format.c +++ b/src/vat/api_format.c @@ -70,6 +70,46 @@ #include #undef vl_printfun +#include + +f64 +vat_time_now (vat_main_t * vam) +{ +#if VPP_API_TEST_BUILTIN + return vlib_time_now (vam->vlib_main); +#else + return clib_time_now (&vam->clib_time); +#endif +} + +void +errmsg (char *fmt, ...) +{ + vat_main_t *vam = &vat_main; + va_list va; + u8 *s; + + va_start (va, fmt); + s = va_format (0, fmt, &va); + va_end (va); + + vec_add1 (s, 0); + +#if VPP_API_TEST_BUILTIN + vlib_cli_output (vam->vlib_main, (char *) s); +#else + { + if (vam->ifp != stdin) + fformat (vam->ofp, "%s(%d): \n", vam->current_file, + vam->input_line_number); + fformat (vam->ofp, (char *) s); + fflush (vam->ofp); + } +#endif + + vec_free (s); +} + static uword api_unformat_sw_if_index (unformat_input_t * input, va_list * args) { @@ -88,8 +128,6 @@ api_unformat_sw_if_index (unformat_input_t * input, va_list * args) return 1; } -void vat_suspend (vlib_main_t * vm, f64 interval); - #if VPP_API_TEST_BUILTIN == 0 /* Parse an IP4 address %d.%d.%d.%d. */ uword @@ -3867,59 +3905,6 @@ _(SW_INTERFACE_SET_DPDK_HQOS_TCTBL_REPLY, \ sw_interface_set_dpdk_hqos_tctbl_reply) #endif -/* M: construct, but don't yet send a message */ - -#define M(T,t) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc_as_if_client(sizeof(*mp)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T); \ - mp->client_index = vam->my_client_index; \ -} while(0); - -#define M2(T,t,n) \ -do { \ - vam->result_ready = 0; \ - mp = vl_msg_api_alloc_as_if_client(sizeof(*mp)+(n)); \ - memset (mp, 0, sizeof (*mp)); \ - mp->_vl_msg_id = ntohs (VL_API_##T); \ - mp->client_index = vam->my_client_index; \ -} while(0); - - -/* S: send a message */ -#define S (vl_msg_api_send_shmem (vam->vl_input_queue, (u8 *)&mp)) - -/* W: wait for results, with timeout */ -#define W \ -do { \ - timeout = vat_time_now (vam) + 1.0; \ - \ - while (vat_time_now (vam) < timeout) { \ - if (vam->result_ready == 1) { \ - return (vam->retval); \ - } \ - vat_suspend (vam->vlib_main, 1e-3); \ - } \ - return -99; \ -} while(0); - -/* W2: wait for results, with timeout */ -#define W2(body) \ -do { \ - timeout = vat_time_now (vam) + 1.0; \ - \ - while (vat_time_now (vam) < timeout) { \ - if (vam->result_ready == 1) { \ - (body); \ - return (vam->retval); \ - } \ - vat_suspend (vam->vlib_main, 1e-3); \ - } \ - return -99; \ -} while(0); - typedef struct { u8 *name; diff --git a/src/vat/vat.h b/src/vat/vat.h index 64be2f7f..3d7d96ae 100644 --- a/src/vat/vat.h +++ b/src/vat/vat.h @@ -190,36 +190,11 @@ typedef struct vlib_main_t *vlib_main; } vat_main_t; -vat_main_t vat_main; - -static inline f64 -vat_time_now (vat_main_t * vam) -{ -#if VPP_API_TEST_BUILTIN - return vlib_time_now (vam->vlib_main); -#else - return clib_time_now (&vam->clib_time); -#endif -} - -#if VPP_API_TEST_BUILTIN -#define errmsg(fmt,args...) \ -do { \ - vat_main_t *__vam = &vat_main; \ - vlib_cli_output (__vam->vlib_main, fmt, ##args); \ - } while(0); -#else -#define errmsg(fmt,args...) \ -do { \ - vat_main_t *__vam = &vat_main; \ - if(__vam->ifp != stdin) \ - fformat(__vam->ofp,"%s(%d): \n", __vam->current_file, \ - __vam->input_line_number); \ - fformat(__vam->ofp, fmt "\n", ##args); \ - fflush(__vam->ofp); \ -} while(0); -#endif +extern vat_main_t vat_main; +void vat_suspend (vlib_main_t * vm, f64 interval); +f64 vat_time_now (vat_main_t * vam); +void errmsg (char *fmt, ...); void vat_api_hookup (vat_main_t * vam); int api_sw_interface_dump (vat_main_t * vam); void do_one_file (vat_main_t * vam); diff --git a/src/vlib/unix/plugin.c b/src/vlib/unix/plugin.c index 987f7d05..40399090 100644 --- a/src/vlib/unix/plugin.c +++ b/src/vlib/unix/plugin.c @@ -264,10 +264,15 @@ vlib_plugins_show_cmd_fn (vlib_main_t * vm, return 0; } +/* *INDENT-OFF* */ VLIB_CLI_COMMAND (plugins_show_cmd, static) = { -.path = "show plugins",.short_help = "show loaded plugins",.function = - vlib_plugins_show_cmd_fn,}; + .path = "show plugins", + .short_help = "show loaded plugins", + .function = vlib_plugins_show_cmd_fn, +}; +/* *INDENT-ON* */ + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vlibapi/api_helper_macros.h b/src/vlibapi/api_helper_macros.h index 16f34cfc..7f94e446 100644 --- a/src/vlibapi/api_helper_macros.h +++ b/src/vlibapi/api_helper_macros.h @@ -17,7 +17,6 @@ *------------------------------------------------------------------ */ - #ifndef __api_helper_macros_h__ #define __api_helper_macros_h__ diff --git a/src/vlibapi/vat_helper_macros.h b/src/vlibapi/vat_helper_macros.h new file mode 100644 index 00000000..7199364a --- /dev/null +++ b/src/vlibapi/vat_helper_macros.h @@ -0,0 +1,76 @@ +/* + *------------------------------------------------------------------ + * vat_helper_macros.h - collect api client helper macros in one place + * + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *------------------------------------------------------------------ + */ +#ifndef __vat_helper_macros_h__ +#define __vat_helper_macros_h__ + +/* M: construct, but don't yet send a message */ + +#define M(T,t) \ +do { \ + vam->result_ready = 0; \ + mp = vl_msg_api_alloc_as_if_client(sizeof(*mp)); \ + memset (mp, 0, sizeof (*mp)); \ + mp->_vl_msg_id = ntohs (VL_API_##T); \ + mp->client_index = vam->my_client_index; \ +} while(0); + +#define M2(T,t,n) \ +do { \ + vam->result_ready = 0; \ + mp = vl_msg_api_alloc_as_if_client(sizeof(*mp)+(n)); \ + memset (mp, 0, sizeof (*mp)); \ + mp->_vl_msg_id = ntohs (VL_API_##T); \ + mp->client_index = vam->my_client_index; \ +} while(0); + + +/* S: send a message */ +#define S (vl_msg_api_send_shmem (vam->vl_input_queue, (u8 *)&mp)) + +/* W: wait for results, with timeout */ +#define W \ +do { \ + timeout = vat_time_now (vam) + 1.0; \ + \ + while (vat_time_now (vam) < timeout) { \ + if (vam->result_ready == 1) { \ + return (vam->retval); \ + } \ + vat_suspend (vam->vlib_main, 1e-5); \ + } \ + return -99; \ +} while(0); + +/* W2: wait for results, with timeout */ +#define W2(body) \ +do { \ + timeout = vat_time_now (vam) + 1.0; \ + \ + while (vat_time_now (vam) < timeout) { \ + if (vam->result_ready == 1) { \ + (body); \ + return (vam->retval); \ + } \ + vat_suspend (vam->vlib_main, 1e-5); \ + } \ + return -99; \ +} while(0); + + +#endif /* __vat_helper_macros_h__ */ diff --git a/src/vpp-api-test.am b/src/vpp-api-test.am index 32610056..f0d5df62 100644 --- a/src/vpp-api-test.am +++ b/src/vpp-api-test.am @@ -44,7 +44,10 @@ vpp_api_test_LDADD = \ libvnet.la \ -lpthread -lm -lrt -ldl -lcrypto +vpp_api_test_LDFLAGS = -Wl,--export-dynamic + vpp_json_test_LDADD = libvppinfra.la -lm +vpp_json_test_LDFLAGS = -Wl,--export-dynamic nobase_include_HEADERS += \ vat/vat.h \ diff --git a/src/vpp.am b/src/vpp.am index 425f1e32..0b605ec5 100644 --- a/src/vpp.am +++ b/src/vpp.am @@ -28,7 +28,9 @@ bin_vpp_SOURCES += \ if WITH_APICLI bin_vpp_SOURCES += \ vpp/api/api_format.c \ - vpp/api/api_main.c + vpp/api/api_main.c \ + vpp/api/plugin.c \ + vpp/api/plugin.h endif # comment out to disable stats upload to gmond @@ -78,6 +80,8 @@ bin_vpp_LDADD = \ libvppinfra.la \ -lrt -lm -lpthread -ldl +bin_vpp_LDFLAGS = -Wl,--export-dynamic + if ENABLE_TESTS noinst_PROGRAMS += bin/test_client diff --git a/src/vpp/api/api_main.c b/src/vpp/api/api_main.c index fd6998f4..7bf7e8b7 100644 --- a/src/vpp/api/api_main.c +++ b/src/vpp/api/api_main.c @@ -42,11 +42,17 @@ static clib_error_t * api_main_init (vlib_main_t * vm) { vat_main_t *vam = &vat_main; + int rv; + int vat_plugin_init (vat_main_t * vam); vam->vlib_main = vm; vam->my_client_index = (u32) ~ 0; init_error_string_table (vam); vat_api_hookup (vam); + clib_warning ("vam %llx", vam); + rv = vat_plugin_init (vam); + if (rv) + clib_warning ("vat_plugin_init returned %d", rv); return 0; } @@ -183,6 +189,39 @@ api_cli_output (void *notused, const char *fmt, ...) vec_free (s); } +u16 +vl_client_get_first_plugin_msg_id (char *plugin_name) +{ + api_main_t *am = &api_main; + vl_api_msg_range_t *rp; + uword *p; + + p = hash_get_mem (am->msg_range_by_name, plugin_name); + if (p == 0) + return ~0; + + rp = vec_elt_at_index (am->msg_ranges, p[0]); + + return (rp->first_msg_id); +} + +uword +unformat_sw_if_index (unformat_input_t * input, va_list * args) +{ + u32 *result = va_arg (*args, u32 *); + vnet_main_t *vnm = vnet_get_main (); + u32 sw_if_index = ~0; + u8 *if_name; + uword *p; + + if (unformat (input, "%U", unformat_vnet_sw_interface, vnm, &sw_if_index)) + { + *result = sw_if_index; + return 1; + } + return 0; +} + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vpp/api/plugin.c b/src/vpp/api/plugin.c new file mode 100644 index 00000000..c1cc928c --- /dev/null +++ b/src/vpp/api/plugin.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * plugin.c: plugin handling + */ + +#include +#include +#include +#include + +plugin_main_t vat_plugin_main; + +static int +load_one_plugin (plugin_main_t * pm, plugin_info_t * pi) +{ + void *handle, *register_handle; + clib_error_t *(*fp) (vat_main_t *); + clib_error_t *error; + + handle = dlopen ((char *) pi->name, RTLD_LAZY); + + /* + * Note: this can happen if the plugin has an undefined symbol reference, + * so print a warning. Otherwise, the poor slob won't know what happened. + * Ask me how I know that... + */ + if (handle == 0) + { + clib_warning ("%s", dlerror ()); + return -1; + } + + pi->handle = handle; + + register_handle = dlsym (pi->handle, "vat_plugin_register"); + if (register_handle == 0) + return 0; + + fp = register_handle; + + error = (*fp) (pm->vat_main); + + if (error) + { + clib_error_report (error); + dlclose (handle); + return 1; + } + + clib_warning ("Loaded plugin: %s", pi->name); + + return 0; +} + +static u8 ** +split_plugin_path (plugin_main_t * pm) +{ + int i; + u8 **rv = 0; + u8 *path = pm->plugin_path; + u8 *this = 0; + + for (i = 0; i < vec_len (pm->plugin_path); i++) + { + if (path[i] != ':') + { + vec_add1 (this, path[i]); + continue; + } + vec_add1 (this, 0); + vec_add1 (rv, this); + this = 0; + } + if (this) + { + vec_add1 (this, 0); + vec_add1 (rv, this); + } + return rv; +} + +int +vat_load_new_plugins (plugin_main_t * pm) +{ + DIR *dp; + struct dirent *entry; + struct stat statb; + uword *p; + plugin_info_t *pi; + u8 **plugin_path; + int i; + + plugin_path = split_plugin_path (pm); + + for (i = 0; i < vec_len (plugin_path); i++) + { + dp = opendir ((char *) plugin_path[i]); + + if (dp == 0) + continue; + + while ((entry = readdir (dp))) + { + u8 *plugin_name; + + if (pm->plugin_name_filter) + { + int j; + for (j = 0; j < vec_len (pm->plugin_name_filter); j++) + if (entry->d_name[j] != pm->plugin_name_filter[j]) + goto next; + } + + plugin_name = format (0, "%s/%s%c", plugin_path[i], + entry->d_name, 0); + + /* unreadable */ + if (stat ((char *) plugin_name, &statb) < 0) + { + ignore: + vec_free (plugin_name); + continue; + } + + /* a dir or other things which aren't plugins */ + if (!S_ISREG (statb.st_mode)) + goto ignore; + + p = hash_get_mem (pm->plugin_by_name_hash, plugin_name); + if (p == 0) + { + vec_add2 (pm->plugin_info, pi, 1); + pi->name = plugin_name; + pi->file_info = statb; + + if (load_one_plugin (pm, pi)) + { + vec_free (plugin_name); + _vec_len (pm->plugin_info) = vec_len (pm->plugin_info) - 1; + continue; + } + memset (pi, 0, sizeof (*pi)); + hash_set_mem (pm->plugin_by_name_hash, plugin_name, + pi - pm->plugin_info); + } + next: + ; + } + closedir (dp); + vec_free (plugin_path[i]); + } + vec_free (plugin_path); + return 0; +} + +#define QUOTE_(x) #x +#define QUOTE(x) QUOTE_(x) + +/* + * Load plugins from /usr/lib/vpp_api_test_plugins by default + */ +char *vat_plugin_path = "/usr/lib/vpp_api_test_plugins"; + +char *vat_plugin_name_filter = 0; + +int +vat_plugin_init (vat_main_t * vam) +{ + plugin_main_t *pm = &vat_plugin_main; + + + pm->plugin_path = format (0, "%s%c", vat_plugin_path, 0); + if (vat_plugin_name_filter) + pm->plugin_name_filter = format (0, "%s%c", vat_plugin_name_filter, 0); + + pm->plugin_by_name_hash = hash_create_string (0, sizeof (uword)); + pm->vat_main = vam; + + return vat_load_new_plugins (pm); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vpp/api/plugin.h b/src/vpp/api/plugin.h new file mode 100644 index 00000000..559ec52f --- /dev/null +++ b/src/vpp/api/plugin.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * plugin.h: plugin handling + */ + +#ifndef __included_plugin_h__ +#define __included_plugin_h__ + +#include +#include +#include + +typedef struct +{ + u8 *name; + struct stat file_info; + void *handle; +} plugin_info_t; + +typedef struct +{ + /* loaded plugin info */ + plugin_info_t *plugin_info; + uword *plugin_by_name_hash; + + /* path and name filter */ + u8 *plugin_path; + u8 *plugin_name_filter; + + /* convenience */ + vat_main_t *vat_main; + +} plugin_main_t; + +plugin_main_t vat_plugin_main; + +int vat_plugin_init (vat_main_t * vam); +int vat_load_new_plugins (plugin_main_t * pm); + +#endif + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ -- cgit 1.2.3-korg From 044183faacc5eb8a055658f9deebefb56b254adc Mon Sep 17 00:00:00 2001 From: Neale Ranns Date: Tue, 24 Jan 2017 01:34:25 -0800 Subject: [re]Enable per-Adjacency/neighbour counters Change-Id: I953b3888bbc6d8a5f53f684a5edc8742b382f323 Signed-off-by: Neale Ranns --- src/scripts/vnet/ip6 | 18 +- src/vat/api_format.c | 124 ++++++++++++++ src/vat/vat.h | 18 ++ src/vlib/counter.h | 14 ++ src/vnet/adj/adj.c | 4 + src/vnet/adj/adj_l2.c | 10 +- src/vnet/ip/ip4_forward.c | 42 +++-- src/vnet/ip/ip6_forward.c | 24 +-- src/vnet/mpls/mpls_output.c | 37 ++-- src/vpp/api/vpe.api | 41 +++++ src/vpp/stats/stats.c | 405 +++++++++++++++++++++++++++++++++++++++++++- 11 files changed, 675 insertions(+), 62 deletions(-) (limited to 'src/vlib') diff --git a/src/scripts/vnet/ip6 b/src/scripts/vnet/ip6 index 4f9f3ee5..adb27225 100644 --- a/src/scripts/vnet/ip6 +++ b/src/scripts/vnet/ip6 @@ -6,10 +6,24 @@ packet-generator new { no-recycle data { IP6: 1.2.3 -> 4.5.6 - ICMP: ::1 -> ::2 + ICMP: 3002::2 -> 3001::2 ICMP echo_request incrementing 100 } } -tr add pg-input 100 + +loop create +loop create +set int state loop0 up +set int state loop1 up + +set int ip address loop0 2001:1::1/64 +set int ip address loop1 2001:2::1/64 + +set ip6 neighbor loop0 2001:1::2 00:00:DD:EE:AA:DD +set ip6 neighbor loop1 2001:2::2 00:00:DD:EE:AA:EE + +ip route add 3001::/64 via 2001:2::2 loop1 + +trace add pg-input 100 diff --git a/src/vat/api_format.c b/src/vat/api_format.c index 653cf79f..839bcdaa 100644 --- a/src/vat/api_format.c +++ b/src/vat/api_format.c @@ -2042,6 +2042,42 @@ static void vl_api_vnet_ip4_fib_counters_t_handler_json } } +static void vl_api_vnet_ip4_nbr_counters_t_handler + (vl_api_vnet_ip4_nbr_counters_t * mp) +{ + /* not supported */ +} + +static void vl_api_vnet_ip4_nbr_counters_t_handler_json + (vl_api_vnet_ip4_nbr_counters_t * mp) +{ + vat_main_t *vam = &vat_main; + vl_api_ip4_nbr_counter_t *v; + ip4_nbr_counter_t *counter; + u32 sw_if_index; + u32 count; + int i; + + sw_if_index = ntohl (mp->sw_if_index); + count = ntohl (mp->count); + vec_validate (vam->ip4_nbr_counters, sw_if_index); + + if (mp->begin) + vec_free (vam->ip4_nbr_counters[sw_if_index]); + + v = (vl_api_ip4_nbr_counter_t *) & mp->c; + for (i = 0; i < count; i++) + { + vec_validate (vam->ip4_nbr_counters[sw_if_index], i); + counter = &vam->ip4_nbr_counters[sw_if_index][i]; + counter->address.s_addr = v->address; + counter->packets = clib_net_to_host_u64 (v->packets); + counter->bytes = clib_net_to_host_u64 (v->bytes); + counter->linkt = v->link_type; + v++; + } +} + static void vl_api_vnet_ip6_fib_counters_t_handler (vl_api_vnet_ip6_fib_counters_t * mp) { @@ -2087,6 +2123,43 @@ static void vl_api_vnet_ip6_fib_counters_t_handler_json } } +static void vl_api_vnet_ip6_nbr_counters_t_handler + (vl_api_vnet_ip6_nbr_counters_t * mp) +{ + /* not supported */ +} + +static void vl_api_vnet_ip6_nbr_counters_t_handler_json + (vl_api_vnet_ip6_nbr_counters_t * mp) +{ + vat_main_t *vam = &vat_main; + vl_api_ip6_nbr_counter_t *v; + ip6_nbr_counter_t *counter; + struct in6_addr ip6; + u32 sw_if_index; + u32 count; + int i; + + sw_if_index = ntohl (mp->sw_if_index); + count = ntohl (mp->count); + vec_validate (vam->ip6_nbr_counters, sw_if_index); + + if (mp->begin) + vec_free (vam->ip6_nbr_counters[sw_if_index]); + + v = (vl_api_ip6_nbr_counter_t *) & mp->c; + for (i = 0; i < count; i++) + { + vec_validate (vam->ip6_nbr_counters[sw_if_index], i); + counter = &vam->ip6_nbr_counters[sw_if_index][i]; + clib_memcpy (&ip6, &v->address, sizeof (ip6)); + counter->address = ip6; + counter->packets = clib_net_to_host_u64 (v->packets); + counter->bytes = clib_net_to_host_u64 (v->bytes); + v++; + } +} + static void vl_api_get_first_msg_id_reply_t_handler (vl_api_get_first_msg_id_reply_t * mp) { @@ -3490,6 +3563,10 @@ static void vl_api_flow_classify_details_t_handler_json #define vl_api_vnet_ip4_fib_counters_t_print vl_noop_handler #define vl_api_vnet_ip6_fib_counters_t_endian vl_noop_handler #define vl_api_vnet_ip6_fib_counters_t_print vl_noop_handler +#define vl_api_vnet_ip4_nbr_counters_t_endian vl_noop_handler +#define vl_api_vnet_ip4_nbr_counters_t_print vl_noop_handler +#define vl_api_vnet_ip6_nbr_counters_t_endian vl_noop_handler +#define vl_api_vnet_ip6_nbr_counters_t_print vl_noop_handler #define vl_api_lisp_adjacencies_get_reply_t_endian vl_noop_handler #define vl_api_lisp_adjacencies_get_reply_t_print vl_noop_handler @@ -3799,6 +3876,8 @@ _(DHCP_COMPL_EVENT, dhcp_compl_event) \ _(VNET_INTERFACE_COUNTERS, vnet_interface_counters) \ _(VNET_IP4_FIB_COUNTERS, vnet_ip4_fib_counters) \ _(VNET_IP6_FIB_COUNTERS, vnet_ip6_fib_counters) \ +_(VNET_IP4_NBR_COUNTERS, vnet_ip4_nbr_counters) \ +_(VNET_IP6_NBR_COUNTERS, vnet_ip6_nbr_counters) \ _(MAP_ADD_DOMAIN_REPLY, map_add_domain_reply) \ _(MAP_DEL_DOMAIN_REPLY, map_del_domain_reply) \ _(MAP_ADD_DEL_RULE_REPLY, map_add_del_rule_reply) \ @@ -4131,6 +4210,8 @@ dump_stats_table (vat_main_t * vam) u64 packets; ip4_fib_counter_t *c4; ip6_fib_counter_t *c6; + ip4_nbr_counter_t *n4; + ip6_nbr_counter_t *n6; int i, j; if (!vam->json_output) @@ -4226,6 +4307,49 @@ dump_stats_table (vat_main_t * vam) } } + /* ip4 nbr counters */ + msg_array = vat_json_object_add (&node, "ip4_nbr_counters"); + vat_json_init_array (msg_array); + for (i = 0; i < vec_len (vam->ip4_nbr_counters); i++) + { + msg = vat_json_array_add (msg_array); + vat_json_init_object (msg); + vat_json_object_add_uint (msg, "sw_if_index", i); + counter_array = vat_json_object_add (msg, "c"); + vat_json_init_array (counter_array); + for (j = 0; j < vec_len (vam->ip4_nbr_counters[i]); j++) + { + counter = vat_json_array_add (counter_array); + vat_json_init_object (counter); + n4 = &vam->ip4_nbr_counters[i][j]; + vat_json_object_add_ip4 (counter, "address", n4->address); + vat_json_object_add_uint (counter, "link-type", n4->linkt); + vat_json_object_add_uint (counter, "packets", n4->packets); + vat_json_object_add_uint (counter, "bytes", n4->bytes); + } + } + + /* ip6 nbr counters */ + msg_array = vat_json_object_add (&node, "ip6_nbr_counters"); + vat_json_init_array (msg_array); + for (i = 0; i < vec_len (vam->ip6_nbr_counters); i++) + { + msg = vat_json_array_add (msg_array); + vat_json_init_object (msg); + vat_json_object_add_uint (msg, "sw_if_index", i); + counter_array = vat_json_object_add (msg, "c"); + vat_json_init_array (counter_array); + for (j = 0; j < vec_len (vam->ip6_nbr_counters[i]); j++) + { + counter = vat_json_array_add (counter_array); + vat_json_init_object (counter); + n6 = &vam->ip6_nbr_counters[i][j]; + vat_json_object_add_ip6 (counter, "address", n6->address); + vat_json_object_add_uint (counter, "packets", n6->packets); + vat_json_object_add_uint (counter, "bytes", n6->bytes); + } + } + vat_json_print (vam->ofp, &node); vat_json_free (&node); diff --git a/src/vat/vat.h b/src/vat/vat.h index 3d7d96ae..831bdf50 100644 --- a/src/vat/vat.h +++ b/src/vat/vat.h @@ -95,6 +95,22 @@ typedef struct u64 bytes; } ip6_fib_counter_t; +typedef struct +{ + struct in_addr address; + vnet_link_t linkt; + u64 packets; + u64 bytes; +} ip4_nbr_counter_t; + +typedef struct +{ + struct in6_addr address; + vnet_link_t linkt; + u64 packets; + u64 bytes; +} ip6_nbr_counter_t; + typedef struct { /* vpe input queue */ @@ -185,6 +201,8 @@ typedef struct u32 *ip4_fib_counters_vrf_id_by_index; ip6_fib_counter_t **ip6_fib_counters; u32 *ip6_fib_counters_vrf_id_by_index; + ip4_nbr_counter_t **ip4_nbr_counters; + ip6_nbr_counter_t **ip6_nbr_counters; /* Convenience */ vlib_main_t *vlib_main; diff --git a/src/vlib/counter.h b/src/vlib/counter.h index a7903206..abfa89ee 100644 --- a/src/vlib/counter.h +++ b/src/vlib/counter.h @@ -273,6 +273,20 @@ vlib_increment_combined_counter (vlib_combined_counter_main_t * cm, } } +#define vlib_prefetch_combined_counter(_cm, _cpu_index, _index) \ +{ \ + vlib_mini_counter_t *_cpu_minis; \ + \ + /* \ + * This CPU's mini index is assumed to already be in cache \ + */ \ + _cpu_minis = (_cm)->minis[(_cpu_index)]; \ + CLIB_PREFETCH(_cpu_minis + (_index), \ + sizeof(*_cpu_minis), \ + STORE); \ +} + + /** Get the value of a combined counter, never called in the speed path Scrapes the entire set of mini counters. Innacurate unless worker threads which might increment the counter are diff --git a/src/vnet/adj/adj.c b/src/vnet/adj/adj.c index e740c4cb..d0be0f0e 100644 --- a/src/vnet/adj/adj.c +++ b/src/vnet/adj/adj.c @@ -122,6 +122,10 @@ format_ip_adjacency (u8 * s, va_list * args) if (fiaf & FORMAT_IP_ADJACENCY_DETAIL) { + vlib_counter_t counts; + + vlib_get_combined_counter(&adjacency_counters, adj_index, &counts); + s = format (s, "\n counts:[%Ld:%Ld]", counts.packets, counts.bytes); s = format (s, "\n locks:%d", adj->ia_node.fn_locks); s = format (s, " node:[%d]:%U", adj->rewrite_header.node_index, diff --git a/src/vnet/adj/adj_l2.c b/src/vnet/adj/adj_l2.c index 4d2dd708..5a083643 100644 --- a/src/vnet/adj/adj_l2.c +++ b/src/vnet/adj/adj_l2.c @@ -94,11 +94,11 @@ adj_l2_rewrite_inline (vlib_main_t * vm, rw_len0 = adj0[0].rewrite_header.data_bytes; vnet_buffer(p0)->ip.save_rewrite_length = rw_len0; - vlib_increment_combined_counter - (&adjacency_counters, - cpu_index, adj_index0, - /* packet increment */ 0, - /* byte increment */ rw_len0-sizeof(ethernet_header_t)); + vlib_increment_combined_counter(&adjacency_counters, + cpu_index, + adj_index0, + /* packet increment */ 0, + /* byte increment */ rw_len0); /* Check MTU of outgoing interface. */ if (PREDICT_TRUE((vlib_buffer_length_in_chain (vm, p0) <= diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index 6e91b9e9..87b345bd 100644 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -2402,19 +2402,12 @@ ip4_rewrite_inline (vlib_main_t * vm, error1); /* - * We've already accounted for an ethernet_header_t elsewhere + * pre-fetch the per-adjacency counters */ - if (PREDICT_FALSE (rw_len0 > sizeof (ethernet_header_t))) - vlib_increment_combined_counter - (&adjacency_counters, cpu_index, adj_index0, - /* packet increment */ 0, - /* byte increment */ rw_len0 - sizeof (ethernet_header_t)); - - if (PREDICT_FALSE (rw_len1 > sizeof (ethernet_header_t))) - vlib_increment_combined_counter - (&adjacency_counters, cpu_index, adj_index1, - /* packet increment */ 0, - /* byte increment */ rw_len1 - sizeof (ethernet_header_t)); + vlib_prefetch_combined_counter (&adjacency_counters, + cpu_index, adj_index0); + vlib_prefetch_combined_counter (&adjacency_counters, + cpu_index, adj_index1); /* Don't adjust the buffer for ttl issue; icmp-error node wants * to see the IP headerr */ @@ -2446,6 +2439,19 @@ ip4_rewrite_inline (vlib_main_t * vm, vnet_rewrite_two_headers (adj0[0], adj1[0], ip0, ip1, sizeof (ethernet_header_t)); + /* + * Bump the per-adjacency counters + */ + vlib_increment_combined_counter + (&adjacency_counters, + cpu_index, + adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); + + vlib_increment_combined_counter + (&adjacency_counters, + cpu_index, + adj_index1, 1, vlib_buffer_length_in_chain (vm, p1) + rw_len1); + if (is_midchain) { adj0->sub_type.midchain.fixup_func (vm, adj0, p0); @@ -2519,6 +2525,9 @@ ip4_rewrite_inline (vlib_main_t * vm, p0->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED; } + vlib_prefetch_combined_counter (&adjacency_counters, + cpu_index, adj_index0); + /* Guess we are only writing on simple Ethernet header. */ vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t)); @@ -2526,11 +2535,10 @@ ip4_rewrite_inline (vlib_main_t * vm, rw_len0 = adj0[0].rewrite_header.data_bytes; vnet_buffer (p0)->ip.save_rewrite_length = rw_len0; - if (PREDICT_FALSE (rw_len0 > sizeof (ethernet_header_t))) - vlib_increment_combined_counter - (&adjacency_counters, cpu_index, adj_index0, - /* packet increment */ 0, - /* byte increment */ rw_len0 - sizeof (ethernet_header_t)); + vlib_increment_combined_counter + (&adjacency_counters, + cpu_index, + adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); /* Check MTU of outgoing interface. */ error0 = (vlib_buffer_length_in_chain (vm, p0) diff --git a/src/vnet/ip/ip6_forward.c b/src/vnet/ip/ip6_forward.c index 197a9b79..232f7283 100644 --- a/src/vnet/ip/ip6_forward.c +++ b/src/vnet/ip/ip6_forward.c @@ -2108,14 +2108,14 @@ ip6_rewrite_inline (vlib_main_t * vm, vnet_buffer (p0)->ip.save_rewrite_length = rw_len0; vnet_buffer (p1)->ip.save_rewrite_length = rw_len1; - vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index0, - /* packet increment */ 0, - /* byte increment */ rw_len0); - vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index1, - /* packet increment */ 0, - /* byte increment */ rw_len1); + vlib_increment_combined_counter + (&adjacency_counters, + cpu_index, + adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); + vlib_increment_combined_counter + (&adjacency_counters, + cpu_index, adj_index1, + 1, vlib_buffer_length_in_chain (vm, p1) + rw_len1); /* Check MTU of outgoing interface. */ error0 = @@ -2233,10 +2233,10 @@ ip6_rewrite_inline (vlib_main_t * vm, rw_len0 = adj0[0].rewrite_header.data_bytes; vnet_buffer (p0)->ip.save_rewrite_length = rw_len0; - vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index0, - /* packet increment */ 0, - /* byte increment */ rw_len0); + vlib_increment_combined_counter + (&adjacency_counters, + cpu_index, + adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); /* Check MTU of outgoing interface. */ error0 = diff --git a/src/vnet/mpls/mpls_output.c b/src/vnet/mpls/mpls_output.c index 8292a0cb..c06cf917 100644 --- a/src/vnet/mpls/mpls_output.c +++ b/src/vnet/mpls/mpls_output.c @@ -128,18 +128,19 @@ mpls_output_inline (vlib_main_t * vm, rw_len0 = adj0[0].rewrite_header.data_bytes; rw_len1 = adj1[0].rewrite_header.data_bytes; - if (PREDICT_FALSE (rw_len0 > sizeof(ethernet_header_t))) - vlib_increment_combined_counter - (&adjacency_counters, - cpu_index, adj_index0, - /* packet increment */ 0, - /* byte increment */ rw_len0-sizeof(ethernet_header_t)); - if (PREDICT_FALSE (rw_len1 > sizeof(ethernet_header_t))) - vlib_increment_combined_counter - (&adjacency_counters, - cpu_index, adj_index1, - /* packet increment */ 0, - /* byte increment */ rw_len1-sizeof(ethernet_header_t)); + /* Bump the adj counters for packet and bytes */ + vlib_increment_combined_counter + (&adjacency_counters, + cpu_index, + adj_index0, + 1, + vlib_buffer_length_in_chain (vm, p0) + rw_len0); + vlib_increment_combined_counter + (&adjacency_counters, + cpu_index, + adj_index1, + 1, + vlib_buffer_length_in_chain (vm, p1) + rw_len1); /* Check MTU of outgoing interface. */ if (PREDICT_TRUE(vlib_buffer_length_in_chain (vm, p0) <= @@ -234,12 +235,12 @@ mpls_output_inline (vlib_main_t * vm, /* Update packet buffer attributes/set output interface. */ rw_len0 = adj0[0].rewrite_header.data_bytes; - if (PREDICT_FALSE (rw_len0 > sizeof(ethernet_header_t))) - vlib_increment_combined_counter - (&adjacency_counters, - cpu_index, adj_index0, - /* packet increment */ 0, - /* byte increment */ rw_len0-sizeof(ethernet_header_t)); + vlib_increment_combined_counter + (&adjacency_counters, + cpu_index, + adj_index0, + 1, + vlib_buffer_length_in_chain (vm, p0) + rw_len0); /* Check MTU of outgoing interface. */ if (PREDICT_TRUE(vlib_buffer_length_in_chain (vm, p0) <= diff --git a/src/vpp/api/vpe.api b/src/vpp/api/vpe.api index f32ba670..a00033c5 100644 --- a/src/vpp/api/vpe.api +++ b/src/vpp/api/vpe.api @@ -230,6 +230,31 @@ manual_print manual_endian define vnet_ip4_fib_counters vl_api_ip4_fib_counter_t c[count]; }; +typeonly manual_print manual_endian define ip4_nbr_counter +{ + u32 address; + u8 link_type; + u64 packets; + u64 bytes; +}; + +/** + * @brief Per-neighbour (i.e. per-adjacency) coutners + * @param count The size of the array of counters + * @param sw_if_index The interface the adjacency is on + * @param begin Flag to indicate this is the first set of stats for this + * interface. If this flag is not set the it is a continuation of + * stats for this interface + * @param c counters + */ +manual_print manual_endian define vnet_ip4_nbr_counters +{ + u32 count; + u32 sw_if_index; + u8 begin; + vl_api_ip4_nbr_counter_t c[count]; +}; + typeonly manual_print manual_endian define ip6_fib_counter { u64 address[2]; @@ -245,6 +270,22 @@ manual_print manual_endian define vnet_ip6_fib_counters vl_api_ip6_fib_counter_t c[count]; }; +typeonly manual_print manual_endian define ip6_nbr_counter +{ + u64 address[2]; + u8 link_type; + u64 packets; + u64 bytes; +}; + +manual_print manual_endian define vnet_ip6_nbr_counters +{ + u32 count; + u32 sw_if_index; + u8 begin; + vl_api_ip6_nbr_counter_t c[count]; +}; + /** \brief Request for a single block of summary stats @param client_index - opaque cookie to identify the sender @param context - sender context, to match reply w/ request diff --git a/src/vpp/stats/stats.c b/src/vpp/stats/stats.c index 391e02f6..5e9b0d69 100644 --- a/src/vpp/stats/stats.c +++ b/src/vpp/stats/stats.c @@ -49,7 +49,9 @@ _(WANT_STATS, want_stats) \ _(WANT_STATS_REPLY, want_stats_reply) \ _(VNET_INTERFACE_COUNTERS, vnet_interface_counters) \ _(VNET_IP4_FIB_COUNTERS, vnet_ip4_fib_counters) \ -_(VNET_IP6_FIB_COUNTERS, vnet_ip6_fib_counters) +_(VNET_IP6_FIB_COUNTERS, vnet_ip6_fib_counters) \ +_(VNET_IP4_NBR_COUNTERS, vnet_ip4_nbr_counters) \ +_(VNET_IP6_NBR_COUNTERS, vnet_ip6_nbr_counters) /* These constants ensure msg sizes <= 1024, aka ring allocation */ #define SIMPLE_COUNTER_BATCH_SIZE 126 @@ -258,6 +260,313 @@ ip46_fib_stats_delay (stats_main_t * sm, u32 sec, u32 nsec) } } +/** + * @brief The context passed when collecting adjacency counters + */ +typedef struct ip4_nbr_stats_ctx_t_ +{ + /** + * The SW IF index all these adjs belong to + */ + u32 sw_if_index; + + /** + * A vector of ip4 nbr counters + */ + vl_api_ip4_nbr_counter_t *counters; +} ip4_nbr_stats_ctx_t; + +static adj_walk_rc_t +ip4_nbr_stats_cb (adj_index_t ai, void *arg) +{ + vl_api_ip4_nbr_counter_t *vl_counter; + vlib_counter_t adj_counter; + ip4_nbr_stats_ctx_t *ctx; + ip_adjacency_t *adj; + + ctx = arg; + vlib_get_combined_counter (&adjacency_counters, ai, &adj_counter); + + if (0 != adj_counter.packets) + { + vec_add2 (ctx->counters, vl_counter, 1); + adj = adj_get (ai); + + vl_counter->packets = clib_host_to_net_u64 (adj_counter.packets); + vl_counter->bytes = clib_host_to_net_u64 (adj_counter.bytes); + vl_counter->address = adj->sub_type.nbr.next_hop.ip4.as_u32; + vl_counter->link_type = adj->ia_link; + } + return (ADJ_WALK_RC_CONTINUE); +} + +#define MIN(x,y) (((x)<(y))?(x):(y)) + +static void +ip4_nbr_ship (stats_main_t * sm, ip4_nbr_stats_ctx_t * ctx) +{ + api_main_t *am = sm->api_main; + vl_shmem_hdr_t *shmem_hdr = am->shmem_hdr; + unix_shared_memory_queue_t *q = shmem_hdr->vl_input_queue; + vl_api_vnet_ip4_nbr_counters_t *mp = 0; + int first = 0; + + /* + * If the walk context has counters, which may be left over from the last + * suspend, then we continue from there. + */ + while (0 != vec_len (ctx->counters)) + { + u32 n_items = MIN (vec_len (ctx->counters), + IP4_FIB_COUNTER_BATCH_SIZE); + u8 pause = 0; + + dslock (sm, 0 /* release hint */ , 1 /* tag */ ); + + mp = vl_msg_api_alloc_as_if_client (sizeof (*mp) + + (n_items * + sizeof + (vl_api_ip4_nbr_counter_t))); + mp->_vl_msg_id = ntohs (VL_API_VNET_IP4_NBR_COUNTERS); + mp->count = ntohl (n_items); + mp->sw_if_index = ntohl (ctx->sw_if_index); + mp->begin = first; + first = 0; + + /* + * copy the counters from the back of the context, then we can easily + * 'erase' them by resetting the vector length. + * The order we push the stats to the caller is not important. + */ + clib_memcpy (mp->c, + &ctx->counters[vec_len (ctx->counters) - n_items], + n_items * sizeof (*ctx->counters)); + + _vec_len (ctx->counters) = vec_len (ctx->counters) - n_items; + + /* + * send to the shm q + */ + unix_shared_memory_queue_lock (q); + pause = unix_shared_memory_queue_is_full (q); + + vl_msg_api_send_shmem_nolock (q, (u8 *) & mp); + unix_shared_memory_queue_unlock (q); + dsunlock (sm); + + if (pause) + ip46_fib_stats_delay (sm, 0 /* sec */ , + STATS_RELEASE_DELAY_NS); + } +} + +static void +do_ip4_nbrs (stats_main_t * sm) +{ + vnet_main_t *vnm = vnet_get_main (); + vnet_interface_main_t *im = &vnm->interface_main; + vnet_sw_interface_t *si; + + ip4_nbr_stats_ctx_t ctx = { + .sw_if_index = 0, + .counters = NULL, + }; + + /* *INDENT-OFF* */ + pool_foreach (si, im->sw_interfaces, + ({ + /* + * update the interface we are now concerned with + */ + ctx.sw_if_index = si->sw_if_index; + + /* + * we are about to walk another interface, so we shouldn't have any pending + * stats to export. + */ + ASSERT(ctx.counters == NULL); + + /* + * visit each neighbour adjacency on the interface and collect + * its current stats. + * Because we hold the lock the walk is synchronous, so safe to routing + * updates. It's limited in work by the number of adjacenies on an + * interface, which is typically not huge. + */ + dslock (sm, 0 /* release hint */ , 1 /* tag */ ); + adj_nbr_walk (si->sw_if_index, + FIB_PROTOCOL_IP4, + ip4_nbr_stats_cb, + &ctx); + dsunlock (sm); + + /* + * if this interface has some adjacencies with counters then ship them, + * else continue to the next interface. + */ + if (NULL != ctx.counters) + { + ip4_nbr_ship(sm, &ctx); + } + })); + /* *INDENT-OFF* */ +} + +/** + * @brief The context passed when collecting adjacency counters + */ +typedef struct ip6_nbr_stats_ctx_t_ +{ + /** + * The SW IF index all these adjs belong to + */ + u32 sw_if_index; + + /** + * A vector of ip6 nbr counters + */ + vl_api_ip6_nbr_counter_t *counters; +} ip6_nbr_stats_ctx_t; + +static adj_walk_rc_t +ip6_nbr_stats_cb (adj_index_t ai, + void *arg) +{ + vl_api_ip6_nbr_counter_t *vl_counter; + vlib_counter_t adj_counter; + ip6_nbr_stats_ctx_t *ctx; + ip_adjacency_t *adj; + + ctx = arg; + vlib_get_combined_counter(&adjacency_counters, ai, &adj_counter); + + if (0 != adj_counter.packets) + { + vec_add2(ctx->counters, vl_counter, 1); + adj = adj_get(ai); + + vl_counter->packets = clib_host_to_net_u64(adj_counter.packets); + vl_counter->bytes = clib_host_to_net_u64(adj_counter.bytes); + vl_counter->address[0] = adj->sub_type.nbr.next_hop.ip6.as_u64[0]; + vl_counter->address[1] = adj->sub_type.nbr.next_hop.ip6.as_u64[1]; + vl_counter->link_type = adj->ia_link; + } + return (ADJ_WALK_RC_CONTINUE); +} + +#define MIN(x,y) (((x)<(y))?(x):(y)) + +static void +ip6_nbr_ship (stats_main_t * sm, + ip6_nbr_stats_ctx_t *ctx) +{ + api_main_t *am = sm->api_main; + vl_shmem_hdr_t *shmem_hdr = am->shmem_hdr; + unix_shared_memory_queue_t *q = shmem_hdr->vl_input_queue; + vl_api_vnet_ip6_nbr_counters_t *mp = 0; + int first = 0; + + /* + * If the walk context has counters, which may be left over from the last + * suspend, then we continue from there. + */ + while (0 != vec_len(ctx->counters)) + { + u32 n_items = MIN (vec_len (ctx->counters), + IP6_FIB_COUNTER_BATCH_SIZE); + u8 pause = 0; + + dslock (sm, 0 /* release hint */ , 1 /* tag */ ); + + mp = vl_msg_api_alloc_as_if_client (sizeof (*mp) + + (n_items * + sizeof + (vl_api_ip6_nbr_counter_t))); + mp->_vl_msg_id = ntohs (VL_API_VNET_IP6_NBR_COUNTERS); + mp->count = ntohl (n_items); + mp->sw_if_index = ntohl (ctx->sw_if_index); + mp->begin = first; + first = 0; + + /* + * copy the counters from the back of the context, then we can easily + * 'erase' them by resetting the vector length. + * The order we push the stats to the caller is not important. + */ + clib_memcpy (mp->c, + &ctx->counters[vec_len (ctx->counters) - n_items], + n_items * sizeof (*ctx->counters)); + + _vec_len (ctx->counters) = vec_len (ctx->counters) - n_items; + + /* + * send to the shm q + */ + unix_shared_memory_queue_lock (q); + pause = unix_shared_memory_queue_is_full (q); + + vl_msg_api_send_shmem_nolock (q, (u8 *) & mp); + unix_shared_memory_queue_unlock (q); + dsunlock (sm); + + if (pause) + ip46_fib_stats_delay (sm, 0 /* sec */ , + STATS_RELEASE_DELAY_NS); + } +} + +static void +do_ip6_nbrs (stats_main_t * sm) +{ + vnet_main_t *vnm = vnet_get_main (); + vnet_interface_main_t *im = &vnm->interface_main; + vnet_sw_interface_t *si; + + ip6_nbr_stats_ctx_t ctx = { + .sw_if_index = 0, + .counters = NULL, + }; + + /* *INDENT-OFF* */ + pool_foreach (si, im->sw_interfaces, + ({ + /* + * update the interface we are now concerned with + */ + ctx.sw_if_index = si->sw_if_index; + + /* + * we are about to walk another interface, so we shouldn't have any pending + * stats to export. + */ + ASSERT(ctx.counters == NULL); + + /* + * visit each neighbour adjacency on the interface and collect + * its current stats. + * Because we hold the lock the walk is synchronous, so safe to routing + * updates. It's limited in work by the number of adjacenies on an + * interface, which is typically not huge. + */ + dslock (sm, 0 /* release hint */ , 1 /* tag */ ); + adj_nbr_walk (si->sw_if_index, + FIB_PROTOCOL_IP6, + ip6_nbr_stats_cb, + &ctx); + dsunlock (sm); + + /* + * if this interface has some adjacencies with counters then ship them, + * else continue to the next interface. + */ + if (NULL != ctx.counters) + { + ip6_nbr_ship(sm, &ctx); + } + })); + /* *INDENT-OFF* */ +} + static void do_ip4_fibs (stats_main_t * sm) { @@ -318,13 +627,7 @@ again: hash_foreach_pair (p, hash, ({ x.address.data_u32 = p->key; - if (lm->fib_result_n_words > 1) - { - x.index = vec_len (results); - vec_add (results, p->value, lm->fib_result_n_words); - } - else - x.index = p->value[0]; + x.index = p->value[0]; vec_add1 (routes, x); if (sm->data_structure_lock->release_hint) @@ -631,6 +934,8 @@ stats_thread_fn (void *arg) do_combined_interface_counters (sm); do_ip4_fibs (sm); do_ip6_fibs (sm); + do_ip4_nbrs (sm); + do_ip6_nbrs (sm); } } @@ -804,6 +1109,45 @@ vl_api_vnet_ip4_fib_counters_t_handler (vl_api_vnet_ip4_fib_counters_t * mp) } } +static void +vl_api_vnet_ip4_nbr_counters_t_handler (vl_api_vnet_ip4_nbr_counters_t * mp) +{ + vpe_client_registration_t *reg; + stats_main_t *sm = &stats_main; + unix_shared_memory_queue_t *q, *q_prev = NULL; + vl_api_vnet_ip4_nbr_counters_t *mp_copy = NULL; + u32 mp_size; + + mp_size = sizeof (*mp_copy) + + ntohl (mp->count) * sizeof (vl_api_ip4_nbr_counter_t); + + /* *INDENT-OFF* */ + pool_foreach(reg, sm->stats_registrations, + ({ + q = vl_api_client_index_to_input_queue (reg->client_index); + if (q) + { + if (q_prev && (q_prev->cursize < q_prev->maxsize)) + { + mp_copy = vl_msg_api_alloc_as_if_client(mp_size); + clib_memcpy(mp_copy, mp, mp_size); + vl_msg_api_send_shmem (q_prev, (u8 *)&mp); + mp = mp_copy; + } + q_prev = q; + } + })); + /* *INDENT-ON* */ + if (q_prev && (q_prev->cursize < q_prev->maxsize)) + { + vl_msg_api_send_shmem (q_prev, (u8 *) & mp); + } + else + { + vl_msg_api_free (mp); + } +} + static void vl_api_vnet_ip6_fib_counters_t_handler (vl_api_vnet_ip6_fib_counters_t * mp) { @@ -843,6 +1187,45 @@ vl_api_vnet_ip6_fib_counters_t_handler (vl_api_vnet_ip6_fib_counters_t * mp) } } +static void +vl_api_vnet_ip6_nbr_counters_t_handler (vl_api_vnet_ip6_nbr_counters_t * mp) +{ + vpe_client_registration_t *reg; + stats_main_t *sm = &stats_main; + unix_shared_memory_queue_t *q, *q_prev = NULL; + vl_api_vnet_ip6_nbr_counters_t *mp_copy = NULL; + u32 mp_size; + + mp_size = sizeof (*mp_copy) + + ntohl (mp->count) * sizeof (vl_api_ip6_nbr_counter_t); + + /* *INDENT-OFF* */ + pool_foreach(reg, sm->stats_registrations, + ({ + q = vl_api_client_index_to_input_queue (reg->client_index); + if (q) + { + if (q_prev && (q_prev->cursize < q_prev->maxsize)) + { + mp_copy = vl_msg_api_alloc_as_if_client(mp_size); + clib_memcpy(mp_copy, mp, mp_size); + vl_msg_api_send_shmem (q_prev, (u8 *)&mp); + mp = mp_copy; + } + q_prev = q; + } + })); + /* *INDENT-ON* */ + if (q_prev && (q_prev->cursize < q_prev->maxsize)) + { + vl_msg_api_send_shmem (q_prev, (u8 *) & mp); + } + else + { + vl_msg_api_free (mp); + } +} + static void vl_api_want_stats_reply_t_handler (vl_api_want_stats_reply_t * mp) { @@ -929,6 +1312,10 @@ stats_memclnt_delete_callback (u32 client_index) #define vl_api_vnet_ip4_fib_counters_t_print vl_noop_handler #define vl_api_vnet_ip6_fib_counters_t_endian vl_noop_handler #define vl_api_vnet_ip6_fib_counters_t_print vl_noop_handler +#define vl_api_vnet_ip4_nbr_counters_t_endian vl_noop_handler +#define vl_api_vnet_ip4_nbr_counters_t_print vl_noop_handler +#define vl_api_vnet_ip6_nbr_counters_t_endian vl_noop_handler +#define vl_api_vnet_ip6_nbr_counters_t_print vl_noop_handler static clib_error_t * stats_init (vlib_main_t * vm) @@ -961,6 +1348,8 @@ stats_init (vlib_main_t * vm) am->message_bounce[VL_API_VNET_INTERFACE_COUNTERS] = 1; am->message_bounce[VL_API_VNET_IP4_FIB_COUNTERS] = 1; am->message_bounce[VL_API_VNET_IP6_FIB_COUNTERS] = 1; + am->message_bounce[VL_API_VNET_IP4_NBR_COUNTERS] = 1; + am->message_bounce[VL_API_VNET_IP6_NBR_COUNTERS] = 1; return 0; } -- cgit 1.2.3-korg From 3b46cba8f4e909bc363403c6c92215159abb2f11 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 23 Jan 2017 21:13:45 +0100 Subject: Plugin infrastructure improvements This patch replaces requirement for vlib_plugin_register function in the plugin so file and introduces new macro: VLIB_PLUGIN_REGISTER () = { .version = "version string", .version_required = "requred version", .default_disabled = 1, .early_init = "early_init_function_name", }; Plugin will nor be loaded if .default_disabled is set to 1 unless explicitely enabled in startup.conf. If .verstion_required is set, plugin will not be loaded if there is version mismatch between plugin and vpp. This can be bypassed by setting "skip-version-check" for specific plugin. If .early-init string is present, plugin loader will try to resolve this specific symbol in the plugin namespace and make a function call. Following startup.conf configuration is added: plugins { path /path/to/plugin/directory plugin ila_plugin.so { enable skip-version-check } plugin acl_plugin.so { disable } } Change-Id: I706c691dd34d94ffe9e02b59831af8859a95f061 Signed-off-by: Damjan Marion --- src/plugins/acl/acl.c | 29 +-- src/plugins/acl/acl.h | 2 - src/plugins/acl/l2sess.c | 13 -- src/plugins/acl/l2sess.h | 1 - src/plugins/flowperpkt/flowperpkt.c | 32 +-- src/plugins/ila/ila.c | 14 +- src/plugins/ioam/encap/ip6_ioam_trace.c | 18 +- src/plugins/lb/lb.c | 15 +- src/plugins/sixrd/sixrd.c | 23 +-- src/plugins/snat/snat.c | 27 +-- src/plugins/snat/snat.h | 1 - src/vlib/unix/main.c | 8 + src/vlib/unix/plugin.c | 339 ++++++++++++++++++++++++++------ src/vlib/unix/plugin.h | 31 ++- src/vnet/plugin/plugin.h | 9 +- src/vpp/vnet/main.c | 15 +- 16 files changed, 367 insertions(+), 210 deletions(-) (limited to 'src/vlib') diff --git a/src/plugins/acl/acl.c b/src/plugins/acl/acl.c index f4db2013..85c9113b 100644 --- a/src/plugins/acl/acl.c +++ b/src/plugins/acl/acl.c @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -128,29 +129,11 @@ _(MACIP_ACL_INTERFACE_ADD_DEL, macip_acl_interface_add_del) \ _(MACIP_ACL_DUMP, macip_acl_dump) \ _(MACIP_ACL_INTERFACE_GET, macip_acl_interface_get) -/* - * This routine exists to convince the vlib plugin framework that - * we haven't accidentally copied a random .dll into the plugin directory. - * - * Also collects global variable pointers passed from the vpp engine - */ - -clib_error_t * -vlib_plugin_register (vlib_main_t * vm, vnet_plugin_handoff_t * h, - int from_early_init) -{ - acl_main_t *am = &acl_main; - clib_error_t *error = 0; - - am->vlib_main = vm; - am->vnet_main = h->vnet_main; - am->ethernet_main = h->ethernet_main; - - l2sess_vlib_plugin_register(vm, h, from_early_init); - - return error; -} - +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, +}; +/* *INDENT-ON* */ static void vl_api_acl_plugin_get_version_t_handler (vl_api_acl_plugin_get_version_t * mp) diff --git a/src/plugins/acl/acl.h b/src/plugins/acl/acl.h index afc9b289..62046788 100644 --- a/src/plugins/acl/acl.h +++ b/src/plugins/acl/acl.h @@ -17,7 +17,6 @@ #include #include -#include #include @@ -139,7 +138,6 @@ typedef struct { /* convenience */ vlib_main_t * vlib_main; vnet_main_t * vnet_main; - ethernet_main_t * ethernet_main; } acl_main_t; extern acl_main_t acl_main; diff --git a/src/plugins/acl/l2sess.c b/src/plugins/acl/l2sess.c index cc9bde44..b0385be1 100644 --- a/src/plugins/acl/l2sess.c +++ b/src/plugins/acl/l2sess.c @@ -30,19 +30,6 @@ #include #include -void -l2sess_vlib_plugin_register (vlib_main_t * vm, void* hh, - int from_early_init) -{ - l2sess_main_t *sm = &l2sess_main; - vnet_plugin_handoff_t * h = hh; - memset (sm, 0, sizeof (*sm)); - - sm->vlib_main = vm; - sm->vnet_main = h->vnet_main; - sm->ethernet_main = h->ethernet_main; -} - void l2sess_init_next_features_input (vlib_main_t * vm, l2sess_main_t * sm) { diff --git a/src/plugins/acl/l2sess.h b/src/plugins/acl/l2sess.h index db899917..888b5301 100644 --- a/src/plugins/acl/l2sess.h +++ b/src/plugins/acl/l2sess.h @@ -132,7 +132,6 @@ foreach_l2sess_node /* convenience */ vlib_main_t * vlib_main; vnet_main_t * vnet_main; - ethernet_main_t * ethernet_main; /* Counter(s) */ u64 counter_attempted_delete_free_session; diff --git a/src/plugins/flowperpkt/flowperpkt.c b/src/plugins/flowperpkt/flowperpkt.c index cc351599..6b292eef 100644 --- a/src/plugins/flowperpkt/flowperpkt.c +++ b/src/plugins/flowperpkt/flowperpkt.c @@ -24,6 +24,7 @@ */ #include +#include #include #include @@ -479,30 +480,11 @@ static void *vl_api_flowperpkt_tx_interface_add_del_t_print #define foreach_flowperpkt_plugin_api_msg \ _(FLOWPERPKT_TX_INTERFACE_ADD_DEL, flowperpkt_tx_interface_add_del) -/** - * @brief plugin-api required function - * @param vm vlib_main_t * vlib main data structure pointer - * @param h vlib_plugin_handoff_t * handoff structure - * @param from_early_init int notused - * - * Notes: - * This routine exists to convince the vlib plugin framework that - * we haven't accidentally copied a random .dll into the plugin directory. - * - * Also collects global variable pointers passed from the vpp engine - */ -clib_error_t * -vlib_plugin_register (vlib_main_t * vm, vnet_plugin_handoff_t * h, - int from_early_init) -{ - flowperpkt_main_t *fm = &flowperpkt_main; - clib_error_t *error = 0; - - fm->vlib_main = vm; - fm->vnet_main = h->vnet_main; - - return error; -} +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, +}; +/* *INDENT-ON* */ static clib_error_t * flowperpkt_tx_interface_add_del_feature_command_fn (vlib_main_t * vm, @@ -627,6 +609,8 @@ flowperpkt_init (vlib_main_t * vm) u32 num_threads; u8 *name; + fm->vnet_main = vnet_get_main (); + /* Construct the API name */ name = format (0, "flowperpkt_%08x%c", api_version, 0); diff --git a/src/plugins/ila/ila.c b/src/plugins/ila/ila.c index 336f4cf5..e0f3907f 100644 --- a/src/plugins/ila/ila.c +++ b/src/plugins/ila/ila.c @@ -18,6 +18,7 @@ #include #include #include +#include static ila_main_t ila_main; @@ -821,14 +822,11 @@ ila_interface (u32 sw_if_index, u8 disable) return 0; } -clib_error_t * -vlib_plugin_register (vlib_main_t * vm, vnet_plugin_handoff_t * h, - int from_early_init) -{ - clib_error_t *error = 0; - - return error; -} +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, +}; +/* *INDENT-ON* */ u8 *format_ila_dpo (u8 * s, va_list * va) { diff --git a/src/plugins/ioam/encap/ip6_ioam_trace.c b/src/plugins/ioam/encap/ip6_ioam_trace.c index 3a6758cd..a836dfe8 100644 --- a/src/plugins/ioam/encap/ip6_ioam_trace.c +++ b/src/plugins/ioam/encap/ip6_ioam_trace.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -349,18 +350,11 @@ VLIB_CLI_COMMAND (ip6_show_ioam_trace_cmd, static) = { }; /* *INDENT-ON* */ -/* - * This routine exists to convince the vlib plugin framework that - * we haven't accidentally copied a random .dll into the plugin directory. - * - * Also collects global variable pointers passed from the vpp engine - */ -clib_error_t * -vlib_plugin_register (vlib_main_t * vm, vnet_plugin_handoff_t * h, - int from_early_init) -{ - return 0; -} +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, +}; +/* *INDENT-ON* */ static clib_error_t * ip6_hop_by_hop_ioam_trace_init (vlib_main_t * vm) diff --git a/src/plugins/lb/lb.c b/src/plugins/lb/lb.c index 1d9b9870..dc3f5be1 100644 --- a/src/plugins/lb/lb.c +++ b/src/plugins/lb/lb.c @@ -15,6 +15,7 @@ #include #include +#include #include //GC runs at most once every so many seconds @@ -730,15 +731,11 @@ int lb_vip_del(u32 vip_index) return 0; } -clib_error_t * -vlib_plugin_register (vlib_main_t * vm, - vnet_plugin_handoff_t * h, - int from_early_init) -{ - clib_error_t *error = 0; - return error; -} - +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, +}; +/* *INDENT-ON* */ u8 *format_lb_dpo (u8 * s, va_list * va) { diff --git a/src/plugins/sixrd/sixrd.c b/src/plugins/sixrd/sixrd.c index 66e631a2..71fc181f 100644 --- a/src/plugins/sixrd/sixrd.c +++ b/src/plugins/sixrd/sixrd.c @@ -19,6 +19,7 @@ #include #include #include +#include /* * This code supports the following sixrd modes: @@ -340,27 +341,19 @@ VLIB_CLI_COMMAND(show_sixrd_stats_command, static) = { .function = show_sixrd_stats_command_fn, }; -/* - * This routine exists to convince the vlib plugin framework that - * we haven't accidentally copied a random .dll into the plugin directory. - * - * Also collects global variable pointers passed from the vpp engine - */ -clib_error_t * -vlib_plugin_register (vlib_main_t * vm, vnet_plugin_handoff_t * h, - int from_early_init) +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, +}; +/* *INDENT-ON* */ + +static clib_error_t * sixrd_init (vlib_main_t * vm) { - clib_error_t * error = 0; sixrd_main_t *mm = &sixrd_main; mm->vnet_main = vnet_get_main(); mm->vlib_main = vm; - return error; -} - -static clib_error_t * sixrd_init (vlib_main_t * vm) -{ sixrd_dpo_module_init (); return (NULL); diff --git a/src/plugins/snat/snat.c b/src/plugins/snat/snat.c index 0fcd6ce8..750cc925 100644 --- a/src/plugins/snat/snat.c +++ b/src/plugins/snat/snat.c @@ -28,6 +28,7 @@ #include #include #include +#include snat_main_t snat_main; @@ -129,27 +130,11 @@ VNET_FEATURE_INIT (ip4_snat_out2in_fast, static) = { .runs_before = VNET_FEATURES ("ip4-lookup"), }; - -/* - * This routine exists to convince the vlib plugin framework that - * we haven't accidentally copied a random .dll into the plugin directory. - * - * Also collects global variable pointers passed from the vpp engine - */ - -clib_error_t * -vlib_plugin_register (vlib_main_t * vm, vnet_plugin_handoff_t * h, - int from_early_init) -{ - snat_main_t * sm = &snat_main; - clib_error_t * error = 0; - - sm->vlib_main = vm; - sm->vnet_main = h->vnet_main; - sm->ethernet_main = h->ethernet_main; - - return error; -} +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, +}; +/* *INDENT-ON* */ /*$$$$$ move to an installed header file */ #if (1 || CLIB_DEBUG > 0) /* "trust, but verify" */ diff --git a/src/plugins/snat/snat.h b/src/plugins/snat/snat.h index 32dc9f9e..39cbd3f8 100644 --- a/src/plugins/snat/snat.h +++ b/src/plugins/snat/snat.h @@ -233,7 +233,6 @@ typedef struct { vnet_main_t * vnet_main; ip4_main_t * ip4_main; ip_lookup_main_t * ip4_lookup_main; - ethernet_main_t * ethernet_main; api_main_t * api_main; } snat_main_t; diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index 562778e0..a04d9f9c 100644 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -503,6 +503,14 @@ vlib_unix_main (int argc, char *argv[]) vm->heap_base = clib_mem_get_heap (); ASSERT (vm->heap_base); + unformat_init_command_line (&input, (char **) vm->argv); + if ((e = vlib_plugin_config (vm, &input))) + { + clib_error_report (e); + return 1; + } + unformat_free (&input); + i = vlib_plugin_early_init (vm); if (i) return i; diff --git a/src/vlib/unix/plugin.c b/src/vlib/unix/plugin.c index 40399090..4aba95c8 100644 --- a/src/vlib/unix/plugin.c +++ b/src/vlib/unix/plugin.c @@ -16,29 +16,16 @@ */ #include +#include #include #include plugin_main_t vlib_plugin_main; -void -vlib_set_get_handoff_structure_cb (void *cb) -{ - plugin_main_t *pm = &vlib_plugin_main; - pm->handoff_structure_get_cb = cb; -} - -static void * -vnet_get_handoff_structure (void) -{ - void *(*fp) (void); - - fp = vlib_plugin_main.handoff_structure_get_cb; - if (fp == 0) - return 0; - else - return (*fp) (); -} +char *vlib_plugin_path __attribute__ ((weak)); +char *vlib_plugin_path = ""; +char *vlib_plugin_app_version __attribute__ ((weak)); +char *vlib_plugin_app_version = ""; void * vlib_get_plugin_symbol (char *plugin_name, char *symbol_name) @@ -54,13 +41,99 @@ vlib_get_plugin_symbol (char *plugin_name, char *symbol_name) return dlsym (pi->handle, symbol_name); } +static char * +str_array_to_vec (char *array, int len) +{ + char c, *r = 0; + int n = 0; + + do + { + c = array[n]; + vec_add1 (r, c); + } + while (c && ++n < len); + + if (c) + vec_add1 (r, 0); + + return r; +} + static int load_one_plugin (plugin_main_t * pm, plugin_info_t * pi, int from_early_init) { - void *handle, *register_handle; - clib_error_t *(*fp) (vlib_main_t *, void *, int); + void *handle; clib_error_t *error; - void *handoff_structure; + elf_main_t em = { 0 }; + elf_section_t *section; + u8 *data; + char *version_required; + vlib_plugin_registration_t *reg; + plugin_config_t *pc = 0; + uword *p; + + if (elf_read_file (&em, (char *) pi->filename)) + return -1; + + error = elf_get_section_by_name (&em, ".vlib_plugin_registration", + §ion); + if (error) + { + clib_warning ("Not a plugin: %s\n", (char *) pi->name); + return -1; + } + + data = elf_get_section_contents (&em, section->index, 1); + reg = (vlib_plugin_registration_t *) data; + + if (vec_len (data) != sizeof (*reg)) + { + clib_warning ("vlib_plugin_registration size mismatch in plugin %s\n", + (char *) pi->name); + goto error; + } + + p = hash_get_mem (pm->config_index_by_name, pi->name); + if (p) + { + pc = vec_elt_at_index (pm->configs, p[0]); + if (pc->is_disabled) + { + clib_warning ("Plugin disabled: %s", pi->name); + goto error; + } + if (reg->default_disabled && pc->is_enabled == 0) + { + clib_warning ("Plugin disabled: %s (default)", pi->name); + goto error; + } + } + else if (reg->default_disabled) + { + clib_warning ("Plugin disabled: %s (default)", pi->name); + goto error; + } + + version_required = str_array_to_vec ((char *) ®->version_required, + sizeof (reg->version_required)); + + if ((strlen (version_required) > 0) && + (strncmp (vlib_plugin_app_version, version_required, + strlen (version_required)))) + { + clib_warning ("Plugin %s version mismatch: %s != %s", + pi->name, vlib_plugin_app_version, reg->version_required); + if (!(pc && pc->skip_version_check == 1)) + { + vec_free (version_required); + goto error; + } + } + + vec_free (version_required); + vec_free (data); + elf_main_free (&em); handle = dlopen ((char *) pi->filename, RTLD_LAZY); @@ -77,35 +150,47 @@ load_one_plugin (plugin_main_t * pm, plugin_info_t * pi, int from_early_init) pi->handle = handle; + reg = dlsym (pi->handle, "vlib_plugin_registration"); - register_handle = dlsym (pi->handle, "vlib_plugin_register"); - if (register_handle == 0) + if (reg == 0) { - dlclose (handle); - clib_warning ("Plugin missing vlib_plugin_register: %s\n", - (char *) pi->name); - return 1; + /* This should never happen unless somebody chagnes registration macro */ + clib_warning ("Missing plugin registration in plugin '%s'", pi->name); + os_exit (1); } - fp = register_handle; - - handoff_structure = vnet_get_handoff_structure (); + pi->reg = reg; + pi->version = str_array_to_vec ((char *) ®->version, + sizeof (reg->version)); - if (handoff_structure == 0) - error = clib_error_return (0, "handoff structure callback returned 0"); - else - error = (*fp) (pm->vlib_main, handoff_structure, from_early_init); - - if (error) + if (reg && reg->early_init) { - clib_error_report (error); - dlclose (handle); - return 1; + clib_error_t *(*ei) (vlib_main_t *); + void *h; + + h = dlsym (pi->handle, reg->early_init); + if (h) + { + ei = h; + error = (*ei) (pm->vlib_main); + if (error) + { + clib_error_report (error); + os_exit (1); + } + } + else + clib_warning ("Plugin %s: early init function %s set but not found", + (char *) pi->name, reg->early_init); } clib_warning ("Loaded plugin: %s", pi->name); return 0; +error: + vec_free (data); + elf_main_free (&em); + return -1; } static u8 ** @@ -215,23 +300,16 @@ vlib_load_new_plugins (plugin_main_t * pm, int from_early_init) return 0; } -char *vlib_plugin_path __attribute__ ((weak)); -char *vlib_plugin_path = ""; -char *vlib_plugin_name_filter __attribute__ ((weak)); -char *vlib_plugin_name_filter = 0; - int vlib_plugin_early_init (vlib_main_t * vm) { plugin_main_t *pm = &vlib_plugin_main; - pm->plugin_path = format (0, "%s%c", vlib_plugin_path, 0); + if (pm->plugin_path == 0) + pm->plugin_path = format (0, "%s%c", vlib_plugin_path, 0); clib_warning ("plugin path %s", pm->plugin_path); - if (vlib_plugin_name_filter) - pm->plugin_name_filter = format (0, "%s%c", vlib_plugin_name_filter, 0); - pm->plugin_by_name_hash = hash_create_string (0, sizeof (uword)); pm->vlib_main = vm; @@ -245,19 +323,24 @@ vlib_plugins_show_cmd_fn (vlib_main_t * vm, plugin_main_t *pm = &vlib_plugin_main; u8 *s = 0; u8 *key = 0; - uword *value = 0; + uword value = 0; int index = 1; + plugin_info_t *pi; - s = format (s, " Plugin path is: %s\n", pm->plugin_path); - if (vlib_plugin_name_filter) - s = format (s, " Plugin filter: %s\n", vlib_plugin_name_filter); + s = format (s, " Plugin path is: %s\n\n", pm->plugin_path); + s = format (s, " %-41s%s\n", "Plugin", "Version"); - s = format (s, " Plugins loaded: \n"); + /* *INDENT-OFF* */ hash_foreach_mem (key, value, pm->plugin_by_name_hash, - { - if (key != 0) - s = format (s, " %d.%s\n", index, key); index++;} - ); + { + if (key != 0) + { + pi = vec_elt_at_index (pm->plugin_info, value); + s = format (s, "%3d. %-40s %s\n", index, key, pi->version); + index++; + } + }); + /* *INDENT-ON* */ vlib_cli_output (vm, "%v", s); vec_free (s); @@ -273,6 +356,148 @@ VLIB_CLI_COMMAND (plugins_show_cmd, static) = }; /* *INDENT-ON* */ +static clib_error_t * +config_one_plugin (vlib_main_t * vm, char *name, unformat_input_t * input) +{ + plugin_main_t *pm = &vlib_plugin_main; + plugin_config_t *pc; + clib_error_t *error = 0; + uword *p; + int is_enable = 0; + int is_disable = 0; + int skip_version_check = 0; + + if (pm->config_index_by_name == 0) + pm->config_index_by_name = hash_create_string (0, sizeof (uword)); + + p = hash_get_mem (pm->config_index_by_name, name); + + if (p) + { + error = clib_error_return (0, "plugin '%s' already configured", name); + goto done; + } + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "enable")) + is_enable = 1; + else if (unformat (input, "disable")) + is_disable = 1; + else if (unformat (input, "skip-version-check")) + skip_version_check = 1; + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, input); + goto done; + } + } + + if (is_enable && is_disable) + { + error = clib_error_return (0, "please specify either enable or disable" + " for plugin '%s'", name); + goto done; + } + + vec_add2 (pm->configs, pc, 1); + hash_set_mem (pm->config_index_by_name, name, pc - pm->configs); + pc->is_enabled = is_enable; + pc->is_disabled = is_disable; + pc->skip_version_check = skip_version_check; + pc->name = name; + +done: + return error; +} + +clib_error_t * +vlib_plugin_config (vlib_main_t * vm, unformat_input_t * input) +{ + plugin_main_t *pm = &vlib_plugin_main; + clib_error_t *error = 0; + unformat_input_t in; + + unformat_init (&in, 0, 0); + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + u8 *s, *v; + if (unformat (input, "%s %v", &s, &v)) + { + if (strncmp ((const char *) s, "plugins", 8) == 0) + { + if (vec_len (in.buffer) > 0) + vec_add1 (in.buffer, ' '); + vec_add (in.buffer, v, vec_len (v)); + } + } + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, input); + goto done; + } + + vec_free (v); + vec_free (s); + } +done: + input = ∈ + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + unformat_input_t sub_input; + u8 *s = 0; + if (unformat (input, "path %s", &s)) + pm->plugin_path = s; + else if (unformat (input, "plugin %s %U", &s, + unformat_vlib_cli_sub_input, &sub_input)) + { + error = config_one_plugin (vm, (char *) s, &sub_input); + unformat_free (&sub_input); + if (error) + goto done2; + } + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, input); + { + vec_free (s); + goto done2; + } + } + } + +done2: + unformat_free (&in); + return error; +} + +/* discard whole 'plugins' section, as it is already consumed prior to + plugin load */ +static clib_error_t * +plugins_config (vlib_main_t * vm, unformat_input_t * input) +{ + u8 *junk; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%s", &junk)) + { + vec_free (junk); + return 0; + } + else + return clib_error_return (0, "unknown input '%U'", + format_unformat_error, input); + } + return 0; +} + +VLIB_CONFIG_FUNCTION (plugins_config, "plugins"); + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vlib/unix/plugin.h b/src/vlib/unix/plugin.h index 1c74cdd2..01fec356 100644 --- a/src/vlib/unix/plugin.h +++ b/src/vlib/unix/plugin.h @@ -56,7 +56,14 @@ * vlib_load_new_plugins(). */ - +/* *INDENT-OFF* */ +typedef CLIB_PACKED(struct { + u8 default_disabled; + const char version[32]; + const char version_required[32]; + const char *early_init; +}) vlib_plugin_registration_t; +/* *INDENT-ON* */ typedef struct { @@ -64,8 +71,20 @@ typedef struct u8 *filename; struct stat file_info; void *handle; + + /* plugin registration */ + vlib_plugin_registration_t *reg; + char *version; } plugin_info_t; +typedef struct +{ + char *name; + u8 is_disabled; + u8 is_enabled; + u8 skip_version_check; +} plugin_config_t; + typedef struct { /* loaded plugin info */ @@ -76,8 +95,9 @@ typedef struct u8 *plugin_path; u8 *plugin_name_filter; - /* handoff structure get callback */ - void *handoff_structure_get_cb; + /* plugin configs and hash by name */ + plugin_config_t *configs; + uword *config_index_by_name; /* usual */ vlib_main_t *vlib_main; @@ -85,10 +105,15 @@ typedef struct extern plugin_main_t vlib_plugin_main; +clib_error_t *vlib_plugin_config (vlib_main_t * vm, unformat_input_t * input); int vlib_plugin_early_init (vlib_main_t * vm); int vlib_load_new_plugins (plugin_main_t * pm, int from_early_init); void *vlib_get_plugin_symbol (char *plugin_name, char *symbol_name); +#define VLIB_PLUGIN_REGISTER() \ + vlib_plugin_registration_t vlib_plugin_registration \ + __attribute__((__section__(".vlib_plugin_registration"))) + #endif /* __included_plugin_h__ */ /* diff --git a/src/vnet/plugin/plugin.h b/src/vnet/plugin/plugin.h index a14a5932..6e1a3264 100644 --- a/src/vnet/plugin/plugin.h +++ b/src/vnet/plugin/plugin.h @@ -20,13 +20,6 @@ #include #include #include - -/* Pointers to Genuine Vnet data structures handed to plugin .dll's */ -typedef struct { - vnet_main_t * vnet_main; - ethernet_main_t * ethernet_main; -} vnet_plugin_handoff_t; - -void * vnet_get_handoff_structure (void); +#include #endif /* included_vnet_plugin_h */ diff --git a/src/vpp/vnet/main.c b/src/vpp/vnet/main.c index a252b846..4a96ca94 100644 --- a/src/vpp/vnet/main.c +++ b/src/vpp/vnet/main.c @@ -18,7 +18,7 @@ #include #include #include - +#include #include @@ -39,16 +39,7 @@ vpe_main_init (vlib_main_t * vm) * Load plugins from /usr/lib/vpp_plugins by default */ char *vlib_plugin_path = "/usr/lib/vpp_plugins"; - -void * -vnet_get_handoff_structure (void) -{ - static vnet_plugin_handoff_t _rv, *rv = &_rv; - - rv->vnet_main = vnet_get_main (); - rv->ethernet_main = ðernet_main; - return (void *) rv; -} +char *vlib_plugin_app_version = VPP_BUILD_VER; int main (int argc, char *argv[]) @@ -59,7 +50,6 @@ main (int argc, char *argv[]) uword main_heap_size = (1ULL << 30); u8 *sizep; u32 size; - void vlib_set_get_handoff_structure_cb (void *cb); #if __x86_64__ CLIB_UNUSED (const char *msg) @@ -206,7 +196,6 @@ defaulted: #if DPDK == 0 unix_physmem_init (vm, 0 /* fail_if_physical_memory_not_present */ ); #endif - vlib_set_get_handoff_structure_cb (&vnet_get_handoff_structure); return vlib_unix_main (argc, argv); } else -- cgit 1.2.3-korg From 71d73fe818b1cd25ca6b181c450c404d43ad4abe Mon Sep 17 00:00:00 2001 From: Gabriel Ganne Date: Sat, 4 Feb 2017 10:51:04 +0100 Subject: fix some 'stored but never read' warnings raised by clang found by `scan-build make plugins-release` Signed-off-by: Gabriel Ganne Change-Id: I52048e3a8ae3fb85eb3d91f6a5e15216dd7b9baa --- src/vlib/error_funcs.h | 8 +++++--- src/vlib/node_funcs.h | 5 ++--- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/error_funcs.h b/src/vlib/error_funcs.h index 1a3602e9..ab281ba2 100644 --- a/src/vlib/error_funcs.h +++ b/src/vlib/error_funcs.h @@ -45,10 +45,12 @@ always_inline void vlib_error_elog_count (vlib_main_t * vm, uword counter, uword increment) { - elog_main_t *em = &vm->elog_main; if (VLIB_ELOG_MAIN_LOOP > 0 && increment > 0) - elog (em, vec_elt_at_index (vm->error_elog_event_types, counter), - increment); + { + elog_main_t *em = &vm->elog_main; + elog (em, vec_elt_at_index (vm->error_elog_event_types, counter), + increment); + } } always_inline void diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index 21167396..f49a8d6f 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -476,7 +476,7 @@ vlib_process_get_event_data (vlib_main_t * vm, vlib_node_main_t *nm = &vm->node_main; vlib_process_t *p; vlib_process_event_type_t *et; - uword t, l; + uword t; void *event_data_vector; p = vec_elt (nm->processes, nm->current_process_index); @@ -490,8 +490,7 @@ vlib_process_get_event_data (vlib_main_t * vm, p->non_empty_event_type_bitmap = clib_bitmap_andnoti (p->non_empty_event_type_bitmap, t); - l = _vec_len (p->pending_event_data_by_type_index[t]); - ASSERT (l > 0); + ASSERT (_vec_len (p->pending_event_data_by_type_index[t]) > 0); event_data_vector = p->pending_event_data_by_type_index[t]; p->pending_event_data_by_type_index[t] = 0; -- cgit 1.2.3-korg From bd69a5f24c6e83e9101f203dd124864fb2877a17 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Sun, 5 Feb 2017 23:44:42 +0100 Subject: vlib: remove algned/unaligned buffers scheme Change-Id: I4433eaed3f4e201edc329c4842cbbf74beb19a9a Signed-off-by: Damjan Marion --- src/vlib/buffer.c | 220 +++++------------------------------------ src/vlib/buffer.h | 13 +-- src/vlib/buffer_funcs.h | 53 +++------- src/vlib/threads.c | 3 +- src/vnet/devices/dpdk/buffer.c | 131 +++--------------------- src/vnet/replication.c | 23 +---- 6 files changed, 57 insertions(+), 386 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index ea4960e2..95b4344f 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -304,63 +304,6 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm, } } -#define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32)) - -/* Make sure we have at least given number of unaligned buffers. */ -void -vlib_buffer_free_list_fill_unaligned (vlib_main_t * vm, - vlib_buffer_free_list_t * free_list, - uword n_unaligned_buffers) -{ - word la = vec_len (free_list->aligned_buffers); - word lu = vec_len (free_list->unaligned_buffers); - - /* Aligned come in aligned copy-sized chunks. */ - ASSERT (la % BUFFERS_PER_COPY == 0); - - ASSERT (la >= n_unaligned_buffers); - - while (lu < n_unaligned_buffers) - { - /* Copy 4 buffers from end of aligned vector to unaligned vector. */ - vec_add (free_list->unaligned_buffers, - free_list->aligned_buffers + la - BUFFERS_PER_COPY, - BUFFERS_PER_COPY); - la -= BUFFERS_PER_COPY; - lu += BUFFERS_PER_COPY; - } - _vec_len (free_list->aligned_buffers) = la; -} - -/* After free aligned buffers may not contain even sized chunks. */ -void -vlib_buffer_free_list_trim_aligned (vlib_buffer_free_list_t * f) -{ - uword l, n_trim; - - /* Add unaligned to aligned before trim. */ - l = vec_len (f->unaligned_buffers); - if (l > 0) - { - vec_add_aligned (f->aligned_buffers, f->unaligned_buffers, l, - /* align */ sizeof (vlib_copy_unit_t)); - - _vec_len (f->unaligned_buffers) = 0; - } - - /* Remove unaligned buffers from end of aligned vector and save for next trim. */ - l = vec_len (f->aligned_buffers); - n_trim = l % BUFFERS_PER_COPY; - if (n_trim) - { - /* Trim aligned -> unaligned. */ - vec_add (f->unaligned_buffers, f->aligned_buffers + l - n_trim, n_trim); - - /* Remove from aligned. */ - _vec_len (f->aligned_buffers) = l - n_trim; - } -} - void vlib_buffer_merge_free_lists (vlib_buffer_free_list_t * dst, vlib_buffer_free_list_t * src) @@ -368,23 +311,12 @@ vlib_buffer_merge_free_lists (vlib_buffer_free_list_t * dst, uword l; u32 *d; - vlib_buffer_free_list_trim_aligned (src); - vlib_buffer_free_list_trim_aligned (dst); - - l = vec_len (src->aligned_buffers); - if (l > 0) - { - vec_add2_aligned (dst->aligned_buffers, d, l, - /* align */ sizeof (vlib_copy_unit_t)); - clib_memcpy (d, src->aligned_buffers, l * sizeof (d[0])); - vec_free (src->aligned_buffers); - } - - l = vec_len (src->unaligned_buffers); + l = vec_len (src->buffers); if (l > 0) { - vec_add (dst->unaligned_buffers, src->unaligned_buffers, l); - vec_free (src->unaligned_buffers); + vec_add2_aligned (dst->buffers, d, l, CLIB_CACHE_LINE_BYTES); + clib_memcpy (d, src->buffers, l * sizeof (d[0])); + vec_free (src->buffers); } } @@ -447,8 +379,7 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, ASSERT (f - bm->buffer_free_list_pool == wf - wbm->buffer_free_list_pool); wf[0] = f[0]; - wf->aligned_buffers = 0; - wf->unaligned_buffers = 0; + wf->buffers = 0; wf->n_alloc = 0; } @@ -505,8 +436,7 @@ del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) vm->os_physmem_free (f->buffer_memory_allocated[i]); vec_free (f->name); vec_free (f->buffer_memory_allocated); - vec_free (f->unaligned_buffers); - vec_free (f->aligned_buffers); + vec_free (f->buffers); } /* Add buffer free list. */ @@ -522,8 +452,7 @@ vlib_buffer_delete_free_list_internal (vlib_main_t * vm, u32 free_list_index) f = vlib_buffer_get_free_list (vm, free_list_index); - ASSERT (vec_len (f->unaligned_buffers) + vec_len (f->aligned_buffers) == - f->n_alloc); + ASSERT (vec_len (f->buffers) == f->n_alloc); merge_index = vlib_buffer_get_free_list_with_size (vm, f->n_data_bytes); if (merge_index != ~0 && merge_index != free_list_index) { @@ -558,15 +487,13 @@ fill_free_list (vlib_main_t * vm, u32 *bi; u32 n_remaining, n_alloc, n_this_chunk; - vlib_buffer_free_list_trim_aligned (fl); - /* Already have enough free buffers on free list? */ - n = min_free_buffers - vec_len (fl->aligned_buffers); + n = min_free_buffers - vec_len (fl->buffers); if (n <= 0) return min_free_buffers; /* Always allocate round number of buffers. */ - n = round_pow2 (n, BUFFERS_PER_COPY); + n = round_pow2 (n, CLIB_CACHE_LINE_BYTES / sizeof (u32)); /* Always allocate new buffers in reasonably large sized chunks. */ n = clib_max (n, fl->min_n_buffers_each_physmem_alloc); @@ -594,8 +521,7 @@ fill_free_list (vlib_main_t * vm, n_remaining -= n_this_chunk; b = buffers; - vec_add2_aligned (fl->aligned_buffers, bi, n_this_chunk, - sizeof (vlib_copy_unit_t)); + vec_add2_aligned (fl->buffers, bi, n_this_chunk, CLIB_CACHE_LINE_BYTES); for (i = 0; i < n_this_chunk; i++) { bi[i] = vlib_get_buffer_index (vm, b); @@ -621,121 +547,28 @@ fill_free_list (vlib_main_t * vm, return n_alloc; } -always_inline uword -copy_alignment (u32 * x) -{ - return (pointer_to_uword (x) / sizeof (x[0])) % BUFFERS_PER_COPY; -} - - static u32 alloc_from_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * free_list, u32 * alloc_buffers, u32 n_alloc_buffers) { - u32 *dst, *u_src; - uword u_len, n_left; - uword n_unaligned_start, n_unaligned_end, n_filled; + u32 *dst, *src; + uword len; + uword n_filled; - n_left = n_alloc_buffers; dst = alloc_buffers; - n_unaligned_start = ((BUFFERS_PER_COPY - copy_alignment (dst)) - & (BUFFERS_PER_COPY - 1)); n_filled = fill_free_list (vm, free_list, n_alloc_buffers); if (n_filled == 0) return 0; - n_left = n_filled < n_left ? n_filled : n_left; - n_alloc_buffers = n_left; - - if (n_unaligned_start >= n_left) - { - n_unaligned_start = n_left; - n_unaligned_end = 0; - } - else - n_unaligned_end = copy_alignment (dst + n_alloc_buffers); - - vlib_buffer_free_list_fill_unaligned (vm, free_list, - n_unaligned_start + n_unaligned_end); - - u_len = vec_len (free_list->unaligned_buffers); - u_src = free_list->unaligned_buffers + u_len - 1; + len = vec_len (free_list->buffers); + ASSERT (len >= n_alloc_buffers); - if (n_unaligned_start) - { - uword n_copy = n_unaligned_start; - if (n_copy > n_left) - n_copy = n_left; - n_left -= n_copy; - - while (n_copy > 0) - { - *dst++ = *u_src--; - n_copy--; - u_len--; - } - - /* Now dst should be aligned. */ - if (n_left > 0) - ASSERT (pointer_to_uword (dst) % sizeof (vlib_copy_unit_t) == 0); - } - - /* Aligned copy. */ - { - vlib_copy_unit_t *d, *s; - uword n_copy; - - if (vec_len (free_list->aligned_buffers) < - ((n_left / BUFFERS_PER_COPY) * BUFFERS_PER_COPY)) - abort (); - - n_copy = n_left / BUFFERS_PER_COPY; - n_left = n_left % BUFFERS_PER_COPY; - - /* Remove buffers from aligned free list. */ - _vec_len (free_list->aligned_buffers) -= n_copy * BUFFERS_PER_COPY; - - s = (vlib_copy_unit_t *) vec_end (free_list->aligned_buffers); - d = (vlib_copy_unit_t *) dst; - - /* Fast path loop. */ - while (n_copy >= 4) - { - d[0] = s[0]; - d[1] = s[1]; - d[2] = s[2]; - d[3] = s[3]; - n_copy -= 4; - s += 4; - d += 4; - } - - while (n_copy >= 1) - { - d[0] = s[0]; - n_copy -= 1; - s += 1; - d += 1; - } - - dst = (void *) d; - } - - /* Unaligned copy. */ - ASSERT (n_unaligned_end == n_left); - while (n_left > 0) - { - *dst++ = *u_src--; - n_left--; - u_len--; - } + src = free_list->buffers + len - n_alloc_buffers; + clib_memcpy (dst, src, n_alloc_buffers * sizeof (u32)); - if (!free_list->unaligned_buffers) - ASSERT (u_len == 0); - else - _vec_len (free_list->unaligned_buffers) = u_len; + _vec_len (free_list->buffers) -= n_alloc_buffers; /* Verify that buffers are known free. */ vlib_buffer_validate_alloc_free (vm, alloc_buffers, @@ -831,8 +664,7 @@ again: vlib_buffer_validate_alloc_free (vm, b, n_left, VLIB_BUFFER_KNOWN_ALLOCATED); - vec_add2_aligned (fl->aligned_buffers, f, n_left, - /* align */ sizeof (vlib_copy_unit_t)); + vec_add2_aligned (fl->buffers, f, n_left, CLIB_CACHE_LINE_BYTES); n = next_to_free[i_next_to_free]; while (n_left >= 4) @@ -890,7 +722,7 @@ again: f -= 2; n -= free_next0 + free_next1; - _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + _vec_len (fl->buffers) = f - fl->buffers; fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); fl1 = pool_elt_at_index (bm->buffer_free_list_pool, fi1); @@ -924,8 +756,7 @@ again: fl = pool_elt_at_index (bm->buffer_free_list_pool, fi); } - vec_add2_aligned (fl->aligned_buffers, f, n_left, - /* align */ sizeof (vlib_copy_unit_t)); + vec_add2_aligned (fl->buffers, f, n_left, CLIB_CACHE_LINE_BYTES); } while (n_left >= 1) @@ -968,7 +799,7 @@ again: f -= 1; n -= free_next0; - _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + _vec_len (fl->buffers) = f - fl->buffers; fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); @@ -986,8 +817,7 @@ again: fi = fi0; fl = pool_elt_at_index (bm->buffer_free_list_pool, fi); - vec_add2_aligned (fl->aligned_buffers, f, n_left, - /* align */ sizeof (vlib_copy_unit_t)); + vec_add2_aligned (fl->buffers, f, n_left, CLIB_CACHE_LINE_BYTES); } if (follow_buffer_next && ((n_left = n - next_to_free[i_next_to_free]) > 0)) @@ -997,7 +827,7 @@ again: goto again; } - _vec_len (fl->aligned_buffers) = f - fl->aligned_buffers; + _vec_len (fl->buffers) = f - fl->buffers; if (vec_len (announce_list)) { @@ -1239,7 +1069,7 @@ format_vlib_buffer_free_list (u8 * s, va_list * va) "#Alloc", "#Free"); size = sizeof (vlib_buffer_t) + f->n_data_bytes; - n_free = vec_len (f->aligned_buffers) + vec_len (f->unaligned_buffers); + n_free = vec_len (f->buffers); bytes_alloc = size * f->n_alloc; bytes_free = size * n_free; diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index d270c08a..fffb50c8 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -50,12 +50,6 @@ #define VLIB_BUFFER_DATA_SIZE (2048) #define VLIB_BUFFER_PRE_DATA_SIZE __PRE_DATA_SIZE -#if defined (CLIB_HAVE_VEC128) || defined (__aarch64__) -typedef u8x16 vlib_copy_unit_t; -#else -typedef u64 vlib_copy_unit_t; -#endif - /** \file vlib buffer structure definition and a few select access methods. This structure and the buffer allocation @@ -262,11 +256,8 @@ typedef struct vlib_buffer_free_list_t /* Total number of buffers allocated from this free list. */ u32 n_alloc; - /* Vector of free buffers. Each element is a byte offset into I/O heap. - Aligned vectors always has naturally aligned vlib_copy_unit_t sized chunks - of buffer indices. Unaligned vector has any left over. This is meant to - speed up copy routines. */ - u32 *aligned_buffers, *unaligned_buffers; + /* Vector of free buffers. Each element is a byte offset into I/O heap. */ + u32 *buffers; /* Memory chunks allocated for this free list recorded here so they can be freed when free list diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 543a903c..fd051de5 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -350,10 +350,6 @@ vlib_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) u32 vlib_buffer_get_or_create_free_list (vlib_main_t * vm, u32 n_data_bytes, char *fmt, ...); - -/* After free aligned buffers may not contain even sized chunks. */ -void vlib_buffer_free_list_trim_aligned (vlib_buffer_free_list_t * f); - /* Merge two free lists */ void vlib_buffer_merge_free_lists (vlib_buffer_free_list_t * dst, vlib_buffer_free_list_t * src); @@ -664,23 +660,14 @@ unserialize_vlib_buffer_n_bytes (serialize_main_t * m) return n; } -typedef union -{ - vlib_buffer_t b; - vlib_copy_unit_t i[sizeof (vlib_buffer_t) / sizeof (vlib_copy_unit_t)]; -} -vlib_buffer_union_t; - /* Set a buffer quickly into "uninitialized" state. We want this to be extremely cheap and arrange for all fields that need to be initialized to be in the first 128 bits of the buffer. */ always_inline void -vlib_buffer_init_for_free_list (vlib_buffer_t * _dst, +vlib_buffer_init_for_free_list (vlib_buffer_t * dst, vlib_buffer_free_list_t * fl) { - vlib_buffer_union_t *dst = (vlib_buffer_union_t *) _dst; - vlib_buffer_union_t *src = - (vlib_buffer_union_t *) & fl->buffer_init_template; + vlib_buffer_t *src = &fl->buffer_init_template; /* Make sure vlib_buffer_t is cacheline aligned and sized */ ASSERT (STRUCT_OFFSET_OF (vlib_buffer_t, cacheline0) == 0); @@ -692,21 +679,14 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * _dst, /* Make sure buffer template is sane. */ ASSERT (fl->index == fl->buffer_init_template.free_list_index); - /* Copy template from src->current_data thru src->free_list_index */ - dst->i[0] = src->i[0]; - if (1 * sizeof (dst->i[0]) < 16) - dst->i[1] = src->i[1]; - if (2 * sizeof (dst->i[0]) < 16) - dst->i[2] = src->i[2]; - /* Make sure it really worked. */ -#define _(f) ASSERT (dst->b.f == src->b.f) +#define _(f) dst->f = src->f _(current_data); _(current_length); _(flags); _(free_list_index); #undef _ - ASSERT (dst->b.total_length_not_including_first_buffer == 0); + ASSERT (dst->total_length_not_including_first_buffer == 0); } always_inline void @@ -718,39 +698,28 @@ vlib_buffer_add_to_free_list (vlib_main_t * vm, b = vlib_get_buffer (vm, buffer_index); if (PREDICT_TRUE (do_init)) vlib_buffer_init_for_free_list (b, f); - vec_add1_aligned (f->aligned_buffers, buffer_index, - sizeof (vlib_copy_unit_t)); + vec_add1_aligned (f->buffers, buffer_index, CLIB_CACHE_LINE_BYTES); } always_inline void -vlib_buffer_init_two_for_free_list (vlib_buffer_t * _dst0, - vlib_buffer_t * _dst1, +vlib_buffer_init_two_for_free_list (vlib_buffer_t * dst0, + vlib_buffer_t * dst1, vlib_buffer_free_list_t * fl) { - vlib_buffer_union_t *dst0 = (vlib_buffer_union_t *) _dst0; - vlib_buffer_union_t *dst1 = (vlib_buffer_union_t *) _dst1; - vlib_buffer_union_t *src = - (vlib_buffer_union_t *) & fl->buffer_init_template; + vlib_buffer_t *src = &fl->buffer_init_template; /* Make sure buffer template is sane. */ ASSERT (fl->index == fl->buffer_init_template.free_list_index); - /* Copy template from src->current_data thru src->free_list_index */ - dst0->i[0] = dst1->i[0] = src->i[0]; - if (1 * sizeof (dst0->i[0]) < 16) - dst0->i[1] = dst1->i[1] = src->i[1]; - if (2 * sizeof (dst0->i[0]) < 16) - dst0->i[2] = dst1->i[2] = src->i[2]; - /* Make sure it really worked. */ -#define _(f) ASSERT (dst0->b.f == src->b.f && dst1->b.f == src->b.f) +#define _(f) dst0->f = src->f; dst1->f = src->f _(current_data); _(current_length); _(flags); _(free_list_index); #undef _ - ASSERT (dst0->b.total_length_not_including_first_buffer == 0); - ASSERT (dst1->b.total_length_not_including_first_buffer == 0); + ASSERT (dst0->total_length_not_including_first_buffer == 0); + ASSERT (dst1->total_length_not_including_first_buffer == 0); } #if CLIB_DEBUG > 0 diff --git a/src/vlib/threads.c b/src/vlib/threads.c index b3bbd30e..e3ea3c9c 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -708,8 +708,7 @@ start_workers (vlib_main_t * vm) == fl_clone - bm_clone->buffer_free_list_pool); fl_clone[0] = fl_orig[0]; - fl_clone->aligned_buffers = 0; - fl_clone->unaligned_buffers = 0; + fl_clone->buffers = 0; fl_clone->n_alloc = 0; })); /* *INDENT-ON* */ diff --git a/src/vnet/devices/dpdk/buffer.c b/src/vnet/devices/dpdk/buffer.c index 038f46d9..43ceb91e 100644 --- a/src/vnet/devices/dpdk/buffer.c +++ b/src/vnet/devices/dpdk/buffer.c @@ -79,8 +79,6 @@ STATIC_ASSERT (VLIB_BUFFER_PRE_DATA_SIZE == RTE_PKTMBUF_HEADROOM, "VLIB_BUFFER_PRE_DATA_SIZE must be equal to RTE_PKTMBUF_HEADROOM"); -#define BUFFERS_PER_COPY (sizeof (vlib_copy_unit_t) / sizeof (u32)) - static void del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) { @@ -88,23 +86,15 @@ del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) struct rte_mbuf *mb; vlib_buffer_t *b; - for (i = 0; i < vec_len (f->unaligned_buffers); i++) + for (i = 0; i < vec_len (f->buffers); i++) { - b = vlib_get_buffer (vm, f->unaligned_buffers[i]); - mb = rte_mbuf_from_vlib_buffer (b); - ASSERT (rte_mbuf_refcnt_read (mb) == 1); - rte_pktmbuf_free (mb); - } - for (i = 0; i < vec_len (f->aligned_buffers); i++) - { - b = vlib_get_buffer (vm, f->aligned_buffers[i]); + b = vlib_get_buffer (vm, f->buffers[i]); mb = rte_mbuf_from_vlib_buffer (b); ASSERT (rte_mbuf_refcnt_read (mb) == 1); rte_pktmbuf_free (mb); } vec_free (f->name); - vec_free (f->unaligned_buffers); - vec_free (f->aligned_buffers); + vec_free (f->buffers); } /* Add buffer free list. */ @@ -162,15 +152,13 @@ fill_free_list (vlib_main_t * vm, if (PREDICT_FALSE (rmp == 0)) return 0; - vlib_buffer_free_list_trim_aligned (fl); - /* Already have enough free buffers on free list? */ - n = min_free_buffers - vec_len (fl->aligned_buffers); + n = min_free_buffers - vec_len (fl->buffers); if (n <= 0) return min_free_buffers; /* Always allocate round number of buffers. */ - n = round_pow2 (n, BUFFERS_PER_COPY); + n = round_pow2 (n, CLIB_CACHE_LINE_BYTES / sizeof (u32)); /* Always allocate new buffers in reasonably large sized chunks. */ n = clib_max (n, fl->min_n_buffers_each_physmem_alloc); @@ -192,7 +180,7 @@ fill_free_list (vlib_main_t * vm, b = vlib_buffer_from_rte_mbuf (mb); bi = vlib_get_buffer_index (vm, b); - vec_add1_aligned (fl->aligned_buffers, bi, sizeof (vlib_copy_unit_t)); + vec_add1_aligned (fl->buffers, bi, CLIB_CACHE_LINE_BYTES); n_alloc++; n_remaining--; @@ -207,120 +195,27 @@ fill_free_list (vlib_main_t * vm, return n; } -always_inline uword -copy_alignment (u32 * x) -{ - return (pointer_to_uword (x) / sizeof (x[0])) % BUFFERS_PER_COPY; -} - static u32 alloc_from_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * free_list, u32 * alloc_buffers, u32 n_alloc_buffers) { - u32 *dst, *u_src; - uword u_len, n_left; - uword n_unaligned_start, n_unaligned_end, n_filled; + u32 *dst, *src; + uword len, n_filled; - n_left = n_alloc_buffers; dst = alloc_buffers; - n_unaligned_start = ((BUFFERS_PER_COPY - copy_alignment (dst)) - & (BUFFERS_PER_COPY - 1)); n_filled = fill_free_list (vm, free_list, n_alloc_buffers); if (n_filled == 0) return 0; - n_left = n_filled < n_left ? n_filled : n_left; - n_alloc_buffers = n_left; - - if (n_unaligned_start >= n_left) - { - n_unaligned_start = n_left; - n_unaligned_end = 0; - } - else - n_unaligned_end = copy_alignment (dst + n_alloc_buffers); - - vlib_buffer_free_list_fill_unaligned (vm, free_list, - n_unaligned_start + n_unaligned_end); - - u_len = vec_len (free_list->unaligned_buffers); - u_src = free_list->unaligned_buffers + u_len - 1; - - if (n_unaligned_start) - { - uword n_copy = n_unaligned_start; - if (n_copy > n_left) - n_copy = n_left; - n_left -= n_copy; - - while (n_copy > 0) - { - *dst++ = *u_src--; - n_copy--; - u_len--; - } - - /* Now dst should be aligned. */ - if (n_left > 0) - ASSERT (pointer_to_uword (dst) % sizeof (vlib_copy_unit_t) == 0); - } - - /* Aligned copy. */ - { - vlib_copy_unit_t *d, *s; - uword n_copy; - - if (vec_len (free_list->aligned_buffers) < - ((n_left / BUFFERS_PER_COPY) * BUFFERS_PER_COPY)) - abort (); - - n_copy = n_left / BUFFERS_PER_COPY; - n_left = n_left % BUFFERS_PER_COPY; - - /* Remove buffers from aligned free list. */ - _vec_len (free_list->aligned_buffers) -= n_copy * BUFFERS_PER_COPY; - - s = (vlib_copy_unit_t *) vec_end (free_list->aligned_buffers); - d = (vlib_copy_unit_t *) dst; + len = vec_len (free_list->buffers); + ASSERT (len >= n_alloc_buffers); - /* Fast path loop. */ - while (n_copy >= 4) - { - d[0] = s[0]; - d[1] = s[1]; - d[2] = s[2]; - d[3] = s[3]; - n_copy -= 4; - s += 4; - d += 4; - } - - while (n_copy >= 1) - { - d[0] = s[0]; - n_copy -= 1; - s += 1; - d += 1; - } - - dst = (void *) d; - } - - /* Unaligned copy. */ - ASSERT (n_unaligned_end == n_left); - while (n_left > 0) - { - *dst++ = *u_src--; - n_left--; - u_len--; - } + src = free_list->buffers + len - n_alloc_buffers; + clib_memcpy (dst, src, n_alloc_buffers * sizeof (u32)); - if (!free_list->unaligned_buffers) - ASSERT (u_len == 0); - else - _vec_len (free_list->unaligned_buffers) = u_len; + _vec_len (free_list->buffers) -= n_alloc_buffers; return n_alloc_buffers; } diff --git a/src/vnet/replication.c b/src/vnet/replication.c index 561c86cd..02755195 100644 --- a/src/vnet/replication.c +++ b/src/vnet/replication.c @@ -168,32 +168,20 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl) * Note: this could be sped up if the node index were stuffed into * the freelist itself. */ - if (vec_len (fl->aligned_buffers) > 0) + if (vec_len (fl->buffers) > 0) { - bi0 = fl->aligned_buffers[0]; - b0 = vlib_get_buffer (vm, bi0); - ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->recycle_count); - feature_node_index = ctx->recycle_node_index; - } - else if (vec_len (fl->unaligned_buffers) > 0) - { - bi0 = fl->unaligned_buffers[0]; + bi0 = fl->buffers[0]; b0 = vlib_get_buffer (vm, bi0); ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->recycle_count); feature_node_index = ctx->recycle_node_index; } - /* aligned, unaligned buffers */ + /* buffers */ for (i = 0; i < 2; i++) { if (i == 0) { - from = fl->aligned_buffers; - n_left_from = vec_len (from); - } - else - { - from = fl->unaligned_buffers; + from = fl->buffers; n_left_from = vec_len (from); } @@ -245,8 +233,7 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl) } } - vec_reset_length (fl->aligned_buffers); - vec_reset_length (fl->unaligned_buffers); + vec_reset_length (fl->buffers); if (f) { -- cgit 1.2.3-korg From e4ad8cce6cf6e5d50b0ec101a4f5ab5d510ad374 Mon Sep 17 00:00:00 2001 From: Ole Troan Date: Tue, 7 Feb 2017 11:22:33 +0100 Subject: VPP-630: Null pointer dereferences in vlib/unix/plugin.c Change-Id: Iafb071c684a43e21925e3a43019cd86372347898 Signed-off-by: Ole Troan --- src/vlib/unix/plugin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/vlib') diff --git a/src/vlib/unix/plugin.c b/src/vlib/unix/plugin.c index 4aba95c8..15a2bcbb 100644 --- a/src/vlib/unix/plugin.c +++ b/src/vlib/unix/plugin.c @@ -163,7 +163,7 @@ load_one_plugin (plugin_main_t * pm, plugin_info_t * pi, int from_early_init) pi->version = str_array_to_vec ((char *) ®->version, sizeof (reg->version)); - if (reg && reg->early_init) + if (reg->early_init) { clib_error_t *(*ei) (vlib_main_t *); void *h; -- cgit 1.2.3-korg From 2d41f169c818219b5e8b23ed32e28ef334f39c35 Mon Sep 17 00:00:00 2001 From: Matus Fabian Date: Tue, 21 Feb 2017 22:27:13 -0800 Subject: fix trace frame-queue unformat of index Change-Id: Id891af5ef3c4afe877282b34cd03fc43886940a3 Signed-off-by: Matus Fabian --- src/vlib/threads_cli.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/vlib') diff --git a/src/vlib/threads_cli.c b/src/vlib/threads_cli.c index b64028c4..54cc1aed 100644 --- a/src/vlib/threads_cli.c +++ b/src/vlib/threads_cli.c @@ -160,7 +160,7 @@ trace_frame_queue (vlib_main_t * vm, unformat_input_t * input, enable = 1; else if (unformat (line_input, "off")) enable = 0; - else if (unformat (line_input, "index %u"), &index) + else if (unformat (line_input, "index %u", &index)) ; else return clib_error_return (0, "parse error: '%U'", -- cgit 1.2.3-korg From a9a20e7f69f4a91a4d5267ab5ce14125bdc7d6c6 Mon Sep 17 00:00:00 2001 From: Billy McFall Date: Wed, 15 Feb 2017 11:39:12 -0500 Subject: VPP-635: CLI Memory leak with invalid parameter In the CLI parsing, below is a common pattern: /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) return 0; while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { if (unformat (line_input, "x")) x = 1; : else return clib_error_return (0, "unknown input `%U'", format_unformat_error, line_input); } unformat_free (line_input); The 'else' returns if an unknown string is encountered. There a memory leak because the 'unformat_free(line_input)' is not called. There is a large number of instances of this pattern. Replaced the previous pattern with: /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) return 0; while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { if (unformat (line_input, "x")) x = 1; : else { error = clib_error_return (0, "unknown input `%U'", format_unformat_error, line_input); goto done: } } /* ...Remaining code... */ done: unformat_free (line_input); return error; } In multiple files, 'unformat_free (line_input);' was never called, so there was a memory leak whether an invalid string was entered or not. Also, there were multiple instance where: error = clib_error_return (0, "unknown input `%U'", format_unformat_error, line_input); used 'input' as the last parameter instead of 'line_input'. The result is that output did not contain the substring in error, instead just an empty string. Fixed all of those as well. There are a lot of file, and very mind numbing work, so tried to keep it to a pattern to avoid mistakes. Change-Id: I8902f0c32a47dd7fb3bb3471a89818571702f1d2 Signed-off-by: Billy McFall Signed-off-by: Dave Barach --- build-root/emacs-lisp/tunnel-c-skel.el | 19 ++- src/plugins/ila/ila.c | 25 ++- src/plugins/lb/cli.c | 99 ++++++----- src/plugins/sixrd/sixrd.c | 42 +++-- src/plugins/snat/snat.c | 139 +++++++++++----- src/vlib/threads_cli.c | 79 ++++++--- src/vlib/trace.c | 13 +- src/vlib/unix/cli.c | 22 ++- src/vnet/devices/af_packet/cli.c | 56 +++++-- src/vnet/devices/dpdk/cli.c | 290 +++++++++++++++++++++++---------- src/vnet/devices/dpdk/ipsec/cli.c | 15 +- src/vnet/devices/netmap/cli.c | 54 ++++-- src/vnet/devices/virtio/vhost-user.c | 62 +++++-- src/vnet/gre/interface.c | 35 ++-- src/vnet/ip/ip4_source_check.c | 6 +- src/vnet/ip/ip4_test.c | 15 +- src/vnet/ip/ip6_neighbor.c | 27 ++- src/vnet/ip/lookup.c | 34 ++-- src/vnet/ipsec-gre/interface.c | 34 ++-- src/vnet/ipsec/ipsec_cli.c | 177 +++++++++++++------- src/vnet/l2/l2_patch.c | 26 ++- src/vnet/l2/l2_xcrw.c | 34 +++- src/vnet/l2tp/l2tp.c | 39 +++-- src/vnet/lisp-cp/lisp_cli.c | 139 ++++++++++++---- src/vnet/lisp-gpe/interface.c | 58 +++++-- src/vnet/lisp-gpe/lisp_gpe.c | 13 +- src/vnet/map/map.c | 186 +++++++++++++++------ src/vnet/mpls/mpls.c | 2 + src/vnet/mpls/mpls_tunnel.c | 19 ++- src/vnet/pg/cli.c | 39 +++-- src/vnet/policer/node_funcs.c | 19 ++- src/vnet/policer/policer.c | 13 +- src/vnet/unix/tapcli.c | 57 +++++-- src/vnet/vxlan-gpe/vxlan_gpe.c | 62 +++++-- src/vnet/vxlan/vxlan.c | 81 ++++++--- src/vpp/app/l2t.c | 9 +- src/vpp/app/vpe_cli.c | 24 ++- 37 files changed, 1487 insertions(+), 576 deletions(-) (limited to 'src/vlib') diff --git a/build-root/emacs-lisp/tunnel-c-skel.el b/build-root/emacs-lisp/tunnel-c-skel.el index aa260e53..a1b1757d 100644 --- a/build-root/emacs-lisp/tunnel-c-skel.el +++ b/build-root/emacs-lisp/tunnel-c-skel.el @@ -288,6 +288,7 @@ static clib_error_t * vlib_cli_command_t * cmd) { unformat_input_t _line_input, * line_input = &_line_input; + clib_error_t *error = 0; ip4_address_t src, dst; u8 is_add = 1; u8 src_set = 0; @@ -322,13 +323,19 @@ static clib_error_t * { encap_fib_index = fib_index_from_fib_id (tmp); if (encap_fib_index == ~0) - return clib_error_return (0, \"nonexistent encap fib id %d\", tmp); + { + unformat_free (line_input); + return clib_error_return (0, \"nonexistent encap fib id %d\", tmp); + } } else if (unformat (line_input, \"decap-vrf-id %d\", &tmp)) { decap_fib_index = fib_index_from_fib_id (tmp); if (decap_fib_index == ~0) - return clib_error_return (0, \"nonexistent decap fib id %d\", tmp); + { + unformat_free (line_input); + return clib_error_return (0, \"nonexistent decap fib id %d\", tmp); + } } else if (unformat (line_input, \"decap-next %U\", unformat_decap_next, &decap_next_index)) @@ -346,8 +353,12 @@ static clib_error_t * * in the " ENCAP_STACK " header */ else - return clib_error_return (0, \"parse error: '%U'\", - format_unformat_error, line_input); + { + error = clib_error_return (0, \"parse error: '%U'\", + format_unformat_error, line_input); + unformat_free (line_input); + return error; + } } unformat_free (line_input); diff --git a/src/plugins/ila/ila.c b/src/plugins/ila/ila.c index e0f3907f..52c7ea55 100644 --- a/src/plugins/ila/ila.c +++ b/src/plugins/ila/ila.c @@ -949,6 +949,7 @@ ila_entry_command_fn (vlib_main_t * vm, ila_add_del_entry_args_t args = { 0 }; u8 next_hop_set = 0; int ret; + clib_error_t *error = 0; args.type = ILA_TYPE_IID; args.csum_mode = ILA_CSUM_MODE_NO_ACTION; @@ -986,19 +987,29 @@ ila_entry_command_fn (vlib_main_t * vm, else if (unformat (line_input, "del")) args.is_del = 1; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (!next_hop_set) - return clib_error_return (0, "Specified a next hop"); + { + error = clib_error_return (0, "Specified a next hop"); + goto done; + } if ((ret = ila_add_del_entry (&args))) - return clib_error_return (0, "ila_add_del_entry returned error %d", ret); + { + error = clib_error_return (0, "ila_add_del_entry returned error %d", ret); + goto done; + } - return NULL; +done: + unformat_free (line_input); + + return error; } VLIB_CLI_COMMAND (ila_entry_command, static) = diff --git a/src/plugins/lb/cli.c b/src/plugins/lb/cli.c index b59c6426..6452a875 100644 --- a/src/plugins/lb/cli.c +++ b/src/plugins/lb/cli.c @@ -28,13 +28,16 @@ lb_vip_command_fn (vlib_main_t * vm, int ret; u32 gre4 = 0; lb_vip_type_t type; + clib_error_t *error = 0; if (!unformat_user (input, unformat_line_input, line_input)) return 0; - if (!unformat(line_input, "%U", unformat_ip46_prefix, &prefix, &plen, IP46_TYPE_ANY, &plen)) - return clib_error_return (0, "invalid vip prefix: '%U'", - format_unformat_error, line_input); + if (!unformat(line_input, "%U", unformat_ip46_prefix, &prefix, &plen, IP46_TYPE_ANY, &plen)) { + error = clib_error_return (0, "invalid vip prefix: '%U'", + format_unformat_error, line_input); + goto done; + } while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { @@ -46,13 +49,13 @@ lb_vip_command_fn (vlib_main_t * vm, gre4 = 1; else if (unformat(line_input, "encap gre6")) gre4 = 0; - else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + else { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (ip46_prefix_is_ip4(&prefix, plen)) { type = (gre4)?LB_VIP_TYPE_IP4_GRE4:LB_VIP_TYPE_IP4_GRE6; @@ -65,17 +68,25 @@ lb_vip_command_fn (vlib_main_t * vm, u32 index; if (!del) { if ((ret = lb_vip_add(&prefix, plen, type, new_len, &index))) { - return clib_error_return (0, "lb_vip_add error %d", ret); + error = clib_error_return (0, "lb_vip_add error %d", ret); + goto done; } else { vlib_cli_output(vm, "lb_vip_add ok %d", index); } } else { - if ((ret = lb_vip_find_index(&prefix, plen, &index))) - return clib_error_return (0, "lb_vip_find_index error %d", ret); - else if ((ret = lb_vip_del(index))) - return clib_error_return (0, "lb_vip_del error %d", ret); + if ((ret = lb_vip_find_index(&prefix, plen, &index))) { + error = clib_error_return (0, "lb_vip_find_index error %d", ret); + goto done; + } else if ((ret = lb_vip_del(index))) { + error = clib_error_return (0, "lb_vip_del error %d", ret); + goto done; + } } - return NULL; + +done: + unformat_free (line_input); + + return error; } VLIB_CLI_COMMAND (lb_vip_command, static) = @@ -96,16 +107,21 @@ lb_as_command_fn (vlib_main_t * vm, u32 vip_index; u8 del = 0; int ret; + clib_error_t *error = 0; if (!unformat_user (input, unformat_line_input, line_input)) return 0; - if (!unformat(line_input, "%U", unformat_ip46_prefix, &vip_prefix, &vip_plen, IP46_TYPE_ANY)) - return clib_error_return (0, "invalid as address: '%U'", - format_unformat_error, line_input); + if (!unformat(line_input, "%U", unformat_ip46_prefix, &vip_prefix, &vip_plen, IP46_TYPE_ANY)) { + error = clib_error_return (0, "invalid as address: '%U'", + format_unformat_error, line_input); + goto done; + } - if ((ret = lb_vip_find_index(&vip_prefix, vip_plen, &vip_index))) - return clib_error_return (0, "lb_vip_find_index error %d", ret); + if ((ret = lb_vip_find_index(&vip_prefix, vip_plen, &vip_index))) { + error = clib_error_return (0, "lb_vip_find_index error %d", ret); + goto done; + } while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { @@ -114,15 +130,15 @@ lb_as_command_fn (vlib_main_t * vm, } else if (unformat(line_input, "del")) { del = 1; } else { - vec_free(as_array); - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; } } if (!vec_len(as_array)) { - vec_free(as_array); - return clib_error_return (0, "No AS address provided"); + error = clib_error_return (0, "No AS address provided"); + goto done; } lb_garbage_collection(); @@ -130,18 +146,21 @@ lb_as_command_fn (vlib_main_t * vm, if (del) { if ((ret = lb_vip_del_ass(vip_index, as_array, vec_len(as_array)))) { - vec_free(as_array); - return clib_error_return (0, "lb_vip_del_ass error %d", ret); + error = clib_error_return (0, "lb_vip_del_ass error %d", ret); + goto done; } } else { if ((ret = lb_vip_add_ass(vip_index, as_array, vec_len(as_array)))) { - vec_free(as_array); - return clib_error_return (0, "lb_vip_add_ass error %d", ret); + error = clib_error_return (0, "lb_vip_add_ass error %d", ret); + goto done; } } +done: + unformat_free (line_input); vec_free(as_array); - return 0; + + return error; } VLIB_CLI_COMMAND (lb_as_command, static) = @@ -163,6 +182,7 @@ lb_conf_command_fn (vlib_main_t * vm, u32 per_cpu_sticky_buckets_log2 = 0; u32 flow_timeout = lbm->flow_timeout; int ret; + clib_error_t *error = 0; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -181,19 +201,24 @@ lb_conf_command_fn (vlib_main_t * vm, per_cpu_sticky_buckets = 1 << per_cpu_sticky_buckets_log2; } else if (unformat(line_input, "timeout %d", &flow_timeout)) ; - else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + else { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - lb_garbage_collection(); - if ((ret = lb_conf(&ip4, &ip6, per_cpu_sticky_buckets, flow_timeout))) - return clib_error_return (0, "lb_conf error %d", ret); + if ((ret = lb_conf(&ip4, &ip6, per_cpu_sticky_buckets, flow_timeout))) { + error = clib_error_return (0, "lb_conf error %d", ret); + goto done; + } - return NULL; +done: + unformat_free (line_input); + + return error; } VLIB_CLI_COMMAND (lb_conf_command, static) = diff --git a/src/plugins/sixrd/sixrd.c b/src/plugins/sixrd/sixrd.c index 71fc181f..67a9a3ad 100644 --- a/src/plugins/sixrd/sixrd.c +++ b/src/plugins/sixrd/sixrd.c @@ -192,6 +192,7 @@ sixrd_add_domain_command_fn (vlib_main_t *vm, u32 num_m_args = 0; /* Optional arguments */ u32 mtu = 0; + clib_error_t *error = 0; /* Get a line of input. */ if (!unformat_user(input, unformat_line_input, line_input)) @@ -205,19 +206,25 @@ sixrd_add_domain_command_fn (vlib_main_t *vm, num_m_args++; else if (unformat(line_input, "mtu %d", &mtu)) num_m_args++; - else - return clib_error_return(0, "unknown input `%U'", - format_unformat_error, input); + else { + error = clib_error_return(0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free(line_input); - if (num_m_args < 3) - return clib_error_return(0, "mandatory argument(s) missing"); + if (num_m_args < 3) { + error = clib_error_return(0, "mandatory argument(s) missing"); + goto done; + } sixrd_create_domain(&ip6_prefix, ip6_prefix_len, &ip4_prefix, ip4_prefix_len, &ip4_src, &sixrd_domain_index, mtu); - return 0; +done: + unformat_free (line_input); + + return error; } static clib_error_t * @@ -228,6 +235,7 @@ sixrd_del_domain_command_fn (vlib_main_t *vm, unformat_input_t _line_input, *line_input = &_line_input; u32 num_m_args = 0; u32 sixrd_domain_index; + clib_error_t *error = 0; /* Get a line of input. */ if (! unformat_user(input, unformat_line_input, line_input)) @@ -236,18 +244,24 @@ sixrd_del_domain_command_fn (vlib_main_t *vm, while (unformat_check_input(line_input) != UNFORMAT_END_OF_INPUT) { if (unformat(line_input, "index %d", &sixrd_domain_index)) num_m_args++; - else - return clib_error_return(0, "unknown input `%U'", - format_unformat_error, input); + else { + error = clib_error_return(0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free(line_input); - if (num_m_args != 1) - return clib_error_return(0, "mandatory argument(s) missing"); + if (num_m_args != 1) { + error = clib_error_return(0, "mandatory argument(s) missing"); + goto done; + } sixrd_delete_domain(sixrd_domain_index); - return 0; +done: + unformat_free (line_input); + + return error; } static u8 * diff --git a/src/plugins/snat/snat.c b/src/plugins/snat/snat.c index 73854a7a..8c2bacdb 100644 --- a/src/plugins/snat/snat.c +++ b/src/plugins/snat/snat.c @@ -1705,6 +1705,7 @@ add_address_command_fn (vlib_main_t * vm, int i, count; int is_add = 1; int rv = 0; + clib_error_t *error = 0; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -1721,19 +1722,27 @@ add_address_command_fn (vlib_main_t * vm, else if (unformat (line_input, "del")) is_add = 0; else - return clib_error_return (0, "unknown input '%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); if (sm->static_mapping_only) - return clib_error_return (0, "static mapping only mode"); + { + error = clib_error_return (0, "static mapping only mode"); + goto done; + } start_host_order = clib_host_to_net_u32 (start_addr.as_u32); end_host_order = clib_host_to_net_u32 (end_addr.as_u32); if (end_host_order < start_host_order) - return clib_error_return (0, "end address less than start address"); + { + error = clib_error_return (0, "end address less than start address"); + goto done; + } count = (end_host_order - start_host_order) + 1; @@ -1755,11 +1764,11 @@ add_address_command_fn (vlib_main_t * vm, switch (rv) { case VNET_API_ERROR_NO_SUCH_ENTRY: - return clib_error_return (0, "S-NAT address not exist."); - break; + error = clib_error_return (0, "S-NAT address not exist."); + goto done; case VNET_API_ERROR_UNSPECIFIED: - return clib_error_return (0, "S-NAT address used in static mapping."); - break; + error = clib_error_return (0, "S-NAT address used in static mapping."); + goto done; default: break; } @@ -1767,7 +1776,10 @@ add_address_command_fn (vlib_main_t * vm, increment_v4_address (&this_addr); } - return 0; +done: + unformat_free (line_input); + + return error; } VLIB_CLI_COMMAND (add_address_command, static) = { @@ -1807,10 +1819,12 @@ snat_feature_command_fn (vlib_main_t * vm, else if (unformat (line_input, "del")) is_del = 1; else - return clib_error_return (0, "unknown input '%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); if (vec_len (inside_sw_if_indices)) { @@ -1830,6 +1844,8 @@ snat_feature_command_fn (vlib_main_t * vm, } } +done: + unformat_free (line_input); vec_free (inside_sw_if_indices); vec_free (outside_sw_if_indices); @@ -1923,13 +1939,18 @@ add_static_mapping_command_fn (vlib_main_t * vm, else if (unformat (line_input, "del")) is_add = 0; else - return clib_error_return (0, "unknown input: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "unknown input: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); if (!addr_only && !proto_set) - return clib_error_return (0, "missing protocol"); + { + error = clib_error_return (0, "missing protocol"); + goto done; + } rv = snat_add_static_mapping(l_addr, e_addr, (u16) l_port, (u16) e_port, vrf_id, addr_only, sw_if_index, proto, is_add); @@ -1937,22 +1958,27 @@ add_static_mapping_command_fn (vlib_main_t * vm, switch (rv) { case VNET_API_ERROR_INVALID_VALUE: - return clib_error_return (0, "External port already in use."); - break; + error = clib_error_return (0, "External port already in use."); + goto done; case VNET_API_ERROR_NO_SUCH_ENTRY: if (is_add) - return clib_error_return (0, "External addres must be allocated."); + error = clib_error_return (0, "External addres must be allocated."); else - return clib_error_return (0, "Mapping not exist."); - break; + error = clib_error_return (0, "Mapping not exist."); + goto done; case VNET_API_ERROR_NO_SUCH_FIB: - return clib_error_return (0, "No such VRF id."); + error = clib_error_return (0, "No such VRF id."); + goto done; case VNET_API_ERROR_VALUE_EXIST: - return clib_error_return (0, "Mapping already exist."); + error = clib_error_return (0, "Mapping already exist."); + goto done; default: break; } +done: + unformat_free (line_input); + return error; } @@ -1985,6 +2011,7 @@ set_workers_command_fn (vlib_main_t * vm, unformat_input_t _line_input, *line_input = &_line_input; uword *bitmap = 0; int rv = 0; + clib_error_t *error = 0; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -1995,13 +2022,18 @@ set_workers_command_fn (vlib_main_t * vm, if (unformat (line_input, "%U", unformat_bitmap_list, &bitmap)) ; else - return clib_error_return (0, "unknown input '%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); if (bitmap == 0) - return clib_error_return (0, "List of workers must be specified."); + { + error = clib_error_return (0, "List of workers must be specified."); + goto done; + } rv = snat_set_workers(bitmap); @@ -2010,17 +2042,20 @@ set_workers_command_fn (vlib_main_t * vm, switch (rv) { case VNET_API_ERROR_INVALID_WORKER: - return clib_error_return (0, "Invalid worker(s)."); - break; + error = clib_error_return (0, "Invalid worker(s)."); + goto done; case VNET_API_ERROR_FEATURE_DISABLED: - return clib_error_return (0, + error = clib_error_return (0, "Supported only if 2 or more workes available."); - break; + goto done; default: break; } - return 0; +done: + unformat_free (line_input); + + return error; } /*? @@ -2047,6 +2082,7 @@ snat_ipfix_logging_enable_disable_command_fn (vlib_main_t * vm, u32 src_port = 0; u8 enable = 1; int rv = 0; + clib_error_t *error = 0; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -2061,17 +2097,25 @@ snat_ipfix_logging_enable_disable_command_fn (vlib_main_t * vm, else if (unformat (line_input, "disable")) enable = 0; else - return clib_error_return (0, "unknown input '%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); rv = snat_ipfix_logging_enable_disable (enable, domain_id, (u16) src_port); if (rv) - return clib_error_return (0, "ipfix logging enable failed"); + { + error = clib_error_return (0, "ipfix logging enable failed"); + goto done; + } - return 0; +done: + unformat_free (line_input); + + return error; } /*? @@ -2604,6 +2648,7 @@ snat_add_interface_address_command_fn (vlib_main_t * vm, u32 sw_if_index; int rv; int is_del = 0; + clib_error_t *error = 0; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -2617,8 +2662,11 @@ snat_add_interface_address_command_fn (vlib_main_t * vm, else if (unformat (line_input, "del")) is_del = 1; else - return clib_error_return (0, "unknown input '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } } rv = snat_add_interface_address (sm, sw_if_index, is_del); @@ -2629,10 +2677,15 @@ snat_add_interface_address_command_fn (vlib_main_t * vm, break; default: - return clib_error_return (0, "snat_add_interface_address returned %d", - rv); + error = clib_error_return (0, "snat_add_interface_address returned %d", + rv); + goto done; } - return 0; + +done: + unformat_free (line_input); + + return error; } VLIB_CLI_COMMAND (snat_add_interface_address_command, static) = { diff --git a/src/vlib/threads_cli.c b/src/vlib/threads_cli.c index 54cc1aed..36f8109e 100644 --- a/src/vlib/threads_cli.c +++ b/src/vlib/threads_cli.c @@ -163,21 +163,31 @@ trace_frame_queue (vlib_main_t * vm, unformat_input_t * input, else if (unformat (line_input, "index %u", &index)) ; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (enable > 1) - return clib_error_return (0, "expecting on or off"); + { + error = clib_error_return (0, "expecting on or off"); + goto done; + } if (vec_len (tm->frame_queue_mains) == 0) - return clib_error_return (0, "no worker handoffs exist"); + { + error = clib_error_return (0, "no worker handoffs exist"); + goto done; + } if (index > vec_len (tm->frame_queue_mains) - 1) - return clib_error_return (0, - "expecting valid worker handoff queue index"); + { + error = clib_error_return (0, + "expecting valid worker handoff queue index"); + goto done; + } fqm = vec_elt_at_index (tm->frame_queue_mains, index); @@ -185,7 +195,7 @@ trace_frame_queue (vlib_main_t * vm, unformat_input_t * input, if (num_fq == 0) { vlib_cli_output (vm, "No frame queues exist\n"); - return error; + goto done; } // Allocate storage for trace if necessary @@ -204,6 +214,10 @@ trace_frame_queue (vlib_main_t * vm, unformat_input_t * input, memset (fqh, 0, sizeof (*fqh)); fqm->vlib_frame_queues[fqix]->trace = enable; } + +done: + unformat_free (line_input); + return error; } @@ -432,28 +446,33 @@ test_frame_queue_nelts (vlib_main_t * vm, unformat_input_t * input, else if (unformat (line_input, "index %u", &index)) ; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (index > vec_len (tm->frame_queue_mains) - 1) - return clib_error_return (0, - "expecting valid worker handoff queue index"); + { + error = clib_error_return (0, + "expecting valid worker handoff queue index"); + goto done; + } fqm = vec_elt_at_index (tm->frame_queue_mains, index); if ((nelts != 4) && (nelts != 8) && (nelts != 16) && (nelts != 32)) { - return clib_error_return (0, "expecting 4,8,16,32"); + error = clib_error_return (0, "expecting 4,8,16,32"); + goto done; } num_fq = vec_len (fqm->vlib_frame_queues); if (num_fq == 0) { vlib_cli_output (vm, "No frame queues exist\n"); - return error; + goto done; } for (fqix = 0; fqix < num_fq; fqix++) @@ -461,6 +480,9 @@ test_frame_queue_nelts (vlib_main_t * vm, unformat_input_t * input, fqm->vlib_frame_queues[fqix]->nelts = nelts; } +done: + unformat_free (line_input); + return error; } @@ -499,15 +521,19 @@ test_frame_queue_threshold (vlib_main_t * vm, unformat_input_t * input, else if (unformat (line_input, "index %u", &index)) ; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (index > vec_len (tm->frame_queue_mains) - 1) - return clib_error_return (0, - "expecting valid worker handoff queue index"); + { + error = clib_error_return (0, + "expecting valid worker handoff queue index"); + goto done; + } fqm = vec_elt_at_index (tm->frame_queue_mains, index); @@ -515,7 +541,7 @@ test_frame_queue_threshold (vlib_main_t * vm, unformat_input_t * input, if (threshold == ~(u32) 0) { vlib_cli_output (vm, "expecting threshold value\n"); - return error; + goto done; } if (threshold == 0) @@ -525,7 +551,7 @@ test_frame_queue_threshold (vlib_main_t * vm, unformat_input_t * input, if (num_fq == 0) { vlib_cli_output (vm, "No frame queues exist\n"); - return error; + goto done; } for (fqix = 0; fqix < num_fq; fqix++) @@ -533,6 +559,9 @@ test_frame_queue_threshold (vlib_main_t * vm, unformat_input_t * input, fqm->vlib_frame_queues[fqix]->vector_threshold = threshold; } +done: + unformat_free (line_input); + return error; } diff --git a/src/vlib/trace.c b/src/vlib/trace.c index dcdb837f..6d487ae1 100644 --- a/src/vlib/trace.c +++ b/src/vlib/trace.c @@ -372,6 +372,7 @@ cli_add_trace_buffer (vlib_main_t * vm, vlib_trace_node_t *tn; u32 node_index, add; u8 verbose = 0; + clib_error_t *error = 0; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -384,8 +385,11 @@ cli_add_trace_buffer (vlib_main_t * vm, else if (unformat (line_input, "verbose")) verbose = 1; else - return clib_error_create ("expected NODE COUNT, got `%U'", - format_unformat_error, line_input); + { + error = clib_error_create ("expected NODE COUNT, got `%U'", + format_unformat_error, line_input); + goto done; + } } /* *INDENT-OFF* */ @@ -403,7 +407,10 @@ cli_add_trace_buffer (vlib_main_t * vm, })); /* *INDENT-ON* */ - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c index 69fca6ec..88e2453c 100644 --- a/src/vlib/unix/cli.c +++ b/src/vlib/unix/cli.c @@ -2835,6 +2835,7 @@ unix_cli_set_terminal_pager (vlib_main_t * vm, unix_cli_main_t *cm = &unix_cli_main; unix_cli_file_t *cf; unformat_input_t _line_input, *line_input = &_line_input; + clib_error_t *error = 0; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -2852,13 +2853,17 @@ unix_cli_set_terminal_pager (vlib_main_t * vm, "Pager limit set to %u lines; note, this is global.\n", um->cli_pager_buffer_limit); else - return clib_error_return (0, "unknown parameter: `%U`", - format_unformat_error, line_input); + { + error = clib_error_return (0, "unknown parameter: `%U`", + format_unformat_error, line_input); + goto done; + } } +done: unformat_free (line_input); - return 0; + return error; } /*? @@ -2886,6 +2891,7 @@ unix_cli_set_terminal_history (vlib_main_t * vm, unix_cli_file_t *cf; unformat_input_t _line_input, *line_input = &_line_input; u32 limit; + clib_error_t *error = 0; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -2901,8 +2907,11 @@ unix_cli_set_terminal_history (vlib_main_t * vm, else if (unformat (line_input, "limit %u", &cf->history_limit)) ; else - return clib_error_return (0, "unknown parameter: `%U`", - format_unformat_error, line_input); + { + error = clib_error_return (0, "unknown parameter: `%U`", + format_unformat_error, line_input); + goto done; + } /* If we reduced history size, or turned it off, purge the history */ limit = cf->has_history ? cf->history_limit : 0; @@ -2914,9 +2923,10 @@ unix_cli_set_terminal_history (vlib_main_t * vm, } } +done: unformat_free (line_input); - return 0; + return error; } /*? diff --git a/src/vnet/devices/af_packet/cli.c b/src/vnet/devices/af_packet/cli.c index 6baa26e1..d4aa7016 100644 --- a/src/vnet/devices/af_packet/cli.c +++ b/src/vnet/devices/af_packet/cli.c @@ -49,6 +49,7 @@ af_packet_create_command_fn (vlib_main_t * vm, unformat_input_t * input, u8 *hw_addr_ptr = 0; u32 sw_if_index; int r; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -63,29 +64,47 @@ af_packet_create_command_fn (vlib_main_t * vm, unformat_input_t * input, (line_input, "hw-addr %U", unformat_ethernet_address, hwaddr)) hw_addr_ptr = hwaddr; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); if (host_if_name == NULL) - return clib_error_return (0, "missing host interface name"); + { + error = clib_error_return (0, "missing host interface name"); + goto done; + } r = af_packet_create_if (vm, host_if_name, hw_addr_ptr, &sw_if_index); - vec_free (host_if_name); if (r == VNET_API_ERROR_SYSCALL_ERROR_1) - return clib_error_return (0, "%s (errno %d)", strerror (errno), errno); + { + error = clib_error_return (0, "%s (errno %d)", strerror (errno), errno); + goto done; + } if (r == VNET_API_ERROR_INVALID_INTERFACE) - return clib_error_return (0, "Invalid interface name"); + { + error = clib_error_return (0, "Invalid interface name"); + goto done; + } if (r == VNET_API_ERROR_SUBIF_ALREADY_EXISTS) - return clib_error_return (0, "Interface elready exists"); + { + error = clib_error_return (0, "Interface elready exists"); + goto done; + } vlib_cli_output (vm, "%U\n", format_vnet_sw_if_index_name, vnet_get_main (), sw_if_index); - return 0; + +done: + vec_free (host_if_name); + unformat_free (line_input); + + return error; } /*? @@ -124,6 +143,7 @@ af_packet_delete_command_fn (vlib_main_t * vm, unformat_input_t * input, { unformat_input_t _line_input, *line_input = &_line_input; u8 *host_if_name = NULL; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -134,18 +154,26 @@ af_packet_delete_command_fn (vlib_main_t * vm, unformat_input_t * input, if (unformat (line_input, "name %s", &host_if_name)) ; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); if (host_if_name == NULL) - return clib_error_return (0, "missing host interface name"); + { + error = clib_error_return (0, "missing host interface name"); + goto done; + } af_packet_delete_if (vm, host_if_name); + +done: vec_free (host_if_name); + unformat_free (line_input); - return 0; + return error; } /*? diff --git a/src/vnet/devices/dpdk/cli.c b/src/vnet/devices/dpdk/cli.c index d133cfd9..1fc665ac 100644 --- a/src/vnet/devices/dpdk/cli.c +++ b/src/vnet/devices/dpdk/cli.c @@ -398,7 +398,7 @@ set_dpdk_if_desc (vlib_main_t * vm, unformat_input_t * input, u32 hw_if_index = (u32) ~ 0; u32 nb_rx_desc = (u32) ~ 0; u32 nb_tx_desc = (u32) ~ 0; - clib_error_t *rv; + clib_error_t *error = NULL; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -414,25 +414,37 @@ set_dpdk_if_desc (vlib_main_t * vm, unformat_input_t * input, else if (unformat (line_input, "rx %d", &nb_rx_desc)) ; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (hw_if_index == (u32) ~ 0) - return clib_error_return (0, "please specify valid interface name"); + { + error = clib_error_return (0, "please specify valid interface name"); + goto done; + } hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); xd = vec_elt_at_index (dm->devices, hw->dev_instance); if ((xd->flags & DPDK_DEVICE_FLAG_PMD) == 0) - return clib_error_return (0, "number of descriptors can be set only for " - "physical devices"); + { + error = + clib_error_return (0, + "number of descriptors can be set only for " + "physical devices"); + goto done; + } if ((nb_rx_desc == (u32) ~ 0 || nb_rx_desc == xd->nb_rx_desc) && (nb_tx_desc == (u32) ~ 0 || nb_tx_desc == xd->nb_tx_desc)) - return clib_error_return (0, "nothing changed"); + { + error = clib_error_return (0, "nothing changed"); + goto done; + } if (nb_rx_desc != (u32) ~ 0) xd->nb_rx_desc = nb_rx_desc; @@ -440,9 +452,12 @@ set_dpdk_if_desc (vlib_main_t * vm, unformat_input_t * input, if (nb_tx_desc != (u32) ~ 0) xd->nb_tx_desc = nb_tx_desc; - rv = dpdk_port_setup (dm, xd); + error = dpdk_port_setup (dm, xd); + +done: + unformat_free (line_input); - return rv; + return error; } /* *INDENT-OFF* */ @@ -523,6 +538,7 @@ set_dpdk_if_placement (vlib_main_t * vm, unformat_input_t * input, u32 queue = (u32) 0; u32 cpu = (u32) ~ 0; int i; + clib_error_t *error = NULL; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -538,18 +554,25 @@ set_dpdk_if_placement (vlib_main_t * vm, unformat_input_t * input, else if (unformat (line_input, "thread %d", &cpu)) ; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (hw_if_index == (u32) ~ 0) - return clib_error_return (0, "please specify valid interface name"); + { + error = clib_error_return (0, "please specify valid interface name"); + goto done; + } if (cpu < dm->input_cpu_first_index || cpu >= (dm->input_cpu_first_index + dm->input_cpu_count)) - return clib_error_return (0, "please specify valid thread id"); + { + error = clib_error_return (0, "please specify valid thread id"); + goto done; + } hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); xd = vec_elt_at_index (dm->devices, hw->dev_instance); @@ -563,7 +586,7 @@ set_dpdk_if_placement (vlib_main_t * vm, unformat_input_t * input, queue == dq->queue_id) { if (cpu == i) /* nothing to do */ - return 0; + goto done; vec_del1(dm->devices_by_cpu[i], dq - dm->devices_by_cpu[i]); vec_add2(dm->devices_by_cpu[cpu], dq, 1); @@ -586,13 +609,18 @@ set_dpdk_if_placement (vlib_main_t * vm, unformat_input_t * input, vlib_node_set_state (vlib_mains[cpu], dpdk_input_node.index, VLIB_NODE_STATE_POLLING); - return 0; + goto done; } } /* *INDENT-ON* */ } - return clib_error_return (0, "not found"); + error = clib_error_return (0, "not found"); + +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -653,6 +681,7 @@ set_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input, u32 hw_if_index = (u32) ~ 0; u32 cpu = (u32) ~ 0; int i; + clib_error_t *error = NULL; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -666,18 +695,22 @@ set_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input, else if (unformat (line_input, "thread %d", &cpu)) ; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (hw_if_index == (u32) ~ 0) return clib_error_return (0, "please specify valid interface name"); if (cpu < dm->hqos_cpu_first_index || cpu >= (dm->hqos_cpu_first_index + dm->hqos_cpu_count)) - return clib_error_return (0, "please specify valid thread id"); + { + error = clib_error_return (0, "please specify valid thread id"); + goto done; + } hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); xd = vec_elt_at_index (dm->devices, hw->dev_instance); @@ -689,7 +722,7 @@ set_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input, if (hw_if_index == dm->devices[dq->device].vlib_hw_if_index) { if (cpu == i) /* nothing to do */ - return 0; + goto done; vec_del1 (dm->devices_by_hqos_cpu[i], dq - dm->devices_by_hqos_cpu[i]); @@ -703,12 +736,17 @@ set_dpdk_if_hqos_placement (vlib_main_t * vm, unformat_input_t * input, vec_sort_with_function (dm->devices_by_hqos_cpu[cpu], dpdk_device_queue_sort); - return 0; + goto done; } } } - return clib_error_return (0, "not found"); + error = clib_error_return (0, "not found"); + +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -732,6 +770,7 @@ set_dpdk_if_hqos_pipe (vlib_main_t * vm, unformat_input_t * input, u32 pipe_id = (u32) ~ 0; u32 profile_id = (u32) ~ 0; int rv; + clib_error_t *error = NULL; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -749,14 +788,18 @@ set_dpdk_if_hqos_pipe (vlib_main_t * vm, unformat_input_t * input, else if (unformat (line_input, "profile %d", &profile_id)) ; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (hw_if_index == (u32) ~ 0) - return clib_error_return (0, "please specify valid interface name"); + { + error = clib_error_return (0, "please specify valid interface name"); + goto done; + } hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); xd = vec_elt_at_index (dm->devices, hw->dev_instance); @@ -765,9 +808,15 @@ set_dpdk_if_hqos_pipe (vlib_main_t * vm, unformat_input_t * input, rte_sched_pipe_config (xd->hqos_ht->hqos, subport_id, pipe_id, profile_id); if (rv) - return clib_error_return (0, "pipe configuration failed"); + { + error = clib_error_return (0, "pipe configuration failed"); + goto done; + } - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -797,6 +846,7 @@ set_dpdk_if_hqos_subport (vlib_main_t * vm, unformat_input_t * input, .tc_period = 10, }; int rv; + clib_error_t *error = NULL; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -829,23 +879,33 @@ set_dpdk_if_hqos_subport (vlib_main_t * vm, unformat_input_t * input, else if (unformat (line_input, "period %d", &p.tc_period)) ; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (hw_if_index == (u32) ~ 0) - return clib_error_return (0, "please specify valid interface name"); + { + error = clib_error_return (0, "please specify valid interface name"); + goto done; + } hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); xd = vec_elt_at_index (dm->devices, hw->dev_instance); rv = rte_sched_subport_config (xd->hqos_ht->hqos, subport_id, &p); if (rv) - return clib_error_return (0, "subport configuration failed"); + { + error = clib_error_return (0, "subport configuration failed"); + goto done; + } - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -872,6 +932,7 @@ set_dpdk_if_hqos_tctbl (vlib_main_t * vm, unformat_input_t * input, u32 queue = (u32) ~ 0; u32 entry = (u32) ~ 0; u32 val, i; + clib_error_t *error = NULL; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -889,20 +950,33 @@ set_dpdk_if_hqos_tctbl (vlib_main_t * vm, unformat_input_t * input, else if (unformat (line_input, "queue %d", &queue)) ; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (hw_if_index == (u32) ~ 0) - return clib_error_return (0, "please specify valid interface name"); + { + error = clib_error_return (0, "please specify valid interface name"); + goto done; + } if (entry >= 64) - return clib_error_return (0, "invalid entry"); + { + error = clib_error_return (0, "invalid entry"); + goto done; + } if (tc >= RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE) - return clib_error_return (0, "invalid traffic class"); + { + error = clib_error_return (0, "invalid traffic class"); + goto done; + } if (queue >= RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS) - return clib_error_return (0, "invalid traffic class"); + { + error = clib_error_return (0, "invalid traffic class"); + goto done; + } hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); xd = vec_elt_at_index (dm->devices, hw->dev_instance); @@ -911,7 +985,10 @@ set_dpdk_if_hqos_tctbl (vlib_main_t * vm, unformat_input_t * input, uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers"); /* Should never happen, shut up Coverity warning */ if (p == 0) - return clib_error_return (0, "no worker registrations?"); + { + error = clib_error_return (0, "no worker registrations?"); + goto done; + } vlib_thread_registration_t *tr = (vlib_thread_registration_t *) p[0]; int worker_thread_first = tr->first_index; @@ -921,7 +998,10 @@ set_dpdk_if_hqos_tctbl (vlib_main_t * vm, unformat_input_t * input, for (i = 0; i < worker_thread_count; i++) xd->hqos_wt[worker_thread_first + i].hqos_tc_table[entry] = val; - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -939,6 +1019,7 @@ set_dpdk_if_hqos_pktfield (vlib_main_t * vm, unformat_input_t * input, unformat_input_t _line_input, *line_input = &_line_input; vlib_thread_main_t *tm = vlib_get_thread_main (); dpdk_main_t *dm = &dpdk_main; + clib_error_t *error = NULL; /* Device specific data */ struct rte_eth_dev_info dev_info; @@ -984,15 +1065,19 @@ set_dpdk_if_hqos_pktfield (vlib_main_t * vm, unformat_input_t * input, else if (unformat (line_input, "mask %llx", &mask)) ; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - /* Get interface */ if (hw_if_index == (u32) ~ 0) - return clib_error_return (0, "please specify valid interface name"); + { + error = clib_error_return (0, "please specify valid interface name"); + goto done; + } hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); xd = vec_elt_at_index (dm->devices, hw->dev_instance); @@ -1019,7 +1104,7 @@ set_dpdk_if_hqos_pktfield (vlib_main_t * vm, unformat_input_t * input, if (devconf->hqos_enabled == 0) { vlib_cli_output (vm, "HQoS disabled for this interface"); - return 0; + goto done; } n_subports_per_port = devconf->hqos.port.n_subports_per_port; @@ -1028,27 +1113,39 @@ set_dpdk_if_hqos_pktfield (vlib_main_t * vm, unformat_input_t * input, /* Validate packet field configuration: id, offset and mask */ if (id >= 3) - return clib_error_return (0, "invalid packet field id"); + { + error = clib_error_return (0, "invalid packet field id"); + goto done; + } switch (id) { case 0: if (dpdk_hqos_validate_mask (mask, n_subports_per_port) != 0) - return clib_error_return (0, "invalid subport ID mask " - "(n_subports_per_port = %u)", - n_subports_per_port); + { + error = clib_error_return (0, "invalid subport ID mask " + "(n_subports_per_port = %u)", + n_subports_per_port); + goto done; + } break; case 1: if (dpdk_hqos_validate_mask (mask, n_pipes_per_subport) != 0) - return clib_error_return (0, "invalid pipe ID mask " - "(n_pipes_per_subport = %u)", - n_pipes_per_subport); + { + error = clib_error_return (0, "invalid pipe ID mask " + "(n_pipes_per_subport = %u)", + n_pipes_per_subport); + goto done; + } break; case 2: default: if (dpdk_hqos_validate_mask (mask, tctbl_size) != 0) - return clib_error_return (0, "invalid TC table index mask " - "(TC table size = %u)", tctbl_size); + { + error = clib_error_return (0, "invalid TC table index mask " + "(TC table size = %u)", tctbl_size); + goto done; + } } /* Propagate packet field configuration to all workers */ @@ -1075,7 +1172,10 @@ set_dpdk_if_hqos_pktfield (vlib_main_t * vm, unformat_input_t * input, __builtin_ctzll (mask); } - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -1106,6 +1206,7 @@ show_dpdk_if_hqos (vlib_main_t * vm, unformat_input_t * input, dpdk_device_config_t *devconf = 0; vlib_thread_registration_t *tr; uword *p = 0; + clib_error_t *error = NULL; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -1117,14 +1218,18 @@ show_dpdk_if_hqos (vlib_main_t * vm, unformat_input_t * input, &hw_if_index)) ; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (hw_if_index == (u32) ~ 0) - return clib_error_return (0, "please specify interface name!!"); + { + error = clib_error_return (0, "please specify interface name!!"); + goto done; + } hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); xd = vec_elt_at_index (dm->devices, hw->dev_instance); @@ -1151,7 +1256,7 @@ show_dpdk_if_hqos (vlib_main_t * vm, unformat_input_t * input, if (devconf->hqos_enabled == 0) { vlib_cli_output (vm, "HQoS disabled for this interface"); - return 0; + goto done; } /* Detect the set of worker threads */ @@ -1159,7 +1264,10 @@ show_dpdk_if_hqos (vlib_main_t * vm, unformat_input_t * input, /* Should never happen, shut up Coverity warning */ if (p == 0) - return clib_error_return (0, "no worker registrations?"); + { + error = clib_error_return (0, "no worker registrations?"); + goto done; + } tr = (vlib_thread_registration_t *) p[0]; @@ -1284,7 +1392,10 @@ show_dpdk_if_hqos (vlib_main_t * vm, unformat_input_t * input, } #endif - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -1315,6 +1426,7 @@ show_dpdk_hqos_queue_stats (vlib_main_t * vm, unformat_input_t * input, u32 qindex; struct rte_sched_queue_stats stats; u16 qlen; + clib_error_t *error = NULL; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -1339,14 +1451,18 @@ show_dpdk_hqos_queue_stats (vlib_main_t * vm, unformat_input_t * input, ; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (hw_if_index == (u32) ~ 0) - return clib_error_return (0, "please specify interface name!!"); + { + error = clib_error_return (0, "please specify interface name!!"); + goto done; + } hw = vnet_get_hw_interface (dm->vnet_main, hw_if_index); xd = vec_elt_at_index (dm->devices, hw->dev_instance); @@ -1373,7 +1489,7 @@ show_dpdk_hqos_queue_stats (vlib_main_t * vm, unformat_input_t * input, if (devconf->hqos_enabled == 0) { vlib_cli_output (vm, "HQoS disabled for this interface"); - return 0; + goto done; } /* @@ -1386,7 +1502,10 @@ show_dpdk_hqos_queue_stats (vlib_main_t * vm, unformat_input_t * input, if (rte_sched_queue_read_stats (xd->hqos_ht->hqos, qindex, &stats, &qlen) != 0) - return clib_error_return (0, "failed to read stats"); + { + error = clib_error_return (0, "failed to read stats"); + goto done; + } vlib_cli_output (vm, "%=24s%=16s", "Stats Parameter", "Value"); vlib_cli_output (vm, "%=24s%=16d", "Packets", stats.n_pkts); @@ -1399,7 +1518,10 @@ show_dpdk_hqos_queue_stats (vlib_main_t * vm, unformat_input_t * input, vlib_cli_output (vm, "%=24s%=16d", "Bytes dropped", stats.n_bytes_dropped); - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ diff --git a/src/vnet/devices/dpdk/ipsec/cli.c b/src/vnet/devices/dpdk/ipsec/cli.c index 93df4a64..f9d3a5d0 100644 --- a/src/vnet/devices/dpdk/ipsec/cli.c +++ b/src/vnet/devices/dpdk/ipsec/cli.c @@ -111,6 +111,7 @@ lcore_cryptodev_map_fn (vlib_main_t * vm, unformat_input_t * input, { unformat_input_t _line_input, *line_input = &_line_input; u16 detail = 0; + clib_error_t *error = NULL; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -120,15 +121,19 @@ lcore_cryptodev_map_fn (vlib_main_t * vm, unformat_input_t * input, if (unformat (line_input, "verbose")) detail = 1; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - dpdk_ipsec_show_mapping (vm, detail); - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ diff --git a/src/vnet/devices/netmap/cli.c b/src/vnet/devices/netmap/cli.c index 6157f27c..71363294 100644 --- a/src/vnet/devices/netmap/cli.c +++ b/src/vnet/devices/netmap/cli.c @@ -37,6 +37,7 @@ netmap_create_command_fn (vlib_main_t * vm, unformat_input_t * input, u8 is_pipe = 0; u8 is_master = 0; u32 sw_if_index = ~0; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -57,30 +58,48 @@ netmap_create_command_fn (vlib_main_t * vm, unformat_input_t * input, else if (unformat (line_input, "slave")) is_master = 0; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); if (host_if_name == NULL) - return clib_error_return (0, "missing host interface name"); + { + error = clib_error_return (0, "missing host interface name"); + goto done; + } r = netmap_create_if (vm, host_if_name, hw_addr_ptr, is_pipe, is_master, &sw_if_index); if (r == VNET_API_ERROR_SYSCALL_ERROR_1) - return clib_error_return (0, "%s (errno %d)", strerror (errno), errno); + { + error = clib_error_return (0, "%s (errno %d)", strerror (errno), errno); + goto done; + } if (r == VNET_API_ERROR_INVALID_INTERFACE) - return clib_error_return (0, "Invalid interface name"); + { + error = clib_error_return (0, "Invalid interface name"); + goto done; + } if (r == VNET_API_ERROR_SUBIF_ALREADY_EXISTS) - return clib_error_return (0, "Interface already exists"); + { + error = clib_error_return (0, "Interface already exists"); + goto done; + } vlib_cli_output (vm, "%U\n", format_vnet_sw_if_index_name, vnet_get_main (), sw_if_index); - return 0; + +done: + unformat_free (line_input); + + return error; } /*? @@ -144,6 +163,7 @@ netmap_delete_command_fn (vlib_main_t * vm, unformat_input_t * input, { unformat_input_t _line_input, *line_input = &_line_input; u8 *host_if_name = NULL; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -154,17 +174,25 @@ netmap_delete_command_fn (vlib_main_t * vm, unformat_input_t * input, if (unformat (line_input, "name %s", &host_if_name)) ; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); if (host_if_name == NULL) - return clib_error_return (0, "missing host interface name"); + { + error = clib_error_return (0, "missing host interface name"); + goto done; + } netmap_delete_if (vm, host_if_name); - return 0; +done: + unformat_free (line_input); + + return error; } /*? diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c index 315daa77..c43f6e67 100644 --- a/src/vnet/devices/virtio/vhost-user.c +++ b/src/vnet/devices/virtio/vhost-user.c @@ -2682,6 +2682,7 @@ vhost_user_connect_command_fn (vlib_main_t * vm, u32 custom_dev_instance = ~0; u8 hwaddr[6]; u8 *hw = NULL; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -2704,10 +2705,12 @@ vhost_user_connect_command_fn (vlib_main_t * vm, renumber = 1; } else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); vnet_main_t *vnm = vnet_get_main (); @@ -2716,14 +2719,18 @@ vhost_user_connect_command_fn (vlib_main_t * vm, is_server, &sw_if_index, feature_mask, renumber, custom_dev_instance, hw))) { - vec_free (sock_filename); - return clib_error_return (0, "vhost_user_create_if returned %d", rv); + error = clib_error_return (0, "vhost_user_create_if returned %d", rv); + goto done; } - vec_free (sock_filename); vlib_cli_output (vm, "%U\n", format_vnet_sw_if_index_name, vnet_get_main (), sw_if_index); - return 0; + +done: + vec_free (sock_filename); + unformat_free (line_input); + + return error; } clib_error_t * @@ -2734,6 +2741,7 @@ vhost_user_delete_command_fn (vlib_main_t * vm, unformat_input_t _line_input, *line_input = &_line_input; u32 sw_if_index = ~0; vnet_main_t *vnm = vnet_get_main (); + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -2751,15 +2759,25 @@ vhost_user_delete_command_fn (vlib_main_t * vm, vnet_get_sup_hw_interface (vnm, sw_if_index); if (hwif == NULL || vhost_user_dev_class.index != hwif->dev_class_index) - return clib_error_return (0, "Not a vhost interface"); + { + error = clib_error_return (0, "Not a vhost interface"); + goto done; + } } else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); + vhost_user_delete_if (vnm, vm, sw_if_index); - return 0; + +done: + unformat_free (line_input); + + return error; } int @@ -3286,6 +3304,7 @@ vhost_thread_command_fn (vlib_main_t * vm, u32 sw_if_index; u8 del = 0; int rv; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -3295,9 +3314,9 @@ vhost_thread_command_fn (vlib_main_t * vm, (line_input, "%U %d", unformat_vnet_sw_interface, vnet_get_main (), &sw_if_index, &worker_thread_index)) { - unformat_free (line_input); - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; } if (unformat (line_input, "del")) @@ -3305,9 +3324,16 @@ vhost_thread_command_fn (vlib_main_t * vm, if ((rv = vhost_user_thread_placement (sw_if_index, worker_thread_index, del))) - return clib_error_return (0, "vhost_user_thread_placement returned %d", - rv); - return 0; + { + error = clib_error_return (0, "vhost_user_thread_placement returned %d", + rv); + goto done; + } + +done: + unformat_free (line_input); + + return error; } diff --git a/src/vnet/gre/interface.c b/src/vnet/gre/interface.c index d624587d..d4476ac4 100644 --- a/src/vnet/gre/interface.c +++ b/src/vnet/gre/interface.c @@ -491,6 +491,7 @@ create_gre_tunnel_command_fn (vlib_main_t * vm, u32 num_m_args = 0; u8 is_add = 1; u32 sw_if_index; + clib_error_t *error = NULL; /* Get a line of input. */ if (! unformat_user (input, unformat_line_input, line_input)) @@ -508,16 +509,24 @@ create_gre_tunnel_command_fn (vlib_main_t * vm, else if (unformat (line_input, "teb")) teb = 1; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); if (num_m_args < 2) - return clib_error_return (0, "mandatory argument(s) missing"); + { + error = clib_error_return (0, "mandatory argument(s) missing"); + goto done; + } if (memcmp (&src, &dst, sizeof(src)) == 0) - return clib_error_return (0, "src and dst are identical"); + { + error = clib_error_return (0, "src and dst are identical"); + goto done; + } memset (a, 0, sizeof (*a)); a->outer_fib_id = outer_fib_id; @@ -536,15 +545,21 @@ create_gre_tunnel_command_fn (vlib_main_t * vm, vlib_cli_output(vm, "%U\n", format_vnet_sw_if_index_name, vnet_get_main(), sw_if_index); break; case VNET_API_ERROR_INVALID_VALUE: - return clib_error_return (0, "GRE tunnel already exists..."); + error = clib_error_return (0, "GRE tunnel already exists..."); + goto done; case VNET_API_ERROR_NO_SUCH_FIB: - return clib_error_return (0, "outer fib ID %d doesn't exist\n", - outer_fib_id); + error = clib_error_return (0, "outer fib ID %d doesn't exist\n", + outer_fib_id); + goto done; default: - return clib_error_return (0, "vnet_gre_add_del_tunnel returned %d", rv); + error = clib_error_return (0, "vnet_gre_add_del_tunnel returned %d", rv); + goto done; } - return 0; +done: + unformat_free (line_input); + + return error; } VLIB_CLI_COMMAND (create_gre_tunnel_command, static) = { diff --git a/src/vnet/ip/ip4_source_check.c b/src/vnet/ip/ip4_source_check.c index d461cc88..3af32f2e 100644 --- a/src/vnet/ip/ip4_source_check.c +++ b/src/vnet/ip/ip4_source_check.c @@ -399,6 +399,8 @@ set_ip_source_check (vlib_main_t * vm, vnet_feature_enable_disable ("ip4-unicast", feature_name, sw_if_index, is_del == 0, &config, sizeof (config)); done: + unformat_free (line_input); + return error; } @@ -531,7 +533,9 @@ ip_source_check_accept (vlib_main_t * vm, } done: - return (error); + unformat_free (line_input); + + return error; } /*? diff --git a/src/vnet/ip/ip4_test.c b/src/vnet/ip/ip4_test.c index 45d17113..73dabfdc 100644 --- a/src/vnet/ip/ip4_test.c +++ b/src/vnet/ip/ip4_test.c @@ -143,8 +143,11 @@ thrash (vlib_main_t * vm, else if (unformat (line_input, "verbose")) verbose = 1; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } } @@ -178,7 +181,7 @@ thrash (vlib_main_t * vm, if (p == 0) { vlib_cli_output (vm, "Couldn't map fib id %d to fib index\n", table_id); - return 0; + goto done; } table_index = p[0]; @@ -294,7 +297,11 @@ thrash (vlib_main_t * vm, pool_free (tm->route_pool); } - return 0; + +done: + unformat_free (line_input); + + return error; } /*? diff --git a/src/vnet/ip/ip6_neighbor.c b/src/vnet/ip/ip6_neighbor.c index 7229591e..6b53137f 100644 --- a/src/vnet/ip/ip6_neighbor.c +++ b/src/vnet/ip/ip6_neighbor.c @@ -2923,7 +2923,10 @@ ip6_neighbor_cmd (vlib_main_t * vm, unformat_input_t * main_input, else if (unformat (line_input, "ra-lifetime")) { if (!unformat (line_input, "%d", &ra_lifetime)) - return (error = unformat_parse_error (line_input)); + { + error = unformat_parse_error (line_input); + goto done; + } use_lifetime = 1; break; } @@ -2931,13 +2934,19 @@ ip6_neighbor_cmd (vlib_main_t * vm, unformat_input_t * main_input, { if (!unformat (line_input, "%d %d", &ra_initial_count, &ra_initial_interval)) - return (error = unformat_parse_error (line_input)); + { + error = unformat_parse_error (line_input); + goto done; + } break; } else if (unformat (line_input, "ra-interval")) { if (!unformat (line_input, "%d", &ra_max_interval)) - return (error = unformat_parse_error (line_input)); + { + error = unformat_parse_error (line_input); + goto done; + } if (!unformat (line_input, "%d", &ra_min_interval)) ra_min_interval = 0; @@ -2949,7 +2958,10 @@ ip6_neighbor_cmd (vlib_main_t * vm, unformat_input_t * main_input, break; } else - return (unformat_parse_error (line_input)); + { + error = unformat_parse_error (line_input); + goto done; + } } if (add_radv_info) @@ -3006,7 +3018,10 @@ ip6_neighbor_cmd (vlib_main_t * vm, unformat_input_t * main_input, else if (unformat (line_input, "no-onlink")) no_onlink = 1; else - return (unformat_parse_error (line_input)); + { + error = unformat_parse_error (line_input); + goto done; + } } ip6_neighbor_ra_prefix (vm, sw_if_index, @@ -3018,9 +3033,9 @@ ip6_neighbor_cmd (vlib_main_t * vm, unformat_input_t * main_input, off_link, no_autoconfig, no_onlink, is_no); } +done: unformat_free (line_input); -done: return error; } diff --git a/src/vnet/ip/lookup.c b/src/vnet/ip/lookup.c index 0ef0e7a6..807b87b6 100644 --- a/src/vnet/ip/lookup.c +++ b/src/vnet/ip/lookup.c @@ -568,8 +568,6 @@ vnet_ip_route_cmd (vlib_main_t * vm, } } - unformat_free (line_input); - if (vec_len (prefixs) == 0) { error = @@ -704,6 +702,7 @@ done: vec_free (dpos); vec_free (prefixs); vec_free (rpaths); + unformat_free (line_input); return error; } @@ -872,8 +871,6 @@ vnet_ip_mroute_cmd (vlib_main_t * vm, } } - unformat_free (line_input); - if (~0 == table_id) { /* @@ -970,6 +967,8 @@ vnet_ip_mroute_cmd (vlib_main_t * vm, (scount * gcount) / (timet[1] - timet[0])); done: + unformat_free (line_input); + return error; } @@ -1149,24 +1148,37 @@ probe_neighbor_address (vlib_main_t * vm, is_ip4 = 0; } else - return clib_error_return (0, "unknown input '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (sw_if_index == ~0) - return clib_error_return (0, "Interface required, not set."); + { + error = clib_error_return (0, "Interface required, not set."); + goto done; + } if (address_set == 0) - return clib_error_return (0, "ip address required, not set."); + { + error = clib_error_return (0, "ip address required, not set."); + goto done; + } if (address_set > 1) - return clib_error_return (0, "Multiple ip addresses not supported."); + { + error = clib_error_return (0, "Multiple ip addresses not supported."); + goto done; + } if (is_ip4) error = ip4_probe_neighbor_wait (vm, &a4, sw_if_index, retry_count); else error = ip6_probe_neighbor_wait (vm, &a6, sw_if_index, retry_count); +done: + unformat_free (line_input); + return error; } diff --git a/src/vnet/ipsec-gre/interface.c b/src/vnet/ipsec-gre/interface.c index 3b6e4ac2..0772ce73 100644 --- a/src/vnet/ipsec-gre/interface.c +++ b/src/vnet/ipsec-gre/interface.c @@ -232,6 +232,7 @@ create_ipsec_gre_tunnel_command_fn (vlib_main_t * vm, vnet_ipsec_gre_add_del_tunnel_args_t _a, *a = &_a; int rv; u32 sw_if_index; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -250,16 +251,24 @@ create_ipsec_gre_tunnel_command_fn (vlib_main_t * vm, else if (unformat (line_input, "remote-sa %d", &rsa)) num_m_args++; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); if (num_m_args < 4) - return clib_error_return (0, "mandatory argument(s) missing"); + { + error = clib_error_return (0, "mandatory argument(s) missing"); + goto done; + } if (memcmp (&src, &dst, sizeof (src)) == 0) - return clib_error_return (0, "src and dst are identical"); + { + error = clib_error_return (0, "src and dst are identical"); + goto done; + } memset (a, 0, sizeof (*a)); a->is_add = is_add; @@ -277,14 +286,19 @@ create_ipsec_gre_tunnel_command_fn (vlib_main_t * vm, vnet_get_main (), sw_if_index); break; case VNET_API_ERROR_INVALID_VALUE: - return clib_error_return (0, "GRE tunnel already exists..."); + error = clib_error_return (0, "GRE tunnel already exists..."); + goto done; default: - return clib_error_return (0, - "vnet_ipsec_gre_add_del_tunnel returned %d", - rv); + error = clib_error_return (0, + "vnet_ipsec_gre_add_del_tunnel returned %d", + rv); + goto done; } - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ diff --git a/src/vnet/ipsec/ipsec_cli.c b/src/vnet/ipsec/ipsec_cli.c index 3c1e26f2..0e034402 100644 --- a/src/vnet/ipsec/ipsec_cli.c +++ b/src/vnet/ipsec/ipsec_cli.c @@ -32,6 +32,7 @@ set_interface_spd_command_fn (vlib_main_t * vm, u32 sw_if_index = (u32) ~ 0; u32 spd_id; int is_add = 1; + clib_error_t *error = NULL; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -43,14 +44,18 @@ set_interface_spd_command_fn (vlib_main_t * vm, else if (unformat (line_input, "del")) is_add = 0; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); - - unformat_free (line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } ipsec_set_interface_spd (vm, sw_if_index, spd_id, is_add); - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -72,7 +77,7 @@ ipsec_sa_add_del_command_fn (vlib_main_t * vm, ipsec_sa_t sa; int is_add = ~0; u8 *ck = 0, *ik = 0; - clib_error_t *err = 0; + clib_error_t *error = NULL; memset (&sa, 0, sizeof (sa)); @@ -90,8 +95,11 @@ ipsec_sa_add_del_command_fn (vlib_main_t * vm, else if (unformat (line_input, "esp")) sa.protocol = IPSEC_PROTOCOL_ESP; else if (unformat (line_input, "ah")) - //sa.protocol = IPSEC_PROTOCOL_AH; - return clib_error_return (0, "unsupported security protocol 'AH'"); + { + //sa.protocol = IPSEC_PROTOCOL_AH; + error = clib_error_return (0, "unsupported security protocol 'AH'"); + goto done; + } else if (unformat (line_input, "crypto-key %U", unformat_hex_string, &ck)) sa.crypto_key_len = vec_len (ck); @@ -102,8 +110,12 @@ ipsec_sa_add_del_command_fn (vlib_main_t * vm, { if (sa.crypto_alg < IPSEC_CRYPTO_ALG_AES_CBC_128 || sa.crypto_alg >= IPSEC_CRYPTO_N_ALG) - return clib_error_return (0, "unsupported crypto-alg: '%U'", - format_ipsec_crypto_alg, sa.crypto_alg); + { + error = clib_error_return (0, "unsupported crypto-alg: '%U'", + format_ipsec_crypto_alg, + sa.crypto_alg); + goto done; + } } else if (unformat (line_input, "integ-key %U", unformat_hex_string, &ik)) @@ -113,8 +125,12 @@ ipsec_sa_add_del_command_fn (vlib_main_t * vm, { if (sa.integ_alg < IPSEC_INTEG_ALG_SHA1_96 || sa.integ_alg >= IPSEC_INTEG_N_ALG) - return clib_error_return (0, "unsupported integ-alg: '%U'", - format_ipsec_integ_alg, sa.integ_alg); + { + error = clib_error_return (0, "unsupported integ-alg: '%U'", + format_ipsec_integ_alg, + sa.integ_alg); + goto done; + } } else if (unformat (line_input, "tunnel-src %U", unformat_ip4_address, &sa.tunnel_src_addr.ip4)) @@ -135,12 +151,13 @@ ipsec_sa_add_del_command_fn (vlib_main_t * vm, sa.is_tunnel_ip6 = 1; } else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (sa.crypto_key_len > sizeof (sa.crypto_key)) sa.crypto_key_len = sizeof (sa.crypto_key); @@ -156,14 +173,17 @@ ipsec_sa_add_del_command_fn (vlib_main_t * vm, if (is_add) { ASSERT (im->cb.check_support_cb); - err = im->cb.check_support_cb (&sa); - if (err) - return err; + error = im->cb.check_support_cb (&sa); + if (error) + goto done; } ipsec_add_del_sa (vm, &sa, is_add); - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -183,6 +203,7 @@ ipsec_spd_add_del_command_fn (vlib_main_t * vm, unformat_input_t _line_input, *line_input = &_line_input; u32 spd_id = ~0; int is_add = ~0; + clib_error_t *error = NULL; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -196,18 +217,25 @@ ipsec_spd_add_del_command_fn (vlib_main_t * vm, else if (unformat (line_input, "%u", &spd_id)) ; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (spd_id == ~0) - return clib_error_return (0, "please specify SPD ID"); + { + error = clib_error_return (0, "please specify SPD ID"); + goto done; + } ipsec_add_del_spd (vm, spd_id, is_add); - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -230,6 +258,7 @@ ipsec_policy_add_del_command_fn (vlib_main_t * vm, int is_add = 0; int is_ip_any = 1; u32 tmp, tmp2; + clib_error_t *error = NULL; memset (&p, 0, sizeof (p)); p.lport.stop = p.rport.stop = ~0; @@ -262,7 +291,10 @@ ipsec_policy_add_del_command_fn (vlib_main_t * vm, &p.policy)) { if (p.policy == IPSEC_POLICY_ACTION_RESOLVE) - return clib_error_return (0, "unsupported action: 'resolve'"); + { + error = clib_error_return (0, "unsupported action: 'resolve'"); + goto done; + } } else if (unformat (line_input, "sa %u", &p.sa_id)) ; @@ -300,19 +332,24 @@ ipsec_policy_add_del_command_fn (vlib_main_t * vm, p.rport.stop = tmp2; } else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - ipsec_add_del_policy (vm, &p, is_add); if (is_ip_any) { p.is_ipv6 = 1; ipsec_add_del_policy (vm, &p, is_add); } - return 0; + +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -332,6 +369,7 @@ set_ipsec_sa_key_command_fn (vlib_main_t * vm, unformat_input_t _line_input, *line_input = &_line_input; ipsec_sa_t sa; u8 *ck = 0, *ik = 0; + clib_error_t *error = NULL; memset (&sa, 0, sizeof (sa)); @@ -349,12 +387,13 @@ set_ipsec_sa_key_command_fn (vlib_main_t * vm, if (unformat (line_input, "integ-key %U", unformat_hex_string, &ik)) sa.integ_key_len = vec_len (ik); else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (sa.crypto_key_len > sizeof (sa.crypto_key)) sa.crypto_key_len = sizeof (sa.crypto_key); @@ -369,7 +408,10 @@ set_ipsec_sa_key_command_fn (vlib_main_t * vm, ipsec_set_sa_key (vm, &sa); - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -649,6 +691,7 @@ create_ipsec_tunnel_command_fn (vlib_main_t * vm, ipsec_add_del_tunnel_args_t a; int rv; u32 num_m_args = 0; + clib_error_t *error = NULL; memset (&a, 0, sizeof (a)); a.is_add = 1; @@ -673,13 +716,18 @@ create_ipsec_tunnel_command_fn (vlib_main_t * vm, else if (unformat (line_input, "del")) a.is_add = 0; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); if (num_m_args < 4) - return clib_error_return (0, "mandatory argument(s) missing"); + { + error = clib_error_return (0, "mandatory argument(s) missing"); + goto done; + } rv = ipsec_add_del_tunnel_if (&a); @@ -689,16 +737,21 @@ create_ipsec_tunnel_command_fn (vlib_main_t * vm, break; case VNET_API_ERROR_INVALID_VALUE: if (a.is_add) - return clib_error_return (0, - "IPSec tunnel interface already exists..."); + error = clib_error_return (0, + "IPSec tunnel interface already exists..."); else - return clib_error_return (0, "IPSec tunnel interface not exists..."); + error = clib_error_return (0, "IPSec tunnel interface not exists..."); + goto done; default: - return clib_error_return (0, "ipsec_register_interface returned %d", - rv); + error = clib_error_return (0, "ipsec_register_interface returned %d", + rv); + goto done; } - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -720,6 +773,7 @@ set_interface_key_command_fn (vlib_main_t * vm, u32 hw_if_index = (u32) ~ 0; u32 alg; u8 *key = 0; + clib_error_t *error = NULL; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -748,25 +802,38 @@ set_interface_key_command_fn (vlib_main_t * vm, else if (unformat (line_input, "%U", unformat_hex_string, &key)) ; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (type == IPSEC_IF_SET_KEY_TYPE_NONE) - return clib_error_return (0, "unknown key type"); + { + error = clib_error_return (0, "unknown key type"); + goto done; + } if (alg > 0 && vec_len (key) == 0) - return clib_error_return (0, "key is not specified"); + { + error = clib_error_return (0, "key is not specified"); + goto done; + } if (hw_if_index == (u32) ~ 0) - return clib_error_return (0, "interface not specified"); + { + error = clib_error_return (0, "interface not specified"); + goto done; + } ipsec_set_interface_key (im->vnet_main, hw_if_index, type, alg, key); + +done: vec_free (key); + unformat_free (line_input); - return 0; + return error; } /* *INDENT-OFF* */ diff --git a/src/vnet/l2/l2_patch.c b/src/vnet/l2/l2_patch.c index 5e4691f4..ff3d2f3a 100644 --- a/src/vnet/l2/l2_patch.c +++ b/src/vnet/l2/l2_patch.c @@ -315,6 +315,7 @@ test_patch_command_fn (vlib_main_t * vm, int rx_set = 0; int tx_set = 0; int is_add = 1; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -335,10 +336,16 @@ test_patch_command_fn (vlib_main_t * vm, } if (rx_set == 0) - return clib_error_return (0, "rx interface not set"); + { + error = clib_error_return (0, "rx interface not set"); + goto done; + } if (tx_set == 0) - return clib_error_return (0, "tx interface not set"); + { + error = clib_error_return (0, "tx interface not set"); + goto done; + } rv = vnet_l2_patch_add_del (rx_sw_if_index, tx_sw_if_index, is_add); @@ -348,17 +355,24 @@ test_patch_command_fn (vlib_main_t * vm, break; case VNET_API_ERROR_INVALID_SW_IF_INDEX: - return clib_error_return (0, "rx interface not a physical port"); + error = clib_error_return (0, "rx interface not a physical port"); + goto done; case VNET_API_ERROR_INVALID_SW_IF_INDEX_2: - return clib_error_return (0, "tx interface not a physical port"); + error = clib_error_return (0, "tx interface not a physical port"); + goto done; default: - return clib_error_return + error = clib_error_return (0, "WARNING: vnet_l2_patch_add_del returned %d", rv); + goto done; } - return 0; + +done: + unformat_free (line_input); + + return error; } /*? diff --git a/src/vnet/l2/l2_xcrw.c b/src/vnet/l2/l2_xcrw.c index 70610a85..d08a5d8f 100644 --- a/src/vnet/l2/l2_xcrw.c +++ b/src/vnet/l2/l2_xcrw.c @@ -409,6 +409,7 @@ set_l2_xcrw_command_fn (vlib_main_t * vm, u8 *rw = 0; vnet_main_t *vnm = vnet_get_main (); int rv; + clib_error_t *error = NULL; if (!unformat_user (input, unformat_line_input, line_input)) @@ -416,8 +417,11 @@ set_l2_xcrw_command_fn (vlib_main_t * vm, if (!unformat (line_input, "%U", unformat_vnet_sw_interface, vnm, &l2_sw_if_index)) - return clib_error_return (0, "unknown input '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { @@ -436,7 +440,10 @@ set_l2_xcrw_command_fn (vlib_main_t * vm, } if (next_node_index == ~0) - return clib_error_return (0, "next node not specified"); + { + error = clib_error_return (0, "next node not specified"); + goto done; + } if (tx_fib_id != ~0) { @@ -448,7 +455,11 @@ set_l2_xcrw_command_fn (vlib_main_t * vm, p = hash_get (ip4_main.fib_index_by_table_id, tx_fib_id); if (p == 0) - return clib_error_return (0, "nonexistent tx_fib_id %d", tx_fib_id); + { + error = + clib_error_return (0, "nonexistent tx_fib_id %d", tx_fib_id); + goto done; + } tx_fib_index = p[0]; } @@ -463,16 +474,21 @@ set_l2_xcrw_command_fn (vlib_main_t * vm, break; case VNET_API_ERROR_INVALID_SW_IF_INDEX: - return clib_error_return (0, "%U not cross-connected", - format_vnet_sw_if_index_name, - vnm, l2_sw_if_index); + error = clib_error_return (0, "%U not cross-connected", + format_vnet_sw_if_index_name, + vnm, l2_sw_if_index); + goto done; + default: - return clib_error_return (0, "vnet_configure_l2_xcrw returned %d", rv); + error = clib_error_return (0, "vnet_configure_l2_xcrw returned %d", rv); + goto done; } +done: vec_free (rw); + unformat_free (line_input); - return 0; + return error; } /*? diff --git a/src/vnet/l2tp/l2tp.c b/src/vnet/l2tp/l2tp.c index a4531dab..2d323397 100644 --- a/src/vnet/l2tp/l2tp.c +++ b/src/vnet/l2tp/l2tp.c @@ -427,6 +427,7 @@ create_l2tpv3_tunnel_command_fn (vlib_main_t * vm, u32 sw_if_index; u32 encap_fib_id = ~0; u32 encap_fib_index = ~0; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -455,18 +456,22 @@ create_l2tpv3_tunnel_command_fn (vlib_main_t * vm, else if (unformat (line_input, "l2-sublayer-present")) l2_sublayer_present = 1; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (encap_fib_id != ~0) { uword *p; ip6_main_t *im = &ip6_main; if (!(p = hash_get (im->fib_index_by_table_id, encap_fib_id))) - return clib_error_return (0, "No fib with id %d", encap_fib_id); + { + error = clib_error_return (0, "No fib with id %d", encap_fib_id); + goto done; + } encap_fib_index = p[0]; } else @@ -475,9 +480,15 @@ create_l2tpv3_tunnel_command_fn (vlib_main_t * vm, } if (our_address_set == 0) - return clib_error_return (0, "our address not specified"); + { + error = clib_error_return (0, "our address not specified"); + goto done; + } if (client_address_set == 0) - return clib_error_return (0, "client address not specified"); + { + error = clib_error_return (0, "client address not specified"); + goto done; + } rv = create_l2tpv3_ipv6_tunnel (lm, &client_address, &our_address, local_session_id, remote_session_id, @@ -491,16 +502,22 @@ create_l2tpv3_tunnel_command_fn (vlib_main_t * vm, vnet_get_main (), sw_if_index); break; case VNET_API_ERROR_INVALID_VALUE: - return clib_error_return (0, "session already exists..."); + error = clib_error_return (0, "session already exists..."); + goto done; case VNET_API_ERROR_NO_SUCH_ENTRY: - return clib_error_return (0, "session does not exist..."); + error = clib_error_return (0, "session does not exist..."); + goto done; default: - return clib_error_return (0, "l2tp_session_add_del returned %d", rv); + error = clib_error_return (0, "l2tp_session_add_del returned %d", rv); + goto done; } - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ diff --git a/src/vnet/lisp-cp/lisp_cli.c b/src/vnet/lisp-cp/lisp_cli.c index 25d11c61..05df9fb6 100644 --- a/src/vnet/lisp-cp/lisp_cli.c +++ b/src/vnet/lisp-cp/lisp_cli.c @@ -25,6 +25,7 @@ lisp_show_adjacencies_command_fn (vlib_main_t * vm, vlib_cli_output (vm, "%s %40s\n", "leid", "reid"); unformat_input_t _line_input, *line_input = &_line_input; u32 vni = ~0; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -38,14 +39,14 @@ lisp_show_adjacencies_command_fn (vlib_main_t * vm, { vlib_cli_output (vm, "parse error: '%U'", format_unformat_error, line_input); - return 0; + goto done; } } if (~0 == vni) { vlib_cli_output (vm, "error: no vni specified!"); - return 0; + goto done; } adjs = vnet_lisp_adjacencies_get_by_vni (vni); @@ -57,7 +58,10 @@ lisp_show_adjacencies_command_fn (vlib_main_t * vm, } vec_free (adjs); - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -77,6 +81,7 @@ lisp_add_del_map_server_command_fn (vlib_main_t * vm, u8 is_add = 1, ip_set = 0; ip_address_t ip; unformat_input_t _line_input, *line_input = &_line_input; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -94,14 +99,14 @@ lisp_add_del_map_server_command_fn (vlib_main_t * vm, { vlib_cli_output (vm, "parse error: '%U'", format_unformat_error, line_input); - return 0; + goto done; } } if (!ip_set) { vlib_cli_output (vm, "map-server ip address not set!"); - return 0; + goto done; } rv = vnet_lisp_add_del_map_server (&ip, is_add); @@ -109,7 +114,10 @@ lisp_add_del_map_server_command_fn (vlib_main_t * vm, vlib_cli_output (vm, "failed to %s map-server!", is_add ? "add" : "delete"); - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -191,7 +199,7 @@ lisp_add_del_local_eid_command_fn (vlib_main_t * vm, unformat_input_t * input, if (key && (0 == key_id)) { vlib_cli_output (vm, "invalid key_id!"); - return 0; + goto done;; } gid_address_copy (&a->eid, &eid); @@ -213,6 +221,8 @@ done: vec_free (locator_set_name); gid_address_free (&a->eid); vec_free (a->key); + unformat_free (line_input); + return error; } @@ -233,6 +243,7 @@ lisp_eid_table_map_command_fn (vlib_main_t * vm, u8 is_add = 1, is_l2 = 0; u32 vni = 0, dp_id = 0; unformat_input_t _line_input, *line_input = &_line_input; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -250,11 +261,16 @@ lisp_eid_table_map_command_fn (vlib_main_t * vm, is_l2 = 1; else { - return unformat_parse_error (line_input); + error = unformat_parse_error (line_input); + goto done; } } vnet_lisp_eid_table_map (vni, dp_id, is_l2, is_add); - return 0; + +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -479,7 +495,7 @@ lisp_add_del_adjacency_command_fn (vlib_main_t * vm, unformat_input_t * input, != ip_prefix_version (leid_ippref))) { clib_warning ("remote and local EIDs are of different types!"); - return error; + goto done; } memset (a, 0, sizeof (a[0])); @@ -512,6 +528,7 @@ lisp_map_request_mode_command_fn (vlib_main_t * vm, { unformat_input_t _i, *i = &_i; map_request_mode_t mr_mode = _MR_MODE_MAX; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, i)) @@ -533,12 +550,15 @@ lisp_map_request_mode_command_fn (vlib_main_t * vm, if (_MR_MODE_MAX == mr_mode) { clib_warning ("No LISP map request mode entered!"); - return 0; + goto done; } vnet_lisp_set_map_request_mode (mr_mode); + done: - return 0; + unformat_free (i); + + return error; } /* *INDENT-OFF* */ @@ -630,7 +650,10 @@ lisp_pitr_set_locator_set_command_fn (vlib_main_t * vm, else if (unformat (line_input, "disable")) is_add = 0; else - return clib_error_return (0, "parse error"); + { + error = clib_error_return (0, "parse error"); + goto done; + } } if (!locator_name_set) @@ -648,6 +671,8 @@ lisp_pitr_set_locator_set_command_fn (vlib_main_t * vm, done: if (locator_set_name) vec_free (locator_set_name); + unformat_free (line_input); + return error; } @@ -771,6 +796,7 @@ lisp_show_eid_table_command_fn (vlib_main_t * vm, gid_address_t eid; u8 print_all = 1; u8 filter = 0; + clib_error_t *error = NULL; memset (&eid, 0, sizeof (eid)); @@ -787,8 +813,11 @@ lisp_show_eid_table_command_fn (vlib_main_t * vm, else if (unformat (line_input, "remote")) filter = 2; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } vlib_cli_output (vm, "%-35s%-20s%-30s%-20s%-s", @@ -818,7 +847,7 @@ lisp_show_eid_table_command_fn (vlib_main_t * vm, { mi = gid_dictionary_lookup (&lcm->mapping_index_by_gid, &eid); if ((u32) ~ 0 == mi) - return 0; + goto done; mapit = pool_elt_at_index (lcm->mapping_pool, mi); locator_set_t *ls = pool_elt_at_index (lcm->locator_set_pool, @@ -827,14 +856,17 @@ lisp_show_eid_table_command_fn (vlib_main_t * vm, if (filter && !((1 == filter && ls->local) || (2 == filter && !ls->local))) { - return 0; + goto done; } vlib_cli_output (vm, "%U,", format_eid_entry, lcm->vnet_main, lcm, mapit, ls); } - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -853,6 +885,7 @@ lisp_enable_disable_command_fn (vlib_main_t * vm, unformat_input_t * input, unformat_input_t _line_input, *line_input = &_line_input; u8 is_enabled = 0; u8 is_set = 0; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -869,16 +902,24 @@ lisp_enable_disable_command_fn (vlib_main_t * vm, unformat_input_t * input, is_set = 1; else { - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; } } if (!is_set) - return clib_error_return (0, "state not set"); + { + error = clib_error_return (0, "state not set"); + goto done; + } vnet_lisp_enable_disable (is_enabled); - return 0; + +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -897,6 +938,7 @@ lisp_map_register_enable_disable_command_fn (vlib_main_t * vm, unformat_input_t _line_input, *line_input = &_line_input; u8 is_enabled = 0; u8 is_set = 0; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -915,18 +957,22 @@ lisp_map_register_enable_disable_command_fn (vlib_main_t * vm, { vlib_cli_output (vm, "parse error: '%U'", format_unformat_error, line_input); - return 0; + goto done; } } if (!is_set) { vlib_cli_output (vm, "state not set!"); - return 0; + goto done; } vnet_lisp_map_register_enable_disable (is_enabled); - return 0; + +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -945,6 +991,7 @@ lisp_rloc_probe_enable_disable_command_fn (vlib_main_t * vm, unformat_input_t _line_input, *line_input = &_line_input; u8 is_enabled = 0; u8 is_set = 0; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -963,18 +1010,22 @@ lisp_rloc_probe_enable_disable_command_fn (vlib_main_t * vm, { vlib_cli_output (vm, "parse error: '%U'", format_unformat_error, line_input); - return 0; + goto done; } } if (!is_set) { vlib_cli_output (vm, "state not set!"); - return 0; + goto done; } vnet_lisp_rloc_probe_enable_disable (is_enabled); - return 0; + +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -1022,6 +1073,7 @@ lisp_show_eid_table_map_command_fn (vlib_main_t * vm, lisp_cp_main_t *lcm = vnet_lisp_cp_get_main (); uword *vni_table = 0; u8 is_l2 = 0; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -1040,14 +1092,17 @@ lisp_show_eid_table_map_command_fn (vlib_main_t * vm, is_l2 = 0; } else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } if (!vni_table) { vlib_cli_output (vm, "Error: expected l2|l3 param!\n"); - return 0; + goto done; } vlib_cli_output (vm, "%=10s%=10s", "VNI", is_l2 ? "BD" : "VRF"); @@ -1059,7 +1114,10 @@ lisp_show_eid_table_map_command_fn (vlib_main_t * vm, })); /* *INDENT-ON* */ - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ @@ -1131,6 +1189,8 @@ done: vec_free (locators); if (locator_set_name) vec_free (locator_set_name); + unformat_free (line_input); + return error; } @@ -1205,6 +1265,8 @@ lisp_add_del_locator_in_set_command_fn (vlib_main_t * vm, done: vec_free (locators); vec_free (locator_set_name); + unformat_free (line_input); + return error; } @@ -1322,6 +1384,8 @@ lisp_add_del_map_resolver_command_fn (vlib_main_t * vm, } done: + unformat_free (line_input); + return error; } @@ -1372,11 +1436,11 @@ lisp_add_del_mreq_itr_rlocs_command_fn (vlib_main_t * vm, is_add ? "add" : "delete"); } +done: vec_free (locator_set_name); + unformat_free (line_input); -done: return error; - } /* *INDENT-OFF* */ @@ -1438,7 +1502,10 @@ lisp_use_petr_set_locator_set_command_fn (vlib_main_t * vm, else if (unformat (line_input, "disable")) is_add = 0; else - return clib_error_return (0, "parse error"); + { + error = clib_error_return (0, "parse error"); + goto done; + } } if (!ip_set) @@ -1454,6 +1521,8 @@ lisp_use_petr_set_locator_set_command_fn (vlib_main_t * vm, } done: + unformat_free (line_input); + return error; } diff --git a/src/vnet/lisp-gpe/interface.c b/src/vnet/lisp-gpe/interface.c index 2142e095..19ac22e7 100644 --- a/src/vnet/lisp-gpe/interface.c +++ b/src/vnet/lisp-gpe/interface.c @@ -794,6 +794,7 @@ lisp_gpe_add_del_iface_command_fn (vlib_main_t * vm, unformat_input_t * input, u32 table_id, vni, bd_id; u8 vni_is_set = 0, vrf_is_set = 0, bd_index_is_set = 0; u8 nsh_iface = 0; + clib_error_t *error = NULL; if (vnet_lisp_gpe_enable_disable_status () == 0) { @@ -828,8 +829,9 @@ lisp_gpe_add_del_iface_command_fn (vlib_main_t * vm, unformat_input_t * input, } else { - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; } } @@ -839,7 +841,8 @@ lisp_gpe_add_del_iface_command_fn (vlib_main_t * vm, unformat_input_t * input, { if (~0 == lisp_gpe_add_nsh_iface (&lisp_gpe_main)) { - return clib_error_return (0, "NSH interface not created"); + error = clib_error_return (0, "NSH interface not created"); + goto done; } } else @@ -850,21 +853,34 @@ lisp_gpe_add_del_iface_command_fn (vlib_main_t * vm, unformat_input_t * input, } if (vrf_is_set && bd_index_is_set) - return clib_error_return (0, - "Cannot set both vrf and brdige domain index!"); + { + error = clib_error_return + (0, "Cannot set both vrf and brdige domain index!"); + goto done; + } if (!vni_is_set) - return clib_error_return (0, "vni must be set!"); + { + error = clib_error_return (0, "vni must be set!"); + goto done; + } if (!vrf_is_set && !bd_index_is_set) - return clib_error_return (0, "vrf or bridge domain index must be set!"); + { + error = + clib_error_return (0, "vrf or bridge domain index must be set!"); + goto done; + } if (bd_index_is_set) { if (is_add) { if (~0 == lisp_gpe_tenant_l2_iface_add_or_lock (vni, bd_id)) - return clib_error_return (0, "L2 interface not created"); + { + error = clib_error_return (0, "L2 interface not created"); + goto done; + } } else lisp_gpe_tenant_l2_iface_unlock (vni); @@ -874,13 +890,35 @@ lisp_gpe_add_del_iface_command_fn (vlib_main_t * vm, unformat_input_t * input, if (is_add) { if (~0 == lisp_gpe_tenant_l3_iface_add_or_lock (vni, table_id)) - return clib_error_return (0, "L3 interface not created"); + { + error = clib_error_return (0, "L3 interface not created"); + goto done; + } } else lisp_gpe_tenant_l3_iface_unlock (vni); } - return (NULL); + if (nsh_iface) + { + if (is_add) + { + if (~0 == lisp_gpe_add_nsh_iface (&lisp_gpe_main)) + { + error = clib_error_return (0, "NSH interface not created"); + goto done; + } + else + { + lisp_gpe_del_nsh_iface (&lisp_gpe_main); + } + } + } + +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ diff --git a/src/vnet/lisp-gpe/lisp_gpe.c b/src/vnet/lisp-gpe/lisp_gpe.c index 1f8afdae..f2fbcbd5 100644 --- a/src/vnet/lisp-gpe/lisp_gpe.c +++ b/src/vnet/lisp-gpe/lisp_gpe.c @@ -218,6 +218,7 @@ lisp_gpe_enable_disable_command_fn (vlib_main_t * vm, unformat_input_t _line_input, *line_input = &_line_input; u8 is_en = 1; vnet_lisp_gpe_enable_disable_args_t _a, *a = &_a; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -231,12 +232,18 @@ lisp_gpe_enable_disable_command_fn (vlib_main_t * vm, is_en = 0; else { - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; } } a->is_en = is_en; - return vnet_lisp_gpe_enable_disable (a); + error = vnet_lisp_gpe_enable_disable (a); + +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ diff --git a/src/vnet/map/map.c b/src/vnet/map/map.c index aeec6a94..a2d28118 100644 --- a/src/vnet/map/map.c +++ b/src/vnet/map/map.c @@ -465,6 +465,8 @@ map_security_check_command_fn (vlib_main_t * vm, { unformat_input_t _line_input, *line_input = &_line_input; map_main_t *mm = &map_main; + clib_error_t *error = NULL; + /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -476,11 +478,17 @@ map_security_check_command_fn (vlib_main_t * vm, else if (unformat (line_input, "on")) mm->sec_check = true; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } + +done: unformat_free (line_input); - return 0; + + return error; } static clib_error_t * @@ -490,6 +498,8 @@ map_security_check_frag_command_fn (vlib_main_t * vm, { unformat_input_t _line_input, *line_input = &_line_input; map_main_t *mm = &map_main; + clib_error_t *error = NULL; + /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -501,11 +511,17 @@ map_security_check_frag_command_fn (vlib_main_t * vm, else if (unformat (line_input, "on")) mm->sec_check_frag = true; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } + +done: unformat_free (line_input); - return 0; + + return error; } static clib_error_t * @@ -523,6 +539,7 @@ map_add_domain_command_fn (vlib_main_t * vm, u32 mtu = 0; u8 flags = 0; ip6_src_len = 128; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -559,20 +576,28 @@ map_add_domain_command_fn (vlib_main_t * vm, else if (unformat (line_input, "map-t")) flags |= MAP_DOMAIN_TRANSLATION; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); if (num_m_args < 3) - return clib_error_return (0, "mandatory argument(s) missing"); + { + error = clib_error_return (0, "mandatory argument(s) missing"); + goto done; + } map_create_domain (&ip4_prefix, ip4_prefix_len, &ip6_prefix, ip6_prefix_len, &ip6_src, ip6_src_len, ea_bits_len, psid_offset, psid_length, &map_domain_index, mtu, flags); - return 0; +done: + unformat_free (line_input); + + return error; } static clib_error_t * @@ -582,6 +607,7 @@ map_del_domain_command_fn (vlib_main_t * vm, unformat_input_t _line_input, *line_input = &_line_input; u32 num_m_args = 0; u32 map_domain_index; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -592,17 +618,25 @@ map_del_domain_command_fn (vlib_main_t * vm, if (unformat (line_input, "index %d", &map_domain_index)) num_m_args++; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); if (num_m_args != 1) - return clib_error_return (0, "mandatory argument(s) missing"); + { + error = clib_error_return (0, "mandatory argument(s) missing"); + goto done; + } map_delete_domain (map_domain_index); - return 0; +done: + unformat_free (line_input); + + return error; } static clib_error_t * @@ -613,6 +647,7 @@ map_add_rule_command_fn (vlib_main_t * vm, ip6_address_t tep; u32 num_m_args = 0; u32 psid = 0, map_domain_index; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -628,19 +663,29 @@ map_add_rule_command_fn (vlib_main_t * vm, if (unformat (line_input, "ip6-dst %U", unformat_ip6_address, &tep)) num_m_args++; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); if (num_m_args != 3) - return clib_error_return (0, "mandatory argument(s) missing"); + { + error = clib_error_return (0, "mandatory argument(s) missing"); + goto done; + } if (map_add_del_psid (map_domain_index, psid, &tep, 1) != 0) { - return clib_error_return (0, "Failing to add Mapping Rule"); + error = clib_error_return (0, "Failing to add Mapping Rule"); + goto done; } - return 0; + +done: + unformat_free (line_input); + + return error; } #if MAP_SKIP_IP6_LOOKUP @@ -653,6 +698,7 @@ map_pre_resolve_command_fn (vlib_main_t * vm, ip4_address_t ip4nh; ip6_address_t ip6nh; map_main_t *mm = &map_main; + clib_error_t *error = NULL; memset (&ip4nh, 0, sizeof (ip4nh)); memset (&ip6nh, 0, sizeof (ip6nh)); @@ -669,14 +715,19 @@ map_pre_resolve_command_fn (vlib_main_t * vm, if (unformat (line_input, "ip6-nh %U", unformat_ip6_address, &ip6nh)) mm->preresolve_ip6 = ip6nh; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); map_pre_resolve (&ip4nh, &ip6nh); - return 0; +done: + unformat_free (line_input); + + return error; } #endif @@ -688,6 +739,7 @@ map_icmp_relay_source_address_command_fn (vlib_main_t * vm, unformat_input_t _line_input, *line_input = &_line_input; ip4_address_t icmp_src_address; map_main_t *mm = &map_main; + clib_error_t *error = NULL; mm->icmp4_src_address.as_u32 = 0; @@ -701,12 +753,17 @@ map_icmp_relay_source_address_command_fn (vlib_main_t * vm, (line_input, "%U", unformat_ip4_address, &icmp_src_address)) mm->icmp4_src_address = icmp_src_address; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } + +done: unformat_free (line_input); - return 0; + return error; } static clib_error_t * @@ -717,6 +774,7 @@ map_icmp_unreachables_command_fn (vlib_main_t * vm, unformat_input_t _line_input, *line_input = &_line_input; map_main_t *mm = &map_main; int num_m_args = 0; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -730,16 +788,21 @@ map_icmp_unreachables_command_fn (vlib_main_t * vm, else if (unformat (line_input, "off")) mm->icmp6_enabled = false; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); if (num_m_args != 1) - return clib_error_return (0, "mandatory argument(s) missing"); + error = clib_error_return (0, "mandatory argument(s) missing"); - return 0; +done: + unformat_free (line_input); + + return error; } static clib_error_t * @@ -748,6 +811,7 @@ map_fragment_command_fn (vlib_main_t * vm, { unformat_input_t _line_input, *line_input = &_line_input; map_main_t *mm = &map_main; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -760,12 +824,17 @@ map_fragment_command_fn (vlib_main_t * vm, else if (unformat (line_input, "outer")) mm->frag_inner = false; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } + +done: unformat_free (line_input); - return 0; + return error; } static clib_error_t * @@ -775,6 +844,7 @@ map_fragment_df_command_fn (vlib_main_t * vm, { unformat_input_t _line_input, *line_input = &_line_input; map_main_t *mm = &map_main; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -787,12 +857,17 @@ map_fragment_df_command_fn (vlib_main_t * vm, else if (unformat (line_input, "off")) mm->frag_ignore_df = false; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } + +done: unformat_free (line_input); - return 0; + return error; } static clib_error_t * @@ -803,6 +878,7 @@ map_traffic_class_command_fn (vlib_main_t * vm, unformat_input_t _line_input, *line_input = &_line_input; map_main_t *mm = &map_main; u32 tc = 0; + clib_error_t *error = NULL; mm->tc_copy = false; @@ -817,12 +893,17 @@ map_traffic_class_command_fn (vlib_main_t * vm, else if (unformat (line_input, "%x", &tc)) mm->tc = tc & 0xff; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } + +done: unformat_free (line_input); - return 0; + return error; } static u8 * @@ -922,6 +1003,7 @@ show_map_domain_command_fn (vlib_main_t * vm, unformat_input_t * input, map_domain_t *d; bool counters = false; u32 map_domain_index = ~0; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -934,10 +1016,12 @@ show_map_domain_command_fn (vlib_main_t * vm, unformat_input_t * input, else if (unformat (line_input, "index %d", &map_domain_index)) ; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); if (pool_elts (mm->domains) == 0) vlib_cli_output (vm, "No MAP domains are configured..."); @@ -952,15 +1036,19 @@ show_map_domain_command_fn (vlib_main_t * vm, unformat_input_t * input, { if (pool_is_free_index (mm->domains, map_domain_index)) { - return clib_error_return (0, "MAP domain does not exists %d", - map_domain_index); + error = clib_error_return (0, "MAP domain does not exists %d", + map_domain_index); + goto done; } d = pool_elt_at_index (mm->domains, map_domain_index); vlib_cli_output (vm, "%U", format_map_domain, d, counters); } - return 0; +done: + unformat_free (line_input); + + return error; } static clib_error_t * diff --git a/src/vnet/mpls/mpls.c b/src/vnet/mpls/mpls.c index 0e610e17..7ae4aa00 100644 --- a/src/vnet/mpls/mpls.c +++ b/src/vnet/mpls/mpls.c @@ -470,6 +470,8 @@ vnet_mpls_local_label (vlib_main_t * vm, } done: + unformat_free (line_input); + return error; } diff --git a/src/vnet/mpls/mpls_tunnel.c b/src/vnet/mpls/mpls_tunnel.c index 8d1e30a3..e488271d 100644 --- a/src/vnet/mpls/mpls_tunnel.c +++ b/src/vnet/mpls/mpls_tunnel.c @@ -535,6 +535,7 @@ vnet_create_mpls_tunnel_command_fn (vlib_main_t * vm, fib_route_path_t rpath, *rpaths = NULL; mpls_label_t out_label = MPLS_LABEL_INVALID, *labels = NULL; u32 sw_if_index; + clib_error_t *error = NULL; memset(&rpath, 0, sizeof(rpath)); @@ -595,8 +596,11 @@ vnet_create_mpls_tunnel_command_fn (vlib_main_t * vm, else if (unformat (line_input, "l2-only")) l2_only = 1; else - return clib_error_return (0, "unknown input '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } } if (is_del) @@ -606,17 +610,22 @@ vnet_create_mpls_tunnel_command_fn (vlib_main_t * vm, else { if (0 == vec_len(labels)) - return clib_error_return (0, "No Output Labels '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "No Output Labels '%U'", + format_unformat_error, line_input); + goto done; + } vec_add1(rpaths, rpath); vnet_mpls_tunnel_add(rpaths, labels, l2_only, &sw_if_index); } +done: vec_free(labels); vec_free(rpaths); + unformat_free (line_input); - return (NULL); + return error; } /*? diff --git a/src/vnet/pg/cli.c b/src/vnet/pg/cli.c index f5896b43..3c249a7b 100644 --- a/src/vnet/pg/cli.c +++ b/src/vnet/pg/cli.c @@ -547,21 +547,30 @@ pg_capture_cmd_fn (vlib_main_t * vm, else { error = clib_error_create ("unknown input `%U'", - format_unformat_error, input); - return error; + format_unformat_error, line_input); + goto done; } } if (!hi) - return clib_error_return (0, "Please specify interface name"); + { + error = clib_error_return (0, "Please specify interface name"); + goto done; + } if (hi->dev_class_index != pg_dev_class.index) - return clib_error_return (0, "Please specify packet-generator interface"); + { + error = + clib_error_return (0, "Please specify packet-generator interface"); + goto done; + } if (!pcap_file_name && is_disable == 0) - return clib_error_return (0, "Please specify pcap file name"); + { + error = clib_error_return (0, "Please specify pcap file name"); + goto done; + } - unformat_free (line_input); pg_capture_args_t _a, *a = &_a; @@ -572,6 +581,10 @@ pg_capture_cmd_fn (vlib_main_t * vm, a->count = count; error = pg_capture (a); + +done: + unformat_free (line_input); + return error; } @@ -590,6 +603,7 @@ create_pg_if_cmd_fn (vlib_main_t * vm, pg_main_t *pg = &pg_main; unformat_input_t _line_input, *line_input = &_line_input; u32 if_id; + clib_error_t *error = NULL; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -600,14 +614,19 @@ create_pg_if_cmd_fn (vlib_main_t * vm, ; else - return clib_error_create ("unknown input `%U'", - format_unformat_error, input); + { + error = clib_error_create ("unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } + pg_interface_add_or_get (pg, if_id); + +done: unformat_free (line_input); - pg_interface_add_or_get (pg, if_id); - return 0; + return error; } /* *INDENT-OFF* */ diff --git a/src/vnet/policer/node_funcs.c b/src/vnet/policer/node_funcs.c index 1f4997ff..457dd09f 100644 --- a/src/vnet/policer/node_funcs.c +++ b/src/vnet/policer/node_funcs.c @@ -447,6 +447,7 @@ test_policer_command_fn (vlib_main_t * vm, int rx_set = 0; int is_add = 1; int is_show = 0; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -468,7 +469,10 @@ test_policer_command_fn (vlib_main_t * vm, } if (rx_set == 0) - return clib_error_return (0, "interface not set"); + { + error = clib_error_return (0, "interface not set"); + goto done; + } if (is_show) { @@ -477,12 +481,13 @@ test_policer_command_fn (vlib_main_t * vm, policer = pool_elt_at_index (pm->policers, pi); vlib_cli_output (vm, "%U", format_policer_instance, policer); - return 0; + goto done; } if (is_add && config_name == 0) { - return clib_error_return (0, "policer config name required"); + error = clib_error_return (0, "policer config name required"); + goto done; } rv = test_policer_add_del (rx_sw_if_index, config_name, is_add); @@ -493,11 +498,15 @@ test_policer_command_fn (vlib_main_t * vm, break; default: - return clib_error_return + error = clib_error_return (0, "WARNING: vnet_vnet_policer_add_del returned %d", rv); + goto done; } - return 0; +done: + unformat_free (line_input); + + return error; } /* *INDENT-OFF* */ diff --git a/src/vnet/policer/policer.c b/src/vnet/policer/policer.c index 290a6af5..cd754e29 100644 --- a/src/vnet/policer/policer.c +++ b/src/vnet/policer/policer.c @@ -413,6 +413,7 @@ configure_policer_command_fn (vlib_main_t * vm, u8 is_add = 1; u8 *name = 0; u32 pi; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -433,13 +434,19 @@ configure_policer_command_fn (vlib_main_t * vm, foreach_config_param #undef _ else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } + error = policer_add_del (vm, name, &c, &pi, is_add); + +done: unformat_free (line_input); - return policer_add_del (vm, name, &c, &pi, is_add); + return error; } /* *INDENT-OFF* */ diff --git a/src/vnet/unix/tapcli.c b/src/vnet/unix/tapcli.c index 48e81b50..25c930c6 100644 --- a/src/vnet/unix/tapcli.c +++ b/src/vnet/unix/tapcli.c @@ -1308,6 +1308,7 @@ tap_connect_command_fn (vlib_main_t * vm, int ip6_address_set = 0; u32 ip4_mask_width = 0; u32 ip6_mask_width = 0; + clib_error_t *error = NULL; if (tm->is_disabled) return clib_error_return (0, "device disabled..."); @@ -1336,12 +1337,18 @@ tap_connect_command_fn (vlib_main_t * vm, else if (unformat (line_input, "%s", &intfc_name)) ; else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } } if (intfc_name == 0) - return clib_error_return (0, "interface name must be specified"); + { + error = clib_error_return (0, "interface name must be specified"); + goto done; + } memset (ap, 0, sizeof (*ap)); @@ -1367,48 +1374,64 @@ tap_connect_command_fn (vlib_main_t * vm, switch (rv) { case VNET_API_ERROR_SYSCALL_ERROR_1: - return clib_error_return (0, "Couldn't open /dev/net/tun"); + error = clib_error_return (0, "Couldn't open /dev/net/tun"); + goto done; case VNET_API_ERROR_SYSCALL_ERROR_2: - return clib_error_return (0, "Error setting flags on '%s'", intfc_name); - + error = clib_error_return (0, "Error setting flags on '%s'", intfc_name); + goto done; + case VNET_API_ERROR_SYSCALL_ERROR_3: - return clib_error_return (0, "Couldn't open provisioning socket"); + error = clib_error_return (0, "Couldn't open provisioning socket"); + goto done; case VNET_API_ERROR_SYSCALL_ERROR_4: - return clib_error_return (0, "Couldn't get if_index"); + error = clib_error_return (0, "Couldn't get if_index"); + goto done; case VNET_API_ERROR_SYSCALL_ERROR_5: - return clib_error_return (0, "Couldn't bind provisioning socket"); + error = clib_error_return (0, "Couldn't bind provisioning socket"); + goto done; case VNET_API_ERROR_SYSCALL_ERROR_6: - return clib_error_return (0, "Couldn't set device non-blocking flag"); + error = clib_error_return (0, "Couldn't set device non-blocking flag"); + goto done; case VNET_API_ERROR_SYSCALL_ERROR_7: - return clib_error_return (0, "Couldn't set device MTU"); + error = clib_error_return (0, "Couldn't set device MTU"); + goto done; case VNET_API_ERROR_SYSCALL_ERROR_8: - return clib_error_return (0, "Couldn't get interface flags"); + error = clib_error_return (0, "Couldn't get interface flags"); + goto done; case VNET_API_ERROR_SYSCALL_ERROR_9: - return clib_error_return (0, "Couldn't set intfc admin state up"); + error = clib_error_return (0, "Couldn't set intfc admin state up"); + goto done; case VNET_API_ERROR_SYSCALL_ERROR_10: - return clib_error_return (0, "Couldn't set intfc address/mask"); + error = clib_error_return (0, "Couldn't set intfc address/mask"); + goto done; case VNET_API_ERROR_INVALID_REGISTRATION: - return clib_error_return (0, "Invalid registration"); + error = clib_error_return (0, "Invalid registration"); + goto done; case 0: break; default: - return clib_error_return (0, "Unknown error: %d", rv); + error = clib_error_return (0, "Unknown error: %d", rv); + goto done; } vlib_cli_output(vm, "%U\n", format_vnet_sw_if_index_name, vnet_get_main(), sw_if_index); - return 0; + +done: + unformat_free (line_input); + + return error; } VLIB_CLI_COMMAND (tap_connect_command, static) = { diff --git a/src/vnet/vxlan-gpe/vxlan_gpe.c b/src/vnet/vxlan-gpe/vxlan_gpe.c index b97510c4..2cba596f 100644 --- a/src/vnet/vxlan-gpe/vxlan_gpe.c +++ b/src/vnet/vxlan-gpe/vxlan_gpe.c @@ -454,6 +454,7 @@ vxlan_gpe_add_del_tunnel_command_fn (vlib_main_t * vm, u32 tmp; vnet_vxlan_gpe_add_del_tunnel_args_t _a, * a = &_a; u32 sw_if_index; + clib_error_t *error = NULL; /* Get a line of input. */ if (! unformat_user (input, unformat_line_input, line_input)) @@ -494,7 +495,10 @@ vxlan_gpe_add_del_tunnel_command_fn (vlib_main_t * vm, encap_fib_index = ip4_fib_index_from_table_id (tmp); if (encap_fib_index == ~0) - return clib_error_return (0, "nonexistent encap fib id %d", tmp); + { + error = clib_error_return (0, "nonexistent encap fib id %d", tmp); + goto done; + } } else if (unformat (line_input, "decap-vrf-id %d", &tmp)) { @@ -504,7 +508,10 @@ vxlan_gpe_add_del_tunnel_command_fn (vlib_main_t * vm, decap_fib_index = ip4_fib_index_from_table_id (tmp); if (decap_fib_index == ~0) - return clib_error_return (0, "nonexistent decap fib id %d", tmp); + { + error = clib_error_return (0, "nonexistent decap fib id %d", tmp); + goto done; + } } else if (unformat (line_input, "vni %d", &vni)) vni_set = 1; @@ -517,27 +524,43 @@ vxlan_gpe_add_del_tunnel_command_fn (vlib_main_t * vm, else if (unformat(line_input, "next-nsh")) protocol = VXLAN_GPE_PROTOCOL_NSH; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (local_set == 0) - return clib_error_return (0, "tunnel local address not specified"); + { + error = clib_error_return (0, "tunnel local address not specified"); + goto done; + } if (remote_set == 0) - return clib_error_return (0, "tunnel remote address not specified"); + { + error = clib_error_return (0, "tunnel remote address not specified"); + goto done; + } if (ipv4_set && ipv6_set) - return clib_error_return (0, "both IPv4 and IPv6 addresses specified"); + { + error = clib_error_return (0, "both IPv4 and IPv6 addresses specified"); + goto done; + } if ((ipv4_set && memcmp(&local.ip4, &remote.ip4, sizeof(local.ip4)) == 0) || (ipv6_set && memcmp(&local.ip6, &remote.ip6, sizeof(local.ip6)) == 0)) - return clib_error_return (0, "src and dst addresses are identical"); + { + error = clib_error_return (0, "src and dst addresses are identical"); + goto done; + } if (vni_set == 0) - return clib_error_return (0, "vni not specified"); + { + error = clib_error_return (0, "vni not specified"); + goto done; + } memset (a, 0, sizeof (*a)); @@ -558,20 +581,27 @@ vxlan_gpe_add_del_tunnel_command_fn (vlib_main_t * vm, vlib_cli_output(vm, "%U\n", format_vnet_sw_if_index_name, vnet_get_main(), sw_if_index); break; case VNET_API_ERROR_INVALID_DECAP_NEXT: - return clib_error_return (0, "invalid decap-next..."); + error = clib_error_return (0, "invalid decap-next..."); + goto done; case VNET_API_ERROR_TUNNEL_EXIST: - return clib_error_return (0, "tunnel already exists..."); + error = clib_error_return (0, "tunnel already exists..."); + goto done; case VNET_API_ERROR_NO_SUCH_ENTRY: - return clib_error_return (0, "tunnel does not exist..."); + error = clib_error_return (0, "tunnel does not exist..."); + goto done; default: - return clib_error_return + error = clib_error_return (0, "vnet_vxlan_gpe_add_del_tunnel returned %d", rv); + goto done; } - return 0; +done: + unformat_free (line_input); + + return error; } VLIB_CLI_COMMAND (create_vxlan_gpe_tunnel_command, static) = { diff --git a/src/vnet/vxlan/vxlan.c b/src/vnet/vxlan/vxlan.c index 849fc25d..eedc16f8 100644 --- a/src/vnet/vxlan/vxlan.c +++ b/src/vnet/vxlan/vxlan.c @@ -657,6 +657,7 @@ vxlan_add_del_tunnel_command_fn (vlib_main_t * vm, int rv; vnet_vxlan_add_del_tunnel_args_t _a, * a = &_a; u32 tunnel_sw_if_index; + clib_error_t *error = NULL; /* Cant "universally zero init" (={0}) due to GCC bug 53119 */ memset(&src, 0, sizeof src); @@ -715,7 +716,10 @@ vxlan_add_del_tunnel_command_fn (vlib_main_t * vm, { encap_fib_index = fib_table_find (fib_ip_proto (ipv6_set), tmp); if (encap_fib_index == ~0) - return clib_error_return (0, "nonexistent encap-vrf-id %d", tmp); + { + error = clib_error_return (0, "nonexistent encap-vrf-id %d", tmp); + goto done; + } } else if (unformat (line_input, "decap-next %U", unformat_decap_next, &decap_next_index, ipv4_set)) @@ -723,41 +727,72 @@ vxlan_add_del_tunnel_command_fn (vlib_main_t * vm, else if (unformat (line_input, "vni %d", &vni)) { if (vni >> 24) - return clib_error_return (0, "vni %d out of range", vni); + { + error = clib_error_return (0, "vni %d out of range", vni); + goto done; + } } else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + goto done; + } } - unformat_free (line_input); - if (src_set == 0) - return clib_error_return (0, "tunnel src address not specified"); + { + error = clib_error_return (0, "tunnel src address not specified"); + goto done; + } if (dst_set == 0) - return clib_error_return (0, "tunnel dst address not specified"); + { + error = clib_error_return (0, "tunnel dst address not specified"); + goto done; + } if (grp_set && !ip46_address_is_multicast(&dst)) - return clib_error_return (0, "tunnel group address not multicast"); + { + error = clib_error_return (0, "tunnel group address not multicast"); + goto done; + } if (grp_set == 0 && ip46_address_is_multicast(&dst)) - return clib_error_return (0, "dst address must be unicast"); + { + error = clib_error_return (0, "dst address must be unicast"); + goto done; + } if (grp_set && mcast_sw_if_index == ~0) - return clib_error_return (0, "tunnel nonexistent multicast device"); + { + error = clib_error_return (0, "tunnel nonexistent multicast device"); + goto done; + } if (ipv4_set && ipv6_set) - return clib_error_return (0, "both IPv4 and IPv6 addresses specified"); + { + error = clib_error_return (0, "both IPv4 and IPv6 addresses specified"); + goto done; + } if (ip46_address_cmp(&src, &dst) == 0) - return clib_error_return (0, "src and dst addresses are identical"); + { + error = clib_error_return (0, "src and dst addresses are identical"); + goto done; + } if (decap_next_index == ~0) - return clib_error_return (0, "next node not found"); + { + error = clib_error_return (0, "next node not found"); + goto done; + } if (vni == 0) - return clib_error_return (0, "vni not specified"); + { + error = clib_error_return (0, "vni not specified"); + goto done; + } memset (a, 0, sizeof (*a)); @@ -779,17 +814,23 @@ vxlan_add_del_tunnel_command_fn (vlib_main_t * vm, break; case VNET_API_ERROR_TUNNEL_EXIST: - return clib_error_return (0, "tunnel already exists..."); + error = clib_error_return (0, "tunnel already exists..."); + goto done; case VNET_API_ERROR_NO_SUCH_ENTRY: - return clib_error_return (0, "tunnel does not exist..."); + error = clib_error_return (0, "tunnel does not exist..."); + goto done; default: - return clib_error_return + error = clib_error_return (0, "vnet_vxlan_add_del_tunnel returned %d", rv); + goto done; } - return 0; +done: + unformat_free (line_input); + + return error; } /*? @@ -912,6 +953,8 @@ set_ip_vxlan_bypass (u32 is_ip6, vnet_int_vxlan_bypass_mode (sw_if_index, is_ip6, is_enable); done: + unformat_free (line_input); + return error; } diff --git a/src/vpp/app/l2t.c b/src/vpp/app/l2t.c index 45dd2807..e1eda155 100644 --- a/src/vpp/app/l2t.c +++ b/src/vpp/app/l2t.c @@ -254,6 +254,7 @@ l2tp_session_add_command_fn (vlib_main_t * vm, u32 local_session_id = 1, remote_session_id = 1; int our_address_set = 0, client_address_set = 0; int l2_sublayer_present = 0; + clib_error_t *error = NULL; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -290,8 +291,12 @@ l2tp_session_add_command_fn (vlib_main_t * vm, else if (unformat (line_input, "l2-sublayer-present")) l2_sublayer_present = 1; else - return clib_error_return (0, "parse error: '%U'", - format_unformat_error, line_input); + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + unformat_free (line_input); + return error; + } } unformat_free (line_input); diff --git a/src/vpp/app/vpe_cli.c b/src/vpp/app/vpe_cli.c index a26bf71f..94bdc84c 100644 --- a/src/vpp/app/vpe_cli.c +++ b/src/vpp/app/vpe_cli.c @@ -36,6 +36,7 @@ virtual_ip_cmd_fn_command_fn (vlib_main_t * vm, mac_addr_t *mac_addrs = 0; u32 sw_if_index; u32 i; + clib_error_t *error = NULL; next_hops = NULL; rpaths = NULL; @@ -49,7 +50,11 @@ virtual_ip_cmd_fn_command_fn (vlib_main_t * vm, if (!unformat (line_input, "%U %U", unformat_ip4_address, &prefix.fp_addr.ip4, unformat_vnet_sw_interface, vnm, &sw_if_index)) - goto barf; + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { @@ -67,13 +72,18 @@ virtual_ip_cmd_fn_command_fn (vlib_main_t * vm, } else { - barf: - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; } } + if (vec_len (mac_addrs) == 0 || vec_len (mac_addrs) != vec_len (next_hops)) - goto barf; + { + error = clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + goto done; + } /* Create / delete special interface route /32's */ @@ -100,10 +110,12 @@ virtual_ip_cmd_fn_command_fn (vlib_main_t * vm, &prefix, FIB_SOURCE_CLI, FIB_ENTRY_FLAG_NONE, rpaths); +done: vec_free (mac_addrs); vec_free (next_hops); + unformat_free (line_input); - return 0; + return error; } /* *INDENT-OFF* */ -- cgit 1.2.3-korg From 26cd8c129567b48ed0e3610293251ca78fa67103 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Thu, 23 Feb 2017 17:11:26 -0500 Subject: VPP-650: handle buffer failure in vlib_buffer_copy(...) Change-Id: I6aac48d780fcd935818221044eae50067f225175 Signed-off-by: Dave Barach --- src/vlib/buffer_funcs.h | 11 ++++++++++- src/vnet/dpo/replicate_dpo.c | 32 +++++++++++++++++++++++++++++++- src/vnet/lawful-intercept/node.c | 18 ++++++++++++++---- src/vnet/span/node.c | 12 ++++++++---- 4 files changed, 63 insertions(+), 10 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index fd051de5..0b583a61 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -489,7 +489,15 @@ vlib_buffer_copy (vlib_main_t * vm, vlib_buffer_t * b) vec_validate (new_buffers, n_buffers - 1); n_alloc = vlib_buffer_alloc (vm, new_buffers, n_buffers); - ASSERT (n_alloc == n_buffers); + + /* No guarantee that we'll get all the buffers we asked for */ + if (PREDICT_FALSE (n_alloc < n_buffers)) + { + if (n_alloc > 0) + vlib_buffer_free (vm, new_buffers, n_alloc); + vec_free (new_buffers); + return 0; + } /* 1st segment */ s = b; @@ -518,6 +526,7 @@ vlib_buffer_copy (vlib_main_t * vm, vlib_buffer_t * b) d->flags = s->flags & flag_mask; } + vec_free (new_buffers); return fd; } diff --git a/src/vnet/dpo/replicate_dpo.c b/src/vnet/dpo/replicate_dpo.c index c779516f..a67b19c8 100644 --- a/src/vnet/dpo/replicate_dpo.c +++ b/src/vnet/dpo/replicate_dpo.c @@ -34,6 +34,21 @@ #define REP_DBG(_p, _fmt, _args...) #endif +#define foreach_replicate_dpo_error \ +_(BUFFER_ALLOCATION_FAILURE, "Buffer Allocation Failure") + +typedef enum { +#define _(sym,str) REPLICATE_DPO_ERROR_##sym, + foreach_replicate_dpo_error +#undef _ + REPLICATE_DPO_N_ERROR, +} replicate_dpo_error_t; + +static char * replicate_dpo_error_strings[] = { +#define _(sym,string) string, + foreach_replicate_dpo_error +#undef _ +}; /** * Pool of all DPOs. It's not static so the DP can have fast access @@ -678,8 +693,17 @@ replicate_inline (vlib_main_t * vm, vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); } - /* Make a copy */ + /* Make a copy. This can fail, so deal with it. */ c0 = vlib_buffer_copy(vm, b0); + if (PREDICT_FALSE (c0 == 0)) + { + vlib_node_increment_counter + (vm, node->node_index, + REPLICATE_DPO_ERROR_BUFFER_ALLOCATION_FAILURE, + 1); + continue; + } + ci0 = vlib_get_buffer_index(vm, c0); to_next[0] = ci0; @@ -738,6 +762,9 @@ VLIB_REGISTER_NODE (ip4_replicate_node) = { .name = "ip4-replicate", .vector_size = sizeof (u32), + .n_errors = ARRAY_LEN(replicate_dpo_error_strings), + .error_strings = replicate_dpo_error_strings, + .format_trace = format_replicate_trace, .n_next_nodes = 1, .next_nodes = { @@ -761,6 +788,9 @@ VLIB_REGISTER_NODE (ip6_replicate_node) = { .name = "ip6-replicate", .vector_size = sizeof (u32), + .n_errors = ARRAY_LEN(replicate_dpo_error_strings), + .error_strings = replicate_dpo_error_strings, + .format_trace = format_replicate_trace, .n_next_nodes = 1, .next_nodes = { diff --git a/src/vnet/lawful-intercept/node.c b/src/vnet/lawful-intercept/node.c index ea0cd8ef..50c76ec5 100644 --- a/src/vnet/lawful-intercept/node.c +++ b/src/vnet/lawful-intercept/node.c @@ -42,9 +42,10 @@ static u8 * format_li_hit_trace (u8 * s, va_list * args) vlib_node_registration_t li_hit_node; -#define foreach_li_hit_error \ -_(HITS, "LI packets processed") \ -_(NO_COLLECTOR, "No collector configured") +#define foreach_li_hit_error \ +_(HITS, "LI packets processed") \ +_(NO_COLLECTOR, "No collector configured") \ +_(BUFFER_ALLOCATION_FAILURE, "Buffer allocation failure") typedef enum { #define _(sym,str) LI_HIT_ERROR_##sym, @@ -197,8 +198,16 @@ li_hit_node_fn (vlib_main_t * vm, b0 = vlib_get_buffer (vm, bi0); if (PREDICT_TRUE(to_int_next != 0)) { - /* Make an intercept copy */ + /* Make an intercept copy. This can fail. */ c0 = vlib_buffer_copy (vm, b0); + + if (PREDICT_FALSE (c0 == 0)) + { + vlib_node_increment_counter + (vm, node->node_index, + LI_HIT_ERROR_BUFFER_ALLOCATION_FAILURE, 1); + goto skip; + } vlib_buffer_advance(c0, -sizeof(*iu0)); @@ -225,6 +234,7 @@ li_hit_node_fn (vlib_main_t * vm, to_int_next++; } + skip: if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) && (b0->flags & VLIB_BUFFER_IS_TRACED))) { diff --git a/src/vnet/span/node.c b/src/vnet/span/node.c index 50d642c2..5037c120 100644 --- a/src/vnet/span/node.c +++ b/src/vnet/span/node.c @@ -83,11 +83,15 @@ span_mirror (vlib_main_t * vm, span_interface_t * si0, vlib_buffer_t * b0, mirror_frames[i] = vnet_get_frame_to_sw_interface (vnm, i); to_mirror_next = vlib_frame_vector_args (mirror_frames[i]); to_mirror_next += mirror_frames[i]->n_vectors; + /* This can fail */ c0 = vlib_buffer_copy (vm, b0); - vnet_buffer (c0)->sw_if_index[VLIB_TX] = i; - c0->flags |= VNET_BUFFER_SPAN_CLONE; - to_mirror_next[0] = vlib_get_buffer_index (vm, c0); - mirror_frames[i]->n_vectors++; + if (PREDICT_TRUE(c0 != 0)) + { + vnet_buffer (c0)->sw_if_index[VLIB_TX] = i; + c0->flags |= VNET_BUFFER_SPAN_CLONE; + to_mirror_next[0] = vlib_get_buffer_index (vm, c0); + mirror_frames[i]->n_vectors++; + } })); /* *INDENT-ON* */ } -- cgit 1.2.3-korg From a83bc30d064e70a4e20d2728e14e0b59667b17c5 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Sat, 25 Feb 2017 16:38:12 -0500 Subject: Load plugins in alphabetical order API traces contain absolute message numbers. Loading plugins in directory (vs. alphabetical) order makes trace replay fragile. Change-Id: I46b3a3b6a9843a383d42269fca0cf5a789486eaf Signed-off-by: Dave Barach --- src/vlib/unix/plugin.c | 64 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 9 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/unix/plugin.c b/src/vlib/unix/plugin.c index 15a2bcbb..e9846e3b 100644 --- a/src/vlib/unix/plugin.c +++ b/src/vlib/unix/plugin.c @@ -220,6 +220,15 @@ split_plugin_path (plugin_main_t * pm) return rv; } +static int +plugin_name_sort_cmp (void *a1, void *a2) +{ + plugin_info_t *p1 = a1; + plugin_info_t *p2 = a2; + + return strcmp ((char *) p1->name, (char *) p2->name); +} + int vlib_load_new_plugins (plugin_main_t * pm, int from_early_init) { @@ -229,6 +238,7 @@ vlib_load_new_plugins (plugin_main_t * pm, int from_early_init) uword *p; plugin_info_t *pi; u8 **plugin_path; + u32 *load_fail_indices = 0; int i; plugin_path = split_plugin_path (pm); @@ -271,22 +281,15 @@ vlib_load_new_plugins (plugin_main_t * pm, int from_early_init) goto ignore; plugin_name = format (0, "%s%c", entry->d_name, 0); + /* Have we seen this plugin already? */ p = hash_get_mem (pm->plugin_by_name_hash, plugin_name); if (p == 0) { + /* No, add it to the plugin vector */ vec_add2 (pm->plugin_info, pi, 1); pi->name = plugin_name; pi->filename = filename; pi->file_info = statb; - - if (load_one_plugin (pm, pi, from_early_init)) - { - vec_free (plugin_name); - vec_free (filename); - _vec_len (pm->plugin_info) = vec_len (pm->plugin_info) - 1; - memset (pi, 0, sizeof (*pi)); - continue; - } hash_set_mem (pm->plugin_by_name_hash, plugin_name, pi - pm->plugin_info); } @@ -297,6 +300,49 @@ vlib_load_new_plugins (plugin_main_t * pm, int from_early_init) vec_free (plugin_path[i]); } vec_free (plugin_path); + + + /* + * Sort the plugins by name. This is important. + * API traces contain absolute message numbers. + * Loading plugins in directory (vs. alphabetical) order + * makes trace replay incredibly fragile. + */ + vec_sort_with_function (pm->plugin_info, plugin_name_sort_cmp); + + /* + * Attempt to load the plugins + */ + for (i = 0; i < vec_len (pm->plugin_info); i++) + { + pi = vec_elt_at_index (pm->plugin_info, i); + + if (load_one_plugin (pm, pi, from_early_init)) + { + /* Make a note of any which fail to load */ + vec_add1 (load_fail_indices, i); + hash_unset_mem (pm->plugin_by_name_hash, pi->name); + vec_free (pi->name); + vec_free (pi->filename); + } + } + + /* Remove plugin info vector elements corresponding to load failures */ + if (vec_len (load_fail_indices) > 0) + { + for (i = vec_len (load_fail_indices) - 1; i >= 0; i--) + vec_delete (pm->plugin_info, 1, load_fail_indices[i]); + vec_free (load_fail_indices); + } + + /* Recreate the plugin name hash */ + for (i = 0; i < vec_len (pm->plugin_info); i++) + { + pi = vec_elt_at_index (pm->plugin_info, i); + hash_unset_mem (pm->plugin_by_name_hash, pi->name); + hash_set_mem (pm->plugin_by_name_hash, pi->name, pi - pm->plugin_info); + } + return 0; } -- cgit 1.2.3-korg From ff542707733102b2573dca2496ea427b3dba3b10 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 27 Feb 2017 11:29:20 +0100 Subject: vlib: add VLIB_BUFFER_EXT_HDR_VALID flag Change-Id: If56c66dd12eded1cc997087de5fd1b975766c4e2 Signed-off-by: Damjan Marion --- src/vlib/buffer.h | 2 ++ src/vnet/buffer.h | 3 --- src/vnet/devices/dpdk/device.c | 2 +- src/vnet/devices/dpdk/init.c | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index fffb50c8..8ea79502 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -77,6 +77,7 @@ typedef struct
VLIB_BUFFER_REPL_FAIL: packet replication failure
VLIB_BUFFER_RECYCLE: as it says
VLIB_BUFFER_FLOW_REPORT: buffer is a flow report, +
VLIB_BUFFER_EXT_HDR_VALID: buffer contains valid external buffer manager header, set to avoid adding it to a flow report
VLIB_BUFFER_FLAG_USER(n): user-defined bit N */ @@ -88,6 +89,7 @@ typedef struct #define VLIB_BUFFER_REPL_FAIL (1 << 4) #define VLIB_BUFFER_RECYCLE (1 << 5) #define VLIB_BUFFER_FLOW_REPORT (1 << 6) +#define VLIB_BUFFER_EXT_HDR_VALID (1 << 7) /* User defined buffer flags. */ #define LOG2_VLIB_BUFFER_FLAG_USER(n) (32 - (n)) diff --git a/src/vnet/buffer.h b/src/vnet/buffer.h index 45fc352a..f1cc6371 100644 --- a/src/vnet/buffer.h +++ b/src/vnet/buffer.h @@ -61,9 +61,6 @@ #define ETH_BUFFER_VLAN_BITS (ETH_BUFFER_VLAN_1_DEEP | \ ETH_BUFFER_VLAN_2_DEEP) -#define LOG2_VNET_BUFFER_RTE_MBUF_VALID LOG2_VLIB_BUFFER_FLAG_USER(5) -#define VNET_BUFFER_RTE_MBUF_VALID (1 << LOG2_VNET_BUFFER_RTE_MBUF_VALID) - #define LOG2_BUFFER_HANDOFF_NEXT_VALID LOG2_VLIB_BUFFER_FLAG_USER(6) #define BUFFER_HANDOFF_NEXT_VALID (1 << LOG2_BUFFER_HANDOFF_NEXT_VALID) diff --git a/src/vnet/devices/dpdk/device.c b/src/vnet/devices/dpdk/device.c index cd32389c..c9d9a567 100644 --- a/src/vnet/devices/dpdk/device.c +++ b/src/vnet/devices/dpdk/device.c @@ -159,7 +159,7 @@ dpdk_validate_rte_mbuf (vlib_main_t * vm, vlib_buffer_t * b, /* buffer is coming from non-dpdk source so we need to init rte_mbuf header */ - if (PREDICT_FALSE ((b->flags & VNET_BUFFER_RTE_MBUF_VALID) == 0)) + if (PREDICT_FALSE ((b->flags & VLIB_BUFFER_EXT_HDR_VALID) == 0)) { vlib_buffer_t *b2 = b; last_mb = mb = rte_mbuf_from_vlib_buffer (b2); diff --git a/src/vnet/devices/dpdk/init.c b/src/vnet/devices/dpdk/init.c index c50c0659..ec008c20 100755 --- a/src/vnet/devices/dpdk/init.c +++ b/src/vnet/devices/dpdk/init.c @@ -1790,7 +1790,7 @@ dpdk_init (vlib_main_t * vm) /* Default vlib_buffer_t flags, DISABLES tcp/udp checksumming... */ dm->buffer_flags_template = - (VLIB_BUFFER_TOTAL_LENGTH_VALID | VNET_BUFFER_RTE_MBUF_VALID + (VLIB_BUFFER_TOTAL_LENGTH_VALID | VLIB_BUFFER_EXT_HDR_VALID | IP_BUFFER_L4_CHECKSUM_COMPUTED | IP_BUFFER_L4_CHECKSUM_CORRECT); dm->stat_poll_interval = DPDK_STATS_POLL_INTERVAL; -- cgit 1.2.3-korg From c47ed032c6d036a9f942fc9ced48874fad55b48c Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 25 Jan 2017 14:18:03 +0100 Subject: vlib: add buffer cloning support Change-Id: I50070611af15b2b4cc29664a8bee4f821ac3c835 Signed-off-by: Damjan Marion --- src/scripts/vnet/mcast/ip4 | 19 +-- src/vlib/buffer.c | 254 ++++++++++------------------------------- src/vlib/buffer.h | 4 +- src/vlib/buffer_funcs.h | 113 +++++++++++++++++- src/vnet/devices/dpdk/buffer.c | 41 +++++-- src/vnet/devices/dpdk/device.c | 11 +- src/vnet/dpo/replicate_dpo.c | 76 ++++++------ src/vnet/dpo/replicate_dpo.h | 3 + 8 files changed, 256 insertions(+), 265 deletions(-) (limited to 'src/vlib') diff --git a/src/scripts/vnet/mcast/ip4 b/src/scripts/vnet/mcast/ip4 index 69f1ee00..eb6bab27 100644 --- a/src/scripts/vnet/mcast/ip4 +++ b/src/scripts/vnet/mcast/ip4 @@ -2,7 +2,7 @@ packet-generator new { name x limit 1 node ip4-input - size 64-64 + size 512-512 no-recycle data { ICMP: 1.0.0.2 -> 232.1.1.1 @@ -11,12 +11,15 @@ packet-generator new { } } -trace add pg-input 100 -loop create -loop create -set int state loop0 up -set int state loop1 up +create packet-generator interface pg1 +create packet-generator interface pg2 +create packet-generator interface pg3 + +set int state pg1 up +set int state pg2 up +set int state pg3 up ip mroute add 232.1.1.1 via pg0 Accept -ip mroute add 232.1.1.1 via loop0 Forward -ip mroute add 232.1.1.1 via loop1 Forward +ip mroute add 232.1.1.1 via pg1 Forward +ip mroute add 232.1.1.1 via pg2 Forward +ip mroute add 232.1.1.1 via pg3 Forward diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index 95b4344f..4f5eb09d 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -68,8 +68,9 @@ format_vlib_buffer (u8 * s, va_list * args) vlib_buffer_t *b = va_arg (*args, vlib_buffer_t *); uword indent = format_get_indent (s); - s = format (s, "current data %d, length %d, free-list %d", - b->current_data, b->current_length, b->free_list_index); + s = format (s, "current data %d, length %d, free-list %d, clone-count %u", + b->current_data, b->current_length, b->free_list_index, + b->n_add_refs); if (b->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID) s = format (s, ", totlen-nifb %d", @@ -84,8 +85,10 @@ format_vlib_buffer (u8 * s, va_list * args) u32 next_buffer = b->next_buffer; b = vlib_get_buffer (vm, next_buffer); - s = format (s, "\n%Unext-buffer 0x%x, segment length %d", - format_white_space, indent, next_buffer, b->current_length); + s = + format (s, "\n%Unext-buffer 0x%x, segment length %d, clone-count %u", + format_white_space, indent, next_buffer, b->current_length, + b->n_add_refs); } return s; @@ -262,7 +265,7 @@ vlib_main_t **vlib_mains; /* When dubugging validate that given buffers are either known allocated or known free. */ -static void __attribute__ ((unused)) +static void vlib_buffer_validate_alloc_free (vlib_main_t * vm, u32 * buffers, uword n_buffers, @@ -362,6 +365,7 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, /* Setup free buffer template. */ f->buffer_init_template.free_list_index = f->index; + f->buffer_init_template.n_add_refs = 0; if (is_public) { @@ -620,19 +624,11 @@ vlib_buffer_free_inline (vlib_main_t * vm, { vlib_buffer_main_t *bm = vm->buffer_main; vlib_buffer_free_list_t *fl; - static u32 *next_to_free[2]; /* smp bad */ - u32 i_next_to_free, *b, *n, *f, fi; - uword n_left; + u32 fi; int i; - static vlib_buffer_free_list_t **announce_list; - vlib_buffer_free_list_t *fl0 = 0, *fl1 = 0; - u32 bi0 = (u32) ~ 0, bi1 = (u32) ~ 0, fi0, fi1 = (u32) ~ 0; - u8 free0, free1 = 0, free_next0, free_next1; u32 (*cb) (vlib_main_t * vm, u32 * buffers, u32 n_buffers, u32 follow_buffer_next); - ASSERT (os_get_cpu_number () == 0); - cb = bm->buffer_free_callback; if (PREDICT_FALSE (cb != 0)) @@ -641,203 +637,68 @@ vlib_buffer_free_inline (vlib_main_t * vm, if (!n_buffers) return; - /* Use first buffer to get default free list. */ - { - u32 bi0 = buffers[0]; - vlib_buffer_t *b0; - - b0 = vlib_get_buffer (vm, bi0); - fl = vlib_buffer_get_buffer_free_list (vm, b0, &fi); - if (fl->buffers_added_to_freelist_function) - vec_add1 (announce_list, fl); - } - - vec_validate (next_to_free[0], n_buffers - 1); - vec_validate (next_to_free[1], n_buffers - 1); - - i_next_to_free = 0; - n_left = n_buffers; - b = buffers; - -again: - /* Verify that buffers are known allocated. */ - vlib_buffer_validate_alloc_free (vm, b, - n_left, VLIB_BUFFER_KNOWN_ALLOCATED); - - vec_add2_aligned (fl->buffers, f, n_left, CLIB_CACHE_LINE_BYTES); - - n = next_to_free[i_next_to_free]; - while (n_left >= 4) - { - vlib_buffer_t *b0, *b1, *binit0, *binit1, dummy_buffers[2]; - - bi0 = b[0]; - bi1 = b[1]; - - f[0] = bi0; - f[1] = bi1; - f += 2; - b += 2; - n_left -= 2; - - /* Prefetch buffers for next iteration. */ - vlib_prefetch_buffer_with_index (vm, b[0], WRITE); - vlib_prefetch_buffer_with_index (vm, b[1], WRITE); - - b0 = vlib_get_buffer (vm, bi0); - b1 = vlib_get_buffer (vm, bi1); - - free0 = (b0->flags & VLIB_BUFFER_RECYCLE) == 0; - free1 = (b1->flags & VLIB_BUFFER_RECYCLE) == 0; - - /* Must be before init which will over-write buffer flags. */ - if (follow_buffer_next) - { - n[0] = b0->next_buffer; - free_next0 = free0 && (b0->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; - n += free_next0; - - n[0] = b1->next_buffer; - free_next1 = free1 && (b1->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; - n += free_next1; - } - else - free_next0 = free_next1 = 0; - - /* Must be before init which will over-write buffer free list. */ - fi0 = b0->free_list_index; - fi1 = b1->free_list_index; - - if (PREDICT_FALSE (fi0 != fi || fi1 != fi)) - goto slow_path_x2; - - binit0 = free0 ? b0 : &dummy_buffers[0]; - binit1 = free1 ? b1 : &dummy_buffers[1]; - - vlib_buffer_init_two_for_free_list (binit0, binit1, fl); - continue; - - slow_path_x2: - /* Backup speculation. */ - f -= 2; - n -= free_next0 + free_next1; - - _vec_len (fl->buffers) = f - fl->buffers; - - fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); - fl1 = pool_elt_at_index (bm->buffer_free_list_pool, fi1); - - vlib_buffer_add_to_free_list (vm, fl0, bi0, free0); - if (PREDICT_FALSE (fl0->buffers_added_to_freelist_function != 0)) - { - int i; - for (i = 0; i < vec_len (announce_list); i++) - if (fl0 == announce_list[i]) - goto no_fl0; - vec_add1 (announce_list, fl0); - } - no_fl0: - if (PREDICT_FALSE (fl1->buffers_added_to_freelist_function != 0)) - { - int i; - for (i = 0; i < vec_len (announce_list); i++) - if (fl1 == announce_list[i]) - goto no_fl1; - vec_add1 (announce_list, fl1); - } - - no_fl1: - vlib_buffer_add_to_free_list (vm, fl1, bi1, free1); - - /* Possibly change current free list. */ - if (fi0 != fi && fi1 != fi) - { - fi = fi1; - fl = pool_elt_at_index (bm->buffer_free_list_pool, fi); - } - - vec_add2_aligned (fl->buffers, f, n_left, CLIB_CACHE_LINE_BYTES); - } - - while (n_left >= 1) + for (i = 0; i < n_buffers; i++) { - vlib_buffer_t *b0, *binit0, dummy_buffers[1]; + vlib_buffer_t *b; + u32 bi = buffers[i]; - bi0 = b[0]; - f[0] = bi0; - f += 1; - b += 1; - n_left -= 1; - - b0 = vlib_get_buffer (vm, bi0); + b = vlib_get_buffer (vm, bi); - free0 = (b0->flags & VLIB_BUFFER_RECYCLE) == 0; + fl = vlib_buffer_get_buffer_free_list (vm, b, &fi); - /* Must be before init which will over-write buffer flags. */ - if (follow_buffer_next) + /* The only current use of this callback: multicast recycle */ + if (PREDICT_FALSE (fl->buffers_added_to_freelist_function != 0)) { - n[0] = b0->next_buffer; - free_next0 = free0 && (b0->flags & VLIB_BUFFER_NEXT_PRESENT) != 0; - n += free_next0; + int j; + + vlib_buffer_add_to_free_list + (vm, fl, buffers[i], (b->flags & VLIB_BUFFER_RECYCLE) == 0); + + for (j = 0; j < vec_len (bm->announce_list); j++) + { + if (fl == bm->announce_list[j]) + goto already_announced; + } + vec_add1 (bm->announce_list, fl); + already_announced: + ; } else - free_next0 = 0; - - /* Must be before init which will over-write buffer free list. */ - fi0 = b0->free_list_index; - - if (PREDICT_FALSE (fi0 != fi)) - goto slow_path_x1; - - binit0 = free0 ? b0 : &dummy_buffers[0]; - - vlib_buffer_init_for_free_list (binit0, fl); - continue; - - slow_path_x1: - /* Backup speculation. */ - f -= 1; - n -= free_next0; - - _vec_len (fl->buffers) = f - fl->buffers; - - fl0 = pool_elt_at_index (bm->buffer_free_list_pool, fi0); - - vlib_buffer_add_to_free_list (vm, fl0, bi0, free0); - if (PREDICT_FALSE (fl0->buffers_added_to_freelist_function != 0)) { - int i; - for (i = 0; i < vec_len (announce_list); i++) - if (fl0 == announce_list[i]) - goto no_fl00; - vec_add1 (announce_list, fl0); + if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_RECYCLE) == 0)) + { + u32 flags, next; + + do + { + vlib_buffer_t *nb = vlib_get_buffer (vm, bi); + flags = nb->flags; + next = nb->next_buffer; + if (nb->n_add_refs) + nb->n_add_refs--; + else + { + vlib_buffer_validate_alloc_free (vm, &bi, 1, + VLIB_BUFFER_KNOWN_ALLOCATED); + vlib_buffer_add_to_free_list (vm, fl, bi, 1); + } + bi = next; + } + while (follow_buffer_next + && (flags & VLIB_BUFFER_NEXT_PRESENT)); + + } } - - no_fl00: - fi = fi0; - fl = pool_elt_at_index (bm->buffer_free_list_pool, fi); - - vec_add2_aligned (fl->buffers, f, n_left, CLIB_CACHE_LINE_BYTES); } - - if (follow_buffer_next && ((n_left = n - next_to_free[i_next_to_free]) > 0)) - { - b = next_to_free[i_next_to_free]; - i_next_to_free ^= 1; - goto again; - } - - _vec_len (fl->buffers) = f - fl->buffers; - - if (vec_len (announce_list)) + if (vec_len (bm->announce_list)) { vlib_buffer_free_list_t *fl; - for (i = 0; i < vec_len (announce_list); i++) + for (i = 0; i < vec_len (bm->announce_list); i++) { - fl = announce_list[i]; + fl = bm->announce_list[i]; fl->buffers_added_to_freelist_function (vm, fl); } - _vec_len (announce_list) = 0; + _vec_len (bm->announce_list) = 0; } } @@ -922,6 +783,7 @@ vlib_packet_template_init (vlib_main_t * vm, fl->buffer_init_template.current_data = 0; fl->buffer_init_template.current_length = n_packet_data_bytes; fl->buffer_init_template.flags = 0; + fl->buffer_init_template.n_add_refs = 0; vlib_worker_thread_barrier_release (vm); } diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 8ea79502..b4015b30 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -119,7 +119,9 @@ typedef struct feature node */ - u8 dont_waste_me[3]; /**< Available space in the (precious) + u8 n_add_refs; /**< Number of additional references to this buffer. */ + + u8 dont_waste_me[2]; /**< Available space in the (precious) first 32 octets of buffer metadata Before allocating any of it, discussion required! */ diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 0b583a61..e0fde5f2 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -530,6 +530,110 @@ vlib_buffer_copy (vlib_main_t * vm, vlib_buffer_t * b) return fd; } +/** \brief Create multiple clones of buffer and store them in the supplied array + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param src_buffer - (u32) source buffer index + @param buffers - (u32 * ) buffer index array + @param n_buffers - (u8) number of buffer clones requested + @param head_end_offset - (u16) offset relative to current position + where packet head ends + @return - (u8) number of buffers actually cloned, may be + less than the number requested or zero +*/ + +always_inline u8 +vlib_buffer_clone (vlib_main_t * vm, u32 src_buffer, u32 * buffers, + u8 n_buffers, u16 head_end_offset) +{ + u8 i; + vlib_buffer_t *s = vlib_get_buffer (vm, src_buffer); + + ASSERT (s->n_add_refs == 0); + ASSERT (n_buffers); + + if (s->current_length <= head_end_offset + CLIB_CACHE_LINE_BYTES * 2) + { + buffers[0] = src_buffer; + for (i = 1; i < n_buffers; i++) + { + vlib_buffer_t *d; + d = vlib_buffer_copy (vm, s); + if (d == 0) + return i; + buffers[i] = vlib_get_buffer_index (vm, d); + + } + return n_buffers; + } + + n_buffers = vlib_buffer_alloc_from_free_list (vm, buffers, n_buffers, + s->free_list_index); + if (PREDICT_FALSE (n_buffers == 0)) + { + buffers[0] = src_buffer; + return 1; + } + + for (i = 0; i < n_buffers; i++) + { + vlib_buffer_t *d = vlib_get_buffer (vm, buffers[i]); + d->current_data = s->current_data; + d->current_length = head_end_offset; + d->free_list_index = s->free_list_index; + d->total_length_not_including_first_buffer = + s->total_length_not_including_first_buffer + s->current_length - + head_end_offset; + d->flags = s->flags | VLIB_BUFFER_NEXT_PRESENT; + d->flags &= ~VLIB_BUFFER_EXT_HDR_VALID; + clib_memcpy (d->opaque, s->opaque, sizeof (s->opaque)); + clib_memcpy (vlib_buffer_get_current (d), vlib_buffer_get_current (s), + head_end_offset); + d->next_buffer = src_buffer; + } + vlib_buffer_advance (s, head_end_offset); + s->n_add_refs = n_buffers - 1; + while (s->flags & VLIB_BUFFER_NEXT_PRESENT) + { + s = vlib_get_buffer (vm, s->next_buffer); + s->n_add_refs = n_buffers - 1; + } + + return n_buffers; +} + +/** \brief Attach cloned tail to the buffer + + @param vm - (vlib_main_t *) vlib main data structure pointer + @param head - (vlib_buffer_t *) head buffer + @param tail - (Vlib buffer_t *) tail buffer to clone and attach to head +*/ + +always_inline void +vlib_buffer_attach_clone (vlib_main_t * vm, vlib_buffer_t * head, + vlib_buffer_t * tail) +{ + ASSERT ((head->flags & VLIB_BUFFER_NEXT_PRESENT) == 0); + ASSERT (head->free_list_index == tail->free_list_index); + + head->flags |= VLIB_BUFFER_NEXT_PRESENT; + head->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID; + head->flags &= ~VLIB_BUFFER_EXT_HDR_VALID; + head->flags |= (tail->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID); + head->next_buffer = vlib_get_buffer_index (vm, tail); + head->total_length_not_including_first_buffer = tail->current_length + + tail->total_length_not_including_first_buffer; + +next_segment: + __sync_add_and_fetch (&tail->n_add_refs, 1); + + if (tail->flags & VLIB_BUFFER_NEXT_PRESENT) + { + tail = vlib_get_buffer (vm, tail->next_buffer); + goto next_segment; + } +} + /* Initializes the buffer as an empty packet with no chained buffers. */ always_inline void vlib_buffer_chain_init (vlib_buffer_t * first) @@ -695,7 +799,8 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * dst, _(flags); _(free_list_index); #undef _ - ASSERT (dst->total_length_not_including_first_buffer == 0); + dst->total_length_not_including_first_buffer = 0; + ASSERT (dst->n_add_refs == 0); } always_inline void @@ -727,8 +832,10 @@ vlib_buffer_init_two_for_free_list (vlib_buffer_t * dst0, _(flags); _(free_list_index); #undef _ - ASSERT (dst0->total_length_not_including_first_buffer == 0); - ASSERT (dst1->total_length_not_including_first_buffer == 0); + dst0->total_length_not_including_first_buffer = 0; + dst1->total_length_not_including_first_buffer = 0; + ASSERT (dst0->n_add_refs == 0); + ASSERT (dst1->n_add_refs == 0); } #if CLIB_DEBUG > 0 diff --git a/src/vnet/devices/dpdk/buffer.c b/src/vnet/devices/dpdk/buffer.c index 007093e4..f95d4cb5 100644 --- a/src/vnet/devices/dpdk/buffer.c +++ b/src/vnet/devices/dpdk/buffer.c @@ -79,20 +79,46 @@ STATIC_ASSERT (VLIB_BUFFER_PRE_DATA_SIZE == RTE_PKTMBUF_HEADROOM, "VLIB_BUFFER_PRE_DATA_SIZE must be equal to RTE_PKTMBUF_HEADROOM"); +static_always_inline void +dpdk_rte_pktmbuf_free (vlib_main_t * vm, vlib_buffer_t * b) +{ + vlib_buffer_t *hb = b; + struct rte_mbuf *mb; + u32 next, flags; + mb = rte_mbuf_from_vlib_buffer (hb); + +next: + flags = b->flags; + next = b->next_buffer; + mb = rte_mbuf_from_vlib_buffer (b); + + if (PREDICT_FALSE (b->n_add_refs)) + { + rte_mbuf_refcnt_update (mb, b->n_add_refs); + b->n_add_refs = 0; + } + + rte_pktmbuf_free_seg (mb); + + if (flags & VLIB_BUFFER_NEXT_PRESENT) + { + b = vlib_get_buffer (vm, next); + goto next; + } +} + static void del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) { u32 i; - struct rte_mbuf *mb; vlib_buffer_t *b; for (i = 0; i < vec_len (f->buffers); i++) { b = vlib_get_buffer (vm, f->buffers[i]); - mb = rte_mbuf_from_vlib_buffer (b); - ASSERT (rte_mbuf_refcnt_read (mb) == 1); - rte_pktmbuf_free (mb); + dpdk_rte_pktmbuf_free (vm, b); } + vec_free (f->name); vec_free (f->buffers); } @@ -325,7 +351,6 @@ vlib_buffer_free_inline (vlib_main_t * vm, for (i = 0; i < n_buffers; i++) { vlib_buffer_t *b; - struct rte_mbuf *mb; b = vlib_get_buffer (vm, buffers[i]); @@ -351,11 +376,7 @@ vlib_buffer_free_inline (vlib_main_t * vm, else { if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_RECYCLE) == 0)) - { - mb = rte_mbuf_from_vlib_buffer (b); - ASSERT (rte_mbuf_refcnt_read (mb) == 1); - rte_pktmbuf_free (mb); - } + dpdk_rte_pktmbuf_free (vm, b); } } if (vec_len (bm->announce_list)) diff --git a/src/vnet/devices/dpdk/device.c b/src/vnet/devices/dpdk/device.c index c9d9a567..17397900 100644 --- a/src/vnet/devices/dpdk/device.c +++ b/src/vnet/devices/dpdk/device.c @@ -168,13 +168,11 @@ dpdk_validate_rte_mbuf (vlib_main_t * vm, vlib_buffer_t * b, { b2 = vlib_get_buffer (vm, b2->next_buffer); mb = rte_mbuf_from_vlib_buffer (b2); - last_mb->next = mb; - last_mb = mb; rte_pktmbuf_reset (mb); } } - first_mb = mb = rte_mbuf_from_vlib_buffer (b); + last_mb = first_mb = mb = rte_mbuf_from_vlib_buffer (b); first_mb->nb_segs = 1; mb->data_len = b->current_length; mb->pkt_len = maybe_multiseg ? vlib_buffer_length_in_chain (vm, b) : @@ -185,10 +183,17 @@ dpdk_validate_rte_mbuf (vlib_main_t * vm, vlib_buffer_t * b, { b = vlib_get_buffer (vm, b->next_buffer); mb = rte_mbuf_from_vlib_buffer (b); + last_mb->next = mb; + last_mb = mb; mb->data_len = b->current_length; mb->pkt_len = b->current_length; mb->data_off = VLIB_BUFFER_PRE_DATA_SIZE + b->current_data; first_mb->nb_segs++; + if (PREDICT_FALSE (b->n_add_refs)) + { + rte_mbuf_refcnt_update (mb, b->n_add_refs); + b->n_add_refs = 0; + } } } diff --git a/src/vnet/dpo/replicate_dpo.c b/src/vnet/dpo/replicate_dpo.c index a67b19c8..a9f334be 100644 --- a/src/vnet/dpo/replicate_dpo.c +++ b/src/vnet/dpo/replicate_dpo.c @@ -625,6 +625,7 @@ replicate_inline (vlib_main_t * vm, vlib_frame_t * frame) { vlib_combined_counter_main_t * cm = &replicate_main.repm_counters; + replicate_main_t * rm = &replicate_main; u32 n_left_from, * from, * to_next, next_index; u32 cpu_index = os_get_cpu_number(); @@ -645,13 +646,11 @@ replicate_inline (vlib_main_t * vm, const replicate_t *rep0; vlib_buffer_t * b0, *c0; const dpo_id_t *dpo0; + u8 num_cloned; bi0 = from[0]; - to_next[0] = bi0; from += 1; - to_next += 1; n_left_from -= 1; - n_left_to_next -= 1; b0 = vlib_get_buffer (vm, bi0); repi0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX]; @@ -661,50 +660,21 @@ replicate_inline (vlib_main_t * vm, cm, cpu_index, repi0, 1, vlib_buffer_length_in_chain(vm, b0)); - /* ship the original to the first bucket */ - dpo0 = replicate_get_bucket_i(rep0, 0); - next0 = dpo0->dpoi_next_node; - vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + vec_validate (rm->clones[cpu_index], rep0->rep_n_buckets - 1); - if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) - { - replicate_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); - t->rep_index = repi0; - t->dpo = *dpo0; - } - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, - to_next, n_left_to_next, - bi0, next0); + num_cloned = vlib_buffer_clone (vm, bi0, rm->clones[cpu_index], rep0->rep_n_buckets, 128); - /* ship copies to the rest of the buckets */ - for (bucket = 1; bucket < rep0->rep_n_buckets; bucket++) - { - /* - * After the enqueue of the first buffer, and of all subsequent - * buffers in this loop, it is possible that we over-flow the - * frame of the to-next node. When this happens we need to 'put' - * that full frame to the node and get a fresh empty one. - * Note that these are macros with side effects that change - * to_next & n_left_to_next - */ - if (PREDICT_FALSE(0 == n_left_to_next)) - { - vlib_put_next_frame (vm, node, next_index, n_left_to_next); - vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); - } + if (num_cloned != rep0->rep_n_buckets) + { + vlib_node_increment_counter + (vm, node->node_index, + REPLICATE_DPO_ERROR_BUFFER_ALLOCATION_FAILURE, 1); + } - /* Make a copy. This can fail, so deal with it. */ - c0 = vlib_buffer_copy(vm, b0); - if (PREDICT_FALSE (c0 == 0)) - { - vlib_node_increment_counter - (vm, node->node_index, - REPLICATE_DPO_ERROR_BUFFER_ALLOCATION_FAILURE, - 1); - continue; - } - - ci0 = vlib_get_buffer_index(vm, c0); + for (bucket = 0; bucket < num_cloned; bucket++) + { + ci0 = rm->clones[cpu_index][bucket]; + c0 = vlib_get_buffer(vm, ci0); to_next[0] = ci0; to_next += 1; @@ -724,7 +694,13 @@ replicate_inline (vlib_main_t * vm, vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, ci0, next0); + if (PREDICT_FALSE (n_left_to_next == 0)) + { + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + } } + vec_reset_length (rm->clones[cpu_index]); } vlib_put_next_frame (vm, node, next_index, n_left_to_next); @@ -797,3 +773,15 @@ VLIB_REGISTER_NODE (ip6_replicate_node) = { [0] = "error-drop", }, }; + +clib_error_t * +replicate_dpo_init (vlib_main_t * vm) +{ + replicate_main_t * rm = &replicate_main; + + vec_validate (rm->clones, vlib_num_workers()); + + return 0; +} + +VLIB_INIT_FUNCTION (replicate_dpo_init); diff --git a/src/vnet/dpo/replicate_dpo.h b/src/vnet/dpo/replicate_dpo.h index a564739c..77273015 100644 --- a/src/vnet/dpo/replicate_dpo.h +++ b/src/vnet/dpo/replicate_dpo.h @@ -32,6 +32,9 @@ typedef struct replicate_main_t_ { vlib_combined_counter_main_t repm_counters; + + /* per-cpu vector of cloned packets */ + u32 **clones; } replicate_main_t; extern replicate_main_t replicate_main; -- cgit 1.2.3-korg From f869028740aaebeb0375077d4d84fa07a17fff1a Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Wed, 1 Mar 2017 11:38:02 -0500 Subject: Fix buffer template copy Change-Id: If451c9cb68719fc816999b0330b9be3a0169176a Signed-off-by: Dave Barach --- src/vlib/buffer.h | 2 ++ src/vlib/buffer_funcs.h | 33 ++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 5 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index b4015b30..1f723f3b 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -61,6 +61,7 @@ typedef struct { CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + STRUCT_MARK (template_start); /* Offset within data[] that we are currently processing. If negative current header points into predata area. */ i16 current_data; /**< signed offset in data[], pre_data[] @@ -103,6 +104,7 @@ typedef struct /**< Only valid for first buffer in chain. Current length plus total length given here give total number of bytes in buffer chain. */ + STRUCT_MARK (template_end); u32 next_buffer; /**< Next buffer for this linked-list of buffers. Only valid if VLIB_BUFFER_NEXT_PRESENT flag is set. diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index e0fde5f2..f346676b 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -792,14 +792,22 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * dst, /* Make sure buffer template is sane. */ ASSERT (fl->index == fl->buffer_init_template.free_list_index); + clib_memcpy (STRUCT_MARK_PTR (dst, template_start), + STRUCT_MARK_PTR (src, template_start), + STRUCT_OFFSET_OF (vlib_buffer_t, template_end) - + STRUCT_OFFSET_OF (vlib_buffer_t, template_start)); + + /* Not in the first 16 octets. */ + dst->n_add_refs = src->n_add_refs; + /* Make sure it really worked. */ -#define _(f) dst->f = src->f +#define _(f) ASSERT (dst->f == src->f); _(current_data); _(current_length); _(flags); _(free_list_index); #undef _ - dst->total_length_not_including_first_buffer = 0; + ASSERT (dst->total_length_not_including_first_buffer == 0); ASSERT (dst->n_add_refs == 0); } @@ -825,15 +833,30 @@ vlib_buffer_init_two_for_free_list (vlib_buffer_t * dst0, /* Make sure buffer template is sane. */ ASSERT (fl->index == fl->buffer_init_template.free_list_index); + clib_memcpy (STRUCT_MARK_PTR (dst0, template_start), + STRUCT_MARK_PTR (src, template_start), + STRUCT_OFFSET_OF (vlib_buffer_t, template_end) - + STRUCT_OFFSET_OF (vlib_buffer_t, template_start)); + + clib_memcpy (STRUCT_MARK_PTR (dst1, template_start), + STRUCT_MARK_PTR (src, template_start), + STRUCT_OFFSET_OF (vlib_buffer_t, template_end) - + STRUCT_OFFSET_OF (vlib_buffer_t, template_start)); + + /* Not in the first 16 octets. */ + dst0->n_add_refs = src->n_add_refs; + dst1->n_add_refs = src->n_add_refs; + /* Make sure it really worked. */ -#define _(f) dst0->f = src->f; dst1->f = src->f +#define _(f) ASSERT (dst0->f == src->f); ASSERT( dst1->f == src->f) _(current_data); _(current_length); _(flags); _(free_list_index); #undef _ - dst0->total_length_not_including_first_buffer = 0; - dst1->total_length_not_including_first_buffer = 0; + + ASSERT (dst0->total_length_not_including_first_buffer == 0); + ASSERT (dst1->total_length_not_including_first_buffer == 0); ASSERT (dst0->n_add_refs == 0); ASSERT (dst1->n_add_refs == 0); } -- cgit 1.2.3-korg From 68b0fb0c620c7451ef1a6380c43c39de6614db51 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Tue, 28 Feb 2017 15:15:56 -0500 Subject: VPP-598: tcp stack initial commit Change-Id: I49e5ce0aae6e4ff634024387ceaf7dbc432a0351 Signed-off-by: Dave Barach Signed-off-by: Florin Coras --- src/Makefile.am | 1 + src/plugins/ioam/export-common/ioam_export.h | 2 +- src/plugins/ioam/ipfixcollector/ipfixcollector.c | 2 +- src/plugins/ioam/lib-vxlan-gpe/ioam_transit.c | 2 +- src/plugins/snat/in2out.c | 26 +- src/plugins/snat/out2in.c | 24 +- src/scripts/vnet/tcp | 18 +- src/scripts/vnet/udp | 19 + src/scripts/vnet/uri/tcp-setup.sh | 39 + src/scripts/vnet/uri/tcp_server | 4 + src/scripts/vnet/uri/udp | 19 + src/svm.am | 10 +- src/svm/ssvm.c | 16 + src/svm/ssvm.h | 18 +- src/svm/svm_fifo.c | 568 ++++++ src/svm/svm_fifo.h | 157 ++ src/svm/svm_fifo_segment.c | 193 ++ src/svm/svm_fifo_segment.h | 89 + src/svm/test_svm_fifo1.c | 361 ++++ src/uri.am | 22 + src/uri/uri_tcp_test.c | 916 +++++++++ src/uri/uri_udp_test.c | 553 ++++++ src/uri/uri_udp_test2.c | 954 +++++++++ src/uri/uritest.c | 484 +++++ src/vlib/buffer.c | 2 +- src/vlib/buffer.h | 68 + src/vlibmemory/unix_shared_memory_queue.c | 12 +- src/vlibmemory/unix_shared_memory_queue.h | 2 +- src/vnet.am | 66 +- src/vnet/api_errno.h | 21 +- src/vnet/bfd/bfd_udp.c | 4 +- src/vnet/buffer.h | 10 + src/vnet/classify/vnet_classify.c | 4 +- src/vnet/dhcp/dhcp_proxy.h | 2 +- src/vnet/flow/flow_report.h | 2 +- src/vnet/ip/ip.h | 4 +- src/vnet/ip/ip4.h | 42 +- src/vnet/ip/ip4_forward.c | 173 +- src/vnet/ip/ip4_packet.h | 26 +- src/vnet/ip/ip6.h | 44 +- src/vnet/ip/ip6_packet.h | 26 +- src/vnet/ip/punt.c | 2 +- src/vnet/ip/tcp_packet.h | 141 -- src/vnet/ip/udp.h | 315 --- src/vnet/ip/udp_error.def | 21 - src/vnet/ip/udp_format.c | 91 - src/vnet/ip/udp_init.c | 71 - src/vnet/ip/udp_local.c | 645 ------ src/vnet/ip/udp_packet.h | 65 - src/vnet/ip/udp_pg.c | 237 --- src/vnet/ipsec/ikev2.c | 2 +- src/vnet/ipsec/ikev2_cli.c | 2 +- src/vnet/ipsec/ikev2_crypto.c | 2 +- src/vnet/lisp-cp/packets.c | 65 +- src/vnet/lisp-cp/packets.h | 45 - src/vnet/lisp-gpe/interface.c | 2 +- src/vnet/lisp-gpe/lisp_gpe.h | 4 +- src/vnet/lisp-gpe/lisp_gpe_adjacency.c | 2 + src/vnet/session/application.c | 343 ++++ src/vnet/session/application.h | 120 ++ src/vnet/session/application_interface.c | 459 +++++ src/vnet/session/application_interface.h | 136 ++ src/vnet/session/hashes.c | 28 + src/vnet/session/node.c | 435 ++++ src/vnet/session/session.api | 429 ++++ src/vnet/session/session.c | 1286 ++++++++++++ src/vnet/session/session.h | 380 ++++ src/vnet/session/session_api.c | 821 ++++++++ src/vnet/session/session_cli.c | 189 ++ src/vnet/session/transport.c | 64 + src/vnet/session/transport.h | 250 +++ src/vnet/tcp/tcp.c | 708 +++++++ src/vnet/tcp/tcp.h | 624 ++++++ src/vnet/tcp/tcp_error.def | 35 + src/vnet/tcp/tcp_format.c | 136 ++ src/vnet/tcp/tcp_input.c | 2316 ++++++++++++++++++++++ src/vnet/tcp/tcp_newreno.c | 93 + src/vnet/tcp/tcp_output.c | 1412 +++++++++++++ src/vnet/tcp/tcp_packet.h | 184 ++ src/vnet/tcp/tcp_pg.c | 236 +++ src/vnet/tcp/tcp_syn_filter4.c | 542 +++++ src/vnet/tcp/tcp_timer.h | 29 + src/vnet/udp/builtin_server.c | 239 +++ src/vnet/udp/udp.c | 342 ++++ src/vnet/udp/udp.h | 362 ++++ src/vnet/udp/udp_error.def | 21 + src/vnet/udp/udp_format.c | 91 + src/vnet/udp/udp_input.c | 314 +++ src/vnet/udp/udp_local.c | 666 +++++++ src/vnet/udp/udp_packet.h | 65 + src/vnet/udp/udp_pg.c | 237 +++ src/vnet/vnet_all_api_h.h | 1 + src/vnet/vxlan-gpe/vxlan_gpe.h | 2 +- src/vnet/vxlan/vxlan.h | 2 +- src/vpp/api/vpe.api | 1 + src/vppinfra.am | 5 + src/vppinfra/bihash_16_8.h | 103 + src/vppinfra/bihash_48_8.h | 116 ++ src/vppinfra/tw_timer_16t_1w_2048sl.c | 26 + src/vppinfra/tw_timer_16t_1w_2048sl.h | 46 + 100 files changed, 18737 insertions(+), 1874 deletions(-) create mode 100644 src/scripts/vnet/udp create mode 100755 src/scripts/vnet/uri/tcp-setup.sh create mode 100644 src/scripts/vnet/uri/tcp_server create mode 100644 src/scripts/vnet/uri/udp create mode 100644 src/svm/svm_fifo.c create mode 100644 src/svm/svm_fifo.h create mode 100644 src/svm/svm_fifo_segment.c create mode 100644 src/svm/svm_fifo_segment.h create mode 100644 src/svm/test_svm_fifo1.c create mode 100644 src/uri.am create mode 100644 src/uri/uri_tcp_test.c create mode 100644 src/uri/uri_udp_test.c create mode 100644 src/uri/uri_udp_test2.c create mode 100644 src/uri/uritest.c delete mode 100644 src/vnet/ip/tcp_packet.h delete mode 100644 src/vnet/ip/udp.h delete mode 100644 src/vnet/ip/udp_error.def delete mode 100644 src/vnet/ip/udp_format.c delete mode 100644 src/vnet/ip/udp_init.c delete mode 100644 src/vnet/ip/udp_local.c delete mode 100644 src/vnet/ip/udp_packet.h delete mode 100644 src/vnet/ip/udp_pg.c create mode 100644 src/vnet/session/application.c create mode 100644 src/vnet/session/application.h create mode 100644 src/vnet/session/application_interface.c create mode 100644 src/vnet/session/application_interface.h create mode 100644 src/vnet/session/hashes.c create mode 100644 src/vnet/session/node.c create mode 100644 src/vnet/session/session.api create mode 100644 src/vnet/session/session.c create mode 100644 src/vnet/session/session.h create mode 100644 src/vnet/session/session_api.c create mode 100644 src/vnet/session/session_cli.c create mode 100644 src/vnet/session/transport.c create mode 100644 src/vnet/session/transport.h create mode 100644 src/vnet/tcp/tcp.c create mode 100644 src/vnet/tcp/tcp.h create mode 100644 src/vnet/tcp/tcp_error.def create mode 100644 src/vnet/tcp/tcp_format.c create mode 100644 src/vnet/tcp/tcp_input.c create mode 100644 src/vnet/tcp/tcp_newreno.c create mode 100644 src/vnet/tcp/tcp_output.c create mode 100644 src/vnet/tcp/tcp_packet.h create mode 100644 src/vnet/tcp/tcp_pg.c create mode 100644 src/vnet/tcp/tcp_syn_filter4.c create mode 100644 src/vnet/tcp/tcp_timer.h create mode 100644 src/vnet/udp/builtin_server.c create mode 100644 src/vnet/udp/udp.c create mode 100644 src/vnet/udp/udp.h create mode 100644 src/vnet/udp/udp_error.def create mode 100644 src/vnet/udp/udp_format.c create mode 100644 src/vnet/udp/udp_input.c create mode 100644 src/vnet/udp/udp_local.c create mode 100644 src/vnet/udp/udp_packet.h create mode 100644 src/vnet/udp/udp_pg.c create mode 100644 src/vppinfra/bihash_16_8.h create mode 100644 src/vppinfra/bihash_48_8.h create mode 100644 src/vppinfra/tw_timer_16t_1w_2048sl.c create mode 100644 src/vppinfra/tw_timer_16t_1w_2048sl.h (limited to 'src/vlib') diff --git a/src/Makefile.am b/src/Makefile.am index 08feb29a..641707ed 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -88,6 +88,7 @@ include vlib-api.am include vnet.am include vpp.am include vpp-api-test.am +include uri.am SUBDIRS += plugins diff --git a/src/plugins/ioam/export-common/ioam_export.h b/src/plugins/ioam/export-common/ioam_export.h index e84dab0b..dd48a93b 100644 --- a/src/plugins/ioam/export-common/ioam_export.h +++ b/src/plugins/ioam/export-common/ioam_export.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/plugins/ioam/ipfixcollector/ipfixcollector.c b/src/plugins/ioam/ipfixcollector/ipfixcollector.c index 4ae47edc..71b934ec 100644 --- a/src/plugins/ioam/ipfixcollector/ipfixcollector.c +++ b/src/plugins/ioam/ipfixcollector/ipfixcollector.c @@ -15,7 +15,7 @@ #include #include -#include +#include #include ipfix_collector_main_t ipfix_collector_main; diff --git a/src/plugins/ioam/lib-vxlan-gpe/ioam_transit.c b/src/plugins/ioam/lib-vxlan-gpe/ioam_transit.c index b42c357c..f334c983 100644 --- a/src/plugins/ioam/lib-vxlan-gpe/ioam_transit.c +++ b/src/plugins/ioam/lib-vxlan-gpe/ioam_transit.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/plugins/snat/in2out.c b/src/plugins/snat/in2out.c index e30c913c..b4b7793d 100644 --- a/src/plugins/snat/in2out.c +++ b/src/plugins/snat/in2out.c @@ -689,12 +689,12 @@ snat_hairpinning (snat_main_t *sm, ip4_header_t, dst_address); ip0->checksum = ip_csum_fold (sum0); - old_dst_port0 = tcp0->ports.dst; + old_dst_port0 = tcp0->dst; if (PREDICT_TRUE(new_dst_port0 != old_dst_port0)) { if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) { - tcp0->ports.dst = new_dst_port0; + tcp0->dst = new_dst_port0; sum0 = tcp0->checksum; sum0 = ip_csum_update (sum0, old_dst_addr0, new_dst_addr0, ip4_header_t, dst_address); @@ -872,9 +872,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) { - old_port0 = tcp0->ports.src; - tcp0->ports.src = s0->out2in.port; - new_port0 = tcp0->ports.src; + old_port0 = tcp0->src_port; + tcp0->src_port = s0->out2in.port; + new_port0 = tcp0->src_port; sum0 = tcp0->checksum; sum0 = ip_csum_update (sum0, old_addr0, new_addr0, @@ -1012,9 +1012,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, if (PREDICT_TRUE(proto1 == SNAT_PROTOCOL_TCP)) { - old_port1 = tcp1->ports.src; - tcp1->ports.src = s1->out2in.port; - new_port1 = tcp1->ports.src; + old_port1 = tcp1->src_port; + tcp1->src_port = s1->out2in.port; + new_port1 = tcp1->src_port; sum1 = tcp1->checksum; sum1 = ip_csum_update (sum1, old_addr1, new_addr1, @@ -1188,9 +1188,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) { - old_port0 = tcp0->ports.src; - tcp0->ports.src = s0->out2in.port; - new_port0 = tcp0->ports.src; + old_port0 = tcp0->src_port; + tcp0->src_port = s0->out2in.port; + new_port0 = tcp0->src_port; sum0 = tcp0->checksum; sum0 = ip_csum_update (sum0, old_addr0, new_addr0, @@ -1667,8 +1667,8 @@ snat_in2out_fast_static_map_fn (vlib_main_t * vm, { if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) { - old_port0 = tcp0->ports.src; - tcp0->ports.src = new_port0; + old_port0 = tcp0->src_port; + tcp0->src_port = new_port0; sum0 = tcp0->checksum; sum0 = ip_csum_update (sum0, old_addr0, new_addr0, diff --git a/src/plugins/snat/out2in.c b/src/plugins/snat/out2in.c index 328f5ba4..3bfc0aa3 100644 --- a/src/plugins/snat/out2in.c +++ b/src/plugins/snat/out2in.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include #include @@ -602,9 +602,9 @@ snat_out2in_node_fn (vlib_main_t * vm, if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) { - old_port0 = tcp0->ports.dst; - tcp0->ports.dst = s0->in2out.port; - new_port0 = tcp0->ports.dst; + old_port0 = tcp0->dst_port; + tcp0->dst_port = s0->in2out.port; + new_port0 = tcp0->dst_port; sum0 = tcp0->checksum; sum0 = ip_csum_update (sum0, old_addr0, new_addr0, @@ -737,9 +737,9 @@ snat_out2in_node_fn (vlib_main_t * vm, if (PREDICT_TRUE(proto1 == SNAT_PROTOCOL_TCP)) { - old_port1 = tcp1->ports.dst; - tcp1->ports.dst = s1->in2out.port; - new_port1 = tcp1->ports.dst; + old_port1 = tcp1->dst_port; + tcp1->dst_port = s1->in2out.port; + new_port1 = tcp1->dst_port; sum1 = tcp1->checksum; sum1 = ip_csum_update (sum1, old_addr1, new_addr1, @@ -907,9 +907,9 @@ snat_out2in_node_fn (vlib_main_t * vm, if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) { - old_port0 = tcp0->ports.dst; - tcp0->ports.dst = s0->in2out.port; - new_port0 = tcp0->ports.dst; + old_port0 = tcp0->dst_port; + tcp0->dst_port = s0->in2out.port; + new_port0 = tcp0->dst_port; sum0 = tcp0->checksum; sum0 = ip_csum_update (sum0, old_addr0, new_addr0, @@ -1369,8 +1369,8 @@ snat_out2in_fast_node_fn (vlib_main_t * vm, { if (PREDICT_TRUE(proto0 == SNAT_PROTOCOL_TCP)) { - old_port0 = tcp0->ports.dst; - tcp0->ports.dst = new_port0; + old_port0 = tcp0->dst_port; + tcp0->dst_port = new_port0; sum0 = tcp0->checksum; sum0 = ip_csum_update (sum0, old_addr0, new_addr0, diff --git a/src/scripts/vnet/tcp b/src/scripts/vnet/tcp index a2ee8b2d..b9c23c3a 100644 --- a/src/scripts/vnet/tcp +++ b/src/scripts/vnet/tcp @@ -1,16 +1,18 @@ +loop create +set int ip address loop0 192.168.1.1/8 +set int state loop0 up + packet-generator new { name x - limit 1 + limit 2048 node ip4-input - size 64-64 + size 100-100 + interface loop0 no-recycle data { - TCP: 1.2.3.4 -> 5.6.7.8 - TCP: 1234 -> 5678 + TCP: 192.168.1.2 -> 192.168.1.1 + TCP: 32415 -> 80 + SYN incrementing 100 } } - -tr add pg-input 100 -ip route 5.6.7.8/32 via local -ip route 1.2.3.4/32 via local diff --git a/src/scripts/vnet/udp b/src/scripts/vnet/udp new file mode 100644 index 00000000..7dda1eec --- /dev/null +++ b/src/scripts/vnet/udp @@ -0,0 +1,19 @@ +loop create +set int ip address loop0 192.168.1.1/8 +set int state loop0 up + +packet-generator new { + name udp + limit 512 + rate 1e4 + node ip4-input + size 100-100 + interface loop0 + no-recycle + data { + UDP: 192.168.1.2 - 192.168.2.255 -> 192.168.1.1 + UDP: 4321 -> 1234 + length 72 + incrementing 100 + } +} diff --git a/src/scripts/vnet/uri/tcp-setup.sh b/src/scripts/vnet/uri/tcp-setup.sh new file mode 100755 index 00000000..e0b01588 --- /dev/null +++ b/src/scripts/vnet/uri/tcp-setup.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +function topo_setup +{ + ip netns add vppns1 + ip link add veth_vpp1 type veth peer name vpp1 + ip link set dev vpp1 up + ip link set dev veth_vpp1 up netns vppns1 + + ip netns exec vppns1 \ + bash -c " + ip link set dev lo up + ip addr add 6.0.1.2/24 dev veth_vpp1 + " + + ethtool --offload vpp1 rx off tx off + ip netns exec vppns1 ethtool --offload veth_vpp1 rx off tx off + +} + +function topo_clean +{ + ip link del dev veth_vpp1 &> /dev/null + ip netns del vppns1 &> /dev/null +} + +if [ "$1" == "clean" ] ; then + topo_clean + exit 0 +else + topo_setup +fi + +# to test connectivity do: +# sudo ip netns exec vppns1 telnet 6.0.1.1 1234 +# to push traffic to the server +# dd if=/dev/zero bs=1024K count=512 | nc 6.0.1.1 +# to listen for incoming connection from vpp +# nc -l 1234 diff --git a/src/scripts/vnet/uri/tcp_server b/src/scripts/vnet/uri/tcp_server new file mode 100644 index 00000000..7f5a86de --- /dev/null +++ b/src/scripts/vnet/uri/tcp_server @@ -0,0 +1,4 @@ +create host-interface name vpp1 +set int state host-vpp1 up +set int ip address host-vpp1 6.0.1.1/24 +trace add af-packet-input 10 diff --git a/src/scripts/vnet/uri/udp b/src/scripts/vnet/uri/udp new file mode 100644 index 00000000..ca13b83c --- /dev/null +++ b/src/scripts/vnet/uri/udp @@ -0,0 +1,19 @@ +loop create +set int ip address loop0 10.0.0.1/32 +set int state loop0 up + +packet-generator new { + name udp + limit 512 + rate 1e4 + node ip4-input + size 100-100 + interface loop0 + no-recycle + data { + UDP: 192.168.1.2 - 192.168.2.255 -> 192.168.1.1 + UDP: 4321 -> 1234 + length 72 + incrementing 100 + } +} diff --git a/src/svm.am b/src/svm.am index 2cd385bd..442eba8e 100644 --- a/src/svm.am +++ b/src/svm.am @@ -13,13 +13,14 @@ bin_PROGRAMS += svmtool svmdbtool -nobase_include_HEADERS += svm/svm.h svm/ssvm.h svm/svmdb.h +nobase_include_HEADERS += svm/svm.h svm/ssvm.h svm/svmdb.h \ + svm/svm_fifo.h svm/svm_fifo_segment.h lib_LTLIBRARIES += libsvm.la libsvmdb.la +libsvm_la_SOURCES = svm/svm.c svm/ssvm.c svm/svm_fifo.c svm/svm_fifo_segment.c libsvm_la_LIBADD = libvppinfra.la -lrt -lpthread libsvm_la_DEPENDENCIES = libvppinfra.la -libsvm_la_SOURCES = svm/svm.c svm/ssvm.c svmtool_SOURCES = svm/svmtool.c svmtool_LDADD = libsvm.la libvppinfra.la -lpthread -lrt @@ -31,4 +32,9 @@ libsvmdb_la_SOURCES = svm/svmdb.c svmdbtool_SOURCES = svm/svmdbtool.c svmdbtool_LDADD = libsvmdb.la libsvm.la libvppinfra.la -lpthread -lrt +noinst_PROGRAMS += test_svm_fifo1 +test_svm_fifo1_SOURCES = svm/test_svm_fifo1.c +test_svm_fifo1_LDADD = libsvm.la libvppinfra.la -lpthread -lrt +test_svm_fifo1_LDFLAGS = -static + # vi:syntax=automake diff --git a/src/svm/ssvm.c b/src/svm/ssvm.c index 6f409eb6..6cda1f27 100644 --- a/src/svm/ssvm.c +++ b/src/svm/ssvm.c @@ -169,6 +169,22 @@ re_map_it: return 0; } +void +ssvm_delete (ssvm_private_t * ssvm) +{ + u8 *fn; + + fn = format (0, "/dev/shm/%s%c", ssvm->name, 0); + + /* Throw away the backing file */ + if (unlink ((char *) fn) < 0) + clib_unix_warning ("unlink segment '%s'", ssvm->name); + + munmap ((void *) ssvm->requested_va, ssvm->ssvm_size); + vec_free (fn); +} + + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/svm/ssvm.h b/src/svm/ssvm.h index 9e61b9a0..bccfc164 100644 --- a/src/svm/ssvm.h +++ b/src/svm/ssvm.h @@ -38,7 +38,10 @@ #include #include -#define MMAP_PAGESIZE (4<<10) +#ifndef MMAP_PAGESIZE +#define MMAP_PAGESIZE (clib_mem_get_page_size()) +#endif + #define SSVM_N_OPAQUE 7 typedef struct @@ -125,12 +128,12 @@ ssvm_pop_heap (void *oldheap) } #define foreach_ssvm_api_error \ -_(NO_NAME, "No shared segment name", -10) \ -_(NO_SIZE, "Size not set (master)", -11) \ -_(CREATE_FAILURE, "Create failed", -12) \ -_(SET_SIZE, "Set size failed", -13) \ -_(MMAP, "mmap failed", -14) \ -_(SLAVE_TIMEOUT, "Slave map timeout", -15) +_(NO_NAME, "No shared segment name", -100) \ +_(NO_SIZE, "Size not set (master)", -101) \ +_(CREATE_FAILURE, "Create failed", -102) \ +_(SET_SIZE, "Set size failed", -103) \ +_(MMAP, "mmap failed", -104) \ +_(SLAVE_TIMEOUT, "Slave map timeout", -105) typedef enum { @@ -143,6 +146,7 @@ typedef enum int ssvm_master_init (ssvm_private_t * ssvm, u32 master_index); int ssvm_slave_init (ssvm_private_t * ssvm, int timeout_in_seconds); +void ssvm_delete (ssvm_private_t * ssvm); #endif /* __included_ssvm_h__ */ diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c new file mode 100644 index 00000000..11f90193 --- /dev/null +++ b/src/svm/svm_fifo.c @@ -0,0 +1,568 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "svm_fifo.h" + +/** create an svm fifo, in the current heap. Fails vs blow up the process */ +svm_fifo_t * +svm_fifo_create (u32 data_size_in_bytes) +{ + svm_fifo_t *f; + pthread_mutexattr_t attr; + pthread_condattr_t cattr; + + f = clib_mem_alloc_aligned_or_null (sizeof (*f) + data_size_in_bytes, + CLIB_CACHE_LINE_BYTES); + if (f == 0) + return 0; + + memset (f, 0, sizeof (*f) + data_size_in_bytes); + f->nitems = data_size_in_bytes; + f->ooos_list_head = OOO_SEGMENT_INVALID_INDEX; + + memset (&attr, 0, sizeof (attr)); + memset (&cattr, 0, sizeof (cattr)); + + if (pthread_mutexattr_init (&attr)) + clib_unix_warning ("mutexattr_init"); + if (pthread_mutexattr_setpshared (&attr, PTHREAD_PROCESS_SHARED)) + clib_unix_warning ("pthread_mutexattr_setpshared"); + if (pthread_mutex_init (&f->mutex, &attr)) + clib_unix_warning ("mutex_init"); + if (pthread_mutexattr_destroy (&attr)) + clib_unix_warning ("mutexattr_destroy"); + if (pthread_condattr_init (&cattr)) + clib_unix_warning ("condattr_init"); + if (pthread_condattr_setpshared (&cattr, PTHREAD_PROCESS_SHARED)) + clib_unix_warning ("condattr_setpshared"); + if (pthread_cond_init (&f->condvar, &cattr)) + clib_unix_warning ("cond_init1"); + if (pthread_condattr_destroy (&cattr)) + clib_unix_warning ("cond_init2"); + + return (f); +} + +always_inline ooo_segment_t * +ooo_segment_new (svm_fifo_t * f, u32 start, u32 length) +{ + ooo_segment_t *s; + + pool_get (f->ooo_segments, s); + + s->fifo_position = start; + s->length = length; + + s->prev = s->next = OOO_SEGMENT_INVALID_INDEX; + + return s; +} + +always_inline void +ooo_segment_del (svm_fifo_t * f, u32 index) +{ + ooo_segment_t *cur, *prev = 0, *next = 0; + cur = pool_elt_at_index (f->ooo_segments, index); + + if (cur->next != OOO_SEGMENT_INVALID_INDEX) + { + next = pool_elt_at_index (f->ooo_segments, cur->next); + next->prev = cur->prev; + } + + if (cur->prev != OOO_SEGMENT_INVALID_INDEX) + { + prev = pool_elt_at_index (f->ooo_segments, cur->prev); + prev->next = cur->next; + } + else + { + f->ooos_list_head = cur->next; + } + + pool_put (f->ooo_segments, cur); +} + +/** + * Add segment to fifo's out-of-order segment list. Takes care of merging + * adjacent segments and removing overlapping ones. + */ +static void +ooo_segment_add (svm_fifo_t * f, u32 offset, u32 length) +{ + ooo_segment_t *s, *new_s, *prev, *next, *it; + u32 new_index, position, end_offset, s_sof, s_eof, s_index; + + position = (f->tail + offset) % f->nitems; + end_offset = offset + length; + + if (f->ooos_list_head == OOO_SEGMENT_INVALID_INDEX) + { + s = ooo_segment_new (f, position, length); + f->ooos_list_head = s - f->ooo_segments; + f->ooos_newest = f->ooos_list_head; + return; + } + + /* Find first segment that starts after new segment */ + s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head); + while (s->next != OOO_SEGMENT_INVALID_INDEX + && ooo_segment_offset (f, s) <= offset) + s = pool_elt_at_index (f->ooo_segments, s->next); + + s_index = s - f->ooo_segments; + s_sof = ooo_segment_offset (f, s); + s_eof = ooo_segment_end_offset (f, s); + + /* No overlap, add before current segment */ + if (end_offset < s_sof) + { + new_s = ooo_segment_new (f, position, length); + new_index = new_s - f->ooo_segments; + + /* Pool might've moved, get segment again */ + s = pool_elt_at_index (f->ooo_segments, s_index); + + if (s->prev != OOO_SEGMENT_INVALID_INDEX) + { + new_s->prev = s->prev; + + prev = pool_elt_at_index (f->ooo_segments, new_s->prev); + prev->next = new_index; + } + else + { + /* New head */ + f->ooos_list_head = new_index; + } + + new_s->next = s - f->ooo_segments; + s->prev = new_index; + f->ooos_newest = new_index; + return; + } + /* No overlap, add after current segment */ + else if (s_eof < offset) + { + new_s = ooo_segment_new (f, position, length); + new_index = new_s - f->ooo_segments; + + /* Pool might've moved, get segment again */ + s = pool_elt_at_index (f->ooo_segments, s_index); + + if (s->next != OOO_SEGMENT_INVALID_INDEX) + { + new_s->next = s->next; + + next = pool_elt_at_index (f->ooo_segments, new_s->next); + next->prev = new_index; + } + + new_s->prev = s - f->ooo_segments; + s->next = new_index; + f->ooos_newest = new_index; + + return; + } + + /* + * Merge needed + */ + + /* Merge at head */ + if (offset <= s_sof) + { + /* If we have a previous, check if we overlap */ + if (s->prev != OOO_SEGMENT_INVALID_INDEX) + { + prev = pool_elt_at_index (f->ooo_segments, s->prev); + + /* New segment merges prev and current. Remove previous and + * update position of current. */ + if (ooo_segment_end_offset (f, prev) >= offset) + { + s->fifo_position = prev->fifo_position; + s->length = s_eof - ooo_segment_offset (f, prev); + ooo_segment_del (f, s->prev); + } + } + else + { + s->fifo_position = position; + s->length = s_eof - ooo_segment_offset (f, s); + } + + /* The new segment's tail may cover multiple smaller ones */ + if (s_eof < end_offset) + { + /* Remove segments completely covered */ + it = (s->next != OOO_SEGMENT_INVALID_INDEX) ? + pool_elt_at_index (f->ooo_segments, s->next) : 0; + while (it && ooo_segment_end_offset (f, it) < end_offset) + { + next = (it->next != OOO_SEGMENT_INVALID_INDEX) ? + pool_elt_at_index (f->ooo_segments, it->next) : 0; + ooo_segment_del (f, it - f->ooo_segments); + it = next; + } + + /* Update length. Segment's start might have changed. */ + s->length = end_offset - ooo_segment_offset (f, s); + + /* If partial overlap with last, merge */ + if (it && ooo_segment_offset (f, it) < end_offset) + { + s->length += + it->length - (ooo_segment_offset (f, it) - end_offset); + ooo_segment_del (f, it - f->ooo_segments); + } + } + } + /* Last but overlapping previous */ + else if (s_eof <= end_offset) + { + s->length = end_offset - ooo_segment_offset (f, s); + } + /* New segment completely covered by current one */ + else + { + /* Do Nothing */ + } + + /* Most recently updated segment */ + f->ooos_newest = s - f->ooo_segments; +} + +/** + * Removes segments that can now be enqueued because the fifo's tail has + * advanced. Returns the number of bytes added to tail. + */ +static int +ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) +{ + ooo_segment_t *s; + u32 index, bytes = 0, diff; + + s = pool_elt_at_index (f->ooo_segments, f->ooos_list_head); + + /* If last tail update overlaps one/multiple ooo segments, remove them */ + diff = (f->nitems + f->tail - s->fifo_position) % f->nitems; + while (0 < diff && diff < n_bytes_enqueued) + { + /* Segment end is beyond the tail. Advance tail and be done */ + if (diff < s->length) + { + f->tail += s->length - diff; + f->tail %= f->nitems; + break; + } + /* If we have next go on */ + else if (s->next != OOO_SEGMENT_INVALID_INDEX) + { + index = s - f->ooo_segments; + s = pool_elt_at_index (f->ooo_segments, s->next); + diff = (f->nitems + f->tail - s->fifo_position) % f->nitems; + ooo_segment_del (f, index); + } + /* End of search */ + else + { + break; + } + } + + /* If tail is adjacent to an ooo segment, 'consume' it */ + if (diff == 0) + { + bytes = ((f->nitems - f->cursize) >= s->length) ? s->length : + f->nitems - f->cursize; + + f->tail += bytes; + f->tail %= f->nitems; + + ooo_segment_del (f, s - f->ooo_segments); + } + + return bytes; +} + +static int +svm_fifo_enqueue_internal (svm_fifo_t * f, + int pid, u32 max_bytes, u8 * copy_from_here) +{ + u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; + u32 cursize, nitems; + + if (PREDICT_FALSE (f->cursize == f->nitems)) + return -2; /* fifo stuffed */ + + /* read cursize, which can only decrease while we're working */ + cursize = f->cursize; + nitems = f->nitems; + + /* Number of bytes we're going to copy */ + total_copy_bytes = (nitems - cursize) < max_bytes ? + (nitems - cursize) : max_bytes; + + if (PREDICT_TRUE (copy_from_here != 0)) + { + /* Number of bytes in first copy segment */ + first_copy_bytes = ((nitems - f->tail) < total_copy_bytes) + ? (nitems - f->tail) : total_copy_bytes; + + clib_memcpy (&f->data[f->tail], copy_from_here, first_copy_bytes); + f->tail += first_copy_bytes; + f->tail = (f->tail == nitems) ? 0 : f->tail; + + /* Number of bytes in second copy segment, if any */ + second_copy_bytes = total_copy_bytes - first_copy_bytes; + if (second_copy_bytes) + { + clib_memcpy (&f->data[f->tail], copy_from_here + first_copy_bytes, + second_copy_bytes); + f->tail += second_copy_bytes; + f->tail = (f->tail == nitems) ? 0 : f->tail; + } + } + else + { + /* Account for a zero-copy enqueue done elsewhere */ + ASSERT (max_bytes <= (nitems - cursize)); + f->tail += max_bytes; + f->tail = f->tail % nitems; + total_copy_bytes = max_bytes; + } + + /* Any out-of-order segments to collect? */ + if (PREDICT_FALSE (f->ooos_list_head != OOO_SEGMENT_INVALID_INDEX)) + total_copy_bytes += ooo_segment_try_collect (f, total_copy_bytes); + + /* Atomically increase the queue length */ + __sync_fetch_and_add (&f->cursize, total_copy_bytes); + + return (total_copy_bytes); +} + +int +svm_fifo_enqueue_nowait (svm_fifo_t * f, + int pid, u32 max_bytes, u8 * copy_from_here) +{ + return svm_fifo_enqueue_internal (f, pid, max_bytes, copy_from_here); +} + +/** Enqueue a future segment. + * Two choices: either copies the entire segment, or copies nothing + * Returns 0 of the entire segment was copied + * Returns -1 if none of the segment was copied due to lack of space + */ + +static int +svm_fifo_enqueue_with_offset_internal2 (svm_fifo_t * f, + int pid, + u32 offset, + u32 required_bytes, + u8 * copy_from_here) +{ + u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; + u32 cursize, nitems; + u32 tail_plus_offset; + + ASSERT (offset > 0); + + /* read cursize, which can only decrease while we're working */ + cursize = f->cursize; + nitems = f->nitems; + + /* Will this request fit? */ + if ((required_bytes + offset) > (nitems - cursize)) + return -1; + + ooo_segment_add (f, offset, required_bytes); + + /* Number of bytes we're going to copy */ + total_copy_bytes = required_bytes; + tail_plus_offset = (f->tail + offset) % nitems; + + /* Number of bytes in first copy segment */ + first_copy_bytes = ((nitems - tail_plus_offset) < total_copy_bytes) + ? (nitems - tail_plus_offset) : total_copy_bytes; + + clib_memcpy (&f->data[tail_plus_offset], copy_from_here, first_copy_bytes); + + /* Number of bytes in second copy segment, if any */ + second_copy_bytes = total_copy_bytes - first_copy_bytes; + if (second_copy_bytes) + { + tail_plus_offset += first_copy_bytes; + tail_plus_offset %= nitems; + + ASSERT (tail_plus_offset == 0); + + clib_memcpy (&f->data[tail_plus_offset], + copy_from_here + first_copy_bytes, second_copy_bytes); + } + + return (0); +} + + +int +svm_fifo_enqueue_with_offset (svm_fifo_t * f, + int pid, + u32 offset, + u32 required_bytes, u8 * copy_from_here) +{ + return svm_fifo_enqueue_with_offset_internal2 + (f, pid, offset, required_bytes, copy_from_here); +} + + +static int +svm_fifo_dequeue_internal2 (svm_fifo_t * f, + int pid, u32 max_bytes, u8 * copy_here) +{ + u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; + u32 cursize, nitems; + + if (PREDICT_FALSE (f->cursize == 0)) + return -2; /* nothing in the fifo */ + + /* read cursize, which can only increase while we're working */ + cursize = f->cursize; + nitems = f->nitems; + + /* Number of bytes we're going to copy */ + total_copy_bytes = (cursize < max_bytes) ? cursize : max_bytes; + + if (PREDICT_TRUE (copy_here != 0)) + { + /* Number of bytes in first copy segment */ + first_copy_bytes = ((nitems - f->head) < total_copy_bytes) + ? (nitems - f->head) : total_copy_bytes; + clib_memcpy (copy_here, &f->data[f->head], first_copy_bytes); + f->head += first_copy_bytes; + f->head = (f->head == nitems) ? 0 : f->head; + + /* Number of bytes in second copy segment, if any */ + second_copy_bytes = total_copy_bytes - first_copy_bytes; + if (second_copy_bytes) + { + clib_memcpy (copy_here + first_copy_bytes, + &f->data[f->head], second_copy_bytes); + f->head += second_copy_bytes; + f->head = (f->head == nitems) ? 0 : f->head; + } + } + else + { + /* Account for a zero-copy dequeue done elsewhere */ + ASSERT (max_bytes <= cursize); + f->head += max_bytes; + f->head = f->head % nitems; + cursize -= max_bytes; + total_copy_bytes = max_bytes; + } + + __sync_fetch_and_sub (&f->cursize, total_copy_bytes); + + return (total_copy_bytes); +} + +int +svm_fifo_dequeue_nowait (svm_fifo_t * f, + int pid, u32 max_bytes, u8 * copy_here) +{ + return svm_fifo_dequeue_internal2 (f, pid, max_bytes, copy_here); +} + +int +svm_fifo_peek (svm_fifo_t * f, int pid, u32 offset, u32 max_bytes, + u8 * copy_here) +{ + u32 total_copy_bytes, first_copy_bytes, second_copy_bytes; + u32 cursize, nitems; + + if (PREDICT_FALSE (f->cursize == 0)) + return -2; /* nothing in the fifo */ + + /* read cursize, which can only increase while we're working */ + cursize = f->cursize; + nitems = f->nitems; + + /* Number of bytes we're going to copy */ + total_copy_bytes = (cursize < max_bytes) ? cursize : max_bytes; + + if (PREDICT_TRUE (copy_here != 0)) + { + /* Number of bytes in first copy segment */ + first_copy_bytes = + ((nitems - f->head) < total_copy_bytes) ? + (nitems - f->head) : total_copy_bytes; + clib_memcpy (copy_here, &f->data[f->head], first_copy_bytes); + + /* Number of bytes in second copy segment, if any */ + second_copy_bytes = total_copy_bytes - first_copy_bytes; + if (second_copy_bytes) + { + clib_memcpy (copy_here + first_copy_bytes, &f->data[0], + second_copy_bytes); + } + } + return total_copy_bytes; +} + +int +svm_fifo_dequeue_drop (svm_fifo_t * f, int pid, u32 max_bytes) +{ + u32 total_drop_bytes, first_drop_bytes, second_drop_bytes; + u32 cursize, nitems; + + if (PREDICT_FALSE (f->cursize == 0)) + return -2; /* nothing in the fifo */ + + /* read cursize, which can only increase while we're working */ + cursize = f->cursize; + nitems = f->nitems; + + /* Number of bytes we're going to drop */ + total_drop_bytes = (cursize < max_bytes) ? cursize : max_bytes; + + /* Number of bytes in first copy segment */ + first_drop_bytes = + ((nitems - f->head) < total_drop_bytes) ? + (nitems - f->head) : total_drop_bytes; + f->head += first_drop_bytes; + f->head = (f->head == nitems) ? 0 : f->head; + + /* Number of bytes in second drop segment, if any */ + second_drop_bytes = total_drop_bytes - first_drop_bytes; + if (second_drop_bytes) + { + f->head += second_drop_bytes; + f->head = (f->head == nitems) ? 0 : f->head; + } + + __sync_fetch_and_sub (&f->cursize, total_drop_bytes); + + return total_drop_bytes; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/svm/svm_fifo.h b/src/svm/svm_fifo.h new file mode 100644 index 00000000..70624b74 --- /dev/null +++ b/src/svm/svm_fifo.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_ssvm_fifo_h__ +#define __included_ssvm_fifo_h__ + +#include +#include +#include +#include +#include +#include +#include + +typedef enum +{ + SVM_FIFO_TAG_NOT_HELD = 0, + SVM_FIFO_TAG_DEQUEUE, + SVM_FIFO_TAG_ENQUEUE, +} svm_lock_tag_t; + +/** Out-of-order segment */ +typedef struct +{ + u32 next; /**< Next linked-list element pool index */ + u32 prev; /**< Previous linked-list element pool index */ + + u32 fifo_position; /**< Start of segment, normalized*/ + u32 length; /**< Length of segment */ +} ooo_segment_t; + +#define OOO_SEGMENT_INVALID_INDEX ((u32)~0) + +typedef struct +{ + pthread_mutex_t mutex; /* 8 bytes */ + pthread_cond_t condvar; /* 8 bytes */ + u32 owner_pid; + svm_lock_tag_t tag; + volatile u32 cursize; + u32 nitems; + + /* Backpointers */ + u32 server_session_index; + u32 client_session_index; + u8 server_thread_index; + u8 client_thread_index; + CLIB_CACHE_LINE_ALIGN_MARK (end_shared); + u32 head; + CLIB_CACHE_LINE_ALIGN_MARK (end_consumer); + + /* producer */ + u32 tail; + + ooo_segment_t *ooo_segments; /**< Pool of ooo segments */ + u32 ooos_list_head; /**< Head of out-of-order linked-list */ + u32 ooos_newest; /**< Last segment to have been updated */ + + CLIB_CACHE_LINE_ALIGN_MARK (data); +} svm_fifo_t; + +static inline int +svm_fifo_lock (svm_fifo_t * f, u32 pid, u32 tag, int nowait) +{ + if (PREDICT_TRUE (nowait == 0)) + pthread_mutex_lock (&f->mutex); + else + { + if (pthread_mutex_trylock (&f->mutex)) + return -1; + } + f->owner_pid = pid; + f->tag = tag; + return 0; +} + +static inline void +svm_fifo_unlock (svm_fifo_t * f) +{ + f->owner_pid = 0; + f->tag = 0; + CLIB_MEMORY_BARRIER (); + pthread_mutex_unlock (&f->mutex); +} + +static inline u32 +svm_fifo_max_dequeue (svm_fifo_t * f) +{ + return f->cursize; +} + +static inline u32 +svm_fifo_max_enqueue (svm_fifo_t * f) +{ + return f->nitems - f->cursize; +} + +static inline u8 +svm_fifo_has_ooo_data (svm_fifo_t * f) +{ + return f->ooos_list_head != OOO_SEGMENT_INVALID_INDEX; +} + +svm_fifo_t *svm_fifo_create (u32 data_size_in_bytes); + +int svm_fifo_enqueue_nowait (svm_fifo_t * f, int pid, u32 max_bytes, + u8 * copy_from_here); + +int svm_fifo_enqueue_with_offset (svm_fifo_t * f, int pid, + u32 offset, u32 required_bytes, + u8 * copy_from_here); + +int svm_fifo_dequeue_nowait (svm_fifo_t * f, int pid, u32 max_bytes, + u8 * copy_here); + +int svm_fifo_peek (svm_fifo_t * f, int pid, u32 offset, u32 max_bytes, + u8 * copy_here); +int svm_fifo_dequeue_drop (svm_fifo_t * f, int pid, u32 max_bytes); + +always_inline ooo_segment_t * +svm_fifo_newest_ooo_segment (svm_fifo_t * f) +{ + return f->ooo_segments + f->ooos_newest; +} + +always_inline u32 +ooo_segment_offset (svm_fifo_t * f, ooo_segment_t * s) +{ + return ((f->nitems + s->fifo_position - f->tail) % f->nitems); +} + +always_inline u32 +ooo_segment_end_offset (svm_fifo_t * f, ooo_segment_t * s) +{ + return ((f->nitems + s->fifo_position + s->length - f->tail) % f->nitems); +} + +#endif /* __included_ssvm_fifo_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/svm/svm_fifo_segment.c b/src/svm/svm_fifo_segment.c new file mode 100644 index 00000000..acabb3bd --- /dev/null +++ b/src/svm/svm_fifo_segment.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +svm_fifo_segment_main_t svm_fifo_segment_main; + +/** (master) create an svm fifo segment */ +int +svm_fifo_segment_create (svm_fifo_segment_create_args_t * a) +{ + int rv; + svm_fifo_segment_private_t *s; + svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; + ssvm_shared_header_t *sh; + svm_fifo_segment_header_t *fsh; + void *oldheap; + + /* Allocate a fresh segment */ + pool_get (sm->segments, s); + memset (s, 0, sizeof (*s)); + + s->ssvm.ssvm_size = a->segment_size; + s->ssvm.i_am_master = 1; + s->ssvm.my_pid = getpid (); + s->ssvm.name = (u8 *) a->segment_name; + s->ssvm.requested_va = sm->next_baseva; + + rv = ssvm_master_init (&s->ssvm, s - sm->segments); + + if (rv) + { + _vec_len (s) = vec_len (s) - 1; + return (rv); + } + + /* Note; requested_va updated due to seg base addr randomization */ + sm->next_baseva = s->ssvm.requested_va + a->segment_size; + + sh = s->ssvm.sh; + oldheap = ssvm_push_heap (sh); + + /* Set up svm_fifo_segment shared header */ + fsh = clib_mem_alloc (sizeof (*fsh)); + memset (fsh, 0, sizeof (*fsh)); + sh->opaque[0] = fsh; + s->h = fsh; + fsh->segment_name = format (0, "%s%c", a->segment_name, 0); + + /* Avoid vec_add1(...) failure when adding a fifo, etc. */ + vec_validate (fsh->fifos, 64); + _vec_len (fsh->fifos) = 0; + + ssvm_pop_heap (oldheap); + + sh->ready = 1; + a->new_segment_index = s - sm->segments; + return (0); +} + +/** (slave) attach to an svm fifo segment */ +int +svm_fifo_segment_attach (svm_fifo_segment_create_args_t * a) +{ + int rv; + svm_fifo_segment_private_t *s; + svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; + ssvm_shared_header_t *sh; + svm_fifo_segment_header_t *fsh; + + /* Allocate a fresh segment */ + pool_get (sm->segments, s); + + memset (s, 0, sizeof (*s)); + + s->ssvm.ssvm_size = a->segment_size; + s->ssvm.my_pid = getpid (); + s->ssvm.name = (u8 *) a->segment_name; + s->ssvm.requested_va = sm->next_baseva; + + rv = ssvm_slave_init (&s->ssvm, sm->timeout_in_seconds); + + if (rv) + { + _vec_len (s) = vec_len (s) - 1; + return (rv); + } + + /* Fish the segment header */ + sh = s->ssvm.sh; + fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; + s->h = fsh; + + a->new_segment_index = s - sm->segments; + return (0); +} + +void +svm_fifo_segment_delete (svm_fifo_segment_private_t * s) +{ + svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; + ssvm_delete (&s->ssvm); + pool_put (sm->segments, s); +} + +svm_fifo_t * +svm_fifo_segment_alloc_fifo (svm_fifo_segment_private_t * s, + u32 data_size_in_bytes) +{ + ssvm_shared_header_t *sh; + svm_fifo_segment_header_t *fsh; + svm_fifo_t *f; + void *oldheap; + + sh = s->ssvm.sh; + fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; + oldheap = ssvm_push_heap (sh); + + /* Note: this can fail, in which case: create another segment */ + f = svm_fifo_create (data_size_in_bytes); + if (f == 0) + { + ssvm_pop_heap (oldheap); + return (0); + } + + vec_add1 (fsh->fifos, f); + + ssvm_pop_heap (oldheap); + return (f); +} + +void +svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, svm_fifo_t * f) +{ + ssvm_shared_header_t *sh; + svm_fifo_segment_header_t *fsh; + void *oldheap; + int i; + + sh = s->ssvm.sh; + fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; + oldheap = ssvm_push_heap (sh); + + for (i = 0; i < vec_len (fsh->fifos); i++) + { + if (fsh->fifos[i] == f) + { + vec_delete (fsh->fifos, 1, i); + goto found; + } + } + clib_warning ("fifo 0x%llx not found in fifo table...", f); + +found: + clib_mem_free (f); + ssvm_pop_heap (oldheap); +} + +void +svm_fifo_segment_init (u64 baseva, u32 timeout_in_seconds) +{ + svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; + + sm->next_baseva = baseva; + sm->timeout_in_seconds = timeout_in_seconds; +} + +u32 +svm_fifo_segment_index (svm_fifo_segment_private_t * s) +{ + return s - svm_fifo_segment_main.segments; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/svm/svm_fifo_segment.h b/src/svm/svm_fifo_segment.h new file mode 100644 index 00000000..793fa7c8 --- /dev/null +++ b/src/svm/svm_fifo_segment.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_ssvm_fifo_segment_h__ +#define __included_ssvm_fifo_segment_h__ + +#include "svm_fifo.h" +#include "ssvm.h" + +typedef struct +{ + volatile svm_fifo_t **fifos; + u8 *segment_name; +} svm_fifo_segment_header_t; + +typedef struct +{ + ssvm_private_t ssvm; + svm_fifo_segment_header_t *h; +} svm_fifo_segment_private_t; + +typedef struct +{ + /** pool of segments */ + svm_fifo_segment_private_t *segments; + /* Where to put the next one */ + u64 next_baseva; + u32 timeout_in_seconds; +} svm_fifo_segment_main_t; + +extern svm_fifo_segment_main_t svm_fifo_segment_main; + +typedef struct +{ + char *segment_name; + u32 segment_size; + u32 new_segment_index; +} svm_fifo_segment_create_args_t; + +static inline svm_fifo_segment_private_t * +svm_fifo_get_segment (u32 segment_index) +{ + svm_fifo_segment_main_t *ssm = &svm_fifo_segment_main; + return vec_elt_at_index (ssm->segments, segment_index); +} + +#define foreach_ssvm_fifo_segment_api_error \ +_(OUT_OF_SPACE, "Out of space in segment", -200) + +typedef enum +{ +#define _(n,s,c) SSVM_FIFO_SEGMENT_API_ERROR_##n = c, + foreach_ssvm_fifo_segment_api_error +#undef _ +} ssvm_fifo_segment_api_error_enum_t; + +int svm_fifo_segment_create (svm_fifo_segment_create_args_t * a); +int svm_fifo_segment_attach (svm_fifo_segment_create_args_t * a); +void svm_fifo_segment_delete (svm_fifo_segment_private_t * s); + +svm_fifo_t *svm_fifo_segment_alloc_fifo (svm_fifo_segment_private_t * s, + u32 data_size_in_bytes); +void svm_fifo_segment_free_fifo (svm_fifo_segment_private_t * s, + svm_fifo_t * f); + +void svm_fifo_segment_init (u64 baseva, u32 timeout_in_seconds); + +u32 svm_fifo_segment_index (svm_fifo_segment_private_t * s); + +#endif /* __included_ssvm_fifo_segment_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/svm/test_svm_fifo1.c b/src/svm/test_svm_fifo1.c new file mode 100644 index 00000000..355653df --- /dev/null +++ b/src/svm/test_svm_fifo1.c @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "svm_fifo_segment.h" + +clib_error_t * +hello_world (int verbose) +{ + svm_fifo_segment_create_args_t _a, *a = &_a; + svm_fifo_segment_private_t *sp; + svm_fifo_t *f; + int rv; + u8 *test_data; + u8 *retrieved_data = 0; + clib_error_t *error = 0; + int pid = getpid (); + + memset (a, 0, sizeof (*a)); + + a->segment_name = "fifo-test1"; + a->segment_size = 256 << 10; + + rv = svm_fifo_segment_create (a); + + if (rv) + return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); + + sp = svm_fifo_get_segment (a->new_segment_index); + + f = svm_fifo_segment_alloc_fifo (sp, 4096); + + if (f == 0) + return clib_error_return (0, "svm_fifo_segment_alloc_fifo failed"); + + test_data = format (0, "Hello world%c", 0); + vec_validate (retrieved_data, vec_len (test_data) - 1); + + while (svm_fifo_max_enqueue (f) >= vec_len (test_data)) + svm_fifo_enqueue_nowait (f, pid, vec_len (test_data), test_data); + + while (svm_fifo_max_dequeue (f) >= vec_len (test_data)) + svm_fifo_dequeue_nowait (f, pid, vec_len (retrieved_data), + retrieved_data); + + while (svm_fifo_max_enqueue (f) >= vec_len (test_data)) + svm_fifo_enqueue_nowait (f, pid, vec_len (test_data), test_data); + + while (svm_fifo_max_dequeue (f) >= vec_len (test_data)) + svm_fifo_dequeue_nowait (f, pid, vec_len (retrieved_data), + retrieved_data); + + if (!memcmp (retrieved_data, test_data, vec_len (test_data))) + error = clib_error_return (0, "data test OK, got '%s'", retrieved_data); + else + error = clib_error_return (0, "data test FAIL!"); + + svm_fifo_segment_free_fifo (sp, f); + + return error; +} + +clib_error_t * +master (int verbose) +{ + svm_fifo_segment_create_args_t _a, *a = &_a; + svm_fifo_segment_private_t *sp; + svm_fifo_t *f; + int rv; + u8 *test_data; + u8 *retrieved_data = 0; + int i; + int pid = getpid (); + + memset (a, 0, sizeof (*a)); + + a->segment_name = "fifo-test1"; + a->segment_size = 256 << 10; + + rv = svm_fifo_segment_create (a); + + if (rv) + return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); + + sp = svm_fifo_get_segment (a->new_segment_index); + + f = svm_fifo_segment_alloc_fifo (sp, 4096); + + if (f == 0) + return clib_error_return (0, "svm_fifo_segment_alloc_fifo failed"); + + test_data = format (0, "Hello world%c", 0); + vec_validate (retrieved_data, vec_len (test_data) - 1); + + for (i = 0; i < 1000; i++) + svm_fifo_enqueue_nowait (f, pid, vec_len (test_data), test_data); + + return clib_error_return (0, "master (enqueue) done"); +} + +clib_error_t * +mempig (int verbose) +{ + svm_fifo_segment_create_args_t _a, *a = &_a; + svm_fifo_segment_private_t *sp; + svm_fifo_t *f; + svm_fifo_t **flist = 0; + int rv; + int i; + + memset (a, 0, sizeof (*a)); + + a->segment_name = "fifo-test1"; + a->segment_size = 256 << 10; + + rv = svm_fifo_segment_create (a); + + if (rv) + return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); + + sp = svm_fifo_get_segment (a->new_segment_index); + + for (i = 0; i < 1000; i++) + { + f = svm_fifo_segment_alloc_fifo (sp, 4096); + if (f == 0) + break; + vec_add1 (flist, f); + } + + fformat (stdout, "Try #1: created %d fifos...\n", vec_len (flist)); + for (i = 0; i < vec_len (flist); i++) + { + f = flist[i]; + svm_fifo_segment_free_fifo (sp, f); + } + + _vec_len (flist) = 0; + + for (i = 0; i < 1000; i++) + { + f = svm_fifo_segment_alloc_fifo (sp, 4096); + if (f == 0) + break; + vec_add1 (flist, f); + } + + fformat (stdout, "Try #2: created %d fifos...\n", vec_len (flist)); + for (i = 0; i < vec_len (flist); i++) + { + f = flist[i]; + svm_fifo_segment_free_fifo (sp, f); + } + + return 0; +} + +clib_error_t * +offset (int verbose) +{ + svm_fifo_segment_create_args_t _a, *a = &_a; + svm_fifo_segment_private_t *sp; + svm_fifo_t *f; + int rv; + u32 *test_data = 0; + u32 *recovered_data = 0; + int i; + int pid = getpid (); + + memset (a, 0, sizeof (*a)); + + a->segment_name = "fifo-test1"; + a->segment_size = 256 << 10; + + rv = svm_fifo_segment_create (a); + + if (rv) + return clib_error_return (0, "svm_fifo_segment_create returned %d", rv); + + sp = svm_fifo_get_segment (a->new_segment_index); + + f = svm_fifo_segment_alloc_fifo (sp, 200 << 10); + + if (f == 0) + return clib_error_return (0, "svm_fifo_segment_alloc_fifo failed"); + + for (i = 0; i < (3 * 1024); i++) + vec_add1 (test_data, i); + + /* Enqueue the first 1024 u32's */ + svm_fifo_enqueue_nowait (f, pid, 4096 /* bytes to enqueue */ , + (u8 *) test_data); + + /* Enqueue the third 1024 u32's 2048 ahead of the current tail */ + svm_fifo_enqueue_with_offset (f, pid, 4096, 4096, (u8 *) & test_data[2048]); + + /* Enqueue the second 1024 u32's at the current tail */ + svm_fifo_enqueue_nowait (f, pid, 4096 /* bytes to enqueue */ , + (u8 *) & test_data[1024]); + + vec_validate (recovered_data, (3 * 1024) - 1); + + svm_fifo_dequeue_nowait (f, pid, 3 * 4096, (u8 *) recovered_data); + + for (i = 0; i < (3 * 1024); i++) + { + if (recovered_data[i] != test_data[i]) + { + clib_warning ("[%d] expected %d recovered %d", i, + test_data[i], recovered_data[i]); + return clib_error_return (0, "offset test FAILED"); + } + } + + return clib_error_return (0, "offset test OK"); +} + +clib_error_t * +slave (int verbose) +{ + svm_fifo_segment_create_args_t _a, *a = &_a; + svm_fifo_segment_private_t *sp; + svm_fifo_segment_header_t *fsh; + svm_fifo_t *f; + ssvm_shared_header_t *sh; + int rv; + u8 *test_data; + u8 *retrieved_data = 0; + int pid = getpid (); + int i; + + memset (a, 0, sizeof (*a)); + + a->segment_name = "fifo-test1"; + + rv = svm_fifo_segment_attach (a); + + if (rv) + return clib_error_return (0, "svm_fifo_segment_attach returned %d", rv); + + sp = svm_fifo_get_segment (a->new_segment_index); + sh = sp->ssvm.sh; + fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; + + /* might wanna wait.. */ + f = (svm_fifo_t *) fsh->fifos[0]; + + /* Lazy bastards united */ + test_data = format (0, "Hello world%c", 0); + vec_validate (retrieved_data, vec_len (test_data) - 1); + + for (i = 0; i < 1000; i++) + { + svm_fifo_dequeue_nowait (f, pid, vec_len (retrieved_data), + retrieved_data); + if (memcmp (retrieved_data, test_data, vec_len (retrieved_data))) + return clib_error_return (0, "retrieved data incorrect, '%s'", + retrieved_data); + } + + return clib_error_return (0, "slave (dequeue) done"); +} + + +int +test_ssvm_fifo1 (unformat_input_t * input) +{ + clib_error_t *error = 0; + int verbose = 0; + int test_id = 0; + + svm_fifo_segment_init (0x200000000ULL, 20); + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose %d", &verbose)) + ; + else if (unformat (input, "verbose")) + verbose = 1; + else if (unformat (input, "master")) + test_id = 1; + else if (unformat (input, "slave")) + test_id = 2; + else if (unformat (input, "mempig")) + test_id = 3; + else if (unformat (input, "offset")) + test_id = 4; + else + { + error = clib_error_create ("unknown input `%U'\n", + format_unformat_error, input); + goto out; + } + } + + switch (test_id) + { + case 0: + error = hello_world (verbose); + break; + + case 1: + error = master (verbose); + break; + + case 2: + error = slave (verbose); + break; + + case 3: + error = mempig (verbose); + break; + + case 4: + error = offset (verbose); + break; + + default: + error = clib_error_return (0, "test id %d unknown", test_id); + break; + } + +out: + if (error) + clib_error_report (error); + + return 0; +} + + + +int +main (int argc, char *argv[]) +{ + unformat_input_t i; + int r; + + unformat_init_command_line (&i, argv); + r = test_ssvm_fifo1 (&i); + unformat_free (&i); + return r; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/uri.am b/src/uri.am new file mode 100644 index 00000000..8cdd77c6 --- /dev/null +++ b/src/uri.am @@ -0,0 +1,22 @@ +# Copyright (c) 2016 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +noinst_PROGRAMS += uri_udp_test2 uri_tcp_test + +uri_udp_test2_SOURCES = uri/uri_udp_test2.c +uri_udp_test2_LDADD = libvlibmemoryclient.la libvlibapi.la libsvm.la \ + libvppinfra.la -lpthread -lm -lrt + +uri_tcp_test_SOURCES = uri/uri_tcp_test.c +uri_tcp_test_LDADD = libvlibmemoryclient.la libvlibapi.la libsvm.la \ + libvppinfra.la -lpthread -lm -lrt diff --git a/src/uri/uri_tcp_test.c b/src/uri/uri_tcp_test.c new file mode 100644 index 00000000..ed5a37d8 --- /dev/null +++ b/src/uri/uri_tcp_test.c @@ -0,0 +1,916 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "../vnet/session/application_interface.h" + +#define vl_typedefs /* define message structures */ +#include +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include +#undef vl_printfun + +/* Satisfy external references when not linking with -lvlib */ +vlib_main_t vlib_global_main; +vlib_main_t **vlib_mains; + +typedef struct +{ + svm_fifo_t * server_rx_fifo; + svm_fifo_t * server_tx_fifo; + + u32 vpp_session_index; + u32 vpp_session_thread; +} session_t; + +typedef enum +{ + STATE_START, + STATE_READY, + STATE_DISCONNECTING, + STATE_FAILED +} connection_state_t; + +typedef struct +{ + /* vpe input queue */ + unix_shared_memory_queue_t *vl_input_queue; + + /* API client handle */ + u32 my_client_index; + + /* The URI we're playing with */ + u8 * uri; + + /* Session pool */ + session_t * sessions; + + /* Hash table for disconnect processing */ + uword * session_index_by_vpp_handles; + + /* intermediate rx buffer */ + u8 * rx_buf; + + /* URI for slave's connect */ + u8 * connect_uri; + + u32 connected_session_index; + + int i_am_master; + + /* drop all packets */ + int drop_packets; + + /* Our event queue */ + unix_shared_memory_queue_t * our_event_queue; + + /* $$$ single thread only for the moment */ + unix_shared_memory_queue_t * vpp_event_queue; + + pid_t my_pid; + + /* For deadman timers */ + clib_time_t clib_time; + + /* State of the connection, shared between msg RX thread and main thread */ + volatile connection_state_t state; + + /* Signal variables */ + volatile int time_to_stop; + volatile int time_to_print_stats; + + u32 configured_segment_size; + + /* VNET_API_ERROR_FOO -> "Foo" hash table */ + uword * error_string_by_error_number; + + /* convenience */ + svm_fifo_segment_main_t * segment_main; + + u8 *connect_test_data; +} uri_tcp_test_main_t; + +uri_tcp_test_main_t uri_tcp_test_main; + +#if CLIB_DEBUG > 0 +#define NITER 10000 +#else +#define NITER 4000000 +#endif + +int +wait_for_state_change (uri_tcp_test_main_t * utm, connection_state_t state) +{ +#if CLIB_DEBUG > 0 +#define TIMEOUT 600.0 +#else +#define TIMEOUT 600.0 +#endif + + f64 timeout = clib_time_now (&utm->clib_time) + TIMEOUT; + + while (clib_time_now (&utm->clib_time) < timeout) + { + if (utm->state == state) + return 0; + if (utm->state == STATE_FAILED) + return -1; + } + clib_warning ("timeout waiting for STATE_READY"); + return -1; +} + +static void +init_error_string_table (uri_tcp_test_main_t * utm) +{ + utm->error_string_by_error_number = hash_create (0, sizeof (uword)); + +#define _(n,v,s) hash_set (utm->error_string_by_error_number, -v, s); + foreach_vnet_api_error; +#undef _ + + hash_set (utm->error_string_by_error_number, 99, "Misc"); +} + +static void +stop_signal (int signum) +{ + uri_tcp_test_main_t *um = &uri_tcp_test_main; + + um->time_to_stop = 1; +} + +static void +stats_signal (int signum) +{ + uri_tcp_test_main_t *um = &uri_tcp_test_main; + + um->time_to_print_stats = 1; +} + +static clib_error_t * +setup_signal_handlers (void) +{ + signal (SIGINT, stats_signal); + signal (SIGQUIT, stop_signal); + signal (SIGTERM, stop_signal); + + return 0; +} + +void +vlib_cli_output (struct vlib_main_t *vm, char *fmt, ...) +{ + clib_warning ("BUG"); +} + +int +connect_to_vpp (char *name) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + api_main_t *am = &api_main; + + if (vl_client_connect_to_vlib ("/vpe-api", name, 32) < 0) + return -1; + + utm->vl_input_queue = am->shmem_hdr->vl_input_queue; + utm->my_client_index = am->my_client_index; + + return 0; +} + +static void +vl_api_map_another_segment_t_handler (vl_api_map_another_segment_t *mp) +{ + svm_fifo_segment_create_args_t _a, *a = &_a; + int rv; + + a->segment_name = (char *) mp->segment_name; + a->segment_size = mp->segment_size; + /* Attach to the segment vpp created */ + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("svm_fifo_segment_attach ('%s') failed", + mp->segment_name); + return; + } + clib_warning ("Mapped new segment '%s' size %d", mp->segment_name, + mp->segment_size); +} + +static void +vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + session_t * session; + vl_api_disconnect_session_reply_t * rmp; + uword * p; + int rv = 0; + u64 key; + + key = (((u64)mp->session_thread_index) << 32) | (u64)mp->session_index; + + p = hash_get (utm->session_index_by_vpp_handles, key); + + if (p) + { + session = pool_elt_at_index (utm->sessions, p[0]); + hash_unset (utm->session_index_by_vpp_handles, key); + pool_put (utm->sessions, session); + } + else + { + clib_warning ("couldn't find session key %llx", key); + rv = -11; + } + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + + rmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION_REPLY); + rmp->retval = rv; + rmp->session_index = mp->session_index; + rmp->session_thread_index = mp->session_thread_index; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&rmp); +} + +static void +vl_api_reset_session_t_handler (vl_api_reset_session_t * mp) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + session_t * session; + vl_api_reset_session_reply_t * rmp; + uword * p; + int rv = 0; + u64 key; + + key = (((u64)mp->session_thread_index) << 32) | (u64)mp->session_index; + + p = hash_get(utm->session_index_by_vpp_handles, key); + + if (p) + { + session = pool_elt_at_index(utm->sessions, p[0]); + hash_unset(utm->session_index_by_vpp_handles, key); + pool_put(utm->sessions, session); + } + else + { + clib_warning("couldn't find session key %llx", key); + rv = -11; + } + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION_REPLY); + rmp->retval = rv; + rmp->session_index = mp->session_index; + rmp->session_thread_index = mp->session_thread_index; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&rmp); +} + +void +handle_fifo_event_connect_rx (uri_tcp_test_main_t *utm, session_fifo_event_t * e) +{ + svm_fifo_t * rx_fifo; + int n_read, bytes; + + rx_fifo = e->fifo; + + bytes = e->enqueue_length; + do + { + n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len(utm->rx_buf), + utm->rx_buf); + if (n_read > 0) + bytes -= n_read; + } + while (n_read < 0 || bytes > 0); + + // bytes_to_read = svm_fifo_max_dequeue (rx_fifo); + // + // bytes_to_read = vec_len(utm->rx_buf) > bytes_to_read ? + // bytes_to_read : vec_len(utm->rx_buf); + // + // buffer_offset = 0; + // while (bytes_to_read > 0) + // { + // rv = svm_fifo_dequeue_nowait2 (rx_fifo, mypid, + // bytes_to_read, + // utm->rx_buf + buffer_offset); + // if (rv > 0) + // { + // bytes_to_read -= rv; + // buffer_offset += rv; + // bytes_received += rv; + // } + // } + + + // while (bytes_received < bytes_sent) + // { + // rv = svm_fifo_dequeue_nowait2 (rx_fifo, mypid, + // vec_len (utm->rx_buf), + // utm->rx_buf); + // if (rv > 0) + // { + //#if CLIB_DEBUG > 0 + // int j; + // for (j = 0; j < rv; j++) + // { + // if (utm->rx_buf[j] != ((bytes_received + j) & 0xff)) + // { + // clib_warning ("error at byte %lld, 0x%x not 0x%x", + // bytes_received + j, + // utm->rx_buf[j], + // ((bytes_received + j )&0xff)); + // } + // } + //#endif + // bytes_received += (u64) rv; + // } + // } +} + +void +handle_connect_event_queue (uri_tcp_test_main_t * utm) +{ + session_fifo_event_t _e, *e = &_e;; + + unix_shared_memory_queue_sub (utm->our_event_queue, (u8 *) e, 0 /* nowait */); + switch (e->event_type) + { + case FIFO_EVENT_SERVER_RX: + handle_fifo_event_connect_rx (utm, e); + break; + + case FIFO_EVENT_SERVER_EXIT: + return; + + default: + clib_warning("unknown event type %d", e->event_type); + break; + } +} + +void +uri_tcp_connect_send (uri_tcp_test_main_t *utm) +{ + u8 *test_data = utm->connect_test_data; + u64 bytes_sent = 0; + int rv; + int mypid = getpid(); + session_t * session; + svm_fifo_t *tx_fifo; + int buffer_offset, bytes_to_send = 0; + session_fifo_event_t evt; + static int serial_number = 0; + int i; + u32 max_chunk = 64 << 10, write; + + session = pool_elt_at_index (utm->sessions, utm->connected_session_index); + tx_fifo = session->server_tx_fifo; + + vec_validate (utm->rx_buf, vec_len (test_data) - 1); + + for (i = 0; i < 10; i++) + { + bytes_to_send = vec_len (test_data); + buffer_offset = 0; + while (bytes_to_send > 0) + { + write = bytes_to_send > max_chunk ? max_chunk : bytes_to_send; + rv = svm_fifo_enqueue_nowait (tx_fifo, mypid, write, + test_data + buffer_offset); + + if (rv > 0) + { + bytes_to_send -= rv; + buffer_offset += rv; + bytes_sent += rv; + + /* Fabricate TX event, send to vpp */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + /* $$$$ for event logging */ + evt.enqueue_length = rv; + evt.event_id = serial_number++; + + unix_shared_memory_queue_add (utm->vpp_event_queue, (u8 *) &evt, + 0 /* do wait for mutex */); + } + } + } +} + +static void +uri_tcp_client_test (uri_tcp_test_main_t * utm) +{ + vl_api_connect_uri_t * cmp; + vl_api_disconnect_session_t *dmp; + session_t *connected_session; + int i; + + cmp = vl_msg_api_alloc (sizeof (*cmp)); + memset (cmp, 0, sizeof (*cmp)); + + cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI); + cmp->client_index = utm->my_client_index; + cmp->context = ntohl(0xfeedface); + memcpy (cmp->uri, utm->connect_uri, vec_len (utm->connect_uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&cmp); + + if (wait_for_state_change (utm, STATE_READY)) + { + return; + } + + /* Init test data */ + vec_validate (utm->connect_test_data, 64 * 1024 - 1); + for (i = 0; i < vec_len (utm->connect_test_data); i++) + utm->connect_test_data[i] = i & 0xff; + + /* Start reader thread */ + /* handle_connect_event_queue (utm); */ + + /* Start send */ + uri_tcp_connect_send (utm); + + /* Disconnect */ + connected_session = pool_elt_at_index(utm->sessions, + utm->connected_session_index); + dmp = vl_msg_api_alloc (sizeof (*dmp)); + memset (dmp, 0, sizeof (*dmp)); + dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); + dmp->client_index = utm->my_client_index; + dmp->session_index = connected_session->vpp_session_index; + dmp->session_thread_index = connected_session->vpp_session_thread; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&dmp); +} + +void +handle_fifo_event_server_rx (uri_tcp_test_main_t *utm, session_fifo_event_t * e) +{ + svm_fifo_t * rx_fifo, * tx_fifo; + int n_read; + + session_fifo_event_t evt; + unix_shared_memory_queue_t *q; + int rv, bytes; + + rx_fifo = e->fifo; + tx_fifo = utm->sessions[rx_fifo->client_session_index].server_tx_fifo; + + bytes = e->enqueue_length; + do + { + n_read = svm_fifo_dequeue_nowait (rx_fifo, 0, vec_len(utm->rx_buf), + utm->rx_buf); + + /* Reflect if a non-drop session */ + if (!utm->drop_packets && n_read > 0) + { + do + { + rv = svm_fifo_enqueue_nowait (tx_fifo, 0, n_read, utm->rx_buf); + } + while (rv == -2); + + /* Fabricate TX event, send to vpp */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + /* $$$$ for event logging */ + evt.enqueue_length = n_read; + evt.event_id = e->event_id; + q = utm->vpp_event_queue; + unix_shared_memory_queue_add (q, (u8 *) &evt, 0 /* do wait for mutex */); + } + + if (n_read > 0) + bytes -= n_read; + } + while (n_read < 0 || bytes > 0); +} + +void +handle_event_queue (uri_tcp_test_main_t * utm) +{ + session_fifo_event_t _e, *e = &_e;; + + while (1) + { + unix_shared_memory_queue_sub (utm->our_event_queue, (u8 *)e, + 0 /* nowait */); + switch (e->event_type) + { + case FIFO_EVENT_SERVER_RX: + handle_fifo_event_server_rx (utm, e); + break; + + case FIFO_EVENT_SERVER_EXIT: + return; + + default: + clib_warning ("unknown event type %d", e->event_type); + break; + } + if (PREDICT_FALSE(utm->time_to_stop == 1)) + break; + if (PREDICT_FALSE(utm->time_to_print_stats == 1)) + { + utm->time_to_print_stats = 0; + fformat(stdout, "%d connections\n", pool_elts (utm->sessions)); + } + } +} + +static void +vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + int rv; + + if (mp->retval) + { + clib_warning("bind failed: %d", mp->retval); + return; + } + + if (mp->segment_name_length == 0) + { + clib_warning("segment_name_length zero"); + return; + } + + a->segment_name = (char *) mp->segment_name; + a->segment_size = mp->segment_size; + + ASSERT(mp->server_event_queue_address); + + /* Attach to the segment vpp created */ + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning("svm_fifo_segment_attach ('%s') failed", mp->segment_name); + return; + } + + utm->our_event_queue = + (unix_shared_memory_queue_t *) mp->server_event_queue_address; + + utm->state = STATE_READY; +} + +static void +vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + session_t *session; + u32 session_index; + svm_fifo_t *rx_fifo, *tx_fifo; + int rv; + + if (mp->retval) + { + clib_warning ("connection failed with code: %d", mp->retval); + utm->state = STATE_FAILED; + return; + } + /* + * Attatch to segment + */ + + if (mp->segment_name_length == 0) + { + clib_warning ("segment_name_length zero"); + utm->state = STATE_FAILED; + return; + } + + a->segment_name = (char *) mp->segment_name; + a->segment_size = mp->segment_size; + + ASSERT(mp->client_event_queue_address); + + /* Attach to the segment vpp created */ + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("svm_fifo_segment_attach ('%s') failed", + mp->segment_name); + return; + } + + /* + * Save the queues + */ + + utm->our_event_queue = (unix_shared_memory_queue_t *) + mp->client_event_queue_address; + + utm->vpp_event_queue = (unix_shared_memory_queue_t *) + mp->vpp_event_queue_address; + + /* + * Setup session + */ + + pool_get (utm->sessions, session); + session_index = session - utm->sessions; + + rx_fifo = (svm_fifo_t *)mp->server_rx_fifo; + rx_fifo->client_session_index = session_index; + tx_fifo = (svm_fifo_t *)mp->server_tx_fifo; + tx_fifo->client_session_index = session_index; + + session->server_rx_fifo = rx_fifo; + session->server_tx_fifo = tx_fifo; + session->vpp_session_index = mp->session_index; + session->vpp_session_thread = mp->session_thread_index; + + /* Save handle */ + utm->connected_session_index = session_index; + + utm->state = STATE_READY; +} + +void +uri_tcp_bind (uri_tcp_test_main_t *utm) +{ + vl_api_bind_uri_t * bmp; + u32 fifo_size = 3 << 20; + bmp = vl_msg_api_alloc (sizeof (*bmp)); + memset (bmp, 0, sizeof (*bmp)); + + bmp->_vl_msg_id = ntohs (VL_API_BIND_URI); + bmp->client_index = utm->my_client_index; + bmp->context = ntohl(0xfeedface); + bmp->initial_segment_size = 256<<20; /* size of initial segment */ + bmp->options[SESSION_OPTIONS_FLAGS] = + SESSION_OPTIONS_FLAGS_USE_FIFO | SESSION_OPTIONS_FLAGS_ADD_SEGMENT; + bmp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = fifo_size; + bmp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = fifo_size; + bmp->options[SESSION_OPTIONS_ADD_SEGMENT_SIZE] = 128<<20; + memcpy (bmp->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&bmp); +} + +static void +vl_api_unbind_uri_reply_t_handler (vl_api_unbind_uri_reply_t *mp) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + + if (mp->retval != 0) + clib_warning ("returned %d", ntohl(mp->retval)); + + utm->state = STATE_START; +} + +void +uri_tcp_unbind (uri_tcp_test_main_t *utm) +{ + vl_api_unbind_uri_t * ump; + + ump = vl_msg_api_alloc (sizeof (*ump)); + memset (ump, 0, sizeof (*ump)); + + ump->_vl_msg_id = ntohs (VL_API_UNBIND_URI); + ump->client_index = utm->my_client_index; + memcpy (ump->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&ump); +} + +static void +vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + vl_api_accept_session_reply_t *rmp; + svm_fifo_t * rx_fifo, * tx_fifo; + session_t * session; + static f64 start_time; + u64 key; + u32 session_index; + + if (start_time == 0.0) + start_time = clib_time_now (&utm->clib_time); + + utm->vpp_event_queue = (unix_shared_memory_queue_t *) + mp->vpp_event_queue_address; + + /* Allocate local session and set it up */ + pool_get (utm->sessions, session); + session_index = session - utm->sessions; + + rx_fifo = (svm_fifo_t *)mp->server_rx_fifo; + rx_fifo->client_session_index = session_index; + tx_fifo = (svm_fifo_t *)mp->server_tx_fifo; + tx_fifo->client_session_index = session_index; + + session->server_rx_fifo = rx_fifo; + session->server_tx_fifo = tx_fifo; + + /* Add it to lookup table */ + key = (((u64)mp->session_thread_index) << 32) | (u64)mp->session_index; + hash_set (utm->session_index_by_vpp_handles, key, session_index); + + utm->state = STATE_READY; + + /* Stats printing */ + if (pool_elts (utm->sessions) && (pool_elts(utm->sessions) % 20000) == 0) + { + f64 now = clib_time_now (&utm->clib_time); + fformat (stdout, "%d active sessions in %.2f seconds, %.2f/sec...\n", + pool_elts(utm->sessions), now - start_time, + (f64)pool_elts(utm->sessions) / (now - start_time)); + } + + /* Send accept reply to vpp */ + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_ACCEPT_SESSION_REPLY); + rmp->session_type = mp->session_type; + rmp->session_index = mp->session_index; + rmp->session_thread_index = mp->session_thread_index; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *)&rmp); +} + +void +uri_tcp_server_test (uri_tcp_test_main_t * utm) +{ + + /* Bind to uri */ + uri_tcp_bind (utm); + + if (wait_for_state_change (utm, STATE_READY)) + { + clib_warning ("timeout waiting for STATE_READY"); + return; + } + + /* Enter handle event loop */ + handle_event_queue (utm); + + /* Cleanup */ + uri_tcp_unbind (utm); + + if (wait_for_state_change (utm, STATE_START)) + { + clib_warning ("timeout waiting for STATE_START"); + return; + } + + fformat (stdout, "Test complete...\n"); +} + +#define foreach_uri_msg \ +_(BIND_URI_REPLY, bind_uri_reply) \ +_(UNBIND_URI_REPLY, unbind_uri_reply) \ +_(ACCEPT_SESSION, accept_session) \ +_(CONNECT_URI_REPLY, connect_uri_reply) \ +_(DISCONNECT_SESSION, disconnect_session) \ +_(RESET_SESSION, reset_session) \ +_(MAP_ANOTHER_SEGMENT, map_another_segment) + +void +uri_api_hookup (uri_tcp_test_main_t * utm) +{ +#define _(N,n) \ + vl_msg_api_set_handlers(VL_API_##N, #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_uri_msg; +#undef _ +} + +int +main (int argc, char **argv) +{ + uri_tcp_test_main_t *utm = &uri_tcp_test_main; + unformat_input_t _argv, *a = &_argv; + u8 *chroot_prefix; + u8 *heap; + u8 * bind_name = (u8 *) "tcp://0.0.0.0/1234"; + u32 tmp; + mheap_t *h; + session_t * session; + int i; + int i_am_master = 1, drop_packets = 0; + + clib_mem_init (0, 256 << 20); + + heap = clib_mem_get_per_cpu_heap (); + h = mheap_header (heap); + + /* make the main heap thread-safe */ + h->flags |= MHEAP_FLAG_THREAD_SAFE; + + vec_validate (utm->rx_buf, 65536); + + utm->session_index_by_vpp_handles = + hash_create (0, sizeof(uword)); + + utm->my_pid = getpid(); + utm->configured_segment_size = 1<<20; + + clib_time_init (&utm->clib_time); + init_error_string_table (utm); + svm_fifo_segment_init(0x200000000ULL, 20); + unformat_init_command_line (a, argv); + + while (unformat_check_input (a) != UNFORMAT_END_OF_INPUT) + { + if (unformat (a, "chroot prefix %s", &chroot_prefix)) + { + vl_set_memory_root_path ((char *) chroot_prefix); + } + else if (unformat (a, "uri %s", &bind_name)) + ; + else if (unformat (a, "segment-size %dM", &tmp)) + utm->configured_segment_size = tmp<<20; + else if (unformat (a, "segment-size %dG", &tmp)) + utm->configured_segment_size = tmp<<30; + else if (unformat (a, "master")) + i_am_master = 1; + else if (unformat (a, "slave")) + i_am_master = 0; + else if (unformat (a, "drop")) + drop_packets = 1; + else + { + fformat (stderr, "%s: usage [master|slave]\n"); + exit (1); + } + } + + utm->uri = format (0, "%s%c", bind_name, 0); + utm->i_am_master = i_am_master; + utm->segment_main = &svm_fifo_segment_main; + utm->drop_packets = drop_packets; + + utm->connect_uri = format (0, "tcp://6.0.1.2/1234%c", 0); + + setup_signal_handlers(); + uri_api_hookup (utm); + + if (connect_to_vpp (i_am_master? "uri_tcp_server":"uri_tcp_client") < 0) + { + svm_region_exit (); + fformat (stderr, "Couldn't connect to vpe, exiting...\n"); + exit (1); + } + + if (i_am_master == 0) + { + uri_tcp_client_test (utm); + exit (0); + } + + /* $$$$ hack preallocation */ + for (i = 0; i < 200000; i++) + { + pool_get (utm->sessions, session); + memset (session, 0, sizeof (*session)); + } + for (i = 0; i < 200000; i++) + pool_put_index (utm->sessions, i); + + uri_tcp_server_test (utm); + + vl_client_disconnect_from_vlib (); + exit (0); +} diff --git a/src/uri/uri_udp_test.c b/src/uri/uri_udp_test.c new file mode 100644 index 00000000..6f5284c9 --- /dev/null +++ b/src/uri/uri_udp_test.c @@ -0,0 +1,553 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define vl_typedefs /* define message structures */ +#include +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include +#undef vl_printfun + +/* Satisfy external references when not linking with -lvlib */ +vlib_main_t vlib_global_main; +vlib_main_t **vlib_mains; + +typedef enum +{ + STATE_START, + STATE_READY, + STATE_DISCONNECTING, +} connection_state_t; + +typedef struct +{ + svm_fifo_t *server_rx_fifo; + svm_fifo_t *server_tx_fifo; +} session_t; + +typedef struct +{ + /* vpe input queue */ + unix_shared_memory_queue_t *vl_input_queue; + + /* API client handle */ + u32 my_client_index; + + /* The URI we're playing with */ + u8 *uri; + + /* Session pool */ + session_t *sessions; + + /* Hash table for disconnect processing */ + uword *session_index_by_vpp_handles; + + /* fifo segment */ + svm_fifo_segment_private_t *seg; + + /* intermediate rx buffer */ + u8 *rx_buf; + + /* Our event queue */ + unix_shared_memory_queue_t *our_event_queue; + + /* $$$ single thread only for the moment */ + unix_shared_memory_queue_t *vpp_event_queue; + + /* For deadman timers */ + clib_time_t clib_time; + + /* State of the connection, shared between msg RX thread and main thread */ + volatile connection_state_t state; + + volatile int time_to_stop; + volatile int time_to_print_stats; + + /* VNET_API_ERROR_FOO -> "Foo" hash table */ + uword *error_string_by_error_number; +} uri_udp_test_main_t; + +#if CLIB_DEBUG > 0 +#define NITER 1000 +#else +#define NITER 1000000 +#endif + +uri_udp_test_main_t uri_udp_test_main; + +static void +stop_signal (int signum) +{ + uri_udp_test_main_t *um = &uri_udp_test_main; + + um->time_to_stop = 1; +} + +static void +stats_signal (int signum) +{ + uri_udp_test_main_t *um = &uri_udp_test_main; + + um->time_to_print_stats = 1; +} + +static clib_error_t * +setup_signal_handlers (void) +{ + signal (SIGINT, stats_signal); + signal (SIGQUIT, stop_signal); + signal (SIGTERM, stop_signal); + + return 0; +} + +u8 * +format_api_error (u8 * s, va_list * args) +{ + uri_udp_test_main_t *utm = va_arg (*args, uri_udp_test_main_t *); + i32 error = va_arg (*args, u32); + uword *p; + + p = hash_get (utm->error_string_by_error_number, -error); + + if (p) + s = format (s, "%s", p[0]); + else + s = format (s, "%d", error); + return s; +} + +int +wait_for_state_change (uri_udp_test_main_t * utm, connection_state_t state) +{ + f64 timeout = clib_time_now (&utm->clib_time) + 5.0; + + while (clib_time_now (&utm->clib_time) < timeout) + { + if (utm->state == state) + return 0; + } + return -1; +} + +static void +vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + int rv; + + if (mp->segment_name_length == 0) + { + clib_warning ("segment_name_length zero"); + return; + } + + a->segment_name = (char *) mp->segment_name; + + /* Attach to the segment vpp created */ + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("sm_fifo_segment_create ('%s') failed", mp->segment_name); + return; + } + + utm->our_event_queue = (unix_shared_memory_queue_t *) + mp->server_event_queue_address; + + utm->state = STATE_READY; +} + +static void +vl_api_unbind_uri_reply_t_handler (vl_api_unbind_uri_reply_t * mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + + if (mp->retval != 0) + clib_warning ("returned %d", ntohl (mp->retval)); + + utm->state = STATE_START; +} + +static void +vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + vl_api_accept_session_reply_t *rmp; + svm_fifo_t *rx_fifo, *tx_fifo; + session_t *session; + static f64 start_time; + u64 key; + + if (start_time == 0.0) + start_time = clib_time_now (&utm->clib_time); + + utm->vpp_event_queue = (unix_shared_memory_queue_t *) + mp->vpp_event_queue_address; + + pool_get (utm->sessions, session); + + rx_fifo = (svm_fifo_t *) mp->server_rx_fifo; + rx_fifo->client_session_index = session - utm->sessions; + tx_fifo = (svm_fifo_t *) mp->server_tx_fifo; + tx_fifo->client_session_index = session - utm->sessions; + + session->server_rx_fifo = rx_fifo; + session->server_tx_fifo = tx_fifo; + + key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; + + hash_set (utm->session_index_by_vpp_handles, key, session - utm->sessions); + + utm->state = STATE_READY; + + if (pool_elts (utm->sessions) && (pool_elts (utm->sessions) % 20000) == 0) + { + f64 now = clib_time_now (&utm->clib_time); + fformat (stdout, "%d active sessions in %.2f seconds, %.2f/sec...\n", + pool_elts (utm->sessions), now - start_time, + (f64) pool_elts (utm->sessions) / (now - start_time)); + } + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_ACCEPT_SESSION_REPLY); + rmp->session_type = mp->session_type; + rmp->session_index = mp->session_index; + rmp->session_thread_index = mp->session_thread_index; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); +} + +static void +vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + session_t *session; + vl_api_disconnect_session_reply_t *rmp; + uword *p; + int rv = 0; + u64 key; + + key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; + + p = hash_get (utm->session_index_by_vpp_handles, key); + + if (p) + { + session = pool_elt_at_index (utm->sessions, p[0]); + hash_unset (utm->session_index_by_vpp_handles, key); + pool_put (utm->sessions, session); + } + else + { + clib_warning ("couldn't find session key %llx", key); + rv = -11; + } + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION_REPLY); + rmp->retval = rv; + rmp->session_index = mp->session_index; + rmp->session_thread_index = mp->session_thread_index; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); +} + +#define foreach_uri_msg \ +_(BIND_URI_REPLY, bind_uri_reply) \ +_(UNBIND_URI_REPLY, unbind_uri_reply) \ +_(ACCEPT_SESSION, accept_session) \ +_(DISCONNECT_SESSION, disconnect_session) + +void +uri_api_hookup (uri_udp_test_main_t * utm) +{ +#define _(N,n) \ + vl_msg_api_set_handlers(VL_API_##N, #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_uri_msg; +#undef _ + +} + + +int +connect_to_vpp (char *name) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + api_main_t *am = &api_main; + + if (vl_client_connect_to_vlib ("/vpe-api", name, 32) < 0) + return -1; + + utm->vl_input_queue = am->shmem_hdr->vl_input_queue; + utm->my_client_index = am->my_client_index; + + return 0; +} + +void +vlib_cli_output (struct vlib_main_t *vm, char *fmt, ...) +{ + clib_warning ("BUG"); +} + +static void +init_error_string_table (uri_udp_test_main_t * utm) +{ + utm->error_string_by_error_number = hash_create (0, sizeof (uword)); + +#define _(n,v,s) hash_set (utm->error_string_by_error_number, -v, s); + foreach_vnet_api_error; +#undef _ + + hash_set (utm->error_string_by_error_number, 99, "Misc"); +} + +void +handle_fifo_event_server_rx (uri_udp_test_main_t * utm, + session_fifo_event_t * e) +{ + svm_fifo_t *rx_fifo, *tx_fifo; + int nbytes; + + session_fifo_event_t evt; + unix_shared_memory_queue_t *q; + int rv; + + rx_fifo = e->fifo; + tx_fifo = utm->sessions[rx_fifo->client_session_index].server_tx_fifo; + + do + { + nbytes = svm_fifo_dequeue_nowait (rx_fifo, 0, + vec_len (utm->rx_buf), utm->rx_buf); + } + while (nbytes <= 0); + do + { + rv = svm_fifo_enqueue_nowait (tx_fifo, 0, nbytes, utm->rx_buf); + } + while (rv == -2); + + /* Fabricate TX event, send to vpp */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + /* $$$$ for event logging */ + evt.enqueue_length = nbytes; + evt.event_id = e->event_id; + q = utm->vpp_event_queue; + unix_shared_memory_queue_add (q, (u8 *) & evt, 0 /* do wait for mutex */ ); +} + +void +handle_event_queue (uri_udp_test_main_t * utm) +{ + session_fifo_event_t _e, *e = &_e;; + + while (1) + { + unix_shared_memory_queue_sub (utm->our_event_queue, (u8 *) e, + 0 /* nowait */ ); + switch (e->event_type) + { + case FIFO_EVENT_SERVER_RX: + handle_fifo_event_server_rx (utm, e); + break; + + case FIFO_EVENT_SERVER_EXIT: + return; + + default: + clib_warning ("unknown event type %d", e->event_type); + break; + } + if (PREDICT_FALSE (utm->time_to_stop == 1)) + break; + if (PREDICT_FALSE (utm->time_to_print_stats == 1)) + { + utm->time_to_print_stats = 0; + fformat (stdout, "%d connections\n", pool_elts (utm->sessions)); + } + } +} + +void +uri_udp_test (uri_udp_test_main_t * utm) +{ + vl_api_bind_uri_t *bmp; + vl_api_unbind_uri_t *ump; + + bmp = vl_msg_api_alloc (sizeof (*bmp)); + memset (bmp, 0, sizeof (*bmp)); + + bmp->_vl_msg_id = ntohs (VL_API_BIND_URI); + bmp->client_index = utm->my_client_index; + bmp->context = ntohl (0xfeedface); + bmp->segment_size = 2 << 30; + memcpy (bmp->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); + + if (wait_for_state_change (utm, STATE_READY)) + { + clib_warning ("timeout waiting for STATE_READY"); + return; + } + + handle_event_queue (utm); + + ump = vl_msg_api_alloc (sizeof (*ump)); + memset (ump, 0, sizeof (*ump)); + + ump->_vl_msg_id = ntohs (VL_API_UNBIND_URI); + ump->client_index = utm->my_client_index; + memcpy (ump->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & ump); + + if (wait_for_state_change (utm, STATE_START)) + { + clib_warning ("timeout waiting for STATE_START"); + return; + } + + fformat (stdout, "Test complete...\n"); +} + +int +main (int argc, char **argv) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + unformat_input_t _argv, *a = &_argv; + u8 *chroot_prefix; + u8 *heap; + u8 *bind_name = (u8 *) "udp4:1234"; + mheap_t *h; + session_t *session; + int i; + + clib_mem_init (0, 256 << 20); + + heap = clib_mem_get_per_cpu_heap (); + h = mheap_header (heap); + + /* make the main heap thread-safe */ + h->flags |= MHEAP_FLAG_THREAD_SAFE; + + vec_validate (utm->rx_buf, 8192); + + utm->session_index_by_vpp_handles = hash_create (0, sizeof (uword)); + + clib_time_init (&utm->clib_time); + init_error_string_table (utm); + svm_fifo_segment_init (0x200000000ULL, 20); + unformat_init_command_line (a, argv); + + while (unformat_check_input (a) != UNFORMAT_END_OF_INPUT) + { + if (unformat (a, "chroot prefix %s", &chroot_prefix)) + { + vl_set_memory_root_path ((char *) chroot_prefix); + } + else if (unformat (a, "uri %s", &bind_name)) + ; + else + { + fformat (stderr, "%s: usage [master|slave]\n"); + exit (1); + } + } + + utm->uri = format (0, "%s%c", bind_name, 0); + + setup_signal_handlers (); + + uri_api_hookup (utm); + + if (connect_to_vpp ("uri_udp_test") < 0) + { + svm_region_exit (); + fformat (stderr, "Couldn't connect to vpe, exiting...\n"); + exit (1); + } + + /* $$$$ hack preallocation */ + for (i = 0; i < 200000; i++) + { + pool_get (utm->sessions, session); + memset (session, 0, sizeof (*session)); + } + for (i = 0; i < 200000; i++) + pool_put_index (utm->sessions, i); + + uri_udp_test (utm); + + vl_client_disconnect_from_vlib (); + exit (0); +} + +#undef vl_api_version +#define vl_api_version(n,v) static u32 vpe_api_version = v; +#include +#undef vl_api_version + +void +vl_client_add_api_signatures (vl_api_memclnt_create_t * mp) +{ + /* + * Send the main API signature in slot 0. This bit of code must + * match the checks in ../vpe/api/api.c: vl_msg_api_version_check(). + */ + mp->api_versions[0] = clib_host_to_net_u32 (vpe_api_version); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/uri/uri_udp_test2.c b/src/uri/uri_udp_test2.c new file mode 100644 index 00000000..ddfffaa6 --- /dev/null +++ b/src/uri/uri_udp_test2.c @@ -0,0 +1,954 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../vnet/session/application_interface.h" + +#define vl_typedefs /* define message structures */ +#include +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include +#undef vl_printfun + +/* Satisfy external references when not linking with -lvlib */ +vlib_main_t vlib_global_main; +vlib_main_t **vlib_mains; + +typedef enum +{ + STATE_START, + STATE_READY, + STATE_DISCONNECTING, +} connection_state_t; + +typedef struct +{ + svm_fifo_t *server_rx_fifo; + svm_fifo_t *server_tx_fifo; +} session_t; + +typedef struct +{ + /* vpe input queue */ + unix_shared_memory_queue_t *vl_input_queue; + + /* API client handle */ + u32 my_client_index; + + /* The URI we're playing with */ + u8 *uri; + + /* Session pool */ + session_t *sessions; + + /* Hash table for disconnect processing */ + uword *session_index_by_vpp_handles; + + /* fifo segment */ + svm_fifo_segment_private_t *seg; + + /* intermediate rx buffer */ + u8 *rx_buf; + + /* URI for connect */ + u8 *connect_uri; + + int i_am_master; + + /* Our event queue */ + unix_shared_memory_queue_t *our_event_queue; + + /* $$$ single thread only for the moment */ + unix_shared_memory_queue_t *vpp_event_queue; + + /* $$$$ hack: cut-through session index */ + volatile u32 cut_through_session_index; + + /* unique segment name counter */ + u32 unique_segment_index; + + pid_t my_pid; + + /* pthread handle */ + pthread_t cut_through_thread_handle; + + /* For deadman timers */ + clib_time_t clib_time; + + /* State of the connection, shared between msg RX thread and main thread */ + volatile connection_state_t state; + + volatile int time_to_stop; + volatile int time_to_print_stats; + + u32 configured_segment_size; + + /* VNET_API_ERROR_FOO -> "Foo" hash table */ + uword *error_string_by_error_number; + + /* convenience */ + svm_fifo_segment_main_t *segment_main; + +} uri_udp_test_main_t; + +#if CLIB_DEBUG > 0 +#define NITER 10000 +#else +#define NITER 4000000 +#endif + +uri_udp_test_main_t uri_udp_test_main; + +static void +stop_signal (int signum) +{ + uri_udp_test_main_t *um = &uri_udp_test_main; + + um->time_to_stop = 1; +} + +static void +stats_signal (int signum) +{ + uri_udp_test_main_t *um = &uri_udp_test_main; + + um->time_to_print_stats = 1; +} + +static clib_error_t * +setup_signal_handlers (void) +{ + signal (SIGINT, stats_signal); + signal (SIGQUIT, stop_signal); + signal (SIGTERM, stop_signal); + + return 0; +} + +u8 * +format_api_error (u8 * s, va_list * args) +{ + uri_udp_test_main_t *utm = va_arg (*args, uri_udp_test_main_t *); + i32 error = va_arg (*args, u32); + uword *p; + + p = hash_get (utm->error_string_by_error_number, -error); + + if (p) + s = format (s, "%s", p[0]); + else + s = format (s, "%d", error); + return s; +} + +int +wait_for_state_change (uri_udp_test_main_t * utm, connection_state_t state) +{ +#if CLIB_DEBUG > 0 +#define TIMEOUT 600.0 +#else +#define TIMEOUT 600.0 +#endif + + f64 timeout = clib_time_now (&utm->clib_time) + TIMEOUT; + + while (clib_time_now (&utm->clib_time) < timeout) + { + if (utm->state == state) + return 0; + } + return -1; +} + +u64 server_bytes_received, server_bytes_sent; + +static void * +cut_through_thread_fn (void *arg) +{ + session_t *s; + svm_fifo_t *rx_fifo; + svm_fifo_t *tx_fifo; + u8 *my_copy_buffer = 0; + uri_udp_test_main_t *utm = &uri_udp_test_main; + i32 actual_transfer; + int rv; + u32 buffer_offset; + + while (utm->cut_through_session_index == ~0) + ; + + s = pool_elt_at_index (utm->sessions, utm->cut_through_session_index); + + rx_fifo = s->server_rx_fifo; + tx_fifo = s->server_tx_fifo; + + vec_validate (my_copy_buffer, 64 * 1024 - 1); + + while (true) + { + /* We read from the tx fifo and write to the rx fifo */ + do + { + actual_transfer = svm_fifo_dequeue_nowait (tx_fifo, 0, + vec_len (my_copy_buffer), + my_copy_buffer); + } + while (actual_transfer <= 0); + + server_bytes_received += actual_transfer; + + buffer_offset = 0; + while (actual_transfer > 0) + { + rv = svm_fifo_enqueue_nowait (rx_fifo, 0, actual_transfer, + my_copy_buffer + buffer_offset); + if (rv > 0) + { + actual_transfer -= rv; + buffer_offset += rv; + server_bytes_sent += rv; + } + + } + if (PREDICT_FALSE (utm->time_to_stop)) + break; + } + + pthread_exit (0); +} + +static void +uri_udp_slave_test (uri_udp_test_main_t * utm) +{ + vl_api_connect_uri_t *cmp; + int i; + u8 *test_data = 0; + u64 bytes_received = 0, bytes_sent = 0; + i32 bytes_to_read; + int rv; + int mypid = getpid (); + f64 before, after, delta, bytes_per_second; + session_t *session; + svm_fifo_t *rx_fifo, *tx_fifo; + int buffer_offset, bytes_to_send = 0; + + vec_validate (test_data, 64 * 1024 - 1); + for (i = 0; i < vec_len (test_data); i++) + test_data[i] = i & 0xff; + + cmp = vl_msg_api_alloc (sizeof (*cmp)); + memset (cmp, 0, sizeof (*cmp)); + + cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI); + cmp->client_index = utm->my_client_index; + cmp->context = ntohl (0xfeedface); + memcpy (cmp->uri, utm->connect_uri, vec_len (utm->connect_uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & cmp); + + if (wait_for_state_change (utm, STATE_READY)) + { + clib_warning ("timeout waiting for STATE_READY"); + return; + } + + session = pool_elt_at_index (utm->sessions, utm->cut_through_session_index); + rx_fifo = session->server_rx_fifo; + tx_fifo = session->server_tx_fifo; + + before = clib_time_now (&utm->clib_time); + + vec_validate (utm->rx_buf, vec_len (test_data) - 1); + + for (i = 0; i < NITER; i++) + { + bytes_to_send = vec_len (test_data); + buffer_offset = 0; + while (bytes_to_send > 0) + { + rv = svm_fifo_enqueue_nowait (tx_fifo, mypid, + bytes_to_send, + test_data + buffer_offset); + + if (rv > 0) + { + bytes_to_send -= rv; + buffer_offset += rv; + bytes_sent += rv; + } + } + + bytes_to_read = svm_fifo_max_dequeue (rx_fifo); + + bytes_to_read = vec_len (utm->rx_buf) > bytes_to_read ? + bytes_to_read : vec_len (utm->rx_buf); + + buffer_offset = 0; + while (bytes_to_read > 0) + { + rv = svm_fifo_dequeue_nowait (rx_fifo, mypid, + bytes_to_read, + utm->rx_buf + buffer_offset); + if (rv > 0) + { + bytes_to_read -= rv; + buffer_offset += rv; + bytes_received += rv; + } + } + } + while (bytes_received < bytes_sent) + { + rv = svm_fifo_dequeue_nowait (rx_fifo, mypid, + vec_len (utm->rx_buf), utm->rx_buf); + if (rv > 0) + { +#if CLIB_DEBUG > 0 + int j; + for (j = 0; j < rv; j++) + { + if (utm->rx_buf[j] != ((bytes_received + j) & 0xff)) + { + clib_warning ("error at byte %lld, 0x%x not 0x%x", + bytes_received + j, + utm->rx_buf[j], + ((bytes_received + j) & 0xff)); + } + } +#endif + bytes_received += (u64) rv; + } + } + + after = clib_time_now (&utm->clib_time); + delta = after - before; + bytes_per_second = 0.0; + + if (delta > 0.0) + bytes_per_second = (f64) bytes_received / delta; + + fformat (stdout, + "Done: %lld recv bytes in %.2f seconds, %.2f bytes/sec...\n\n", + bytes_received, delta, bytes_per_second); + fformat (stdout, + "Done: %lld sent bytes in %.2f seconds, %.2f bytes/sec...\n\n", + bytes_sent, delta, bytes_per_second); + fformat (stdout, + "client -> server -> client round trip: %.2f Gbit/sec \n\n", + (bytes_per_second * 8.0) / 1e9); +} + +static void +vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + int rv; + + if (mp->segment_name_length == 0) + { + clib_warning ("segment_name_length zero"); + return; + } + + a->segment_name = (char *) mp->segment_name; + a->segment_size = mp->segment_size; + + ASSERT (mp->server_event_queue_address); + + /* Attach to the segment vpp created */ + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("svm_fifo_segment_attach ('%s') failed", + mp->segment_name); + return; + } + + utm->our_event_queue = (unix_shared_memory_queue_t *) + mp->server_event_queue_address; + + utm->state = STATE_READY; +} + +static void +vl_api_map_another_segment_t_handler (vl_api_map_another_segment_t * mp) +{ + svm_fifo_segment_create_args_t _a, *a = &_a; + int rv; + + a->segment_name = (char *) mp->segment_name; + a->segment_size = mp->segment_size; + /* Attach to the segment vpp created */ + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("svm_fifo_segment_attach ('%s') failed", + mp->segment_name); + return; + } + clib_warning ("Mapped new segment '%s' size %d", mp->segment_name, + mp->segment_size); +} + +static void +vl_api_connect_uri_t_handler (vl_api_connect_uri_t * mp) +{ + u32 segment_index; + uri_udp_test_main_t *utm = &uri_udp_test_main; + svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + svm_fifo_segment_private_t *seg; + unix_shared_memory_queue_t *client_q; + vl_api_connect_uri_reply_t *rmp; + session_t *session; + int rv = 0; + + /* Create the segment */ + a->segment_name = (char *) format (0, "%d:segment%d%c", utm->my_pid, + utm->unique_segment_index++, 0); + a->segment_size = utm->configured_segment_size; + + rv = svm_fifo_segment_create (a); + if (rv) + { + clib_warning ("sm_fifo_segment_create ('%s') failed", a->segment_name); + rv = VNET_API_ERROR_URI_FIFO_CREATE_FAILED; + goto send_reply; + } + + vec_add2 (utm->seg, seg, 1); + + segment_index = vec_len (sm->segments) - 1; + + memcpy (seg, sm->segments + segment_index, sizeof (utm->seg[0])); + + pool_get (utm->sessions, session); + + /* + * By construction the master's idea of the rx fifo ends up in + * fsh->fifos[0], and the master's idea of the tx fifo ends up in + * fsh->fifos[1]. + */ + session->server_rx_fifo = svm_fifo_segment_alloc_fifo (utm->seg, + 128 * 1024); + ASSERT (session->server_rx_fifo); + + session->server_tx_fifo = svm_fifo_segment_alloc_fifo (utm->seg, + 128 * 1024); + ASSERT (session->server_tx_fifo); + + session->server_rx_fifo->server_session_index = session - utm->sessions; + session->server_tx_fifo->server_session_index = session - utm->sessions; + utm->cut_through_session_index = session - utm->sessions; + + rv = pthread_create (&utm->cut_through_thread_handle, + NULL /*attr */ , cut_through_thread_fn, 0); + if (rv) + { + clib_warning ("pthread_create returned %d", rv); + rv = VNET_API_ERROR_SYSCALL_ERROR_1; + } + +send_reply: + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + + rmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI_REPLY); + rmp->context = mp->context; + rmp->retval = ntohl (rv); + rmp->segment_name_length = vec_len (a->segment_name); + memcpy (rmp->segment_name, a->segment_name, vec_len (a->segment_name)); + + vec_free (a->segment_name); + + client_q = (unix_shared_memory_queue_t *) mp->client_queue_address; + vl_msg_api_send_shmem (client_q, (u8 *) & rmp); +} + +static void +vl_api_unbind_uri_reply_t_handler (vl_api_unbind_uri_reply_t * mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + + if (mp->retval != 0) + clib_warning ("returned %d", ntohl (mp->retval)); + + utm->state = STATE_START; +} + +static void +vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + vl_api_accept_session_reply_t *rmp; + svm_fifo_t *rx_fifo, *tx_fifo; + session_t *session; + static f64 start_time; + u64 key; + + if (start_time == 0.0) + start_time = clib_time_now (&utm->clib_time); + + utm->vpp_event_queue = (unix_shared_memory_queue_t *) + mp->vpp_event_queue_address; + + pool_get (utm->sessions, session); + + rx_fifo = (svm_fifo_t *) mp->server_rx_fifo; + rx_fifo->client_session_index = session - utm->sessions; + tx_fifo = (svm_fifo_t *) mp->server_tx_fifo; + tx_fifo->client_session_index = session - utm->sessions; + + session->server_rx_fifo = rx_fifo; + session->server_tx_fifo = tx_fifo; + + key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; + + hash_set (utm->session_index_by_vpp_handles, key, session - utm->sessions); + + utm->state = STATE_READY; + + if (pool_elts (utm->sessions) && (pool_elts (utm->sessions) % 20000) == 0) + { + f64 now = clib_time_now (&utm->clib_time); + fformat (stdout, "%d active sessions in %.2f seconds, %.2f/sec...\n", + pool_elts (utm->sessions), now - start_time, + (f64) pool_elts (utm->sessions) / (now - start_time)); + } + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_ACCEPT_SESSION_REPLY); + rmp->session_type = mp->session_type; + rmp->session_index = mp->session_index; + rmp->session_thread_index = mp->session_thread_index; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); +} + +static void +vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + session_t *session; + vl_api_disconnect_session_reply_t *rmp; + uword *p; + int rv = 0; + u64 key; + + key = (((u64) mp->session_thread_index) << 32) | (u64) mp->session_index; + + p = hash_get (utm->session_index_by_vpp_handles, key); + + if (p) + { + session = pool_elt_at_index (utm->sessions, p[0]); + hash_unset (utm->session_index_by_vpp_handles, key); + pool_put (utm->sessions, session); + } + else + { + clib_warning ("couldn't find session key %llx", key); + rv = -11; + } + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION_REPLY); + rmp->retval = rv; + rmp->session_index = mp->session_index; + rmp->session_thread_index = mp->session_thread_index; + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & rmp); +} + +static void +vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) +{ + svm_fifo_segment_main_t *sm = &svm_fifo_segment_main; + uri_udp_test_main_t *utm = &uri_udp_test_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + ssvm_shared_header_t *sh; + svm_fifo_segment_private_t *seg; + svm_fifo_segment_header_t *fsh; + session_t *session; + u32 segment_index; + int rv; + + ASSERT (utm->i_am_master == 0); + + if (mp->segment_name_length == 0) + { + clib_warning ("segment_name_length zero"); + return; + } + + memset (a, 0, sizeof (*a)); + + a->segment_name = (char *) mp->segment_name; + + sleep (1); + + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("sm_fifo_segment_create ('%v') failed", mp->segment_name); + return; + } + + segment_index = vec_len (sm->segments) - 1; + + vec_add2 (utm->seg, seg, 1); + + memcpy (seg, sm->segments + segment_index, sizeof (*seg)); + sh = seg->ssvm.sh; + fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; + + while (vec_len (fsh->fifos) < 2) + sleep (1); + + pool_get (utm->sessions, session); + utm->cut_through_session_index = session - utm->sessions; + + session->server_rx_fifo = (svm_fifo_t *) fsh->fifos[0]; + ASSERT (session->server_rx_fifo); + session->server_tx_fifo = (svm_fifo_t *) fsh->fifos[1]; + ASSERT (session->server_tx_fifo); + + /* security: could unlink /dev/shm/segment_name> here, maybe */ + + utm->state = STATE_READY; +} + +#define foreach_uri_msg \ +_(BIND_URI_REPLY, bind_uri_reply) \ +_(CONNECT_URI, connect_uri) \ +_(CONNECT_URI_REPLY, connect_uri_reply) \ +_(UNBIND_URI_REPLY, unbind_uri_reply) \ +_(ACCEPT_SESSION, accept_session) \ +_(DISCONNECT_SESSION, disconnect_session) \ +_(MAP_ANOTHER_SEGMENT, map_another_segment) + +void +uri_api_hookup (uri_udp_test_main_t * utm) +{ +#define _(N,n) \ + vl_msg_api_set_handlers(VL_API_##N, #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_uri_msg; +#undef _ + +} + + +int +connect_to_vpp (char *name) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + api_main_t *am = &api_main; + + if (vl_client_connect_to_vlib ("/vpe-api", name, 32) < 0) + return -1; + + utm->vl_input_queue = am->shmem_hdr->vl_input_queue; + utm->my_client_index = am->my_client_index; + + return 0; +} + +void +vlib_cli_output (struct vlib_main_t *vm, char *fmt, ...) +{ + clib_warning ("BUG"); +} + +static void +init_error_string_table (uri_udp_test_main_t * utm) +{ + utm->error_string_by_error_number = hash_create (0, sizeof (uword)); + +#define _(n,v,s) hash_set (utm->error_string_by_error_number, -v, s); + foreach_vnet_api_error; +#undef _ + + hash_set (utm->error_string_by_error_number, 99, "Misc"); +} + +void +handle_fifo_event_server_rx (uri_udp_test_main_t * utm, + session_fifo_event_t * e) +{ + svm_fifo_t *rx_fifo, *tx_fifo; + int nbytes; + + session_fifo_event_t evt; + unix_shared_memory_queue_t *q; + int rv; + + rx_fifo = e->fifo; + tx_fifo = utm->sessions[rx_fifo->client_session_index].server_tx_fifo; + + do + { + nbytes = svm_fifo_dequeue_nowait (rx_fifo, 0, + vec_len (utm->rx_buf), utm->rx_buf); + } + while (nbytes <= 0); + do + { + rv = svm_fifo_enqueue_nowait (tx_fifo, 0, nbytes, utm->rx_buf); + } + while (rv == -2); + + /* Fabricate TX event, send to vpp */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + /* $$$$ for event logging */ + evt.enqueue_length = nbytes; + evt.event_id = e->event_id; + q = utm->vpp_event_queue; + unix_shared_memory_queue_add (q, (u8 *) & evt, 0 /* do wait for mutex */ ); +} + +void +handle_event_queue (uri_udp_test_main_t * utm) +{ + session_fifo_event_t _e, *e = &_e;; + + while (1) + { + unix_shared_memory_queue_sub (utm->our_event_queue, (u8 *) e, + 0 /* nowait */ ); + switch (e->event_type) + { + case FIFO_EVENT_SERVER_RX: + handle_fifo_event_server_rx (utm, e); + break; + + case FIFO_EVENT_SERVER_EXIT: + return; + + default: + clib_warning ("unknown event type %d", e->event_type); + break; + } + if (PREDICT_FALSE (utm->time_to_stop == 1)) + break; + if (PREDICT_FALSE (utm->time_to_print_stats == 1)) + { + utm->time_to_print_stats = 0; + fformat (stdout, "%d connections\n", pool_elts (utm->sessions)); + } + } +} + +void +uri_udp_test (uri_udp_test_main_t * utm) +{ + vl_api_bind_uri_t *bmp; + vl_api_unbind_uri_t *ump; + + bmp = vl_msg_api_alloc (sizeof (*bmp)); + memset (bmp, 0, sizeof (*bmp)); + + bmp->_vl_msg_id = ntohs (VL_API_BIND_URI); + bmp->client_index = utm->my_client_index; + bmp->context = ntohl (0xfeedface); + bmp->initial_segment_size = 256 << 20; /* size of initial segment */ + bmp->options[SESSION_OPTIONS_FLAGS] = + SESSION_OPTIONS_FLAGS_USE_FIFO | SESSION_OPTIONS_FLAGS_ADD_SEGMENT; + bmp->options[SESSION_OPTIONS_RX_FIFO_SIZE] = 16 << 10; + bmp->options[SESSION_OPTIONS_TX_FIFO_SIZE] = 16 << 10; + bmp->options[SESSION_OPTIONS_ADD_SEGMENT_SIZE] = 128 << 20; + memcpy (bmp->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); + + if (wait_for_state_change (utm, STATE_READY)) + { + clib_warning ("timeout waiting for STATE_READY"); + return; + } + + handle_event_queue (utm); + + ump = vl_msg_api_alloc (sizeof (*ump)); + memset (ump, 0, sizeof (*ump)); + + ump->_vl_msg_id = ntohs (VL_API_UNBIND_URI); + ump->client_index = utm->my_client_index; + memcpy (ump->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & ump); + + if (wait_for_state_change (utm, STATE_START)) + { + clib_warning ("timeout waiting for STATE_START"); + return; + } + + fformat (stdout, "Test complete...\n"); +} + +int +main (int argc, char **argv) +{ + uri_udp_test_main_t *utm = &uri_udp_test_main; + unformat_input_t _argv, *a = &_argv; + u8 *chroot_prefix; + u8 *heap; + u8 *bind_name = (u8 *) "udp://0.0.0.0/1234"; + u32 tmp; + mheap_t *h; + session_t *session; + int i; + int i_am_master = 1; + + clib_mem_init (0, 256 << 20); + + heap = clib_mem_get_per_cpu_heap (); + h = mheap_header (heap); + + /* make the main heap thread-safe */ + h->flags |= MHEAP_FLAG_THREAD_SAFE; + + vec_validate (utm->rx_buf, 8192); + + utm->session_index_by_vpp_handles = hash_create (0, sizeof (uword)); + + utm->my_pid = getpid (); + utm->configured_segment_size = 1 << 20; + + clib_time_init (&utm->clib_time); + init_error_string_table (utm); + svm_fifo_segment_init (0x200000000ULL, 20); + unformat_init_command_line (a, argv); + + while (unformat_check_input (a) != UNFORMAT_END_OF_INPUT) + { + if (unformat (a, "chroot prefix %s", &chroot_prefix)) + { + vl_set_memory_root_path ((char *) chroot_prefix); + } + else if (unformat (a, "uri %s", &bind_name)) + ; + else if (unformat (a, "segment-size %dM", &tmp)) + utm->configured_segment_size = tmp << 20; + else if (unformat (a, "segment-size %dG", &tmp)) + utm->configured_segment_size = tmp << 30; + else if (unformat (a, "master")) + i_am_master = 1; + else if (unformat (a, "slave")) + i_am_master = 0; + else + { + fformat (stderr, "%s: usage [master|slave]\n"); + exit (1); + } + } + + utm->cut_through_session_index = ~0; + utm->uri = format (0, "%s%c", bind_name, 0); + utm->i_am_master = i_am_master; + utm->segment_main = &svm_fifo_segment_main; + + utm->connect_uri = format (0, "udp://10.0.0.1/1234%c", 0); + + setup_signal_handlers (); + + uri_api_hookup (utm); + + if (connect_to_vpp (i_am_master ? "uri_udp_master" : "uri_udp_slave") < 0) + { + svm_region_exit (); + fformat (stderr, "Couldn't connect to vpe, exiting...\n"); + exit (1); + } + + if (i_am_master == 0) + { + uri_udp_slave_test (utm); + exit (0); + } + + /* $$$$ hack preallocation */ + for (i = 0; i < 200000; i++) + { + pool_get (utm->sessions, session); + memset (session, 0, sizeof (*session)); + } + for (i = 0; i < 200000; i++) + pool_put_index (utm->sessions, i); + + uri_udp_test (utm); + + vl_client_disconnect_from_vlib (); + exit (0); +} + +#undef vl_api_version +#define vl_api_version(n,v) static u32 vpe_api_version = v; +#include +#undef vl_api_version + +void +vl_client_add_api_signatures (vl_api_memclnt_create_t * mp) +{ + /* + * Send the main API signature in slot 0. This bit of code must + * match the checks in ../vpe/api/api.c: vl_msg_api_version_check(). + */ + mp->api_versions[0] = clib_host_to_net_u32 (vpe_api_version); +} + +u32 +vl (void *p) +{ + return vec_len (p); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/uri/uritest.c b/src/uri/uritest.c new file mode 100644 index 00000000..edcdb3ad --- /dev/null +++ b/src/uri/uritest.c @@ -0,0 +1,484 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define vl_typedefs /* define message structures */ +#include +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include +#undef vl_printfun + +typedef enum +{ + STATE_START, + STATE_READY, + STATE_DISCONNECTING, +} connection_state_t; + +typedef struct +{ + /* vpe input queue */ + unix_shared_memory_queue_t *vl_input_queue; + + /* API client handle */ + u32 my_client_index; + + /* role */ + int i_am_master; + + /* The URI we're playing with */ + u8 *uri; + + /* fifo segment */ + svm_fifo_segment_private_t *seg; + + svm_fifo_t *rx_fifo; + svm_fifo_t *tx_fifo; + + /* For deadman timers */ + clib_time_t clib_time; + + /* State of the connection, shared between msg RX thread and main thread */ + volatile connection_state_t state; + + /* VNET_API_ERROR_FOO -> "Foo" hash table */ + uword *error_string_by_error_number; +} uritest_main_t; + +#if CLIB_DEBUG > 0 +#define NITER 1000 +#else +#define NITER 1000000 +#endif + +uritest_main_t uritest_main; + +u8 * +format_api_error (u8 * s, va_list * args) +{ + uritest_main_t *utm = va_arg (*args, uritest_main_t *); + i32 error = va_arg (*args, u32); + uword *p; + + p = hash_get (utm->error_string_by_error_number, -error); + + if (p) + s = format (s, "%s", p[0]); + else + s = format (s, "%d", error); + return s; +} + +int +wait_for_state_change (uritest_main_t * utm, connection_state_t state) +{ + f64 timeout = clib_time_now (&utm->clib_time) + 1.0; + + while (clib_time_now (&utm->clib_time) < timeout) + { + if (utm->state == state) + return 0; + } + return -1; +} + +static void +vl_api_bind_uri_reply_t_handler (vl_api_bind_uri_reply_t * mp) +{ + uritest_main_t *utm = &uritest_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + int rv; + + ASSERT (utm->i_am_master); + + if (mp->segment_name_length == 0) + { + clib_warning ("segment_name_length zero"); + return; + } + + a->segment_name = (char *) mp->segment_name; + a->segment_size = mp->segment_size; + + /* Create the segment */ + rv = svm_fifo_segment_create (a); + if (rv) + { + clib_warning ("sm_fifo_segment_create ('%s') failed", mp->segment_name); + return; + } + + vec_validate (utm->seg, 0); + + memcpy (utm->seg, a->rv, sizeof (*utm->seg)); + + /* + * By construction the master's idea of the rx fifo ends up in + * fsh->fifos[0], and the master's idea of the tx fifo ends up in + * fsh->fifos[1]. + */ + utm->rx_fifo = svm_fifo_segment_alloc_fifo (utm->seg, 10240); + ASSERT (utm->rx_fifo); + + utm->tx_fifo = svm_fifo_segment_alloc_fifo (utm->seg, 10240); + ASSERT (utm->tx_fifo); + + utm->state = STATE_READY; +} + +static void +vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) +{ + uritest_main_t *utm = &uritest_main; + svm_fifo_segment_create_args_t _a, *a = &_a; + ssvm_shared_header_t *sh; + svm_fifo_segment_header_t *fsh; + int rv; + + ASSERT (utm->i_am_master == 0); + + if (mp->segment_name_length == 0) + { + clib_warning ("segment_name_length zero"); + return; + } + + memset (a, 0, sizeof (*a)); + + a->segment_name = (char *) mp->segment_name; + + rv = svm_fifo_segment_attach (a); + if (rv) + { + clib_warning ("sm_fifo_segment_create ('%s') failed", mp->segment_name); + return; + } + + vec_validate (utm->seg, 0); + + memcpy (utm->seg, a->rv, sizeof (*utm->seg)); + sh = utm->seg->ssvm.sh; + fsh = (svm_fifo_segment_header_t *) sh->opaque[0]; + + while (vec_len (fsh->fifos) < 2) + sleep (1); + + utm->rx_fifo = (svm_fifo_t *) fsh->fifos[1]; + ASSERT (utm->rx_fifo); + utm->tx_fifo = (svm_fifo_t *) fsh->fifos[0]; + ASSERT (utm->tx_fifo); + + /* security: could unlink /dev/shm/segment_name> here, maybe */ + + utm->state = STATE_READY; +} + +static void +vl_api_unbind_uri_reply_t_handler (vl_api_unbind_uri_reply_t * mp) +{ + uritest_main_t *utm = &uritest_main; + + if (mp->retval != 0) + clib_warning ("returned %d", ntohl (mp->retval)); + + utm->state = STATE_START; +} + +#define foreach_uri_msg \ +_(BIND_URI_REPLY, bind_uri_reply) \ +_(CONNECT_URI_REPLY, connect_uri_reply) \ +_(UNBIND_URI_REPLY, unbind_uri_reply) + +void +uri_api_hookup (uritest_main_t * utm) +{ +#define _(N,n) \ + vl_msg_api_set_handlers(VL_API_##N, #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_uri_msg; +#undef _ + +} + + +int +connect_to_vpp (char *name) +{ + uritest_main_t *utm = &uritest_main; + api_main_t *am = &api_main; + + if (vl_client_connect_to_vlib ("/vpe-api", name, 32) < 0) + return -1; + + utm->vl_input_queue = am->shmem_hdr->vl_input_queue; + utm->my_client_index = am->my_client_index; + + return 0; +} + +void +vlib_cli_output (struct vlib_main_t *vm, char *fmt, ...) +{ + clib_warning ("BUG"); +} + +static void +init_error_string_table (uritest_main_t * utm) +{ + utm->error_string_by_error_number = hash_create (0, sizeof (uword)); + +#define _(n,v,s) hash_set (utm->error_string_by_error_number, -v, s); + foreach_vnet_api_error; +#undef _ + + hash_set (utm->error_string_by_error_number, 99, "Misc"); +} + +void +uritest_master (uritest_main_t * utm) +{ + vl_api_bind_uri_t *bmp; + vl_api_unbind_uri_t *ump; + int i; + u8 *test_data = 0; + u8 *reply = 0; + u32 reply_len; + int mypid = getpid (); + + for (i = 0; i < 2048; i++) + vec_add1 (test_data, 'a' + (i % 32)); + + bmp = vl_msg_api_alloc (sizeof (*bmp)); + memset (bmp, 0, sizeof (*bmp)); + + bmp->_vl_msg_id = ntohs (VL_API_BIND_URI); + bmp->client_index = utm->my_client_index; + bmp->context = ntohl (0xfeedface); + bmp->segment_size = 256 << 10; + memcpy (bmp->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & bmp); + + if (wait_for_state_change (utm, STATE_READY)) + { + clib_warning ("timeout waiting for STATE_READY"); + return; + } + + for (i = 0; i < NITER; i++) + svm_fifo_enqueue (utm->tx_fifo, mypid, vec_len (test_data), test_data); + + vec_validate (reply, 0); + + reply_len = svm_fifo_dequeue (utm->rx_fifo, mypid, vec_len (reply), reply); + + if (reply_len != 1) + clib_warning ("reply length %d", reply_len); + + if (reply[0] == 1) + fformat (stdout, "Test OK..."); + + ump = vl_msg_api_alloc (sizeof (*ump)); + memset (ump, 0, sizeof (*ump)); + + ump->_vl_msg_id = ntohs (VL_API_UNBIND_URI); + ump->client_index = utm->my_client_index; + memcpy (ump->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & ump); + + if (wait_for_state_change (utm, STATE_START)) + { + clib_warning ("timeout waiting for STATE_READY"); + return; + } + + fformat (stdout, "Master done...\n"); +} + +void +uritest_slave (uritest_main_t * utm) +{ + vl_api_connect_uri_t *cmp; + int i, j; + u8 *test_data = 0; + u8 *reply = 0; + u32 bytes_received = 0; + u32 actual_bytes; + int mypid = getpid (); + u8 ok; + f64 before, after, delta, bytes_per_second; + + vec_validate (test_data, 4095); + + cmp = vl_msg_api_alloc (sizeof (*cmp)); + memset (cmp, 0, sizeof (*cmp)); + + cmp->_vl_msg_id = ntohs (VL_API_CONNECT_URI); + cmp->client_index = utm->my_client_index; + cmp->context = ntohl (0xfeedface); + memcpy (cmp->uri, utm->uri, vec_len (utm->uri)); + vl_msg_api_send_shmem (utm->vl_input_queue, (u8 *) & cmp); + + if (wait_for_state_change (utm, STATE_READY)) + { + clib_warning ("timeout waiting for STATE_READY"); + return; + } + + ok = 1; + before = clib_time_now (&utm->clib_time); + for (i = 0; i < NITER; i++) + { + actual_bytes = svm_fifo_dequeue (utm->rx_fifo, mypid, + vec_len (test_data), test_data); + j = 0; + while (j < actual_bytes) + { + if (test_data[j] != ('a' + (bytes_received % 32))) + ok = 0; + bytes_received++; + j++; + } + if (bytes_received == NITER * 2048) + break; + } + + vec_add1 (reply, ok); + + svm_fifo_enqueue (utm->tx_fifo, mypid, vec_len (reply), reply); + after = clib_time_now (&utm->clib_time); + delta = after - before; + bytes_per_second = 0.0; + + if (delta > 0.0) + bytes_per_second = (f64) bytes_received / delta; + + fformat (stdout, + "Slave done, %d bytes in %.2f seconds, %.2f bytes/sec...\n", + bytes_received, delta, bytes_per_second); +} + +int +main (int argc, char **argv) +{ + uritest_main_t *utm = &uritest_main; + unformat_input_t _argv, *a = &_argv; + u8 *chroot_prefix; + u8 *heap; + char *bind_name = "fifo:uritest"; + mheap_t *h; + int i_am_master = 0; + + clib_mem_init (0, 128 << 20); + + heap = clib_mem_get_per_cpu_heap (); + h = mheap_header (heap); + + /* make the main heap thread-safe */ + h->flags |= MHEAP_FLAG_THREAD_SAFE; + + clib_time_init (&utm->clib_time); + init_error_string_table (utm); + svm_fifo_segment_init (0x200000000ULL, 20); + unformat_init_command_line (a, argv); + + utm->uri = format (0, "%s%c", bind_name, 0); + + while (unformat_check_input (a) != UNFORMAT_END_OF_INPUT) + { + if (unformat (a, "master")) + i_am_master = 1; + else if (unformat (a, "slave")) + i_am_master = 0; + else if (unformat (a, "chroot prefix %s", &chroot_prefix)) + { + vl_set_memory_root_path ((char *) chroot_prefix); + } + else + { + fformat (stderr, "%s: usage [master|slave]\n"); + exit (1); + } + } + + uri_api_hookup (utm); + + if (connect_to_vpp (i_am_master ? "uritest_master" : "uritest_slave") < 0) + { + svm_region_exit (); + fformat (stderr, "Couldn't connect to vpe, exiting...\n"); + exit (1); + } + + utm->i_am_master = i_am_master; + + if (i_am_master) + uritest_master (utm); + else + uritest_slave (utm); + + vl_client_disconnect_from_vlib (); + exit (0); +} + +#undef vl_api_version +#define vl_api_version(n,v) static u32 vpe_api_version = v; +#include +#undef vl_api_version + +void +vl_client_add_api_signatures (vl_api_memclnt_create_t * mp) +{ + /* + * Send the main API signature in slot 0. This bit of code must + * match the checks in ../vpe/api/api.c: vl_msg_api_version_check(). + */ + mp->api_versions[0] = clib_host_to_net_u32 (vpe_api_version); +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index 4f5eb09d..9f26bec7 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -360,7 +360,7 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, memset (f, 0, sizeof (f[0])); f->index = f - bm->buffer_free_list_pool; f->n_data_bytes = vlib_buffer_round_size (n_data_bytes); - f->min_n_buffers_each_physmem_alloc = 16; + f->min_n_buffers_each_physmem_alloc = VLIB_FRAME_SIZE; f->name = clib_mem_is_heap_object (name) ? name : format (0, "%s", name); /* Setup free buffer template. */ diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 1f723f3b..69c8c7cc 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -240,6 +240,74 @@ vlib_get_buffer_opaque2 (vlib_buffer_t * b) return (void *) b->opaque2; } +/** \brief Get pointer to the end of buffer's data + * @param b pointer to the buffer + * @return pointer to tail of packet's data + */ +always_inline u8 * +vlib_buffer_get_tail (vlib_buffer_t * b) +{ + return b->data + b->current_data + b->current_length; +} + +/** \brief Append uninitialized data to buffer + * @param b pointer to the buffer + * @param size number of uninitialized bytes + * @return pointer to beginning of uninitialized data + */ +always_inline void * +vlib_buffer_put_uninit (vlib_buffer_t * b, u8 size) +{ + void *p = vlib_buffer_get_tail (b); + /* XXX make sure there's enough space */ + b->current_length += size; + return p; +} + +/** \brief Prepend uninitialized data to buffer + * @param b pointer to the buffer + * @param size number of uninitialized bytes + * @return pointer to beginning of uninitialized data + */ +always_inline void * +vlib_buffer_push_uninit (vlib_buffer_t * b, u8 size) +{ + ASSERT (b->current_data + VLIB_BUFFER_PRE_DATA_SIZE >= size); + b->current_data -= size; + b->current_length += size; + + return vlib_buffer_get_current (b); +} + +/** \brief Make head room, typically for packet headers + * @param b pointer to the buffer + * @param size number of head room bytes + * @return pointer to start of buffer (current data) + */ +always_inline void * +vlib_buffer_make_headroom (vlib_buffer_t * b, u8 size) +{ + ASSERT (b->current_data + VLIB_BUFFER_PRE_DATA_SIZE >= size); + b->current_data += size; + return vlib_buffer_get_current (b); +} + +/** \brief Retrieve bytes from buffer head + * @param b pointer to the buffer + * @param size number of bytes to pull + * @return pointer to start of buffer (current data) + */ +always_inline void * +vlib_buffer_pull (vlib_buffer_t * b, u8 size) +{ + if (b->current_length + VLIB_BUFFER_PRE_DATA_SIZE < size) + return 0; + + void *data = vlib_buffer_get_current (b); + vlib_buffer_advance (b, size); + return data; +} + /* Forward declaration. */ struct vlib_main_t; diff --git a/src/vlibmemory/unix_shared_memory_queue.c b/src/vlibmemory/unix_shared_memory_queue.c index 25d28910..e86edec3 100644 --- a/src/vlibmemory/unix_shared_memory_queue.c +++ b/src/vlibmemory/unix_shared_memory_queue.c @@ -33,18 +33,13 @@ * nels = number of elements on the queue * elsize = element size, presumably 4 and cacheline-size will * be popular choices. - * coid = consumer coid, from ChannelCreate * pid = consumer pid - * pulse_code = pulse code consumer expects - * pulse_value = pulse value consumer expects - * consumer_prio = consumer's priority, so pulses won't change - * the consumer's priority. * * The idea is to call this function in the queue consumer, * and e-mail the queue pointer to the producer(s). * - * The spp process / main thread allocates one of these - * at startup; its main input queue. The spp main input queue + * The vpp process / main thread allocates one of these + * at startup; its main input queue. The vpp main input queue * has a pointer to it in the shared memory segment header. * * You probably want to be on an svm data heap before calling this @@ -70,7 +65,7 @@ unix_shared_memory_queue_init (int nels, q->signal_when_queue_non_empty = signal_when_queue_non_empty; memset (&attr, 0, sizeof (attr)); - memset (&cattr, 0, sizeof (attr)); + memset (&cattr, 0, sizeof (cattr)); if (pthread_mutexattr_init (&attr)) clib_unix_warning ("mutexattr_init"); @@ -277,6 +272,7 @@ unix_shared_memory_queue_sub (unix_shared_memory_queue_t * q, clib_memcpy (elem, headp, q->elsize); q->head++; + /* $$$$ JFC shouldn't this be == 0? */ if (q->cursize == q->maxsize) need_broadcast = 1; diff --git a/src/vlibmemory/unix_shared_memory_queue.h b/src/vlibmemory/unix_shared_memory_queue.h index f758f17c..13800065 100644 --- a/src/vlibmemory/unix_shared_memory_queue.h +++ b/src/vlibmemory/unix_shared_memory_queue.h @@ -29,7 +29,7 @@ typedef struct _unix_shared_memory_queue pthread_cond_t condvar; /* 8 bytes */ int head; int tail; - int cursize; + volatile int cursize; int maxsize; int elsize; int consumer_pid; diff --git a/src/vnet.am b/src/vnet.am index 64484e18..923f61d8 100644 --- a/src/vnet.am +++ b/src/vnet.am @@ -324,11 +324,7 @@ libvnet_la_SOURCES += \ vnet/ip/ip_input_acl.c \ vnet/ip/lookup.c \ vnet/ip/ping.c \ - vnet/ip/punt.c \ - vnet/ip/udp_format.c \ - vnet/ip/udp_init.c \ - vnet/ip/udp_local.c \ - vnet/ip/udp_pg.c + vnet/ip/punt.c nobase_include_HEADERS += \ vnet/ip/format.h \ @@ -354,11 +350,7 @@ nobase_include_HEADERS += \ vnet/ip/ports.def \ vnet/ip/protocols.def \ vnet/ip/punt_error.def \ - vnet/ip/punt.h \ - vnet/ip/tcp_packet.h \ - vnet/ip/udp_error.def \ - vnet/ip/udp.h \ - vnet/ip/udp_packet.h + vnet/ip/punt.h API_FILES += vnet/ip/ip.api @@ -473,6 +465,38 @@ test_map_LDADD = libvnet.la libvppinfra.la libvlib.la \ test_map_LDFLAGS = -static endif +######################################## +# Layer 4 protocol: tcp +######################################## +libvnet_la_SOURCES += \ + vnet/tcp/tcp_format.c \ + vnet/tcp/tcp_pg.c \ + vnet/tcp/tcp_syn_filter4.c \ + vnet/tcp/tcp_output.c \ + vnet/tcp/tcp_input.c \ + vnet/tcp/tcp_newreno.c \ + vnet/tcp/tcp.c + +nobase_include_HEADERS += \ + vnet/tcp/tcp_packet.h \ + vnet/tcp/tcp_timer.h \ + vnet/tcp/tcp.h + +######################################## +# Layer 4 protocol: udp +######################################## +libvnet_la_SOURCES += \ + vnet/udp/udp.c \ + vnet/udp/udp_input.c \ + vnet/udp/builtin_server.c \ + vnet/udp/udp_format.c \ + vnet/udp/udp_local.c \ + vnet/udp/udp_pg.c + +nobase_include_HEADERS += \ + vnet/udp/udp_error.def \ + vnet/udp/udp.h \ + vnet/udp/udp_packet.h ######################################## # Tunnel protocol: gre @@ -833,6 +857,28 @@ libvnet_la_SOURCES += \ nobase_include_HEADERS += \ vnet/devices/ssvm/ssvm_eth.h +######################################## +# session managmeent +######################################## + +libvnet_la_SOURCES += \ + vnet/session/session.c \ + vnet/session/node.c \ + vnet/session/transport.c \ + vnet/session/application.c \ + vnet/session/session_cli.c \ + vnet/session/hashes.c \ + vnet/session/application_interface.c \ + vnet/session/session_api.c + +nobase_include_HEADERS += \ + vnet/session/session.h \ + vnet/session/application.h \ + vnet/session/transport.h \ + vnet/session/application_interface.h + +API_FILES += vnet/session/session.api + ######################################## # Linux packet interface ######################################## diff --git a/src/vnet/api_errno.h b/src/vnet/api_errno.h index 8680ef7c..861a5767 100644 --- a/src/vnet/api_errno.h +++ b/src/vnet/api_errno.h @@ -91,14 +91,19 @@ _(INVALID_ADDRESS_FAMILY, -97, "Invalid address family") \ _(INVALID_SUB_SW_IF_INDEX, -98, "Invalid sub-interface sw_if_index") \ _(TABLE_TOO_BIG, -99, "Table too big") \ _(CANNOT_ENABLE_DISABLE_FEATURE, -100, "Cannot enable/disable feature") \ -_(BFD_EEXIST, -101, "Duplicate BFD object") \ -_(BFD_ENOENT, -102, "No such BFD object") \ -_(BFD_EINUSE, -103, "BFD object in use") \ -_(BFD_NOTSUPP, -104, "BFD feature not supported") \ -_(LISP_RLOC_LOCAL, -105, "RLOC address is local") \ -_(BFD_EAGAIN, -106, "BFD object cannot be manipulated at this time") \ -_(INVALID_GPE_MODE, -107, "Invalid GPE mode") \ -_(LISP_GPE_ENTRIES_PRESENT, -108, "LISP GPE entries are present") +_(BFD_EEXIST, -101, "Duplicate BFD object") \ +_(BFD_ENOENT, -102, "No such BFD object") \ +_(BFD_EINUSE, -103, "BFD object in use") \ +_(BFD_NOTSUPP, -104, "BFD feature not supported") \ +_(ADDRESS_IN_USE, -105, "Address in use") \ +_(ADDRESS_NOT_IN_USE, -106, "Address not in use") \ +_(QUEUE_FULL, -107, "Queue full") \ +_(UNKNOWN_URI_TYPE, -108, "Unknown URI type") \ +_(URI_FIFO_CREATE_FAILED, -109, "URI FIFO segment create failed") \ +_(LISP_RLOC_LOCAL, -110, "RLOC address is local") \ +_(BFD_EAGAIN, -111, "BFD object cannot be manipulated at this time") \ +_(INVALID_GPE_MODE, -112, "Invalid GPE mode") \ +_(LISP_GPE_ENTRIES_PRESENT, -113, "LISP GPE entries are present") typedef enum { diff --git a/src/vnet/bfd/bfd_udp.c b/src/vnet/bfd/bfd_udp.c index 146faad6..cf05089b 100644 --- a/src/vnet/bfd/bfd_udp.c +++ b/src/vnet/bfd/bfd_udp.c @@ -18,12 +18,12 @@ #include #include #include -#include +#include +#include #include #include #include #include -#include #include #include #include diff --git a/src/vnet/buffer.h b/src/vnet/buffer.h index f1cc6371..3de01f2a 100644 --- a/src/vnet/buffer.h +++ b/src/vnet/buffer.h @@ -277,6 +277,16 @@ typedef struct u16 buffer_advance; } device_input_feat; + /* TCP */ + struct + { + u32 connection_index; + u32 seq_number; + u32 seq_end; + u32 ack_number; + u8 flags; + } tcp; + u32 unused[6]; }; } vnet_buffer_opaque_t; diff --git a/src/vnet/classify/vnet_classify.c b/src/vnet/classify/vnet_classify.c index 6093e2ac..b651a1f1 100644 --- a/src/vnet/classify/vnet_classify.c +++ b/src/vnet/classify/vnet_classify.c @@ -695,8 +695,8 @@ int vnet_classify_add_del_table (vnet_classify_main_t * cm, } #define foreach_tcp_proto_field \ -_(src_port) \ -_(dst_port) +_(src) \ +_(dst) #define foreach_udp_proto_field \ _(src_port) \ diff --git a/src/vnet/dhcp/dhcp_proxy.h b/src/vnet/dhcp/dhcp_proxy.h index c0d79c41..4586d883 100644 --- a/src/vnet/dhcp/dhcp_proxy.h +++ b/src/vnet/dhcp/dhcp_proxy.h @@ -26,7 +26,7 @@ #include #include #include -#include +#include typedef enum { #define dhcp_proxy_error(n,s) DHCP_PROXY_ERROR_##n, diff --git a/src/vnet/flow/flow_report.h b/src/vnet/flow/flow_report.h index 4e764377..e8ed3818 100644 --- a/src/vnet/flow/flow_report.h +++ b/src/vnet/flow/flow_report.h @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/vnet/ip/ip.h b/src/vnet/ip/ip.h index 02a1a963..70b4ccd8 100644 --- a/src/vnet/ip/ip.h +++ b/src/vnet/ip/ip.h @@ -50,8 +50,8 @@ #include #include -#include -#include +#include +#include #include #include diff --git a/src/vnet/ip/ip4.h b/src/vnet/ip/ip4.h index b184fbae..4e075d0f 100644 --- a/src/vnet/ip/ip4.h +++ b/src/vnet/ip/ip4.h @@ -309,8 +309,8 @@ ip4_compute_flow_hash (const ip4_header_t * ip, b = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ? t1 : t2; b ^= (flow_hash_config & IP_FLOW_HASH_PROTO) ? ip->protocol : 0; - t1 = is_tcp_udp ? tcp->ports.src : 0; - t2 = is_tcp_udp ? tcp->ports.dst : 0; + t1 = is_tcp_udp ? tcp->src : 0; + t2 = is_tcp_udp ? tcp->dst : 0; t1 = (flow_hash_config & IP_FLOW_HASH_SRC_PORT) ? t1 : 0; t2 = (flow_hash_config & IP_FLOW_HASH_DST_PORT) ? t2 : 0; @@ -334,6 +334,44 @@ u8 *format_ip4_forward_next_trace (u8 * s, va_list * args); u32 ip4_tcp_udp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0); +#define IP_DF 0x4000 /* don't fragment */ + +/** + * Push IPv4 header to buffer + * + * This does not support fragmentation. + * + * @param vm - vlib_main + * @param b - buffer to write the header to + * @param src - source IP + * @param dst - destination IP + * @param prot - payload proto + * + * @return - pointer to start of IP header + */ +always_inline void * +vlib_buffer_push_ip4 (vlib_main_t * vm, vlib_buffer_t * b, + ip4_address_t * src, ip4_address_t * dst, int proto) +{ + ip4_header_t *ih; + + /* make some room */ + ih = vlib_buffer_push_uninit (b, sizeof (ip4_header_t)); + + ih->ip_version_and_header_length = 0x45; + ih->tos = 0; + ih->length = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b)); + + /* No fragments */ + ih->flags_and_fragment_offset = clib_host_to_net_u16 (IP_DF); + ih->ttl = 255; + ih->protocol = proto; + ih->src_address.as_u32 = src->as_u32; + ih->dst_address.as_u32 = dst->as_u32; + + ih->checksum = ip4_header_checksum (ih); + return ih; +} #endif /* included_ip_ip4_h */ /* diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index 8081b34b..66d91ab6 100644 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -1478,8 +1478,18 @@ ip4_tcp_udp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0) return p0->flags; } -static uword -ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) +/* *INDENT-OFF* */ +VNET_FEATURE_ARC_INIT (ip4_local) = +{ + .arc_name = "ip4-local", + .start_nodes = VNET_FEATURES ("ip4-local"), +}; +/* *INDENT-ON* */ + +static inline uword +ip4_local_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, int head_of_feature_arc) { ip4_main_t *im = &ip4_main; ip_lookup_main_t *lm = &im->lookup_main; @@ -1487,6 +1497,7 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) u32 *from, *to_next, n_left_from, n_left_to_next; vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip4_input_node.index); + u8 arc_index = vnet_feat_arc_ip4_local.feature_arc_index; from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -1513,7 +1524,7 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) i32 len_diff0, len_diff1; u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0; u8 error1, is_udp1, is_tcp_udp1, good_tcp_udp1, proto1; - u8 enqueue_code; + u32 sw_if_index0, sw_if_index1; pi0 = to_next[0] = from[0]; pi1 = to_next[1] = from[1]; @@ -1522,6 +1533,8 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) to_next += 2; n_left_to_next -= 2; + next0 = next1 = IP_LOCAL_NEXT_DROP; + p0 = vlib_get_buffer (vm, pi0); p1 = vlib_get_buffer (vm, pi1); @@ -1531,14 +1544,18 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data; vnet_buffer (p1)->ip.start_of_ip_header = p1->current_data; - fib_index0 = vec_elt (im->fib_index_by_sw_if_index, - vnet_buffer (p0)->sw_if_index[VLIB_RX]); + sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX]; + sw_if_index1 = vnet_buffer (p1)->sw_if_index[VLIB_RX]; + + fib_index0 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index0); + fib_index1 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index1); + + fib_index0 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index0); fib_index0 = (vnet_buffer (p0)->sw_if_index[VLIB_TX] == (u32) ~ 0) ? fib_index0 : vnet_buffer (p0)->sw_if_index[VLIB_TX]; - fib_index1 = vec_elt (im->fib_index_by_sw_if_index, - vnet_buffer (p1)->sw_if_index[VLIB_RX]); + fib_index1 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index1); fib_index1 = (vnet_buffer (p1)->sw_if_index[VLIB_TX] == (u32) ~ 0) ? fib_index1 : vnet_buffer (p1)->sw_if_index[VLIB_TX]; @@ -1557,6 +1574,13 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) until support of IP frag reassembly is implemented */ proto0 = ip4_is_fragment (ip0) ? 0xfe : ip0->protocol; proto1 = ip4_is_fragment (ip1) ? 0xfe : ip1->protocol; + + if (head_of_feature_arc == 0) + { + error0 = error1 = IP4_ERROR_UNKNOWN_PROTOCOL; + goto skip_checks; + } + is_udp0 = proto0 == IP_PROTOCOL_UDP; is_udp1 = proto1 == IP_PROTOCOL_UDP; is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP; @@ -1686,6 +1710,7 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) next0 = lm->local_next_by_ip_protocol[proto0]; next1 = lm->local_next_by_ip_protocol[proto1]; + skip_checks: next0 = error0 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0; next1 = @@ -1694,44 +1719,17 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) p0->error = error0 ? error_node->errors[error0] : 0; p1->error = error1 ? error_node->errors[error1] : 0; - enqueue_code = (next0 != next_index) + 2 * (next1 != next_index); - - if (PREDICT_FALSE (enqueue_code != 0)) + if (head_of_feature_arc) { - switch (enqueue_code) - { - case 1: - /* A B A */ - to_next[-2] = pi1; - to_next -= 1; - n_left_to_next += 1; - vlib_set_next_frame_buffer (vm, node, next0, pi0); - break; - - case 2: - /* A A B */ - to_next -= 1; - n_left_to_next += 1; - vlib_set_next_frame_buffer (vm, node, next1, pi1); - break; - - case 3: - /* A B B or A B C */ - to_next -= 2; - n_left_to_next += 2; - vlib_set_next_frame_buffer (vm, node, next0, pi0); - vlib_set_next_frame_buffer (vm, node, next1, pi1); - if (next0 == next1) - { - vlib_put_next_frame (vm, node, next_index, - n_left_to_next); - next_index = next1; - vlib_get_next_frame (vm, node, next_index, to_next, - n_left_to_next); - } - break; - } + if (PREDICT_TRUE (error0 == (u8) IP4_ERROR_UNKNOWN_PROTOCOL)) + vnet_feature_arc_start (arc_index, sw_if_index0, &next0, p0); + if (PREDICT_TRUE (error1 == (u8) IP4_ERROR_UNKNOWN_PROTOCOL)) + vnet_feature_arc_start (arc_index, sw_if_index1, &next1, p1); } + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, to_next, + n_left_to_next, pi0, pi1, + next0, next1); } while (n_left_from > 0 && n_left_to_next > 0) @@ -1746,6 +1744,7 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0; load_balance_t *lb0; const dpo_id_t *dpo0; + u32 sw_if_index0; pi0 = to_next[0] = from[0]; from += 1; @@ -1753,14 +1752,18 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) to_next += 1; n_left_to_next -= 1; + next0 = IP_LOCAL_NEXT_DROP; + p0 = vlib_get_buffer (vm, pi0); ip0 = vlib_buffer_get_current (p0); vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data; - fib_index0 = vec_elt (im->fib_index_by_sw_if_index, - vnet_buffer (p0)->sw_if_index[VLIB_RX]); + sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX]; + + fib_index0 = vec_elt (im->fib_index_by_sw_if_index, sw_if_index0); + fib_index0 = (vnet_buffer (p0)->sw_if_index[VLIB_TX] == (u32) ~ 0) ? fib_index0 : vnet_buffer (p0)->sw_if_index[VLIB_TX]; @@ -1775,6 +1778,13 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) /* Treat IP frag packets as "experimental" protocol for now until support of IP frag reassembly is implemented */ proto0 = ip4_is_fragment (ip0) ? 0xfe : ip0->protocol; + + if (head_of_feature_arc == 0) + { + error0 = IP4_ERROR_UNKNOWN_PROTOCOL; + goto skip_check; + } + is_udp0 = proto0 == IP_PROTOCOL_UDP; is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP; @@ -1847,6 +1857,8 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip0->dst_address.as_u32 != 0xFFFFFFFF) ? IP4_ERROR_SRC_LOOKUP_MISS : error0); + skip_check: + next0 = lm->local_next_by_ip_protocol[proto0]; next0 = @@ -1854,18 +1866,15 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) p0->error = error0 ? error_node->errors[error0] : 0; - if (PREDICT_FALSE (next0 != next_index)) + if (head_of_feature_arc) { - n_left_to_next += 1; - vlib_put_next_frame (vm, node, next_index, n_left_to_next); - - next_index = next0; - vlib_get_next_frame (vm, node, next_index, to_next, - n_left_to_next); - to_next[0] = pi0; - to_next += 1; - n_left_to_next -= 1; + if (PREDICT_TRUE (error0 == (u8) IP4_ERROR_UNKNOWN_PROTOCOL)) + vnet_feature_arc_start (arc_index, sw_if_index0, &next0, p0); } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, pi0, next0); + } vlib_put_next_frame (vm, node, next_index, n_left_to_next); @@ -1874,21 +1883,57 @@ ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) return frame->n_vectors; } +static uword +ip4_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return ip4_local_inline (vm, node, frame, 1 /* head of feature arc */ ); +} + +/* *INDENT-OFF* */ VLIB_REGISTER_NODE (ip4_local_node) = { - .function = ip4_local,.name = "ip4-local",.vector_size = - sizeof (u32),.format_trace = - format_ip4_forward_next_trace,.n_next_nodes = - IP_LOCAL_N_NEXT,.next_nodes = + .function = ip4_local, + .name = "ip4-local", + .vector_size = sizeof (u32), + .format_trace = format_ip4_forward_next_trace, + .n_next_nodes = IP_LOCAL_N_NEXT, + .next_nodes = { - [IP_LOCAL_NEXT_DROP] = "error-drop", - [IP_LOCAL_NEXT_PUNT] = "error-punt", - [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip4-udp-lookup", - [IP_LOCAL_NEXT_ICMP] = "ip4-icmp-input",} -,}; + [IP_LOCAL_NEXT_DROP] = "error-drop", + [IP_LOCAL_NEXT_PUNT] = "error-punt", + [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip4-udp-lookup", + [IP_LOCAL_NEXT_ICMP] = "ip4-icmp-input",}, +}; +/* *INDENT-ON* */ VLIB_NODE_FUNCTION_MULTIARCH (ip4_local_node, ip4_local); +static uword +ip4_local_end_of_arc (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return ip4_local_inline (vm, node, frame, 0 /* head of feature arc */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (ip4_local_end_of_arc_node,static) = { + .function = ip4_local_end_of_arc, + .name = "ip4-local-end-of-arc", + .vector_size = sizeof (u32), + + .format_trace = format_ip4_forward_next_trace, + .sibling_of = "ip4-local", +}; + +VLIB_NODE_FUNCTION_MULTIARCH (ip4_local_end_of_arc_node, ip4_local_end_of_arc) + +VNET_FEATURE_INIT (ip4_local_end_of_arc, static) = { + .arc_name = "ip4-local", + .node_name = "ip4-local-end-of-arc", + .runs_before = 0, /* not before any other features */ +}; +/* *INDENT-ON* */ + void ip4_register_protocol (u32 protocol, u32 node_index) { diff --git a/src/vnet/ip/ip4_packet.h b/src/vnet/ip/ip4_packet.h index 8da788b4..b2c1fcd4 100644 --- a/src/vnet/ip/ip4_packet.h +++ b/src/vnet/ip/ip4_packet.h @@ -41,7 +41,7 @@ #define included_ip4_packet_h #include /* for ip_csum_t */ -#include /* for tcp_header_t */ +#include /* for tcp_header_t */ #include /* for clib_net_to_host_u16 */ /* IP4 address which can be accessed either as 4 bytes @@ -342,10 +342,10 @@ ip4_tcp_reply_x1 (ip4_header_t * ip0, tcp_header_t * tcp0) ip0->src_address.data_u32 = dst0; ip0->dst_address.data_u32 = src0; - src0 = tcp0->ports.src; - dst0 = tcp0->ports.dst; - tcp0->ports.src = dst0; - tcp0->ports.dst = src0; + src0 = tcp0->src; + dst0 = tcp0->dst; + tcp0->src = dst0; + tcp0->dst = src0; } always_inline void @@ -363,14 +363,14 @@ ip4_tcp_reply_x2 (ip4_header_t * ip0, ip4_header_t * ip1, ip0->dst_address.data_u32 = src0; ip1->dst_address.data_u32 = src1; - src0 = tcp0->ports.src; - src1 = tcp1->ports.src; - dst0 = tcp0->ports.dst; - dst1 = tcp1->ports.dst; - tcp0->ports.src = dst0; - tcp1->ports.src = dst1; - tcp0->ports.dst = src0; - tcp1->ports.dst = src1; + src0 = tcp0->src; + src1 = tcp1->src; + dst0 = tcp0->dst; + dst1 = tcp1->dst; + tcp0->src = dst0; + tcp1->src = dst1; + tcp0->dst = src0; + tcp1->dst = src1; } #endif /* included_ip4_packet_h */ diff --git a/src/vnet/ip/ip6.h b/src/vnet/ip/ip6.h index 5456f0f2..2615fbfa 100644 --- a/src/vnet/ip/ip6.h +++ b/src/vnet/ip/ip6.h @@ -461,8 +461,8 @@ ip6_compute_flow_hash (const ip6_header_t * ip, b = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ? t1 : t2; b ^= (flow_hash_config & IP_FLOW_HASH_PROTO) ? ip->protocol : 0; - t1 = is_tcp_udp ? tcp->ports.src : 0; - t2 = is_tcp_udp ? tcp->ports.dst : 0; + t1 = is_tcp_udp ? tcp->src : 0; + t2 = is_tcp_udp ? tcp->dst : 0; t1 = (flow_hash_config & IP_FLOW_HASH_SRC_PORT) ? t1 : 0; t2 = (flow_hash_config & IP_FLOW_HASH_DST_PORT) ? t2 : 0; @@ -497,6 +497,46 @@ int ip6_hbh_register_option (u8 option, int ip6_hbh_unregister_option (u8 option); void ip6_hbh_set_next_override (uword next); +/** + * Push IPv6 header to buffer + * + * @param vm - vlib_main + * @param b - buffer to write the header to + * @param src - source IP + * @param dst - destination IP + * @param prot - payload proto + * + * @return - pointer to start of IP header + */ +always_inline void * +vlib_buffer_push_ip6 (vlib_main_t * vm, vlib_buffer_t * b, + ip6_address_t * src, ip6_address_t * dst, int proto) +{ + ip6_header_t *ip6h; + u16 payload_length; + + /* make some room */ + ip6h = vlib_buffer_push_uninit (b, sizeof (ip6_header_t)); + + ip6h->ip_version_traffic_class_and_flow_label = + clib_host_to_net_u32 (0x6 << 28); + + /* calculate ip6 payload length */ + payload_length = vlib_buffer_length_in_chain (vm, b); + payload_length -= sizeof (*ip6h); + + ip6h->payload_length = clib_host_to_net_u16 (payload_length); + + ip6h->hop_limit = 0xff; + ip6h->protocol = proto; + clib_memcpy (ip6h->src_address.as_u8, src->as_u8, + sizeof (ip6h->src_address)); + clib_memcpy (ip6h->dst_address.as_u8, dst->as_u8, + sizeof (ip6h->src_address)); + + return ip6h; +} + #endif /* included_ip_ip6_h */ /* diff --git a/src/vnet/ip/ip6_packet.h b/src/vnet/ip/ip6_packet.h index 1e551c8b..4fd14b96 100644 --- a/src/vnet/ip/ip6_packet.h +++ b/src/vnet/ip/ip6_packet.h @@ -40,7 +40,7 @@ #ifndef included_ip6_packet_h #define included_ip6_packet_h -#include +#include #include typedef union @@ -373,10 +373,10 @@ ip6_tcp_reply_x1 (ip6_header_t * ip0, tcp_header_t * tcp0) { u16 src0, dst0; - src0 = tcp0->ports.src; - dst0 = tcp0->ports.dst; - tcp0->ports.src = dst0; - tcp0->ports.dst = src0; + src0 = tcp0->src; + dst0 = tcp0->dst; + tcp0->src = dst0; + tcp0->dst = src0; } } @@ -400,14 +400,14 @@ ip6_tcp_reply_x2 (ip6_header_t * ip0, ip6_header_t * ip1, { u16 src0, dst0, src1, dst1; - src0 = tcp0->ports.src; - src1 = tcp1->ports.src; - dst0 = tcp0->ports.dst; - dst1 = tcp1->ports.dst; - tcp0->ports.src = dst0; - tcp1->ports.src = dst1; - tcp0->ports.dst = src0; - tcp1->ports.dst = src1; + src0 = tcp0->src; + src1 = tcp1->src; + dst0 = tcp0->dst; + dst1 = tcp1->dst; + tcp0->src = dst0; + tcp1->src = dst1; + tcp0->dst = src0; + tcp1->dst = src1; } } diff --git a/src/vnet/ip/punt.c b/src/vnet/ip/punt.c index 9c735128..48558401 100644 --- a/src/vnet/ip/punt.c +++ b/src/vnet/ip/punt.c @@ -23,7 +23,7 @@ */ #include #include -#include +#include #include #define foreach_punt_next \ diff --git a/src/vnet/ip/tcp_packet.h b/src/vnet/ip/tcp_packet.h deleted file mode 100644 index 93f73e01..00000000 --- a/src/vnet/ip/tcp_packet.h +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * ip4/tcp_packet.h: TCP packet format (see RFC 793) - * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#ifndef included_tcp_packet_h -#define included_tcp_packet_h - -/* TCP flags bit 0 first. */ -#define foreach_tcp_flag \ - _ (FIN) \ - _ (SYN) \ - _ (RST) \ - _ (PSH) \ - _ (ACK) \ - _ (URG) \ - _ (ECE) \ - _ (CWR) - -enum -{ -#define _(f) TCP_FLAG_BIT_##f, - foreach_tcp_flag -#undef _ - TCP_N_FLAG_BITS, - -#define _(f) TCP_FLAG_##f = 1 << TCP_FLAG_BIT_##f, - foreach_tcp_flag -#undef _ -}; - -typedef struct -{ - /* Source and destination port. */ - union - { - union - { - struct - { - u16 src, dst; - }; - u32 src_and_dst; - } ports; - struct - { - u16 src_port, dst_port; - }; - }; - - /* Sequence and acknowledgment number. */ - u32 seq_number, ack_number; - - /* Size of TCP header in 32-bit units plus 4 reserved bits. */ - u8 tcp_header_u32s_and_reserved; - - /* see foreach_tcp_flag for enumation of tcp flags. */ - u8 flags; - - /* Current window advertised by sender. - This is the number of bytes sender is willing to receive - right now. */ - u16 window; - - /* Checksum of TCP pseudo header and data. */ - u16 checksum; - - u16 urgent_pointer; -} tcp_header_t; - -always_inline int -tcp_header_bytes (tcp_header_t * t) -{ - return (t->tcp_header_u32s_and_reserved >> 4) * sizeof (u32); -} - -/* TCP options. */ -typedef enum tcp_option_type -{ - TCP_OPTION_END = 0, - TCP_OPTION_NOP = 1, - TCP_OPTION_MSS = 2, - TCP_OPTION_WINDOW_SCALE = 3, - TCP_OPTION_SACK_PERMITTED = 4, - TCP_OPTION_SACK_BLOCK = 5, - TCP_OPTION_TIME_STAMP = 8, -} tcp_option_type_t; - -/* All except NOP and END have 1 byte length field. */ -typedef struct -{ - tcp_option_type_t type:8; - - /* Length of this option in bytes. */ - u8 length; -} tcp_option_with_length_t; - -#endif /* included_tcp_packet_h */ - - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/ip/udp.h b/src/vnet/ip/udp.h deleted file mode 100644 index bad58b5d..00000000 --- a/src/vnet/ip/udp.h +++ /dev/null @@ -1,315 +0,0 @@ -/* - * ip/udp.h: udp protocol - * - * Copyright (c) 2013 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef included_udp_h -#define included_udp_h - -#include -#include -#include -#include -#include -#include -#include - -typedef enum -{ -#define udp_error(n,s) UDP_ERROR_##n, -#include -#undef udp_error - UDP_N_ERROR, -} udp_error_t; - -#define foreach_udp4_dst_port \ -_ (67, dhcp_to_server) \ -_ (68, dhcp_to_client) \ -_ (500, ikev2) \ -_ (3784, bfd4) \ -_ (3785, bfd_echo4) \ -_ (4341, lisp_gpe) \ -_ (4342, lisp_cp) \ -_ (4739, ipfix) \ -_ (4789, vxlan) \ -_ (4789, vxlan6) \ -_ (4790, vxlan_gpe) \ -_ (6633, vpath_3) - - -#define foreach_udp6_dst_port \ -_ (547, dhcpv6_to_server) \ -_ (546, dhcpv6_to_client) \ -_ (3784, bfd6) \ -_ (3785, bfd_echo6) \ -_ (4341, lisp_gpe6) \ -_ (4342, lisp_cp6) \ -_ (4790, vxlan6_gpe) \ -_ (6633, vpath6_3) - -typedef enum -{ -#define _(n,f) UDP_DST_PORT_##f = n, - foreach_udp4_dst_port foreach_udp6_dst_port -#undef _ -} udp_dst_port_t; - -typedef enum -{ -#define _(n,f) UDP6_DST_PORT_##f = n, - foreach_udp6_dst_port -#undef _ -} udp6_dst_port_t; - -typedef struct -{ - /* Name (a c string). */ - char *name; - - /* GRE protocol type in host byte order. */ - udp_dst_port_t dst_port; - - /* Node which handles this type. */ - u32 node_index; - - /* Next index for this type. */ - u32 next_index; -} udp_dst_port_info_t; - -typedef enum -{ - UDP_IP6 = 0, - UDP_IP4, /* the code is full of is_ip4... */ - N_UDP_AF, -} udp_af_t; - -typedef struct -{ - udp_dst_port_info_t *dst_port_infos[N_UDP_AF]; - - /* Hash tables mapping name/protocol to protocol info index. */ - uword *dst_port_info_by_name[N_UDP_AF]; - uword *dst_port_info_by_dst_port[N_UDP_AF]; - - /* convenience */ - vlib_main_t *vlib_main; -} udp_main_t; - -always_inline udp_dst_port_info_t * -udp_get_dst_port_info (udp_main_t * um, udp_dst_port_t dst_port, u8 is_ip4) -{ - uword *p = hash_get (um->dst_port_info_by_dst_port[is_ip4], dst_port); - return p ? vec_elt_at_index (um->dst_port_infos[is_ip4], p[0]) : 0; -} - -format_function_t format_udp_header; -format_function_t format_udp_rx_trace; - -unformat_function_t unformat_udp_header; - -void udp_register_dst_port (vlib_main_t * vm, - udp_dst_port_t dst_port, - u32 node_index, u8 is_ip4); - -void udp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add); - -always_inline void -ip_udp_fixup_one (vlib_main_t * vm, vlib_buffer_t * b0, u8 is_ip4) -{ - u16 new_l0; - udp_header_t *udp0; - - if (is_ip4) - { - ip4_header_t *ip0; - ip_csum_t sum0; - u16 old_l0 = 0; - - ip0 = vlib_buffer_get_current (b0); - - /* fix the ing outer-IP checksum */ - sum0 = ip0->checksum; - /* old_l0 always 0, see the rewrite setup */ - new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)); - - sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t, - length /* changed member */ ); - ip0->checksum = ip_csum_fold (sum0); - ip0->length = new_l0; - - /* Fix UDP length */ - udp0 = (udp_header_t *) (ip0 + 1); - new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) - - sizeof (*ip0)); - udp0->length = new_l0; - } - else - { - ip6_header_t *ip0; - int bogus0; - - ip0 = vlib_buffer_get_current (b0); - - new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) - - sizeof (*ip0)); - ip0->payload_length = new_l0; - - /* Fix UDP length */ - udp0 = (udp_header_t *) (ip0 + 1); - udp0->length = new_l0; - - udp0->checksum = - ip6_tcp_udp_icmp_compute_checksum (vm, b0, ip0, &bogus0); - ASSERT (bogus0 == 0); - - if (udp0->checksum == 0) - udp0->checksum = 0xffff; - } -} - -always_inline void -ip_udp_encap_one (vlib_main_t * vm, vlib_buffer_t * b0, u8 * ec0, word ec_len, - u8 is_ip4) -{ - vlib_buffer_advance (b0, -ec_len); - - if (is_ip4) - { - ip4_header_t *ip0; - - ip0 = vlib_buffer_get_current (b0); - - /* Apply the encap string. */ - clib_memcpy (ip0, ec0, ec_len); - ip_udp_fixup_one (vm, b0, 1); - } - else - { - ip6_header_t *ip0; - - ip0 = vlib_buffer_get_current (b0); - - /* Apply the encap string. */ - clib_memcpy (ip0, ec0, ec_len); - ip_udp_fixup_one (vm, b0, 0); - } -} - -always_inline void -ip_udp_encap_two (vlib_main_t * vm, vlib_buffer_t * b0, vlib_buffer_t * b1, - u8 * ec0, u8 * ec1, word ec_len, u8 is_v4) -{ - u16 new_l0, new_l1; - udp_header_t *udp0, *udp1; - - ASSERT (_vec_len (ec0) == _vec_len (ec1)); - - vlib_buffer_advance (b0, -ec_len); - vlib_buffer_advance (b1, -ec_len); - - if (is_v4) - { - ip4_header_t *ip0, *ip1; - ip_csum_t sum0, sum1; - u16 old_l0 = 0, old_l1 = 0; - - ip0 = vlib_buffer_get_current (b0); - ip1 = vlib_buffer_get_current (b1); - - /* Apply the encap string */ - clib_memcpy (ip0, ec0, ec_len); - clib_memcpy (ip1, ec1, ec_len); - - /* fix the ing outer-IP checksum */ - sum0 = ip0->checksum; - sum1 = ip1->checksum; - - /* old_l0 always 0, see the rewrite setup */ - new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)); - new_l1 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1)); - - sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t, - length /* changed member */ ); - sum1 = ip_csum_update (sum1, old_l1, new_l1, ip4_header_t, - length /* changed member */ ); - - ip0->checksum = ip_csum_fold (sum0); - ip1->checksum = ip_csum_fold (sum1); - - ip0->length = new_l0; - ip1->length = new_l1; - - /* Fix UDP length */ - udp0 = (udp_header_t *) (ip0 + 1); - udp1 = (udp_header_t *) (ip1 + 1); - - new_l0 = - clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) - - sizeof (*ip0)); - new_l1 = - clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1) - - sizeof (*ip1)); - udp0->length = new_l0; - udp1->length = new_l1; - } - else - { - ip6_header_t *ip0, *ip1; - int bogus0, bogus1; - - ip0 = vlib_buffer_get_current (b0); - ip1 = vlib_buffer_get_current (b1); - - /* Apply the encap string. */ - clib_memcpy (ip0, ec0, ec_len); - clib_memcpy (ip1, ec1, ec_len); - - new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) - - sizeof (*ip0)); - new_l1 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1) - - sizeof (*ip1)); - ip0->payload_length = new_l0; - ip1->payload_length = new_l1; - - /* Fix UDP length */ - udp0 = (udp_header_t *) (ip0 + 1); - udp1 = (udp_header_t *) (ip1 + 1); - - udp0->length = new_l0; - udp1->length = new_l1; - - udp0->checksum = - ip6_tcp_udp_icmp_compute_checksum (vm, b0, ip0, &bogus0); - udp1->checksum = - ip6_tcp_udp_icmp_compute_checksum (vm, b1, ip1, &bogus1); - ASSERT (bogus0 == 0); - ASSERT (bogus1 == 0); - - if (udp0->checksum == 0) - udp0->checksum = 0xffff; - if (udp1->checksum == 0) - udp1->checksum = 0xffff; - } -} - -#endif /* included_udp_h */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/ip/udp_error.def b/src/vnet/ip/udp_error.def deleted file mode 100644 index bfdae0ac..00000000 --- a/src/vnet/ip/udp_error.def +++ /dev/null @@ -1,21 +0,0 @@ -/* - * udp_error.def: udp errors - * - * Copyright (c) 2013-2016 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -udp_error (NONE, "no error") -udp_error (NO_LISTENER, "no listener for dst port") -udp_error (LENGTH_ERROR, "UDP packets with length errors") -udp_error (PUNT, "no listener punt") diff --git a/src/vnet/ip/udp_format.c b/src/vnet/ip/udp_format.c deleted file mode 100644 index abdf561e..00000000 --- a/src/vnet/ip/udp_format.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * ip/udp_format.c: udp formatting - * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include - -/* Format UDP header. */ -u8 * -format_udp_header (u8 * s, va_list * args) -{ - udp_header_t *udp = va_arg (*args, udp_header_t *); - u32 max_header_bytes = va_arg (*args, u32); - uword indent; - u32 header_bytes = sizeof (udp[0]); - - /* Nothing to do. */ - if (max_header_bytes < sizeof (udp[0])) - return format (s, "UDP header truncated"); - - indent = format_get_indent (s); - indent += 2; - - s = format (s, "UDP: %d -> %d", - clib_net_to_host_u16 (udp->src_port), - clib_net_to_host_u16 (udp->dst_port)); - - s = format (s, "\n%Ulength %d, checksum 0x%04x", - format_white_space, indent, - clib_net_to_host_u16 (udp->length), - clib_net_to_host_u16 (udp->checksum)); - - /* Recurse into next protocol layer. */ - if (max_header_bytes != 0 && header_bytes < max_header_bytes) - { - ip_main_t *im = &ip_main; - tcp_udp_port_info_t *pi; - - pi = ip_get_tcp_udp_port_info (im, udp->dst_port); - - if (pi && pi->format_header) - s = format (s, "\n%U%U", - format_white_space, indent - 2, pi->format_header, - /* next protocol header */ (udp + 1), - max_header_bytes - sizeof (udp[0])); - } - - return s; -} - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/ip/udp_init.c b/src/vnet/ip/udp_init.c deleted file mode 100644 index 1241ca4a..00000000 --- a/src/vnet/ip/udp_init.c +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * ip/udp_init.c: udp initialization - * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include - -clib_error_t * -udp_init (vlib_main_t * vm) -{ - ip_main_t *im = &ip_main; - ip_protocol_info_t *pi; - clib_error_t *error; - - error = vlib_call_init_function (vm, ip_main_init); - - if (!error) - { - pi = ip_get_protocol_info (im, IP_PROTOCOL_UDP); - if (pi == 0) - return clib_error_return (0, "UDP protocol info AWOL"); - pi->format_header = format_udp_header; - pi->unformat_pg_edit = unformat_pg_udp_header; - } - - return 0; -} - -VLIB_INIT_FUNCTION (udp_init); - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/ip/udp_local.c b/src/vnet/ip/udp_local.c deleted file mode 100644 index 13ab6e4f..00000000 --- a/src/vnet/ip/udp_local.c +++ /dev/null @@ -1,645 +0,0 @@ -/* - * node.c: udp packet processing - * - * Copyright (c) 2013 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -udp_main_t udp_main; - -#define foreach_udp_input_next \ - _ (PUNT, "error-punt") \ - _ (DROP, "error-drop") \ - _ (ICMP4_ERROR, "ip4-icmp-error") \ - _ (ICMP6_ERROR, "ip6-icmp-error") - -typedef enum -{ -#define _(s,n) UDP_INPUT_NEXT_##s, - foreach_udp_input_next -#undef _ - UDP_INPUT_N_NEXT, -} udp_input_next_t; - -typedef struct -{ - u16 src_port; - u16 dst_port; - u8 bound; -} udp_rx_trace_t; - -u8 * -format_udp_rx_trace (u8 * s, va_list * args) -{ - CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); - CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); - udp_rx_trace_t *t = va_arg (*args, udp_rx_trace_t *); - - s = format (s, "UDP: src-port %d dst-port %d%s", - clib_net_to_host_u16 (t->src_port), - clib_net_to_host_u16 (t->dst_port), - t->bound ? "" : " (no listener)"); - return s; -} - -typedef struct -{ - /* Sparse vector mapping udp dst_port in network byte order - to next index. */ - u16 *next_by_dst_port; - u8 punt_unknown; -} udp_input_runtime_t; - -vlib_node_registration_t udp4_input_node; -vlib_node_registration_t udp6_input_node; - -always_inline uword -udp46_input_inline (vlib_main_t * vm, - vlib_node_runtime_t * node, - vlib_frame_t * from_frame, int is_ip4) -{ - udp_input_runtime_t *rt = is_ip4 ? - (void *) vlib_node_get_runtime_data (vm, udp4_input_node.index) - : (void *) vlib_node_get_runtime_data (vm, udp6_input_node.index); - __attribute__ ((unused)) u32 n_left_from, next_index, *from, *to_next; - word n_no_listener = 0; - u8 punt_unknown = rt->punt_unknown; - - from = vlib_frame_vector_args (from_frame); - n_left_from = from_frame->n_vectors; - - next_index = node->cached_next_index; - - while (n_left_from > 0) - { - u32 n_left_to_next; - - vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); - - while (n_left_from >= 4 && n_left_to_next >= 2) - { - u32 bi0, bi1; - vlib_buffer_t *b0, *b1; - udp_header_t *h0 = 0, *h1 = 0; - u32 i0, i1, dst_port0, dst_port1; - u32 advance0, advance1; - u32 error0, next0, error1, next1; - - /* Prefetch next iteration. */ - { - vlib_buffer_t *p2, *p3; - - p2 = vlib_get_buffer (vm, from[2]); - p3 = vlib_get_buffer (vm, from[3]); - - vlib_prefetch_buffer_header (p2, LOAD); - vlib_prefetch_buffer_header (p3, LOAD); - - CLIB_PREFETCH (p2->data, sizeof (h0[0]), LOAD); - CLIB_PREFETCH (p3->data, sizeof (h1[0]), LOAD); - } - - bi0 = from[0]; - bi1 = from[1]; - to_next[0] = bi0; - to_next[1] = bi1; - from += 2; - to_next += 2; - n_left_to_next -= 2; - n_left_from -= 2; - - b0 = vlib_get_buffer (vm, bi0); - b1 = vlib_get_buffer (vm, bi1); - - /* ip4/6_local hands us the ip header, not the udp header */ - if (is_ip4) - { - advance0 = sizeof (ip4_header_t); - advance1 = sizeof (ip4_header_t); - } - else - { - advance0 = sizeof (ip6_header_t); - advance1 = sizeof (ip6_header_t); - } - - if (PREDICT_FALSE (b0->current_length < advance0 + sizeof (*h0))) - { - error0 = UDP_ERROR_LENGTH_ERROR; - next0 = UDP_INPUT_NEXT_DROP; - } - else - { - vlib_buffer_advance (b0, advance0); - h0 = vlib_buffer_get_current (b0); - error0 = next0 = 0; - if (PREDICT_FALSE (clib_net_to_host_u16 (h0->length) > - vlib_buffer_length_in_chain (vm, b0))) - { - error0 = UDP_ERROR_LENGTH_ERROR; - next0 = UDP_INPUT_NEXT_DROP; - } - } - - if (PREDICT_FALSE (b1->current_length < advance1 + sizeof (*h1))) - { - error1 = UDP_ERROR_LENGTH_ERROR; - next1 = UDP_INPUT_NEXT_DROP; - } - else - { - vlib_buffer_advance (b1, advance1); - h1 = vlib_buffer_get_current (b1); - error1 = next1 = 0; - if (PREDICT_FALSE (clib_net_to_host_u16 (h1->length) > - vlib_buffer_length_in_chain (vm, b1))) - { - error1 = UDP_ERROR_LENGTH_ERROR; - next1 = UDP_INPUT_NEXT_DROP; - } - } - - /* Index sparse array with network byte order. */ - dst_port0 = (error0 == 0) ? h0->dst_port : 0; - dst_port1 = (error1 == 0) ? h1->dst_port : 0; - sparse_vec_index2 (rt->next_by_dst_port, dst_port0, dst_port1, - &i0, &i1); - next0 = (error0 == 0) ? vec_elt (rt->next_by_dst_port, i0) : next0; - next1 = (error1 == 0) ? vec_elt (rt->next_by_dst_port, i1) : next1; - - if (PREDICT_FALSE (i0 == SPARSE_VEC_INVALID_INDEX)) - { - // move the pointer back so icmp-error can find the - // ip packet header - vlib_buffer_advance (b0, -(word) advance0); - - if (PREDICT_FALSE (punt_unknown)) - { - b0->error = node->errors[UDP_ERROR_PUNT]; - next0 = UDP_INPUT_NEXT_PUNT; - } - else if (is_ip4) - { - icmp4_error_set_vnet_buffer (b0, - ICMP4_destination_unreachable, - ICMP4_destination_unreachable_port_unreachable, - 0); - next0 = UDP_INPUT_NEXT_ICMP4_ERROR; - n_no_listener++; - } - else - { - icmp6_error_set_vnet_buffer (b0, - ICMP6_destination_unreachable, - ICMP6_destination_unreachable_port_unreachable, - 0); - next0 = UDP_INPUT_NEXT_ICMP6_ERROR; - n_no_listener++; - } - } - else - { - b0->error = node->errors[UDP_ERROR_NONE]; - // advance to the payload - vlib_buffer_advance (b0, sizeof (*h0)); - } - - if (PREDICT_FALSE (i1 == SPARSE_VEC_INVALID_INDEX)) - { - // move the pointer back so icmp-error can find the - // ip packet header - vlib_buffer_advance (b1, -(word) advance1); - - if (PREDICT_FALSE (punt_unknown)) - { - b1->error = node->errors[UDP_ERROR_PUNT]; - next1 = UDP_INPUT_NEXT_PUNT; - } - else if (is_ip4) - { - icmp4_error_set_vnet_buffer (b1, - ICMP4_destination_unreachable, - ICMP4_destination_unreachable_port_unreachable, - 0); - next1 = UDP_INPUT_NEXT_ICMP4_ERROR; - n_no_listener++; - } - else - { - icmp6_error_set_vnet_buffer (b1, - ICMP6_destination_unreachable, - ICMP6_destination_unreachable_port_unreachable, - 0); - next1 = UDP_INPUT_NEXT_ICMP6_ERROR; - n_no_listener++; - } - } - else - { - b1->error = node->errors[UDP_ERROR_NONE]; - // advance to the payload - vlib_buffer_advance (b1, sizeof (*h1)); - } - - if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) - { - udp_rx_trace_t *tr = vlib_add_trace (vm, node, - b0, sizeof (*tr)); - if (b0->error != node->errors[UDP_ERROR_LENGTH_ERROR]) - { - tr->src_port = h0 ? h0->src_port : 0; - tr->dst_port = h0 ? h0->dst_port : 0; - tr->bound = (next0 != UDP_INPUT_NEXT_ICMP4_ERROR && - next0 != UDP_INPUT_NEXT_ICMP6_ERROR); - } - } - if (PREDICT_FALSE (b1->flags & VLIB_BUFFER_IS_TRACED)) - { - udp_rx_trace_t *tr = vlib_add_trace (vm, node, - b1, sizeof (*tr)); - if (b1->error != node->errors[UDP_ERROR_LENGTH_ERROR]) - { - tr->src_port = h1 ? h1->src_port : 0; - tr->dst_port = h1 ? h1->dst_port : 0; - tr->bound = (next1 != UDP_INPUT_NEXT_ICMP4_ERROR && - next1 != UDP_INPUT_NEXT_ICMP6_ERROR); - } - } - - vlib_validate_buffer_enqueue_x2 (vm, node, next_index, - to_next, n_left_to_next, - bi0, bi1, next0, next1); - } - - while (n_left_from > 0 && n_left_to_next > 0) - { - u32 bi0; - vlib_buffer_t *b0; - udp_header_t *h0 = 0; - u32 i0, next0; - u32 advance0; - - bi0 = from[0]; - to_next[0] = bi0; - from += 1; - to_next += 1; - n_left_from -= 1; - n_left_to_next -= 1; - - b0 = vlib_get_buffer (vm, bi0); - - /* ip4/6_local hands us the ip header, not the udp header */ - if (is_ip4) - advance0 = sizeof (ip4_header_t); - else - advance0 = sizeof (ip6_header_t); - - if (PREDICT_FALSE (b0->current_length < advance0 + sizeof (*h0))) - { - b0->error = node->errors[UDP_ERROR_LENGTH_ERROR]; - next0 = UDP_INPUT_NEXT_DROP; - goto trace_x1; - } - - vlib_buffer_advance (b0, advance0); - - h0 = vlib_buffer_get_current (b0); - - if (PREDICT_TRUE (clib_net_to_host_u16 (h0->length) <= - vlib_buffer_length_in_chain (vm, b0))) - { - i0 = sparse_vec_index (rt->next_by_dst_port, h0->dst_port); - next0 = vec_elt (rt->next_by_dst_port, i0); - - if (PREDICT_FALSE (i0 == SPARSE_VEC_INVALID_INDEX)) - { - // move the pointer back so icmp-error can find the - // ip packet header - vlib_buffer_advance (b0, -(word) advance0); - - if (PREDICT_FALSE (punt_unknown)) - { - b0->error = node->errors[UDP_ERROR_PUNT]; - next0 = UDP_INPUT_NEXT_PUNT; - } - else if (is_ip4) - { - icmp4_error_set_vnet_buffer (b0, - ICMP4_destination_unreachable, - ICMP4_destination_unreachable_port_unreachable, - 0); - next0 = UDP_INPUT_NEXT_ICMP4_ERROR; - n_no_listener++; - } - else - { - icmp6_error_set_vnet_buffer (b0, - ICMP6_destination_unreachable, - ICMP6_destination_unreachable_port_unreachable, - 0); - next0 = UDP_INPUT_NEXT_ICMP6_ERROR; - n_no_listener++; - } - } - else - { - b0->error = node->errors[UDP_ERROR_NONE]; - // advance to the payload - vlib_buffer_advance (b0, sizeof (*h0)); - } - } - else - { - b0->error = node->errors[UDP_ERROR_LENGTH_ERROR]; - next0 = UDP_INPUT_NEXT_DROP; - } - - trace_x1: - if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) - { - udp_rx_trace_t *tr = vlib_add_trace (vm, node, - b0, sizeof (*tr)); - if (b0->error != node->errors[UDP_ERROR_LENGTH_ERROR]) - { - tr->src_port = h0->src_port; - tr->dst_port = h0->dst_port; - tr->bound = (next0 != UDP_INPUT_NEXT_ICMP4_ERROR && - next0 != UDP_INPUT_NEXT_ICMP6_ERROR); - } - } - - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, - to_next, n_left_to_next, - bi0, next0); - } - - vlib_put_next_frame (vm, node, next_index, n_left_to_next); - } - vlib_error_count (vm, node->node_index, UDP_ERROR_NO_LISTENER, - n_no_listener); - return from_frame->n_vectors; -} - -static char *udp_error_strings[] = { -#define udp_error(n,s) s, -#include "udp_error.def" -#undef udp_error -}; - -static uword -udp4_input (vlib_main_t * vm, - vlib_node_runtime_t * node, vlib_frame_t * from_frame) -{ - return udp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ); -} - -static uword -udp6_input (vlib_main_t * vm, - vlib_node_runtime_t * node, vlib_frame_t * from_frame) -{ - return udp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ); -} - - -/* *INDENT-OFF* */ -VLIB_REGISTER_NODE (udp4_input_node) = { - .function = udp4_input, - .name = "ip4-udp-lookup", - /* Takes a vector of packets. */ - .vector_size = sizeof (u32), - - .runtime_data_bytes = sizeof (udp_input_runtime_t), - - .n_errors = UDP_N_ERROR, - .error_strings = udp_error_strings, - - .n_next_nodes = UDP_INPUT_N_NEXT, - .next_nodes = { -#define _(s,n) [UDP_INPUT_NEXT_##s] = n, - foreach_udp_input_next -#undef _ - }, - - .format_buffer = format_udp_header, - .format_trace = format_udp_rx_trace, - .unformat_buffer = unformat_udp_header, -}; -/* *INDENT-ON* */ - -VLIB_NODE_FUNCTION_MULTIARCH (udp4_input_node, udp4_input); - -/* *INDENT-OFF* */ -VLIB_REGISTER_NODE (udp6_input_node) = { - .function = udp6_input, - .name = "ip6-udp-lookup", - /* Takes a vector of packets. */ - .vector_size = sizeof (u32), - - .runtime_data_bytes = sizeof (udp_input_runtime_t), - - .n_errors = UDP_N_ERROR, - .error_strings = udp_error_strings, - - .n_next_nodes = UDP_INPUT_N_NEXT, - .next_nodes = { -#define _(s,n) [UDP_INPUT_NEXT_##s] = n, - foreach_udp_input_next -#undef _ - }, - - .format_buffer = format_udp_header, - .format_trace = format_udp_rx_trace, - .unformat_buffer = unformat_udp_header, -}; -/* *INDENT-ON* */ - -VLIB_NODE_FUNCTION_MULTIARCH (udp6_input_node, udp6_input); - -static void -add_dst_port (udp_main_t * um, - udp_dst_port_t dst_port, char *dst_port_name, u8 is_ip4) -{ - udp_dst_port_info_t *pi; - u32 i; - - vec_add2 (um->dst_port_infos[is_ip4], pi, 1); - i = pi - um->dst_port_infos[is_ip4]; - - pi->name = dst_port_name; - pi->dst_port = dst_port; - pi->next_index = pi->node_index = ~0; - - hash_set (um->dst_port_info_by_dst_port[is_ip4], dst_port, i); - - if (pi->name) - hash_set_mem (um->dst_port_info_by_name[is_ip4], pi->name, i); -} - -void -udp_register_dst_port (vlib_main_t * vm, - udp_dst_port_t dst_port, u32 node_index, u8 is_ip4) -{ - udp_main_t *um = &udp_main; - udp_dst_port_info_t *pi; - udp_input_runtime_t *rt; - u16 *n; - - { - clib_error_t *error = vlib_call_init_function (vm, udp_local_init); - if (error) - clib_error_report (error); - } - - pi = udp_get_dst_port_info (um, dst_port, is_ip4); - if (!pi) - { - add_dst_port (um, dst_port, 0, is_ip4); - pi = udp_get_dst_port_info (um, dst_port, is_ip4); - ASSERT (pi); - } - - pi->node_index = node_index; - pi->next_index = vlib_node_add_next (vm, - is_ip4 ? udp4_input_node.index - : udp6_input_node.index, node_index); - - /* Setup udp protocol -> next index sparse vector mapping. */ - rt = vlib_node_get_runtime_data - (vm, is_ip4 ? udp4_input_node.index : udp6_input_node.index); - n = sparse_vec_validate (rt->next_by_dst_port, - clib_host_to_net_u16 (dst_port)); - n[0] = pi->next_index; -} - -void -udp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add) -{ - udp_input_runtime_t *rt; - - { - clib_error_t *error = vlib_call_init_function (vm, udp_local_init); - if (error) - clib_error_report (error); - } - - rt = vlib_node_get_runtime_data - (vm, is_ip4 ? udp4_input_node.index : udp6_input_node.index); - - rt->punt_unknown = is_add; -} - -/* Parse a UDP header. */ -uword -unformat_udp_header (unformat_input_t * input, va_list * args) -{ - u8 **result = va_arg (*args, u8 **); - udp_header_t *udp; - __attribute__ ((unused)) int old_length; - u16 src_port, dst_port; - - /* Allocate space for IP header. */ - { - void *p; - - old_length = vec_len (*result); - vec_add2 (*result, p, sizeof (ip4_header_t)); - udp = p; - } - - memset (udp, 0, sizeof (udp[0])); - if (unformat (input, "src-port %d dst-port %d", &src_port, &dst_port)) - { - udp->src_port = clib_host_to_net_u16 (src_port); - udp->dst_port = clib_host_to_net_u16 (dst_port); - return 1; - } - return 0; -} - -static void -udp_setup_node (vlib_main_t * vm, u32 node_index) -{ - vlib_node_t *n = vlib_get_node (vm, node_index); - pg_node_t *pn = pg_get_node (node_index); - - n->format_buffer = format_udp_header; - n->unformat_buffer = unformat_udp_header; - pn->unformat_edit = unformat_pg_udp_header; -} - -clib_error_t * -udp_local_init (vlib_main_t * vm) -{ - udp_input_runtime_t *rt; - udp_main_t *um = &udp_main; - int i; - - { - clib_error_t *error; - error = vlib_call_init_function (vm, udp_init); - if (error) - clib_error_report (error); - } - - - for (i = 0; i < 2; i++) - { - um->dst_port_info_by_name[i] = hash_create_string (0, sizeof (uword)); - um->dst_port_info_by_dst_port[i] = hash_create (0, sizeof (uword)); - } - - udp_setup_node (vm, udp4_input_node.index); - udp_setup_node (vm, udp6_input_node.index); - - rt = vlib_node_get_runtime_data (vm, udp4_input_node.index); - - rt->next_by_dst_port = sparse_vec_new - ( /* elt bytes */ sizeof (rt->next_by_dst_port[0]), - /* bits in index */ BITS (((udp_header_t *) 0)->dst_port)); - - rt->punt_unknown = 0; - -#define _(n,s) add_dst_port (um, UDP_DST_PORT_##s, #s, 1 /* is_ip4 */); - foreach_udp4_dst_port -#undef _ - rt = vlib_node_get_runtime_data (vm, udp6_input_node.index); - - rt->next_by_dst_port = sparse_vec_new - ( /* elt bytes */ sizeof (rt->next_by_dst_port[0]), - /* bits in index */ BITS (((udp_header_t *) 0)->dst_port)); - - rt->punt_unknown = 0; - -#define _(n,s) add_dst_port (um, UDP_DST_PORT_##s, #s, 0 /* is_ip4 */); - foreach_udp6_dst_port -#undef _ - ip4_register_protocol (IP_PROTOCOL_UDP, udp4_input_node.index); - /* Note: ip6 differs from ip4, UDP is hotwired to ip6-udp-lookup */ - return 0; -} - -VLIB_INIT_FUNCTION (udp_local_init); - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/ip/udp_packet.h b/src/vnet/ip/udp_packet.h deleted file mode 100644 index beea3059..00000000 --- a/src/vnet/ip/udp_packet.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * ip4/udp_packet.h: UDP packet format - * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#ifndef included_udp_packet_h -#define included_udp_packet_h - -typedef struct -{ - /* Source and destination port. */ - u16 src_port, dst_port; - - /* Length of UDP header plus payload. */ - u16 length; - - /* Checksum of UDP pseudo-header and data or - zero if checksum is disabled. */ - u16 checksum; -} udp_header_t; - -#endif /* included_udp_packet_h */ - - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/ip/udp_pg.c b/src/vnet/ip/udp_pg.c deleted file mode 100644 index c9d8d38c..00000000 --- a/src/vnet/ip/udp_pg.c +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * ip/udp_pg: UDP packet-generator interface - * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include -#include /* for unformat_udp_udp_port */ - -#define UDP_PG_EDIT_LENGTH (1 << 0) -#define UDP_PG_EDIT_CHECKSUM (1 << 1) - -always_inline void -udp_pg_edit_function_inline (pg_main_t * pg, - pg_stream_t * s, - pg_edit_group_t * g, - u32 * packets, u32 n_packets, u32 flags) -{ - vlib_main_t *vm = vlib_get_main (); - u32 ip_offset, udp_offset; - - udp_offset = g->start_byte_offset; - ip_offset = (g - 1)->start_byte_offset; - - while (n_packets >= 1) - { - vlib_buffer_t *p0; - ip4_header_t *ip0; - udp_header_t *udp0; - u32 udp_len0; - - p0 = vlib_get_buffer (vm, packets[0]); - n_packets -= 1; - packets += 1; - - ip0 = (void *) (p0->data + ip_offset); - udp0 = (void *) (p0->data + udp_offset); - udp_len0 = clib_net_to_host_u16 (ip0->length) - sizeof (ip0[0]); - - if (flags & UDP_PG_EDIT_LENGTH) - udp0->length = - clib_net_to_host_u16 (vlib_buffer_length_in_chain (vm, p0) - - ip_offset); - - /* Initialize checksum with header. */ - if (flags & UDP_PG_EDIT_CHECKSUM) - { - ip_csum_t sum0; - - sum0 = clib_mem_unaligned (&ip0->src_address, u64); - - sum0 = ip_csum_with_carry - (sum0, clib_host_to_net_u32 (udp_len0 + (ip0->protocol << 16))); - - /* Invalidate possibly old checksum. */ - udp0->checksum = 0; - - sum0 = - ip_incremental_checksum_buffer (vm, p0, udp_offset, udp_len0, - sum0); - - sum0 = ~ip_csum_fold (sum0); - - /* Zero checksum means checksumming disabled. */ - sum0 = sum0 != 0 ? sum0 : 0xffff; - - udp0->checksum = sum0; - } - } -} - -static void -udp_pg_edit_function (pg_main_t * pg, - pg_stream_t * s, - pg_edit_group_t * g, u32 * packets, u32 n_packets) -{ - switch (g->edit_function_opaque) - { - case UDP_PG_EDIT_LENGTH: - udp_pg_edit_function_inline (pg, s, g, packets, n_packets, - UDP_PG_EDIT_LENGTH); - break; - - case UDP_PG_EDIT_CHECKSUM: - udp_pg_edit_function_inline (pg, s, g, packets, n_packets, - UDP_PG_EDIT_CHECKSUM); - break; - - case UDP_PG_EDIT_CHECKSUM | UDP_PG_EDIT_LENGTH: - udp_pg_edit_function_inline (pg, s, g, packets, n_packets, - UDP_PG_EDIT_CHECKSUM | UDP_PG_EDIT_LENGTH); - break; - - default: - ASSERT (0); - break; - } -} - -typedef struct -{ - pg_edit_t src_port, dst_port; - pg_edit_t length; - pg_edit_t checksum; -} pg_udp_header_t; - -static inline void -pg_udp_header_init (pg_udp_header_t * p) -{ - /* Initialize fields that are not bit fields in the IP header. */ -#define _(f) pg_edit_init (&p->f, udp_header_t, f); - _(src_port); - _(dst_port); - _(length); - _(checksum); -#undef _ -} - -uword -unformat_pg_udp_header (unformat_input_t * input, va_list * args) -{ - pg_stream_t *s = va_arg (*args, pg_stream_t *); - pg_udp_header_t *p; - u32 group_index; - - p = pg_create_edit_group (s, sizeof (p[0]), sizeof (udp_header_t), - &group_index); - pg_udp_header_init (p); - - /* Defaults. */ - p->checksum.type = PG_EDIT_UNSPECIFIED; - p->length.type = PG_EDIT_UNSPECIFIED; - - if (!unformat (input, "UDP: %U -> %U", - unformat_pg_edit, - unformat_tcp_udp_port, &p->src_port, - unformat_pg_edit, unformat_tcp_udp_port, &p->dst_port)) - goto error; - - /* Parse options. */ - while (1) - { - if (unformat (input, "length %U", - unformat_pg_edit, unformat_pg_number, &p->length)) - ; - - else if (unformat (input, "checksum %U", - unformat_pg_edit, unformat_pg_number, &p->checksum)) - ; - - /* Can't parse input: try next protocol level. */ - else - break; - } - - { - ip_main_t *im = &ip_main; - u16 dst_port; - tcp_udp_port_info_t *pi; - - pi = 0; - if (p->dst_port.type == PG_EDIT_FIXED) - { - dst_port = pg_edit_get_value (&p->dst_port, PG_EDIT_LO); - pi = ip_get_tcp_udp_port_info (im, dst_port); - } - - if (pi && pi->unformat_pg_edit - && unformat_user (input, pi->unformat_pg_edit, s)) - ; - - else if (!unformat_user (input, unformat_pg_payload, s)) - goto error; - - p = pg_get_edit_group (s, group_index); - if (p->checksum.type == PG_EDIT_UNSPECIFIED - || p->length.type == PG_EDIT_UNSPECIFIED) - { - pg_edit_group_t *g = pg_stream_get_group (s, group_index); - g->edit_function = udp_pg_edit_function; - g->edit_function_opaque = 0; - if (p->checksum.type == PG_EDIT_UNSPECIFIED) - g->edit_function_opaque |= UDP_PG_EDIT_CHECKSUM; - if (p->length.type == PG_EDIT_UNSPECIFIED) - g->edit_function_opaque |= UDP_PG_EDIT_LENGTH; - } - - return 1; - } - -error: - /* Free up any edits we may have added. */ - pg_free_edit_group (s); - return 0; -} - - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/ipsec/ikev2.c b/src/vnet/ipsec/ikev2.c index 09209334..2c1074d8 100644 --- a/src/vnet/ipsec/ikev2.c +++ b/src/vnet/ipsec/ikev2.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/vnet/ipsec/ikev2_cli.c b/src/vnet/ipsec/ikev2_cli.c index 5c88d8d4..05ed4e60 100644 --- a/src/vnet/ipsec/ikev2_cli.c +++ b/src/vnet/ipsec/ikev2_cli.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/vnet/ipsec/ikev2_crypto.c b/src/vnet/ipsec/ikev2_crypto.c index c201d3eb..ca56158f 100644 --- a/src/vnet/ipsec/ikev2_crypto.c +++ b/src/vnet/ipsec/ikev2_crypto.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/vnet/lisp-cp/packets.c b/src/vnet/lisp-cp/packets.c index 3a4f421b..f24024f1 100644 --- a/src/vnet/lisp-cp/packets.c +++ b/src/vnet/lisp-cp/packets.c @@ -15,7 +15,7 @@ #include #include -#include +#include /* Returns IP ID for the packet */ /* static u16 ip_id = 0; @@ -141,61 +141,6 @@ pkt_push_udp (vlib_main_t * vm, vlib_buffer_t * b, u16 sp, u16 dp) return uh; } -void * -pkt_push_ipv4 (vlib_main_t * vm, vlib_buffer_t * b, ip4_address_t * src, - ip4_address_t * dst, int proto) -{ - ip4_header_t *ih; - - /* make some room */ - ih = vlib_buffer_push_uninit (b, sizeof (ip4_header_t)); - - ih->ip_version_and_header_length = 0x45; - ih->tos = 0; - ih->length = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b)); - - /* iph->fragment_id = clib_host_to_net_u16(get_IP_ID ()); */ - - /* TODO: decide if we allow fragments in case of control */ - ih->flags_and_fragment_offset = clib_host_to_net_u16 (IP_DF); - ih->ttl = 255; - ih->protocol = proto; - ih->src_address.as_u32 = src->as_u32; - ih->dst_address.as_u32 = dst->as_u32; - - ih->checksum = ip4_header_checksum (ih); - return ih; -} - -void * -pkt_push_ipv6 (vlib_main_t * vm, vlib_buffer_t * b, ip6_address_t * src, - ip6_address_t * dst, int proto) -{ - ip6_header_t *ip6h; - u16 payload_length; - - /* make some room */ - ip6h = vlib_buffer_push_uninit (b, sizeof (ip6_header_t)); - - ip6h->ip_version_traffic_class_and_flow_label = - clib_host_to_net_u32 (0x6 << 28); - - /* calculate ip6 payload length */ - payload_length = vlib_buffer_length_in_chain (vm, b); - payload_length -= sizeof (*ip6h); - - ip6h->payload_length = clib_host_to_net_u16 (payload_length); - - ip6h->hop_limit = 0xff; - ip6h->protocol = proto; - clib_memcpy (ip6h->src_address.as_u8, src->as_u8, - sizeof (ip6h->src_address)); - clib_memcpy (ip6h->dst_address.as_u8, dst->as_u8, - sizeof (ip6h->src_address)); - - return ip6h; -} - void * pkt_push_ip (vlib_main_t * vm, vlib_buffer_t * b, ip_address_t * src, ip_address_t * dst, u32 proto) @@ -210,12 +155,12 @@ pkt_push_ip (vlib_main_t * vm, vlib_buffer_t * b, ip_address_t * src, switch (ip_addr_version (src)) { case IP4: - return pkt_push_ipv4 (vm, b, &ip_addr_v4 (src), &ip_addr_v4 (dst), - proto); + return vlib_buffer_push_ip4 (vm, b, &ip_addr_v4 (src), + &ip_addr_v4 (dst), proto); break; case IP6: - return pkt_push_ipv6 (vm, b, &ip_addr_v6 (src), &ip_addr_v6 (dst), - proto); + return vlib_buffer_push_ip6 (vm, b, &ip_addr_v6 (src), + &ip_addr_v6 (dst), proto); break; } diff --git a/src/vnet/lisp-cp/packets.h b/src/vnet/lisp-cp/packets.h index 212a1d78..f6da3bf4 100644 --- a/src/vnet/lisp-cp/packets.h +++ b/src/vnet/lisp-cp/packets.h @@ -26,51 +26,6 @@ void *pkt_push_udp_and_ip (vlib_main_t * vm, vlib_buffer_t * b, u16 sp, void *pkt_push_ecm_hdr (vlib_buffer_t * b); -always_inline u8 * -vlib_buffer_get_tail (vlib_buffer_t * b) -{ - return b->data + b->current_data + b->current_length; -} - -always_inline void * -vlib_buffer_put_uninit (vlib_buffer_t * b, u8 size) -{ - /* XXX should make sure there's enough space! */ - void *p = vlib_buffer_get_tail (b); - b->current_length += size; - return p; -} - -always_inline void * -vlib_buffer_push_uninit (vlib_buffer_t * b, u8 size) -{ - /* XXX should make sure there's enough space! */ - ASSERT (b->current_data >= size); - b->current_data -= size; - b->current_length += size; - - return vlib_buffer_get_current (b); -} - -always_inline void * -vlib_buffer_make_headroom (vlib_buffer_t * b, u8 size) -{ - /* XXX should make sure there's enough space! */ - b->current_data += size; - return vlib_buffer_get_current (b); -} - -always_inline void * -vlib_buffer_pull (vlib_buffer_t * b, u8 size) -{ - if (b->current_length < size) - return 0; - - void *data = vlib_buffer_get_current (b); - vlib_buffer_advance (b, size); - return data; -} - /* *INDENT-ON* */ /* diff --git a/src/vnet/lisp-gpe/interface.c b/src/vnet/lisp-gpe/interface.c index 13359277..292c7e6a 100644 --- a/src/vnet/lisp-gpe/interface.c +++ b/src/vnet/lisp-gpe/interface.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/vnet/lisp-gpe/lisp_gpe.h b/src/vnet/lisp-gpe/lisp_gpe.h index c898a7da..b5a50ec6 100644 --- a/src/vnet/lisp-gpe/lisp_gpe.h +++ b/src/vnet/lisp-gpe/lisp_gpe.h @@ -27,10 +27,12 @@ #include #include #include -#include +#include #include #include #include +#include +#include /** IP4-UDP-LISP encap header */ /* *INDENT-OFF* */ diff --git a/src/vnet/lisp-gpe/lisp_gpe_adjacency.c b/src/vnet/lisp-gpe/lisp_gpe_adjacency.c index 65006b81..dbcf7134 100644 --- a/src/vnet/lisp-gpe/lisp_gpe_adjacency.c +++ b/src/vnet/lisp-gpe/lisp_gpe_adjacency.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include /** * Memory pool of all adjacencies diff --git a/src/vnet/session/application.c b/src/vnet/session/application.c new file mode 100644 index 00000000..a561e7d1 --- /dev/null +++ b/src/vnet/session/application.c @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +/* + * Pool from which we allocate all applications + */ +static application_t *app_pool; + +/* + * Hash table of apps by api client index + */ +static uword *app_by_api_client_index; + +int +application_api_queue_is_full (application_t * app) +{ + unix_shared_memory_queue_t *q; + + /* builtin servers are always OK */ + if (app->api_client_index == ~0) + return 0; + + q = vl_api_client_index_to_input_queue (app->api_client_index); + if (!q) + return 1; + + if (q->cursize == q->maxsize) + return 1; + return 0; +} + +static void +application_table_add (application_t * app) +{ + hash_set (app_by_api_client_index, app->api_client_index, app->index); +} + +static void +application_table_del (application_t * app) +{ + hash_unset (app_by_api_client_index, app->api_client_index); +} + +application_t * +application_lookup (u32 api_client_index) +{ + uword *p; + p = hash_get (app_by_api_client_index, api_client_index); + if (p) + return application_get (p[0]); + + return 0; +} + +void +application_del (application_t * app) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + api_main_t *am = &api_main; + void *oldheap; + session_manager_t *sm; + + if (app->mode == APP_SERVER) + { + sm = session_manager_get (app->session_manager_index); + session_manager_del (smm, sm); + } + + /* Free the event fifo in the /vpe-api shared-memory segment */ + oldheap = svm_push_data_heap (am->vlib_rp); + if (app->event_queue) + unix_shared_memory_queue_free (app->event_queue); + svm_pop_heap (oldheap); + + application_table_del (app); + + pool_put (app_pool, app); +} + +application_t * +application_new (application_type_t type, session_type_t sst, + u32 api_client_index, u32 flags, session_cb_vft_t * cb_fns) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + api_main_t *am = &api_main; + application_t *app; + void *oldheap; + session_manager_t *sm; + + pool_get (app_pool, app); + memset (app, 0, sizeof (*app)); + + /* Allocate event fifo in the /vpe-api shared-memory segment */ + oldheap = svm_push_data_heap (am->vlib_rp); + + /* Allocate server event queue */ + app->event_queue = + unix_shared_memory_queue_init (128 /* nels $$$$ config */ , + sizeof (session_fifo_event_t), + 0 /* consumer pid */ , + 0 + /* (do not) signal when queue non-empty */ + ); + + svm_pop_heap (oldheap); + + /* If a server, allocate session manager */ + if (type == APP_SERVER) + { + pool_get (smm->session_managers, sm); + memset (sm, 0, sizeof (*sm)); + + app->session_manager_index = sm - smm->session_managers; + } + else if (type == APP_CLIENT) + { + /* Allocate connect session manager if needed */ + if (smm->connect_manager_index[sst] == INVALID_INDEX) + connects_session_manager_init (smm, sst); + app->session_manager_index = smm->connect_manager_index[sst]; + } + + app->mode = type; + app->index = application_get_index (app); + app->session_type = sst; + app->api_client_index = api_client_index; + app->flags = flags; + app->cb_fns = *cb_fns; + + /* Add app to lookup by api_client_index table */ + application_table_add (app); + + return app; +} + +application_t * +application_get (u32 index) +{ + return pool_elt_at_index (app_pool, index); +} + +u32 +application_get_index (application_t * app) +{ + return app - app_pool; +} + +int +application_server_init (application_t * server, u32 segment_size, + u32 add_segment_size, u32 rx_fifo_size, + u32 tx_fifo_size, u8 ** segment_name) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + session_manager_t *sm; + int rv; + + sm = session_manager_get (server->session_manager_index); + + /* Add first segment */ + if ((rv = session_manager_add_first_segment (smm, sm, segment_size, + segment_name))) + { + return rv; + } + + /* Setup session manager */ + sm->add_segment_size = add_segment_size; + sm->rx_fifo_size = rx_fifo_size; + sm->tx_fifo_size = tx_fifo_size; + sm->add_segment = sm->add_segment_size != 0; + return 0; +} + +u8 * +format_application_server (u8 * s, va_list * args) +{ + application_t *srv = va_arg (*args, application_t *); + int verbose = va_arg (*args, int); + vl_api_registration_t *regp; + stream_session_t *listener; + u8 *server_name, *str, *seg_name; + u32 segment_size; + + if (srv == 0) + { + if (verbose) + s = format (s, "%-40s%-20s%-15s%-15s%-10s", "Connection", "Server", + "Segment", "API Client", "Cookie"); + else + s = format (s, "%-40s%-20s", "Connection", "Server"); + + return s; + } + + regp = vl_api_client_index_to_registration (srv->api_client_index); + if (!regp) + server_name = format (0, "%s%c", regp->name, 0); + else + server_name = regp->name; + + listener = stream_session_listener_get (srv->session_type, + srv->session_index); + str = format (0, "%U", format_stream_session, listener, verbose); + + session_manager_get_segment_info (listener->server_segment_index, &seg_name, + &segment_size); + if (verbose) + { + s = format (s, "%-40s%-20s%-20s%-10d%-10d", str, server_name, + seg_name, srv->api_client_index, srv->accept_cookie); + } + else + s = format (s, "%-40s%-20s", str, server_name); + return s; +} + +u8 * +format_application_client (u8 * s, va_list * args) +{ + application_t *client = va_arg (*args, application_t *); + int verbose = va_arg (*args, int); + stream_session_t *session; + u8 *str, *seg_name; + u32 segment_size; + + if (client == 0) + { + if (verbose) + s = + format (s, "%-40s%-20s%-10s", "Connection", "Segment", + "API Client"); + else + s = format (s, "%-40s", "Connection"); + + return s; + } + + session = stream_session_get (client->session_index, client->thread_index); + str = format (0, "%U", format_stream_session, session, verbose); + + session_manager_get_segment_info (session->server_segment_index, &seg_name, + &segment_size); + if (verbose) + { + s = format (s, "%-40s%-20s%-10d%", str, seg_name, + client->api_client_index); + } + else + s = format (s, "%-40s", str); + return s; +} + +static clib_error_t * +show_app_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + application_t *app; + int do_server = 0; + int do_client = 0; + int verbose = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "server")) + do_server = 1; + else if (unformat (input, "client")) + do_client = 1; + else if (unformat (input, "verbose")) + verbose = 1; + else + break; + } + + if (do_server) + { + if (pool_elts (app_pool)) + { + vlib_cli_output (vm, "%U", format_application_server, + 0 /* header */ , + verbose); + /* *INDENT-OFF* */ + pool_foreach (app, app_pool, + ({ + if (app->mode == APP_SERVER) + vlib_cli_output (vm, "%U", format_application_server, app, + verbose); + })); + /* *INDENT-ON* */ + } + else + vlib_cli_output (vm, "No active server bindings"); + } + + if (do_client) + { + if (pool_elts (app_pool)) + { + vlib_cli_output (vm, "%U", format_application_client, + 0 /* header */ , + verbose); + /* *INDENT-OFF* */ + pool_foreach (app, app_pool, + ({ + if (app->mode == APP_CLIENT) + vlib_cli_output (vm, "%U", format_application_client, app, + verbose); + })); + /* *INDENT-ON* */ + } + else + vlib_cli_output (vm, "No active server bindings"); + } + + return 0; +} + +VLIB_CLI_COMMAND (show_app_command, static) = +{ +.path = "show app",.short_help = + "show app [server|client] [verbose]",.function = show_app_command_fn,}; + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/application.h b/src/vnet/session/application.h new file mode 100644 index 00000000..027d6967 --- /dev/null +++ b/src/vnet/session/application.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SRC_VNET_SESSION_APPLICATION_H_ +#define SRC_VNET_SESSION_APPLICATION_H_ + +#include +#include + +typedef enum +{ + APP_SERVER, + APP_CLIENT +} application_type_t; + +typedef struct _stream_session_cb_vft +{ + /** Notify server of new segment */ + int (*add_segment_callback) (u32 api_client_index, const u8 * seg_name, + u32 seg_size); + + /** Notify server of newly accepted session */ + int (*session_accept_callback) (stream_session_t * new_session); + + /* Connection request callback */ + int (*session_connected_callback) (u32 api_client_index, + stream_session_t * s, u8 code); + + /** Notify app that session is closing */ + void (*session_disconnect_callback) (stream_session_t * s); + + /** Notify app that session was reset */ + void (*session_reset_callback) (stream_session_t * s); + + /* Direct RX callback, for built-in servers */ + int (*builtin_server_rx_callback) (stream_session_t * session); + + /* Redirect connection to local server */ + int (*redirect_connect_callback) (u32 api_client_index, void *mp); +} session_cb_vft_t; + +typedef struct _application +{ + /** Index in server pool */ + u32 index; + + /** Flags */ + u32 flags; + + /** Binary API connection index, ~0 if internal */ + u32 api_client_index; + + /* */ + u32 api_context; + + /** Application listens for events on this svm queue */ + unix_shared_memory_queue_t *event_queue; + + /** Stream session type */ + u8 session_type; + + /* Stream server mode: accept or connect */ + u8 mode; + + u32 session_manager_index; + + /* + * Bind/Listen specific + */ + + /** Accept cookie, for multiple session flavors ($$$ maybe) */ + u32 accept_cookie; + + /** Index of the listen session or connect session */ + u32 session_index; + + /** Session thread index for client connect sessions */ + u32 thread_index; + + /* + * Callbacks: shoulder-taps for the server/client + */ + session_cb_vft_t cb_fns; +} application_t; + +application_t *application_new (application_type_t type, session_type_t sst, + u32 api_client_index, u32 flags, + session_cb_vft_t * cb_fns); +void application_del (application_t * app); +application_t *application_get (u32 index); +application_t *application_lookup (u32 api_client_index); +u32 application_get_index (application_t * app); + +int +application_server_init (application_t * server, u32 segment_size, + u32 add_segment_size, u32 rx_fifo_size, + u32 tx_fifo_size, u8 ** segment_name); +int application_api_queue_is_full (application_t * app); + +#endif /* SRC_VNET_SESSION_APPLICATION_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/application_interface.c b/src/vnet/session/application_interface.c new file mode 100644 index 00000000..0ea77fd8 --- /dev/null +++ b/src/vnet/session/application_interface.c @@ -0,0 +1,459 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include +#include + +/** @file + VPP's application/session API bind/unbind/connect/disconnect calls +*/ + +static u8 +ip_is_zero (ip46_address_t * ip46_address, u8 is_ip4) +{ + if (is_ip4) + return (ip46_address->ip4.as_u32 == 0); + else + return (ip46_address->as_u64[0] == 0 && ip46_address->as_u64[1] == 0); +} + +static u8 +ip_is_local (ip46_address_t * ip46_address, u8 is_ip4) +{ + fib_node_index_t fei; + fib_entry_flag_t flags; + fib_prefix_t prefix; + + /* Check if requester is local */ + if (is_ip4) + { + prefix.fp_len = 32; + prefix.fp_proto = FIB_PROTOCOL_IP4; + } + else + { + prefix.fp_len = 128; + prefix.fp_proto = FIB_PROTOCOL_IP6; + } + + clib_memcpy (&prefix.fp_addr, ip46_address, sizeof (ip46_address)); + fei = fib_table_lookup (0, &prefix); + flags = fib_entry_get_flags (fei); + + return (flags & FIB_ENTRY_FLAG_LOCAL); +} + +int +api_parse_session_handle (u64 handle, u32 * session_index, u32 * thread_index) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + stream_session_t *pool; + + *thread_index = handle & 0xFFFFFFFF; + *session_index = handle >> 32; + + if (*thread_index >= vec_len (smm->sessions)) + return VNET_API_ERROR_INVALID_VALUE; + + pool = smm->sessions[*thread_index]; + + if (pool_is_free_index (pool, *session_index)) + return VNET_API_ERROR_INVALID_VALUE_2; + + return 0; +} + +int +vnet_bind_i (u32 api_client_index, ip46_address_t * ip46, u16 port_host_order, + session_type_t sst, u64 * options, session_cb_vft_t * cb_fns, + application_t ** app, u32 * len_seg_name, char *seg_name) +{ + u8 *segment_name = 0; + application_t *server = 0; + stream_session_t *listener; + u8 is_ip4; + + listener = + stream_session_lookup_listener (ip46, + clib_host_to_net_u16 (port_host_order), + sst); + + if (listener) + return VNET_API_ERROR_ADDRESS_IN_USE; + + if (application_lookup (api_client_index)) + { + clib_warning ("Only one bind supported for now"); + return VNET_API_ERROR_ADDRESS_IN_USE; + } + + is_ip4 = SESSION_TYPE_IP4_UDP == sst || SESSION_TYPE_IP4_TCP == sst; + if (!ip_is_zero (ip46, is_ip4) && !ip_is_local (ip46, is_ip4)) + return VNET_API_ERROR_INVALID_VALUE; + + /* Allocate and initialize stream server */ + server = application_new (APP_SERVER, sst, api_client_index, + options[SESSION_OPTIONS_FLAGS], cb_fns); + + application_server_init (server, options[SESSION_OPTIONS_SEGMENT_SIZE], + options[SESSION_OPTIONS_ADD_SEGMENT_SIZE], + options[SESSION_OPTIONS_RX_FIFO_SIZE], + options[SESSION_OPTIONS_TX_FIFO_SIZE], + &segment_name); + + /* Setup listen path down to transport */ + stream_session_start_listen (server->index, ip46, port_host_order); + + /* + * Return values + */ + + ASSERT (vec_len (segment_name) <= 128); + *len_seg_name = vec_len (segment_name); + memcpy (seg_name, segment_name, *len_seg_name); + *app = server; + + return 0; +} + +int +vnet_unbind_i (u32 api_client_index) +{ + application_t *server; + + /* + * Find the stream_server_t corresponding to the api client + */ + server = application_lookup (api_client_index); + if (!server) + return VNET_API_ERROR_INVALID_VALUE_2; + + /* Clear the listener */ + stream_session_stop_listen (server->index); + application_del (server); + + return 0; +} + +int +vnet_connect_i (u32 api_client_index, u32 api_context, session_type_t sst, + ip46_address_t * ip46, u16 port, u64 * options, void *mp, + session_cb_vft_t * cb_fns) +{ + stream_session_t *listener; + application_t *server, *app; + + /* + * Figure out if connecting to a local server + */ + listener = stream_session_lookup_listener (ip46, + clib_host_to_net_u16 (port), + sst); + if (listener) + { + server = application_get (listener->app_index); + + /* + * Server is willing to have a direct fifo connection created + * instead of going through the state machine, etc. + */ + if (server->flags & SESSION_OPTIONS_FLAGS_USE_FIFO) + return server->cb_fns. + redirect_connect_callback (server->api_client_index, mp); + } + + /* Create client app */ + app = application_new (APP_CLIENT, sst, api_client_index, + options[SESSION_OPTIONS_FLAGS], cb_fns); + + app->api_context = api_context; + + /* + * Not connecting to a local server. Create regular session + */ + stream_session_open (sst, ip46, port, app->index); + + return 0; +} + +/** + * unformat a vnet URI + * + * fifo://name + * tcp://ip46-addr:port + * udp://ip46-addr:port + * + * u8 ip46_address[16]; + * u16 port_in_host_byte_order; + * stream_session_type_t sst; + * u8 *fifo_name; + * + * if (unformat (input, "%U", unformat_vnet_uri, &ip46_address, + * &sst, &port, &fifo_name)) + * etc... + * + */ +uword +unformat_vnet_uri (unformat_input_t * input, va_list * args) +{ + ip46_address_t *address = va_arg (*args, ip46_address_t *); + session_type_t *sst = va_arg (*args, session_type_t *); + u16 *port = va_arg (*args, u16 *); + + if (unformat (input, "tcp://%U/%d", unformat_ip4_address, &address->ip4, + port)) + { + *sst = SESSION_TYPE_IP4_TCP; + return 1; + } + if (unformat (input, "udp://%U/%d", unformat_ip4_address, &address->ip4, + port)) + { + *sst = SESSION_TYPE_IP4_UDP; + return 1; + } + if (unformat (input, "udp://%U/%d", unformat_ip6_address, &address->ip6, + port)) + { + *sst = SESSION_TYPE_IP6_UDP; + return 1; + } + if (unformat (input, "tcp://%U/%d", unformat_ip6_address, &address->ip6, + port)) + { + *sst = SESSION_TYPE_IP6_TCP; + return 1; + } + + return 0; +} + +int +parse_uri (char *uri, session_type_t * sst, ip46_address_t * addr, + u16 * port_number_host_byte_order) +{ + unformat_input_t _input, *input = &_input; + + /* Make sure */ + uri = (char *) format (0, "%s%c", uri, 0); + + /* Parse uri */ + unformat_init_string (input, uri, strlen (uri)); + if (!unformat (input, "%U", unformat_vnet_uri, addr, sst, + port_number_host_byte_order)) + { + unformat_free (input); + return VNET_API_ERROR_INVALID_VALUE; + } + unformat_free (input); + + return 0; +} + +int +vnet_bind_uri (vnet_bind_args_t * a) +{ + application_t *server = 0; + u16 port_host_order; + session_type_t sst = SESSION_N_TYPES; + ip46_address_t ip46; + int rv; + + memset (&ip46, 0, sizeof (ip46)); + rv = parse_uri (a->uri, &sst, &ip46, &port_host_order); + if (rv) + return rv; + + if ((rv = vnet_bind_i (a->api_client_index, &ip46, port_host_order, sst, + a->options, a->session_cb_vft, &server, + &a->segment_name_length, a->segment_name))) + return rv; + + a->server_event_queue_address = (u64) server->event_queue; + return 0; +} + +session_type_t +session_type_from_proto_and_ip (session_api_proto_t proto, u8 is_ip4) +{ + if (proto == SESSION_PROTO_TCP) + { + if (is_ip4) + return SESSION_TYPE_IP4_TCP; + else + return SESSION_TYPE_IP6_TCP; + } + else + { + if (is_ip4) + return SESSION_TYPE_IP4_UDP; + else + return SESSION_TYPE_IP6_UDP; + } + + return SESSION_N_TYPES; +} + +int +vnet_unbind_uri (char *uri, u32 api_client_index) +{ + u16 port_number_host_byte_order; + session_type_t sst = SESSION_N_TYPES; + ip46_address_t ip46_address; + stream_session_t *listener; + int rv; + + rv = parse_uri (uri, &sst, &ip46_address, &port_number_host_byte_order); + if (rv) + return rv; + + listener = + stream_session_lookup_listener (&ip46_address, + clib_host_to_net_u16 + (port_number_host_byte_order), sst); + + if (!listener) + return VNET_API_ERROR_ADDRESS_NOT_IN_USE; + + /* External client? */ + if (api_client_index != ~0) + { + ASSERT (vl_api_client_index_to_registration (api_client_index)); + } + + return vnet_unbind_i (api_client_index); +} + +int +vnet_connect_uri (vnet_connect_args_t * a) +{ + ip46_address_t ip46_address; + u16 port; + session_type_t sst; + application_t *app; + int rv; + + app = application_lookup (a->api_client_index); + if (app) + { + clib_warning ("Already have a connect from this app"); + return VNET_API_ERROR_INVALID_VALUE_2; + } + + /* Parse uri */ + rv = parse_uri (a->uri, &sst, &ip46_address, &port); + if (rv) + return rv; + + return vnet_connect_i (a->api_client_index, a->api_context, sst, + &ip46_address, port, a->options, a->mp, + a->session_cb_vft); +} + +int +vnet_disconnect_session (u32 client_index, u32 session_index, + u32 thread_index) +{ + stream_session_t *session; + + session = stream_session_get (session_index, thread_index); + stream_session_disconnect (session); + + return 0; +} + + +int +vnet_bind (vnet_bind_args_t * a) +{ + application_t *server = 0; + session_type_t sst = SESSION_N_TYPES; + int rv; + + sst = session_type_from_proto_and_ip (a->proto, a->tep.is_ip4); + if ((rv = vnet_bind_i (a->api_client_index, &a->tep.ip, a->tep.port, sst, + a->options, a->session_cb_vft, &server, + &a->segment_name_length, a->segment_name))) + return rv; + + a->server_event_queue_address = (u64) server->event_queue; + a->handle = (u64) a->tep.vrf << 32 | (u64) server->session_index; + return 0; +} + +int +vnet_unbind (vnet_unbind_args_t * a) +{ + application_t *server; + + if (a->api_client_index != ~0) + { + ASSERT (vl_api_client_index_to_registration (a->api_client_index)); + } + + /* Make sure this is the right one */ + server = application_lookup (a->api_client_index); + ASSERT (server->session_index == (0xFFFFFFFF & a->handle)); + + /* TODO use handle to disambiguate namespaces/vrfs */ + return vnet_unbind_i (a->api_client_index); +} + +int +vnet_connect (vnet_connect_args_t * a) +{ + session_type_t sst; + application_t *app; + + app = application_lookup (a->api_client_index); + if (app) + { + clib_warning ("Already have a connect from this app"); + return VNET_API_ERROR_INVALID_VALUE_2; + } + + sst = session_type_from_proto_and_ip (a->proto, a->tep.is_ip4); + return vnet_connect_i (a->api_client_index, a->api_context, sst, &a->tep.ip, + a->tep.port, a->options, a->mp, a->session_cb_vft); +} + +int +vnet_disconnect (vnet_disconnect_args_t * a) +{ + stream_session_t *session; + u32 session_index, thread_index; + + if (api_parse_session_handle (a->handle, &session_index, &thread_index)) + { + clib_warning ("Invalid handle"); + return -1; + } + + session = stream_session_get (session_index, thread_index); + stream_session_disconnect (session); + + return 0; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/application_interface.h b/src/vnet/session/application_interface.h new file mode 100644 index 00000000..8d87c067 --- /dev/null +++ b/src/vnet/session/application_interface.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_uri_h__ +#define __included_uri_h__ + +#include +#include +#include +#include +#include +#include + +typedef enum _session_api_proto +{ + SESSION_PROTO_TCP, + SESSION_PROTO_UDP +} session_api_proto_t; + +typedef struct _vnet_bind_args_t +{ + union + { + char *uri; + struct + { + transport_endpoint_t tep; + session_api_proto_t proto; + }; + }; + + u32 api_client_index; + u64 *options; + session_cb_vft_t *session_cb_vft; + + /* + * Results + */ + char *segment_name; + u32 segment_name_length; + u64 server_event_queue_address; + u64 handle; +} vnet_bind_args_t; + +typedef struct _vnet_unbind_args_t +{ + union + { + char *uri; + u64 handle; + }; + u32 api_client_index; +} vnet_unbind_args_t; + +typedef struct _vnet_connect_args +{ + union + { + char *uri; + struct + { + transport_endpoint_t tep; + session_api_proto_t proto; + }; + }; + u32 api_client_index; + u32 api_context; + u64 *options; + session_cb_vft_t *session_cb_vft; + + /* Used for redirects */ + void *mp; +} vnet_connect_args_t; + +typedef struct _vnet_disconnect_args_t +{ + u64 handle; + u32 api_client_index; +} vnet_disconnect_args_t; + +/* Bind / connect options */ +typedef enum +{ + SESSION_OPTIONS_FLAGS, + SESSION_OPTIONS_SEGMENT_SIZE, + SESSION_OPTIONS_ADD_SEGMENT_SIZE, + SESSION_OPTIONS_RX_FIFO_SIZE, + SESSION_OPTIONS_TX_FIFO_SIZE, + SESSION_OPTIONS_ACCEPT_COOKIE, + SESSION_OPTIONS_N_OPTIONS +} session_options_index_t; + +/** Server can handle delegated connect requests from local clients */ +#define SESSION_OPTIONS_FLAGS_USE_FIFO (1<<0) + +/** Server wants vpp to add segments when out of memory for fifos */ +#define SESSION_OPTIONS_FLAGS_ADD_SEGMENT (1<<1) + +#define VNET_CONNECT_REDIRECTED 123 + +int vnet_bind_uri (vnet_bind_args_t *); +int vnet_unbind_uri (char *uri, u32 api_client_index); +int vnet_connect_uri (vnet_connect_args_t * a); +int +vnet_disconnect_session (u32 client_index, u32 session_index, + u32 thread_index); + +int vnet_bind (vnet_bind_args_t * a); +int vnet_connect (vnet_connect_args_t * a); +int vnet_unbind (vnet_unbind_args_t * a); +int vnet_disconnect (vnet_disconnect_args_t * a); + +int +api_parse_session_handle (u64 handle, u32 * session_index, + u32 * thread_index); + +#endif /* __included_uri_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/hashes.c b/src/vnet/session/hashes.c new file mode 100644 index 00000000..1808dd73 --- /dev/null +++ b/src/vnet/session/hashes.c @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Generate typed init functions for multiple hash table styles... */ + +#include +#include + +#include + +#undef __included_bihash_template_h__ + +#include +#include + +#include diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c new file mode 100644 index 00000000..e467f4e9 --- /dev/null +++ b/src/vnet/session/node.c @@ -0,0 +1,435 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include + +vlib_node_registration_t session_queue_node; + +typedef struct +{ + u32 session_index; + u32 server_thread_index; +} session_queue_trace_t; + +/* packet trace format function */ +static u8 * +format_session_queue_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + session_queue_trace_t *t = va_arg (*args, session_queue_trace_t *); + + s = format (s, "SESSION_QUEUE: session index %d, server thread index %d", + t->session_index, t->server_thread_index); + return s; +} + +vlib_node_registration_t session_queue_node; + +#define foreach_session_queue_error \ +_(TX, "Packets transmitted") \ +_(TIMER, "Timer events") + +typedef enum +{ +#define _(sym,str) SESSION_QUEUE_ERROR_##sym, + foreach_session_queue_error +#undef _ + SESSION_QUEUE_N_ERROR, +} session_queue_error_t; + +static char *session_queue_error_strings[] = { +#define _(sym,string) string, + foreach_session_queue_error +#undef _ +}; + +static u32 session_type_to_next[] = { + SESSION_QUEUE_NEXT_TCP_IP4_OUTPUT, + SESSION_QUEUE_NEXT_IP4_LOOKUP, + SESSION_QUEUE_NEXT_TCP_IP6_OUTPUT, + SESSION_QUEUE_NEXT_IP6_LOOKUP, +}; + +always_inline int +session_fifo_rx_i (vlib_main_t * vm, vlib_node_runtime_t * node, + session_manager_main_t * smm, session_fifo_event_t * e0, + stream_session_t * s0, u32 thread_index, int *n_tx_packets, + u8 peek_data) +{ + u32 n_trace = vlib_get_trace_count (vm, node); + u32 left_to_snd0, max_len_to_snd0, len_to_deq0, n_bufs, snd_space0; + u32 n_frame_bytes, n_frames_per_evt; + transport_connection_t *tc0; + transport_proto_vft_t *transport_vft; + u32 next_index, next0, *to_next, n_left_to_next, bi0; + vlib_buffer_t *b0; + u32 rx_offset; + u16 snd_mss0; + u8 *data0; + int i; + + next_index = next0 = session_type_to_next[s0->session_type]; + + transport_vft = session_get_transport_vft (s0->session_type); + tc0 = transport_vft->get_connection (s0->connection_index, thread_index); + + /* Make sure we have space to send and there's something to dequeue */ + snd_space0 = transport_vft->send_space (tc0); + snd_mss0 = transport_vft->send_mss (tc0); + + if (snd_space0 == 0 || svm_fifo_max_dequeue (s0->server_tx_fifo) == 0 + || snd_mss0 == 0) + return 0; + + ASSERT (e0->enqueue_length > 0); + + /* Ensure we're not writing more than transport window allows */ + max_len_to_snd0 = clib_min (e0->enqueue_length, snd_space0); + + if (peek_data) + { + /* Offset in rx fifo from where to peek data */ + rx_offset = transport_vft->rx_fifo_offset (tc0); + } + + /* TODO check if transport is willing to send len_to_snd0 + * bytes (Nagle) */ + + n_frame_bytes = snd_mss0 * VLIB_FRAME_SIZE; + n_frames_per_evt = ceil ((double) max_len_to_snd0 / n_frame_bytes); + + n_bufs = vec_len (smm->tx_buffers[thread_index]); + left_to_snd0 = max_len_to_snd0; + for (i = 0; i < n_frames_per_evt; i++) + { + /* Make sure we have at least one full frame of buffers ready */ + if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE)) + { + vec_validate (smm->tx_buffers[thread_index], + n_bufs + VLIB_FRAME_SIZE - 1); + n_bufs += + vlib_buffer_alloc (vm, &smm->tx_buffers[thread_index][n_bufs], + VLIB_FRAME_SIZE); + + /* buffer shortage + * XXX 0.9 because when debugging we might not get a full frame */ + if (PREDICT_FALSE (n_bufs < 0.9 * VLIB_FRAME_SIZE)) + { + /* Keep track of how much we've dequeued and exit */ + e0->enqueue_length -= max_len_to_snd0 - left_to_snd0; + return -1; + } + + _vec_len (smm->tx_buffers[thread_index]) = n_bufs; + } + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + while (left_to_snd0 && n_left_to_next) + { + /* Get free buffer */ + n_bufs--; + bi0 = smm->tx_buffers[thread_index][n_bufs]; + _vec_len (smm->tx_buffers[thread_index]) = n_bufs; + + b0 = vlib_get_buffer (vm, bi0); + b0->error = 0; + b0->flags = VLIB_BUFFER_TOTAL_LENGTH_VALID + | VNET_BUFFER_LOCALLY_ORIGINATED; + b0->current_data = 0; + + /* RX on the local interface. tx in default fib */ + vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + + /* usual speculation, or the enqueue_x1 macro will barf */ + to_next[0] = bi0; + to_next += 1; + n_left_to_next -= 1; + + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + if (PREDICT_FALSE (n_trace > 0)) + { + session_queue_trace_t *t0; + vlib_trace_buffer (vm, node, next_index, b0, + 1 /* follow_chain */ ); + vlib_set_trace_count (vm, node, --n_trace); + t0 = vlib_add_trace (vm, node, b0, sizeof (*t0)); + t0->session_index = s0->session_index; + t0->server_thread_index = s0->thread_index; + } + + if (1) + { + ELOG_TYPE_DECLARE (e) = + { + .format = "evt-dequeue: id %d length %d",.format_args = + "i4i4",}; + struct + { + u32 data[2]; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->data[0] = e0->event_id; + ed->data[1] = e0->enqueue_length; + } + + len_to_deq0 = (left_to_snd0 < snd_mss0) ? left_to_snd0 : snd_mss0; + + /* Make room for headers */ + data0 = vlib_buffer_make_headroom (b0, MAX_HDRS_LEN); + + /* Dequeue the data + * TODO 1) peek instead of dequeue + * 2) buffer chains */ + if (peek_data) + { + int n_bytes_read; + n_bytes_read = svm_fifo_peek (s0->server_tx_fifo, s0->pid, + rx_offset, len_to_deq0, data0); + if (n_bytes_read < 0) + goto dequeue_fail; + + /* Keep track of progress locally, transport is also supposed to + * increment it independently when pushing header */ + rx_offset += n_bytes_read; + } + else + { + if (svm_fifo_dequeue_nowait (s0->server_tx_fifo, s0->pid, + len_to_deq0, data0) < 0) + goto dequeue_fail; + } + + b0->current_length = len_to_deq0; + + /* Ask transport to push header */ + transport_vft->push_header (tc0, b0); + + left_to_snd0 -= len_to_deq0; + *n_tx_packets = *n_tx_packets + 1; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + /* If we couldn't dequeue all bytes store progress */ + if (max_len_to_snd0 < e0->enqueue_length) + { + e0->enqueue_length -= max_len_to_snd0; + vec_add1 (smm->evts_partially_read[thread_index], *e0); + } + return 0; + +dequeue_fail: + /* Can't read from fifo. Store event rx progress, save as partially read, + * return buff to free list and return */ + e0->enqueue_length -= max_len_to_snd0 - left_to_snd0; + vec_add1 (smm->evts_partially_read[thread_index], *e0); + + to_next -= 1; + n_left_to_next += 1; + _vec_len (smm->tx_buffers[thread_index]) += 1; + + clib_warning ("dequeue fail"); + return 0; +} + +int +session_fifo_rx_peek (vlib_main_t * vm, vlib_node_runtime_t * node, + session_manager_main_t * smm, session_fifo_event_t * e0, + stream_session_t * s0, u32 thread_index, int *n_tx_pkts) +{ + return session_fifo_rx_i (vm, node, smm, e0, s0, thread_index, n_tx_pkts, + 1); +} + +int +session_fifo_rx_dequeue (vlib_main_t * vm, vlib_node_runtime_t * node, + session_manager_main_t * smm, + session_fifo_event_t * e0, stream_session_t * s0, + u32 thread_index, int *n_tx_pkts) +{ + return session_fifo_rx_i (vm, node, smm, e0, s0, thread_index, n_tx_pkts, + 0); +} + +static uword +session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + session_fifo_event_t *my_fifo_events, *e; + u32 n_to_dequeue; + unix_shared_memory_queue_t *q; + int n_tx_packets = 0; + u32 my_thread_index = vm->cpu_index; + int i, rv; + + /* + * Update TCP time + */ + tcp_update_time (vlib_time_now (vm), my_thread_index); + + /* + * Get vpp queue events + */ + q = smm->vpp_event_queues[my_thread_index]; + if (PREDICT_FALSE (q == 0)) + return 0; + + /* min number of events we can dequeue without blocking */ + n_to_dequeue = q->cursize; + if (n_to_dequeue == 0) + return 0; + + my_fifo_events = smm->fifo_events[my_thread_index]; + + /* If we didn't manage to process previous events try going + * over them again without dequeuing new ones. + * XXX: Block senders to sessions that can't keep up */ + if (vec_len (my_fifo_events) >= 100) + goto skip_dequeue; + + /* See you in the next life, don't be late */ + if (pthread_mutex_trylock (&q->mutex)) + return 0; + + for (i = 0; i < n_to_dequeue; i++) + { + vec_add2 (my_fifo_events, e, 1); + unix_shared_memory_queue_sub_raw (q, (u8 *) e); + } + + /* The other side of the connection is not polling */ + if (q->cursize < (q->maxsize / 8)) + (void) pthread_cond_broadcast (&q->condvar); + pthread_mutex_unlock (&q->mutex); + + smm->fifo_events[my_thread_index] = my_fifo_events; + +skip_dequeue: + + for (i = 0; i < n_to_dequeue; i++) + { + svm_fifo_t *f0; /* $$$ prefetch 1 ahead maybe */ + stream_session_t *s0; + u32 server_session_index0, server_thread_index0; + session_fifo_event_t *e0; + + e0 = &my_fifo_events[i]; + f0 = e0->fifo; + server_session_index0 = f0->server_session_index; + server_thread_index0 = f0->server_thread_index; + + /* $$$ add multiple event queues, per vpp worker thread */ + ASSERT (server_thread_index0 == my_thread_index); + + s0 = pool_elt_at_index (smm->sessions[my_thread_index], + server_session_index0); + + ASSERT (s0->thread_index == my_thread_index); + + switch (e0->event_type) + { + case FIFO_EVENT_SERVER_TX: + /* Spray packets in per session type frames, since they go to + * different nodes */ + rv = (smm->session_rx_fns[s0->session_type]) (vm, node, smm, e0, s0, + my_thread_index, + &n_tx_packets); + if (rv < 0) + goto done; + + break; + + default: + clib_warning ("unhandled event type %d", e0->event_type); + } + } + +done: + + /* Couldn't process all events. Probably out of buffers */ + if (PREDICT_FALSE (i < n_to_dequeue)) + { + session_fifo_event_t *partially_read = + smm->evts_partially_read[my_thread_index]; + vec_add (partially_read, &my_fifo_events[i], n_to_dequeue - i); + vec_free (my_fifo_events); + smm->fifo_events[my_thread_index] = partially_read; + smm->evts_partially_read[my_thread_index] = 0; + } + else + { + vec_free (smm->fifo_events[my_thread_index]); + smm->fifo_events[my_thread_index] = + smm->evts_partially_read[my_thread_index]; + smm->evts_partially_read[my_thread_index] = 0; + } + + vlib_node_increment_counter (vm, session_queue_node.index, + SESSION_QUEUE_ERROR_TX, n_tx_packets); + + return n_tx_packets; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (session_queue_node) = +{ + .function = session_queue_node_fn, + .name = "session-queue", + .format_trace = format_session_queue_trace, + .type = VLIB_NODE_TYPE_INPUT, + .n_errors = ARRAY_LEN (session_queue_error_strings), + .error_strings = session_queue_error_strings, + .n_next_nodes = SESSION_QUEUE_N_NEXT, + /* .state = VLIB_NODE_STATE_DISABLED, enable on-demand? */ + /* edit / add dispositions here */ + .next_nodes = + { + [SESSION_QUEUE_NEXT_DROP] = "error-drop", + [SESSION_QUEUE_NEXT_IP4_LOOKUP] = "ip4-lookup", + [SESSION_QUEUE_NEXT_IP6_LOOKUP] = "ip6-lookup", + [SESSION_QUEUE_NEXT_TCP_IP4_OUTPUT] = "tcp4-output", + [SESSION_QUEUE_NEXT_TCP_IP6_OUTPUT] = "tcp6-output", + }, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/session.api b/src/vnet/session/session.api new file mode 100644 index 00000000..a7b28c1d --- /dev/null +++ b/src/vnet/session/session.api @@ -0,0 +1,429 @@ +/* + * Copyright (c) 2015-2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /** \brief Bind to a given URI + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param accept_cookie - sender accept cookie, to identify this bind flavor + @param uri - a URI, e.g. "tcp://0.0.0.0/0/80" [ipv4] + "tcp://::/0/80" [ipv6] etc. + @param options - socket options, fifo sizes, etc. +*/ +define bind_uri { + u32 client_index; + u32 context; + u32 accept_cookie; + u32 initial_segment_size; + u8 uri[128]; + u64 options[16]; +}; + +/** \brief Unbind a given URI + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param uri - a URI, e.g. "tcp://0.0.0.0/0/80" [ipv4] + "tcp://::/0/80" [ipv6], etc. + @param options - socket options, fifo sizes, etc. +*/ +define unbind_uri { + u32 client_index; + u32 context; + u8 uri[128]; +}; + +/** \brief Connect to a given URI + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param accept_cookie - sender accept cookie, to identify this bind flavor + @param uri - a URI, e.g. "tcp4://0.0.0.0/0/80" + "tcp6://::/0/80" [ipv6], etc. + @param options - socket options, fifo sizes, etc. +*/ +define connect_uri { + u32 client_index; + u32 context; + u8 uri[128]; + u64 client_queue_address; + u64 options[16]; +}; + +/** \brief Bind reply + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param event_queue_address - vpp event queue address or 0 if this + connection shouldn't send events + @param segment_name_length - length of segment name + @param segment_name - name of segment client needs to attach to +*/ +define bind_uri_reply { + u32 context; + i32 retval; + u64 server_event_queue_address; + u8 segment_name_length; + u32 segment_size; + u8 segment_name[128]; +}; + +/** \brief unbind reply + @param context - sender context, to match reply w/ request + @param retval - return code for the request +*/ +define unbind_uri_reply { + u32 context; + i32 retval; +}; + +/** \brief vpp->client, connect reply + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param server_rx_fifo - rx (vpp -> vpp-client) fifo address + @param server_tx_fifo - tx (vpp-client -> vpp) fifo address + @param session_index - session index; + @param session_thread_index - session thread index + @param session_type - session thread type + @param vpp_event_queue_address - vpp's event queue address + @param client_event_queue_address - client's event queue address + @param segment_name_length - non-zero if the client needs to attach to + the fifo segment + @param segment_name - set if the client needs to attach to the segment +*/ +define connect_uri_reply { + u32 context; + i32 retval; + u64 server_rx_fifo; + u64 server_tx_fifo; + u32 session_index; + u32 session_thread_index; + u8 session_type; + u64 client_event_queue_address; + u64 vpp_event_queue_address; + u32 segment_size; + u8 segment_name_length; + u8 segment_name[128]; +}; + +/** \brief vpp->client, please map an additional shared memory segment + @param context - sender context, to match reply w/ request + @param segment_name - +*/ +define map_another_segment { + u32 client_index; + u32 context; + u32 segment_size; + u8 segment_name[128]; +}; + +/** \brief client->vpp + @param context - sender context, to match reply w/ request + @param retval - return code for the request +*/ +define map_another_segment_reply { + u32 context; + i32 retval; +}; + +/** \brief vpp->client, accept this session + @param context - sender context, to match reply w/ request + @param accept_cookie - tells client which bind flavor just occurred + @param rx_fifo_address - rx (vpp -> vpp-client) fifo address + @param tx_fifo_address - tx (vpp-client -> vpp) fifo address + @param session_index - index of new session + @param session_thread_index - thread index of new session + @param vpp_event_queue_address - vpp's event queue address + @param session_type - type of session + +*/ +define accept_session { + u32 client_index; + u32 context; + u32 accept_cookie; + u64 server_rx_fifo; + u64 server_tx_fifo; + u32 session_index; + u32 session_thread_index; + u64 vpp_event_queue_address; + u8 session_type; +}; + +/** \brief client->vpp, reply to an accept message + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param session_index - session index from accept_session / connect_reply + @param session_thread_index - thread index from accept_session / + connect_reply +*/ +define accept_session_reply { + u32 context; + i32 retval; + u8 session_type; + u8 session_thread_index; + u32 session_index; +}; + +/** \brief bidirectional disconnect API + @param client_index - opaque cookie to identify the sender + client to vpp direction only + @param context - sender context, to match reply w/ request + @param session_index - cookie #1 from accept_session / connect_reply + @param session_thread_index - cookie #2 +*/ +define disconnect_session { + u32 client_index; + u32 context; + u32 session_index; + u32 session_thread_index; +}; + +/** \brief bidirectional disconnect reply API + @param client_index - opaque cookie to identify the sender + client to vpp direction only + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param session_index - session index from accept_session / connect_reply + @param session_thread_index - thread index from accept_session / + connect_reply +*/ +define disconnect_session_reply { + u32 client_index; + u32 context; + i32 retval; + u32 session_index; + u32 session_thread_index; +}; + +/** \brief vpp->client reset session API + @param client_index - opaque cookie to identify the sender + client to vpp direction only + @param context - sender context, to match reply w/ request + @param session_index - session index from accept_session / connect_reply + @param session_thread_index - thread index from accept_session / + connect_reply +*/ +define reset_session { + u32 client_index; + u32 context; + u32 session_index; + u32 session_thread_index; +}; + +/** \brief client->vpp reset session reply + @param client_index - opaque cookie to identify the sender + client to vpp direction only + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param session_index - session index from accept_session / connect_reply + @param session_thread_index - thread index from accept_session / + connect_reply +*/ +define reset_session_reply { + u32 client_index; + u32 context; + i32 retval; + u32 session_index; + u32 session_thread_index; +}; + +/** \brief Bind to an ip:port pair for a given transport protocol + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param vrf - bind namespace + @param is_ip4 - flag that is 1 if ip address family is IPv4 + @param ip - ip address + @param port - port + @param proto - protocol 0 - TCP 1 - UDP + @param options - socket options, fifo sizes, etc. +*/ +define bind_sock { + u32 client_index; + u32 context; + u32 vrf; + u8 is_ip4; + u8 ip[16]; + u16 port; + u8 proto; + u64 options[16]; +}; + +/** \brief Unbind + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param handle - bind handle obtained from bind reply +*/ +define unbind_sock { + u32 client_index; + u32 context; + u64 handle; +}; + +/** \brief Connect to a remote peer + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param vrf - connection namespace + @param is_ip4 - flag that is 1 if ip address family is IPv4 + @param ip - ip address + @param port - port + @param proto - protocol 0 - TCP 1 - UDP + @param client_queue_address - client's API queue address. Non-zero when + used to perform redirects + @param options - socket options, fifo sizes, etc. +*/ +define connect_sock { + u32 client_index; + u32 context; + u32 vrf; + u8 is_ip4; + u8 ip[16]; + u16 port; + u8 proto; + u64 client_queue_address; + u64 options[16]; +}; + +/** \brief Bind reply + @param context - sender context, to match reply w/ request + @param handle - bind handle + @param retval - return code for the request + @param event_queue_address - vpp event queue address or 0 if this + connection shouldn't send events + @param segment_name_length - length of segment name + @param segment_name - name of segment client needs to attach to +*/ +define bind_sock_reply { + u32 context; + u64 handle; + i32 retval; + u64 server_event_queue_address; + u32 segment_size; + u8 segment_name_length; + u8 segment_name[128]; +}; + +/** \brief unbind reply + @param context - sender context, to match reply w/ request + @param retval - return code for the request +*/ +define unbind_sock_reply { + u32 context; + i32 retval; +}; + +/** \brief vpp/server->client, connect reply + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param handle - connection handle + @param server_rx_fifo - rx (vpp -> vpp-client) fifo address + @param server_tx_fifo - tx (vpp-client -> vpp) fifo address + @param vpp_event_queue_address - vpp's event queue address + @param client_event_queue_address - client's event queue address + @param segment_name_length - non-zero if the client needs to attach to + the fifo segment + @param segment_name - set if the client needs to attach to the segment +*/ +define connect_sock_reply { + u32 context; + i32 retval; + u64 handle; + u64 server_rx_fifo; + u64 server_tx_fifo; + u64 client_event_queue_address; + u64 vpp_event_queue_address; + u32 segment_size; + u8 segment_name_length; + u8 segment_name[128]; +}; + +/** \brief bidirectional disconnect API + @param client_index - opaque cookie to identify the sender + client to vpp direction only + @param context - sender context, to match reply w/ request + @param handle - session handle obtained through accept/connect +*/ +define disconnect_sock { + u32 client_index; + u32 context; + u64 handle; +}; + +/** \brief bidirectional disconnect reply API + @param client_index - opaque cookie to identify the sender + client to vpp direction only + @param client_context - sender context, to match reply w/ request + @param handle - session handle obtained through accept/connect +*/ +define disconnect_sock_reply { + u32 client_index; + u32 context; + i32 retval; + u64 handle; +}; + +/** \brief vpp->client, accept this session + @param context - sender context, to match reply w/ request + @param accept_cookie - tells client which bind flavor just occurred + @param handle - session handle obtained through accept/connect + @param rx_fifo_address - rx (vpp -> vpp-client) fifo address + @param tx_fifo_address - tx (vpp-client -> vpp) fifo address + @param vpp_event_queue_address - vpp's event queue address +*/ +define accept_sock { + u32 client_index; + u32 context; + u32 accept_cookie; + u64 handle; + u64 server_rx_fifo; + u64 server_tx_fifo; + u64 vpp_event_queue_address; +}; + +/** \brief client->vpp, reply to an accept message + @param context - sender context, to match reply w/ request + @param retval - return code for the request + @param handle - session handle obtained through accept/connect +*/ +define accept_sock_reply { + u32 context; + i32 retval; + u64 handle; +}; + +/** \brief vpp->client reset session API + @param client_index - opaque cookie to identify the sender + client to vpp direction only + @param context - sender context, to match reply w/ request + @param handle - session handle obtained through accept/connect +*/ +define reset_sock { + u32 client_index; + u32 context; + u64 handle; +}; + +/** \brief client->vpp reset session reply + @param client_index - opaque cookie to identify the sender + client to vpp direction only + @param context - sender context, to match reply w/ request + @param handle - session handle obtained through accept/connect +*/ +define reset_sock_reply { + u32 client_index; + u32 context; + i32 retval; + u64 handle; +}; +/* + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ \ No newline at end of file diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c new file mode 100644 index 00000000..539da613 --- /dev/null +++ b/src/vnet/session/session.c @@ -0,0 +1,1286 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @file + * @brief Session and session manager + */ + +#include +#include +#include +#include +#include + +/** + * Per-type vector of transport protocol virtual function tables + */ +static transport_proto_vft_t *tp_vfts; + +session_manager_main_t session_manager_main; + +/* + * Session lookup key; (src-ip, dst-ip, src-port, dst-port, session-type) + * Value: (owner thread index << 32 | session_index); + */ +static void +stream_session_table_add_for_tc (u8 sst, transport_connection_t * tc, + u64 value) +{ + session_manager_main_t *smm = &session_manager_main; + session_kv4_t kv4; + session_kv6_t kv6; + + switch (sst) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + make_v4_ss_kv_from_tc (&kv4, tc); + kv4.value = value; + clib_bihash_add_del_16_8 (&smm->v4_session_hash, &kv4, 1 /* is_add */ ); + break; + case SESSION_TYPE_IP6_UDP: + case SESSION_TYPE_IP6_TCP: + make_v6_ss_kv_from_tc (&kv6, tc); + kv6.value = value; + clib_bihash_add_del_48_8 (&smm->v6_session_hash, &kv6, 1 /* is_add */ ); + break; + default: + clib_warning ("Session type not supported"); + ASSERT (0); + } +} + +void +stream_session_table_add (session_manager_main_t * smm, stream_session_t * s, + u64 value) +{ + transport_connection_t *tc; + + tc = tp_vfts[s->session_type].get_connection (s->connection_index, + s->thread_index); + stream_session_table_add_for_tc (s->session_type, tc, value); +} + +static void +stream_session_half_open_table_add (u8 sst, transport_connection_t * tc, + u64 value) +{ + session_manager_main_t *smm = &session_manager_main; + session_kv4_t kv4; + session_kv6_t kv6; + + switch (sst) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + make_v4_ss_kv_from_tc (&kv4, tc); + kv4.value = value; + clib_bihash_add_del_16_8 (&smm->v4_half_open_hash, &kv4, + 1 /* is_add */ ); + break; + case SESSION_TYPE_IP6_UDP: + case SESSION_TYPE_IP6_TCP: + make_v6_ss_kv_from_tc (&kv6, tc); + kv6.value = value; + clib_bihash_add_del_48_8 (&smm->v6_half_open_hash, &kv6, + 1 /* is_add */ ); + break; + default: + clib_warning ("Session type not supported"); + ASSERT (0); + } +} + +static int +stream_session_table_del_for_tc (session_manager_main_t * smm, u8 sst, + transport_connection_t * tc) +{ + session_kv4_t kv4; + session_kv6_t kv6; + + switch (sst) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + make_v4_ss_kv_from_tc (&kv4, tc); + return clib_bihash_add_del_16_8 (&smm->v4_session_hash, &kv4, + 0 /* is_add */ ); + break; + case SESSION_TYPE_IP6_UDP: + case SESSION_TYPE_IP6_TCP: + make_v6_ss_kv_from_tc (&kv6, tc); + return clib_bihash_add_del_48_8 (&smm->v6_session_hash, &kv6, + 0 /* is_add */ ); + break; + default: + clib_warning ("Session type not supported"); + ASSERT (0); + } + + return 0; +} + +static int +stream_session_table_del (session_manager_main_t * smm, stream_session_t * s) +{ + transport_connection_t *ts; + + ts = tp_vfts[s->session_type].get_connection (s->connection_index, + s->thread_index); + return stream_session_table_del_for_tc (smm, s->session_type, ts); +} + +static void +stream_session_half_open_table_del (session_manager_main_t * smm, u8 sst, + transport_connection_t * tc) +{ + session_kv4_t kv4; + session_kv6_t kv6; + + switch (sst) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + make_v4_ss_kv_from_tc (&kv4, tc); + clib_bihash_add_del_16_8 (&smm->v4_half_open_hash, &kv4, + 0 /* is_add */ ); + break; + case SESSION_TYPE_IP6_UDP: + case SESSION_TYPE_IP6_TCP: + make_v6_ss_kv_from_tc (&kv6, tc); + clib_bihash_add_del_48_8 (&smm->v6_half_open_hash, &kv6, + 0 /* is_add */ ); + break; + default: + clib_warning ("Session type not supported"); + ASSERT (0); + } +} + +stream_session_t * +stream_session_lookup_listener4 (ip4_address_t * lcl, u16 lcl_port, u8 proto) +{ + session_manager_main_t *smm = &session_manager_main; + session_kv4_t kv4; + int rv; + + make_v4_listener_kv (&kv4, lcl, lcl_port, proto); + rv = clib_bihash_search_inline_16_8 (&smm->v4_session_hash, &kv4); + if (rv == 0) + return pool_elt_at_index (smm->listen_sessions[proto], (u32) kv4.value); + + /* Zero out the lcl ip */ + kv4.key[0] = 0; + rv = clib_bihash_search_inline_16_8 (&smm->v4_session_hash, &kv4); + if (rv == 0) + return pool_elt_at_index (smm->listen_sessions[proto], kv4.value); + + return 0; +} + +/** Looks up a session based on the 5-tuple passed as argument. + * + * First it tries to find an established session, if this fails, it tries + * finding a listener session if this fails, it tries a lookup with a + * wildcarded local source (listener bound to all interfaces) + */ +stream_session_t * +stream_session_lookup4 (ip4_address_t * lcl, ip4_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto, + u32 my_thread_index) +{ + session_manager_main_t *smm = &session_manager_main; + session_kv4_t kv4; + int rv; + + /* Lookup session amongst established ones */ + make_v4_ss_kv (&kv4, lcl, rmt, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_16_8 (&smm->v4_session_hash, &kv4); + if (rv == 0) + return stream_session_get_tsi (kv4.value, my_thread_index); + + /* If nothing is found, check if any listener is available */ + return stream_session_lookup_listener4 (lcl, lcl_port, proto); +} + +stream_session_t * +stream_session_lookup_listener6 (ip6_address_t * lcl, u16 lcl_port, u8 proto) +{ + session_manager_main_t *smm = &session_manager_main; + session_kv6_t kv6; + int rv; + + make_v6_listener_kv (&kv6, lcl, lcl_port, proto); + rv = clib_bihash_search_inline_48_8 (&smm->v6_session_hash, &kv6); + if (rv == 0) + return pool_elt_at_index (smm->listen_sessions[proto], kv6.value); + + /* Zero out the lcl ip */ + kv6.key[0] = kv6.key[1] = 0; + rv = clib_bihash_search_inline_48_8 (&smm->v6_session_hash, &kv6); + if (rv == 0) + return pool_elt_at_index (smm->listen_sessions[proto], kv6.value); + + return 0; +} + +/* Looks up a session based on the 5-tuple passed as argument. + * First it tries to find an established session, if this fails, it tries + * finding a listener session if this fails, it tries a lookup with a + * wildcarded local source (listener bound to all interfaces) */ +stream_session_t * +stream_session_lookup6 (ip6_address_t * lcl, ip6_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto, + u32 my_thread_index) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + session_kv6_t kv6; + int rv; + + make_v6_ss_kv (&kv6, lcl, rmt, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_48_8 (&smm->v6_session_hash, &kv6); + if (rv == 0) + return stream_session_get_tsi (kv6.value, my_thread_index); + + /* If nothing is found, check if any listener is available */ + return stream_session_lookup_listener6 (lcl, lcl_port, proto); +} + +stream_session_t * +stream_session_lookup_listener (ip46_address_t * lcl, u16 lcl_port, u8 proto) +{ + switch (proto) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + return stream_session_lookup_listener4 (&lcl->ip4, lcl_port, proto); + break; + case SESSION_TYPE_IP6_UDP: + case SESSION_TYPE_IP6_TCP: + return stream_session_lookup_listener6 (&lcl->ip6, lcl_port, proto); + break; + } + return 0; +} + +static u64 +stream_session_half_open_lookup (session_manager_main_t * smm, + ip46_address_t * lcl, ip46_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto) +{ + session_kv4_t kv4; + session_kv6_t kv6; + int rv; + + switch (proto) + { + case SESSION_TYPE_IP4_UDP: + case SESSION_TYPE_IP4_TCP: + make_v4_ss_kv (&kv4, &lcl->ip4, &rmt->ip4, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_16_8 (&smm->v4_half_open_hash, &kv4); + + if (rv == 0) + return kv4.value; + + return (u64) ~ 0; + break; + case SESSION_TYPE_IP6_UDP: + case SESSION_TYPE_IP6_TCP: + make_v6_ss_kv (&kv6, &lcl->ip6, &rmt->ip6, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_48_8 (&smm->v6_half_open_hash, &kv6); + + if (rv == 0) + return kv6.value; + + return (u64) ~ 0; + break; + } + return 0; +} + +transport_connection_t * +stream_session_lookup_transport4 (session_manager_main_t * smm, + ip4_address_t * lcl, ip4_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto, + u32 my_thread_index) +{ + session_kv4_t kv4; + stream_session_t *s; + int rv; + + /* Lookup session amongst established ones */ + make_v4_ss_kv (&kv4, lcl, rmt, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_16_8 (&smm->v4_session_hash, &kv4); + if (rv == 0) + { + s = stream_session_get_tsi (kv4.value, my_thread_index); + + return tp_vfts[s->session_type].get_connection (s->connection_index, + my_thread_index); + } + + /* If nothing is found, check if any listener is available */ + s = stream_session_lookup_listener4 (lcl, lcl_port, proto); + if (s) + return tp_vfts[s->session_type].get_listener (s->connection_index); + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_16_8 (&smm->v4_half_open_hash, &kv4); + if (rv == 0) + return tp_vfts[proto].get_half_open (kv4.value & 0xFFFFFFFF); + + return 0; +} + +transport_connection_t * +stream_session_lookup_transport6 (session_manager_main_t * smm, + ip6_address_t * lcl, ip6_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto, + u32 my_thread_index) +{ + stream_session_t *s; + session_kv6_t kv6; + int rv; + + make_v6_ss_kv (&kv6, lcl, rmt, lcl_port, rmt_port, proto); + rv = clib_bihash_search_inline_48_8 (&smm->v6_session_hash, &kv6); + if (rv == 0) + { + s = stream_session_get_tsi (kv6.value, my_thread_index); + + return tp_vfts[s->session_type].get_connection (s->connection_index, + my_thread_index); + } + + /* If nothing is found, check if any listener is available */ + s = stream_session_lookup_listener6 (lcl, lcl_port, proto); + if (s) + return tp_vfts[s->session_type].get_listener (s->connection_index); + + /* Finally, try half-open connections */ + rv = clib_bihash_search_inline_48_8 (&smm->v6_half_open_hash, &kv6); + if (rv == 0) + return tp_vfts[s->session_type].get_half_open (kv6.value & 0xFFFFFFFF); + + return 0; +} + +/** + * Allocate vpp event queue (once) per worker thread + */ +void +vpp_session_event_queue_allocate (session_manager_main_t * smm, + u32 thread_index) +{ + api_main_t *am = &api_main; + void *oldheap; + + if (smm->vpp_event_queues[thread_index] == 0) + { + /* Allocate event fifo in the /vpe-api shared-memory segment */ + oldheap = svm_push_data_heap (am->vlib_rp); + + smm->vpp_event_queues[thread_index] = + unix_shared_memory_queue_init (2048 /* nels $$$$ config */ , + sizeof (session_fifo_event_t), + 0 /* consumer pid */ , + 0 + /* (do not) send signal when queue non-empty */ + ); + + svm_pop_heap (oldheap); + } +} + +void +session_manager_get_segment_info (u32 index, u8 ** name, u32 * size) +{ + svm_fifo_segment_private_t *s; + s = svm_fifo_get_segment (index); + *name = s->h->segment_name; + *size = s->ssvm.ssvm_size; +} + +always_inline int +session_manager_add_segment_i (session_manager_main_t * smm, + session_manager_t * sm, + u32 segment_size, u8 * segment_name) +{ + svm_fifo_segment_create_args_t _ca, *ca = &_ca; + int rv; + + memset (ca, 0, sizeof (*ca)); + + ca->segment_name = (char *) segment_name; + ca->segment_size = segment_size; + + rv = svm_fifo_segment_create (ca); + if (rv) + { + clib_warning ("svm_fifo_segment_create ('%s', %d) failed", + ca->segment_name, ca->segment_size); + vec_free (segment_name); + return -1; + } + + vec_add1 (sm->segment_indices, ca->new_segment_index); + + return 0; +} + +static int +session_manager_add_segment (session_manager_main_t * smm, + session_manager_t * sm) +{ + u8 *segment_name; + svm_fifo_segment_create_args_t _ca, *ca = &_ca; + u32 add_segment_size; + u32 default_segment_size = 128 << 10; + + memset (ca, 0, sizeof (*ca)); + segment_name = format (0, "%d-%d%c", getpid (), + smm->unique_segment_name_counter++, 0); + add_segment_size = + sm->add_segment_size ? sm->add_segment_size : default_segment_size; + + return session_manager_add_segment_i (smm, sm, add_segment_size, + segment_name); +} + +int +session_manager_add_first_segment (session_manager_main_t * smm, + session_manager_t * sm, u32 segment_size, + u8 ** segment_name) +{ + svm_fifo_segment_create_args_t _ca, *ca = &_ca; + memset (ca, 0, sizeof (*ca)); + *segment_name = format (0, "%d-%d%c", getpid (), + smm->unique_segment_name_counter++, 0); + return session_manager_add_segment_i (smm, sm, segment_size, *segment_name); +} + +void +session_manager_del (session_manager_main_t * smm, session_manager_t * sm) +{ + u32 *deleted_sessions = 0; + u32 *deleted_thread_indices = 0; + int i, j; + + /* Across all fifo segments used by the server */ + for (j = 0; j < vec_len (sm->segment_indices); j++) + { + svm_fifo_segment_private_t *fifo_segment; + svm_fifo_t **fifos; + /* Vector of fifos allocated in the segment */ + fifo_segment = svm_fifo_get_segment (sm->segment_indices[j]); + fifos = (svm_fifo_t **) fifo_segment->h->fifos; + + /* + * Remove any residual sessions from the session lookup table + * Don't bother deleting the individual fifos, we're going to + * throw away the fifo segment in a minute. + */ + for (i = 0; i < vec_len (fifos); i++) + { + svm_fifo_t *fifo; + u32 session_index, thread_index; + stream_session_t *session; + + fifo = fifos[i]; + session_index = fifo->server_session_index; + thread_index = fifo->server_thread_index; + + session = pool_elt_at_index (smm->sessions[thread_index], + session_index); + + /* Add to the deleted_sessions vector (once!) */ + if (!session->is_deleted) + { + session->is_deleted = 1; + vec_add1 (deleted_sessions, + session - smm->sessions[thread_index]); + vec_add1 (deleted_thread_indices, thread_index); + } + } + + for (i = 0; i < vec_len (deleted_sessions); i++) + { + stream_session_t *session; + + session = + pool_elt_at_index (smm->sessions[deleted_thread_indices[i]], + deleted_sessions[i]); + + /* Instead of directly removing the session call disconnect */ + stream_session_disconnect (session); + + /* + stream_session_table_del (smm, session); + pool_put(smm->sessions[deleted_thread_indices[i]], session); + */ + } + + vec_reset_length (deleted_sessions); + vec_reset_length (deleted_thread_indices); + + /* Instead of removing the segment, test when removing the session if + * the segment can be removed + */ + /* svm_fifo_segment_delete (fifo_segment); */ + } + + vec_free (deleted_sessions); + vec_free (deleted_thread_indices); +} + +int +session_manager_allocate_session_fifos (session_manager_main_t * smm, + session_manager_t * sm, + svm_fifo_t ** server_rx_fifo, + svm_fifo_t ** server_tx_fifo, + u32 * fifo_segment_index, + u8 * added_a_segment) +{ + svm_fifo_segment_private_t *fifo_segment; + u32 fifo_size, default_fifo_size = 8192 /* TODO config */ ; + int i; + + *added_a_segment = 0; + + /* Allocate svm fifos */ + ASSERT (vec_len (sm->segment_indices)); + +again: + for (i = 0; i < vec_len (sm->segment_indices); i++) + { + *fifo_segment_index = sm->segment_indices[i]; + fifo_segment = svm_fifo_get_segment (*fifo_segment_index); + + fifo_size = sm->rx_fifo_size; + fifo_size = (fifo_size == 0) ? default_fifo_size : fifo_size; + *server_rx_fifo = svm_fifo_segment_alloc_fifo (fifo_segment, fifo_size); + + fifo_size = sm->tx_fifo_size; + fifo_size = (fifo_size == 0) ? default_fifo_size : fifo_size; + *server_tx_fifo = svm_fifo_segment_alloc_fifo (fifo_segment, fifo_size); + + if (*server_rx_fifo == 0) + { + /* This would be very odd, but handle it... */ + if (*server_tx_fifo != 0) + { + svm_fifo_segment_free_fifo (fifo_segment, *server_tx_fifo); + *server_tx_fifo = 0; + } + continue; + } + if (*server_tx_fifo == 0) + { + if (*server_rx_fifo != 0) + { + svm_fifo_segment_free_fifo (fifo_segment, *server_rx_fifo); + *server_rx_fifo = 0; + } + continue; + } + break; + } + + /* See if we're supposed to create another segment */ + if (*server_rx_fifo == 0) + { + if (sm->add_segment) + { + if (*added_a_segment) + { + clib_warning ("added a segment, still cant allocate a fifo"); + return SESSION_ERROR_NEW_SEG_NO_SPACE; + } + + if (session_manager_add_segment (smm, sm)) + return VNET_API_ERROR_URI_FIFO_CREATE_FAILED; + + *added_a_segment = 1; + goto again; + } + else + return SESSION_ERROR_NO_SPACE; + } + return 0; +} + +int +stream_session_create_i (session_manager_main_t * smm, application_t * app, + transport_connection_t * tc, + stream_session_t ** ret_s) +{ + int rv; + svm_fifo_t *server_rx_fifo = 0, *server_tx_fifo = 0; + u32 fifo_segment_index; + u32 pool_index, seg_size; + stream_session_t *s; + u64 value; + u32 thread_index = tc->thread_index; + session_manager_t *sm; + u8 segment_added; + u8 *seg_name; + + sm = session_manager_get (app->session_manager_index); + + /* Check the API queue */ + if (app->mode == APP_SERVER && application_api_queue_is_full (app)) + return SESSION_ERROR_API_QUEUE_FULL; + + if ((rv = session_manager_allocate_session_fifos (smm, sm, &server_rx_fifo, + &server_tx_fifo, + &fifo_segment_index, + &segment_added))) + return rv; + + if (segment_added && app->mode == APP_SERVER) + { + /* Send an API message to the external server, to map new segment */ + ASSERT (app->cb_fns.add_segment_callback); + + session_manager_get_segment_info (fifo_segment_index, &seg_name, + &seg_size); + if (app->cb_fns.add_segment_callback (app->api_client_index, seg_name, + seg_size)) + return VNET_API_ERROR_URI_FIFO_CREATE_FAILED; + } + + /* Create the session */ + pool_get (smm->sessions[thread_index], s); + memset (s, 0, sizeof (*s)); + + /* Initialize backpointers */ + pool_index = s - smm->sessions[thread_index]; + server_rx_fifo->server_session_index = pool_index; + server_rx_fifo->server_thread_index = thread_index; + + server_tx_fifo->server_session_index = pool_index; + server_tx_fifo->server_thread_index = thread_index; + + s->server_rx_fifo = server_rx_fifo; + s->server_tx_fifo = server_tx_fifo; + + /* Initialize state machine, such as it is... */ + s->session_type = app->session_type; + s->session_state = SESSION_STATE_CONNECTING; + s->app_index = application_get_index (app); + s->server_segment_index = fifo_segment_index; + s->thread_index = thread_index; + s->session_index = pool_index; + + /* Attach transport to session */ + s->connection_index = tc->c_index; + + /* Attach session to transport */ + tc->s_index = s->session_index; + + /* Add to the main lookup table */ + value = (((u64) thread_index) << 32) | (u64) s->session_index; + stream_session_table_add_for_tc (app->session_type, tc, value); + + *ret_s = s; + + return 0; +} + +/* + * Enqueue data for delivery to session peer. Does not notify peer of enqueue + * event but on request can queue notification events for later delivery by + * calling stream_server_flush_enqueue_events(). + * + * @param tc Transport connection which is to be enqueued data + * @param data Data to be enqueued + * @param len Length of data to be enqueued + * @param queue_event Flag to indicate if peer is to be notified or if event + * is to be queued. The former is useful when more data is + * enqueued and only one event is to be generated. + * @return Number of bytes enqueued or a negative value if enqueueing failed. + */ +int +stream_session_enqueue_data (transport_connection_t * tc, u8 * data, u16 len, + u8 queue_event) +{ + stream_session_t *s; + int enqueued; + + s = stream_session_get (tc->s_index, tc->thread_index); + + /* Make sure there's enough space left. We might've filled the pipes */ + if (PREDICT_FALSE (len > svm_fifo_max_enqueue (s->server_rx_fifo))) + return -1; + + enqueued = svm_fifo_enqueue_nowait (s->server_rx_fifo, s->pid, len, data); + + if (queue_event) + { + /* Queue RX event on this fifo. Eventually these will need to be flushed + * by calling stream_server_flush_enqueue_events () */ + session_manager_main_t *smm = vnet_get_session_manager_main (); + u32 thread_index = s->thread_index; + u32 my_enqueue_epoch = smm->current_enqueue_epoch[thread_index]; + + if (s->enqueue_epoch != my_enqueue_epoch) + { + s->enqueue_epoch = my_enqueue_epoch; + vec_add1 (smm->session_indices_to_enqueue_by_thread[thread_index], + s - smm->sessions[thread_index]); + } + } + + return enqueued; +} + +/** Check if we have space in rx fifo to push more bytes */ +u8 +stream_session_no_space (transport_connection_t * tc, u32 thread_index, + u16 data_len) +{ + stream_session_t *s = stream_session_get (tc->c_index, thread_index); + + if (PREDICT_FALSE (s->session_state != SESSION_STATE_READY)) + return 1; + + if (data_len > svm_fifo_max_enqueue (s->server_rx_fifo)) + return 1; + + return 0; +} + +u32 +stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer, + u32 offset, u32 max_bytes) +{ + stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); + return svm_fifo_peek (s->server_tx_fifo, s->pid, offset, max_bytes, buffer); +} + +u32 +stream_session_dequeue_drop (transport_connection_t * tc, u32 max_bytes) +{ + stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); + return svm_fifo_dequeue_drop (s->server_tx_fifo, s->pid, max_bytes); +} + +/** + * Notify session peer that new data has been enqueued. + * + * @param s Stream session for which the event is to be generated. + * @param block Flag to indicate if call should block if event queue is full. + * + * @return 0 on succes or negative number if failed to send notification. + */ +static int +stream_session_enqueue_notify (stream_session_t * s, u8 block) +{ + application_t *app; + session_fifo_event_t evt; + unix_shared_memory_queue_t *q; + static u32 serial_number; + + if (PREDICT_FALSE (s->session_state == SESSION_STATE_CLOSED)) + return 0; + + /* Get session's server */ + app = application_get (s->app_index); + + /* Fabricate event */ + evt.fifo = s->server_rx_fifo; + evt.event_type = FIFO_EVENT_SERVER_RX; + evt.event_id = serial_number++; + evt.enqueue_length = svm_fifo_max_dequeue (s->server_rx_fifo); + + /* Add event to server's event queue */ + q = app->event_queue; + + /* Based on request block (or not) for lack of space */ + if (block || PREDICT_TRUE (q->cursize < q->maxsize)) + unix_shared_memory_queue_add (app->event_queue, (u8 *) & evt, + 0 /* do wait for mutex */ ); + else + return -1; + + if (1) + { + ELOG_TYPE_DECLARE (e) = + { + .format = "evt-enqueue: id %d length %d",.format_args = "i4i4",}; + struct + { + u32 data[2]; + } *ed; + ed = ELOG_DATA (&vlib_global_main.elog_main, e); + ed->data[0] = evt.event_id; + ed->data[1] = evt.enqueue_length; + } + + return 0; +} + +/** + * Flushes queue of sessions that are to be notified of new data + * enqueued events. + * + * @param thread_index Thread index for which the flush is to be performed. + * @return 0 on success or a positive number indicating the number of + * failures due to API queue being full. + */ +int +session_manager_flush_enqueue_events (u32 thread_index) +{ + session_manager_main_t *smm = &session_manager_main; + u32 *session_indices_to_enqueue; + int i, errors = 0; + + session_indices_to_enqueue = + smm->session_indices_to_enqueue_by_thread[thread_index]; + + for (i = 0; i < vec_len (session_indices_to_enqueue); i++) + { + stream_session_t *s0; + + /* Get session */ + s0 = stream_session_get (session_indices_to_enqueue[i], thread_index); + if (stream_session_enqueue_notify (s0, 0 /* don't block */ )) + { + errors++; + } + } + + vec_reset_length (session_indices_to_enqueue); + + smm->session_indices_to_enqueue_by_thread[thread_index] = + session_indices_to_enqueue; + + /* Increment enqueue epoch for next round */ + smm->current_enqueue_epoch[thread_index]++; + + return errors; +} + +/* + * Start listening on server's ip/port pair for requested transport. + * + * Creates a 'dummy' stream session with state LISTENING to be used in session + * lookups, prior to establishing connection. Requests transport to build + * it's own specific listening connection. + */ +int +stream_session_start_listen (u32 server_index, ip46_address_t * ip, u16 port) +{ + session_manager_main_t *smm = &session_manager_main; + stream_session_t *s; + transport_connection_t *tc; + application_t *srv; + u32 tci; + + srv = application_get (server_index); + + pool_get (smm->listen_sessions[srv->session_type], s); + memset (s, 0, sizeof (*s)); + + s->session_type = srv->session_type; + s->session_state = SESSION_STATE_LISTENING; + s->session_index = s - smm->listen_sessions[srv->session_type]; + s->app_index = srv->index; + + /* Transport bind/listen */ + tci = tp_vfts[srv->session_type].bind (smm->vlib_main, s->session_index, ip, + port); + + /* Attach transport to session */ + s->connection_index = tci; + tc = tp_vfts[srv->session_type].get_listener (tci); + + srv->session_index = s->session_index; + + /* Add to the main lookup table */ + stream_session_table_add_for_tc (s->session_type, tc, s->session_index); + + return 0; +} + +void +stream_session_stop_listen (u32 server_index) +{ + session_manager_main_t *smm = &session_manager_main; + stream_session_t *listener; + transport_connection_t *tc; + application_t *srv; + + srv = application_get (server_index); + listener = pool_elt_at_index (smm->listen_sessions[srv->session_type], + srv->session_index); + + tc = tp_vfts[srv->session_type].get_listener (listener->connection_index); + stream_session_table_del_for_tc (smm, listener->session_type, tc); + + tp_vfts[srv->session_type].unbind (smm->vlib_main, + listener->connection_index); + pool_put (smm->listen_sessions[srv->session_type], listener); +} + +int +connect_server_add_segment_cb (application_t * ss, char *segment_name, + u32 segment_size) +{ + /* Does exactly nothing, but die */ + ASSERT (0); + return 0; +} + +void +connects_session_manager_init (session_manager_main_t * smm, u8 session_type) +{ + session_manager_t *sm; + u32 connect_fifo_size = 8 << 10; /* Config? */ + u32 default_segment_size = 1 << 20; + + pool_get (smm->session_managers, sm); + memset (sm, 0, sizeof (*sm)); + + sm->add_segment_size = default_segment_size; + sm->rx_fifo_size = connect_fifo_size; + sm->tx_fifo_size = connect_fifo_size; + sm->add_segment = 1; + + session_manager_add_segment (smm, sm); + smm->connect_manager_index[session_type] = sm - smm->session_managers; +} + +void +stream_session_connect_notify (transport_connection_t * tc, u8 sst, + u8 is_fail) +{ + session_manager_main_t *smm = &session_manager_main; + application_t *app; + stream_session_t *new_s = 0; + u64 value; + + value = stream_session_half_open_lookup (smm, &tc->lcl_ip, &tc->rmt_ip, + tc->lcl_port, tc->rmt_port, + tc->proto); + if (value == HALF_OPEN_LOOKUP_INVALID_VALUE) + { + clib_warning ("This can't be good!"); + return; + } + + app = application_get (value >> 32); + + if (!is_fail) + { + /* Create new session (server segments are allocated if needed) */ + if (stream_session_create_i (smm, app, tc, &new_s)) + return; + + app->session_index = stream_session_get_index (new_s); + app->thread_index = new_s->thread_index; + + /* Allocate vpp event queue for this thread if needed */ + vpp_session_event_queue_allocate (smm, tc->thread_index); + } + + /* Notify client */ + app->cb_fns.session_connected_callback (app->api_client_index, new_s, + is_fail); + + /* Cleanup session lookup */ + stream_session_half_open_table_del (smm, sst, tc); +} + +void +stream_session_accept_notify (transport_connection_t * tc) +{ + application_t *server; + stream_session_t *s; + + s = stream_session_get (tc->s_index, tc->thread_index); + server = application_get (s->app_index); + server->cb_fns.session_accept_callback (s); +} + +/** + * Notification from transport that connection is being closed. + * + * A disconnect is sent to application but state is not removed. Once + * disconnect is acknowledged by application, session disconnect is called. + * Ultimately this leads to close being called on transport (passive close). + */ +void +stream_session_disconnect_notify (transport_connection_t * tc) +{ + application_t *server; + stream_session_t *s; + + s = stream_session_get (tc->s_index, tc->thread_index); + server = application_get (s->app_index); + server->cb_fns.session_disconnect_callback (s); +} + +/** + * Cleans up session and associated app if needed. + */ +void +stream_session_delete (stream_session_t * s) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + svm_fifo_segment_private_t *fifo_segment; + application_t *app; + int rv; + + /* delete from the main lookup table */ + rv = stream_session_table_del (smm, s); + + if (rv) + clib_warning ("hash delete error, rv %d", rv); + + /* Cleanup fifo segments */ + fifo_segment = svm_fifo_get_segment (s->server_segment_index); + svm_fifo_segment_free_fifo (fifo_segment, s->server_rx_fifo); + svm_fifo_segment_free_fifo (fifo_segment, s->server_tx_fifo); + + /* Cleanup app if client */ + app = application_get (s->app_index); + if (app->mode == APP_CLIENT) + { + application_del (app); + } + else if (app->mode == APP_SERVER) + { + session_manager_t *sm; + svm_fifo_segment_private_t *fifo_segment; + svm_fifo_t **fifos; + u32 fifo_index; + + sm = session_manager_get (app->session_manager_index); + + /* Delete fifo */ + fifo_segment = svm_fifo_get_segment (s->server_segment_index); + fifos = (svm_fifo_t **) fifo_segment->h->fifos; + + fifo_index = svm_fifo_segment_index (fifo_segment); + + /* Remove segment only if it holds no fifos and not the first */ + if (sm->segment_indices[0] != fifo_index && vec_len (fifos) == 0) + svm_fifo_segment_delete (fifo_segment); + } + + pool_put (smm->sessions[s->thread_index], s); +} + +/** + * Notification from transport that connection is being deleted + * + * This should be called only on previously fully established sessions. For + * instance failed connects should call stream_session_connect_notify and + * indicate that the connect has failed. + */ +void +stream_session_delete_notify (transport_connection_t * tc) +{ + stream_session_t *s; + + s = stream_session_get_if_valid (tc->s_index, tc->thread_index); + if (!s) + { + clib_warning ("Surprised!"); + return; + } + stream_session_delete (s); +} + +/** + * Notify application that connection has been reset. + */ +void +stream_session_reset_notify (transport_connection_t * tc) +{ + stream_session_t *s; + application_t *app; + s = stream_session_get (tc->s_index, tc->thread_index); + + app = application_get (s->app_index); + app->cb_fns.session_reset_callback (s); +} + +/** + * Accept a stream session. Optionally ping the server by callback. + */ +int +stream_session_accept (transport_connection_t * tc, u32 listener_index, + u8 sst, u8 notify) +{ + session_manager_main_t *smm = &session_manager_main; + application_t *server; + stream_session_t *s, *listener; + + int rv; + + /* Find the server */ + listener = pool_elt_at_index (smm->listen_sessions[sst], listener_index); + server = application_get (listener->app_index); + + if ((rv = stream_session_create_i (smm, server, tc, &s))) + return rv; + + /* Allocate vpp event queue for this thread if needed */ + vpp_session_event_queue_allocate (smm, tc->thread_index); + + /* Shoulder-tap the server */ + if (notify) + { + server->cb_fns.session_accept_callback (s); + } + + return 0; +} + +void +stream_session_open (u8 sst, ip46_address_t * addr, u16 port_host_byte_order, + u32 app_index) +{ + transport_connection_t *tc; + u32 tci; + u64 value; + + /* Ask transport to open connection */ + tci = tp_vfts[sst].open (addr, port_host_byte_order); + + /* Get transport connection */ + tc = tp_vfts[sst].get_half_open (tci); + + /* Store api_client_index and transport connection index */ + value = (((u64) app_index) << 32) | (u64) tc->c_index; + + /* Add to the half-open lookup table */ + stream_session_half_open_table_add (sst, tc, value); +} + +/** + * Disconnect session and propagate to transport. This should eventually + * result in a delete notification that allows us to cleanup session state. + * Called for both active/passive disconnects. + */ +void +stream_session_disconnect (stream_session_t * s) +{ + tp_vfts[s->session_type].close (s->connection_index, s->thread_index); + s->session_state = SESSION_STATE_CLOSED; +} + +/** + * Cleanup transport and session state. + */ +void +stream_session_cleanup (stream_session_t * s) +{ + tp_vfts[s->session_type].cleanup (s->connection_index, s->thread_index); + stream_session_delete (s); +} + +void +session_register_transport (u8 type, const transport_proto_vft_t * vft) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + + vec_validate (tp_vfts, type); + tp_vfts[type] = *vft; + + /* If an offset function is provided, then peek instead of dequeue */ + smm->session_rx_fns[type] = + (vft->rx_fifo_offset) ? session_fifo_rx_peek : session_fifo_rx_dequeue; +} + +transport_proto_vft_t * +session_get_transport_vft (u8 type) +{ + if (type >= vec_len (tp_vfts)) + return 0; + return &tp_vfts[type]; +} + +static clib_error_t * +session_manager_main_init (vlib_main_t * vm) +{ + u32 num_threads; + vlib_thread_main_t *vtm = vlib_get_thread_main (); + session_manager_main_t *smm = &session_manager_main; + int i; + + smm->vlib_main = vm; + smm->vnet_main = vnet_get_main (); + + num_threads = 1 /* main thread */ + vtm->n_threads; + + if (num_threads < 1) + return clib_error_return (0, "n_thread_stacks not set"); + + /* $$$ config parameters */ + svm_fifo_segment_init (0x200000000ULL /* first segment base VA */ , + 20 /* timeout in seconds */ ); + + /* configure per-thread ** vectors */ + vec_validate (smm->sessions, num_threads - 1); + vec_validate (smm->session_indices_to_enqueue_by_thread, num_threads - 1); + vec_validate (smm->tx_buffers, num_threads - 1); + vec_validate (smm->fifo_events, num_threads - 1); + vec_validate (smm->evts_partially_read, num_threads - 1); + vec_validate (smm->current_enqueue_epoch, num_threads - 1); + vec_validate (smm->vpp_event_queues, num_threads - 1); + + /* $$$$ preallocate hack config parameter */ + for (i = 0; i < 200000; i++) + { + stream_session_t *ss; + pool_get (smm->sessions[0], ss); + memset (ss, 0, sizeof (*ss)); + } + + for (i = 0; i < 200000; i++) + pool_put_index (smm->sessions[0], i); + + clib_bihash_init_16_8 (&smm->v4_session_hash, "v4 session table", + 200000 /* $$$$ config parameter nbuckets */ , + (64 << 20) /*$$$ config parameter table size */ ); + clib_bihash_init_48_8 (&smm->v6_session_hash, "v6 session table", + 200000 /* $$$$ config parameter nbuckets */ , + (64 << 20) /*$$$ config parameter table size */ ); + + clib_bihash_init_16_8 (&smm->v4_half_open_hash, "v4 half-open table", + 200000 /* $$$$ config parameter nbuckets */ , + (64 << 20) /*$$$ config parameter table size */ ); + clib_bihash_init_48_8 (&smm->v6_half_open_hash, "v6 half-open table", + 200000 /* $$$$ config parameter nbuckets */ , + (64 << 20) /*$$$ config parameter table size */ ); + + for (i = 0; i < SESSION_N_TYPES; i++) + smm->connect_manager_index[i] = INVALID_INDEX; + + return 0; +} + +VLIB_INIT_FUNCTION (session_manager_main_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h new file mode 100644 index 00000000..cf14cca9 --- /dev/null +++ b/src/vnet/session/session.h @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_session_h__ +#define __included_session_h__ + +#include +#include +#include +#include +#include + +#define HALF_OPEN_LOOKUP_INVALID_VALUE ((u64)~0) +#define INVALID_INDEX ((u32)~0) + +/* TODO decide how much since we have pre-data as well */ +#define MAX_HDRS_LEN 100 /* Max number of bytes for headers */ + +typedef enum +{ + FIFO_EVENT_SERVER_RX, + FIFO_EVENT_SERVER_TX, + FIFO_EVENT_TIMEOUT, + FIFO_EVENT_SERVER_EXIT, +} fifo_event_type_t; + +#define foreach_session_input_error \ +_(NO_SESSION, "No session drops") \ +_(NO_LISTENER, "No listener for dst port drops") \ +_(ENQUEUED, "Packets pushed into rx fifo") \ +_(NOT_READY, "Session not ready packets") \ +_(FIFO_FULL, "Packets dropped for lack of rx fifo space") \ +_(EVENT_FIFO_FULL, "Events not sent for lack of event fifo space") \ +_(API_QUEUE_FULL, "Sessions not created for lack of API queue space") \ +_(NEW_SEG_NO_SPACE, "Created segment, couldn't allocate a fifo pair") \ +_(NO_SPACE, "Couldn't allocate a fifo pair") + +typedef enum +{ +#define _(sym,str) SESSION_ERROR_##sym, + foreach_session_input_error +#undef _ + SESSION_N_ERROR, +} session_error_t; + +/* Event queue input node static next indices */ +typedef enum +{ + SESSION_QUEUE_NEXT_DROP, + SESSION_QUEUE_NEXT_TCP_IP4_OUTPUT, + SESSION_QUEUE_NEXT_IP4_LOOKUP, + SESSION_QUEUE_NEXT_TCP_IP6_OUTPUT, + SESSION_QUEUE_NEXT_IP6_LOOKUP, + SESSION_QUEUE_N_NEXT, +} session_queue_next_t; + +#define foreach_session_type \ + _(IP4_TCP, ip4_tcp) \ + _(IP4_UDP, ip4_udp) \ + _(IP6_TCP, ip6_tcp) \ + _(IP6_UDP, ip6_udp) + +typedef enum +{ +#define _(A, a) SESSION_TYPE_##A, + foreach_session_type +#undef _ + SESSION_N_TYPES, +} session_type_t; + +/* + * Application session state + */ +typedef enum +{ + SESSION_STATE_LISTENING, + SESSION_STATE_CONNECTING, + SESSION_STATE_READY, + SESSION_STATE_CLOSED, + SESSION_STATE_N_STATES, +} stream_session_state_t; + +typedef CLIB_PACKED (struct + { + svm_fifo_t * fifo; + u8 event_type; + /* $$$$ for event logging */ + u16 event_id; + u32 enqueue_length; + }) session_fifo_event_t; + +typedef struct _stream_session_t +{ + /** Type */ + u8 session_type; + + /** State */ + u8 session_state; + + /** Session index in per_thread pool */ + u32 session_index; + + /** Transport specific */ + u32 connection_index; + + u8 thread_index; + + /** Application specific */ + u32 pid; + + /** fifo pointers. Once allocated, these do not move */ + svm_fifo_t *server_rx_fifo; + svm_fifo_t *server_tx_fifo; + + /** To avoid n**2 "one event per frame" check */ + u8 enqueue_epoch; + + /** used during unbind processing */ + u8 is_deleted; + + /** stream server pool index */ + u32 app_index; + + /** svm segment index */ + u32 server_segment_index; +} stream_session_t; + +typedef struct _session_manager +{ + /** segments mapped by this server */ + u32 *segment_indices; + + /** Session fifo sizes. They are provided for binds and take default + * values for connects */ + u32 rx_fifo_size; + u32 tx_fifo_size; + + /** Configured additional segment size */ + u32 add_segment_size; + + /** Flag that indicates if additional segments should be created */ + u8 add_segment; +} session_manager_t; + +/* Forward definition */ +typedef struct _session_manager_main session_manager_main_t; + +typedef int + (session_fifo_rx_fn) (vlib_main_t * vm, vlib_node_runtime_t * node, + session_manager_main_t * smm, + session_fifo_event_t * e0, stream_session_t * s0, + u32 thread_index, int *n_tx_pkts); + +extern session_fifo_rx_fn session_fifo_rx_peek; +extern session_fifo_rx_fn session_fifo_rx_dequeue; + +struct _session_manager_main +{ + /** Lookup tables for established sessions and listeners */ + clib_bihash_16_8_t v4_session_hash; + clib_bihash_48_8_t v6_session_hash; + + /** Lookup tables for half-open sessions */ + clib_bihash_16_8_t v4_half_open_hash; + clib_bihash_48_8_t v6_half_open_hash; + + /** Per worker thread session pools */ + stream_session_t **sessions; + + /** Pool of listen sessions. Same type as stream sessions to ease lookups */ + stream_session_t *listen_sessions[SESSION_N_TYPES]; + + /** Sparse vector to map dst port to stream server */ + u16 *stream_server_by_dst_port[SESSION_N_TYPES]; + + /** per-worker enqueue epoch counters */ + u8 *current_enqueue_epoch; + + /** Per-worker thread vector of sessions to enqueue */ + u32 **session_indices_to_enqueue_by_thread; + + /** per-worker tx buffer free lists */ + u32 **tx_buffers; + + /** Per worker-thread vector of partially read events */ + session_fifo_event_t **evts_partially_read; + + /** per-worker active event vectors */ + session_fifo_event_t **fifo_events; + + /** vpp fifo event queue */ + unix_shared_memory_queue_t **vpp_event_queues; + + /** Unique segment name counter */ + u32 unique_segment_name_counter; + + /* Connection manager used by incoming connects */ + u32 connect_manager_index[SESSION_N_TYPES]; + + session_manager_t *session_managers; + + /** Per transport rx function that can either dequeue or peek */ + session_fifo_rx_fn *session_rx_fns[SESSION_N_TYPES]; + + /* Convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; +}; + +extern session_manager_main_t session_manager_main; + +/* + * Session manager function + */ +always_inline session_manager_main_t * +vnet_get_session_manager_main () +{ + return &session_manager_main; +} + +always_inline session_manager_t * +session_manager_get (u32 index) +{ + return pool_elt_at_index (session_manager_main.session_managers, index); +} + +always_inline unix_shared_memory_queue_t * +session_manager_get_vpp_event_queue (u32 thread_index) +{ + return session_manager_main.vpp_event_queues[thread_index]; +} + +always_inline session_manager_t * +connects_session_manager_get (session_manager_main_t * smm, + session_type_t session_type) +{ + return pool_elt_at_index (smm->session_managers, + smm->connect_manager_index[session_type]); +} + +void session_manager_get_segment_info (u32 index, u8 ** name, u32 * size); +int session_manager_flush_enqueue_events (u32 thread_index); +int +session_manager_add_first_segment (session_manager_main_t * smm, + session_manager_t * sm, u32 segment_size, + u8 ** segment_name); +void +session_manager_del (session_manager_main_t * smm, session_manager_t * sm); +void +connects_session_manager_init (session_manager_main_t * smm, u8 session_type); + +/* + * Stream session functions + */ + +stream_session_t *stream_session_lookup_listener4 (ip4_address_t * lcl, + u16 lcl_port, u8 proto); +stream_session_t *stream_session_lookup4 (ip4_address_t * lcl, + ip4_address_t * rmt, u16 lcl_port, + u16 rmt_port, u8 proto, + u32 thread_index); +stream_session_t *stream_session_lookup_listener6 (ip6_address_t * lcl, + u16 lcl_port, u8 proto); +stream_session_t *stream_session_lookup6 (ip6_address_t * lcl, + ip6_address_t * rmt, u16 lcl_port, + u16 rmt_port, u8, u32 thread_index); +transport_connection_t + * stream_session_lookup_transport4 (session_manager_main_t * smm, + ip4_address_t * lcl, + ip4_address_t * rmt, u16 lcl_port, + u16 rmt_port, u8 proto, + u32 thread_index); +transport_connection_t + * stream_session_lookup_transport6 (session_manager_main_t * smm, + ip6_address_t * lcl, + ip6_address_t * rmt, u16 lcl_port, + u16 rmt_port, u8 proto, + u32 thread_index); +stream_session_t *stream_session_lookup_listener (ip46_address_t * lcl, + u16 lcl_port, u8 proto); + +always_inline stream_session_t * +stream_session_get_tsi (u64 ti_and_si, u32 thread_index) +{ + ASSERT ((u32) (ti_and_si >> 32) == thread_index); + return pool_elt_at_index (session_manager_main.sessions[thread_index], + ti_and_si & 0xFFFFFFFFULL); +} + +always_inline stream_session_t * +stream_session_get (u64 si, u32 thread_index) +{ + return pool_elt_at_index (session_manager_main.sessions[thread_index], si); +} + +always_inline stream_session_t * +stream_session_get_if_valid (u64 si, u32 thread_index) +{ + if (thread_index >= vec_len (session_manager_main.sessions)) + return 0; + + if (pool_is_free_index (session_manager_main.sessions[thread_index], si)) + return 0; + + return pool_elt_at_index (session_manager_main.sessions[thread_index], si); +} + +always_inline stream_session_t * +stream_session_listener_get (u8 sst, u64 si) +{ + return pool_elt_at_index (session_manager_main.listen_sessions[sst], si); +} + +always_inline u32 +stream_session_get_index (stream_session_t * s) +{ + if (s->session_state == SESSION_STATE_LISTENING) + return s - session_manager_main.listen_sessions[s->session_type]; + + return s - session_manager_main.sessions[s->thread_index]; +} + +always_inline u32 +stream_session_max_enqueue (transport_connection_t * tc) +{ + stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); + return svm_fifo_max_enqueue (s->server_rx_fifo); +} + +int +stream_session_enqueue_data (transport_connection_t * tc, u8 * data, u16 len, + u8 queue_event); +u32 +stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer, + u32 offset, u32 max_bytes); +u32 stream_session_dequeue_drop (transport_connection_t * tc, u32 max_bytes); + +void +stream_session_connect_notify (transport_connection_t * tc, u8 sst, + u8 is_fail); +void stream_session_accept_notify (transport_connection_t * tc); +void stream_session_disconnect_notify (transport_connection_t * tc); +void stream_session_delete_notify (transport_connection_t * tc); +void stream_session_reset_notify (transport_connection_t * tc); +int +stream_session_accept (transport_connection_t * tc, u32 listener_index, + u8 sst, u8 notify); +void stream_session_open (u8 sst, ip46_address_t * addr, + u16 port_host_byte_order, u32 api_client_index); +void stream_session_disconnect (stream_session_t * s); +void stream_session_cleanup (stream_session_t * s); +int +stream_session_start_listen (u32 server_index, ip46_address_t * ip, u16 port); +void stream_session_stop_listen (u32 server_index); + +u8 *format_stream_session (u8 * s, va_list * args); + +void session_register_transport (u8 type, const transport_proto_vft_t * vft); +transport_proto_vft_t *session_get_transport_vft (u8 type); + +#endif /* __included_session_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/session_api.c b/src/vnet/session/session_api.c new file mode 100644 index 00000000..9d068684 --- /dev/null +++ b/src/vnet/session/session_api.c @@ -0,0 +1,821 @@ +/* + * Copyright (c) 2015-2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include "application_interface.h" + +#define vl_typedefs /* define message structures */ +#include +#undef vl_typedefs + +#define vl_endianfun /* define message structures */ +#include +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) +#define vl_printfun +#include +#undef vl_printfun + +#include + +#define foreach_session_api_msg \ +_(MAP_ANOTHER_SEGMENT_REPLY, map_another_segment_reply) \ +_(BIND_URI, bind_uri) \ +_(UNBIND_URI, unbind_uri) \ +_(CONNECT_URI, connect_uri) \ +_(DISCONNECT_SESSION, disconnect_session) \ +_(DISCONNECT_SESSION_REPLY, disconnect_session_reply) \ +_(ACCEPT_SESSION_REPLY, accept_session_reply) \ +_(RESET_SESSION_REPLY, reset_session_reply) \ +_(BIND_SOCK, bind_sock) \ +_(UNBIND_SOCK, unbind_sock) \ +_(CONNECT_SOCK, connect_sock) \ +_(DISCONNECT_SOCK, disconnect_sock) \ +_(DISCONNECT_SOCK_REPLY, disconnect_sock_reply) \ +_(ACCEPT_SOCK_REPLY, accept_sock_reply) \ +_(RESET_SOCK_REPLY, reset_sock_reply) \ + +static int +send_add_segment_callback (u32 api_client_index, const u8 * segment_name, + u32 segment_size) +{ + vl_api_map_another_segment_t *mp; + unix_shared_memory_queue_t *q; + + q = vl_api_client_index_to_input_queue (api_client_index); + + if (!q) + return -1; + + mp = vl_msg_api_alloc (sizeof (*mp)); + memset (mp, 0, sizeof (*mp)); + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_MAP_ANOTHER_SEGMENT); + mp->segment_size = segment_size; + strncpy ((char *) mp->segment_name, (char *) segment_name, + sizeof (mp->segment_name) - 1); + + vl_msg_api_send_shmem (q, (u8 *) & mp); + + return 0; +} + +static int +send_session_accept_uri_callback (stream_session_t * s) +{ + vl_api_accept_session_t *mp; + unix_shared_memory_queue_t *q, *vpp_queue; + application_t *server = application_get (s->app_index); + + q = vl_api_client_index_to_input_queue (server->api_client_index); + vpp_queue = session_manager_get_vpp_event_queue (s->thread_index); + + if (!q) + return -1; + + mp = vl_msg_api_alloc (sizeof (*mp)); + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_ACCEPT_SESSION); + + /* Note: session_type is the first octet in all types of sessions */ + + mp->accept_cookie = server->accept_cookie; + mp->server_rx_fifo = (u64) s->server_rx_fifo; + mp->server_tx_fifo = (u64) s->server_tx_fifo; + mp->session_thread_index = s->thread_index; + mp->session_index = s->session_index; + mp->session_type = s->session_type; + mp->vpp_event_queue_address = (u64) vpp_queue; + vl_msg_api_send_shmem (q, (u8 *) & mp); + + return 0; +} + +static void +send_session_disconnect_uri_callback (stream_session_t * s) +{ + vl_api_disconnect_session_t *mp; + unix_shared_memory_queue_t *q; + application_t *app = application_get (s->app_index); + + q = vl_api_client_index_to_input_queue (app->api_client_index); + + if (!q) + return; + + mp = vl_msg_api_alloc (sizeof (*mp)); + memset (mp, 0, sizeof (*mp)); + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_DISCONNECT_SESSION); + + mp->session_thread_index = s->thread_index; + mp->session_index = s->session_index; + vl_msg_api_send_shmem (q, (u8 *) & mp); +} + +static int +send_session_connected_uri_callback (u32 api_client_index, + stream_session_t * s, u8 is_fail) +{ + vl_api_connect_uri_reply_t *mp; + unix_shared_memory_queue_t *q; + application_t *app = application_lookup (api_client_index); + u8 *seg_name; + unix_shared_memory_queue_t *vpp_queue; + + q = vl_api_client_index_to_input_queue (app->api_client_index); + + if (!q) + return -1; + + mp = vl_msg_api_alloc (sizeof (*mp)); + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_CONNECT_URI_REPLY); + mp->context = app->api_context; + mp->retval = is_fail; + if (!is_fail) + { + vpp_queue = session_manager_get_vpp_event_queue (s->thread_index); + mp->server_rx_fifo = (u64) s->server_rx_fifo; + mp->server_tx_fifo = (u64) s->server_tx_fifo; + mp->session_thread_index = s->thread_index; + mp->session_index = s->session_index; + mp->session_type = s->session_type; + mp->vpp_event_queue_address = (u64) vpp_queue; + mp->client_event_queue_address = (u64) app->event_queue; + + session_manager_get_segment_info (s->server_segment_index, &seg_name, + &mp->segment_size); + mp->segment_name_length = vec_len (seg_name); + if (mp->segment_name_length) + clib_memcpy (mp->segment_name, seg_name, mp->segment_name_length); + } + + vl_msg_api_send_shmem (q, (u8 *) & mp); + + /* Remove client if connect failed */ + if (is_fail) + application_del (app); + + return 0; +} + +/** + * Redirect a connect_uri message to the indicated server. + * Only sent if the server has bound the related port with + * URI_OPTIONS_FLAGS_USE_FIFO + */ +static int +redirect_connect_uri_callback (u32 server_api_client_index, void *mp_arg) +{ + vl_api_connect_uri_t *mp = mp_arg; + unix_shared_memory_queue_t *server_q, *client_q; + vlib_main_t *vm = vlib_get_main (); + f64 timeout = vlib_time_now (vm) + 0.5; + int rv = 0; + + server_q = vl_api_client_index_to_input_queue (server_api_client_index); + + if (!server_q) + { + rv = VNET_API_ERROR_INVALID_VALUE; + goto out; + } + + client_q = vl_api_client_index_to_input_queue (mp->client_index); + if (!client_q) + { + rv = VNET_API_ERROR_INVALID_VALUE_2; + goto out; + } + + /* Tell the server the client's API queue address, so it can reply */ + mp->client_queue_address = (u64) client_q; + + /* + * Bounce message handlers MUST NOT block the data-plane. + * Spin waiting for the queue lock, but + */ + + while (vlib_time_now (vm) < timeout) + { + rv = + unix_shared_memory_queue_add (server_q, (u8 *) & mp, 1 /*nowait */ ); + switch (rv) + { + /* correctly enqueued */ + case 0: + return VNET_CONNECT_REDIRECTED; + + /* continue spinning, wait for pthread_mutex_trylock to work */ + case -1: + continue; + + /* queue stuffed, drop the msg */ + case -2: + rv = VNET_API_ERROR_QUEUE_FULL; + goto out; + } + } +out: + /* Dispose of the message */ + vl_msg_api_free (mp); + return rv; +} + +static u64 +make_session_handle (stream_session_t * s) +{ + return (u64) s->session_index << 32 | (u64) s->thread_index; +} + +static int +send_session_accept_callback (stream_session_t * s) +{ + vl_api_accept_sock_t *mp; + unix_shared_memory_queue_t *q, *vpp_queue; + application_t *server = application_get (s->app_index); + + q = vl_api_client_index_to_input_queue (server->api_client_index); + vpp_queue = session_manager_get_vpp_event_queue (s->thread_index); + + if (!q) + return -1; + + mp = vl_msg_api_alloc (sizeof (*mp)); + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_ACCEPT_SOCK); + + /* Note: session_type is the first octet in all types of sessions */ + + mp->accept_cookie = server->accept_cookie; + mp->server_rx_fifo = (u64) s->server_rx_fifo; + mp->server_tx_fifo = (u64) s->server_tx_fifo; + mp->handle = make_session_handle (s); + mp->vpp_event_queue_address = (u64) vpp_queue; + vl_msg_api_send_shmem (q, (u8 *) & mp); + + return 0; +} + +static int +send_session_connected_callback (u32 api_client_index, stream_session_t * s, + u8 is_fail) +{ + vl_api_connect_sock_reply_t *mp; + unix_shared_memory_queue_t *q; + application_t *app = application_lookup (api_client_index); + u8 *seg_name; + unix_shared_memory_queue_t *vpp_queue; + + q = vl_api_client_index_to_input_queue (app->api_client_index); + + if (!q) + return -1; + + mp = vl_msg_api_alloc (sizeof (*mp)); + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_CONNECT_SOCK_REPLY); + mp->context = app->api_context; + mp->retval = is_fail; + if (!is_fail) + { + vpp_queue = session_manager_get_vpp_event_queue (s->thread_index); + mp->server_rx_fifo = (u64) s->server_rx_fifo; + mp->server_tx_fifo = (u64) s->server_tx_fifo; + mp->handle = make_session_handle (s); + mp->vpp_event_queue_address = (u64) vpp_queue; + mp->client_event_queue_address = (u64) app->event_queue; + + session_manager_get_segment_info (s->server_segment_index, &seg_name, + &mp->segment_size); + mp->segment_name_length = vec_len (seg_name); + if (mp->segment_name_length) + clib_memcpy (mp->segment_name, seg_name, mp->segment_name_length); + } + + vl_msg_api_send_shmem (q, (u8 *) & mp); + + /* Remove client if connect failed */ + if (is_fail) + application_del (app); + + return 0; +} + +static void +send_session_disconnect_callback (stream_session_t * s) +{ + vl_api_disconnect_sock_t *mp; + unix_shared_memory_queue_t *q; + application_t *app = application_get (s->app_index); + + q = vl_api_client_index_to_input_queue (app->api_client_index); + + if (!q) + return; + + mp = vl_msg_api_alloc (sizeof (*mp)); + memset (mp, 0, sizeof (*mp)); + mp->_vl_msg_id = clib_host_to_net_u16 (VL_API_DISCONNECT_SOCK); + + mp->handle = make_session_handle (s); + vl_msg_api_send_shmem (q, (u8 *) & mp); +} + +/** + * Redirect a connect_uri message to the indicated server. + * Only sent if the server has bound the related port with + * URI_OPTIONS_FLAGS_USE_FIFO + */ +static int +redirect_connect_callback (u32 server_api_client_index, void *mp_arg) +{ + vl_api_connect_sock_t *mp = mp_arg; + unix_shared_memory_queue_t *server_q, *client_q; + vlib_main_t *vm = vlib_get_main (); + f64 timeout = vlib_time_now (vm) + 0.5; + int rv = 0; + + server_q = vl_api_client_index_to_input_queue (server_api_client_index); + + if (!server_q) + { + rv = VNET_API_ERROR_INVALID_VALUE; + goto out; + } + + client_q = vl_api_client_index_to_input_queue (mp->client_index); + if (!client_q) + { + rv = VNET_API_ERROR_INVALID_VALUE_2; + goto out; + } + + /* Tell the server the client's API queue address, so it can reply */ + mp->client_queue_address = (u64) client_q; + + /* + * Bounce message handlers MUST NOT block the data-plane. + * Spin waiting for the queue lock, but + */ + + while (vlib_time_now (vm) < timeout) + { + rv = + unix_shared_memory_queue_add (server_q, (u8 *) & mp, 1 /*nowait */ ); + switch (rv) + { + /* correctly enqueued */ + case 0: + return VNET_CONNECT_REDIRECTED; + + /* continue spinning, wait for pthread_mutex_trylock to work */ + case -1: + continue; + + /* queue stuffed, drop the msg */ + case -2: + rv = VNET_API_ERROR_QUEUE_FULL; + goto out; + } + } +out: + /* Dispose of the message */ + vl_msg_api_free (mp); + return rv; +} + +static session_cb_vft_t uri_session_cb_vft = { + .session_accept_callback = send_session_accept_uri_callback, + .session_disconnect_callback = send_session_disconnect_uri_callback, + .session_connected_callback = send_session_connected_uri_callback, + .add_segment_callback = send_add_segment_callback, + .redirect_connect_callback = redirect_connect_uri_callback +}; + +static session_cb_vft_t session_cb_vft = { + .session_accept_callback = send_session_accept_callback, + .session_disconnect_callback = send_session_disconnect_callback, + .session_connected_callback = send_session_connected_callback, + .add_segment_callback = send_add_segment_callback, + .redirect_connect_callback = redirect_connect_callback +}; + +static int +api_session_not_valid (u32 session_index, u32 thread_index) +{ + session_manager_main_t *smm = vnet_get_session_manager_main (); + stream_session_t *pool; + + if (thread_index >= vec_len (smm->sessions)) + return VNET_API_ERROR_INVALID_VALUE; + + pool = smm->sessions[thread_index]; + + if (pool_is_free_index (pool, session_index)) + return VNET_API_ERROR_INVALID_VALUE_2; + + return 0; +} + +static void +vl_api_bind_uri_t_handler (vl_api_bind_uri_t * mp) +{ + vl_api_bind_uri_reply_t *rmp; + vnet_bind_args_t _a, *a = &_a; + char segment_name[128]; + u32 segment_name_length; + int rv; + + _Static_assert (sizeof (u64) * SESSION_OPTIONS_N_OPTIONS <= + sizeof (mp->options), + "Out of options, fix api message definition"); + + segment_name_length = ARRAY_LEN (segment_name); + + memset (a, 0, sizeof (*a)); + + a->uri = (char *) mp->uri; + a->api_client_index = mp->client_index; + a->options = mp->options; + a->segment_name = segment_name; + a->segment_name_length = segment_name_length; + a->session_cb_vft = &uri_session_cb_vft; + + a->options[SESSION_OPTIONS_SEGMENT_SIZE] = mp->initial_segment_size; + a->options[SESSION_OPTIONS_ACCEPT_COOKIE] = mp->accept_cookie; + rv = vnet_bind_uri (a); + + /* *INDENT-OFF* */ + REPLY_MACRO2 (VL_API_BIND_URI_REPLY, ({ + rmp->retval = rv; + if (!rv) + { + rmp->segment_name_length = 0; + /* $$$$ policy? */ + rmp->segment_size = mp->initial_segment_size; + if (segment_name_length) + { + memcpy (rmp->segment_name, segment_name, segment_name_length); + rmp->segment_name_length = segment_name_length; + } + rmp->server_event_queue_address = a->server_event_queue_address; + } + })); + /* *INDENT-ON* */ + +} + +static void +vl_api_unbind_uri_t_handler (vl_api_unbind_uri_t * mp) +{ + vl_api_unbind_uri_reply_t *rmp; + int rv; + + rv = vnet_unbind_uri ((char *) mp->uri, mp->client_index); + + REPLY_MACRO (VL_API_UNBIND_URI_REPLY); +} + +static void +vl_api_connect_uri_t_handler (vl_api_connect_uri_t * mp) +{ + vnet_connect_args_t _a, *a = &_a; + + a->uri = (char *) mp->uri; + a->api_client_index = mp->client_index; + a->api_context = mp->context; + a->options = mp->options; + a->session_cb_vft = &uri_session_cb_vft; + a->mp = mp; + vnet_connect_uri (a); +} + +static void +vl_api_disconnect_session_t_handler (vl_api_disconnect_session_t * mp) +{ + vl_api_disconnect_session_reply_t *rmp; + int rv; + + rv = api_session_not_valid (mp->session_index, mp->session_thread_index); + if (!rv) + rv = vnet_disconnect_session (mp->client_index, mp->session_index, + mp->session_thread_index); + + REPLY_MACRO (VL_API_DISCONNECT_SESSION_REPLY); +} + +static void +vl_api_disconnect_session_reply_t_handler (vl_api_disconnect_session_reply_t * + mp) +{ + if (api_session_not_valid (mp->session_index, mp->session_thread_index)) + { + clib_warning ("Invalid session!"); + return; + } + + /* Client objected to disconnecting the session, log and continue */ + if (mp->retval) + { + clib_warning ("client retval %d", mp->retval); + return; + } + + /* Disconnect has been confirmed. Confirm close to transport */ + vnet_disconnect_session (mp->client_index, mp->session_index, + mp->session_thread_index); +} + +static void +vl_api_reset_session_reply_t_handler (vl_api_reset_session_reply_t * mp) +{ + stream_session_t *s; + + if (api_session_not_valid (mp->session_index, mp->session_thread_index)) + { + clib_warning ("Invalid session!"); + return; + } + + /* Client objected to resetting the session, log and continue */ + if (mp->retval) + { + clib_warning ("client retval %d", mp->retval); + return; + } + + s = stream_session_get (mp->session_index, mp->session_thread_index); + + /* This comes as a response to a reset, transport only waiting for + * confirmation to remove connection state, no need to disconnect */ + stream_session_cleanup (s); +} + +static void +vl_api_accept_session_reply_t_handler (vl_api_accept_session_reply_t * mp) +{ + stream_session_t *s; + int rv; + + if (api_session_not_valid (mp->session_index, mp->session_thread_index)) + return; + + s = stream_session_get (mp->session_index, mp->session_thread_index); + rv = mp->retval; + + if (rv) + { + /* Server isn't interested, kill the session */ + stream_session_disconnect (s); + return; + } + + s->session_state = SESSION_STATE_READY; +} + +static void +vl_api_map_another_segment_reply_t_handler (vl_api_map_another_segment_reply_t + * mp) +{ + clib_warning ("not implemented"); +} + +static void +vl_api_bind_sock_t_handler (vl_api_bind_sock_t * mp) +{ + vl_api_bind_sock_reply_t *rmp; + vnet_bind_args_t _a, *a = &_a; + char segment_name[128]; + u32 segment_name_length; + int rv; + + STATIC_ASSERT (sizeof (u64) * SESSION_OPTIONS_N_OPTIONS <= + sizeof (mp->options), + "Out of options, fix api message definition"); + + segment_name_length = ARRAY_LEN (segment_name); + + memset (a, 0, sizeof (*a)); + + clib_memcpy (&a->tep.ip, mp->ip, + (mp->is_ip4 ? sizeof (ip4_address_t) : + sizeof (ip6_address_t))); + a->tep.is_ip4 = mp->is_ip4; + a->tep.port = mp->port; + a->tep.vrf = mp->vrf; + + a->api_client_index = mp->client_index; + a->options = mp->options; + a->segment_name = segment_name; + a->segment_name_length = segment_name_length; + a->session_cb_vft = &session_cb_vft; + + rv = vnet_bind_uri (a); + + /* *INDENT-OFF* */ + REPLY_MACRO2 (VL_API_BIND_SOCK_REPLY, ({ + rmp->retval = rv; + if (!rv) + { + rmp->segment_name_length = 0; + rmp->segment_size = mp->options[SESSION_OPTIONS_SEGMENT_SIZE]; + if (segment_name_length) + { + memcpy(rmp->segment_name, segment_name, segment_name_length); + rmp->segment_name_length = segment_name_length; + } + rmp->server_event_queue_address = a->server_event_queue_address; + } + })); + /* *INDENT-ON* */ +} + +static void +vl_api_unbind_sock_t_handler (vl_api_unbind_sock_t * mp) +{ + vl_api_unbind_sock_reply_t *rmp; + vnet_unbind_args_t _a, *a = &_a; + int rv; + + a->api_client_index = mp->client_index; + a->handle = mp->handle; + + rv = vnet_unbind (a); + + REPLY_MACRO (VL_API_UNBIND_SOCK_REPLY); +} + +static void +vl_api_connect_sock_t_handler (vl_api_connect_sock_t * mp) +{ + vnet_connect_args_t _a, *a = &_a; + + clib_memcpy (&a->tep.ip, mp->ip, + (mp->is_ip4 ? sizeof (ip4_address_t) : + sizeof (ip6_address_t))); + a->tep.is_ip4 = mp->is_ip4; + a->tep.port = mp->port; + a->tep.vrf = mp->vrf; + a->options = mp->options; + a->session_cb_vft = &session_cb_vft; + a->api_context = mp->context; + a->mp = mp; + + vnet_connect (a); +} + +static void +vl_api_disconnect_sock_t_handler (vl_api_disconnect_sock_t * mp) +{ + vnet_disconnect_args_t _a, *a = &_a; + vl_api_disconnect_sock_reply_t *rmp; + int rv; + + a->api_client_index = mp->client_index; + a->handle = mp->handle; + rv = vnet_disconnect (a); + + REPLY_MACRO (VL_API_DISCONNECT_SOCK_REPLY); +} + +static void +vl_api_disconnect_sock_reply_t_handler (vl_api_disconnect_sock_reply_t * mp) +{ + vnet_disconnect_args_t _a, *a = &_a; + + /* Client objected to disconnecting the session, log and continue */ + if (mp->retval) + { + clib_warning ("client retval %d", mp->retval); + return; + } + + a->api_client_index = mp->client_index; + a->handle = mp->handle; + + vnet_disconnect (a); +} + +static void +vl_api_reset_sock_reply_t_handler (vl_api_reset_sock_reply_t * mp) +{ + stream_session_t *s; + u32 session_index, thread_index; + + /* Client objected to resetting the session, log and continue */ + if (mp->retval) + { + clib_warning ("client retval %d", mp->retval); + return; + } + + if (api_parse_session_handle (mp->handle, &session_index, &thread_index)) + { + clib_warning ("Invalid handle"); + return; + } + + s = stream_session_get (session_index, thread_index); + + /* This comes as a response to a reset, transport only waiting for + * confirmation to remove connection state, no need to disconnect */ + stream_session_cleanup (s); +} + +static void +vl_api_accept_sock_reply_t_handler (vl_api_accept_sock_reply_t * mp) +{ + stream_session_t *s; + u32 session_index, thread_index; + + if (api_parse_session_handle (mp->handle, &session_index, &thread_index)) + { + clib_warning ("Invalid handle"); + return; + } + s = stream_session_get (session_index, thread_index); + + if (mp->retval) + { + /* Server isn't interested, kill the session */ + stream_session_disconnect (s); + return; + } + + s->session_state = SESSION_STATE_READY; +} + +#define vl_msg_name_crc_list +#include +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (api_main_t * am) +{ +#define _(id,n,crc) vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id); + foreach_vl_msg_name_crc_session; +#undef _ +} + +/* + * session_api_hookup + * Add uri's API message handlers to the table. + * vlib has alread mapped shared memory and + * added the client registration handlers. + * See .../open-repo/vlib/memclnt_vlib.c:memclnt_process() + */ +static clib_error_t * +session_api_hookup (vlib_main_t * vm) +{ + api_main_t *am = &api_main; + +#define _(N,n) \ + vl_msg_api_set_handlers(VL_API_##N, #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_session_api_msg; +#undef _ + + /* + * Messages which bounce off the data-plane to + * an API client. Simply tells the message handling infra not + * to free the message. + * + * Bounced message handlers MUST NOT block the data plane + */ + am->message_bounce[VL_API_CONNECT_URI] = 1; + am->message_bounce[VL_API_CONNECT_SOCK] = 1; + + /* + * Set up the (msg_name, crc, message-id) table + */ + setup_message_id_table (am); + + return 0; +} + +VLIB_API_INIT_FUNCTION (session_api_hookup); +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c new file mode 100644 index 00000000..b2943a1c --- /dev/null +++ b/src/vnet/session/session_cli.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include + +/** + * Format stream session as per the following format + * + * verbose: + * "Connection", "Rx fifo", "Tx fifo", "Session Index" + * non-verbose: + * "Connection" + */ +u8 * +format_stream_session (u8 * s, va_list * args) +{ + stream_session_t *ss = va_arg (*args, stream_session_t *); + int verbose = va_arg (*args, int); + transport_proto_vft_t *tp_vft; + u8 *str = 0; + + tp_vft = session_get_transport_vft (ss->session_type); + + if (verbose) + str = format (0, "%-20llp%-20llp%-15lld", ss->server_rx_fifo, + ss->server_tx_fifo, stream_session_get_index (ss)); + + if (ss->session_state == SESSION_STATE_READY) + { + s = format (s, "%-40U%v", tp_vft->format_connection, + ss->connection_index, ss->thread_index, str); + } + else if (ss->session_state == SESSION_STATE_LISTENING) + { + s = format (s, "%-40U%v", tp_vft->format_listener, ss->connection_index, + str); + } + else if (ss->session_state == SESSION_STATE_READY) + { + s = + format (s, "%-40U%v", tp_vft->format_half_open, ss->connection_index, + str); + } + else if (ss->session_state == SESSION_STATE_CLOSED) + { + s = format (s, "[CL] %-40U%v", tp_vft->format_connection, + ss->connection_index, ss->thread_index, str); + } + else + { + clib_warning ("Session in unknown state!"); + } + + vec_free (str); + + return s; +} + +static clib_error_t * +show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + session_manager_main_t *smm = &session_manager_main; + int verbose = 0, i; + stream_session_t *pool; + stream_session_t *s; + u8 *str = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "verbose")) + verbose = 1; + else + break; + } + + for (i = 0; i < vec_len (smm->sessions); i++) + { + u32 once_per_pool; + pool = smm->sessions[i]; + + once_per_pool = 1; + + if (pool_elts (pool)) + { + + vlib_cli_output (vm, "Thread %d: %d active sessions", + i, pool_elts (pool)); + if (verbose) + { + if (once_per_pool) + { + str = format (str, "%-40s%-20s%-20s%-15s", + "Connection", "Rx fifo", "Tx fifo", + "Session Index"); + vlib_cli_output (vm, "%v", str); + vec_reset_length (str); + once_per_pool = 0; + } + + /* *INDENT-OFF* */ + pool_foreach (s, pool, + ({ + vlib_cli_output (vm, "%U", format_stream_session, s, verbose); + })); + /* *INDENT-ON* */ + } + } + else + vlib_cli_output (vm, "Thread %d: no active sessions", i); + } + vec_free (str); + + return 0; +} + +VLIB_CLI_COMMAND (show_uri_command, static) = +{ +.path = "show session",.short_help = "show session [verbose]",.function = + show_session_command_fn,}; + + +static clib_error_t * +clear_session_command_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + session_manager_main_t *smm = &session_manager_main; + u32 thread_index = 0; + u32 session_index = ~0; + stream_session_t *pool, *session; + application_t *server; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "thread %d", &thread_index)) + ; + else if (unformat (input, "session %d", &session_index)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (session_index == ~0) + return clib_error_return (0, "session required, but not set."); + + if (thread_index > vec_len (smm->sessions)) + return clib_error_return (0, "thread %d out of range [0-%d]", + thread_index, vec_len (smm->sessions)); + + pool = smm->sessions[thread_index]; + + if (pool_is_free_index (pool, session_index)) + return clib_error_return (0, "session %d not active", session_index); + + session = pool_elt_at_index (pool, session_index); + server = application_get (session->app_index); + + /* Disconnect both app and transport */ + server->cb_fns.session_disconnect_callback (session); + + return 0; +} + +VLIB_CLI_COMMAND (clear_uri_session_command, static) = +{ +.path = "clear session",.short_help = + "clear session thread session ",.function = + clear_session_command_fn,}; + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/session/transport.c b/src/vnet/session/transport.c new file mode 100644 index 00000000..abd94ba4 --- /dev/null +++ b/src/vnet/session/transport.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +u32 +transport_endpoint_lookup (transport_endpoint_table_t *ht, ip46_address_t *ip, + u16 port) +{ + clib_bihash_kv_24_8_t kv; + int rv; + + kv.key[0] = ip->as_u64[0]; + kv.key[1] = ip->as_u64[1]; + kv.key[2] = port; + + rv = clib_bihash_search_inline_24_8 (ht, &kv); + if (rv == 0) + return kv.value; + + return TRANSPORT_ENDPOINT_INVALID_INDEX; +} + +void +transport_endpoint_table_add (transport_endpoint_table_t *ht, + transport_endpoint_t *te, u32 value) +{ + clib_bihash_kv_24_8_t kv; + + kv.key[0] = te->ip.as_u64[0]; + kv.key[1] = te->ip.as_u64[1]; + kv.key[2] = te->port; + kv.value = value; + + clib_bihash_add_del_24_8 (ht, &kv, 1); +} + +void +transport_endpoint_table_del (transport_endpoint_table_t *ht, + transport_endpoint_t *te) +{ + clib_bihash_kv_24_8_t kv; + + kv.key[0] = te->ip.as_u64[0]; + kv.key[1] = te->ip.as_u64[1]; + kv.key[2] = te->port; + + clib_bihash_add_del_24_8 (ht, &kv, 0); +} + + + diff --git a/src/vnet/session/transport.h b/src/vnet/session/transport.h new file mode 100644 index 00000000..2d4415ba --- /dev/null +++ b/src/vnet/session/transport.h @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef VNET_VNET_URI_TRANSPORT_H_ +#define VNET_VNET_URI_TRANSPORT_H_ + +#include +#include +#include +#include + +/* + * Protocol independent transport properties associated to a session + */ +typedef struct _transport_connection +{ + ip46_address_t rmt_ip; /**< Remote IP */ + ip46_address_t lcl_ip; /**< Local IP */ + u16 lcl_port; /**< Local port */ + u16 rmt_port; /**< Remote port */ + u8 proto; /**< Transport protocol id */ + + u32 s_index; /**< Parent session index */ + u32 c_index; /**< Connection index in transport pool */ + u8 is_ip4; /**< Flag if IP4 connection */ + u32 thread_index; /**< Worker-thread index */ + + /** Macros for 'derived classes' where base is named "connection" */ +#define c_lcl_ip connection.lcl_ip +#define c_rmt_ip connection.rmt_ip +#define c_lcl_ip4 connection.lcl_ip.ip4 +#define c_rmt_ip4 connection.rmt_ip.ip4 +#define c_lcl_ip6 connection.lcl_ip.ip6 +#define c_rmt_ip6 connection.rmt_ip.ip6 +#define c_lcl_port connection.lcl_port +#define c_rmt_port connection.rmt_port +#define c_proto connection.proto +#define c_state connection.state +#define c_s_index connection.s_index +#define c_c_index connection.c_index +#define c_is_ip4 connection.is_ip4 +#define c_thread_index connection.thread_index +} transport_connection_t; + +/* + * Transport protocol virtual function table + */ +typedef struct _transport_proto_vft +{ + /* + * Setup + */ + u32 (*bind) (vlib_main_t *, u32, ip46_address_t *, u16); + u32 (*unbind) (vlib_main_t *, u32); + int (*open) (ip46_address_t * addr, u16 port_host_byte_order); + void (*close) (u32 conn_index, u32 thread_index); + void (*cleanup) (u32 conn_index, u32 thread_index); + + /* + * Transmission + */ + u32 (*push_header) (transport_connection_t * tconn, vlib_buffer_t * b); + u16 (*send_mss) (transport_connection_t * tc); + u32 (*send_space) (transport_connection_t * tc); + u32 (*rx_fifo_offset) (transport_connection_t * tc); + + /* + * Connection retrieval + */ + transport_connection_t *(*get_connection) (u32 conn_idx, u32 thread_idx); + transport_connection_t *(*get_listener) (u32 conn_index); + transport_connection_t *(*get_half_open) (u32 conn_index); + + /* + * Format + */ + u8 *(*format_connection) (u8 * s, va_list * args); + u8 *(*format_listener) (u8 * s, va_list * args); + u8 *(*format_half_open) (u8 * s, va_list * args); + +} transport_proto_vft_t; + +/* 16 octets */ +typedef CLIB_PACKED (struct + { + union + { + struct + { + ip4_address_t src; ip4_address_t dst; + u16 src_port; + u16 dst_port; + /* align by making this 4 octets even though its a 1-bit field + * NOTE: avoid key overlap with other transports that use 5 tuples for + * session identification. + */ + u32 proto; + }; + u64 as_u64[2]; + }; + }) v4_connection_key_t; + +typedef CLIB_PACKED (struct + { + union + { + struct + { + /* 48 octets */ + ip6_address_t src; ip6_address_t dst; + u16 src_port; + u16 dst_port; u32 proto; u8 unused_for_now[8]; + }; u64 as_u64[6]; + }; + }) v6_connection_key_t; + +typedef clib_bihash_kv_16_8_t session_kv4_t; +typedef clib_bihash_kv_48_8_t session_kv6_t; + +always_inline void +make_v4_ss_kv (session_kv4_t * kv, ip4_address_t * lcl, ip4_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto) +{ + v4_connection_key_t key; + memset (&key, 0, sizeof (v4_connection_key_t)); + + key.src.as_u32 = lcl->as_u32; + key.dst.as_u32 = rmt->as_u32; + key.src_port = lcl_port; + key.dst_port = rmt_port; + key.proto = proto; + + kv->key[0] = key.as_u64[0]; + kv->key[1] = key.as_u64[1]; + kv->value = ~0ULL; +} + +always_inline void +make_v4_listener_kv (session_kv4_t * kv, ip4_address_t * lcl, u16 lcl_port, + u8 proto) +{ + v4_connection_key_t key; + memset (&key, 0, sizeof (v4_connection_key_t)); + + key.src.as_u32 = lcl->as_u32; + key.dst.as_u32 = 0; + key.src_port = lcl_port; + key.dst_port = 0; + key.proto = proto; + + kv->key[0] = key.as_u64[0]; + kv->key[1] = key.as_u64[1]; + kv->value = ~0ULL; +} + +always_inline void +make_v4_ss_kv_from_tc (session_kv4_t * kv, transport_connection_t * t) +{ + return make_v4_ss_kv (kv, &t->lcl_ip.ip4, &t->rmt_ip.ip4, t->lcl_port, + t->rmt_port, t->proto); +} + +always_inline void +make_v6_ss_kv (session_kv6_t * kv, ip6_address_t * lcl, ip6_address_t * rmt, + u16 lcl_port, u16 rmt_port, u8 proto) +{ + v6_connection_key_t key; + memset (&key, 0, sizeof (v6_connection_key_t)); + + key.src.as_u64[0] = lcl->as_u64[0]; + key.src.as_u64[1] = lcl->as_u64[1]; + key.dst.as_u64[0] = rmt->as_u64[0]; + key.dst.as_u64[1] = rmt->as_u64[1]; + key.src_port = lcl_port; + key.dst_port = rmt_port; + key.proto = proto; + + kv->key[0] = key.as_u64[0]; + kv->key[1] = key.as_u64[1]; + kv->value = ~0ULL; +} + +always_inline void +make_v6_listener_kv (session_kv6_t * kv, ip6_address_t * lcl, u16 lcl_port, + u8 proto) +{ + v6_connection_key_t key; + memset (&key, 0, sizeof (v6_connection_key_t)); + + key.src.as_u64[0] = lcl->as_u64[0]; + key.src.as_u64[1] = lcl->as_u64[1]; + key.dst.as_u64[0] = 0; + key.dst.as_u64[1] = 0; + key.src_port = lcl_port; + key.dst_port = 0; + key.proto = proto; + + kv->key[0] = key.as_u64[0]; + kv->key[1] = key.as_u64[1]; + kv->value = ~0ULL; +} + +always_inline void +make_v6_ss_kv_from_tc (session_kv6_t * kv, transport_connection_t * t) +{ + make_v6_ss_kv (kv, &t->lcl_ip.ip6, &t->rmt_ip.ip6, t->lcl_port, + t->rmt_port, t->proto); +} + +typedef struct _transport_endpoint +{ + ip46_address_t ip; + u16 port; + u8 is_ip4; + u32 vrf; +} transport_endpoint_t; + +typedef clib_bihash_24_8_t transport_endpoint_table_t; + +#define TRANSPORT_ENDPOINT_INVALID_INDEX ((u32)~0) + +u32 +transport_endpoint_lookup (transport_endpoint_table_t * ht, + ip46_address_t * ip, u16 port); +void transport_endpoint_table_add (transport_endpoint_table_t * ht, + transport_endpoint_t * te, u32 value); +void transport_endpoint_table_del (transport_endpoint_table_t * ht, + transport_endpoint_t * te); + +#endif /* VNET_VNET_URI_TRANSPORT_H_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c new file mode 100644 index 00000000..0f9b7097 --- /dev/null +++ b/src/vnet/tcp/tcp.c @@ -0,0 +1,708 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +tcp_main_t tcp_main; + +static u32 +tcp_connection_bind (vlib_main_t * vm, u32 session_index, ip46_address_t * ip, + u16 port_host_byte_order, u8 is_ip4) +{ + tcp_main_t *tm = &tcp_main; + tcp_connection_t *listener; + + pool_get (tm->listener_pool, listener); + memset (listener, 0, sizeof (*listener)); + + listener->c_c_index = listener - tm->listener_pool; + listener->c_lcl_port = clib_host_to_net_u16 (port_host_byte_order); + + if (is_ip4) + listener->c_lcl_ip4.as_u32 = ip->ip4.as_u32; + else + clib_memcpy (&listener->c_lcl_ip6, &ip->ip6, sizeof (ip6_address_t)); + + listener->c_s_index = session_index; + listener->c_proto = SESSION_TYPE_IP4_TCP; + listener->state = TCP_STATE_LISTEN; + listener->c_is_ip4 = 1; + + return listener->c_c_index; +} + +u32 +tcp_session_bind_ip4 (vlib_main_t * vm, u32 session_index, + ip46_address_t * ip, u16 port_host_byte_order) +{ + return tcp_connection_bind (vm, session_index, ip, port_host_byte_order, 1); +} + +u32 +tcp_session_bind_ip6 (vlib_main_t * vm, u32 session_index, + ip46_address_t * ip, u16 port_host_byte_order) +{ + return tcp_connection_bind (vm, session_index, ip, port_host_byte_order, 0); + +} + +static void +tcp_session_unbind (u32 listener_index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + pool_put_index (tm->listener_pool, listener_index); +} + +u32 +tcp_session_unbind_ip4 (vlib_main_t * vm, u32 listener_index) +{ + tcp_session_unbind (listener_index); + return 0; +} + +u32 +tcp_session_unbind_ip6 (vlib_main_t * vm, u32 listener_index) +{ + tcp_session_unbind (listener_index); + return 0; +} + +transport_connection_t * +tcp_session_get_listener (u32 listener_index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tcp_connection_t *tc; + tc = pool_elt_at_index (tm->listener_pool, listener_index); + return &tc->connection; +} + +/** + * Cleans up connection state. + * + * No notifications. + */ +void +tcp_connection_cleanup (tcp_connection_t * tc) +{ + tcp_main_t *tm = &tcp_main; + u32 tepi; + transport_endpoint_t *tep; + + /* Cleanup local endpoint if this was an active connect */ + tepi = transport_endpoint_lookup (&tm->local_endpoints_table, &tc->c_lcl_ip, + tc->c_lcl_port); + + /*XXX lock */ + if (tepi != TRANSPORT_ENDPOINT_INVALID_INDEX) + { + tep = pool_elt_at_index (tm->local_endpoints, tepi); + transport_endpoint_table_del (&tm->local_endpoints_table, tep); + pool_put (tm->local_endpoints, tep); + } + + /* Make sure all timers are cleared */ + tcp_connection_timers_reset (tc); + + /* Check if half-open */ + if (tc->state == TCP_STATE_SYN_SENT) + pool_put (tm->half_open_connections, tc); + else + pool_put (tm->connections[tc->c_thread_index], tc); +} + +/** + * Connection removal. + * + * This should be called only once connection enters CLOSED state. Note + * that it notifies the session of the removal event, so if the goal is to + * just remove the connection, call tcp_connection_cleanup instead. + */ +void +tcp_connection_del (tcp_connection_t * tc) +{ + stream_session_delete_notify (&tc->connection); + tcp_connection_cleanup (tc); +} + +/** + * Begin connection closing procedure. + * + * If at the end the connection is not in CLOSED state, it is not removed. + * Instead, we rely on on TCP to advance through state machine to either + * 1) LAST_ACK (passive close) whereby when the last ACK is received + * tcp_connection_del is called. This notifies session of the delete and + * calls cleanup. + * 2) TIME_WAIT (active close) whereby after 2MSL the 2MSL timer triggers + * and cleanup is called. + */ +void +tcp_connection_close (tcp_connection_t * tc) +{ + /* Send FIN if needed */ + if (tc->state == TCP_STATE_ESTABLISHED || tc->state == TCP_STATE_SYN_RCVD + || tc->state == TCP_STATE_CLOSE_WAIT) + tcp_send_fin (tc); + + /* Switch state */ + if (tc->state == TCP_STATE_ESTABLISHED || tc->state == TCP_STATE_SYN_RCVD) + tc->state = TCP_STATE_FIN_WAIT_1; + else if (tc->state == TCP_STATE_SYN_SENT) + tc->state = TCP_STATE_CLOSED; + else if (tc->state == TCP_STATE_CLOSE_WAIT) + tc->state = TCP_STATE_LAST_ACK; + + /* Half-close connections are not supported XXX */ + + if (tc->state == TCP_STATE_CLOSED) + tcp_connection_del (tc); +} + +void +tcp_session_close (u32 conn_index, u32 thread_index) +{ + tcp_connection_t *tc; + tc = tcp_connection_get (conn_index, thread_index); + tcp_connection_close (tc); +} + +void +tcp_session_cleanup (u32 conn_index, u32 thread_index) +{ + tcp_connection_t *tc; + tc = tcp_connection_get (conn_index, thread_index); + tcp_connection_cleanup (tc); +} + +void * +ip_interface_get_first_ip (u32 sw_if_index, u8 is_ip4) +{ + ip_lookup_main_t *lm4 = &ip4_main.lookup_main; + ip_lookup_main_t *lm6 = &ip6_main.lookup_main; + ip_interface_address_t *ia = 0; + + if (is_ip4) + { + /* *INDENT-OFF* */ + foreach_ip_interface_address (lm4, ia, sw_if_index, 1 /* unnumbered */ , + ({ + return ip_interface_address_get_address (lm4, ia); + })); + /* *INDENT-ON* */ + } + else + { + /* *INDENT-OFF* */ + foreach_ip_interface_address (lm6, ia, sw_if_index, 1 /* unnumbered */ , + ({ + return ip_interface_address_get_address (lm6, ia); + })); + /* *INDENT-ON* */ + } + + return 0; +} + +/** + * Allocate local port and add if successful add entry to local endpoint + * table to mark the pair as used. + */ +u16 +tcp_allocate_local_port (tcp_main_t * tm, ip46_address_t * ip) +{ + u8 unique = 0; + transport_endpoint_t *tep; + u32 time_now, tei; + u16 min = 1024, max = 65535, tries; /* XXX configurable ? */ + + tries = max - min; + time_now = tcp_time_now (); + + /* Start at random point or max */ + pool_get (tm->local_endpoints, tep); + clib_memcpy (&tep->ip, ip, sizeof (*ip)); + tep->port = random_u32 (&time_now) << 16; + tep->port = tep->port < min ? max : tep->port; + + /* Search for first free slot */ + while (tries) + { + tei = transport_endpoint_lookup (&tm->local_endpoints_table, &tep->ip, + tep->port); + if (tei == TRANSPORT_ENDPOINT_INVALID_INDEX) + { + unique = 1; + break; + } + + tep->port--; + + if (tep->port < min) + tep->port = max; + + tries--; + } + + if (unique) + { + transport_endpoint_table_add (&tm->local_endpoints_table, tep, + tep - tm->local_endpoints); + + return tep->port; + } + + /* Failed */ + pool_put (tm->local_endpoints, tep); + return -1; +} + +/** + * Initialize all connection timers as invalid + */ +void +tcp_connection_timers_init (tcp_connection_t * tc) +{ + int i; + + /* Set all to invalid */ + for (i = 0; i < TCP_N_TIMERS; i++) + { + tc->timers[i] = TCP_TIMER_HANDLE_INVALID; + } + + tc->rto = TCP_RTO_INIT; +} + +/** + * Stop all connection timers + */ +void +tcp_connection_timers_reset (tcp_connection_t * tc) +{ + int i; + for (i = 0; i < TCP_N_TIMERS; i++) + { + tcp_timer_reset (tc, i); + } +} + +/** Initialize tcp connection variables + * + * Should be called after having received a msg from the peer, i.e., a SYN or + * a SYNACK, such that connection options have already been exchanged. */ +void +tcp_connection_init_vars (tcp_connection_t * tc) +{ + tcp_connection_timers_init (tc); + tcp_set_snd_mss (tc); + tc->sack_sb.head = TCP_INVALID_SACK_HOLE_INDEX; + tcp_cc_init (tc); +} + +int +tcp_connection_open (ip46_address_t * rmt_addr, u16 rmt_port, u8 is_ip4) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tcp_connection_t *tc; + fib_prefix_t prefix; + u32 fei, sw_if_index; + ip46_address_t lcl_addr; + u16 lcl_port; + + /* + * Find the local address and allocate port + */ + memset (&lcl_addr, 0, sizeof (lcl_addr)); + + /* Find a FIB path to the destination */ + clib_memcpy (&prefix.fp_addr, rmt_addr, sizeof (*rmt_addr)); + prefix.fp_proto = is_ip4 ? FIB_PROTOCOL_IP4 : FIB_PROTOCOL_IP6; + prefix.fp_len = is_ip4 ? 32 : 128; + + fei = fib_table_lookup (0, &prefix); + + /* Couldn't find route to destination. Bail out. */ + if (fei == FIB_NODE_INDEX_INVALID) + return -1; + + sw_if_index = fib_entry_get_resolving_interface (fei); + + if (sw_if_index == (u32) ~ 0) + return -1; + + if (is_ip4) + { + ip4_address_t *ip4; + ip4 = ip_interface_get_first_ip (sw_if_index, 1); + lcl_addr.ip4.as_u32 = ip4->as_u32; + } + else + { + ip6_address_t *ip6; + ip6 = ip_interface_get_first_ip (sw_if_index, 0); + clib_memcpy (&lcl_addr.ip6, ip6, sizeof (*ip6)); + } + + /* Allocate source port */ + lcl_port = tcp_allocate_local_port (tm, &lcl_addr); + if (lcl_port < 1) + return -1; + + /* + * Create connection and send SYN + */ + + pool_get (tm->half_open_connections, tc); + memset (tc, 0, sizeof (*tc)); + + clib_memcpy (&tc->c_rmt_ip, rmt_addr, sizeof (ip46_address_t)); + clib_memcpy (&tc->c_lcl_ip, &lcl_addr, sizeof (ip46_address_t)); + tc->c_rmt_port = clib_host_to_net_u16 (rmt_port); + tc->c_lcl_port = clib_host_to_net_u16 (lcl_port); + tc->c_c_index = tc - tm->half_open_connections; + tc->c_is_ip4 = is_ip4; + + /* The other connection vars will be initialized after SYN ACK */ + tcp_connection_timers_init (tc); + + tcp_send_syn (tc); + + tc->state = TCP_STATE_SYN_SENT; + + return tc->c_c_index; +} + +int +tcp_session_open_ip4 (ip46_address_t * addr, u16 port) +{ + return tcp_connection_open (addr, port, 1); +} + +int +tcp_session_open_ip6 (ip46_address_t * addr, u16 port) +{ + return tcp_connection_open (addr, port, 0); +} + +u8 * +format_tcp_session_ip4 (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + u32 thread_index = va_arg (*args, u32); + tcp_connection_t *tc; + + tc = tcp_connection_get (tci, thread_index); + + s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip4_address, + &tc->c_lcl_ip4, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip4_address, &tc->c_rmt_ip4, + clib_net_to_host_u16 (tc->c_rmt_port)); + + return s; +} + +u8 * +format_tcp_session_ip6 (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + u32 thread_index = va_arg (*args, u32); + tcp_connection_t *tc = tcp_connection_get (tci, thread_index); + s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip6_address, + &tc->c_lcl_ip6, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip6_address, &tc->c_rmt_ip6, + clib_net_to_host_u16 (tc->c_rmt_port)); + return s; +} + +u8 * +format_tcp_listener_session_ip4 (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + tcp_connection_t *tc = tcp_listener_get (tci); + s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip4_address, + &tc->c_lcl_ip4, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip4_address, &tc->c_rmt_ip4, + clib_net_to_host_u16 (tc->c_rmt_port)); + return s; +} + +u8 * +format_tcp_listener_session_ip6 (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + tcp_connection_t *tc = tcp_listener_get (tci); + s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip6_address, + &tc->c_lcl_ip6, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip6_address, &tc->c_rmt_ip6, + clib_net_to_host_u16 (tc->c_rmt_port)); + return s; +} + +u8 * +format_tcp_half_open_session_ip4 (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + tcp_connection_t *tc = tcp_half_open_connection_get (tci); + s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip4_address, + &tc->c_lcl_ip4, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip4_address, &tc->c_rmt_ip4, + clib_net_to_host_u16 (tc->c_rmt_port)); + return s; +} + +u8 * +format_tcp_half_open_session_ip6 (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + tcp_connection_t *tc = tcp_half_open_connection_get (tci); + s = format (s, "[%s] %U:%d->%U:%d", "tcp", format_ip6_address, + &tc->c_lcl_ip6, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip6_address, &tc->c_rmt_ip6, + clib_net_to_host_u16 (tc->c_rmt_port)); + return s; +} + +transport_connection_t * +tcp_session_get_transport (u32 conn_index, u32 thread_index) +{ + tcp_connection_t *tc = tcp_connection_get (conn_index, thread_index); + return &tc->connection; +} + +transport_connection_t * +tcp_half_open_session_get_transport (u32 conn_index) +{ + tcp_connection_t *tc = tcp_half_open_connection_get (conn_index); + return &tc->connection; +} + +u16 +tcp_session_send_mss (transport_connection_t * trans_conn) +{ + tcp_connection_t *tc = (tcp_connection_t *) trans_conn; + return tc->snd_mss; +} + +u32 +tcp_session_send_space (transport_connection_t * trans_conn) +{ + tcp_connection_t *tc = (tcp_connection_t *) trans_conn; + return tcp_available_snd_space (tc); +} + +u32 +tcp_session_rx_fifo_offset (transport_connection_t * trans_conn) +{ + tcp_connection_t *tc = (tcp_connection_t *) trans_conn; + return (tc->snd_una_max - tc->snd_una); +} + +/* *INDENT-OFF* */ +const static transport_proto_vft_t tcp4_proto = { + .bind = tcp_session_bind_ip4, + .unbind = tcp_session_unbind_ip4, + .push_header = tcp_push_header, + .get_connection = tcp_session_get_transport, + .get_listener = tcp_session_get_listener, + .get_half_open = tcp_half_open_session_get_transport, + .open = tcp_session_open_ip4, + .close = tcp_session_close, + .cleanup = tcp_session_cleanup, + .send_mss = tcp_session_send_mss, + .send_space = tcp_session_send_space, + .rx_fifo_offset = tcp_session_rx_fifo_offset, + .format_connection = format_tcp_session_ip4, + .format_listener = format_tcp_listener_session_ip4, + .format_half_open = format_tcp_half_open_session_ip4 +}; + +const static transport_proto_vft_t tcp6_proto = { + .bind = tcp_session_bind_ip6, + .unbind = tcp_session_unbind_ip6, + .push_header = tcp_push_header, + .get_connection = tcp_session_get_transport, + .get_listener = tcp_session_get_listener, + .get_half_open = tcp_half_open_session_get_transport, + .open = tcp_session_open_ip6, + .close = tcp_session_close, + .cleanup = tcp_session_cleanup, + .send_mss = tcp_session_send_mss, + .send_space = tcp_session_send_space, + .rx_fifo_offset = tcp_session_rx_fifo_offset, + .format_connection = format_tcp_session_ip6, + .format_listener = format_tcp_listener_session_ip6, + .format_half_open = format_tcp_half_open_session_ip6 +}; +/* *INDENT-ON* */ + +void +tcp_timer_keep_handler (u32 conn_index) +{ + u32 cpu_index = os_get_cpu_number (); + tcp_connection_t *tc; + + tc = tcp_connection_get (conn_index, cpu_index); + tc->timers[TCP_TIMER_KEEP] = TCP_TIMER_HANDLE_INVALID; + + tcp_connection_close (tc); +} + +void +tcp_timer_establish_handler (u32 conn_index) +{ + tcp_connection_t *tc; + u8 sst; + + tc = tcp_half_open_connection_get (conn_index); + tc->timers[TCP_TIMER_ESTABLISH] = TCP_TIMER_HANDLE_INVALID; + + ASSERT (tc->state == TCP_STATE_SYN_SENT); + + sst = tc->c_is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; + stream_session_connect_notify (&tc->connection, sst, 1 /* fail */ ); + + tcp_connection_cleanup (tc); +} + +void +tcp_timer_2msl_handler (u32 conn_index) +{ + u32 cpu_index = os_get_cpu_number (); + tcp_connection_t *tc; + + tc = tcp_connection_get (conn_index, cpu_index); + tc->timers[TCP_TIMER_2MSL] = TCP_TIMER_HANDLE_INVALID; + + tcp_connection_del (tc); +} + +/* *INDENT-OFF* */ +static timer_expiration_handler *timer_expiration_handlers[TCP_N_TIMERS] = +{ + tcp_timer_retransmit_handler, + tcp_timer_delack_handler, + 0, + tcp_timer_keep_handler, + tcp_timer_2msl_handler, + tcp_timer_retransmit_syn_handler, + tcp_timer_establish_handler +}; +/* *INDENT-ON* */ + +static void +tcp_expired_timers_dispatch (u32 * expired_timers) +{ + int i; + u32 connection_index, timer_id; + + for (i = 0; i < vec_len (expired_timers); i++) + { + /* Get session index and timer id */ + connection_index = expired_timers[i] & 0x0FFFFFFF; + timer_id = expired_timers[i] >> 28; + + /* Handle expiration */ + (*timer_expiration_handlers[timer_id]) (connection_index); + } +} + +void +tcp_initialize_timer_wheels (tcp_main_t * tm) +{ + tw_timer_wheel_16t_2w_512sl_t *tw; + vec_foreach (tw, tm->timer_wheels) + { + tw_timer_wheel_init_16t_2w_512sl (tw, tcp_expired_timers_dispatch, + 100e-3 /* timer period 100ms */ , ~0); + tw->last_run_time = vlib_time_now (tm->vlib_main); + } +} + +clib_error_t * +tcp_init (vlib_main_t * vm) +{ + ip_main_t *im = &ip_main; + ip_protocol_info_t *pi; + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_thread_main_t *vtm = vlib_get_thread_main (); + clib_error_t *error = 0; + u32 num_threads; + + tm->vlib_main = vm; + tm->vnet_main = vnet_get_main (); + + if ((error = vlib_call_init_function (vm, ip_main_init))) + return error; + if ((error = vlib_call_init_function (vm, ip4_lookup_init))) + return error; + if ((error = vlib_call_init_function (vm, ip6_lookup_init))) + return error; + + /* + * Registrations + */ + + /* Register with IP */ + pi = ip_get_protocol_info (im, IP_PROTOCOL_TCP); + if (pi == 0) + return clib_error_return (0, "TCP protocol info AWOL"); + pi->format_header = format_tcp_header; + pi->unformat_pg_edit = unformat_pg_tcp_header; + + ip4_register_protocol (IP_PROTOCOL_TCP, tcp4_input_node.index); + + /* Register as transport with URI */ + session_register_transport (SESSION_TYPE_IP4_TCP, &tcp4_proto); + session_register_transport (SESSION_TYPE_IP6_TCP, &tcp6_proto); + + /* + * Initialize data structures + */ + + num_threads = 1 /* main thread */ + vtm->n_threads; + vec_validate (tm->connections, num_threads - 1); + + /* Initialize per worker thread tx buffers (used for control messages) */ + vec_validate (tm->tx_buffers, num_threads - 1); + + /* Initialize timer wheels */ + vec_validate (tm->timer_wheels, num_threads - 1); + tcp_initialize_timer_wheels (tm); + + vec_validate (tm->delack_connections, num_threads - 1); + + /* Initialize clocks per tick for TCP timestamp. Used to compute + * monotonically increasing timestamps. */ + tm->tstamp_ticks_per_clock = vm->clib_time.seconds_per_clock + / TCP_TSTAMP_RESOLUTION; + + clib_bihash_init_24_8 (&tm->local_endpoints_table, "local endpoint table", + 200000 /* $$$$ config parameter nbuckets */ , + (64 << 20) /*$$$ config parameter table size */ ); + + return error; +} + +VLIB_INIT_FUNCTION (tcp_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h new file mode 100644 index 00000000..22f00a63 --- /dev/null +++ b/src/vnet/tcp/tcp.h @@ -0,0 +1,624 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _vnet_tcp_h_ +#define _vnet_tcp_h_ + +#include +#include +#include +#include +#include +#include + +#define TCP_TICK 10e-3 /**< TCP tick period (s) */ +#define THZ 1/TCP_TICK /**< TCP tick frequency */ +#define TCP_TSTAMP_RESOLUTION TCP_TICK /**< Time stamp resolution */ +#define TCP_PAWS_IDLE 24 * 24 * 60 * 60 * THZ /**< 24 days */ +#define TCP_MAX_OPTION_SPACE 40 + +#define TCP_DUPACK_THRESHOLD 3 +#define TCP_DEFAULT_RX_FIFO_SIZE 64 << 10 + +/** TCP FSM state definitions as per RFC793. */ +#define foreach_tcp_fsm_state \ + _(CLOSED, "CLOSED") \ + _(LISTEN, "LISTEN") \ + _(SYN_SENT, "SYN_SENT") \ + _(SYN_RCVD, "SYN_RCVD") \ + _(ESTABLISHED, "ESTABLISHED") \ + _(CLOSE_WAIT, "CLOSE_WAIT") \ + _(FIN_WAIT_1, "FIN_WAIT_1") \ + _(LAST_ACK, "LAST_ACK") \ + _(CLOSING, "CLOSING") \ + _(FIN_WAIT_2, "FIN_WAIT_2") \ + _(TIME_WAIT, "TIME_WAIT") + +typedef enum _tcp_state +{ +#define _(sym, str) TCP_STATE_##sym, + foreach_tcp_fsm_state +#undef _ + TCP_N_STATES +} tcp_state_t; + +format_function_t format_tcp_state; + +/** TCP timers */ +#define foreach_tcp_timer \ + _(RETRANSMIT, "RETRANSMIT") \ + _(DELACK, "DELAYED ACK") \ + _(PERSIST, "PERSIST") \ + _(KEEP, "KEEP") \ + _(2MSL, "2MSL") \ + _(RETRANSMIT_SYN, "RETRANSMIT_SYN") \ + _(ESTABLISH, "ESTABLISH") + +typedef enum _tcp_timers +{ +#define _(sym, str) TCP_TIMER_##sym, + foreach_tcp_timer +#undef _ + TCP_N_TIMERS +} tcp_timers_e; + +typedef void (timer_expiration_handler) (u32 index); + +extern timer_expiration_handler tcp_timer_delack_handler; +extern timer_expiration_handler tcp_timer_retransmit_handler; +extern timer_expiration_handler tcp_timer_retransmit_syn_handler; + +#define TCP_TIMER_HANDLE_INVALID ((u32) ~0) + +/* Timer delays as multiples of 100ms */ +#define TCP_TO_TIMER_TICK TCP_TICK*10 /* Period for converting from TCP + * ticks to timer units */ +#define TCP_DELACK_TIME 1 /* 0.1s */ +#define TCP_ESTABLISH_TIME 750 /* 75s */ +#define TCP_2MSL_TIME 300 /* 30s */ + +#define TCP_RTO_MAX 60 * THZ /* Min max RTO (60s) as per RFC6298 */ +#define TCP_RTT_MAX 30 * THZ /* 30s (probably too much) */ +#define TCP_RTO_SYN_RETRIES 3 /* SYN retries without doubling RTO */ +#define TCP_RTO_INIT 1 * THZ /* Initial retransmit timer */ + +void tcp_update_time (f64 now, u32 thread_index); + +/** TCP connection flags */ +#define foreach_tcp_connection_flag \ + _(DELACK, "Delay ACK") \ + _(SNDACK, "Send ACK") \ + _(BURSTACK, "Burst ACK set") \ + _(SENT_RCV_WND0, "Sent 0 receive window") \ + _(RECOVERY, "Recovery on") \ + _(FAST_RECOVERY, "Fast Recovery on") + +typedef enum _tcp_connection_flag_bits +{ +#define _(sym, str) TCP_CONN_##sym##_BIT, + foreach_tcp_connection_flag +#undef _ + TCP_CONN_N_FLAG_BITS +} tcp_connection_flag_bits_e; + +typedef enum _tcp_connection_flag +{ +#define _(sym, str) TCP_CONN_##sym = 1 << TCP_CONN_##sym##_BIT, + foreach_tcp_connection_flag +#undef _ + TCP_CONN_N_FLAGS +} tcp_connection_flags_e; + +/** TCP buffer flags */ +#define foreach_tcp_buf_flag \ + _ (ACK) /**< Sending ACK. */ \ + _ (DUPACK) /**< Sending DUPACK. */ \ + +enum +{ +#define _(f) TCP_BUF_BIT_##f, + foreach_tcp_buf_flag +#undef _ + TCP_N_BUF_BITS, +}; + +enum +{ +#define _(f) TCP_BUF_FLAG_##f = 1 << TCP_BUF_BIT_##f, + foreach_tcp_buf_flag +#undef _ +}; + +#define TCP_MAX_SACK_BLOCKS 5 /**< Max number of SACK blocks stored */ +#define TCP_INVALID_SACK_HOLE_INDEX ((u32)~0) + +typedef struct _sack_scoreboard_hole +{ + u32 next; /**< Index for next entry in linked list */ + u32 prev; /**< Index for previous entry in linked list */ + u32 start; /**< Start sequence number */ + u32 end; /**< End sequence number */ +} sack_scoreboard_hole_t; + +typedef struct _sack_scoreboard +{ + sack_scoreboard_hole_t *holes; /**< Pool of holes */ + u32 head; /**< Index to first entry */ + u32 sacked_bytes; /**< Number of bytes sacked in sb */ +} sack_scoreboard_t; + +typedef enum _tcp_cc_algorithm_type +{ + TCP_CC_NEWRENO, +} tcp_cc_algorithm_type_e; + +typedef struct _tcp_cc_algorithm tcp_cc_algorithm_t; + +typedef enum _tcp_cc_ack_t +{ + TCP_CC_ACK, + TCP_CC_DUPACK, + TCP_CC_PARTIALACK +} tcp_cc_ack_t; + +typedef struct _tcp_connection +{ + transport_connection_t connection; /**< Common transport data. First! */ + + u8 state; /**< TCP state as per tcp_state_t */ + u16 flags; /**< Connection flags (see tcp_conn_flags_e) */ + u32 timers[TCP_N_TIMERS]; /**< Timer handles into timer wheel */ + + /* TODO RFC4898 */ + + /** Send sequence variables RFC793 */ + u32 snd_una; /**< oldest unacknowledged sequence number */ + u32 snd_una_max; /**< newest unacknowledged sequence number + 1*/ + u32 snd_wnd; /**< send window */ + u32 snd_wl1; /**< seq number used for last snd.wnd update */ + u32 snd_wl2; /**< ack number used for last snd.wnd update */ + u32 snd_nxt; /**< next seq number to be sent */ + + /** Receive sequence variables RFC793 */ + u32 rcv_nxt; /**< next sequence number expected */ + u32 rcv_wnd; /**< receive window we expect */ + + u32 rcv_las; /**< rcv_nxt at last ack sent/rcv_wnd update */ + u32 iss; /**< initial sent sequence */ + u32 irs; /**< initial remote sequence */ + + /* Options */ + tcp_options_t opt; /**< TCP connection options parsed */ + u8 rcv_wscale; /**< Window scale to advertise to peer */ + u8 snd_wscale; /**< Window scale to use when sending */ + u32 tsval_recent; /**< Last timestamp received */ + u32 tsval_recent_age; /**< When last updated tstamp_recent*/ + + sack_block_t *snd_sacks; /**< Vector of SACKs to send. XXX Fixed size? */ + sack_scoreboard_t sack_sb; /**< SACK "scoreboard" that tracks holes */ + + u8 rcv_dupacks; /**< Number of DUPACKs received */ + u8 snt_dupacks; /**< Number of DUPACKs sent in a burst */ + + /* Congestion control */ + u32 cwnd; /**< Congestion window */ + u32 ssthresh; /**< Slow-start threshold */ + u32 prev_ssthresh; /**< ssthresh before congestion */ + u32 bytes_acked; /**< Bytes acknowledged by current segment */ + u32 rtx_bytes; /**< Retransmitted bytes */ + u32 tsecr_last_ack; /**< Timestamp echoed to us in last health ACK */ + tcp_cc_algorithm_t *cc_algo; /**< Congestion control algorithm */ + + /* RTT and RTO */ + u32 rto; /**< Retransmission timeout */ + u32 rto_boff; /**< Index for RTO backoff */ + u32 srtt; /**< Smoothed RTT */ + u32 rttvar; /**< Smoothed mean RTT difference. Approximates variance */ + u32 rtt_ts; /**< Timestamp for tracked ACK */ + u32 rtt_seq; /**< Sequence number for tracked ACK */ + + u16 snd_mss; /**< Send MSS */ +} tcp_connection_t; + +struct _tcp_cc_algorithm +{ + void (*rcv_ack) (tcp_connection_t * tc); + void (*rcv_cong_ack) (tcp_connection_t * tc, tcp_cc_ack_t ack); + void (*congestion) (tcp_connection_t * tc); + void (*recovered) (tcp_connection_t * tc); + void (*init) (tcp_connection_t * tc); +}; + +#define tcp_fastrecovery_on(tc) (tc)->flags |= TCP_CONN_FAST_RECOVERY +#define tcp_fastrecovery_off(tc) (tc)->flags &= ~TCP_CONN_FAST_RECOVERY +#define tcp_in_fastrecovery(tc) ((tc)->flags & TCP_CONN_FAST_RECOVERY) +#define tcp_in_recovery(tc) ((tc)->flags & (TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) +#define tcp_recovery_off(tc) ((tc)->flags &= ~(TCP_CONN_FAST_RECOVERY | TCP_CONN_RECOVERY)) +#define tcp_in_slowstart(tc) (tc->cwnd < tc->ssthresh) + +typedef enum +{ + TCP_IP4, + TCP_IP6, + TCP_N_AF, +} tcp_af_t; + +typedef enum _tcp_error +{ +#define tcp_error(n,s) TCP_ERROR_##n, +#include +#undef tcp_error + TCP_N_ERROR, +} tcp_error_t; + +typedef struct _tcp_lookup_dispatch +{ + u8 next, error; +} tcp_lookup_dispatch_t; + +typedef struct _tcp_main +{ + /* Per-worker thread tcp connection pools */ + tcp_connection_t **connections; + + /* Pool of listeners. */ + tcp_connection_t *listener_pool; + + /** Dispatch table by state and flags */ + tcp_lookup_dispatch_t dispatch_table[TCP_N_STATES][64]; + + u8 log2_tstamp_clocks_per_tick; + f64 tstamp_ticks_per_clock; + + /** per-worker tx buffer free lists */ + u32 **tx_buffers; + + /* Per worker-thread timer wheel for connections timers */ + tw_timer_wheel_16t_2w_512sl_t *timer_wheels; + + /* Convenience per worker-thread vector of connections to DELACK */ + u32 **delack_connections; + + /* Pool of half-open connections on which we've sent a SYN */ + tcp_connection_t *half_open_connections; + + /* Pool of local TCP endpoints */ + transport_endpoint_t *local_endpoints; + + /* Local endpoints lookup table */ + transport_endpoint_table_t local_endpoints_table; + + /* Congestion control algorithms registered */ + tcp_cc_algorithm_t *cc_algos; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; + ip4_main_t *ip4_main; + ip6_main_t *ip6_main; +} tcp_main_t; + +extern tcp_main_t tcp_main; +extern vlib_node_registration_t tcp4_input_node; +extern vlib_node_registration_t tcp6_input_node; +extern vlib_node_registration_t tcp4_output_node; +extern vlib_node_registration_t tcp6_output_node; + +always_inline tcp_main_t * +vnet_get_tcp_main () +{ + return &tcp_main; +} + +always_inline tcp_connection_t * +tcp_connection_get (u32 conn_index, u32 thread_index) +{ + return pool_elt_at_index (tcp_main.connections[thread_index], conn_index); +} + +always_inline tcp_connection_t * +tcp_connection_get_if_valid (u32 conn_index, u32 thread_index) +{ + if (tcp_main.connections[thread_index] == 0) + return 0; + if (pool_is_free_index (tcp_main.connections[thread_index], conn_index)) + return 0; + return pool_elt_at_index (tcp_main.connections[thread_index], conn_index); +} + +void tcp_connection_close (tcp_connection_t * tc); +void tcp_connection_cleanup (tcp_connection_t * tc); +void tcp_connection_del (tcp_connection_t * tc); + +always_inline tcp_connection_t * +tcp_listener_get (u32 tli) +{ + return pool_elt_at_index (tcp_main.listener_pool, tli); +} + +always_inline tcp_connection_t * +tcp_half_open_connection_get (u32 conn_index) +{ + return pool_elt_at_index (tcp_main.half_open_connections, conn_index); +} + +void tcp_make_ack (tcp_connection_t * ts, vlib_buffer_t * b); +void tcp_make_finack (tcp_connection_t * tc, vlib_buffer_t * b); +void tcp_make_synack (tcp_connection_t * ts, vlib_buffer_t * b); +void tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4); +void tcp_send_syn (tcp_connection_t * tc); +void tcp_send_fin (tcp_connection_t * tc); +void tcp_set_snd_mss (tcp_connection_t * tc); + +always_inline u32 +tcp_end_seq (tcp_header_t * th, u32 len) +{ + return th->seq_number + tcp_is_syn (th) + tcp_is_fin (th) + len; +} + +/* Modulo arithmetic for TCP sequence numbers */ +#define seq_lt(_s1, _s2) ((i32)((_s1)-(_s2)) < 0) +#define seq_leq(_s1, _s2) ((i32)((_s1)-(_s2)) <= 0) +#define seq_gt(_s1, _s2) ((i32)((_s1)-(_s2)) > 0) +#define seq_geq(_s1, _s2) ((i32)((_s1)-(_s2)) >= 0) + +/* Modulo arithmetic for timestamps */ +#define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0) +#define timestamp_leq(_t1, _t2) ((i32)((_t1)-(_t2)) <= 0) + +always_inline u32 +tcp_flight_size (const tcp_connection_t * tc) +{ + return tc->snd_una_max - tc->snd_una - tc->sack_sb.sacked_bytes + + tc->rtx_bytes; +} + +/** + * Initial cwnd as per RFC5681 + */ +always_inline u32 +tcp_initial_cwnd (const tcp_connection_t * tc) +{ + if (tc->snd_mss > 2190) + return 2 * tc->snd_mss; + else if (tc->snd_mss > 1095) + return 3 * tc->snd_mss; + else + return 4 * tc->snd_mss; +} + +always_inline u32 +tcp_loss_wnd (const tcp_connection_t * tc) +{ + return tc->snd_mss; +} + +always_inline u32 +tcp_available_wnd (const tcp_connection_t * tc) +{ + return clib_min (tc->cwnd, tc->snd_wnd); +} + +always_inline u32 +tcp_available_snd_space (const tcp_connection_t * tc) +{ + u32 available_wnd = tcp_available_wnd (tc); + u32 flight_size = tcp_flight_size (tc); + + if (available_wnd <= flight_size) + return 0; + + return available_wnd - flight_size; +} + +void tcp_retransmit_first_unacked (tcp_connection_t * tc); + +void tcp_fast_retransmit (tcp_connection_t * tc); + +always_inline u32 +tcp_time_now (void) +{ + return clib_cpu_time_now () * tcp_main.tstamp_ticks_per_clock; +} + +u32 tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b); + +u32 +tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, + u32 max_bytes); + +void tcp_connection_timers_init (tcp_connection_t * tc); +void tcp_connection_timers_reset (tcp_connection_t * tc); + +void tcp_connection_init_vars (tcp_connection_t * tc); + +always_inline void +tcp_connection_force_ack (tcp_connection_t * tc, vlib_buffer_t * b) +{ + /* Reset flags, make sure ack is sent */ + tc->flags = TCP_CONN_SNDACK; + vnet_buffer (b)->tcp.flags &= ~TCP_BUF_FLAG_DUPACK; +} + +always_inline void +tcp_timer_set (tcp_connection_t * tc, u8 timer_id, u32 interval) +{ + tc->timers[timer_id] + = tw_timer_start_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], + tc->c_c_index, timer_id, interval); +} + +always_inline void +tcp_retransmit_timer_set (tcp_main_t * tm, tcp_connection_t * tc) +{ + /* XXX Switch to faster TW */ + tcp_timer_set (tc, TCP_TIMER_RETRANSMIT, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); +} + +always_inline void +tcp_timer_reset (tcp_connection_t * tc, u8 timer_id) +{ + if (tc->timers[timer_id] == TCP_TIMER_HANDLE_INVALID) + return; + + tw_timer_stop_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], + tc->timers[timer_id]); + tc->timers[timer_id] = TCP_TIMER_HANDLE_INVALID; +} + +always_inline void +tcp_timer_update (tcp_connection_t * tc, u8 timer_id, u32 interval) +{ + if (tc->timers[timer_id] != TCP_TIMER_HANDLE_INVALID) + tw_timer_stop_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], + tc->timers[timer_id]); + tc->timers[timer_id] = + tw_timer_start_16t_2w_512sl (&tcp_main.timer_wheels[tc->c_thread_index], + tc->c_c_index, timer_id, interval); +} + +always_inline u8 +tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer) +{ + return tc->timers[timer] != TCP_TIMER_HANDLE_INVALID; +} + +void +scoreboard_remove_hole (sack_scoreboard_t * sb, + sack_scoreboard_hole_t * hole); + +always_inline sack_scoreboard_hole_t * +scoreboard_next_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + if (hole->next != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, hole->next); + return 0; +} + +always_inline sack_scoreboard_hole_t * +scoreboard_first_hole (sack_scoreboard_t * sb) +{ + if (sb->head != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, sb->head); + return 0; +} + +always_inline void +scoreboard_clear (sack_scoreboard_t * sb) +{ + sack_scoreboard_hole_t *hole = scoreboard_first_hole (sb); + while ((hole = scoreboard_first_hole (sb))) + { + scoreboard_remove_hole (sb, hole); + } +} + +always_inline u32 +scoreboard_hole_bytes (sack_scoreboard_hole_t * hole) +{ + return hole->end - hole->start; +} + +always_inline void +tcp_cc_algo_register (tcp_cc_algorithm_type_e type, + const tcp_cc_algorithm_t * vft) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vec_validate (tm->cc_algos, type); + + tm->cc_algos[type] = *vft; +} + +always_inline tcp_cc_algorithm_t * +tcp_cc_algo_get (tcp_cc_algorithm_type_e type) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + return &tm->cc_algos[type]; +} + +void tcp_cc_init (tcp_connection_t * tc); + +/** + * Push TCP header to buffer + * + * @param vm - vlib_main + * @param b - buffer to write the header to + * @param sp_net - source port net order + * @param dp_net - destination port net order + * @param seq - sequence number net order + * @param ack - ack number net order + * @param tcp_hdr_opts_len - header and options length in bytes + * @param flags - header flags + * @param wnd - window size + * + * @return - pointer to start of TCP header + */ +always_inline void * +vlib_buffer_push_tcp_net_order (vlib_buffer_t * b, u16 sp, u16 dp, u32 seq, + u32 ack, u8 tcp_hdr_opts_len, u8 flags, + u16 wnd) +{ + tcp_header_t *th; + + th = vlib_buffer_push_uninit (b, tcp_hdr_opts_len); + + th->src_port = sp; + th->dst_port = dp; + th->seq_number = seq; + th->ack_number = ack; + th->data_offset_and_reserved = (tcp_hdr_opts_len >> 2) << 4; + th->flags = flags; + th->window = wnd; + th->checksum = 0; + th->urgent_pointer = 0; + return th; +} + +/** + * Push TCP header to buffer + * + * @param vm - vlib_main + * @param b - buffer to write the header to + * @param sp_net - source port net order + * @param dp_net - destination port net order + * @param seq - sequence number host order + * @param ack - ack number host order + * @param tcp_hdr_opts_len - header and options length in bytes + * @param flags - header flags + * @param wnd - window size + * + * @return - pointer to start of TCP header + */ +always_inline void * +vlib_buffer_push_tcp (vlib_buffer_t * b, u16 sp_net, u16 dp_net, u32 seq, + u32 ack, u8 tcp_hdr_opts_len, u8 flags, u16 wnd) +{ + return vlib_buffer_push_tcp_net_order (b, sp_net, dp_net, + clib_host_to_net_u32 (seq), + clib_host_to_net_u32 (ack), + tcp_hdr_opts_len, flags, + clib_host_to_net_u16 (wnd)); +} + +#endif /* _vnet_tcp_h_ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_error.def b/src/vnet/tcp/tcp_error.def new file mode 100644 index 00000000..cff5ec13 --- /dev/null +++ b/src/vnet/tcp/tcp_error.def @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +tcp_error (NONE, "no error") +tcp_error (NO_LISTENER, "no listener for dst port") +tcp_error (LOOKUP_DROPS, "lookup drops") +tcp_error (DISPATCH, "Dispatch error") +tcp_error (ENQUEUED, "Packets pushed into rx fifo") +tcp_error (PURE_ACK, "Pure acks") +tcp_error (SYNS_RCVD, "SYNs received") +tcp_error (SYN_ACKS_RCVD, "SYN-ACKs received") +tcp_error (NOT_READY, "Session not ready for packets") +tcp_error (FIFO_FULL, "Packets dropped for lack of rx fifo space") +tcp_error (EVENT_FIFO_FULL, "Events not sent for lack of event fifo space") +tcp_error (API_QUEUE_FULL, "Sessions not created for lack of API queue space") +tcp_error (CREATE_SESSION_FAIL, "Sessions couldn't be allocated") +tcp_error (SEGMENT_INVALID, "Invalid segment") +tcp_error (ACK_INVALID, "Invalid ACK") +tcp_error (ACK_DUP, "Duplicate ACK") +tcp_error (ACK_OLD, "Old ACK") +tcp_error (PKTS_SENT, "Packets sent") +tcp_error (FILTERED_DUPACKS, "Filtered duplicate ACKs") +tcp_error (RST_SENT, "Resets sent") \ No newline at end of file diff --git a/src/vnet/tcp/tcp_format.c b/src/vnet/tcp/tcp_format.c new file mode 100644 index 00000000..7136741d --- /dev/null +++ b/src/vnet/tcp/tcp_format.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * tcp/tcp_format.c: tcp formatting + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include + +static u8 * +format_tcp_flags (u8 * s, va_list * args) +{ + int flags = va_arg (*args, int); + +#define _(f) if (flags & TCP_FLAG_##f) s = format (s, "%s, ", #f); + foreach_tcp_flag +#undef _ + return s; +} + +/* Format TCP header. */ +u8 * +format_tcp_header (u8 * s, va_list * args) +{ + tcp_header_t *tcp = va_arg (*args, tcp_header_t *); + u32 max_header_bytes = va_arg (*args, u32); + u32 header_bytes; + uword indent; + + /* Nothing to do. */ + if (max_header_bytes < sizeof (tcp[0])) + return format (s, "TCP header truncated"); + + indent = format_get_indent (s); + indent += 2; + header_bytes = tcp_header_bytes (tcp); + + s = format (s, "TCP: %d -> %d", clib_net_to_host_u16 (tcp->src), + clib_net_to_host_u16 (tcp->dst)); + + s = format (s, "\n%Useq. 0x%08x ack 0x%08x", format_white_space, indent, + clib_net_to_host_u32 (tcp->seq_number), + clib_net_to_host_u32 (tcp->ack_number)); + + s = format (s, "\n%Uflags %U, tcp header: %d bytes", format_white_space, + indent, format_tcp_flags, tcp->flags, header_bytes); + + s = format (s, "\n%Uwindow %d, checksum 0x%04x", format_white_space, indent, + clib_net_to_host_u16 (tcp->window), + clib_net_to_host_u16 (tcp->checksum)); + + +#if 0 + /* Format TCP options. */ + { + u8 *o; + u8 *option_start = (void *) (tcp + 1); + u8 *option_end = (void *) tcp + header_bytes; + + for (o = option_start; o < option_end;) + { + u32 length = o[1]; + switch (o[0]) + { + case TCP_OPTION_END: + length = 1; + o = option_end; + break; + + case TCP_OPTION_NOOP: + length = 1; + break; + + } + } + } +#endif + + /* Recurse into next protocol layer. */ + if (max_header_bytes != 0 && header_bytes < max_header_bytes) + { + ip_main_t *im = &ip_main; + tcp_udp_port_info_t *pi; + + pi = ip_get_tcp_udp_port_info (im, tcp->dst); + + if (pi && pi->format_header) + s = format (s, "\n%U%U", format_white_space, indent - 2, + pi->format_header, + /* next protocol header */ (void *) tcp + header_bytes, + max_header_bytes - header_bytes); + } + + return s; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c new file mode 100644 index 00000000..daa0683b --- /dev/null +++ b/src/vnet/tcp/tcp_input.c @@ -0,0 +1,2316 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +static char *tcp_error_strings[] = { +#define tcp_error(n,s) s, +#include +#undef tcp_error +}; + +/* All TCP nodes have the same outgoing arcs */ +#define foreach_tcp_state_next \ + _ (DROP, "error-drop") \ + _ (TCP4_OUTPUT, "tcp4-output") \ + _ (TCP6_OUTPUT, "tcp6-output") + +typedef enum _tcp_established_next +{ +#define _(s,n) TCP_ESTABLISHED_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_ESTABLISHED_N_NEXT, +} tcp_established_next_t; + +typedef enum _tcp_rcv_process_next +{ +#define _(s,n) TCP_RCV_PROCESS_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_RCV_PROCESS_N_NEXT, +} tcp_rcv_process_next_t; + +typedef enum _tcp_syn_sent_next +{ +#define _(s,n) TCP_SYN_SENT_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_SYN_SENT_N_NEXT, +} tcp_syn_sent_next_t; + +typedef enum _tcp_listen_next +{ +#define _(s,n) TCP_LISTEN_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_LISTEN_N_NEXT, +} tcp_listen_next_t; + +/* Generic, state independent indices */ +typedef enum _tcp_state_next +{ +#define _(s,n) TCP_NEXT_##s, + foreach_tcp_state_next +#undef _ + TCP_STATE_N_NEXT, +} tcp_state_next_t; + +#define tcp_next_output(is_ip4) (is_ip4 ? TCP_NEXT_TCP4_OUTPUT \ + : TCP_NEXT_TCP6_OUTPUT) + +vlib_node_registration_t tcp4_established_node; +vlib_node_registration_t tcp6_established_node; + +/** + * Validate segment sequence number. As per RFC793: + * + * Segment Receive Test + * Length Window + * ------- ------- ------------------------------------------- + * 0 0 SEG.SEQ = RCV.NXT + * 0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND + * >0 0 not acceptable + * >0 >0 RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND + * or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND + * + * This ultimately consists in checking if segment falls within the window. + * The one important difference compared to RFC793 is that we use rcv_las, + * or the rcv_nxt at last ack sent instead of rcv_nxt since that's the + * peer's reference when computing our receive window. + * + * This accepts only segments within the window. + */ +always_inline u8 +tcp_segment_in_rcv_wnd (tcp_connection_t * tc, u32 seq, u32 end_seq) +{ + return seq_leq (end_seq, tc->rcv_las + tc->rcv_wnd) + && seq_geq (seq, tc->rcv_nxt); +} + +void +tcp_options_parse (tcp_header_t * th, tcp_options_t * to) +{ + const u8 *data; + u8 opt_len, opts_len, kind; + int j; + sack_block_t b; + + opts_len = (tcp_doff (th) << 2) - sizeof (tcp_header_t); + data = (const u8 *) (th + 1); + + /* Zero out all flags but those set in SYN */ + to->flags &= (TCP_OPTS_FLAG_SACK_PERMITTED | TCP_OPTS_FLAG_WSCALE); + + for (; opts_len > 0; opts_len -= opt_len, data += opt_len) + { + kind = data[0]; + + /* Get options length */ + if (kind == TCP_OPTION_EOL) + break; + else if (kind == TCP_OPTION_NOOP) + opt_len = 1; + else + { + /* broken options */ + if (opts_len < 2) + break; + opt_len = data[1]; + + /* weird option length */ + if (opt_len < 2 || opt_len > opts_len) + break; + } + + /* Parse options */ + switch (kind) + { + case TCP_OPTION_MSS: + if ((opt_len == TCP_OPTION_LEN_MSS) && tcp_syn (th)) + { + to->flags |= TCP_OPTS_FLAG_MSS; + to->mss = clib_net_to_host_u16 (*(u16 *) (data + 2)); + } + break; + case TCP_OPTION_WINDOW_SCALE: + if ((opt_len == TCP_OPTION_LEN_WINDOW_SCALE) && tcp_syn (th)) + { + to->flags |= TCP_OPTS_FLAG_WSCALE; + to->wscale = data[2]; + if (to->wscale > TCP_MAX_WND_SCALE) + { + clib_warning ("Illegal window scaling value: %d", + to->wscale); + to->wscale = TCP_MAX_WND_SCALE; + } + } + break; + case TCP_OPTION_TIMESTAMP: + if (opt_len == TCP_OPTION_LEN_TIMESTAMP) + { + to->flags |= TCP_OPTS_FLAG_TSTAMP; + to->tsval = clib_net_to_host_u32 (*(u32 *) (data + 2)); + to->tsecr = clib_net_to_host_u32 (*(u32 *) (data + 6)); + } + break; + case TCP_OPTION_SACK_PERMITTED: + if (opt_len == TCP_OPTION_LEN_SACK_PERMITTED && tcp_syn (th)) + to->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; + break; + case TCP_OPTION_SACK_BLOCK: + /* If SACK permitted was not advertised or a SYN, break */ + if ((to->flags & TCP_OPTS_FLAG_SACK_PERMITTED) == 0 || tcp_syn (th)) + break; + + /* If too short or not correctly formatted, break */ + if (opt_len < 10 || ((opt_len - 2) % TCP_OPTION_LEN_SACK_BLOCK)) + break; + + to->flags |= TCP_OPTS_FLAG_SACK; + to->n_sack_blocks = (opt_len - 2) / TCP_OPTION_LEN_SACK_BLOCK; + vec_reset_length (to->sacks); + for (j = 0; j < to->n_sack_blocks; j++) + { + b.start = clib_net_to_host_u32 (*(u32 *) (data + 2 + 4 * j)); + b.end = clib_net_to_host_u32 (*(u32 *) (data + 6 + 4 * j)); + vec_add1 (to->sacks, b); + } + break; + default: + /* Nothing to see here */ + continue; + } + } +} + +always_inline int +tcp_segment_check_paws (tcp_connection_t * tc) +{ + /* XXX normally test for timestamp should be lt instead of leq, but for + * local testing this is not enough */ + return tcp_opts_tstamp (&tc->opt) && tc->tsval_recent + && timestamp_lt (tc->opt.tsval, tc->tsval_recent); +} + +/** + * Validate incoming segment as per RFC793 p. 69 and RFC1323 p. 19 + * + * It first verifies if segment has a wrapped sequence number (PAWS) and then + * does the processing associated to the first four steps (ignoring security + * and precedence): sequence number, rst bit and syn bit checks. + * + * @return 0 if segments passes validation. + */ +static int +tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, + vlib_buffer_t * b0, tcp_header_t * th0, u32 * next0) +{ + u8 paws_failed; + + if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0))) + return -1; + + tcp_options_parse (th0, &tc0->opt); + + /* RFC1323: Check against wrapped sequence numbers (PAWS). If we have + * timestamp to echo and it's less than tsval_recent, drop segment + * but still send an ACK in order to retain TCP's mechanism for detecting + * and recovering from half-open connections */ + paws_failed = tcp_segment_check_paws (tc0); + if (paws_failed) + { + clib_warning ("paws failed"); + + /* If it just so happens that a segment updates tsval_recent for a + * segment over 24 days old, invalidate tsval_recent. */ + if (timestamp_lt (tc0->tsval_recent_age + TCP_PAWS_IDLE, + tcp_time_now ())) + { + /* Age isn't reset until we get a valid tsval (bsd inspired) */ + tc0->tsval_recent = 0; + } + else + { + /* Drop after ack if not rst */ + if (!tcp_rst (th0)) + { + tcp_make_ack (tc0, b0); + *next0 = tcp_next_output (tc0->c_is_ip4); + return -1; + } + } + } + + /* 1st: check sequence number */ + if (!tcp_segment_in_rcv_wnd (tc0, vnet_buffer (b0)->tcp.seq_number, + vnet_buffer (b0)->tcp.seq_end)) + { + if (!tcp_rst (th0)) + { + /* Send dup ack */ + tcp_make_ack (tc0, b0); + *next0 = tcp_next_output (tc0->c_is_ip4); + } + return -1; + } + + /* 2nd: check the RST bit */ + if (tcp_rst (th0)) + { + /* Notify session that connection has been reset. Switch + * state to closed and await for session to do the cleanup. */ + stream_session_reset_notify (&tc0->connection); + tc0->state = TCP_STATE_CLOSED; + return -1; + } + + /* 3rd: check security and precedence (skip) */ + + /* 4th: check the SYN bit */ + if (tcp_syn (th0)) + { + tcp_send_reset (b0, tc0->c_is_ip4); + return -1; + } + + /* If PAWS passed and segment in window, save timestamp */ + if (!paws_failed) + { + tc0->tsval_recent = tc0->opt.tsval; + tc0->tsval_recent_age = tcp_time_now (); + } + + return 0; +} + +always_inline int +tcp_rcv_ack_is_acceptable (tcp_connection_t * tc0, vlib_buffer_t * tb0) +{ + /* SND.UNA =< SEG.ACK =< SND.NXT */ + return (seq_leq (tc0->snd_una, vnet_buffer (tb0)->tcp.ack_number) + && seq_leq (vnet_buffer (tb0)->tcp.ack_number, tc0->snd_nxt)); +} + +/** + * Compute smoothed RTT as per VJ's '88 SIGCOMM and RFC6298 + * + * Note that although the original article, srtt and rttvar are scaled + * to minimize round-off errors, here we don't. Instead, we rely on + * better precision time measurements. + * + * TODO support us rtt resolution + */ +static void +tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) +{ + int err; + + if (tc->srtt != 0) + { + err = mrtt - tc->srtt; + tc->srtt += err >> 3; + + /* XXX Drop in RTT results in RTTVAR increase and bigger RTO. + * The increase should be bound */ + tc->rttvar += (clib_abs (err) - tc->rttvar) >> 2; + } + else + { + /* First measurement. */ + tc->srtt = mrtt; + tc->rttvar = mrtt << 1; + } +} + +/** Update RTT estimate and RTO timer + * + * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK + * timing. Middle boxes are known to fiddle with TCP options so we + * should give higher priority to ACK timing. + * + * return 1 if valid rtt 0 otherwise + */ +static int +tcp_update_rtt (tcp_connection_t * tc, u32 ack) +{ + u32 mrtt = 0; + + /* Karn's rule, part 1. Don't use retransmitted segments to estimate + * RTT because they're ambiguous. */ + if (tc->rtt_seq && seq_gt (ack, tc->rtt_seq) && !tc->rto_boff) + { + mrtt = tcp_time_now () - tc->rtt_ts; + tc->rtt_seq = 0; + } + + /* As per RFC7323 TSecr can be used for RTTM only if the segment advances + * snd_una, i.e., the left side of the send window: + * seq_lt (tc->snd_una, ack). Note: last condition could be dropped, we don't + * try to update rtt for dupacks */ + else if (tcp_opts_tstamp (&tc->opt) && tc->opt.tsecr && tc->bytes_acked) + { + mrtt = tcp_time_now () - tc->opt.tsecr; + } + + /* Ignore dubious measurements */ + if (mrtt == 0 || mrtt > TCP_RTT_MAX) + return 0; + + tcp_estimate_rtt (tc, mrtt); + + tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); + + return 1; +} + +/** + * Dequeue bytes that have been acked and while at it update RTT estimates. + */ +static void +tcp_dequeue_acked (tcp_connection_t * tc, u32 ack) +{ + /* Dequeue the newly ACKed bytes */ + stream_session_dequeue_drop (&tc->connection, tc->bytes_acked); + + /* Update rtt and rto */ + if (tcp_update_rtt (tc, ack)) + { + /* Good ACK received and valid RTT, make sure retransmit backoff is 0 */ + tc->rto_boff = 0; + } +} + +/** Check if dupack as per RFC5681 Sec. 2 */ +always_inline u8 +tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 new_snd_wnd) +{ + return ((vnet_buffer (b)->tcp.ack_number == tc->snd_una) + && seq_gt (tc->snd_una_max, tc->snd_una) + && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number) + && (new_snd_wnd == tc->snd_wnd)); +} + +void +scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + sack_scoreboard_hole_t *next, *prev; + + if (hole->next != TCP_INVALID_SACK_HOLE_INDEX) + { + next = pool_elt_at_index (sb->holes, hole->next); + next->prev = hole->prev; + } + + if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX) + { + prev = pool_elt_at_index (sb->holes, hole->prev); + prev->next = hole->next; + } + else + { + sb->head = hole->next; + } + + pool_put (sb->holes, hole); +} + +sack_scoreboard_hole_t * +scoreboard_insert_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * prev, + u32 start, u32 end) +{ + sack_scoreboard_hole_t *hole, *next; + u32 hole_index; + + pool_get (sb->holes, hole); + memset (hole, 0, sizeof (*hole)); + + hole->start = start; + hole->end = end; + hole_index = hole - sb->holes; + + if (prev) + { + hole->prev = prev - sb->holes; + hole->next = prev->next; + + if ((next = scoreboard_next_hole (sb, hole))) + next->prev = hole_index; + + prev->next = hole_index; + } + else + { + sb->head = hole_index; + hole->prev = TCP_INVALID_SACK_HOLE_INDEX; + hole->next = TCP_INVALID_SACK_HOLE_INDEX; + } + + return hole; +} + +static void +tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) +{ + sack_scoreboard_t *sb = &tc->sack_sb; + sack_block_t *blk, tmp; + sack_scoreboard_hole_t *hole, *next_hole; + u32 blk_index = 0; + int i, j; + + if (!tcp_opts_sack (tc) && sb->head == TCP_INVALID_SACK_HOLE_INDEX) + return; + + /* Remove invalid blocks */ + vec_foreach (blk, tc->opt.sacks) + { + if (seq_lt (blk->start, blk->end) + && seq_gt (blk->start, tc->snd_una) + && seq_gt (blk->start, ack) && seq_lt (blk->end, tc->snd_nxt)) + continue; + + vec_del1 (tc->opt.sacks, blk - tc->opt.sacks); + } + + /* Add block for cumulative ack */ + if (seq_gt (ack, tc->snd_una)) + { + tmp.start = tc->snd_una; + tmp.end = ack; + vec_add1 (tc->opt.sacks, tmp); + } + + if (vec_len (tc->opt.sacks) == 0) + return; + + /* Make sure blocks are ordered */ + for (i = 0; i < vec_len (tc->opt.sacks); i++) + for (j = i; j < vec_len (tc->opt.sacks); j++) + if (seq_lt (tc->opt.sacks[j].start, tc->opt.sacks[i].start)) + { + tmp = tc->opt.sacks[i]; + tc->opt.sacks[i] = tc->opt.sacks[j]; + tc->opt.sacks[j] = tmp; + } + + /* If no holes, insert the first that covers all outstanding bytes */ + if (sb->head == TCP_INVALID_SACK_HOLE_INDEX) + { + scoreboard_insert_hole (sb, 0, tc->snd_una, tc->snd_una_max); + } + + /* Walk the holes with the SACK blocks */ + hole = pool_elt_at_index (sb->holes, sb->head); + while (hole && blk_index < vec_len (tc->opt.sacks)) + { + blk = &tc->opt.sacks[blk_index]; + + if (seq_leq (blk->start, hole->start)) + { + /* Block covers hole. Remove hole */ + if (seq_geq (blk->end, hole->end)) + { + next_hole = scoreboard_next_hole (sb, hole); + + /* Byte accounting */ + if (seq_lt (hole->end, ack)) + { + /* Bytes lost because snd wnd left edge advances */ + if (seq_lt (next_hole->start, ack)) + sb->sacked_bytes -= next_hole->start - hole->end; + else + sb->sacked_bytes -= ack - hole->end; + } + else + { + sb->sacked_bytes += scoreboard_hole_bytes (hole); + } + + scoreboard_remove_hole (sb, hole); + hole = next_hole; + } + /* Partial overlap */ + else + { + sb->sacked_bytes += blk->end - hole->start; + hole->start = blk->end; + blk_index++; + } + } + else + { + /* Hole must be split */ + if (seq_leq (blk->end, hole->end)) + { + sb->sacked_bytes += blk->end - blk->start; + scoreboard_insert_hole (sb, hole, blk->end, hole->end); + hole->end = blk->start - 1; + blk_index++; + } + else + { + sb->sacked_bytes += hole->end - blk->start + 1; + hole->end = blk->start - 1; + hole = scoreboard_next_hole (sb, hole); + } + } + } +} + +/** Update snd_wnd + * + * If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set + * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */ +static void +tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) +{ + if (tc->snd_wl1 < seq || (tc->snd_wl1 == seq && tc->snd_wl2 <= ack)) + { + tc->snd_wnd = snd_wnd; + tc->snd_wl1 = seq; + tc->snd_wl2 = ack; + } +} + +static void +tcp_cc_congestion (tcp_connection_t * tc) +{ + tc->cc_algo->congestion (tc); +} + +static void +tcp_cc_recover (tcp_connection_t * tc) +{ + if (tcp_in_fastrecovery (tc)) + { + tc->cc_algo->recovered (tc); + tcp_recovery_off (tc); + } + else if (tcp_in_recovery (tc)) + { + tcp_recovery_off (tc); + tc->cwnd = tcp_loss_wnd (tc); + } +} + +static void +tcp_cc_rcv_ack (tcp_connection_t * tc) +{ + u8 partial_ack; + + if (tcp_in_recovery (tc)) + { + partial_ack = seq_lt (tc->snd_una, tc->snd_una_max); + if (!partial_ack) + { + /* Clear retransmitted bytes. */ + tc->rtx_bytes = 0; + tcp_cc_recover (tc); + } + else + { + /* Clear retransmitted bytes. XXX should we clear all? */ + tc->rtx_bytes = 0; + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); + + /* Retransmit first unacked segment */ + tcp_retransmit_first_unacked (tc); + } + } + else + { + tc->cc_algo->rcv_ack (tc); + } + + tc->rcv_dupacks = 0; + tc->tsecr_last_ack = tc->opt.tsecr; +} + +static void +tcp_cc_rcv_dupack (tcp_connection_t * tc, u32 ack) +{ + ASSERT (tc->snd_una == ack); + + tc->rcv_dupacks++; + if (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) + { + /* RFC6582 NewReno heuristic to avoid multiple fast retransmits */ + if (tc->opt.tsecr != tc->tsecr_last_ack) + { + tc->rcv_dupacks = 0; + return; + } + + tcp_fastrecovery_on (tc); + + /* Handle congestion and dupack */ + tcp_cc_congestion (tc); + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + + tcp_fast_retransmit (tc); + + /* Post retransmit update cwnd to ssthresh and account for the + * three segments that have left the network and should've been + * buffered at the receiver */ + tc->cwnd = tc->ssthresh + TCP_DUPACK_THRESHOLD * tc->snd_mss; + } + else if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD) + { + ASSERT (tcp_in_fastrecovery (tc)); + + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + } +} + +void +tcp_cc_init (tcp_connection_t * tc) +{ + tc->cc_algo = tcp_cc_algo_get (TCP_CC_NEWRENO); + tc->cc_algo->init (tc); +} + +static int +tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, + tcp_header_t * th, u32 * next, u32 * error) +{ + u32 new_snd_wnd; + + /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) then send an + * ACK, drop the segment, and return */ + if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)) + { + tcp_make_ack (tc, b); + *next = tcp_next_output (tc->c_is_ip4); + *error = TCP_ERROR_ACK_INVALID; + return -1; + } + + /* If old ACK, discard */ + if (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)) + { + *error = TCP_ERROR_ACK_OLD; + return -1; + } + + if (tcp_opts_sack_permitted (&tc->opt)) + tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number); + + new_snd_wnd = clib_net_to_host_u32 (th->window) << tc->snd_wscale; + + if (tcp_ack_is_dupack (tc, b, new_snd_wnd)) + { + tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number); + *error = TCP_ERROR_ACK_DUP; + return -1; + } + + /* Valid ACK */ + tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una; + tc->snd_una = vnet_buffer (b)->tcp.ack_number; + + /* Dequeue ACKed packet and update RTT */ + tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number); + + tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number, + vnet_buffer (b)->tcp.ack_number, new_snd_wnd); + + /* Updates congestion control (slow start/congestion avoidance) */ + tcp_cc_rcv_ack (tc); + + /* If everything has been acked, stop retransmit timer + * otherwise update */ + if (tc->snd_una == tc->snd_una_max) + tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT); + else + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, tc->rto); + + return 0; +} + +/** + * Build SACK list as per RFC2018. + * + * Makes sure the first block contains the segment that generated the current + * ACK and the following ones are the ones most recently reported in SACK + * blocks. + * + * @param tc TCP connection for which the SACK list is updated + * @param start Start sequence number of the newest SACK block + * @param end End sequence of the newest SACK block + */ +static void +tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end) +{ + sack_block_t *new_list = 0, block; + u32 n_elts; + int i; + u8 new_head = 0; + + /* If the first segment is ooo add it to the list. Last write might've moved + * rcv_nxt over the first segment. */ + if (seq_lt (tc->rcv_nxt, start)) + { + block.start = start; + block.end = end; + vec_add1 (new_list, block); + new_head = 1; + } + + /* Find the blocks still worth keeping. */ + for (i = 0; i < vec_len (tc->snd_sacks); i++) + { + /* Discard if: + * 1) rcv_nxt advanced beyond current block OR + * 2) Segment overlapped by the first segment, i.e., it has been merged + * into it.*/ + if (seq_leq (tc->snd_sacks[i].start, tc->rcv_nxt) + || seq_leq (tc->snd_sacks[i].start, end)) + continue; + + /* Save subsequent segments to new SACK list. */ + n_elts = clib_min (vec_len (tc->snd_sacks) - i, + TCP_MAX_SACK_BLOCKS - new_head); + vec_insert_elts (new_list, &tc->snd_sacks[i], n_elts, new_head); + break; + } + + /* Replace old vector with new one */ + vec_free (tc->snd_sacks); + tc->snd_sacks = new_list; +} + +/** Enqueue data for delivery to application */ +always_inline u32 +tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, + u16 data_len) +{ + int written; + + /* Pure ACK. Update rcv_nxt and be done. */ + if (PREDICT_FALSE (data_len == 0)) + { + tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end; + return TCP_ERROR_PURE_ACK; + } + + written = stream_session_enqueue_data (&tc->connection, + vlib_buffer_get_current (b), + data_len, 1 /* queue event */ ); + + /* Update rcv_nxt */ + if (PREDICT_TRUE (written == data_len)) + { + tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end; + } + /* If more data written than expected, account for out-of-order bytes. */ + else if (written > data_len) + { + tc->rcv_nxt = vnet_buffer (b)->tcp.seq_end + written - data_len; + + /* Send ACK confirming the update */ + tc->flags |= TCP_CONN_SNDACK; + + /* Update SACK list if need be */ + if (tcp_opts_sack_permitted (&tc->opt)) + { + /* Remove SACK blocks that have been delivered */ + tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt); + } + } + else + { + ASSERT (0); + return TCP_ERROR_FIFO_FULL; + } + + return TCP_ERROR_ENQUEUED; +} + +/** Enqueue out-of-order data */ +always_inline u32 +tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, + u16 data_len) +{ + stream_session_t *s0; + u32 offset, seq; + + s0 = stream_session_get (tc->c_s_index, tc->c_thread_index); + seq = vnet_buffer (b)->tcp.seq_number; + offset = seq - tc->rcv_nxt; + + if (svm_fifo_enqueue_with_offset (s0->server_rx_fifo, s0->pid, offset, + data_len, vlib_buffer_get_current (b))) + return TCP_ERROR_FIFO_FULL; + + /* Update SACK list if in use */ + if (tcp_opts_sack_permitted (&tc->opt)) + { + ooo_segment_t *newest; + u32 start, end; + + /* Get the newest segment from the fifo */ + newest = svm_fifo_newest_ooo_segment (s0->server_rx_fifo); + start = tc->rcv_nxt + ooo_segment_offset (s0->server_rx_fifo, newest); + end = tc->rcv_nxt + ooo_segment_end_offset (s0->server_rx_fifo, newest); + + tcp_update_sack_list (tc, start, end); + } + + return TCP_ERROR_ENQUEUED; +} + +/** + * Check if ACK could be delayed. DELACK timer is set only after frame is + * processed so this can return true for a full bursts of packets. + */ +always_inline int +tcp_can_delack (tcp_connection_t * tc) +{ + /* If there's no DELACK timer set and the last window sent wasn't 0 we + * can safely delay. */ + if (!tcp_timer_is_active (tc, TCP_TIMER_DELACK) + && (tc->flags & TCP_CONN_SENT_RCV_WND0) == 0 + && (tc->flags & TCP_CONN_SNDACK) == 0) + return 1; + + return 0; +} + +static int +tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, + u16 n_data_bytes, u32 * next0) +{ + u32 error = 0; + + /* Handle out-of-order data */ + if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt)) + { + error = tcp_session_enqueue_ooo (tc, b, n_data_bytes); + + /* Don't send more than 3 dupacks per burst + * XXX decide if this is good */ + if (tc->snt_dupacks < 3) + { + /* RFC2581: Send DUPACK for fast retransmit */ + tcp_make_ack (tc, b); + *next0 = tcp_next_output (tc->c_is_ip4); + + /* Mark as DUPACK. We may filter these in output if + * the burst fills the holes. */ + vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_DUPACK; + + tc->snt_dupacks++; + } + + goto done; + } + + /* In order data, enqueue. Fifo figures out by itself if any out-of-order + * segments can be enqueued after fifo tail offset changes. */ + error = tcp_session_enqueue_data (tc, b, n_data_bytes); + + /* Check if ACK can be delayed */ + if (tcp_can_delack (tc)) + { + /* Nothing to do for pure ACKs */ + if (n_data_bytes == 0) + goto done; + + /* If connection has not been previously marked for delay ack + * add it to the list and flag it */ + if (!tc->flags & TCP_CONN_DELACK) + { + vec_add1 (tm->delack_connections[tc->c_thread_index], + tc->c_c_index); + tc->flags |= TCP_CONN_DELACK; + } + } + else + { + /* Check if a packet has already been enqueued to output for burst. + * If yes, then drop this one, otherwise, let it pass through to + * output */ + if ((tc->flags & TCP_CONN_BURSTACK) == 0) + { + *next0 = tcp_next_output (tc->c_is_ip4); + tcp_make_ack (tc, b); + error = TCP_ERROR_ENQUEUED; + + /* TODO: maybe add counter to ensure N acks will be sent/burst */ + tc->flags |= TCP_CONN_BURSTACK; + } + } + +done: + return error; +} + +void +delack_timers_init (tcp_main_t * tm, u32 thread_index) +{ + tcp_connection_t *tc; + u32 i, *conns; + tw_timer_wheel_16t_2w_512sl_t *tw; + + tw = &tm->timer_wheels[thread_index]; + conns = tm->delack_connections[thread_index]; + for (i = 0; i < vec_len (conns); i++) + { + tc = pool_elt_at_index (tm->connections[thread_index], conns[i]); + ASSERT (0 != tc); + + tc->timers[TCP_TIMER_DELACK] + = tw_timer_start_16t_2w_512sl (tw, conns[i], + TCP_TIMER_DELACK, TCP_DELACK_TIME); + } + vec_reset_length (tm->delack_connections[thread_index]); +} + +always_inline uword +tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index, errors = 0; + tcp_main_t *tm = vnet_get_tcp_main (); + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_header_t *th0 = 0; + tcp_connection_t *tc0; + ip4_header_t *ip40; + ip6_header_t *ip60; + u32 n_advance_bytes0, n_data_bytes0; + u32 next0 = TCP_ESTABLISHED_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, + my_thread_index); + + /* Checksum computed by ipx_local no need to compute again */ + + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + th0 = ip4_next_header (ip40); + n_advance_bytes0 = (ip4_header_bytes (ip40) + + tcp_header_bytes (th0)); + n_data_bytes0 = clib_net_to_host_u16 (ip40->length) + - n_advance_bytes0; + } + else + { + ip60 = vlib_buffer_get_current (b0); + th0 = ip6_next_header (ip60); + n_advance_bytes0 = tcp_header_bytes (th0); + n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length) + - n_advance_bytes0; + n_advance_bytes0 += sizeof (ip60[0]); + } + + /* SYNs, FINs and data consume sequence numbers */ + vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number + + tcp_is_syn (th0) + tcp_is_fin (th0) + n_data_bytes0; + + /* TODO header prediction fast path */ + + /* 1-4: check SEQ, RST, SYN */ + if (PREDICT_FALSE (tcp_segment_validate (vm, tc0, b0, th0, &next0))) + { + error0 = TCP_ERROR_SEGMENT_INVALID; + goto drop; + } + + /* 5: check the ACK field */ + if (tcp_rcv_ack (tc0, b0, th0, &next0, &error0)) + { + goto drop; + } + + /* 6: check the URG bit TODO */ + + /* 7: process the segment text */ + vlib_buffer_advance (b0, n_advance_bytes0); + error0 = tcp_segment_rcv (tm, tc0, b0, n_data_bytes0, &next0); + + /* 8: check the FIN bit */ + if (tcp_fin (th0)) + { + /* Send ACK and enter CLOSE-WAIT */ + tcp_make_ack (tc0, b0); + tcp_connection_force_ack (tc0, b0); + next0 = tcp_next_output (tc0->c_is_ip4); + tc0->state = TCP_STATE_CLOSE_WAIT; + stream_session_disconnect_notify (&tc0->connection); + } + + drop: + b0->error = node->errors[error0]; + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + errors = session_manager_flush_enqueue_events (my_thread_index); + if (errors) + { + if (is_ip4) + vlib_node_increment_counter (vm, tcp4_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + else + vlib_node_increment_counter (vm, tcp6_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + } + + delack_timers_init (tm, my_thread_index); + + return from_frame->n_vectors; +} + +static uword +tcp4_established (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_established_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_established (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_established_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_established_node) = +{ + .function = tcp4_established, + .name = "tcp4-established", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR,.error_strings = tcp_error_strings, + .n_next_nodes = TCP_ESTABLISHED_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_established_node, tcp4_established); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_established_node) = +{ + .function = tcp6_established, + .name = "tcp6-established", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_ESTABLISHED_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_ESTABLISHED_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_established_node, tcp6_established); + +vlib_node_registration_t tcp4_syn_sent_node; +vlib_node_registration_t tcp6_syn_sent_node; + +always_inline uword +tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index, errors = 0; + u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0, ack0, seq0; + vlib_buffer_t *b0; + tcp_header_t *tcp0 = 0; + tcp_connection_t *tc0; + ip4_header_t *ip40; + ip6_header_t *ip60; + u32 n_advance_bytes0, n_data_bytes0; + tcp_connection_t *new_tc0; + u32 next0 = TCP_SYN_SENT_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + tc0 = + tcp_half_open_connection_get (vnet_buffer (b0)-> + tcp.connection_index); + + ack0 = vnet_buffer (b0)->tcp.ack_number; + seq0 = vnet_buffer (b0)->tcp.seq_number; + + /* Checksum computed by ipx_local no need to compute again */ + + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + tcp0 = ip4_next_header (ip40); + n_advance_bytes0 = (ip4_header_bytes (ip40) + + tcp_header_bytes (tcp0)); + n_data_bytes0 = clib_net_to_host_u16 (ip40->length) + - n_advance_bytes0; + } + else + { + ip60 = vlib_buffer_get_current (b0); + tcp0 = ip6_next_header (ip60); + n_advance_bytes0 = tcp_header_bytes (tcp0); + n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length) + - n_advance_bytes0; + n_advance_bytes0 += sizeof (ip60[0]); + } + + if (PREDICT_FALSE + (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0))) + goto drop; + + /* SYNs, FINs and data consume sequence numbers */ + vnet_buffer (b0)->tcp.seq_end = seq0 + tcp_is_syn (tcp0) + + tcp_is_fin (tcp0) + n_data_bytes0; + + /* + * 1. check the ACK bit + */ + + /* + * If the ACK bit is set + * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless + * the RST bit is set, if so drop the segment and return) + * + * and discard the segment. Return. + * If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable. + */ + if (tcp_ack (tcp0)) + { + if (ack0 <= tc0->iss || ack0 > tc0->snd_nxt) + { + if (!tcp_rst (tcp0)) + tcp_send_reset (b0, is_ip4); + + goto drop; + } + + /* Make sure ACK is valid */ + if (tc0->snd_una > ack0) + goto drop; + } + + /* + * 2. check the RST bit + */ + + if (tcp_rst (tcp0)) + { + /* If ACK is acceptable, signal client that peer is not + * willing to accept connection and drop connection*/ + if (tcp_ack (tcp0)) + { + stream_session_connect_notify (&tc0->connection, sst, + 1 /* fail */ ); + tcp_connection_cleanup (tc0); + } + goto drop; + } + + /* + * 3. check the security and precedence (skipped) + */ + + /* + * 4. check the SYN bit + */ + + /* No SYN flag. Drop. */ + if (!tcp_syn (tcp0)) + goto drop; + + /* Stop connection establishment and retransmit timers */ + tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH); + tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT_SYN); + + /* Valid SYN or SYN-ACK. Move connection from half-open pool to + * current thread pool. */ + pool_get (tm->connections[my_thread_index], new_tc0); + clib_memcpy (new_tc0, tc0, sizeof (*new_tc0)); + + new_tc0->c_thread_index = my_thread_index; + + /* Cleanup half-open connection XXX lock */ + pool_put (tm->half_open_connections, tc0); + + new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end; + new_tc0->irs = seq0; + + /* Parse options */ + tcp_options_parse (tcp0, &new_tc0->opt); + tcp_connection_init_vars (new_tc0); + + if (tcp_opts_tstamp (&new_tc0->opt)) + { + new_tc0->tsval_recent = new_tc0->opt.tsval; + new_tc0->tsval_recent_age = tcp_time_now (); + } + + if (tcp_opts_wscale (&new_tc0->opt)) + new_tc0->snd_wscale = new_tc0->opt.wscale; + + new_tc0->snd_wnd = clib_net_to_host_u32 (tcp0->window) + << new_tc0->snd_wscale; + new_tc0->snd_wl1 = seq0; + new_tc0->snd_wl2 = ack0; + + /* SYN-ACK: See if we can switch to ESTABLISHED state */ + if (tcp_ack (tcp0)) + { + /* Our SYN is ACKed: we have iss < ack = snd_una */ + + /* TODO Dequeue acknowledged segments if we support Fast Open */ + new_tc0->snd_una = ack0; + new_tc0->state = TCP_STATE_ESTABLISHED; + + /* Notify app that we have connection */ + stream_session_connect_notify (&new_tc0->connection, sst, 0); + + /* Make sure after data segment processing ACK is sent */ + new_tc0->flags |= TCP_CONN_SNDACK; + } + /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */ + else + { + new_tc0->state = TCP_STATE_SYN_RCVD; + + /* Notify app that we have connection XXX */ + stream_session_connect_notify (&new_tc0->connection, sst, 0); + + tcp_make_synack (new_tc0, b0); + next0 = tcp_next_output (is_ip4); + + goto drop; + } + + /* Read data, if any */ + if (n_data_bytes0) + { + error0 = + tcp_segment_rcv (tm, new_tc0, b0, n_data_bytes0, &next0); + if (error0 == TCP_ERROR_PURE_ACK) + error0 = TCP_ERROR_SYN_ACKS_RCVD; + } + else + { + tcp_make_ack (new_tc0, b0); + next0 = tcp_next_output (new_tc0->c_is_ip4); + } + + drop: + + b0->error = error0 ? node->errors[error0] : 0; + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + errors = session_manager_flush_enqueue_events (my_thread_index); + if (errors) + { + if (is_ip4) + vlib_node_increment_counter (vm, tcp4_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + else + vlib_node_increment_counter (vm, tcp6_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + } + + return from_frame->n_vectors; +} + +static uword +tcp4_syn_sent (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_syn_sent_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_syn_sent_rcv (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_syn_sent_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_syn_sent_node) = +{ + .function = tcp4_syn_sent, + .name = "tcp4-syn-sent", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_SYN_SENT_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_syn_sent_node, tcp4_syn_sent); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_syn_sent_node) = +{ + .function = tcp6_syn_sent_rcv, + .name = "tcp6-syn-sent", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_SYN_SENT_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_SYN_SENT_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + } +,}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_syn_sent_node, tcp6_syn_sent_rcv); +/** + * Handles reception for all states except LISTEN, SYN-SEND and ESTABLISHED + * as per RFC793 p. 64 + */ +always_inline uword +tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index, errors = 0; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_header_t *tcp0 = 0; + tcp_connection_t *tc0; + ip4_header_t *ip40; + ip6_header_t *ip60; + u32 n_advance_bytes0, n_data_bytes0; + u32 next0 = TCP_RCV_PROCESS_NEXT_DROP, error0 = TCP_ERROR_ENQUEUED; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, + my_thread_index); + + /* Checksum computed by ipx_local no need to compute again */ + + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + tcp0 = ip4_next_header (ip40); + n_advance_bytes0 = (ip4_header_bytes (ip40) + + tcp_header_bytes (tcp0)); + n_data_bytes0 = clib_net_to_host_u16 (ip40->length) + - n_advance_bytes0; + } + else + { + ip60 = vlib_buffer_get_current (b0); + tcp0 = ip6_next_header (ip60); + n_advance_bytes0 = tcp_header_bytes (tcp0); + n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length) + - n_advance_bytes0; + n_advance_bytes0 += sizeof (ip60[0]); + } + + /* SYNs, FINs and data consume sequence numbers */ + vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number + + tcp_is_syn (tcp0) + tcp_is_fin (tcp0) + n_data_bytes0; + + /* + * Special treatment for CLOSED + */ + switch (tc0->state) + { + case TCP_STATE_CLOSED: + goto drop; + break; + } + + /* + * For all other states (except LISTEN) + */ + + /* 1-4: check SEQ, RST, SYN */ + if (PREDICT_FALSE + (tcp_segment_validate (vm, tc0, b0, tcp0, &next0))) + { + error0 = TCP_ERROR_SEGMENT_INVALID; + goto drop; + } + + /* 5: check the ACK field */ + switch (tc0->state) + { + case TCP_STATE_SYN_RCVD: + /* + * If the segment acknowledgment is not acceptable, form a + * reset segment, + * + * and send it. + */ + if (!tcp_rcv_ack_is_acceptable (tc0, b0)) + { + tcp_send_reset (b0, is_ip4); + goto drop; + } + /* Switch state to ESTABLISHED */ + tc0->state = TCP_STATE_ESTABLISHED; + + /* Initialize session variables */ + tc0->snd_una = vnet_buffer (b0)->tcp.ack_number; + tc0->snd_wnd = clib_net_to_host_u32 (tcp0->window) + << tc0->opt.wscale; + tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; + tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; + + /* Shoulder tap the server */ + stream_session_accept_notify (&tc0->connection); + + tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT_SYN); + break; + case TCP_STATE_ESTABLISHED: + /* We can get packets in established state here because they + * were enqueued before state change */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + + break; + case TCP_STATE_FIN_WAIT_1: + /* In addition to the processing for the ESTABLISHED state, if + * our FIN is now acknowledged then enter FIN-WAIT-2 and + * continue processing in that state. */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + tc0->state = TCP_STATE_FIN_WAIT_2; + /* Stop all timers, 2MSL will be set lower */ + tcp_connection_timers_reset (tc0); + break; + case TCP_STATE_FIN_WAIT_2: + /* In addition to the processing for the ESTABLISHED state, if + * the retransmission queue is empty, the user's CLOSE can be + * acknowledged ("ok") but do not delete the TCB. */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + /* check if rtx queue is empty and ack CLOSE TODO */ + break; + case TCP_STATE_CLOSE_WAIT: + /* Do the same processing as for the ESTABLISHED state. */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + break; + case TCP_STATE_CLOSING: + /* In addition to the processing for the ESTABLISHED state, if + * the ACK acknowledges our FIN then enter the TIME-WAIT state, + * otherwise ignore the segment. */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + + /* XXX test that send queue empty */ + tc0->state = TCP_STATE_TIME_WAIT; + goto drop; + + break; + case TCP_STATE_LAST_ACK: + /* The only thing that can arrive in this state is an + * acknowledgment of our FIN. If our FIN is now acknowledged, + * delete the TCB, enter the CLOSED state, and return. */ + + if (!tcp_rcv_ack_is_acceptable (tc0, b0)) + goto drop; + + tcp_connection_del (tc0); + goto drop; + + break; + case TCP_STATE_TIME_WAIT: + /* The only thing that can arrive in this state is a + * retransmission of the remote FIN. Acknowledge it, and restart + * the 2 MSL timeout. */ + + /* TODO */ + goto drop; + break; + default: + ASSERT (0); + } + + /* 6: check the URG bit TODO */ + + /* 7: process the segment text */ + switch (tc0->state) + { + case TCP_STATE_ESTABLISHED: + case TCP_STATE_FIN_WAIT_1: + case TCP_STATE_FIN_WAIT_2: + error0 = tcp_segment_rcv (tm, tc0, b0, n_data_bytes0, &next0); + break; + case TCP_STATE_CLOSE_WAIT: + case TCP_STATE_CLOSING: + case TCP_STATE_LAST_ACK: + case TCP_STATE_TIME_WAIT: + /* This should not occur, since a FIN has been received from the + * remote side. Ignore the segment text. */ + break; + } + + /* 8: check the FIN bit */ + if (!tcp_fin (tcp0)) + goto drop; + + switch (tc0->state) + { + case TCP_STATE_ESTABLISHED: + case TCP_STATE_SYN_RCVD: + /* Send FIN-ACK notify app and enter CLOSE-WAIT */ + tcp_connection_timers_reset (tc0); + tcp_make_finack (tc0, b0); + next0 = tcp_next_output (tc0->c_is_ip4); + stream_session_disconnect_notify (&tc0->connection); + tc0->state = TCP_STATE_CLOSE_WAIT; + break; + case TCP_STATE_CLOSE_WAIT: + case TCP_STATE_CLOSING: + case TCP_STATE_LAST_ACK: + /* move along .. */ + break; + case TCP_STATE_FIN_WAIT_1: + tc0->state = TCP_STATE_TIME_WAIT; + tcp_connection_timers_reset (tc0); + tcp_timer_set (tc0, TCP_TIMER_2MSL, TCP_2MSL_TIME); + break; + case TCP_STATE_FIN_WAIT_2: + /* Got FIN, send ACK! */ + tc0->state = TCP_STATE_TIME_WAIT; + tcp_timer_set (tc0, TCP_TIMER_2MSL, TCP_2MSL_TIME); + tcp_make_ack (tc0, b0); + next0 = tcp_next_output (is_ip4); + break; + case TCP_STATE_TIME_WAIT: + /* Remain in the TIME-WAIT state. Restart the 2 MSL time-wait + * timeout. + */ + tcp_timer_update (tc0, TCP_TIMER_2MSL, TCP_2MSL_TIME); + break; + } + + b0->error = error0 ? node->errors[error0] : 0; + + drop: + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + errors = session_manager_flush_enqueue_events (my_thread_index); + if (errors) + { + if (is_ip4) + vlib_node_increment_counter (vm, tcp4_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + else + vlib_node_increment_counter (vm, tcp6_established_node.index, + TCP_ERROR_EVENT_FIFO_FULL, errors); + } + + return from_frame->n_vectors; +} + +static uword +tcp4_rcv_process (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_rcv_process_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_rcv_process (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_rcv_process_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_rcv_process_node) = +{ + .function = tcp4_rcv_process, + .name = "tcp4-rcv-process", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_RCV_PROCESS_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_rcv_process_node, tcp4_rcv_process); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_rcv_process_node) = +{ + .function = tcp6_rcv_process, + .name = "tcp6-rcv-process", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_RCV_PROCESS_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_RCV_PROCESS_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_rcv_process_node, tcp6_rcv_process); + +vlib_node_registration_t tcp4_listen_node; +vlib_node_registration_t tcp6_listen_node; + +/** + * LISTEN state processing as per RFC 793 p. 65 + */ +always_inline uword +tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index; + tcp_main_t *tm = vnet_get_tcp_main (); + u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_header_t *th0 = 0; + tcp_connection_t *lc0; + ip4_header_t *ip40; + ip6_header_t *ip60; + tcp_connection_t *child0; + u32 error0 = TCP_ERROR_SYNS_RCVD, next0 = TCP_LISTEN_NEXT_DROP; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + lc0 = tcp_listener_get (vnet_buffer (b0)->tcp.connection_index); + + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + th0 = ip4_next_header (ip40); + } + else + { + ip60 = vlib_buffer_get_current (b0); + th0 = ip6_next_header (ip60); + } + + /* Create child session. For syn-flood protection use filter */ + + /* 1. first check for an RST */ + if (tcp_rst (th0)) + goto drop; + + /* 2. second check for an ACK */ + if (tcp_ack (th0)) + { + tcp_send_reset (b0, is_ip4); + goto drop; + } + + /* 3. check for a SYN (did that already) */ + + /* Create child session and send SYN-ACK */ + pool_get (tm->connections[my_thread_index], child0); + memset (child0, 0, sizeof (*child0)); + + child0->c_c_index = child0 - tm->connections[my_thread_index]; + child0->c_lcl_port = lc0->c_lcl_port; + child0->c_rmt_port = th0->src_port; + child0->c_is_ip4 = is_ip4; + child0->c_thread_index = my_thread_index; + + if (is_ip4) + { + child0->c_lcl_ip4.as_u32 = ip40->dst_address.as_u32; + child0->c_rmt_ip4.as_u32 = ip40->src_address.as_u32; + } + else + { + clib_memcpy (&child0->c_lcl_ip6, &ip60->dst_address, + sizeof (ip6_address_t)); + clib_memcpy (&child0->c_rmt_ip6, &ip60->src_address, + sizeof (ip6_address_t)); + } + + if (stream_session_accept (&child0->connection, lc0->c_s_index, sst, + 0 /* notify */ )) + { + error0 = TCP_ERROR_CREATE_SESSION_FAIL; + goto drop; + } + + tcp_options_parse (th0, &child0->opt); + tcp_connection_init_vars (child0); + + child0->irs = vnet_buffer (b0)->tcp.seq_number; + child0->rcv_nxt = vnet_buffer (b0)->tcp.seq_number + 1; + child0->state = TCP_STATE_SYN_RCVD; + + /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK} + * segments are used to initialize PAWS. */ + if (tcp_opts_tstamp (&child0->opt)) + { + child0->tsval_recent = child0->opt.tsval; + child0->tsval_recent_age = tcp_time_now (); + } + + /* Reuse buffer to make syn-ack and send */ + tcp_make_synack (child0, b0); + next0 = tcp_next_output (is_ip4); + + drop: + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + b0->error = error0 ? node->errors[error0] : 0; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + return from_frame->n_vectors; +} + +static uword +tcp4_listen (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_listen_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_listen (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_listen_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_listen_node) = +{ + .function = tcp4_listen, + .name = "tcp4-listen", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_LISTEN_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_LISTEN_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_listen_node, tcp4_listen); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_listen_node) = +{ + .function = tcp6_listen, + .name = "tcp6-listen", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_LISTEN_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_LISTEN_NEXT_##s] = n, + foreach_tcp_state_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_listen_node, tcp6_listen); + +vlib_node_registration_t tcp4_input_node; +vlib_node_registration_t tcp6_input_node; + +typedef enum _tcp_input_next +{ + TCP_INPUT_NEXT_DROP, + TCP_INPUT_NEXT_LISTEN, + TCP_INPUT_NEXT_RCV_PROCESS, + TCP_INPUT_NEXT_SYN_SENT, + TCP_INPUT_NEXT_ESTABLISHED, + TCP_INPUT_NEXT_RESET, + TCP_INPUT_N_NEXT +} tcp_input_next_t; + +#define foreach_tcp4_input_next \ + _ (DROP, "error-drop") \ + _ (LISTEN, "tcp4-listen") \ + _ (RCV_PROCESS, "tcp4-rcv-process") \ + _ (SYN_SENT, "tcp4-syn-sent") \ + _ (ESTABLISHED, "tcp4-established") \ + _ (RESET, "tcp4-reset") + +#define foreach_tcp6_input_next \ + _ (DROP, "error-drop") \ + _ (LISTEN, "tcp6-listen") \ + _ (RCV_PROCESS, "tcp6-rcv-process") \ + _ (SYN_SENT, "tcp6-syn-sent") \ + _ (ESTABLISHED, "tcp6-established") \ + _ (RESET, "tcp6-reset") + +typedef struct +{ + u16 src_port; + u16 dst_port; + u8 state; +} tcp_rx_trace_t; + +const char *tcp_fsm_states[] = { +#define _(sym, str) str, + foreach_tcp_fsm_state +#undef _ +}; + +u8 * +format_tcp_state (u8 * s, va_list * args) +{ + tcp_state_t *state = va_arg (*args, tcp_state_t *); + + if (state[0] < TCP_N_STATES) + s = format (s, "%s", tcp_fsm_states[state[0]]); + else + s = format (s, "UNKNOWN"); + + return s; +} + +u8 * +format_tcp_rx_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + tcp_rx_trace_t *t = va_arg (*args, tcp_rx_trace_t *); + + s = format (s, "TCP: src-port %d dst-port %U%s\n", + clib_net_to_host_u16 (t->src_port), + clib_net_to_host_u16 (t->dst_port), format_tcp_state, t->state); + + return s; +} + +#define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN) + +always_inline uword +tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index; + tcp_main_t *tm = vnet_get_tcp_main (); + session_manager_main_t *ssm = vnet_get_session_manager_main (); + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_header_t *tcp0 = 0; + tcp_connection_t *tc0; + ip4_header_t *ip40; + ip6_header_t *ip60; + u32 error0 = TCP_ERROR_NO_LISTENER, next0 = TCP_INPUT_NEXT_DROP; + u8 flags0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + tcp0 = ip4_next_header (ip40); + + /* lookup session */ + tc0 = + (tcp_connection_t *) stream_session_lookup_transport4 (ssm, + &ip40->dst_address, + &ip40->src_address, + tcp0->dst_port, + tcp0->src_port, + SESSION_TYPE_IP4_TCP, + my_thread_index); + } + else + { + ip60 = vlib_buffer_get_current (b0); + tcp0 = ip6_next_header (ip60); + tc0 = + (tcp_connection_t *) stream_session_lookup_transport6 (ssm, + &ip60->src_address, + &ip60->dst_address, + tcp0->src_port, + tcp0->dst_port, + SESSION_TYPE_IP6_TCP, + my_thread_index); + } + + /* Session exists */ + if (PREDICT_TRUE (0 != tc0)) + { + /* Save connection index */ + vnet_buffer (b0)->tcp.connection_index = tc0->c_c_index; + vnet_buffer (b0)->tcp.seq_number = + clib_net_to_host_u32 (tcp0->seq_number); + vnet_buffer (b0)->tcp.ack_number = + clib_net_to_host_u32 (tcp0->ack_number); + + flags0 = tcp0->flags & filter_flags; + next0 = tm->dispatch_table[tc0->state][flags0].next; + error0 = tm->dispatch_table[tc0->state][flags0].error; + + if (PREDICT_FALSE (error0 == TCP_ERROR_DISPATCH)) + { + /* Overload tcp flags to store state */ + vnet_buffer (b0)->tcp.flags = tc0->state; + } + } + else + { + /* Send reset */ + next0 = TCP_INPUT_NEXT_RESET; + error0 = TCP_ERROR_NO_LISTENER; + vnet_buffer (b0)->tcp.flags = 0; + } + + b0->error = error0 ? node->errors[error0] : 0; + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return from_frame->n_vectors; +} + +static uword +tcp4_input (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_input (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_input_node) = +{ + .function = tcp4_input, + .name = "tcp4-input", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_INPUT_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_INPUT_NEXT_##s] = n, + foreach_tcp4_input_next +#undef _ + }, + .format_buffer = format_tcp_header, + .format_trace = format_tcp_rx_trace, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_input_node, tcp4_input); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_input_node) = +{ + .function = tcp6_input, + .name = "tcp6-input", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_INPUT_N_NEXT, + .next_nodes = + { +#define _(s,n) [TCP_INPUT_NEXT_##s] = n, + foreach_tcp6_input_next +#undef _ + }, + .format_buffer = format_tcp_header, + .format_trace = format_tcp_rx_trace, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_input_node, tcp6_input); +void +tcp_update_time (f64 now, u32 thread_index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + tw_timer_expire_timers_16t_2w_512sl (&tm->timer_wheels[thread_index], now); +} + +static void +tcp_dispatch_table_init (tcp_main_t * tm) +{ + int i, j; + for (i = 0; i < ARRAY_LEN (tm->dispatch_table); i++) + for (j = 0; j < ARRAY_LEN (tm->dispatch_table[i]); j++) + { + tm->dispatch_table[i][j].next = TCP_INPUT_NEXT_DROP; + tm->dispatch_table[i][j].error = TCP_ERROR_DISPATCH; + } + +#define _(t,f,n,e) \ +do { \ + tm->dispatch_table[TCP_STATE_##t][f].next = (n); \ + tm->dispatch_table[TCP_STATE_##t][f].error = (e); \ +} while (0) + + /* SYNs for new connections -> tcp-listen. */ + _(LISTEN, TCP_FLAG_SYN, TCP_INPUT_NEXT_LISTEN, TCP_ERROR_NONE); + /* ACK for for a SYN-ACK -> tcp-rcv-process. */ + _(SYN_RCVD, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + /* SYN-ACK for a SYN */ + _(SYN_SENT, TCP_FLAG_SYN | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, + TCP_ERROR_NONE); + _(SYN_SENT, TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE); + _(SYN_SENT, TCP_FLAG_RST, TCP_INPUT_NEXT_SYN_SENT, TCP_ERROR_NONE); + _(SYN_SENT, TCP_FLAG_RST | TCP_FLAG_ACK, TCP_INPUT_NEXT_SYN_SENT, + TCP_ERROR_NONE); + /* ACK for for established connection -> tcp-established. */ + _(ESTABLISHED, TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); + /* FIN for for established connection -> tcp-established. */ + _(ESTABLISHED, TCP_FLAG_FIN, TCP_INPUT_NEXT_ESTABLISHED, TCP_ERROR_NONE); + _(ESTABLISHED, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_ESTABLISHED, + TCP_ERROR_NONE); + /* ACK or FIN-ACK to our FIN */ + _(FIN_WAIT_1, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(FIN_WAIT_1, TCP_FLAG_ACK | TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); + /* FIN in reply to our FIN from the other side */ + _(FIN_WAIT_1, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + /* FIN confirming that the peer (app) has closed */ + _(FIN_WAIT_2, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); + _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); +#undef _ +} + +clib_error_t * +tcp_input_init (vlib_main_t * vm) +{ + clib_error_t *error = 0; + tcp_main_t *tm = vnet_get_tcp_main (); + + if ((error = vlib_call_init_function (vm, tcp_init))) + return error; + + /* Initialize dispatch table. */ + tcp_dispatch_table_init (tm); + + return error; +} + +VLIB_INIT_FUNCTION (tcp_input_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c new file mode 100644 index 00000000..856dffe4 --- /dev/null +++ b/src/vnet/tcp/tcp_newreno.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +void +newreno_congestion (tcp_connection_t * tc) +{ + tc->prev_ssthresh = tc->ssthresh; + tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); +} + +void +newreno_recovered (tcp_connection_t * tc) +{ + tc->cwnd = tc->ssthresh; +} + +void +newreno_rcv_ack (tcp_connection_t * tc) +{ + if (tcp_in_slowstart (tc)) + { + tc->cwnd += clib_min (tc->snd_mss, tc->bytes_acked); + } + else + { + /* Round up to 1 if needed */ + tc->cwnd += clib_max (tc->snd_mss * tc->snd_mss / tc->cwnd, 1); + } +} + +void +newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type) +{ + if (ack_type == TCP_CC_DUPACK) + { + tc->cwnd += tc->snd_mss; + } + else if (ack_type == TCP_CC_PARTIALACK) + { + tc->cwnd -= tc->bytes_acked; + if (tc->bytes_acked > tc->snd_mss) + tc->bytes_acked += tc->snd_mss; + } +} + +void +newreno_conn_init (tcp_connection_t * tc) +{ + tc->ssthresh = tc->snd_wnd; + tc->cwnd = tcp_initial_cwnd (tc); +} + +const static tcp_cc_algorithm_t tcp_newreno = { + .congestion = newreno_congestion, + .recovered = newreno_recovered, + .rcv_ack = newreno_rcv_ack, + .rcv_cong_ack = newreno_rcv_cong_ack, + .init = newreno_conn_init +}; + +clib_error_t * +newreno_init (vlib_main_t * vm) +{ + clib_error_t *error = 0; + + tcp_cc_algo_register (TCP_CC_NEWRENO, &tcp_newreno); + + return error; +} + +VLIB_INIT_FUNCTION (newreno_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c new file mode 100644 index 00000000..dbcf1f74 --- /dev/null +++ b/src/vnet/tcp/tcp_output.c @@ -0,0 +1,1412 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +vlib_node_registration_t tcp4_output_node; +vlib_node_registration_t tcp6_output_node; + +typedef enum _tcp_output_nect +{ + TCP_OUTPUT_NEXT_DROP, + TCP_OUTPUT_NEXT_IP_LOOKUP, + TCP_OUTPUT_N_NEXT +} tcp_output_next_t; + +#define foreach_tcp4_output_next \ + _ (DROP, "error-drop") \ + _ (IP_LOOKUP, "ip4-lookup") + +#define foreach_tcp6_output_next \ + _ (DROP, "error-drop") \ + _ (IP_LOOKUP, "ip6-lookup") + +static char *tcp_error_strings[] = { +#define tcp_error(n,s) s, +#include +#undef tcp_error +}; + +typedef struct +{ + u16 src_port; + u16 dst_port; + u8 state; +} tcp_tx_trace_t; + +u16 dummy_mtu = 400; + +u8 * +format_tcp_tx_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + + s = format (s, "TBD\n"); + + return s; +} + +void +tcp_set_snd_mss (tcp_connection_t * tc) +{ + u16 snd_mss; + + /* TODO find our iface MTU */ + snd_mss = dummy_mtu; + + /* TODO cache mss and consider PMTU discovery */ + snd_mss = tc->opt.mss < snd_mss ? tc->opt.mss : snd_mss; + + tc->snd_mss = snd_mss; + + if (tc->snd_mss == 0) + { + clib_warning ("snd mss is 0"); + tc->snd_mss = dummy_mtu; + } +} + +static u8 +tcp_window_compute_scale (u32 available_space) +{ + u8 wnd_scale = 0; + while (wnd_scale < TCP_MAX_WND_SCALE + && (available_space >> wnd_scale) > TCP_WND_MAX) + wnd_scale++; + return wnd_scale; +} + +/** + * Compute initial window and scale factor. As per RFC1323, window field in + * SYN and SYN-ACK segments is never scaled. + */ +u32 +tcp_initial_window_to_advertise (tcp_connection_t * tc) +{ + u32 available_space; + + /* Initial wnd for SYN. Fifos are not allocated yet. + * Use some predefined value */ + if (tc->state != TCP_STATE_SYN_RCVD) + { + return TCP_DEFAULT_RX_FIFO_SIZE; + } + + available_space = stream_session_max_enqueue (&tc->connection); + tc->rcv_wscale = tcp_window_compute_scale (available_space); + tc->rcv_wnd = clib_min (available_space, TCP_WND_MAX << tc->rcv_wscale); + + return clib_min (tc->rcv_wnd, TCP_WND_MAX); +} + +/** + * Compute and return window to advertise, scaled as per RFC1323 + */ +u32 +tcp_window_to_advertise (tcp_connection_t * tc, tcp_state_t state) +{ + u32 available_space, wnd, scaled_space; + + if (state != TCP_STATE_ESTABLISHED) + return tcp_initial_window_to_advertise (tc); + + available_space = stream_session_max_enqueue (&tc->connection); + scaled_space = available_space >> tc->rcv_wscale; + + /* Need to update scale */ + if (PREDICT_FALSE ((scaled_space == 0 && available_space != 0)) + || (scaled_space >= TCP_WND_MAX)) + tc->rcv_wscale = tcp_window_compute_scale (available_space); + + wnd = clib_min (available_space, TCP_WND_MAX << tc->rcv_wscale); + tc->rcv_wnd = wnd; + + return wnd >> tc->rcv_wscale; +} + +/** + * Write TCP options to segment. + */ +u32 +tcp_options_write (u8 * data, tcp_options_t * opts) +{ + u32 opts_len = 0; + u32 buf, seq_len = 4; + + if (tcp_opts_mss (opts)) + { + *data++ = TCP_OPTION_MSS; + *data++ = TCP_OPTION_LEN_MSS; + buf = clib_host_to_net_u16 (opts->mss); + clib_memcpy (data, &buf, sizeof (opts->mss)); + data += sizeof (opts->mss); + opts_len += TCP_OPTION_LEN_MSS; + } + + if (tcp_opts_wscale (opts)) + { + *data++ = TCP_OPTION_WINDOW_SCALE; + *data++ = TCP_OPTION_LEN_WINDOW_SCALE; + *data++ = opts->wscale; + opts_len += TCP_OPTION_LEN_WINDOW_SCALE; + } + + if (tcp_opts_sack_permitted (opts)) + { + *data++ = TCP_OPTION_SACK_PERMITTED; + *data++ = TCP_OPTION_LEN_SACK_PERMITTED; + opts_len += TCP_OPTION_LEN_SACK_PERMITTED; + } + + if (tcp_opts_tstamp (opts)) + { + *data++ = TCP_OPTION_TIMESTAMP; + *data++ = TCP_OPTION_LEN_TIMESTAMP; + buf = clib_host_to_net_u32 (opts->tsval); + clib_memcpy (data, &buf, sizeof (opts->tsval)); + data += sizeof (opts->tsval); + buf = clib_host_to_net_u32 (opts->tsecr); + clib_memcpy (data, &buf, sizeof (opts->tsecr)); + data += sizeof (opts->tsecr); + opts_len += TCP_OPTION_LEN_TIMESTAMP; + } + + if (tcp_opts_sack (opts)) + { + int i; + u32 n_sack_blocks = clib_min (vec_len (opts->sacks), + TCP_OPTS_MAX_SACK_BLOCKS); + + if (n_sack_blocks != 0) + { + *data++ = TCP_OPTION_SACK_BLOCK; + *data++ = 2 + n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK; + for (i = 0; i < n_sack_blocks; i++) + { + buf = clib_host_to_net_u32 (opts->sacks[i].start); + clib_memcpy (data, &buf, seq_len); + data += seq_len; + buf = clib_host_to_net_u32 (opts->sacks[i].end); + clib_memcpy (data, &buf, seq_len); + data += seq_len; + } + opts_len += 2 + n_sack_blocks * TCP_OPTION_LEN_SACK_BLOCK; + } + } + + /* Terminate TCP options */ + if (opts_len % 4) + { + *data++ = TCP_OPTION_EOL; + opts_len += TCP_OPTION_LEN_EOL; + } + + /* Pad with zeroes to a u32 boundary */ + while (opts_len % 4) + { + *data++ = TCP_OPTION_NOOP; + opts_len += TCP_OPTION_LEN_NOOP; + } + return opts_len; +} + +always_inline int +tcp_make_syn_options (tcp_options_t * opts, u32 initial_wnd) +{ + u8 len = 0; + + opts->flags |= TCP_OPTS_FLAG_MSS; + opts->mss = dummy_mtu; /*XXX discover that */ + len += TCP_OPTION_LEN_MSS; + + opts->flags |= TCP_OPTS_FLAG_WSCALE; + opts->wscale = tcp_window_compute_scale (initial_wnd); + len += TCP_OPTION_LEN_WINDOW_SCALE; + + opts->flags |= TCP_OPTS_FLAG_TSTAMP; + opts->tsval = tcp_time_now (); + opts->tsecr = 0; + len += TCP_OPTION_LEN_TIMESTAMP; + + opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; + len += TCP_OPTION_LEN_SACK_PERMITTED; + + /* Align to needed boundary */ + len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN; + return len; +} + +always_inline int +tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) +{ + u8 len = 0; + + opts->flags |= TCP_OPTS_FLAG_MSS; + opts->mss = dummy_mtu; /*XXX discover that */ + len += TCP_OPTION_LEN_MSS; + + if (tcp_opts_wscale (&tc->opt)) + { + opts->flags |= TCP_OPTS_FLAG_WSCALE; + opts->wscale = tc->rcv_wscale; + len += TCP_OPTION_LEN_WINDOW_SCALE; + } + + if (tcp_opts_tstamp (&tc->opt)) + { + opts->flags |= TCP_OPTS_FLAG_TSTAMP; + opts->tsval = tcp_time_now (); + opts->tsecr = tc->tsval_recent; + len += TCP_OPTION_LEN_TIMESTAMP; + } + + if (tcp_opts_sack_permitted (&tc->opt)) + { + opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; + len += TCP_OPTION_LEN_SACK_PERMITTED; + } + + /* Align to needed boundary */ + len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN; + return len; +} + +always_inline int +tcp_make_established_options (tcp_connection_t * tc, tcp_options_t * opts) +{ + u8 len = 0; + + opts->flags = 0; + + if (tcp_opts_tstamp (&tc->opt)) + { + opts->flags |= TCP_OPTS_FLAG_TSTAMP; + opts->tsval = tcp_time_now (); + opts->tsecr = tc->tsval_recent; + len += TCP_OPTION_LEN_TIMESTAMP; + } + if (tcp_opts_sack_permitted (&tc->opt)) + { + if (vec_len (tc->snd_sacks)) + { + opts->flags |= TCP_OPTS_FLAG_SACK; + opts->sacks = tc->snd_sacks; + opts->n_sack_blocks = vec_len (tc->snd_sacks); + len += 2 + TCP_OPTION_LEN_SACK_BLOCK * opts->n_sack_blocks; + } + } + + /* Align to needed boundary */ + len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN; + return len; +} + +always_inline int +tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, + tcp_state_t state) +{ + switch (state) + { + case TCP_STATE_ESTABLISHED: + case TCP_STATE_FIN_WAIT_1: + return tcp_make_established_options (tc, opts); + case TCP_STATE_SYN_RCVD: + return tcp_make_synack_options (tc, opts); + case TCP_STATE_SYN_SENT: + return tcp_make_syn_options (opts, + tcp_initial_window_to_advertise (tc)); + default: + clib_warning ("Not handled!"); + return 0; + } +} + +#define tcp_get_free_buffer_index(tm, bidx) \ +do { \ + u32 *my_tx_buffers, n_free_buffers; \ + u32 cpu_index = tm->vlib_main->cpu_index; \ + my_tx_buffers = tm->tx_buffers[cpu_index]; \ + if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ + { \ + n_free_buffers = 32; /* TODO config or macro */ \ + vec_validate (my_tx_buffers, n_free_buffers - 1); \ + _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \ + tm->vlib_main, my_tx_buffers, n_free_buffers, \ + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \ + tm->tx_buffers[cpu_index] = my_tx_buffers; \ + } \ + /* buffer shortage */ \ + if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \ + return; \ + *bidx = my_tx_buffers[_vec_len (my_tx_buffers)-1]; \ + _vec_len (my_tx_buffers) -= 1; \ +} while (0) + +always_inline void +tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) +{ + vlib_buffer_t *it = b; + do + { + it->current_data = 0; + it->current_length = 0; + it->total_length_not_including_first_buffer = 0; + } + while ((it->flags & VLIB_BUFFER_NEXT_PRESENT) + && (it = vlib_get_buffer (vm, it->next_buffer))); + + /* Leave enough space for headers */ + vlib_buffer_make_headroom (b, MAX_HDRS_LEN); +} + +/** + * Prepare ACK + */ +void +tcp_make_ack_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_state_t state, + u8 flags) +{ + tcp_options_t _snd_opts, *snd_opts = &_snd_opts; + u8 tcp_opts_len, tcp_hdr_opts_len; + tcp_header_t *th; + u16 wnd; + + wnd = tcp_window_to_advertise (tc, state); + + /* Make and write options */ + tcp_opts_len = tcp_make_established_options (tc, snd_opts); + tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); + + th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, + tc->rcv_nxt, tcp_hdr_opts_len, flags, wnd); + + tcp_options_write ((u8 *) (th + 1), snd_opts); + + /* Mark as ACK */ + vnet_buffer (b)->tcp.connection_index = tc->c_c_index; +} + +/** + * Convert buffer to ACK + */ +void +tcp_make_ack (tcp_connection_t * tc, vlib_buffer_t * b) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + + tcp_reuse_buffer (vm, b); + tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK); + vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; +} + +/** + * Convert buffer to FIN-ACK + */ +void +tcp_make_finack (tcp_connection_t * tc, vlib_buffer_t * b) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + + tcp_reuse_buffer (vm, b); + tcp_make_ack_i (tc, b, TCP_STATE_ESTABLISHED, TCP_FLAG_ACK | TCP_FLAG_FIN); + + /* Reset flags, make sure ack is sent */ + tc->flags = TCP_CONN_SNDACK; + vnet_buffer (b)->tcp.flags &= ~TCP_BUF_FLAG_DUPACK; + + tc->snd_nxt += 1; +} + +/** + * Convert buffer to SYN-ACK + */ +void +tcp_make_synack (tcp_connection_t * tc, vlib_buffer_t * b) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + tcp_options_t _snd_opts, *snd_opts = &_snd_opts; + u8 tcp_opts_len, tcp_hdr_opts_len; + tcp_header_t *th; + u16 initial_wnd; + u32 time_now; + + memset (snd_opts, 0, sizeof (*snd_opts)); + + tcp_reuse_buffer (vm, b); + + /* Set random initial sequence */ + time_now = tcp_time_now (); + + tc->iss = random_u32 (&time_now); + tc->snd_una = tc->iss; + tc->snd_nxt = tc->iss + 1; + tc->snd_una_max = tc->snd_nxt; + + initial_wnd = tcp_initial_window_to_advertise (tc); + + /* Make and write options */ + tcp_opts_len = tcp_make_synack_options (tc, snd_opts); + tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); + + th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss, + tc->rcv_nxt, tcp_hdr_opts_len, + TCP_FLAG_SYN | TCP_FLAG_ACK, initial_wnd); + + tcp_options_write ((u8 *) (th + 1), snd_opts); + + vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + vnet_buffer (b)->tcp.flags = TCP_BUF_FLAG_ACK; + + /* Init retransmit timer */ + tcp_retransmit_timer_set (tm, tc); +} + +always_inline void +tcp_enqueue_to_ip_lookup (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4) +{ + u32 *to_next, next_index; + vlib_frame_t *f; + + b->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + b->error = 0; + + /* Default FIB for now */ + vnet_buffer (b)->sw_if_index[VLIB_TX] = 0; + + /* Send to IP lookup */ + next_index = is_ip4 ? ip4_lookup_node.index : ip6_lookup_node.index; + f = vlib_get_frame_to_node (vm, next_index); + + /* Enqueue the packet */ + to_next = vlib_frame_vector_args (f); + to_next[0] = bi; + f->n_vectors = 1; + vlib_put_frame_to_node (vm, next_index, f); +} + +int +tcp_make_reset_in_place (vlib_main_t * vm, vlib_buffer_t * b0, + tcp_state_t state, u32 my_thread_index, u8 is_ip4) +{ + u8 tcp_hdr_len = sizeof (tcp_header_t); + ip4_header_t *ih4; + ip6_header_t *ih6; + tcp_header_t *th0; + ip4_address_t src_ip40; + ip6_address_t src_ip60; + u16 src_port0; + u32 tmp; + + /* Find IP and TCP headers */ + if (is_ip4) + { + ih4 = vlib_buffer_get_current (b0); + th0 = ip4_next_header (ih4); + } + else + { + ih6 = vlib_buffer_get_current (b0); + th0 = ip6_next_header (ih6); + } + + /* Swap src and dst ip */ + if (is_ip4) + { + ASSERT ((ih4->ip_version_and_header_length & 0xF0) == 0x40); + src_ip40.as_u32 = ih4->src_address.as_u32; + ih4->src_address.as_u32 = ih4->dst_address.as_u32; + ih4->dst_address.as_u32 = src_ip40.as_u32; + + /* Chop the end of the pkt */ + b0->current_length += ip4_header_bytes (ih4) + tcp_hdr_len; + } + else + { + ASSERT ((ih6->ip_version_traffic_class_and_flow_label & 0xF0) == 0x60); + clib_memcpy (&src_ip60, &ih6->src_address, sizeof (ip6_address_t)); + clib_memcpy (&ih6->src_address, &ih6->dst_address, + sizeof (ip6_address_t)); + clib_memcpy (&ih6->dst_address, &src_ip60, sizeof (ip6_address_t)); + + /* Chop the end of the pkt */ + b0->current_length += sizeof (ip6_header_t) + tcp_hdr_len; + } + + /* Try to determine what/why we're actually resetting and swap + * src and dst ports */ + if (state == TCP_STATE_CLOSED) + { + if (!tcp_syn (th0)) + return -1; + + tmp = clib_net_to_host_u32 (th0->seq_number); + + /* Got a SYN for no listener. */ + th0->flags = TCP_FLAG_RST | TCP_FLAG_ACK; + th0->ack_number = clib_host_to_net_u32 (tmp + 1); + th0->seq_number = 0; + + } + else if (state >= TCP_STATE_SYN_SENT) + { + th0->flags = TCP_FLAG_RST | TCP_FLAG_ACK; + th0->seq_number = th0->ack_number; + th0->ack_number = 0; + } + + src_port0 = th0->src_port; + th0->src_port = th0->dst_port; + th0->dst_port = src_port0; + th0->window = 0; + th0->data_offset_and_reserved = (tcp_hdr_len >> 2) << 4; + th0->urgent_pointer = 0; + + /* Compute checksum */ + if (is_ip4) + { + th0->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ih4); + } + else + { + int bogus = ~0; + th0->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b0, ih6, &bogus); + ASSERT (!bogus); + } + + return 0; +} + +/** + * Send reset without reusing existing buffer + */ +void +tcp_send_reset (vlib_buffer_t * pkt, u8 is_ip4) +{ + vlib_buffer_t *b; + u32 bi; + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + u8 tcp_hdr_len, flags = 0; + tcp_header_t *th, *pkt_th; + u32 seq, ack; + ip4_header_t *ih4, *pkt_ih4; + ip6_header_t *ih6, *pkt_ih6; + + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + /* Leave enough space for headers */ + vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + + /* Make and write options */ + tcp_hdr_len = sizeof (tcp_header_t); + + if (is_ip4) + { + pkt_ih4 = vlib_buffer_get_current (pkt); + pkt_th = ip4_next_header (pkt_ih4); + } + else + { + pkt_ih6 = vlib_buffer_get_current (pkt); + pkt_th = ip6_next_header (pkt_ih6); + } + + if (tcp_ack (pkt_th)) + { + flags = TCP_FLAG_RST; + seq = pkt_th->ack_number; + ack = 0; + } + else + { + flags = TCP_FLAG_RST | TCP_FLAG_ACK; + seq = 0; + ack = clib_host_to_net_u32 (vnet_buffer (pkt)->tcp.seq_end); + } + + th = vlib_buffer_push_tcp_net_order (b, pkt_th->dst_port, pkt_th->src_port, + seq, ack, tcp_hdr_len, flags, 0); + + /* Swap src and dst ip */ + if (is_ip4) + { + ASSERT ((pkt_ih4->ip_version_and_header_length & 0xF0) == 0x40); + ih4 = vlib_buffer_push_ip4 (vm, b, &pkt_ih4->dst_address, + &pkt_ih4->src_address, IP_PROTOCOL_TCP); + th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ih4); + } + else + { + int bogus = ~0; + pkt_ih6 = (ip6_header_t *) (pkt_th - 1); + ASSERT ((pkt_ih6->ip_version_traffic_class_and_flow_label & 0xF0) == + 0x60); + ih6 = + vlib_buffer_push_ip6 (vm, b, &pkt_ih6->dst_address, + &pkt_ih6->src_address, IP_PROTOCOL_TCP); + th->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ih6, &bogus); + ASSERT (!bogus); + } + + tcp_enqueue_to_ip_lookup (vm, b, bi, is_ip4); +} + +void +tcp_push_ip_hdr (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b) +{ + tcp_header_t *th = vlib_buffer_get_current (b); + + if (tc->c_is_ip4) + { + ip4_header_t *ih; + ih = vlib_buffer_push_ip4 (tm->vlib_main, b, &tc->c_lcl_ip4, + &tc->c_rmt_ip4, IP_PROTOCOL_TCP); + th->checksum = ip4_tcp_udp_compute_checksum (tm->vlib_main, b, ih); + } + else + { + ip6_header_t *ih; + int bogus = ~0; + + ih = vlib_buffer_push_ip6 (tm->vlib_main, b, &tc->c_lcl_ip6, + &tc->c_rmt_ip6, IP_PROTOCOL_TCP); + th->checksum = ip6_tcp_udp_icmp_compute_checksum (tm->vlib_main, b, ih, + &bogus); + ASSERT (!bogus); + } +} + +/** + * Send SYN + * + * Builds a SYN packet for a half-open connection and sends it to ipx_lookup. + * The packet is not forwarded through tcpx_output to avoid doing lookups + * in the half_open pool. + */ +void +tcp_send_syn (tcp_connection_t * tc) +{ + vlib_buffer_t *b; + u32 bi; + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + u8 tcp_hdr_opts_len, tcp_opts_len; + tcp_header_t *th; + u32 time_now; + u16 initial_wnd; + tcp_options_t snd_opts; + + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + /* Leave enough space for headers */ + vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + + /* Set random initial sequence */ + time_now = tcp_time_now (); + + tc->iss = random_u32 (&time_now); + tc->snd_una = tc->iss; + tc->snd_una_max = tc->snd_nxt = tc->iss + 1; + + initial_wnd = tcp_initial_window_to_advertise (tc); + + /* Make and write options */ + memset (&snd_opts, 0, sizeof (snd_opts)); + tcp_opts_len = tcp_make_syn_options (&snd_opts, initial_wnd); + tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); + + th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->iss, + tc->rcv_nxt, tcp_hdr_opts_len, TCP_FLAG_SYN, + initial_wnd); + + tcp_options_write ((u8 *) (th + 1), &snd_opts); + + /* Measure RTT with this */ + tc->rtt_ts = tcp_time_now (); + tc->rtt_seq = tc->snd_nxt; + + /* Start retransmit trimer */ + tcp_timer_set (tc, TCP_TIMER_RETRANSMIT_SYN, tc->rto * TCP_TO_TIMER_TICK); + tc->rto_boff = 0; + + /* Set the connection establishment timer */ + tcp_timer_set (tc, TCP_TIMER_ESTABLISH, TCP_ESTABLISH_TIME); + + tcp_push_ip_hdr (tm, tc, b); + tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); +} + +always_inline void +tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) +{ + u32 *to_next, next_index; + vlib_frame_t *f; + + b->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + b->error = 0; + + /* Decide where to send the packet */ + next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; + f = vlib_get_frame_to_node (vm, next_index); + + /* Enqueue the packet */ + to_next = vlib_frame_vector_args (f); + to_next[0] = bi; + f->n_vectors = 1; + vlib_put_frame_to_node (vm, next_index, f); +} + +/** + * Send FIN + */ +void +tcp_send_fin (tcp_connection_t * tc) +{ + vlib_buffer_t *b; + u32 bi; + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + /* Leave enough space for headers */ + vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + + tcp_make_finack (tc, b); + + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); +} + +always_inline u8 +tcp_make_state_flags (tcp_state_t next_state) +{ + switch (next_state) + { + case TCP_STATE_ESTABLISHED: + return TCP_FLAG_ACK; + case TCP_STATE_SYN_RCVD: + return TCP_FLAG_SYN | TCP_FLAG_ACK; + case TCP_STATE_SYN_SENT: + return TCP_FLAG_SYN; + case TCP_STATE_LAST_ACK: + case TCP_STATE_FIN_WAIT_1: + return TCP_FLAG_FIN; + default: + clib_warning ("Shouldn't be here!"); + } + return 0; +} + +/** + * Push TCP header and update connection variables + */ +static void +tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, + tcp_state_t next_state) +{ + u32 advertise_wnd, data_len; + u8 tcp_opts_len, tcp_hdr_opts_len, opts_write_len, flags; + tcp_options_t _snd_opts, *snd_opts = &_snd_opts; + tcp_header_t *th; + + data_len = b->current_length; + vnet_buffer (b)->tcp.flags = 0; + + /* Make and write options */ + memset (snd_opts, 0, sizeof (*snd_opts)); + tcp_opts_len = tcp_make_options (tc, snd_opts, next_state); + tcp_hdr_opts_len = tcp_opts_len + sizeof (tcp_header_t); + + /* Get rcv window to advertise */ + advertise_wnd = tcp_window_to_advertise (tc, next_state); + flags = tcp_make_state_flags (next_state); + + /* Push header and options */ + th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, + tc->rcv_nxt, tcp_hdr_opts_len, flags, + advertise_wnd); + + opts_write_len = tcp_options_write ((u8 *) (th + 1), snd_opts); + + ASSERT (opts_write_len == tcp_opts_len); + + /* Tag the buffer with the connection index */ + vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + + tc->snd_nxt += data_len; +} + +/* Send delayed ACK when timer expires */ +void +tcp_timer_delack_handler (u32 index) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + u32 thread_index = os_get_cpu_number (); + tcp_connection_t *tc; + vlib_buffer_t *b; + u32 bi; + + tc = tcp_connection_get (index, thread_index); + + /* Get buffer */ + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + /* Fill in the ACK */ + tcp_make_ack (tc, b); + + tc->timers[TCP_TIMER_DELACK] = TCP_TIMER_HANDLE_INVALID; + tc->flags &= ~TCP_CONN_DELACK; + + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); +} + +/** Build a retransmit segment + * + * @return the number of bytes in the segment or 0 if there's nothing to + * retransmit + * */ +u32 +tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, + u32 max_bytes) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + u32 n_bytes, offset = 0; + sack_scoreboard_hole_t *hole; + u32 hole_size; + + tcp_reuse_buffer (vm, b); + + ASSERT (tc->state == TCP_STATE_ESTABLISHED); + ASSERT (max_bytes != 0); + + if (tcp_opts_sack_permitted (&tc->opt)) + { + /* XXX get first hole not retransmitted yet */ + hole = scoreboard_first_hole (&tc->sack_sb); + if (!hole) + return 0; + + offset = hole->start - tc->snd_una; + hole_size = hole->end - hole->start; + + ASSERT (hole_size); + + if (hole_size < max_bytes) + max_bytes = hole_size; + } + else + { + if (seq_geq (tc->snd_nxt, tc->snd_una_max)) + return 0; + } + + n_bytes = stream_session_peek_bytes (&tc->connection, + vlib_buffer_get_current (b), offset, + max_bytes); + ASSERT (n_bytes != 0); + + tc->snd_nxt += n_bytes; + tcp_push_hdr_i (tc, b, tc->state); + + return n_bytes; +} + +static void +tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = tm->vlib_main; + u32 thread_index = os_get_cpu_number (); + tcp_connection_t *tc; + vlib_buffer_t *b; + u32 bi, max_bytes, snd_space; + + if (is_syn) + { + tc = tcp_half_open_connection_get (index); + } + else + { + tc = tcp_connection_get (index, thread_index); + } + + /* Make sure timer handle is set to invalid */ + tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID; + + /* Increment RTO backoff (also equal to number of retries) */ + tc->rto_boff += 1; + + /* Go back to first un-acked byte */ + tc->snd_nxt = tc->snd_una; + + /* Get buffer */ + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + if (tc->state == TCP_STATE_ESTABLISHED) + { + tcp_fastrecovery_off (tc); + + /* Exponential backoff */ + tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + + /* Figure out what and how many bytes we can send */ + snd_space = tcp_available_snd_space (tc); + max_bytes = clib_min (tc->snd_mss, snd_space); + tcp_prepare_retransmit_segment (tc, b, max_bytes); + + tc->rtx_bytes += max_bytes; + + /* No fancy recovery for now! */ + scoreboard_clear (&tc->sack_sb); + } + else + { + /* Retransmit for SYN/SYNACK */ + ASSERT (tc->state == TCP_STATE_SYN_RCVD + || tc->state == TCP_STATE_SYN_SENT); + + /* Try without increasing RTO a number of times. If this fails, + * start growing RTO exponentially */ + if (tc->rto_boff > TCP_RTO_SYN_RETRIES) + tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + + vlib_buffer_make_headroom (b, MAX_HDRS_LEN); + tcp_push_hdr_i (tc, b, tc->state); + } + + if (!is_syn) + { + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + + /* Re-enable retransmit timer */ + tcp_retransmit_timer_set (tm, tc); + } + else + { + ASSERT (tc->state == TCP_STATE_SYN_SENT); + + /* This goes straight to ipx_lookup */ + tcp_push_ip_hdr (tm, tc, b); + tcp_enqueue_to_ip_lookup (vm, b, bi, tc->c_is_ip4); + + /* Re-enable retransmit timer */ + tcp_timer_set (tc, TCP_TIMER_RETRANSMIT_SYN, + tc->rto * TCP_TO_TIMER_TICK); + } +} + +void +tcp_timer_retransmit_handler (u32 index) +{ + tcp_timer_retransmit_handler_i (index, 0); +} + +void +tcp_timer_retransmit_syn_handler (u32 index) +{ + tcp_timer_retransmit_handler_i (index, 1); +} + +/** + * Retansmit first unacked segment */ +void +tcp_retransmit_first_unacked (tcp_connection_t * tc) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 snd_nxt = tc->snd_nxt; + vlib_buffer_t *b; + u32 bi; + + tc->snd_nxt = tc->snd_una; + + /* Get buffer */ + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (tm->vlib_main, bi); + + tcp_prepare_retransmit_segment (tc, b, tc->snd_mss); + tcp_enqueue_to_output (tm->vlib_main, b, bi, tc->c_is_ip4); + + tc->snd_nxt = snd_nxt; + tc->rtx_bytes += tc->snd_mss; +} + +void +tcp_fast_retransmit (tcp_connection_t * tc) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 snd_space, max_bytes, n_bytes, bi; + vlib_buffer_t *b; + + ASSERT (tcp_in_fastrecovery (tc)); + + clib_warning ("fast retransmit!"); + + /* Start resending from first un-acked segment */ + tc->snd_nxt = tc->snd_una; + + snd_space = tcp_available_snd_space (tc); + + while (snd_space) + { + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (tm->vlib_main, bi); + + max_bytes = clib_min (tc->snd_mss, snd_space); + n_bytes = tcp_prepare_retransmit_segment (tc, b, max_bytes); + + /* Nothing left to retransmit */ + if (n_bytes == 0) + return; + + tcp_enqueue_to_output (tm->vlib_main, b, bi, tc->c_is_ip4); + + snd_space -= n_bytes; + } + + /* If window allows, send new data */ + tc->snd_nxt = tc->snd_una_max; +} + +always_inline u32 +tcp_session_has_ooo_data (tcp_connection_t * tc) +{ + stream_session_t *s = + stream_session_get (tc->c_s_index, tc->c_thread_index); + return svm_fifo_has_ooo_data (s->server_rx_fifo); +} + +always_inline uword +tcp46_output_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + tcp_main_t *tm = vnet_get_tcp_main (); + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + tcp_connection_t *tc0; + tcp_header_t *th0; + u32 error0 = TCP_ERROR_PKTS_SENT, next0 = TCP_OUTPUT_NEXT_IP_LOOKUP; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + tc0 = tcp_connection_get (vnet_buffer (b0)->tcp.connection_index, + my_thread_index); + th0 = vlib_buffer_get_current (b0); + + if (is_ip4) + { + ip4_header_t *ih0; + ih0 = vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4, + &tc0->c_rmt_ip4, IP_PROTOCOL_TCP); + th0->checksum = ip4_tcp_udp_compute_checksum (vm, b0, ih0); + } + else + { + ip6_header_t *ih0; + int bogus = ~0; + + ih0 = vlib_buffer_push_ip6 (vm, b0, &tc0->c_lcl_ip6, + &tc0->c_rmt_ip6, IP_PROTOCOL_TCP); + th0->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b0, ih0, + &bogus); + ASSERT (!bogus); + } + + /* Filter out DUPACKs if there are no OOO segments left */ + if (PREDICT_FALSE + (vnet_buffer (b0)->tcp.flags & TCP_BUF_FLAG_DUPACK)) + { + tc0->snt_dupacks--; + ASSERT (tc0->snt_dupacks >= 0); + if (!tcp_session_has_ooo_data (tc0)) + { + error0 = TCP_ERROR_FILTERED_DUPACKS; + next0 = TCP_OUTPUT_NEXT_DROP; + goto done; + } + } + + /* Retransmitted SYNs do reach this but it should be harmless */ + tc0->rcv_las = tc0->rcv_nxt; + + /* Stop DELACK timer and fix flags */ + tc0->flags &= + ~(TCP_CONN_SNDACK | TCP_CONN_DELACK | TCP_CONN_BURSTACK); + if (tcp_timer_is_active (tc0, TCP_TIMER_DELACK)) + { + tcp_timer_reset (tc0, TCP_TIMER_DELACK); + } + + /* If not retransmitting + * 1) update snd_una_max (SYN, SYNACK, new data, FIN) + * 2) If we're not tracking an ACK, start tracking */ + if (seq_lt (tc0->snd_una_max, tc0->snd_nxt)) + { + tc0->snd_una_max = tc0->snd_nxt; + if (tc0->rtt_ts == 0) + { + tc0->rtt_ts = tcp_time_now (); + tc0->rtt_seq = tc0->snd_nxt; + } + } + + /* Set the retransmit timer if not set already and not + * doing a pure ACK */ + if (!tcp_timer_is_active (tc0, TCP_TIMER_RETRANSMIT) + && tc0->snd_nxt != tc0->snd_una) + { + tcp_retransmit_timer_set (tm, tc0); + tc0->rto_boff = 0; + } + + /* set fib index to default and lookup node */ + /* XXX network virtualization (vrf/vni) */ + vnet_buffer (b0)->sw_if_index[VLIB_RX] = 0; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + + b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + + done: + b0->error = error0 != 0 ? node->errors[error0] : 0; + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return from_frame->n_vectors; +} + +static uword +tcp4_output (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_output_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +tcp6_output (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_output_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + +VLIB_REGISTER_NODE (tcp4_output_node) = +{ + .function = tcp4_output,.name = "tcp4-output", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32),.n_errors = TCP_N_ERROR,.error_strings = + tcp_error_strings,.n_next_nodes = TCP_OUTPUT_N_NEXT,.next_nodes = + { +#define _(s,n) [TCP_OUTPUT_NEXT_##s] = n, + foreach_tcp4_output_next +#undef _ + } +,.format_buffer = format_tcp_header,.format_trace = format_tcp_tx_trace,}; + +VLIB_NODE_FUNCTION_MULTIARCH (tcp4_output_node, tcp4_output) +VLIB_REGISTER_NODE (tcp6_output_node) = +{ + .function = tcp6_output,.name = "tcp6-output", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32),.n_errors = TCP_N_ERROR,.error_strings = + tcp_error_strings,.n_next_nodes = TCP_OUTPUT_N_NEXT,.next_nodes = + { +#define _(s,n) [TCP_OUTPUT_NEXT_##s] = n, + foreach_tcp6_output_next +#undef _ + } +,.format_buffer = format_tcp_header,.format_trace = format_tcp_tx_trace,}; + +VLIB_NODE_FUNCTION_MULTIARCH (tcp6_output_node, tcp6_output) u32 +tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) +{ + tcp_connection_t *tc; + + tc = (tcp_connection_t *) tconn; + tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED); + return 0; +} + +typedef enum _tcp_reset_next +{ + TCP_RESET_NEXT_DROP, + TCP_RESET_NEXT_IP_LOOKUP, + TCP_RESET_N_NEXT +} tcp_reset_next_t; + +#define foreach_tcp4_reset_next \ + _(DROP, "error-drop") \ + _(IP_LOOKUP, "ip4-lookup") + +#define foreach_tcp6_reset_next \ + _(DROP, "error-drop") \ + _(IP_LOOKUP, "ip6-lookup") + +static uword +tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame, u8 is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + u32 my_thread_index = vm->cpu_index; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 error0 = TCP_ERROR_RST_SENT, next0 = TCP_RESET_NEXT_IP_LOOKUP; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + if (tcp_make_reset_in_place (vm, b0, vnet_buffer (b0)->tcp.flags, + my_thread_index, is_ip4)) + { + error0 = TCP_ERROR_LOOKUP_DROPS; + next0 = TCP_RESET_NEXT_DROP; + goto done; + } + + /* Prepare to send to IP lookup */ + vnet_buffer (b0)->sw_if_index[VLIB_TX] = 0; + next0 = TCP_RESET_NEXT_IP_LOOKUP; + + done: + b0->error = error0 != 0 ? node->errors[error0] : 0; + b0->flags |= VNET_BUFFER_LOCALLY_ORIGINATED; + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + return from_frame->n_vectors; +} + +static uword +tcp4_send_reset (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_send_reset_inline (vm, node, from_frame, 1); +} + +static uword +tcp6_send_reset (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return tcp46_send_reset_inline (vm, node, from_frame, 0); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp4_reset_node) = { + .function = tcp4_send_reset, + .name = "tcp4-reset", + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_RESET_N_NEXT, + .next_nodes = { +#define _(s,n) [TCP_RESET_NEXT_##s] = n, + foreach_tcp4_reset_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (tcp6_reset_node) = { + .function = tcp6_send_reset, + .name = "tcp6-reset", + .vector_size = sizeof (u32), + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, + .n_next_nodes = TCP_RESET_N_NEXT, + .next_nodes = { +#define _(s,n) [TCP_RESET_NEXT_##s] = n, + foreach_tcp6_reset_next +#undef _ + }, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_packet.h b/src/vnet/tcp/tcp_packet.h new file mode 100644 index 00000000..866c5fd6 --- /dev/null +++ b/src/vnet/tcp/tcp_packet.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_tcp_packet_h +#define included_tcp_packet_h + +#include + +/* TCP flags bit 0 first. */ +#define foreach_tcp_flag \ + _ (FIN) /**< No more data from sender. */ \ + _ (SYN) /**< Synchronize sequence numbers. */ \ + _ (RST) /**< Reset the connection. */ \ + _ (PSH) /**< Push function. */ \ + _ (ACK) /**< Ack field significant. */ \ + _ (URG) /**< Urgent pointer field significant. */ \ + _ (ECE) /**< ECN-echo. Receiver got CE packet */ \ + _ (CWR) /**< Sender reduced congestion window */ + +enum +{ +#define _(f) TCP_FLAG_BIT_##f, + foreach_tcp_flag +#undef _ + TCP_N_FLAG_BITS, +}; + +enum +{ +#define _(f) TCP_FLAG_##f = 1 << TCP_FLAG_BIT_##f, + foreach_tcp_flag +#undef _ +}; + +typedef struct _tcp_header +{ + union + { + struct + { + u16 src_port; /**< Source port. */ + u16 dst_port; /**< Destination port. */ + }; + struct + { + u16 src, dst; + }; + }; + + u32 seq_number; /**< Sequence number of the first data octet in this + * segment, except when SYN is present. If SYN + * is present the seq number is is the ISN and the + * first data octet is ISN+1 */ + u32 ack_number; /**< Acknowledgement number if ACK is set. It contains + * the value of the next sequence number the sender + * of the segment is expecting to receive. */ + u8 data_offset_and_reserved; + u8 flags; /**< Flags: see the macro above */ + u16 window; /**< Number of bytes sender is willing to receive. */ + + u16 checksum; /**< Checksum of TCP pseudo header and data. */ + u16 urgent_pointer; /**< Seq number of the byte after the urgent data. */ +} __attribute__ ((packed)) tcp_header_t; + +/* Flag tests that return 0 or !0 */ +#define tcp_doff(_th) ((_th)->data_offset_and_reserved >> 4) +#define tcp_fin(_th) ((_th)->flags & TCP_FLAG_FIN) +#define tcp_syn(_th) ((_th)->flags & TCP_FLAG_SYN) +#define tcp_rst(_th) ((_th)->flags & TCP_FLAG_RST) +#define tcp_psh(_th) ((_th)->flags & TCP_FLAG_PSH) +#define tcp_ack(_th) ((_th)->flags & TCP_FLAG_ACK) +#define tcp_urg(_th) ((_th)->flags & TCP_FLAG_URG) +#define tcp_ece(_th) ((_th)->flags & TCP_FLAG_ECE) +#define tcp_cwr(_th) ((_th)->flags & TCP_FLAG_CWR) + +/* Flag tests that return 0 or 1 */ +#define tcp_is_syn(_th) !!((_th)->flags & TCP_FLAG_SYN) +#define tcp_is_fin(_th) !!((_th)->flags & TCP_FLAG_FIN) + +always_inline int +tcp_header_bytes (tcp_header_t * t) +{ + return tcp_doff (t) * sizeof (u32); +} + +/* + * TCP options. + */ + +typedef enum tcp_option_type +{ + TCP_OPTION_EOL = 0, /**< End of options. */ + TCP_OPTION_NOOP = 1, /**< No operation. */ + TCP_OPTION_MSS = 2, /**< Limit MSS. */ + TCP_OPTION_WINDOW_SCALE = 3, /**< Window scale. */ + TCP_OPTION_SACK_PERMITTED = 4, /**< Selective Ack permitted. */ + TCP_OPTION_SACK_BLOCK = 5, /**< Selective Ack block. */ + TCP_OPTION_TIMESTAMP = 8, /**< Timestamps. */ + TCP_OPTION_UTO = 28, /**< User timeout. */ + TCP_OPTION_AO = 29, /**< Authentication Option. */ +} tcp_option_type_t; + +#define foreach_tcp_options_flag \ + _ (MSS) /**< MSS advertised in SYN */ \ + _ (TSTAMP) /**< Timestamp capability advertised in SYN */ \ + _ (WSCALE) /**< Wnd scale capability advertised in SYN */ \ + _ (SACK_PERMITTED) /**< SACK capability advertised in SYN */ \ + _ (SACK) /**< SACK present */ + +enum +{ +#define _(f) TCP_OPTS_FLAG_BIT_##f, + foreach_tcp_options_flag +#undef _ + TCP_OPTIONS_N_FLAG_BITS, +}; + +enum +{ +#define _(f) TCP_OPTS_FLAG_##f = 1 << TCP_OPTS_FLAG_BIT_##f, + foreach_tcp_options_flag +#undef _ +}; + +typedef struct _sack_block +{ + u32 start; /**< Start sequence number */ + u32 end; /**< End sequence number */ +} sack_block_t; + +typedef struct +{ + u8 flags; /** Option flags, see above */ + + /* Received options */ + u16 mss; /**< Maximum segment size advertised by peer */ + u8 wscale; /**< Window scale advertised by peer */ + u32 tsval; /**< Peer's timestamp value */ + u32 tsecr; /**< Echoed/reflected time stamp */ + sack_block_t *sacks; /**< SACK blocks received */ + u8 n_sack_blocks; /**< Number of SACKs blocks */ +} tcp_options_t; + +/* Flag tests that return 0 or !0 */ +#define tcp_opts_mss(_to) ((_to)->flags & TCP_OPTS_FLAG_MSS) +#define tcp_opts_tstamp(_to) ((_to)->flags & TCP_OPTS_FLAG_TSTAMP) +#define tcp_opts_wscale(_to) ((_to)->flags & TCP_OPTS_FLAG_WSCALE) +#define tcp_opts_sack(_to) ((_to)->flags & TCP_OPTS_FLAG_SACK) +#define tcp_opts_sack_permitted(_to) ((_to)->flags & TCP_OPTS_FLAG_SACK_PERMITTED) + +/* TCP option lengths */ +#define TCP_OPTION_LEN_EOL 1 +#define TCP_OPTION_LEN_NOOP 1 +#define TCP_OPTION_LEN_MSS 4 +#define TCP_OPTION_LEN_WINDOW_SCALE 3 +#define TCP_OPTION_LEN_SACK_PERMITTED 2 +#define TCP_OPTION_LEN_TIMESTAMP 10 +#define TCP_OPTION_LEN_SACK_BLOCK 8 + +#define TCP_WND_MAX 65535U +#define TCP_MAX_WND_SCALE 14 /* See RFC 1323 */ +#define TCP_OPTS_ALIGN 4 +#define TCP_OPTS_MAX_SACK_BLOCKS 3 +#endif /* included_tcp_packet_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_pg.c b/src/vnet/tcp/tcp_pg.c new file mode 100644 index 00000000..dc324049 --- /dev/null +++ b/src/vnet/tcp/tcp_pg.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/tcp_pg: TCP packet-generator interface + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include + +/* TCP flags bit 0 first. */ +#define foreach_tcp_flag \ + _ (FIN) \ + _ (SYN) \ + _ (RST) \ + _ (PSH) \ + _ (ACK) \ + _ (URG) \ + _ (ECE) \ + _ (CWR) + +static void +tcp_pg_edit_function (pg_main_t * pg, + pg_stream_t * s, + pg_edit_group_t * g, + u32 * packets, + u32 n_packets) +{ + vlib_main_t * vm = vlib_get_main(); + u32 ip_offset, tcp_offset; + + tcp_offset = g->start_byte_offset; + ip_offset = (g-1)->start_byte_offset; + + while (n_packets >= 1) + { + vlib_buffer_t * p0; + ip4_header_t * ip0; + tcp_header_t * tcp0; + ip_csum_t sum0; + u32 tcp_len0; + + p0 = vlib_get_buffer (vm, packets[0]); + n_packets -= 1; + packets += 1; + + ASSERT (p0->current_data == 0); + ip0 = (void *) (p0->data + ip_offset); + tcp0 = (void *) (p0->data + tcp_offset); + tcp_len0 = clib_net_to_host_u16 (ip0->length) - sizeof (ip0[0]); + + /* Initialize checksum with header. */ + if (BITS (sum0) == 32) + { + sum0 = clib_mem_unaligned (&ip0->src_address, u32); + sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->dst_address, u32)); + } + else + sum0 = clib_mem_unaligned (&ip0->src_address, u64); + + sum0 = ip_csum_with_carry + (sum0, clib_host_to_net_u32 (tcp_len0 + (ip0->protocol << 16))); + + /* Invalidate possibly old checksum. */ + tcp0->checksum = 0; + + sum0 = ip_incremental_checksum_buffer (vm, p0, tcp_offset, tcp_len0, sum0); + + tcp0->checksum = ~ ip_csum_fold (sum0); + } +} + +typedef struct { + pg_edit_t src, dst; + pg_edit_t seq_number, ack_number; + pg_edit_t data_offset_and_reserved; +#define _(f) pg_edit_t f##_flag; + foreach_tcp_flag +#undef _ + pg_edit_t window; + pg_edit_t checksum; + pg_edit_t urgent_pointer; +} pg_tcp_header_t; + +static inline void +pg_tcp_header_init (pg_tcp_header_t * p) +{ + /* Initialize fields that are not bit fields in the IP header. */ +#define _(f) pg_edit_init (&p->f, tcp_header_t, f); + _ (src); + _ (dst); + _ (seq_number); + _ (ack_number); + _ (window); + _ (checksum); + _ (urgent_pointer); +#undef _ + + /* Initialize bit fields. */ +#define _(f) \ + pg_edit_init_bitfield (&p->f##_flag, tcp_header_t, \ + flags, \ + TCP_FLAG_BIT_##f, 1); + + foreach_tcp_flag +#undef _ + + pg_edit_init_bitfield (&p->data_offset_and_reserved, tcp_header_t, + data_offset_and_reserved, + 4, 4); +} + +uword +unformat_pg_tcp_header (unformat_input_t * input, va_list * args) +{ + pg_stream_t * s = va_arg (*args, pg_stream_t *); + pg_tcp_header_t * p; + u32 group_index; + + p = pg_create_edit_group (s, sizeof (p[0]), sizeof (tcp_header_t), + &group_index); + pg_tcp_header_init (p); + + /* Defaults. */ + pg_edit_set_fixed (&p->seq_number, 0); + pg_edit_set_fixed (&p->ack_number, 0); + + pg_edit_set_fixed (&p->data_offset_and_reserved, + sizeof (tcp_header_t) / sizeof (u32)); + + pg_edit_set_fixed (&p->window, 4096); + pg_edit_set_fixed (&p->urgent_pointer, 0); + +#define _(f) pg_edit_set_fixed (&p->f##_flag, 0); + foreach_tcp_flag +#undef _ + + p->checksum.type = PG_EDIT_UNSPECIFIED; + + if (! unformat (input, "TCP: %U -> %U", + unformat_pg_edit, + unformat_tcp_udp_port, &p->src, + unformat_pg_edit, + unformat_tcp_udp_port, &p->dst)) + goto error; + + /* Parse options. */ + while (1) + { + if (unformat (input, "window %U", + unformat_pg_edit, + unformat_pg_number, &p->window)) + ; + + else if (unformat (input, "checksum %U", + unformat_pg_edit, + unformat_pg_number, &p->checksum)) + ; + + /* Flags. */ +#define _(f) else if (unformat (input, #f)) pg_edit_set_fixed (&p->f##_flag, 1); + foreach_tcp_flag +#undef _ + + /* Can't parse input: try next protocol level. */ + else + break; + } + + { + ip_main_t * im = &ip_main; + u16 dst_port; + tcp_udp_port_info_t * pi; + + pi = 0; + if (p->dst.type == PG_EDIT_FIXED) + { + dst_port = pg_edit_get_value (&p->dst, PG_EDIT_LO); + pi = ip_get_tcp_udp_port_info (im, dst_port); + } + + if (pi && pi->unformat_pg_edit + && unformat_user (input, pi->unformat_pg_edit, s)) + ; + + else if (! unformat_user (input, unformat_pg_payload, s)) + goto error; + + if (p->checksum.type == PG_EDIT_UNSPECIFIED) + { + pg_edit_group_t * g = pg_stream_get_group (s, group_index); + g->edit_function = tcp_pg_edit_function; + g->edit_function_opaque = 0; + } + + return 1; + } + + error: + /* Free up any edits we may have added. */ + pg_free_edit_group (s); + return 0; +} + diff --git a/src/vnet/tcp/tcp_syn_filter4.c b/src/vnet/tcp/tcp_syn_filter4.c new file mode 100644 index 00000000..c7605a30 --- /dev/null +++ b/src/vnet/tcp/tcp_syn_filter4.c @@ -0,0 +1,542 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +typedef struct +{ + f64 next_reset; + f64 reset_interval; + u8 *syn_counts; +} syn_filter4_runtime_t; + +typedef struct +{ + u32 next_index; + int not_a_syn; + u8 filter_value; +} syn_filter4_trace_t; + +/* packet trace format function */ +static u8 * +format_syn_filter4_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + syn_filter4_trace_t *t = va_arg (*args, syn_filter4_trace_t *); + + s = format (s, "SYN_FILTER4: next index %d, %s", + t->next_index, t->not_a_syn ? "not a syn" : "syn"); + if (t->not_a_syn == 0) + s = format (s, ", filter value %d\n", t->filter_value); + else + s = format (s, "\n"); + return s; +} + +static vlib_node_registration_t syn_filter4_node; + +#define foreach_syn_filter_error \ +_(THROTTLED, "TCP SYN packet throttle drops") \ +_(OK, "TCP SYN packets passed") + +typedef enum +{ +#define _(sym,str) SYN_FILTER_ERROR_##sym, + foreach_syn_filter_error +#undef _ + SYN_FILTER_N_ERROR, +} syn_filter_error_t; + +static char *syn_filter4_error_strings[] = { +#define _(sym,string) string, + foreach_syn_filter_error +#undef _ +}; + +typedef enum +{ + SYN_FILTER_NEXT_DROP, + SYN_FILTER_N_NEXT, +} syn_filter_next_t; + +extern vnet_feature_arc_registration_t vnet_feat_arc_ip4_local; + +static uword +syn_filter4_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + u32 n_left_from, *from, *to_next; + syn_filter_next_t next_index; + u32 ok_syn_packets = 0; + vnet_feature_main_t *fm = &feature_main; + u8 arc_index = vnet_feat_arc_ip4_local.feature_arc_index; + vnet_feature_config_main_t *cm = &fm->feature_config_mains[arc_index]; + syn_filter4_runtime_t *rt = (syn_filter4_runtime_t *) node->runtime_data; + f64 now = vlib_time_now (vm); + /* Shut up spurious gcc warnings. */ + u8 *c0 = 0, *c1 = 0, *c2 = 0, *c3 = 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + if (now > rt->next_reset) + { + memset (rt->syn_counts, 0, vec_len (rt->syn_counts)); + rt->next_reset = now + rt->reset_interval; + } + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 8 && n_left_to_next >= 4) + { + u32 bi0, bi1, bi2, bi3; + vlib_buffer_t *b0, *b1, *b2, *b3; + u32 next0, next1, next2, next3; + ip4_header_t *ip0, *ip1, *ip2, *ip3; + tcp_header_t *tcp0, *tcp1, *tcp2, *tcp3; + u32 not_a_syn0 = 1, not_a_syn1 = 1, not_a_syn2 = 1, not_a_syn3 = 1; + u64 hash0, hash1, hash2, hash3; + + /* Prefetch next iteration. */ + { + vlib_buffer_t *p4, *p5, *p6, *p7; + + p4 = vlib_get_buffer (vm, from[4]); + p5 = vlib_get_buffer (vm, from[5]); + p6 = vlib_get_buffer (vm, from[6]); + p7 = vlib_get_buffer (vm, from[7]); + + vlib_prefetch_buffer_header (p4, LOAD); + vlib_prefetch_buffer_header (p5, LOAD); + vlib_prefetch_buffer_header (p6, LOAD); + vlib_prefetch_buffer_header (p7, LOAD); + + CLIB_PREFETCH (p4->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p5->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p6->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p7->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + /* speculatively enqueue b0 and b1 to the current next frame */ + to_next[0] = bi0 = from[0]; + to_next[1] = bi1 = from[1]; + to_next[2] = bi2 = from[2]; + to_next[3] = bi3 = from[3]; + from += 4; + to_next += 4; + n_left_from -= 4; + n_left_to_next -= 4; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + b2 = vlib_get_buffer (vm, bi2); + b3 = vlib_get_buffer (vm, bi3); + + vnet_get_config_data + (&cm->config_main, &b0->current_config_index, + &next0, 0 /* sizeof (c0[0]) */ ); + vnet_get_config_data + (&cm->config_main, &b1->current_config_index, + &next1, 0 /* sizeof (c0[0]) */ ); + vnet_get_config_data + (&cm->config_main, &b2->current_config_index, + &next2, 0 /* sizeof (c0[0]) */ ); + vnet_get_config_data + (&cm->config_main, &b3->current_config_index, + &next3, 0 /* sizeof (c0[0]) */ ); + + /* Not TCP? */ + ip0 = vlib_buffer_get_current (b0); + if (ip0->protocol != IP_PROTOCOL_TCP) + goto trace00; + + tcp0 = ip4_next_header (ip0); + /* + * Not a SYN? + * $$$$ hack: the TCP bitfield flags seem not to compile + * correct code. + */ + if (PREDICT_TRUE (!(tcp0->flags & 0x2))) + goto trace00; + + not_a_syn0 = 0; + hash0 = clib_xxhash ((u64) ip0->src_address.as_u32); + c0 = &rt->syn_counts[hash0 & (_vec_len (rt->syn_counts) - 1)]; + if (PREDICT_FALSE (*c0 >= 0x80)) + { + next0 = SYN_FILTER_NEXT_DROP; + b0->error = node->errors[SYN_FILTER_ERROR_THROTTLED]; + goto trace00; + } + *c0 += 1; + ok_syn_packets++; + + trace00: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + syn_filter4_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->not_a_syn = not_a_syn0; + t->next_index = next0; + t->filter_value = not_a_syn0 ? 0 : *c0; + } + + /* Not TCP? */ + ip1 = vlib_buffer_get_current (b1); + if (ip1->protocol != IP_PROTOCOL_TCP) + goto trace01; + + tcp1 = ip4_next_header (ip1); + /* + * Not a SYN? + * $$$$ hack: the TCP bitfield flags seem not to compile + * correct code. + */ + if (PREDICT_TRUE (!(tcp1->flags & 0x2))) + goto trace01; + + not_a_syn1 = 0; + hash1 = clib_xxhash ((u64) ip1->src_address.as_u32); + c1 = &rt->syn_counts[hash1 & (_vec_len (rt->syn_counts) - 1)]; + if (PREDICT_FALSE (*c1 >= 0x80)) + { + next1 = SYN_FILTER_NEXT_DROP; + b1->error = node->errors[SYN_FILTER_ERROR_THROTTLED]; + goto trace01; + } + *c1 += 1; + ok_syn_packets++; + + trace01: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b1->flags & VLIB_BUFFER_IS_TRACED))) + { + syn_filter4_trace_t *t = + vlib_add_trace (vm, node, b1, sizeof (*t)); + t->not_a_syn = not_a_syn1; + t->next_index = next1; + t->filter_value = not_a_syn1 ? 0 : *c1; + } + + /* Not TCP? */ + ip2 = vlib_buffer_get_current (b2); + if (ip2->protocol != IP_PROTOCOL_TCP) + goto trace02; + + tcp2 = ip4_next_header (ip2); + /* + * Not a SYN? + * $$$$ hack: the TCP bitfield flags seem not to compile + * correct code. + */ + if (PREDICT_TRUE (!(tcp2->flags & 0x2))) + goto trace02; + + not_a_syn2 = 0; + hash2 = clib_xxhash ((u64) ip2->src_address.as_u32); + c2 = &rt->syn_counts[hash2 & (_vec_len (rt->syn_counts) - 1)]; + if (PREDICT_FALSE (*c2 >= 0x80)) + { + next2 = SYN_FILTER_NEXT_DROP; + b2->error = node->errors[SYN_FILTER_ERROR_THROTTLED]; + goto trace02; + } + *c2 += 1; + ok_syn_packets++; + + trace02: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b2->flags & VLIB_BUFFER_IS_TRACED))) + { + syn_filter4_trace_t *t = + vlib_add_trace (vm, node, b2, sizeof (*t)); + t->not_a_syn = not_a_syn2; + t->next_index = next2; + t->filter_value = not_a_syn2 ? 0 : *c2; + } + + /* Not TCP? */ + ip3 = vlib_buffer_get_current (b3); + if (ip3->protocol != IP_PROTOCOL_TCP) + goto trace03; + + tcp3 = ip4_next_header (ip3); + /* + * Not a SYN? + * $$$$ hack: the TCP bitfield flags seem not to compile + * correct code. + */ + if (PREDICT_TRUE (!(tcp3->flags & 0x2))) + goto trace03; + + not_a_syn3 = 0; + hash3 = clib_xxhash ((u64) ip3->src_address.as_u32); + c3 = &rt->syn_counts[hash3 & (_vec_len (rt->syn_counts) - 1)]; + if (PREDICT_FALSE (*c3 >= 0x80)) + { + next3 = SYN_FILTER_NEXT_DROP; + b3->error = node->errors[SYN_FILTER_ERROR_THROTTLED]; + goto trace03; + } + *c3 += 1; + ok_syn_packets++; + + trace03: + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b3->flags & VLIB_BUFFER_IS_TRACED))) + { + syn_filter4_trace_t *t = + vlib_add_trace (vm, node, b3, sizeof (*t)); + t->not_a_syn = not_a_syn3; + t->next_index = next3; + t->filter_value = not_a_syn3 ? 0 : *c3; + } + vlib_validate_buffer_enqueue_x4 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, bi2, bi3, + next0, next1, next2, next3); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0; + ip4_header_t *ip0; + tcp_header_t *tcp0; + u32 not_a_syn0 = 1; + u32 hash0; + u8 *c0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + vnet_get_config_data + (&cm->config_main, &b0->current_config_index, + &next0, 0 /* sizeof (c0[0]) */ ); + + /* Not TCP? */ + ip0 = vlib_buffer_get_current (b0); + if (ip0->protocol != IP_PROTOCOL_TCP) + goto trace0; + + tcp0 = ip4_next_header (ip0); + /* + * Not a SYN? + * $$$$ hack: the TCP bitfield flags seem not to compile + * correct code. + */ + if (PREDICT_TRUE (!(tcp0->flags & 0x2))) + goto trace0; + + not_a_syn0 = 0; + hash0 = clib_xxhash ((u64) ip0->src_address.as_u32); + c0 = &rt->syn_counts[hash0 & (_vec_len (rt->syn_counts) - 1)]; + if (PREDICT_FALSE (*c0 >= 0x80)) + { + next0 = SYN_FILTER_NEXT_DROP; + b0->error = node->errors[SYN_FILTER_ERROR_THROTTLED]; + goto trace0; + } + *c0 += 1; + ok_syn_packets++; + + trace0: + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + syn_filter4_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->not_a_syn = not_a_syn0; + t->next_index = next0; + t->filter_value = not_a_syn0 ? 0 : *c0; + } + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, syn_filter4_node.index, + SYN_FILTER_ERROR_OK, ok_syn_packets); + return frame->n_vectors; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (syn_filter4_node, static) = +{ + .function = syn_filter4_node_fn, + .name = "syn-filter-4", + .vector_size = sizeof (u32), + .format_trace = format_syn_filter4_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .runtime_data_bytes = sizeof (syn_filter4_runtime_t), + .n_errors = ARRAY_LEN(syn_filter4_error_strings), + .error_strings = syn_filter4_error_strings, + + .n_next_nodes = SYN_FILTER_N_NEXT, + + /* edit / add dispositions here */ + .next_nodes = { + [SYN_FILTER_NEXT_DROP] = "error-drop", + }, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (syn_filter4_node, syn_filter4_node_fn); + +/* *INDENT-OFF* */ +VNET_FEATURE_INIT (syn_filter_4, static) = +{ + .arc_name = "ip4-local", + .node_name = "syn-filter-4", + .runs_before = VNET_FEATURES("ip4-local-end-of-arc"), +}; +/* *INDENT-ON* */ + +int +syn_filter_enable_disable (u32 sw_if_index, int enable_disable) +{ + vnet_main_t *vnm = vnet_get_main (); + vnet_sw_interface_t *sw; + int rv = 0; + + /* Utterly wrong? */ + if (pool_is_free_index (vnm->interface_main.sw_interfaces, sw_if_index)) + return VNET_API_ERROR_INVALID_SW_IF_INDEX; + + /* Not a physical port? */ + sw = vnet_get_sw_interface (vnm, sw_if_index); + if (sw->type != VNET_SW_INTERFACE_TYPE_HARDWARE) + return VNET_API_ERROR_INVALID_SW_IF_INDEX; + + if (enable_disable) + { + vlib_main_t *vm = vlib_get_main (); + syn_filter4_runtime_t *rt; + + rt = vlib_node_get_runtime_data (vm, syn_filter4_node.index); + vec_validate (rt->syn_counts, 1023); + /* + * Given perfect disperson / optimal hashing results: + * Allow 128k (successful) syns/sec. 1024, buckets each of which + * absorb 128 syns before filtering. Reset table once a second. + * Reality bites, lets try resetting once every 100ms. + */ + rt->reset_interval = 0.1; /* reset interval in seconds */ + } + + rv = vnet_feature_enable_disable ("ip4-local", "syn-filter-4", + sw_if_index, enable_disable, 0, 0); + + return rv; +} + +static clib_error_t * +syn_filter_enable_disable_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vnet_main_t *vnm = vnet_get_main (); + u32 sw_if_index = ~0; + int enable_disable = 1; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "disable")) + enable_disable = 0; + else if (unformat (input, "%U", unformat_vnet_sw_interface, + vnm, &sw_if_index)) + ; + else + break; + } + + if (sw_if_index == ~0) + return clib_error_return (0, "Please specify an interface..."); + + rv = syn_filter_enable_disable (sw_if_index, enable_disable); + + switch (rv) + { + case 0: + break; + + case VNET_API_ERROR_INVALID_SW_IF_INDEX: + return clib_error_return + (0, "Invalid interface, only works on physical ports"); + break; + + case VNET_API_ERROR_UNIMPLEMENTED: + return clib_error_return (0, + "Device driver doesn't support redirection"); + break; + + case VNET_API_ERROR_INVALID_VALUE: + return clib_error_return (0, "feature arc not found"); + + case VNET_API_ERROR_INVALID_VALUE_2: + return clib_error_return (0, "feature node not found"); + + default: + return clib_error_return (0, "syn_filter_enable_disable returned %d", + rv); + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (sr_content_command, static) = +{ + .path = "ip syn filter", + .short_help = "ip syn filter [disable]", + .function = syn_filter_enable_disable_command_fn, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/tcp/tcp_timer.h b/src/vnet/tcp/tcp_timer.h new file mode 100644 index 00000000..fa25268c --- /dev/null +++ b/src/vnet/tcp/tcp_timer.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_tcp_timer_h__ +#define __included_tcp_timer_h__ + +#include +#include + +#endif /* __included_tcp_timer_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/udp/builtin_server.c b/src/vnet/udp/builtin_server.c new file mode 100644 index 00000000..afa66ba4 --- /dev/null +++ b/src/vnet/udp/builtin_server.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** @file + udp builtin server +*/ + +#include +#include +#include + +/** per-worker built-in server copy buffers */ +u8 **copy_buffers; + +static int +builtin_session_create_callback (stream_session_t * s) +{ + /* Simple version: declare session ready-to-go... */ + s->session_state = SESSION_STATE_READY; + return 0; +} + +static void +builtin_session_disconnect_callback (stream_session_t * s) +{ + stream_session_disconnect (s); +} + +static int +builtin_server_rx_callback (stream_session_t * s) +{ + svm_fifo_t *rx_fifo, *tx_fifo; + u32 this_transfer; + int actual_transfer; + u8 *my_copy_buffer; + session_fifo_event_t evt; + unix_shared_memory_queue_t *q; + + my_copy_buffer = copy_buffers[s->thread_index]; + rx_fifo = s->server_rx_fifo; + tx_fifo = s->server_tx_fifo; + + this_transfer = svm_fifo_max_enqueue (tx_fifo) + < svm_fifo_max_dequeue (rx_fifo) ? + svm_fifo_max_enqueue (tx_fifo) : svm_fifo_max_dequeue (rx_fifo); + + vec_validate (my_copy_buffer, this_transfer - 1); + _vec_len (my_copy_buffer) = this_transfer; + + actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, 0, this_transfer, + my_copy_buffer); + ASSERT (actual_transfer == this_transfer); + actual_transfer = svm_fifo_enqueue_nowait (tx_fifo, 0, this_transfer, + my_copy_buffer); + + copy_buffers[s->thread_index] = my_copy_buffer; + + /* Fabricate TX event, send to ourselves */ + evt.fifo = tx_fifo; + evt.event_type = FIFO_EVENT_SERVER_TX; + /* $$$$ for event logging */ + evt.enqueue_length = actual_transfer; + evt.event_id = 0; + q = session_manager_get_vpp_event_queue (s->thread_index); + unix_shared_memory_queue_add (q, (u8 *) & evt, 0 /* do wait for mutex */ ); + + return 0; +} + +/* *INDENT-OFF* */ +static session_cb_vft_t builtin_server = { + .session_accept_callback = builtin_session_create_callback, + .session_disconnect_callback = builtin_session_disconnect_callback, + .builtin_server_rx_callback = builtin_server_rx_callback +}; +/* *INDENT-ON* */ + +static int +bind_builtin_uri_server (u8 * uri) +{ + vnet_bind_args_t _a, *a = &_a; + char segment_name[128]; + u32 segment_name_length; + int rv; + u64 options[16]; + + segment_name_length = ARRAY_LEN (segment_name); + + memset (a, 0, sizeof (*a)); + memset (options, 0, sizeof (options)); + + a->uri = (char *) uri; + a->api_client_index = ~0; /* built-in server */ + a->segment_name = segment_name; + a->segment_name_length = segment_name_length; + a->session_cb_vft = &builtin_server; + + options[SESSION_OPTIONS_ACCEPT_COOKIE] = 0x12345678; + options[SESSION_OPTIONS_SEGMENT_SIZE] = (2 << 30); /*$$$$ config / arg */ + a->options = options; + + rv = vnet_bind_uri (a); + + return rv; +} + +static int +unbind_builtin_uri_server (u8 * uri) +{ + int rv; + + rv = vnet_unbind_uri ((char *) uri, ~0 /* client_index */ ); + + return rv; +} + +static clib_error_t * +builtin_server_init (vlib_main_t * vm) +{ + vlib_thread_main_t *vtm = vlib_get_thread_main (); + u32 num_threads; + + num_threads = 1 /* main thread */ + vtm->n_threads; + + vec_validate (copy_buffers, num_threads - 1); + return 0; +} + +VLIB_INIT_FUNCTION (builtin_server_init); + +static clib_error_t * +builtin_uri_bind_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u8 *uri = 0; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "uri %s", &uri)) + ; + else + break; + } + + if (uri == 0) + return clib_error_return (0, "uri to bind not specified..."); + + rv = bind_builtin_uri_server (uri); + + vec_free (uri); + + switch (rv) + { + case 0: + break; + + default: + return clib_error_return (0, "bind_uri_server returned %d", rv); + break; + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (builtin_uri_bind_command, static) = +{ + .path = "builtin uri bind", + .short_help = "builtin uri bind", + .function = builtin_uri_bind_command_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +builtin_uri_unbind_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u8 *uri = 0; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "uri %s", &uri)) + ; + else + break; + } + + if (uri == 0) + return clib_error_return (0, "uri to unbind not specified..."); + + rv = unbind_builtin_uri_server (uri); + + vec_free (uri); + + switch (rv) + { + case 0: + break; + + default: + return clib_error_return (0, "unbind_uri_server returned %d", rv); + break; + } + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (builtin_uri_unbind_command, static) = +{ + .path = "builtin uri unbind", + .short_help = "builtin uri unbind", + .function = builtin_uri_unbind_command_fn, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/udp/udp.c b/src/vnet/udp/udp.c new file mode 100644 index 00000000..9e740466 --- /dev/null +++ b/src/vnet/udp/udp.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** @file + udp state machine, etc. +*/ + +#include +#include +#include +#include + +udp_uri_main_t udp_uri_main; + +u32 +udp_session_bind_ip4 (vlib_main_t * vm, u32 session_index, + ip46_address_t * ip, u16 port_number_host_byte_order) +{ + udp_uri_main_t *um = vnet_get_udp_main (); + udp_connection_t *listener; + + pool_get (um->udp_listeners, listener); + memset (listener, 0, sizeof (udp_connection_t)); + listener->c_lcl_port = clib_host_to_net_u16 (port_number_host_byte_order); + listener->c_lcl_ip4.as_u32 = ip->ip4.as_u32; + listener->c_proto = SESSION_TYPE_IP4_UDP; + udp_register_dst_port (um->vlib_main, port_number_host_byte_order, + udp4_uri_input_node.index, 1 /* is_ipv4 */ ); + return 0; +} + +u32 +udp_session_bind_ip6 (vlib_main_t * vm, u32 session_index, + ip46_address_t * ip, u16 port_number_host_byte_order) +{ + udp_uri_main_t *um = vnet_get_udp_main (); + udp_connection_t *listener; + + pool_get (um->udp_listeners, listener); + listener->c_lcl_port = clib_host_to_net_u16 (port_number_host_byte_order); + clib_memcpy (&listener->c_lcl_ip6, &ip->ip6, sizeof (ip6_address_t)); + listener->c_proto = SESSION_TYPE_IP6_UDP; + udp_register_dst_port (um->vlib_main, port_number_host_byte_order, + udp4_uri_input_node.index, 0 /* is_ipv4 */ ); + return 0; +} + +u32 +udp_session_unbind_ip4 (vlib_main_t * vm, u32 listener_index) +{ + udp_connection_t *listener; + listener = udp_listener_get (listener_index); + + /* deregister the udp_local mapping */ + udp_unregister_dst_port (vm, listener->c_lcl_port, 1 /* is_ipv4 */ ); + return 0; +} + +u32 +udp_session_unbind_ip6 (vlib_main_t * vm, u32 listener_index) +{ + udp_connection_t *listener; + + listener = udp_listener_get (listener_index); + + /* deregister the udp_local mapping */ + udp_unregister_dst_port (vm, listener->c_lcl_port, 0 /* is_ipv4 */ ); + return 0; +} + +transport_connection_t * +udp_session_get_listener (u32 listener_index) +{ + udp_connection_t *us; + + us = udp_listener_get (listener_index); + return &us->connection; +} + +u32 +udp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) +{ + udp_connection_t *us; + u8 *data; + udp_header_t *udp; + + us = (udp_connection_t *) tconn; + + if (tconn->is_ip4) + { + ip4_header_t *ip; + + data = vlib_buffer_get_current (b); + udp = (udp_header_t *) (data - sizeof (udp_header_t)); + ip = (ip4_header_t *) ((u8 *) udp - sizeof (ip4_header_t)); + + /* Build packet header, swap rx key src + dst fields */ + ip->src_address.as_u32 = us->c_lcl_ip4.as_u32; + ip->dst_address.as_u32 = us->c_rmt_ip4.as_u32; + ip->ip_version_and_header_length = 0x45; + ip->ttl = 254; + ip->protocol = IP_PROTOCOL_UDP; + ip->length = clib_host_to_net_u16 (b->current_length + sizeof (*udp)); + ip->checksum = ip4_header_checksum (ip); + + udp->src_port = us->c_lcl_port; + udp->dst_port = us->c_rmt_port; + udp->length = clib_host_to_net_u16 (b->current_length); + udp->checksum = 0; + + b->current_length = sizeof (*ip) + sizeof (*udp); + return SESSION_QUEUE_NEXT_IP4_LOOKUP; + } + else + { + vlib_main_t *vm = vlib_get_main (); + ip6_header_t *ip; + u16 payload_length; + int bogus = ~0; + + data = vlib_buffer_get_current (b); + udp = (udp_header_t *) (data - sizeof (udp_header_t)); + ip = (ip6_header_t *) ((u8 *) udp - sizeof (ip6_header_t)); + + /* Build packet header, swap rx key src + dst fields */ + clib_memcpy (&ip->src_address, &us->c_lcl_ip6, sizeof (ip6_address_t)); + clib_memcpy (&ip->dst_address, &us->c_rmt_ip6, sizeof (ip6_address_t)); + + ip->ip_version_traffic_class_and_flow_label = + clib_host_to_net_u32 (0x6 << 28); + + ip->hop_limit = 0xff; + ip->protocol = IP_PROTOCOL_UDP; + + payload_length = vlib_buffer_length_in_chain (vm, b); + payload_length -= sizeof (*ip); + + ip->payload_length = clib_host_to_net_u16 (payload_length); + + udp->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b, ip, &bogus); + ASSERT (!bogus); + + udp->src_port = us->c_lcl_port; + udp->dst_port = us->c_rmt_port; + udp->length = clib_host_to_net_u16 (b->current_length); + udp->checksum = 0; + + b->current_length = sizeof (*ip) + sizeof (*udp); + + return SESSION_QUEUE_NEXT_IP6_LOOKUP; + } +} + +transport_connection_t * +udp_session_get (u32 connection_index, u32 my_thread_index) +{ + udp_uri_main_t *um = vnet_get_udp_main (); + + udp_connection_t *us; + us = + pool_elt_at_index (um->udp_sessions[my_thread_index], connection_index); + return &us->connection; +} + +void +udp_session_close (u32 connection_index, u32 my_thread_index) +{ + udp_uri_main_t *um = vnet_get_udp_main (); + pool_put_index (um->udp_sessions[my_thread_index], connection_index); +} + +u8 * +format_udp_session_ip4 (u8 * s, va_list * args) +{ + u32 uci = va_arg (*args, u32); + u32 thread_index = va_arg (*args, u32); + udp_connection_t *u4; + + u4 = udp_connection_get (uci, thread_index); + + s = format (s, "[%s] %U:%d->%U:%d", "udp", format_ip4_address, + &u4->c_lcl_ip4, clib_net_to_host_u16 (u4->c_lcl_port), + format_ip4_address, &u4->c_rmt_ip4, + clib_net_to_host_u16 (u4->c_rmt_port)); + return s; +} + +u8 * +format_udp_session_ip6 (u8 * s, va_list * args) +{ + u32 uci = va_arg (*args, u32); + u32 thread_index = va_arg (*args, u32); + udp_connection_t *tc = udp_connection_get (uci, thread_index); + s = format (s, "[%s] %U:%d->%U:%d", "udp", format_ip6_address, + &tc->c_lcl_ip6, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip6_address, &tc->c_rmt_ip6, + clib_net_to_host_u16 (tc->c_rmt_port)); + return s; +} + +u8 * +format_udp_listener_session_ip4 (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + udp_connection_t *tc = udp_listener_get (tci); + s = format (s, "[%s] %U:%d->%U:%d", "udp", format_ip4_address, + &tc->c_lcl_ip4, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip4_address, &tc->c_rmt_ip4, + clib_net_to_host_u16 (tc->c_rmt_port)); + return s; +} + +u8 * +format_udp_listener_session_ip6 (u8 * s, va_list * args) +{ + u32 tci = va_arg (*args, u32); + udp_connection_t *tc = udp_listener_get (tci); + s = format (s, "[%s] %U:%d->%U:%d", "udp", format_ip6_address, + &tc->c_lcl_ip6, clib_net_to_host_u16 (tc->c_lcl_port), + format_ip6_address, &tc->c_rmt_ip6, + clib_net_to_host_u16 (tc->c_rmt_port)); + return s; +} + +u16 +udp_send_mss_uri (transport_connection_t * t) +{ + /* TODO figure out MTU of output interface */ + return 400; +} + +u32 +udp_send_space_uri (transport_connection_t * t) +{ + /* No constraint on TX window */ + return ~0; +} + +int +udp_open_connection (ip46_address_t * addr, u16 port) +{ + clib_warning ("Not implemented"); + return 0; +} + +/* *INDENT-OFF* */ +const static transport_proto_vft_t udp4_proto = { + .bind = udp_session_bind_ip4, + .open = udp_open_connection, + .unbind = udp_session_unbind_ip4, + .push_header = udp_push_header, + .get_connection = udp_session_get, + .get_listener = udp_session_get_listener, + .close = udp_session_close, + .send_mss = udp_send_mss_uri, + .send_space = udp_send_space_uri, + .format_connection = format_udp_session_ip4, + .format_listener = format_udp_listener_session_ip4 +}; + +const static transport_proto_vft_t udp6_proto = { + .bind = udp_session_bind_ip6, + .open = udp_open_connection, + .unbind = udp_session_unbind_ip6, + .push_header = udp_push_header, + .get_connection = udp_session_get, + .get_listener = udp_session_get_listener, + .close = udp_session_close, + .send_mss = udp_send_mss_uri, + .send_space = udp_send_space_uri, + .format_connection = format_udp_session_ip6, + .format_listener = format_udp_listener_session_ip6 +}; +/* *INDENT-ON* */ + +static clib_error_t * +udp_init (vlib_main_t * vm) +{ + udp_uri_main_t *um = vnet_get_udp_main (); + ip_main_t *im = &ip_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); + u32 num_threads; + clib_error_t *error = 0; + ip_protocol_info_t *pi; + + um->vlib_main = vm; + um->vnet_main = vnet_get_main (); + + if ((error = vlib_call_init_function (vm, ip_main_init))) + return error; + if ((error = vlib_call_init_function (vm, ip4_lookup_init))) + return error; + if ((error = vlib_call_init_function (vm, ip6_lookup_init))) + return error; + + /* + * Registrations + */ + + /* IP registration */ + pi = ip_get_protocol_info (im, IP_PROTOCOL_UDP); + if (pi == 0) + return clib_error_return (0, "UDP protocol info AWOL"); + pi->format_header = format_udp_header; + pi->unformat_pg_edit = unformat_pg_udp_header; + + + /* Register as transport with URI */ + session_register_transport (SESSION_TYPE_IP4_UDP, &udp4_proto); + session_register_transport (SESSION_TYPE_IP6_UDP, &udp6_proto); + + /* + * Initialize data structures + */ + + num_threads = 1 /* main thread */ + tm->n_threads; + vec_validate (um->udp_sessions, num_threads - 1); + + return error; +} + +VLIB_INIT_FUNCTION (udp_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/udp/udp.h b/src/vnet/udp/udp.h new file mode 100644 index 00000000..7ab26ce9 --- /dev/null +++ b/src/vnet/udp/udp.h @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_udp_h__ +#define __included_udp_h__ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +typedef struct +{ + transport_connection_t connection; /** must be first */ + + /** ersatz MTU to limit fifo pushes to test data size */ + u32 mtu; +} udp_connection_t; + +typedef struct _udp_uri_main +{ + /* Per-worker thread udp connection pools */ + udp_connection_t **udp_sessions; + udp_connection_t *udp_listeners; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; + ip4_main_t *ip4_main; + ip6_main_t *ip6_main; +} udp_uri_main_t; + +extern udp_uri_main_t udp_uri_main; +extern vlib_node_registration_t udp4_uri_input_node; + +always_inline udp_uri_main_t * +vnet_get_udp_main () +{ + return &udp_uri_main; +} + +always_inline udp_connection_t * +udp_connection_get (u32 conn_index, u32 thread_index) +{ + return pool_elt_at_index (udp_uri_main.udp_sessions[thread_index], + conn_index); +} + +always_inline udp_connection_t * +udp_listener_get (u32 conn_index) +{ + return pool_elt_at_index (udp_uri_main.udp_listeners, conn_index); +} + +typedef enum +{ +#define udp_error(n,s) UDP_ERROR_##n, +#include +#undef udp_error + UDP_N_ERROR, +} udp_error_t; + +#define foreach_udp4_dst_port \ +_ (67, dhcp_to_server) \ +_ (68, dhcp_to_client) \ +_ (500, ikev2) \ +_ (3784, bfd4) \ +_ (3785, bfd_echo4) \ +_ (4341, lisp_gpe) \ +_ (4342, lisp_cp) \ +_ (4739, ipfix) \ +_ (4789, vxlan) \ +_ (4789, vxlan6) \ +_ (4790, vxlan_gpe) \ +_ (6633, vpath_3) + + +#define foreach_udp6_dst_port \ +_ (547, dhcpv6_to_server) \ +_ (546, dhcpv6_to_client) \ +_ (3784, bfd6) \ +_ (3785, bfd_echo6) \ +_ (4341, lisp_gpe6) \ +_ (4342, lisp_cp6) \ +_ (4790, vxlan6_gpe) \ +_ (6633, vpath6_3) + +typedef enum +{ +#define _(n,f) UDP_DST_PORT_##f = n, + foreach_udp4_dst_port foreach_udp6_dst_port +#undef _ +} udp_dst_port_t; + +typedef enum +{ +#define _(n,f) UDP6_DST_PORT_##f = n, + foreach_udp6_dst_port +#undef _ +} udp6_dst_port_t; + +typedef struct +{ + /* Name (a c string). */ + char *name; + + /* GRE protocol type in host byte order. */ + udp_dst_port_t dst_port; + + /* Node which handles this type. */ + u32 node_index; + + /* Next index for this type. */ + u32 next_index; +} udp_dst_port_info_t; + +typedef enum +{ + UDP_IP6 = 0, + UDP_IP4, /* the code is full of is_ip4... */ + N_UDP_AF, +} udp_af_t; + +typedef struct +{ + udp_dst_port_info_t *dst_port_infos[N_UDP_AF]; + + /* Hash tables mapping name/protocol to protocol info index. */ + uword *dst_port_info_by_name[N_UDP_AF]; + uword *dst_port_info_by_dst_port[N_UDP_AF]; + + /* convenience */ + vlib_main_t *vlib_main; +} udp_main_t; + +always_inline udp_dst_port_info_t * +udp_get_dst_port_info (udp_main_t * um, udp_dst_port_t dst_port, u8 is_ip4) +{ + uword *p = hash_get (um->dst_port_info_by_dst_port[is_ip4], dst_port); + return p ? vec_elt_at_index (um->dst_port_infos[is_ip4], p[0]) : 0; +} + +format_function_t format_udp_header; +format_function_t format_udp_rx_trace; + +unformat_function_t unformat_udp_header; + +void udp_register_dst_port (vlib_main_t * vm, + udp_dst_port_t dst_port, + u32 node_index, u8 is_ip4); + +void +udp_unregister_dst_port (vlib_main_t * vm, + udp_dst_port_t dst_port, u8 is_ip4); + +void udp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add); + +always_inline void +ip_udp_fixup_one (vlib_main_t * vm, vlib_buffer_t * b0, u8 is_ip4) +{ + u16 new_l0; + udp_header_t *udp0; + + if (is_ip4) + { + ip4_header_t *ip0; + ip_csum_t sum0; + u16 old_l0 = 0; + + ip0 = vlib_buffer_get_current (b0); + + /* fix the ing outer-IP checksum */ + sum0 = ip0->checksum; + /* old_l0 always 0, see the rewrite setup */ + new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)); + + sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t, + length /* changed member */ ); + ip0->checksum = ip_csum_fold (sum0); + ip0->length = new_l0; + + /* Fix UDP length */ + udp0 = (udp_header_t *) (ip0 + 1); + new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) + - sizeof (*ip0)); + udp0->length = new_l0; + } + else + { + ip6_header_t *ip0; + int bogus0; + + ip0 = vlib_buffer_get_current (b0); + + new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) + - sizeof (*ip0)); + ip0->payload_length = new_l0; + + /* Fix UDP length */ + udp0 = (udp_header_t *) (ip0 + 1); + udp0->length = new_l0; + + udp0->checksum = + ip6_tcp_udp_icmp_compute_checksum (vm, b0, ip0, &bogus0); + ASSERT (bogus0 == 0); + + if (udp0->checksum == 0) + udp0->checksum = 0xffff; + } +} + +always_inline void +ip_udp_encap_one (vlib_main_t * vm, vlib_buffer_t * b0, u8 * ec0, word ec_len, + u8 is_ip4) +{ + vlib_buffer_advance (b0, -ec_len); + + if (is_ip4) + { + ip4_header_t *ip0; + + ip0 = vlib_buffer_get_current (b0); + + /* Apply the encap string. */ + clib_memcpy (ip0, ec0, ec_len); + ip_udp_fixup_one (vm, b0, 1); + } + else + { + ip6_header_t *ip0; + + ip0 = vlib_buffer_get_current (b0); + + /* Apply the encap string. */ + clib_memcpy (ip0, ec0, ec_len); + ip_udp_fixup_one (vm, b0, 0); + } +} + +always_inline void +ip_udp_encap_two (vlib_main_t * vm, vlib_buffer_t * b0, vlib_buffer_t * b1, + u8 * ec0, u8 * ec1, word ec_len, u8 is_v4) +{ + u16 new_l0, new_l1; + udp_header_t *udp0, *udp1; + + ASSERT (_vec_len (ec0) == _vec_len (ec1)); + + vlib_buffer_advance (b0, -ec_len); + vlib_buffer_advance (b1, -ec_len); + + if (is_v4) + { + ip4_header_t *ip0, *ip1; + ip_csum_t sum0, sum1; + u16 old_l0 = 0, old_l1 = 0; + + ip0 = vlib_buffer_get_current (b0); + ip1 = vlib_buffer_get_current (b1); + + /* Apply the encap string */ + clib_memcpy (ip0, ec0, ec_len); + clib_memcpy (ip1, ec1, ec_len); + + /* fix the ing outer-IP checksum */ + sum0 = ip0->checksum; + sum1 = ip1->checksum; + + /* old_l0 always 0, see the rewrite setup */ + new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0)); + new_l1 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1)); + + sum0 = ip_csum_update (sum0, old_l0, new_l0, ip4_header_t, + length /* changed member */ ); + sum1 = ip_csum_update (sum1, old_l1, new_l1, ip4_header_t, + length /* changed member */ ); + + ip0->checksum = ip_csum_fold (sum0); + ip1->checksum = ip_csum_fold (sum1); + + ip0->length = new_l0; + ip1->length = new_l1; + + /* Fix UDP length */ + udp0 = (udp_header_t *) (ip0 + 1); + udp1 = (udp_header_t *) (ip1 + 1); + + new_l0 = + clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) - + sizeof (*ip0)); + new_l1 = + clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1) - + sizeof (*ip1)); + udp0->length = new_l0; + udp1->length = new_l1; + } + else + { + ip6_header_t *ip0, *ip1; + int bogus0, bogus1; + + ip0 = vlib_buffer_get_current (b0); + ip1 = vlib_buffer_get_current (b1); + + /* Apply the encap string. */ + clib_memcpy (ip0, ec0, ec_len); + clib_memcpy (ip1, ec1, ec_len); + + new_l0 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b0) + - sizeof (*ip0)); + new_l1 = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, b1) + - sizeof (*ip1)); + ip0->payload_length = new_l0; + ip1->payload_length = new_l1; + + /* Fix UDP length */ + udp0 = (udp_header_t *) (ip0 + 1); + udp1 = (udp_header_t *) (ip1 + 1); + + udp0->length = new_l0; + udp1->length = new_l1; + + udp0->checksum = + ip6_tcp_udp_icmp_compute_checksum (vm, b0, ip0, &bogus0); + udp1->checksum = + ip6_tcp_udp_icmp_compute_checksum (vm, b1, ip1, &bogus1); + ASSERT (bogus0 == 0); + ASSERT (bogus1 == 0); + + if (udp0->checksum == 0) + udp0->checksum = 0xffff; + if (udp1->checksum == 0) + udp1->checksum = 0xffff; + } +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ + +#endif /* __included_udp_h__ */ diff --git a/src/vnet/udp/udp_error.def b/src/vnet/udp/udp_error.def new file mode 100644 index 00000000..bfdae0ac --- /dev/null +++ b/src/vnet/udp/udp_error.def @@ -0,0 +1,21 @@ +/* + * udp_error.def: udp errors + * + * Copyright (c) 2013-2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +udp_error (NONE, "no error") +udp_error (NO_LISTENER, "no listener for dst port") +udp_error (LENGTH_ERROR, "UDP packets with length errors") +udp_error (PUNT, "no listener punt") diff --git a/src/vnet/udp/udp_format.c b/src/vnet/udp/udp_format.c new file mode 100644 index 00000000..abdf561e --- /dev/null +++ b/src/vnet/udp/udp_format.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/udp_format.c: udp formatting + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include + +/* Format UDP header. */ +u8 * +format_udp_header (u8 * s, va_list * args) +{ + udp_header_t *udp = va_arg (*args, udp_header_t *); + u32 max_header_bytes = va_arg (*args, u32); + uword indent; + u32 header_bytes = sizeof (udp[0]); + + /* Nothing to do. */ + if (max_header_bytes < sizeof (udp[0])) + return format (s, "UDP header truncated"); + + indent = format_get_indent (s); + indent += 2; + + s = format (s, "UDP: %d -> %d", + clib_net_to_host_u16 (udp->src_port), + clib_net_to_host_u16 (udp->dst_port)); + + s = format (s, "\n%Ulength %d, checksum 0x%04x", + format_white_space, indent, + clib_net_to_host_u16 (udp->length), + clib_net_to_host_u16 (udp->checksum)); + + /* Recurse into next protocol layer. */ + if (max_header_bytes != 0 && header_bytes < max_header_bytes) + { + ip_main_t *im = &ip_main; + tcp_udp_port_info_t *pi; + + pi = ip_get_tcp_udp_port_info (im, udp->dst_port); + + if (pi && pi->format_header) + s = format (s, "\n%U%U", + format_white_space, indent - 2, pi->format_header, + /* next protocol header */ (udp + 1), + max_header_bytes - sizeof (udp[0])); + } + + return s; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/udp/udp_input.c b/src/vnet/udp/udp_input.c new file mode 100644 index 00000000..4d509335 --- /dev/null +++ b/src/vnet/udp/udp_input.c @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include "../session/application_interface.h" + +vlib_node_registration_t udp4_uri_input_node; + +typedef struct +{ + u32 session; + u32 disposition; + u32 thread_index; +} udp4_uri_input_trace_t; + +/* packet trace format function */ +static u8 * +format_udp4_uri_input_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + udp4_uri_input_trace_t *t = va_arg (*args, udp4_uri_input_trace_t *); + + s = format (s, "UDP4_URI_INPUT: session %d, disposition %d, thread %d", + t->session, t->disposition, t->thread_index); + return s; +} + +typedef enum +{ + UDP4_URI_INPUT_NEXT_DROP, + UDP4_URI_INPUT_N_NEXT, +} udp4_uri_input_next_t; + +static char *udp4_uri_input_error_strings[] = { +#define _(sym,string) string, + foreach_session_input_error +#undef _ +}; + +static uword +udp4_uri_input_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + u32 n_left_from, *from, *to_next; + udp4_uri_input_next_t next_index; + udp_uri_main_t *um = vnet_get_udp_main (); + session_manager_main_t *smm = vnet_get_session_manager_main (); + u32 my_thread_index = vm->cpu_index; + u8 my_enqueue_epoch; + u32 *session_indices_to_enqueue; + static u32 serial_number; + int i; + + my_enqueue_epoch = ++smm->current_enqueue_epoch[my_thread_index]; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0 = UDP4_URI_INPUT_NEXT_DROP; + u32 error0 = SESSION_ERROR_ENQUEUED; + udp_header_t *udp0; + ip4_header_t *ip0; + stream_session_t *s0; + svm_fifo_t *f0; + u16 udp_len0; + u8 *data0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + /* udp_local hands us a pointer to the udp data */ + + data0 = vlib_buffer_get_current (b0); + udp0 = (udp_header_t *) (data0 - sizeof (*udp0)); + + /* $$$$ fixme: udp_local doesn't do ip options correctly anyhow */ + ip0 = (ip4_header_t *) (((u8 *) udp0) - sizeof (*ip0)); + s0 = 0; + + /* lookup session */ + s0 = stream_session_lookup4 (&ip0->dst_address, &ip0->src_address, + udp0->dst_port, udp0->src_port, + SESSION_TYPE_IP4_UDP, my_thread_index); + + /* no listener */ + if (PREDICT_FALSE (s0 == 0)) + { + error0 = SESSION_ERROR_NO_LISTENER; + goto trace0; + } + + f0 = s0->server_rx_fifo; + + /* established hit */ + if (PREDICT_TRUE (s0->session_state == SESSION_STATE_READY)) + { + udp_len0 = clib_net_to_host_u16 (udp0->length); + + if (PREDICT_FALSE (udp_len0 > svm_fifo_max_enqueue (f0))) + { + error0 = SESSION_ERROR_FIFO_FULL; + goto trace0; + } + + svm_fifo_enqueue_nowait (f0, 0 /* pid */ , + udp_len0 - sizeof (*udp0), + (u8 *) (udp0 + 1)); + + b0->error = node->errors[SESSION_ERROR_ENQUEUED]; + + /* We need to send an RX event on this fifo */ + if (s0->enqueue_epoch != my_enqueue_epoch) + { + s0->enqueue_epoch = my_enqueue_epoch; + + vec_add1 (smm->session_indices_to_enqueue_by_thread + [my_thread_index], + s0 - smm->sessions[my_thread_index]); + } + } + /* listener hit */ + else if (s0->session_state == SESSION_STATE_LISTENING) + { + udp_connection_t *us; + int rv; + + error0 = SESSION_ERROR_NOT_READY; + + /* + * create udp transport session + */ + pool_get (um->udp_sessions[my_thread_index], us); + + us->mtu = 1024; /* $$$$ policy */ + + us->c_lcl_ip4.as_u32 = ip0->dst_address.as_u32; + us->c_rmt_ip4.as_u32 = ip0->src_address.as_u32; + us->c_lcl_port = udp0->dst_port; + us->c_rmt_port = udp0->src_port; + us->c_proto = SESSION_TYPE_IP4_UDP; + us->c_c_index = us - um->udp_sessions[my_thread_index]; + + /* + * create stream session and attach the udp session to it + */ + rv = stream_session_accept (&us->connection, s0->session_index, + SESSION_TYPE_IP4_UDP, + 1 /*notify */ ); + if (rv) + error0 = rv; + + } + else + { + + error0 = SESSION_ERROR_NOT_READY; + goto trace0; + } + + trace0: + b0->error = node->errors[error0]; + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + udp4_uri_input_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + + t->session = ~0; + if (s0) + t->session = s0 - smm->sessions[my_thread_index]; + t->disposition = error0; + t->thread_index = my_thread_index; + } + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + /* Send enqueue events */ + + session_indices_to_enqueue = + smm->session_indices_to_enqueue_by_thread[my_thread_index]; + + for (i = 0; i < vec_len (session_indices_to_enqueue); i++) + { + session_fifo_event_t evt; + unix_shared_memory_queue_t *q; + stream_session_t *s0; + application_t *server0; + + /* Get session */ + s0 = pool_elt_at_index (smm->sessions[my_thread_index], + session_indices_to_enqueue[i]); + + /* Get session's server */ + server0 = application_get (s0->app_index); + + /* Built-in server? Deliver the goods... */ + if (server0->cb_fns.builtin_server_rx_callback) + { + server0->cb_fns.builtin_server_rx_callback (s0); + continue; + } + + /* Fabricate event */ + evt.fifo = s0->server_rx_fifo; + evt.event_type = FIFO_EVENT_SERVER_RX; + evt.event_id = serial_number++; + evt.enqueue_length = svm_fifo_max_dequeue (s0->server_rx_fifo); + + /* Add event to server's event queue */ + q = server0->event_queue; + + /* Don't block for lack of space */ + if (PREDICT_TRUE (q->cursize < q->maxsize)) + unix_shared_memory_queue_add (server0->event_queue, (u8 *) & evt, + 0 /* do wait for mutex */ ); + else + { + vlib_node_increment_counter (vm, udp4_uri_input_node.index, + SESSION_ERROR_FIFO_FULL, 1); + } + if (1) + { + ELOG_TYPE_DECLARE (e) = + { + .format = "evt-enqueue: id %d length %d",.format_args = "i4i4",}; + struct + { + u32 data[2]; + } *ed; + ed = ELOG_DATA (&vlib_global_main.elog_main, e); + ed->data[0] = evt.event_id; + ed->data[1] = evt.enqueue_length; + } + } + + vec_reset_length (session_indices_to_enqueue); + + smm->session_indices_to_enqueue_by_thread[my_thread_index] = + session_indices_to_enqueue; + + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (udp4_uri_input_node) = +{ + .function = udp4_uri_input_node_fn,.name = "udp4-uri-input",.vector_size = + sizeof (u32),.format_trace = format_udp4_uri_input_trace,.type = + VLIB_NODE_TYPE_INTERNAL,.n_errors = + ARRAY_LEN (udp4_uri_input_error_strings),.error_strings = + udp4_uri_input_error_strings,.n_next_nodes = UDP4_URI_INPUT_N_NEXT, + /* edit / add dispositions here */ + .next_nodes = + { + [UDP4_URI_INPUT_NEXT_DROP] = "error-drop",} +,}; + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/udp/udp_local.c b/src/vnet/udp/udp_local.c new file mode 100644 index 00000000..6b239f73 --- /dev/null +++ b/src/vnet/udp/udp_local.c @@ -0,0 +1,666 @@ +/* + * node.c: udp packet processing + * + * Copyright (c) 2013 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +udp_main_t udp_main; + +#define foreach_udp_input_next \ + _ (PUNT, "error-punt") \ + _ (DROP, "error-drop") \ + _ (ICMP4_ERROR, "ip4-icmp-error") \ + _ (ICMP6_ERROR, "ip6-icmp-error") + +typedef enum +{ +#define _(s,n) UDP_INPUT_NEXT_##s, + foreach_udp_input_next +#undef _ + UDP_INPUT_N_NEXT, +} udp_input_next_t; + +typedef struct +{ + u16 src_port; + u16 dst_port; + u8 bound; +} udp_rx_trace_t; + +u8 * +format_udp_rx_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + udp_rx_trace_t *t = va_arg (*args, udp_rx_trace_t *); + + s = format (s, "UDP: src-port %d dst-port %d%s", + clib_net_to_host_u16 (t->src_port), + clib_net_to_host_u16 (t->dst_port), + t->bound ? "" : " (no listener)"); + return s; +} + +typedef struct +{ + /* Sparse vector mapping udp dst_port in network byte order + to next index. */ + u16 *next_by_dst_port; + u8 punt_unknown; +} udp_input_runtime_t; + +vlib_node_registration_t udp4_input_node; +vlib_node_registration_t udp6_input_node; + +always_inline uword +udp46_input_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, int is_ip4) +{ + udp_input_runtime_t *rt = is_ip4 ? + (void *) vlib_node_get_runtime_data (vm, udp4_input_node.index) + : (void *) vlib_node_get_runtime_data (vm, udp6_input_node.index); + __attribute__ ((unused)) u32 n_left_from, next_index, *from, *to_next; + word n_no_listener = 0; + u8 punt_unknown = rt->punt_unknown; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 bi0, bi1; + vlib_buffer_t *b0, *b1; + udp_header_t *h0 = 0, *h1 = 0; + u32 i0, i1, dst_port0, dst_port1; + u32 advance0, advance1; + u32 error0, next0, error1, next1; + + /* Prefetch next iteration. */ + { + vlib_buffer_t *p2, *p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, sizeof (h0[0]), LOAD); + CLIB_PREFETCH (p3->data, sizeof (h1[0]), LOAD); + } + + bi0 = from[0]; + bi1 = from[1]; + to_next[0] = bi0; + to_next[1] = bi1; + from += 2; + to_next += 2; + n_left_to_next -= 2; + n_left_from -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + /* ip4/6_local hands us the ip header, not the udp header */ + if (is_ip4) + { + advance0 = sizeof (ip4_header_t); + advance1 = sizeof (ip4_header_t); + } + else + { + advance0 = sizeof (ip6_header_t); + advance1 = sizeof (ip6_header_t); + } + + if (PREDICT_FALSE (b0->current_length < advance0 + sizeof (*h0))) + { + error0 = UDP_ERROR_LENGTH_ERROR; + next0 = UDP_INPUT_NEXT_DROP; + } + else + { + vlib_buffer_advance (b0, advance0); + h0 = vlib_buffer_get_current (b0); + error0 = next0 = 0; + if (PREDICT_FALSE (clib_net_to_host_u16 (h0->length) > + vlib_buffer_length_in_chain (vm, b0))) + { + error0 = UDP_ERROR_LENGTH_ERROR; + next0 = UDP_INPUT_NEXT_DROP; + } + } + + if (PREDICT_FALSE (b1->current_length < advance1 + sizeof (*h1))) + { + error1 = UDP_ERROR_LENGTH_ERROR; + next1 = UDP_INPUT_NEXT_DROP; + } + else + { + vlib_buffer_advance (b1, advance1); + h1 = vlib_buffer_get_current (b1); + error1 = next1 = 0; + if (PREDICT_FALSE (clib_net_to_host_u16 (h1->length) > + vlib_buffer_length_in_chain (vm, b1))) + { + error1 = UDP_ERROR_LENGTH_ERROR; + next1 = UDP_INPUT_NEXT_DROP; + } + } + + /* Index sparse array with network byte order. */ + dst_port0 = (error0 == 0) ? h0->dst_port : 0; + dst_port1 = (error1 == 0) ? h1->dst_port : 0; + sparse_vec_index2 (rt->next_by_dst_port, dst_port0, dst_port1, + &i0, &i1); + next0 = (error0 == 0) ? vec_elt (rt->next_by_dst_port, i0) : next0; + next1 = (error1 == 0) ? vec_elt (rt->next_by_dst_port, i1) : next1; + + if (PREDICT_FALSE (i0 == SPARSE_VEC_INVALID_INDEX)) + { + // move the pointer back so icmp-error can find the + // ip packet header + vlib_buffer_advance (b0, -(word) advance0); + + if (PREDICT_FALSE (punt_unknown)) + { + b0->error = node->errors[UDP_ERROR_PUNT]; + next0 = UDP_INPUT_NEXT_PUNT; + } + else if (is_ip4) + { + icmp4_error_set_vnet_buffer (b0, + ICMP4_destination_unreachable, + ICMP4_destination_unreachable_port_unreachable, + 0); + next0 = UDP_INPUT_NEXT_ICMP4_ERROR; + n_no_listener++; + } + else + { + icmp6_error_set_vnet_buffer (b0, + ICMP6_destination_unreachable, + ICMP6_destination_unreachable_port_unreachable, + 0); + next0 = UDP_INPUT_NEXT_ICMP6_ERROR; + n_no_listener++; + } + } + else + { + b0->error = node->errors[UDP_ERROR_NONE]; + // advance to the payload + vlib_buffer_advance (b0, sizeof (*h0)); + } + + if (PREDICT_FALSE (i1 == SPARSE_VEC_INVALID_INDEX)) + { + // move the pointer back so icmp-error can find the + // ip packet header + vlib_buffer_advance (b1, -(word) advance1); + + if (PREDICT_FALSE (punt_unknown)) + { + b1->error = node->errors[UDP_ERROR_PUNT]; + next1 = UDP_INPUT_NEXT_PUNT; + } + else if (is_ip4) + { + icmp4_error_set_vnet_buffer (b1, + ICMP4_destination_unreachable, + ICMP4_destination_unreachable_port_unreachable, + 0); + next1 = UDP_INPUT_NEXT_ICMP4_ERROR; + n_no_listener++; + } + else + { + icmp6_error_set_vnet_buffer (b1, + ICMP6_destination_unreachable, + ICMP6_destination_unreachable_port_unreachable, + 0); + next1 = UDP_INPUT_NEXT_ICMP6_ERROR; + n_no_listener++; + } + } + else + { + b1->error = node->errors[UDP_ERROR_NONE]; + // advance to the payload + vlib_buffer_advance (b1, sizeof (*h1)); + } + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + udp_rx_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + if (b0->error != node->errors[UDP_ERROR_LENGTH_ERROR]) + { + tr->src_port = h0 ? h0->src_port : 0; + tr->dst_port = h0 ? h0->dst_port : 0; + tr->bound = (next0 != UDP_INPUT_NEXT_ICMP4_ERROR && + next0 != UDP_INPUT_NEXT_ICMP6_ERROR); + } + } + if (PREDICT_FALSE (b1->flags & VLIB_BUFFER_IS_TRACED)) + { + udp_rx_trace_t *tr = vlib_add_trace (vm, node, + b1, sizeof (*tr)); + if (b1->error != node->errors[UDP_ERROR_LENGTH_ERROR]) + { + tr->src_port = h1 ? h1->src_port : 0; + tr->dst_port = h1 ? h1->dst_port : 0; + tr->bound = (next1 != UDP_INPUT_NEXT_ICMP4_ERROR && + next1 != UDP_INPUT_NEXT_ICMP6_ERROR); + } + } + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t *b0; + udp_header_t *h0 = 0; + u32 i0, next0; + u32 advance0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + /* ip4/6_local hands us the ip header, not the udp header */ + if (is_ip4) + advance0 = sizeof (ip4_header_t); + else + advance0 = sizeof (ip6_header_t); + + if (PREDICT_FALSE (b0->current_length < advance0 + sizeof (*h0))) + { + b0->error = node->errors[UDP_ERROR_LENGTH_ERROR]; + next0 = UDP_INPUT_NEXT_DROP; + goto trace_x1; + } + + vlib_buffer_advance (b0, advance0); + + h0 = vlib_buffer_get_current (b0); + + if (PREDICT_TRUE (clib_net_to_host_u16 (h0->length) <= + vlib_buffer_length_in_chain (vm, b0))) + { + i0 = sparse_vec_index (rt->next_by_dst_port, h0->dst_port); + next0 = vec_elt (rt->next_by_dst_port, i0); + + if (PREDICT_FALSE (i0 == SPARSE_VEC_INVALID_INDEX)) + { + // move the pointer back so icmp-error can find the + // ip packet header + vlib_buffer_advance (b0, -(word) advance0); + + if (PREDICT_FALSE (punt_unknown)) + { + b0->error = node->errors[UDP_ERROR_PUNT]; + next0 = UDP_INPUT_NEXT_PUNT; + } + else if (is_ip4) + { + icmp4_error_set_vnet_buffer (b0, + ICMP4_destination_unreachable, + ICMP4_destination_unreachable_port_unreachable, + 0); + next0 = UDP_INPUT_NEXT_ICMP4_ERROR; + n_no_listener++; + } + else + { + icmp6_error_set_vnet_buffer (b0, + ICMP6_destination_unreachable, + ICMP6_destination_unreachable_port_unreachable, + 0); + next0 = UDP_INPUT_NEXT_ICMP6_ERROR; + n_no_listener++; + } + } + else + { + b0->error = node->errors[UDP_ERROR_NONE]; + // advance to the payload + vlib_buffer_advance (b0, sizeof (*h0)); + } + } + else + { + b0->error = node->errors[UDP_ERROR_LENGTH_ERROR]; + next0 = UDP_INPUT_NEXT_DROP; + } + + trace_x1: + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + udp_rx_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + if (b0->error != node->errors[UDP_ERROR_LENGTH_ERROR]) + { + tr->src_port = h0->src_port; + tr->dst_port = h0->dst_port; + tr->bound = (next0 != UDP_INPUT_NEXT_ICMP4_ERROR && + next0 != UDP_INPUT_NEXT_ICMP6_ERROR); + } + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + vlib_error_count (vm, node->node_index, UDP_ERROR_NO_LISTENER, + n_no_listener); + return from_frame->n_vectors; +} + +static char *udp_error_strings[] = { +#define udp_error(n,s) s, +#include "udp_error.def" +#undef udp_error +}; + +static uword +udp4_input (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * from_frame) +{ + return udp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */ ); +} + +static uword +udp6_input (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * from_frame) +{ + return udp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */ ); +} + + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (udp4_input_node) = { + .function = udp4_input, + .name = "ip4-udp-lookup", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + + .runtime_data_bytes = sizeof (udp_input_runtime_t), + + .n_errors = UDP_N_ERROR, + .error_strings = udp_error_strings, + + .n_next_nodes = UDP_INPUT_N_NEXT, + .next_nodes = { +#define _(s,n) [UDP_INPUT_NEXT_##s] = n, + foreach_udp_input_next +#undef _ + }, + + .format_buffer = format_udp_header, + .format_trace = format_udp_rx_trace, + .unformat_buffer = unformat_udp_header, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (udp4_input_node, udp4_input); + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (udp6_input_node) = { + .function = udp6_input, + .name = "ip6-udp-lookup", + /* Takes a vector of packets. */ + .vector_size = sizeof (u32), + + .runtime_data_bytes = sizeof (udp_input_runtime_t), + + .n_errors = UDP_N_ERROR, + .error_strings = udp_error_strings, + + .n_next_nodes = UDP_INPUT_N_NEXT, + .next_nodes = { +#define _(s,n) [UDP_INPUT_NEXT_##s] = n, + foreach_udp_input_next +#undef _ + }, + + .format_buffer = format_udp_header, + .format_trace = format_udp_rx_trace, + .unformat_buffer = unformat_udp_header, +}; +/* *INDENT-ON* */ + +VLIB_NODE_FUNCTION_MULTIARCH (udp6_input_node, udp6_input); + +static void +add_dst_port (udp_main_t * um, + udp_dst_port_t dst_port, char *dst_port_name, u8 is_ip4) +{ + udp_dst_port_info_t *pi; + u32 i; + + vec_add2 (um->dst_port_infos[is_ip4], pi, 1); + i = pi - um->dst_port_infos[is_ip4]; + + pi->name = dst_port_name; + pi->dst_port = dst_port; + pi->next_index = pi->node_index = ~0; + + hash_set (um->dst_port_info_by_dst_port[is_ip4], dst_port, i); + + if (pi->name) + hash_set_mem (um->dst_port_info_by_name[is_ip4], pi->name, i); +} + +void +udp_register_dst_port (vlib_main_t * vm, + udp_dst_port_t dst_port, u32 node_index, u8 is_ip4) +{ + udp_main_t *um = &udp_main; + udp_dst_port_info_t *pi; + udp_input_runtime_t *rt; + u16 *n; + + { + clib_error_t *error = vlib_call_init_function (vm, udp_local_init); + if (error) + clib_error_report (error); + } + + pi = udp_get_dst_port_info (um, dst_port, is_ip4); + if (!pi) + { + add_dst_port (um, dst_port, 0, is_ip4); + pi = udp_get_dst_port_info (um, dst_port, is_ip4); + ASSERT (pi); + } + + pi->node_index = node_index; + pi->next_index = vlib_node_add_next (vm, + is_ip4 ? udp4_input_node.index + : udp6_input_node.index, node_index); + + /* Setup udp protocol -> next index sparse vector mapping. */ + rt = vlib_node_get_runtime_data + (vm, is_ip4 ? udp4_input_node.index : udp6_input_node.index); + n = sparse_vec_validate (rt->next_by_dst_port, + clib_host_to_net_u16 (dst_port)); + n[0] = pi->next_index; +} + +void +udp_unregister_dst_port (vlib_main_t * vm, udp_dst_port_t dst_port, u8 is_ip4) +{ + udp_main_t *um = &udp_main; + udp_dst_port_info_t *pi; + udp_input_runtime_t *rt; + u16 *n; + + pi = udp_get_dst_port_info (um, dst_port, is_ip4); + /* Not registered? Fagedaboudit */ + if (!pi) + return; + + /* Kill the mapping. Don't bother killing the pi, it may be back. */ + rt = vlib_node_get_runtime_data + (vm, is_ip4 ? udp4_input_node.index : udp6_input_node.index); + n = sparse_vec_validate (rt->next_by_dst_port, + clib_host_to_net_u16 (dst_port)); + n[0] = SPARSE_VEC_INVALID_INDEX; +} + +void +udp_punt_unknown (vlib_main_t * vm, u8 is_ip4, u8 is_add) +{ + udp_input_runtime_t *rt; + + { + clib_error_t *error = vlib_call_init_function (vm, udp_local_init); + if (error) + clib_error_report (error); + } + + rt = vlib_node_get_runtime_data + (vm, is_ip4 ? udp4_input_node.index : udp6_input_node.index); + + rt->punt_unknown = is_add; +} + +/* Parse a UDP header. */ +uword +unformat_udp_header (unformat_input_t * input, va_list * args) +{ + u8 **result = va_arg (*args, u8 **); + udp_header_t *udp; + __attribute__ ((unused)) int old_length; + u16 src_port, dst_port; + + /* Allocate space for IP header. */ + { + void *p; + + old_length = vec_len (*result); + vec_add2 (*result, p, sizeof (ip4_header_t)); + udp = p; + } + + memset (udp, 0, sizeof (udp[0])); + if (unformat (input, "src-port %d dst-port %d", &src_port, &dst_port)) + { + udp->src_port = clib_host_to_net_u16 (src_port); + udp->dst_port = clib_host_to_net_u16 (dst_port); + return 1; + } + return 0; +} + +static void +udp_setup_node (vlib_main_t * vm, u32 node_index) +{ + vlib_node_t *n = vlib_get_node (vm, node_index); + pg_node_t *pn = pg_get_node (node_index); + + n->format_buffer = format_udp_header; + n->unformat_buffer = unformat_udp_header; + pn->unformat_edit = unformat_pg_udp_header; +} + +clib_error_t * +udp_local_init (vlib_main_t * vm) +{ + udp_input_runtime_t *rt; + udp_main_t *um = &udp_main; + int i; + + { + clib_error_t *error; + error = vlib_call_init_function (vm, udp_init); + if (error) + clib_error_report (error); + } + + + for (i = 0; i < 2; i++) + { + um->dst_port_info_by_name[i] = hash_create_string (0, sizeof (uword)); + um->dst_port_info_by_dst_port[i] = hash_create (0, sizeof (uword)); + } + + udp_setup_node (vm, udp4_input_node.index); + udp_setup_node (vm, udp6_input_node.index); + + rt = vlib_node_get_runtime_data (vm, udp4_input_node.index); + + rt->next_by_dst_port = sparse_vec_new + ( /* elt bytes */ sizeof (rt->next_by_dst_port[0]), + /* bits in index */ BITS (((udp_header_t *) 0)->dst_port)); + + rt->punt_unknown = 0; + +#define _(n,s) add_dst_port (um, UDP_DST_PORT_##s, #s, 1 /* is_ip4 */); + foreach_udp4_dst_port +#undef _ + rt = vlib_node_get_runtime_data (vm, udp6_input_node.index); + + rt->next_by_dst_port = sparse_vec_new + ( /* elt bytes */ sizeof (rt->next_by_dst_port[0]), + /* bits in index */ BITS (((udp_header_t *) 0)->dst_port)); + + rt->punt_unknown = 0; + +#define _(n,s) add_dst_port (um, UDP_DST_PORT_##s, #s, 0 /* is_ip4 */); + foreach_udp6_dst_port +#undef _ + ip4_register_protocol (IP_PROTOCOL_UDP, udp4_input_node.index); + /* Note: ip6 differs from ip4, UDP is hotwired to ip6-udp-lookup */ + return 0; +} + +VLIB_INIT_FUNCTION (udp_local_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/udp/udp_packet.h b/src/vnet/udp/udp_packet.h new file mode 100644 index 00000000..beea3059 --- /dev/null +++ b/src/vnet/udp/udp_packet.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip4/udp_packet.h: UDP packet format + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_udp_packet_h +#define included_udp_packet_h + +typedef struct +{ + /* Source and destination port. */ + u16 src_port, dst_port; + + /* Length of UDP header plus payload. */ + u16 length; + + /* Checksum of UDP pseudo-header and data or + zero if checksum is disabled. */ + u16 checksum; +} udp_header_t; + +#endif /* included_udp_packet_h */ + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/udp/udp_pg.c b/src/vnet/udp/udp_pg.c new file mode 100644 index 00000000..c9d8d38c --- /dev/null +++ b/src/vnet/udp/udp_pg.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/udp_pg: UDP packet-generator interface + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include /* for unformat_udp_udp_port */ + +#define UDP_PG_EDIT_LENGTH (1 << 0) +#define UDP_PG_EDIT_CHECKSUM (1 << 1) + +always_inline void +udp_pg_edit_function_inline (pg_main_t * pg, + pg_stream_t * s, + pg_edit_group_t * g, + u32 * packets, u32 n_packets, u32 flags) +{ + vlib_main_t *vm = vlib_get_main (); + u32 ip_offset, udp_offset; + + udp_offset = g->start_byte_offset; + ip_offset = (g - 1)->start_byte_offset; + + while (n_packets >= 1) + { + vlib_buffer_t *p0; + ip4_header_t *ip0; + udp_header_t *udp0; + u32 udp_len0; + + p0 = vlib_get_buffer (vm, packets[0]); + n_packets -= 1; + packets += 1; + + ip0 = (void *) (p0->data + ip_offset); + udp0 = (void *) (p0->data + udp_offset); + udp_len0 = clib_net_to_host_u16 (ip0->length) - sizeof (ip0[0]); + + if (flags & UDP_PG_EDIT_LENGTH) + udp0->length = + clib_net_to_host_u16 (vlib_buffer_length_in_chain (vm, p0) + - ip_offset); + + /* Initialize checksum with header. */ + if (flags & UDP_PG_EDIT_CHECKSUM) + { + ip_csum_t sum0; + + sum0 = clib_mem_unaligned (&ip0->src_address, u64); + + sum0 = ip_csum_with_carry + (sum0, clib_host_to_net_u32 (udp_len0 + (ip0->protocol << 16))); + + /* Invalidate possibly old checksum. */ + udp0->checksum = 0; + + sum0 = + ip_incremental_checksum_buffer (vm, p0, udp_offset, udp_len0, + sum0); + + sum0 = ~ip_csum_fold (sum0); + + /* Zero checksum means checksumming disabled. */ + sum0 = sum0 != 0 ? sum0 : 0xffff; + + udp0->checksum = sum0; + } + } +} + +static void +udp_pg_edit_function (pg_main_t * pg, + pg_stream_t * s, + pg_edit_group_t * g, u32 * packets, u32 n_packets) +{ + switch (g->edit_function_opaque) + { + case UDP_PG_EDIT_LENGTH: + udp_pg_edit_function_inline (pg, s, g, packets, n_packets, + UDP_PG_EDIT_LENGTH); + break; + + case UDP_PG_EDIT_CHECKSUM: + udp_pg_edit_function_inline (pg, s, g, packets, n_packets, + UDP_PG_EDIT_CHECKSUM); + break; + + case UDP_PG_EDIT_CHECKSUM | UDP_PG_EDIT_LENGTH: + udp_pg_edit_function_inline (pg, s, g, packets, n_packets, + UDP_PG_EDIT_CHECKSUM | UDP_PG_EDIT_LENGTH); + break; + + default: + ASSERT (0); + break; + } +} + +typedef struct +{ + pg_edit_t src_port, dst_port; + pg_edit_t length; + pg_edit_t checksum; +} pg_udp_header_t; + +static inline void +pg_udp_header_init (pg_udp_header_t * p) +{ + /* Initialize fields that are not bit fields in the IP header. */ +#define _(f) pg_edit_init (&p->f, udp_header_t, f); + _(src_port); + _(dst_port); + _(length); + _(checksum); +#undef _ +} + +uword +unformat_pg_udp_header (unformat_input_t * input, va_list * args) +{ + pg_stream_t *s = va_arg (*args, pg_stream_t *); + pg_udp_header_t *p; + u32 group_index; + + p = pg_create_edit_group (s, sizeof (p[0]), sizeof (udp_header_t), + &group_index); + pg_udp_header_init (p); + + /* Defaults. */ + p->checksum.type = PG_EDIT_UNSPECIFIED; + p->length.type = PG_EDIT_UNSPECIFIED; + + if (!unformat (input, "UDP: %U -> %U", + unformat_pg_edit, + unformat_tcp_udp_port, &p->src_port, + unformat_pg_edit, unformat_tcp_udp_port, &p->dst_port)) + goto error; + + /* Parse options. */ + while (1) + { + if (unformat (input, "length %U", + unformat_pg_edit, unformat_pg_number, &p->length)) + ; + + else if (unformat (input, "checksum %U", + unformat_pg_edit, unformat_pg_number, &p->checksum)) + ; + + /* Can't parse input: try next protocol level. */ + else + break; + } + + { + ip_main_t *im = &ip_main; + u16 dst_port; + tcp_udp_port_info_t *pi; + + pi = 0; + if (p->dst_port.type == PG_EDIT_FIXED) + { + dst_port = pg_edit_get_value (&p->dst_port, PG_EDIT_LO); + pi = ip_get_tcp_udp_port_info (im, dst_port); + } + + if (pi && pi->unformat_pg_edit + && unformat_user (input, pi->unformat_pg_edit, s)) + ; + + else if (!unformat_user (input, unformat_pg_payload, s)) + goto error; + + p = pg_get_edit_group (s, group_index); + if (p->checksum.type == PG_EDIT_UNSPECIFIED + || p->length.type == PG_EDIT_UNSPECIFIED) + { + pg_edit_group_t *g = pg_stream_get_group (s, group_index); + g->edit_function = udp_pg_edit_function; + g->edit_function_opaque = 0; + if (p->checksum.type == PG_EDIT_UNSPECIFIED) + g->edit_function_opaque |= UDP_PG_EDIT_CHECKSUM; + if (p->length.type == PG_EDIT_UNSPECIFIED) + g->edit_function_opaque |= UDP_PG_EDIT_LENGTH; + } + + return 1; + } + +error: + /* Free up any edits we may have added. */ + pg_free_edit_group (s); + return 0; +} + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/vnet_all_api_h.h b/src/vnet/vnet_all_api_h.h index 142acedc..c4075db6 100644 --- a/src/vnet/vnet_all_api_h.h +++ b/src/vnet/vnet_all_api_h.h @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include diff --git a/src/vnet/vxlan-gpe/vxlan_gpe.h b/src/vnet/vxlan-gpe/vxlan_gpe.h index 1b4bc44e..e768d230 100644 --- a/src/vnet/vxlan-gpe/vxlan_gpe.h +++ b/src/vnet/vxlan-gpe/vxlan_gpe.h @@ -29,7 +29,7 @@ #include #include #include -#include +#include /** * @brief VXLAN GPE header struct diff --git a/src/vnet/vxlan/vxlan.h b/src/vnet/vxlan/vxlan.h index adfa3a8e..dca1cd12 100644 --- a/src/vnet/vxlan/vxlan.h +++ b/src/vnet/vxlan/vxlan.h @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/vpp/api/vpe.api b/src/vpp/api/vpe.api index 24f48293..2d6e4f37 100644 --- a/src/vpp/api/vpe.api +++ b/src/vpp/api/vpe.api @@ -38,6 +38,7 @@ * IPSEC-GRE APIs: see .../src/vnet/ipsec-gre/{ipsec_gre.api, ipsec_gre_api.c} * LISP APIs: see .../src/vnet/lisp/{lisp.api, lisp_api.c} * LISP-GPE APIs: see .../src/vnet/lisp-gpe/{lisp_gpe.api, lisp_gpe_api.c} + * SESSION APIs: .../vnet/session/{session.api session_api.c} * MPLS APIs: see .../src/vnet/mpls/{mpls.api, mpls_api.c} * SR APIs: see .../src/vnet/sr/{sr.api, sr_api.c} * DPDK APIs: see ... /src/vnet/devices/dpdk/{dpdk.api, dpdk_api.c} diff --git a/src/vppinfra.am b/src/vppinfra.am index 8d375958..4b9f0c29 100644 --- a/src/vppinfra.am +++ b/src/vppinfra.am @@ -157,7 +157,9 @@ nobase_include_HEADERS = \ vppinfra/asm_mips.h \ vppinfra/asm_x86.h \ vppinfra/bihash_8_8.h \ + vppinfra/bihash_16_8.h \ vppinfra/bihash_24_8.h \ + vppinfra/bihash_48_8.h \ vppinfra/bihash_template.h \ vppinfra/bihash_template.c \ vppinfra/bitmap.h \ @@ -206,6 +208,7 @@ nobase_include_HEADERS = \ vppinfra/timer.h \ vppinfra/tw_timer_2t_1w_2048sl.h \ vppinfra/tw_timer_16t_2w_512sl.h \ + vppinfra/tw_timer_16t_1w_2048sl.h \ vppinfra/tw_timer_template.h \ vppinfra/tw_timer_template.c \ vppinfra/types.h \ @@ -261,6 +264,8 @@ CLIB_CORE = \ vppinfra/tw_timer_2t_1w_2048sl.c \ vppinfra/tw_timer_16t_2w_512sl.h \ vppinfra/tw_timer_16t_2w_512sl.c \ + vppinfra/tw_timer_16t_1w_2048sl.h \ + vppinfra/tw_timer_16t_1w_2048sl.c \ vppinfra/unformat.c \ vppinfra/vec.c \ vppinfra/vector.c \ diff --git a/src/vppinfra/bihash_16_8.h b/src/vppinfra/bihash_16_8.h new file mode 100644 index 00000000..ce80f70e --- /dev/null +++ b/src/vppinfra/bihash_16_8.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#undef BIHASH_TYPE + +#define BIHASH_TYPE _16_8 +#define BIHASH_KVP_PER_PAGE 4 + +#ifndef __included_bihash_16_8_h__ +#define __included_bihash_16_8_h__ + +#include +#include +#include +#include + +typedef struct +{ + u64 key[2]; + u64 value; +} clib_bihash_kv_16_8_t; + +static inline int +clib_bihash_is_free_16_8 (clib_bihash_kv_16_8_t * v) +{ + /* Free values are memset to 0xff, check a bit... */ + if (v->key[0] == ~0ULL && v->value == ~0ULL) + return 1; + return 0; +} + +#if __SSE4_2__ +#ifndef __defined_crc_u32__ +#define __defined_crc_u32__ +static inline u32 +crc_u32 (u32 data, u32 value) +{ + __asm__ volatile ("crc32l %[data], %[value];":[value] "+r" (value):[data] + "rm" (data)); + return value; +} +#endif /* __defined_crc_u32__ */ + +static inline u64 +clib_bihash_hash_16_8 (clib_bihash_kv_16_8_t * v) +{ + u32 *dp = (u32 *) & v->key[0]; + u32 value = 0; + + value = crc_u32 (dp[0], value); + value = crc_u32 (dp[1], value); + value = crc_u32 (dp[2], value); + value = crc_u32 (dp[3], value); + + return value; +} +#else +static inline u64 +clib_bihash_hash_16_8 (clib_bihash_kv_16_8_t * v) +{ + u64 tmp = v->key[0] ^ v->key[1]; + return clib_xxhash (tmp); +} +#endif + +static inline u8 * +format_bihash_kvp_16_8 (u8 * s, va_list * args) +{ + clib_bihash_kv_16_8_t *v = va_arg (*args, clib_bihash_kv_16_8_t *); + + s = format (s, "key %llu %llu value %llu", v->key[0], v->key[1], v->value); + return s; +} + +static inline int +clib_bihash_key_compare_16_8 (u64 * a, u64 * b) +{ + return ((a[0] ^ b[0]) | (a[1] ^ b[1])) == 0; +} + +#undef __included_bihash_template_h__ +#include + +#endif /* __included_bihash_16_8_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vppinfra/bihash_48_8.h b/src/vppinfra/bihash_48_8.h new file mode 100644 index 00000000..1a6e7691 --- /dev/null +++ b/src/vppinfra/bihash_48_8.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#undef BIHASH_TYPE + +#define BIHASH_TYPE _48_8 +#define BIHASH_KVP_PER_PAGE 4 + +#ifndef __included_bihash_48_8_h__ +#define __included_bihash_48_8_h__ + +#include +#include +#include +#include + +typedef struct +{ + u64 key[6]; + u64 value; +} clib_bihash_kv_48_8_t; + +static inline int +clib_bihash_is_free_48_8 (const clib_bihash_kv_48_8_t * v) +{ + /* Free values are memset to 0xff, check a bit... */ + if (v->key[0] == ~0ULL && v->value == ~0ULL) + return 1; + return 0; +} + +#if __SSE4_2__ +#ifndef __defined_crc_u32__ +#define __defined_crc_u32__ +static inline u32 +crc_u32 (u32 data, u32 value) +{ + __asm__ volatile ("crc32l %[data], %[value];":[value] "+r" (value):[data] + "rm" (data)); + return value; +} +#endif /* __defined_crc_u32__ */ + +static inline u64 +clib_bihash_hash_48_8 (const clib_bihash_kv_48_8_t * v) +{ + const u32 *dp = (const u32 *) &v->key[0]; + u32 value = 0; + + value = crc_u32 (dp[0], value); + value = crc_u32 (dp[1], value); + value = crc_u32 (dp[2], value); + value = crc_u32 (dp[3], value); + value = crc_u32 (dp[4], value); + value = crc_u32 (dp[5], value); + value = crc_u32 (dp[6], value); + value = crc_u32 (dp[7], value); + value = crc_u32 (dp[8], value); + value = crc_u32 (dp[9], value); + value = crc_u32 (dp[10], value); + value = crc_u32 (dp[11], value); + + return value; +} +#else +static inline u64 +clib_bihash_hash_48_8 (const clib_bihash_kv_48_8_t * v) +{ + u64 tmp = v->key[0] ^ v->key[1] ^ v->key[2] ^ v->key[3] ^ v->key[4] + ^ v->key[5]; + return clib_xxhash (tmp); +} +#endif + +static inline u8 * +format_bihash_kvp_48_8 (u8 * s, va_list * args) +{ + clib_bihash_kv_48_8_t *v = va_arg (*args, clib_bihash_kv_48_8_t *); + + s = format (s, "key %llu %llu %llu %llu %llu %llu value %llu", v->key[0], + v->key[1], v->key[2], v->key[3], v->key[4], v->key[5], + v->value); + return s; +} + +static inline int +clib_bihash_key_compare_48_8 (const u64 * a, const u64 * b) +{ + return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2]) | (a[3] ^ b[3]) + | (a[4] ^ b[4]) | (a[5] ^ b[5])) == 0; +} + +#undef __included_bihash_template_h__ +#include + +#endif /* __included_bihash_48_8_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vppinfra/tw_timer_16t_1w_2048sl.c b/src/vppinfra/tw_timer_16t_1w_2048sl.c new file mode 100644 index 00000000..3f342045 --- /dev/null +++ b/src/vppinfra/tw_timer_16t_1w_2048sl.c @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "tw_timer_16t_1w_2048sl.h" +#include "tw_timer_template.c" + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vppinfra/tw_timer_16t_1w_2048sl.h b/src/vppinfra/tw_timer_16t_1w_2048sl.h new file mode 100644 index 00000000..685ac31e --- /dev/null +++ b/src/vppinfra/tw_timer_16t_1w_2048sl.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __included_tw_timer_16t_2w_512sl_h__ +#define __included_tw_timer_16t_2w_512sl_h__ + +/* ... So that a client app can create multiple wheel geometries */ +#undef TW_TIMER_WHEELS +#undef TW_SLOTS_PER_RING +#undef TW_RING_SHIFT +#undef TW_RING_MASK +#undef TW_TIMERS_PER_OBJECT +#undef LOG2_TW_TIMERS_PER_OBJECT +#undef TW_SUFFIX + +#define TW_TIMER_WHEELS 1 +#define TW_SLOTS_PER_RING 2048 +#define TW_RING_SHIFT 11 +#define TW_RING_MASK (TW_SLOTS_PER_RING -1) +#define TW_TIMERS_PER_OBJECT 16 +#define LOG2_TW_TIMERS_PER_OBJECT 4 +#define TW_SUFFIX _16t_1w_2048sl + +#include + +#endif /* __included_tw_timer_16t_2w_512sl_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ -- cgit 1.2.3-korg From a1a093d4e46e38503332a97ad216f80053a15f2b Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Thu, 2 Mar 2017 13:13:23 -0500 Subject: Clean up binary api message handler registration issues Removed a fair number of "BUG" message handlers, due to conflicts with actual message handlers in api_format.c. Vpp itself had no business receiving certain messages, up to the point where we started building in relevant code from vpp_api_test. Eliminated all but one duplicate registration complaint. That one needs attention from the vxlan team since the duplicated handlers have diverged. Change-Id: Iafce5429d2f906270643b4ea5f0130e20beb4d1d Signed-off-by: Dave Barach --- src/vat/api_format.c | 43 ++++++- src/vlib/unix/input.c | 31 ++++- src/vlibapi/api.h | 15 +++ src/vlibapi/api_shared.c | 4 + src/vnet/classify/classify_api.c | 8 -- src/vnet/devices/virtio/vhost_user_api.c | 10 +- src/vnet/dhcp/dhcp_api.c | 8 -- src/vnet/interface_api.c | 7 - src/vnet/ip/ip_api.c | 83 ------------ src/vnet/l2/l2_api.c | 22 ---- src/vnet/mpls/mpls_api.c | 28 +--- src/vpp/api/api.c | 211 ------------------------------- src/vpp/api/api_main.c | 1 - src/vpp/stats/stats.c | 7 - 14 files changed, 81 insertions(+), 397 deletions(-) (limited to 'src/vlib') diff --git a/src/vat/api_format.c b/src/vat/api_format.c index 1321bade..52436917 100644 --- a/src/vat/api_format.c +++ b/src/vat/api_format.c @@ -944,6 +944,7 @@ static void vl_api_sw_interface_details_t_handler_json } } +#if VPP_API_TEST_BUILTIN == 0 static void vl_api_sw_interface_set_flags_t_handler (vl_api_sw_interface_set_flags_t * mp) { @@ -954,6 +955,7 @@ static void vl_api_sw_interface_set_flags_t_handler mp->admin_up_down ? "admin-up" : "admin-down", mp->link_up_down ? "link-up" : "link-down"); } +#endif static void vl_api_sw_interface_set_flags_t_handler_json (vl_api_sw_interface_set_flags_t * mp) @@ -4009,7 +4011,6 @@ foreach_standard_reply_retval_handler; #define foreach_vpe_api_reply_msg \ _(CREATE_LOOPBACK_REPLY, create_loopback_reply) \ _(SW_INTERFACE_DETAILS, sw_interface_details) \ -_(SW_INTERFACE_SET_FLAGS, sw_interface_set_flags) \ _(SW_INTERFACE_SET_FLAGS_REPLY, sw_interface_set_flags_reply) \ _(CONTROL_PING_REPLY, control_ping_reply) \ _(CLI_REPLY, cli_reply) \ @@ -4126,11 +4127,6 @@ _(IKEV2_INITIATE_REKEY_CHILD_SA_REPLY, ikev2_initiate_rekey_child_sa_reply) \ _(DELETE_LOOPBACK_REPLY, delete_loopback_reply) \ _(BD_IP_MAC_ADD_DEL_REPLY, bd_ip_mac_add_del_reply) \ _(DHCP_COMPL_EVENT, dhcp_compl_event) \ -_(VNET_INTERFACE_COUNTERS, vnet_interface_counters) \ -_(VNET_IP4_FIB_COUNTERS, vnet_ip4_fib_counters) \ -_(VNET_IP6_FIB_COUNTERS, vnet_ip6_fib_counters) \ -_(VNET_IP4_NBR_COUNTERS, vnet_ip4_nbr_counters) \ -_(VNET_IP6_NBR_COUNTERS, vnet_ip6_nbr_counters) \ _(MAP_ADD_DOMAIN_REPLY, map_add_domain_reply) \ _(MAP_DEL_DOMAIN_REPLY, map_del_domain_reply) \ _(MAP_ADD_DEL_RULE_REPLY, map_add_del_rule_reply) \ @@ -4232,6 +4228,14 @@ _(SW_INTERFACE_SET_MTU_REPLY, sw_interface_set_mtu_reply) \ _(IP_NEIGHBOR_DETAILS, ip_neighbor_details) \ _(SW_INTERFACE_GET_TABLE_REPLY, sw_interface_get_table_reply) +#define foreach_standalone_reply_msg \ +_(SW_INTERFACE_SET_FLAGS, sw_interface_set_flags) \ +_(VNET_INTERFACE_COUNTERS, vnet_interface_counters) \ +_(VNET_IP4_FIB_COUNTERS, vnet_ip4_fib_counters) \ +_(VNET_IP6_FIB_COUNTERS, vnet_ip6_fib_counters) \ +_(VNET_IP4_NBR_COUNTERS, vnet_ip4_nbr_counters) \ +_(VNET_IP6_NBR_COUNTERS, vnet_ip6_nbr_counters) + typedef struct { u8 *name; @@ -15425,7 +15429,15 @@ api_af_packet_create (vat_main_t * vam) vec_free (host_if_name); S (mp); - W2 (ret, fprintf (vam->ofp, " new sw_if_index = %d ", vam->sw_if_index)); + + /* *INDENT-OFF* */ + W2 (ret, + ({ + if (ret == 0) + fprintf (vam->ofp ? vam->ofp : stderr, + " new sw_if_index = %d\n", vam->sw_if_index); + })); + /* *INDENT-ON* */ return ret; } @@ -18417,6 +18429,9 @@ _(unset, "usage: unset ") } \ } foreach_vpe_api_reply_msg; +#if VPP_API_TEST_BUILTIN == 0 +foreach_standalone_reply_msg; +#endif #undef _ void @@ -18430,6 +18445,9 @@ vat_api_hookup (vat_main_t * vam) vl_api_##n##_t_print, \ sizeof(vl_api_##n##_t), 1); foreach_vpe_api_reply_msg; +#if VPP_API_TEST_BUILTIN == 0 + foreach_standalone_reply_msg; +#endif #undef _ #if (VPP_API_TEST_BUILTIN==0) @@ -18463,6 +18481,17 @@ vat_api_hookup (vat_main_t * vam) #undef _ } +#if VPP_API_TEST_BUILTIN +static clib_error_t * +vat_api_hookup_shim (vlib_main_t * vm) +{ + vat_api_hookup (&vat_main); + return 0; +} + +VLIB_API_INIT_FUNCTION (vat_api_hookup_shim); +#endif + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vlib/unix/input.c b/src/vlib/unix/input.c index 07096ed2..7b4183a4 100644 --- a/src/vlib/unix/input.c +++ b/src/vlib/unix/input.c @@ -66,6 +66,7 @@ linux_epoll_file_update (unix_file_t * f, unix_file_update_type_t update_type) unix_main_t *um = &unix_main; linux_epoll_main_t *em = &linux_epoll_main; struct epoll_event e; + int op; memset (&e, 0, sizeof (e)); @@ -76,13 +77,29 @@ linux_epoll_file_update (unix_file_t * f, unix_file_update_type_t update_type) e.events |= EPOLLET; e.data.u32 = f - um->file_pool; - if (epoll_ctl (em->epoll_fd, - (update_type == UNIX_FILE_UPDATE_ADD - ? EPOLL_CTL_ADD - : (update_type == UNIX_FILE_UPDATE_MODIFY - ? EPOLL_CTL_MOD - : EPOLL_CTL_DEL)), f->file_descriptor, &e) < 0) - clib_warning ("epoll_ctl"); + op = -1; + + switch (update_type) + { + case UNIX_FILE_UPDATE_ADD: + op = EPOLL_CTL_ADD; + break; + + case UNIX_FILE_UPDATE_MODIFY: + op = EPOLL_CTL_MOD; + break; + + case UNIX_FILE_UPDATE_DELETE: + op = EPOLL_CTL_DEL; + break; + + default: + clib_warning ("unknown update_type %d", update_type); + return; + } + + if (epoll_ctl (em->epoll_fd, op, f->file_descriptor, &e) < 0) + clib_unix_warning ("epoll_ctl"); } static uword diff --git a/src/vlibapi/api.h b/src/vlibapi/api.h index fcb101d7..b40ece15 100644 --- a/src/vlibapi/api.h +++ b/src/vlibapi/api.h @@ -271,6 +271,21 @@ vlib_node_t **vlib_node_unserialize (u8 * vector); #define VLIB_API_INIT_FUNCTION(x) VLIB_DECLARE_INIT_FUNCTION(x,api_init) +/* Call given init function: used for init function dependencies. */ +#define vlib_call_api_init_function(vm, x) \ + ({ \ + extern vlib_init_function_t * _VLIB_INIT_FUNCTION_SYMBOL (x,api_init); \ + vlib_init_function_t * _f = _VLIB_INIT_FUNCTION_SYMBOL (x,api_init); \ + clib_error_t * _error = 0; \ + if (! hash_get (vm->init_functions_called, _f)) \ + { \ + hash_set1 (vm->init_functions_called, _f); \ + _error = _f (vm); \ + } \ + _error; \ + }) + + #endif /* included_api_h */ /* diff --git a/src/vlibapi/api_shared.c b/src/vlibapi/api_shared.c index 1a2740e2..79921afe 100644 --- a/src/vlibapi/api_shared.c +++ b/src/vlibapi/api_shared.c @@ -667,6 +667,10 @@ vl_msg_api_config (vl_msg_api_msg_config_t * c) foreach_msg_api_vector; #undef _ + if (am->msg_names[c->id]) + clib_warning ("BUG: multiple registrations of 'vl_api_%s_t_handler'", + c->name); + am->msg_names[c->id] = c->name; am->msg_handlers[c->id] = c->handler; am->msg_cleanup_handlers[c->id] = c->cleanup; diff --git a/src/vnet/classify/classify_api.c b/src/vnet/classify/classify_api.c index 77a8b434..24c7a2b9 100644 --- a/src/vnet/classify/classify_api.c +++ b/src/vnet/classify/classify_api.c @@ -53,7 +53,6 @@ _(CLASSIFY_TABLE_IDS,classify_table_ids) \ _(CLASSIFY_TABLE_BY_INTERFACE, classify_table_by_interface) \ _(CLASSIFY_TABLE_INFO,classify_table_info) \ _(CLASSIFY_SESSION_DUMP,classify_session_dump) \ -_(CLASSIFY_SESSION_DETAILS,classify_session_details) \ _(POLICER_CLASSIFY_SET_INTERFACE, policer_classify_set_interface) \ _(POLICER_CLASSIFY_DUMP, policer_classify_dump) \ _(FLOW_CLASSIFY_SET_INTERFACE, flow_classify_set_interface) \ @@ -356,13 +355,6 @@ vl_api_classify_table_info_t_handler (vl_api_classify_table_info_t * mp) vl_msg_api_send_shmem (q, (u8 *) & rmp); } -static void -vl_api_classify_session_details_t_handler (vl_api_classify_session_details_t * - mp) -{ - clib_warning ("BUG"); -} - static void send_classify_session_details (unix_shared_memory_queue_t * q, u32 table_id, diff --git a/src/vnet/devices/virtio/vhost_user_api.c b/src/vnet/devices/virtio/vhost_user_api.c index dd517c26..8dbd032b 100644 --- a/src/vnet/devices/virtio/vhost_user_api.c +++ b/src/vnet/devices/virtio/vhost_user_api.c @@ -46,8 +46,7 @@ _(CREATE_VHOST_USER_IF, create_vhost_user_if) \ _(MODIFY_VHOST_USER_IF, modify_vhost_user_if) \ _(DELETE_VHOST_USER_IF, delete_vhost_user_if) \ -_(SW_INTERFACE_VHOST_USER_DUMP, sw_interface_vhost_user_dump) \ -_(SW_INTERFACE_VHOST_USER_DETAILS, sw_interface_vhost_user_details) +_(SW_INTERFACE_VHOST_USER_DUMP, sw_interface_vhost_user_dump) /* * WARNING: replicated pending api refactor completion @@ -148,13 +147,6 @@ vl_api_delete_vhost_user_if_t_handler (vl_api_delete_vhost_user_if_t * mp) } } -static void - vl_api_sw_interface_vhost_user_details_t_handler - (vl_api_sw_interface_vhost_user_details_t * mp) -{ - clib_warning ("BUG"); -} - static void send_sw_interface_vhost_user_details (vpe_api_main_t * am, unix_shared_memory_queue_t * q, diff --git a/src/vnet/dhcp/dhcp_api.c b/src/vnet/dhcp/dhcp_api.c index bdf02cae..ce34f6a4 100644 --- a/src/vnet/dhcp/dhcp_api.c +++ b/src/vnet/dhcp/dhcp_api.c @@ -46,7 +46,6 @@ #define foreach_vpe_api_msg \ _(DHCP_PROXY_CONFIG,dhcp_proxy_config) \ _(DHCP_PROXY_DUMP,dhcp_proxy_dump) \ -_(DHCP_PROXY_DETAILS,dhcp_proxy_details) \ _(DHCP_PROXY_SET_VSS,dhcp_proxy_set_vss) \ _(DHCP_CLIENT_CONFIG, dhcp_client_config) @@ -158,13 +157,6 @@ dhcp_send_details (fib_protocol_t proto, vl_msg_api_send_shmem (q, (u8 *) & mp); } - -static void -vl_api_dhcp_proxy_details_t_handler (vl_api_dhcp_proxy_details_t * mp) -{ - clib_warning ("BUG"); -} - void dhcp_compl_event_callback (u32 client_index, u32 pid, u8 * hostname, u8 is_ipv6, u8 * host_address, u8 * router_address, diff --git a/src/vnet/interface_api.c b/src/vnet/interface_api.c index 63f7cad4..60cd6d40 100644 --- a/src/vnet/interface_api.c +++ b/src/vnet/interface_api.c @@ -50,7 +50,6 @@ _(SW_INTERFACE_SET_FLAGS, sw_interface_set_flags) \ _(SW_INTERFACE_SET_MTU, sw_interface_set_mtu) \ _(WANT_INTERFACE_EVENTS, want_interface_events) \ _(SW_INTERFACE_DUMP, sw_interface_dump) \ -_(SW_INTERFACE_DETAILS, sw_interface_details) \ _(SW_INTERFACE_ADD_DEL_ADDRESS, sw_interface_add_del_address) \ _(SW_INTERFACE_SET_TABLE, sw_interface_set_table) \ _(SW_INTERFACE_GET_TABLE, sw_interface_get_table) \ @@ -684,12 +683,6 @@ out: REPLY_MACRO (VL_API_SW_INTERFACE_TAG_ADD_DEL_REPLY); } -static void -vl_api_sw_interface_details_t_handler (vl_api_sw_interface_details_t * mp) -{ - clib_warning ("BUG"); -} - /* * vpe_api_hookup * Add vpe's API message handlers to the table. diff --git a/src/vnet/ip/ip_api.c b/src/vnet/ip/ip_api.c index 49d941c2..ab164a5f 100644 --- a/src/vnet/ip/ip_api.c +++ b/src/vnet/ip/ip_api.c @@ -59,17 +59,12 @@ #define foreach_ip_api_msg \ _(IP_FIB_DUMP, ip_fib_dump) \ -_(IP_FIB_DETAILS, ip_fib_details) \ _(IP6_FIB_DUMP, ip6_fib_dump) \ -_(IP6_FIB_DETAILS, ip6_fib_details) \ _(IP_MFIB_DUMP, ip_mfib_dump) \ -_(IP_MFIB_DETAILS, ip_mfib_details) \ _(IP6_MFIB_DUMP, ip6_mfib_dump) \ -_(IP6_MFIB_DETAILS, ip6_mfib_details) \ _(IP_NEIGHBOR_DUMP, ip_neighbor_dump) \ _(IP_MROUTE_ADD_DEL, ip_mroute_add_del) \ _(MFIB_SIGNAL_DUMP, mfib_signal_dump) \ -_(IP_NEIGHBOR_DETAILS, ip_neighbor_details) \ _(IP_ADDRESS_DUMP, ip_address_dump) \ _(IP_DUMP, ip_dump) \ _(IP_NEIGHBOR_ADD_DEL, ip_neighbor_add_del) \ @@ -105,12 +100,6 @@ send_ip_neighbor_details (u8 is_ipv6, vl_msg_api_send_shmem (q, (u8 *) & mp); } -static void -vl_api_ip_neighbor_details_t_handler (vl_api_ip_neighbor_details_t * mp) -{ - clib_warning ("BUG"); -} - static void vl_api_ip_neighbor_dump_t_handler (vl_api_ip_neighbor_dump_t * mp) { @@ -185,24 +174,6 @@ copy_fib_next_hop (fib_route_path_encode_t * api_rpath, void *fp_arg) sizeof (api_rpath->rpath.frp_addr.ip6)); } -static void -vl_api_ip_fib_details_t_handler (vl_api_ip_fib_details_t * mp) -{ - clib_warning ("BUG"); -} - -static void -vl_api_ip_fib_details_t_endian (vl_api_ip_fib_details_t * mp) -{ - clib_warning ("BUG"); -} - -static void -vl_api_ip_fib_details_t_print (vl_api_ip_fib_details_t * mp) -{ - clib_warning ("BUG"); -} - static void send_ip_fib_details (vpe_api_main_t * am, unix_shared_memory_queue_t * q, @@ -316,24 +287,6 @@ vl_api_ip_fib_dump_t_handler (vl_api_ip_fib_dump_t * mp) vec_free (lfeis); } -static void -vl_api_ip6_fib_details_t_handler (vl_api_ip6_fib_details_t * mp) -{ - clib_warning ("BUG"); -} - -static void -vl_api_ip6_fib_details_t_endian (vl_api_ip6_fib_details_t * mp) -{ - clib_warning ("BUG"); -} - -static void -vl_api_ip6_fib_details_t_print (vl_api_ip6_fib_details_t * mp) -{ - clib_warning ("BUG"); -} - static void send_ip6_fib_details (vpe_api_main_t * am, unix_shared_memory_queue_t * q, @@ -469,24 +422,6 @@ vl_api_ip6_fib_dump_t_handler (vl_api_ip6_fib_dump_t * mp) /* *INDENT-ON* */ } -static void -vl_api_ip_mfib_details_t_handler (vl_api_ip_mfib_details_t * mp) -{ - clib_warning ("BUG"); -} - -static void -vl_api_ip_mfib_details_t_endian (vl_api_ip_mfib_details_t * mp) -{ - clib_warning ("BUG"); -} - -static void -vl_api_ip_mfib_details_t_print (vl_api_ip_mfib_details_t * mp) -{ - clib_warning ("BUG"); -} - static void send_ip_mfib_details (vpe_api_main_t * am, unix_shared_memory_queue_t * q, @@ -591,24 +526,6 @@ vl_api_ip_mfib_dump_t_handler (vl_api_ip_mfib_dump_t * mp) vec_free (api_rpaths); } -static void -vl_api_ip6_mfib_details_t_handler (vl_api_ip6_mfib_details_t * mp) -{ - clib_warning ("BUG"); -} - -static void -vl_api_ip6_mfib_details_t_endian (vl_api_ip6_mfib_details_t * mp) -{ - clib_warning ("BUG"); -} - -static void -vl_api_ip6_mfib_details_t_print (vl_api_ip6_mfib_details_t * mp) -{ - clib_warning ("BUG"); -} - static void send_ip6_mfib_details (vpe_api_main_t * am, unix_shared_memory_queue_t * q, diff --git a/src/vnet/l2/l2_api.c b/src/vnet/l2/l2_api.c index a3cc49bf..a985852c 100644 --- a/src/vnet/l2/l2_api.c +++ b/src/vnet/l2/l2_api.c @@ -48,13 +48,10 @@ _(L2_XCONNECT_DUMP, l2_xconnect_dump) \ _(L2_FIB_CLEAR_TABLE, l2_fib_clear_table) \ _(L2_FIB_TABLE_DUMP, l2_fib_table_dump) \ -_(L2_FIB_TABLE_ENTRY, l2_fib_table_entry) \ _(L2FIB_ADD_DEL, l2fib_add_del) \ _(L2_FLAGS, l2_flags) \ _(BRIDGE_DOMAIN_ADD_DEL, bridge_domain_add_del) \ _(BRIDGE_DOMAIN_DUMP, bridge_domain_dump) \ -_(BRIDGE_DOMAIN_DETAILS, bridge_domain_details) \ -_(BRIDGE_DOMAIN_SW_IF_DETAILS, bridge_domain_sw_if_details) \ _(BRIDGE_FLAGS, bridge_flags) \ _(L2_INTERFACE_VLAN_TAG_REWRITE, l2_interface_vlan_tag_rewrite) \ _(L2_INTERFACE_PBB_TAG_REWRITE, l2_interface_pbb_tag_rewrite) @@ -140,12 +137,6 @@ send_l2fib_table_entry (vpe_api_main_t * am, vl_msg_api_send_shmem (q, (u8 *) & mp); } -static void -vl_api_l2_fib_table_entry_t_handler (vl_api_l2_fib_table_entry_t * mp) -{ - clib_warning ("BUG"); -} - static void vl_api_l2_fib_table_dump_t_handler (vl_api_l2_fib_table_dump_t * mp) { @@ -329,19 +320,6 @@ vl_api_bridge_domain_add_del_t_handler (vl_api_bridge_domain_add_del_t * mp) REPLY_MACRO (VL_API_BRIDGE_DOMAIN_ADD_DEL_REPLY); } -static void -vl_api_bridge_domain_details_t_handler (vl_api_bridge_domain_details_t * mp) -{ - clib_warning ("BUG"); -} - -static void - vl_api_bridge_domain_sw_if_details_t_handler - (vl_api_bridge_domain_sw_if_details_t * mp) -{ - clib_warning ("BUG"); -} - static void send_bridge_domain_details (unix_shared_memory_queue_t * q, l2_bridge_domain_t * bd_config, diff --git a/src/vnet/mpls/mpls_api.c b/src/vnet/mpls/mpls_api.c index ebbeba69..a36a5046 100644 --- a/src/vnet/mpls/mpls_api.c +++ b/src/vnet/mpls/mpls_api.c @@ -50,9 +50,7 @@ _(MPLS_IP_BIND_UNBIND, mpls_ip_bind_unbind) \ _(MPLS_ROUTE_ADD_DEL, mpls_route_add_del) \ _(MPLS_TUNNEL_ADD_DEL, mpls_tunnel_add_del) \ _(MPLS_TUNNEL_DUMP, mpls_tunnel_dump) \ -_(MPLS_TUNNEL_DETAILS, mpls_tunnel_details) \ -_(MPLS_FIB_DUMP, mpls_fib_dump) \ -_(MPLS_FIB_DETAILS, mpls_fib_details) +_(MPLS_FIB_DUMP, mpls_fib_dump) extern void stats_dslock_with_hint (int hint, int tag); extern void stats_dsunlock (void); @@ -280,12 +278,6 @@ vl_api_mpls_tunnel_add_del_t_handler (vl_api_mpls_tunnel_add_del_t * mp) /* *INDENT-ON* */ } -static void -vl_api_mpls_tunnel_details_t_handler (vl_api_mpls_tunnel_details_t * mp) -{ - clib_warning ("BUG"); -} - typedef struct mpls_tunnel_send_walk_ctx_t_ { unix_shared_memory_queue_t *q; @@ -340,24 +332,6 @@ vl_api_mpls_tunnel_dump_t_handler (vl_api_mpls_tunnel_dump_t * mp) mpls_tunnel_walk (send_mpls_tunnel_entry, &ctx); } -static void -vl_api_mpls_fib_details_t_handler (vl_api_mpls_fib_details_t * mp) -{ - clib_warning ("BUG"); -} - -static void -vl_api_mpls_fib_details_t_endian (vl_api_mpls_fib_details_t * mp) -{ - clib_warning ("BUG"); -} - -static void -vl_api_mpls_fib_details_t_print (vl_api_mpls_fib_details_t * mp) -{ - clib_warning ("BUG"); -} - static void send_mpls_fib_details (vpe_api_main_t * am, unix_shared_memory_queue_t * q, diff --git a/src/vpp/api/api.c b/src/vpp/api/api.c index 60fd0199..a8f471e8 100644 --- a/src/vpp/api/api.c +++ b/src/vpp/api/api.c @@ -128,12 +128,8 @@ _(CLASSIFY_SET_INTERFACE_IP_TABLE, classify_set_interface_ip_table) \ _(CLASSIFY_SET_INTERFACE_L2_TABLES, classify_set_interface_l2_tables) \ _(GET_NODE_INDEX, get_node_index) \ _(ADD_NODE_NEXT, add_node_next) \ -_(VXLAN_ADD_DEL_TUNNEL, vxlan_add_del_tunnel) \ -_(VXLAN_TUNNEL_DUMP, vxlan_tunnel_dump) \ _(L2_INTERFACE_EFP_FILTER, l2_interface_efp_filter) \ _(SHOW_VERSION, show_version) \ -_(VXLAN_GPE_ADD_DEL_TUNNEL, vxlan_gpe_add_del_tunnel) \ -_(VXLAN_GPE_TUNNEL_DUMP, vxlan_gpe_tunnel_dump) \ _(INTERFACE_NAME_RENUMBER, interface_name_renumber) \ _(WANT_IP4_ARP_EVENTS, want_ip4_arp_events) \ _(WANT_IP6_ND_EVENTS, want_ip6_nd_events) \ @@ -1436,62 +1432,6 @@ out: /* *INDENT-ON* */ } -static void vl_api_vxlan_add_del_tunnel_t_handler - (vl_api_vxlan_add_del_tunnel_t * mp) -{ - vl_api_vxlan_add_del_tunnel_reply_t *rmp; - int rv = 0; - vnet_vxlan_add_del_tunnel_args_t _a, *a = &_a; - u32 encap_fib_index; - uword *p; - ip4_main_t *im = &ip4_main; - vnet_main_t *vnm = vnet_get_main (); - u32 sw_if_index = ~0; - - p = hash_get (im->fib_index_by_table_id, ntohl (mp->encap_vrf_id)); - if (!p) - { - rv = VNET_API_ERROR_NO_SUCH_FIB; - goto out; - } - encap_fib_index = p[0]; - memset (a, 0, sizeof (*a)); - - a->is_add = mp->is_add; - a->is_ip6 = mp->is_ipv6; - - /* ip addresses sent in network byte order */ - ip46_from_addr_buf (mp->is_ipv6, mp->dst_address, &a->dst); - ip46_from_addr_buf (mp->is_ipv6, mp->src_address, &a->src); - - /* Check src & dst are different */ - if (ip46_address_cmp (&a->dst, &a->src) == 0) - { - rv = VNET_API_ERROR_SAME_SRC_DST; - goto out; - } - a->mcast_sw_if_index = ntohl (mp->mcast_sw_if_index); - if (ip46_address_is_multicast (&a->dst) && - pool_is_free_index (vnm->interface_main.sw_interfaces, - a->mcast_sw_if_index)) - { - rv = VNET_API_ERROR_INVALID_SW_IF_INDEX; - goto out; - } - a->encap_fib_index = encap_fib_index; - a->decap_next_index = ntohl (mp->decap_next_index); - a->vni = ntohl (mp->vni); - rv = vnet_vxlan_add_del_tunnel (a, &sw_if_index); - -out: - /* *INDENT-OFF* */ - REPLY_MACRO2(VL_API_VXLAN_ADD_DEL_TUNNEL_REPLY, - ({ - rmp->sw_if_index = ntohl (sw_if_index); - })); - /* *INDENT-ON* */ -} - static void send_vxlan_tunnel_details (vxlan_tunnel_t * t, unix_shared_memory_queue_t * q, u32 context) { @@ -1525,43 +1465,6 @@ static void send_vxlan_tunnel_details vl_msg_api_send_shmem (q, (u8 *) & rmp); } -static void vl_api_vxlan_tunnel_dump_t_handler - (vl_api_vxlan_tunnel_dump_t * mp) -{ - unix_shared_memory_queue_t *q; - vxlan_main_t *vxm = &vxlan_main; - vxlan_tunnel_t *t; - u32 sw_if_index; - - q = vl_api_client_index_to_input_queue (mp->client_index); - if (q == 0) - { - return; - } - - sw_if_index = ntohl (mp->sw_if_index); - - if (~0 == sw_if_index) - { - /* *INDENT-OFF* */ - pool_foreach (t, vxm->tunnels, - ({ - send_vxlan_tunnel_details(t, q, mp->context); - })); - /* *INDENT-ON* */ - } - else - { - if ((sw_if_index >= vec_len (vxm->tunnel_index_by_sw_if_index)) || - (~0 == vxm->tunnel_index_by_sw_if_index[sw_if_index])) - { - return; - } - t = &vxm->tunnels[vxm->tunnel_index_by_sw_if_index[sw_if_index]]; - send_vxlan_tunnel_details (t, q, mp->context); - } -} - static void vl_api_l2_patch_add_del_t_handler (vl_api_l2_patch_add_del_t * mp) { @@ -1585,83 +1488,6 @@ vl_api_l2_patch_add_del_t_handler (vl_api_l2_patch_add_del_t * mp) REPLY_MACRO (VL_API_L2_PATCH_ADD_DEL_REPLY); } -static void - vl_api_vxlan_gpe_add_del_tunnel_t_handler - (vl_api_vxlan_gpe_add_del_tunnel_t * mp) -{ - vl_api_vxlan_gpe_add_del_tunnel_reply_t *rmp; - int rv = 0; - vnet_vxlan_gpe_add_del_tunnel_args_t _a, *a = &_a; - u32 encap_fib_index, decap_fib_index; - u8 protocol; - uword *p; - ip4_main_t *im = &ip4_main; - u32 sw_if_index = ~0; - - - p = hash_get (im->fib_index_by_table_id, ntohl (mp->encap_vrf_id)); - if (!p) - { - rv = VNET_API_ERROR_NO_SUCH_FIB; - goto out; - } - encap_fib_index = p[0]; - - protocol = mp->protocol; - - /* Interpret decap_vrf_id as an opaque if sending to other-than-ip4-input */ - if (protocol == VXLAN_GPE_INPUT_NEXT_IP4_INPUT) - { - p = hash_get (im->fib_index_by_table_id, ntohl (mp->decap_vrf_id)); - if (!p) - { - rv = VNET_API_ERROR_NO_SUCH_INNER_FIB; - goto out; - } - decap_fib_index = p[0]; - } - else - { - decap_fib_index = ntohl (mp->decap_vrf_id); - } - - /* Check src & dst are different */ - if ((mp->is_ipv6 && memcmp (mp->local, mp->remote, 16) == 0) || - (!mp->is_ipv6 && memcmp (mp->local, mp->remote, 4) == 0)) - { - rv = VNET_API_ERROR_SAME_SRC_DST; - goto out; - } - memset (a, 0, sizeof (*a)); - - a->is_add = mp->is_add; - a->is_ip6 = mp->is_ipv6; - /* ip addresses sent in network byte order */ - if (a->is_ip6) - { - clib_memcpy (&(a->local.ip6), mp->local, 16); - clib_memcpy (&(a->remote.ip6), mp->remote, 16); - } - else - { - clib_memcpy (&(a->local.ip4), mp->local, 4); - clib_memcpy (&(a->remote.ip4), mp->remote, 4); - } - a->encap_fib_index = encap_fib_index; - a->decap_fib_index = decap_fib_index; - a->protocol = protocol; - a->vni = ntohl (mp->vni); - rv = vnet_vxlan_gpe_add_del_tunnel (a, &sw_if_index); - -out: - /* *INDENT-OFF* */ - REPLY_MACRO2(VL_API_VXLAN_GPE_ADD_DEL_TUNNEL_REPLY, - ({ - rmp->sw_if_index = ntohl (sw_if_index); - })); - /* *INDENT-ON* */ -} - static void send_vxlan_gpe_tunnel_details (vxlan_gpe_tunnel_t * t, unix_shared_memory_queue_t * q, u32 context) { @@ -1696,43 +1522,6 @@ static void send_vxlan_gpe_tunnel_details vl_msg_api_send_shmem (q, (u8 *) & rmp); } -static void vl_api_vxlan_gpe_tunnel_dump_t_handler - (vl_api_vxlan_gpe_tunnel_dump_t * mp) -{ - unix_shared_memory_queue_t *q; - vxlan_gpe_main_t *vgm = &vxlan_gpe_main; - vxlan_gpe_tunnel_t *t; - u32 sw_if_index; - - q = vl_api_client_index_to_input_queue (mp->client_index); - if (q == 0) - { - return; - } - - sw_if_index = ntohl (mp->sw_if_index); - - if (~0 == sw_if_index) - { - /* *INDENT-OFF* */ - pool_foreach (t, vgm->tunnels, - ({ - send_vxlan_gpe_tunnel_details(t, q, mp->context); - })); - /* *INDENT-ON* */ - } - else - { - if ((sw_if_index >= vec_len (vgm->tunnel_index_by_sw_if_index)) || - (~0 == vgm->tunnel_index_by_sw_if_index[sw_if_index])) - { - return; - } - t = &vgm->tunnels[vgm->tunnel_index_by_sw_if_index[sw_if_index]]; - send_vxlan_gpe_tunnel_details (t, q, mp->context); - } -} - static void vl_api_interface_name_renumber_t_handler (vl_api_interface_name_renumber_t * mp) diff --git a/src/vpp/api/api_main.c b/src/vpp/api/api_main.c index 97b501e0..6ae510b1 100644 --- a/src/vpp/api/api_main.c +++ b/src/vpp/api/api_main.c @@ -48,7 +48,6 @@ api_main_init (vlib_main_t * vm) vam->vlib_main = vm; vam->my_client_index = (u32) ~ 0; init_error_string_table (vam); - vat_api_hookup (vam); rv = vat_plugin_init (vam); if (rv) clib_warning ("vat_plugin_init returned %d", rv); diff --git a/src/vpp/stats/stats.c b/src/vpp/stats/stats.c index 5e9b0d69..c46d441a 100644 --- a/src/vpp/stats/stats.c +++ b/src/vpp/stats/stats.c @@ -46,7 +46,6 @@ stats_main_t stats_main; #define foreach_stats_msg \ _(WANT_STATS, want_stats) \ -_(WANT_STATS_REPLY, want_stats_reply) \ _(VNET_INTERFACE_COUNTERS, vnet_interface_counters) \ _(VNET_IP4_FIB_COUNTERS, vnet_ip4_fib_counters) \ _(VNET_IP6_FIB_COUNTERS, vnet_ip6_fib_counters) \ @@ -1226,12 +1225,6 @@ vl_api_vnet_ip6_nbr_counters_t_handler (vl_api_vnet_ip6_nbr_counters_t * mp) } } -static void -vl_api_want_stats_reply_t_handler (vl_api_want_stats_reply_t * mp) -{ - clib_warning ("BUG"); -} - static void vl_api_want_stats_t_handler (vl_api_want_stats_t * mp) { -- cgit 1.2.3-korg From c8c44ebea155f15a9e3067fe05228a1e932185fc Mon Sep 17 00:00:00 2001 From: Klement Sekera Date: Thu, 2 Mar 2017 11:13:30 +0100 Subject: Add support for unix { coredump-size }. Use setrlimit to set the core size limit if the argument is passed to vpp. Change-Id: Ie76c082b2af81337310fcb1925af915a42067f15 Signed-off-by: Klement Sekera --- src/vlib/unix/main.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'src/vlib') diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index a04d9f9c..6b96cc0d 100644 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -46,6 +46,8 @@ #include #include #include +#include +#include /** Default CLI pager limit is not configured in startup.conf */ #define UNIX_CLI_DEFAULT_PAGER_LIMIT 100000 @@ -340,6 +342,26 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) else if (unformat (input, "cli-history-limit %d", &um->cli_history_limit)) ; + else if (unformat (input, "coredump-size")) + { + uword coredump_size = 0; + if (unformat (input, "unlimited")) + { + coredump_size = RLIM_INFINITY; + } + else + if (!unformat (input, "%U", unformat_memory_size, &coredump_size)) + { + return clib_error_return (0, + "invalid coredump-size parameter `%U'", + format_unformat_error, input); + } + const struct rlimit new_limit = { coredump_size, coredump_size }; + if (0 != setrlimit (RLIMIT_CORE, &new_limit)) + { + clib_unix_warning ("prlimit() failed"); + } + } else if (unformat (input, "full-coredump")) { int fd; -- cgit 1.2.3-korg From e5ef1d7a7b63fcbd43529f079137c0c990a8de2f Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 2 Mar 2017 12:33:48 +0100 Subject: vlib: add process restart cli Change-Id: I8b81e53ebea573b4edb17aca7e1c284f3984e399 Signed-off-by: Damjan Marion --- src/vlib/cli.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'src/vlib') diff --git a/src/vlib/cli.c b/src/vlib/cli.c index 2d141115..f853f655 100644 --- a/src/vlib/cli.c +++ b/src/vlib/cli.c @@ -39,6 +39,7 @@ #include #include +#include /* Root of all show commands. */ /* *INDENT-OFF* */ @@ -757,6 +758,25 @@ VLIB_CLI_COMMAND (cmd_test_heap_validate,static) = { }; /* *INDENT-ON* */ +static clib_error_t * +restart_cmd_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + char *newenviron[] = { NULL }; + + execve (vm->name, (char **) vm->argv, newenviron); + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (restart_cmd,static) = { + .path = "restart", + .short_help = "restart process", + .function = restart_cmd_fn, +}; +/* *INDENT-ON* */ + #ifdef TEST_CODE /* * A trivial test harness to verify the per-process output_function -- cgit 1.2.3-korg From 80f54e20270ed0628ee725e3e3c515731a0188f2 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Wed, 8 Mar 2017 19:08:56 -0500 Subject: vlib_mains == 0 special cases be gone Clean up spurious binary API client link dependency on libvlib.so, which managed to hide behind vlib_mains == 0 checks reached by VLIB_xxx_FUNCTION macros. Change-Id: I5df1f8ab07dca1944250e643ccf06e60a8462325 Signed-off-by: Dave Barach --- src/plugins/dpdk/ipsec/ipsec.c | 8 +- src/vlib-api.am | 4 +- src/vlib/buffer.c | 27 +- src/vlib/global_funcs.h | 2 +- src/vlib/node_cli.c | 28 +- src/vlib/node_funcs.h | 4 +- src/vlib/threads.c | 16 +- src/vlib/threads.h | 43 ++- src/vlibapi/api.h | 4 +- src/vlibapi/api_shared.c | 530 ++--------------------------------- src/vlibapi/node_serialize.c | 15 +- src/vlibmemory/memory_vlib.c | 471 +++++++++++++++++++++++++++++++ src/vnet/devices/virtio/vhost-user.c | 9 +- src/vpp-api-test.am | 2 - src/vpp/api/api.c | 1 - src/vpp/api/gmon.c | 9 +- 16 files changed, 575 insertions(+), 598 deletions(-) (limited to 'src/vlib') diff --git a/src/plugins/dpdk/ipsec/ipsec.c b/src/plugins/dpdk/ipsec/ipsec.c index 16bec20a..b0aaaaec 100644 --- a/src/plugins/dpdk/ipsec/ipsec.c +++ b/src/plugins/dpdk/ipsec/ipsec.c @@ -380,13 +380,9 @@ dpdk_ipsec_process (vlib_main_t * vm, vlib_node_runtime_t * rt, im->cb.check_support_cb = dpdk_ipsec_check_support; im->cb.add_del_sa_sess_cb = add_del_sa_sess; - if (vec_len (vlib_mains) == 0) - vlib_node_set_state (&vlib_global_main, dpdk_crypto_input_node.index, + for (i = 1; i < tm->n_vlib_mains; i++) + vlib_node_set_state (vlib_mains[i], dpdk_crypto_input_node.index, VLIB_NODE_STATE_POLLING); - else - for (i = 1; i < tm->n_vlib_mains; i++) - vlib_node_set_state (vlib_mains[i], dpdk_crypto_input_node.index, - VLIB_NODE_STATE_POLLING); /* TODO cryptodev counters */ diff --git a/src/vlib-api.am b/src/vlib-api.am index c05929b1..4e1dae99 100644 --- a/src/vlib-api.am +++ b/src/vlib-api.am @@ -14,7 +14,7 @@ lib_LTLIBRARIES += libvlibmemory.la libvlibapi.la libvlibmemoryclient.la \ libvlibsocket.la -libvlibmemory_la_DEPENDENCIES = libvppinfra.la libsvm.la libvlib.la +libvlibmemory_la_DEPENDENCIES = libvppinfra.la libsvm.la libvlibmemory_la_LIBADD = $(libvlibmemory_la_DEPENDENCIES) -lpthread libvlibmemory_la_SOURCES = \ vlibmemory/api.h \ @@ -26,7 +26,7 @@ libvlibmemory_la_SOURCES = \ vlibmemory/unix_shared_memory_queue.c \ vlibmemory/unix_shared_memory_queue.h -libvlibapi_la_DEPENDENCIES = libvppinfra.la libvlib.la libvlibmemory.la +libvlibapi_la_DEPENDENCIES = libvppinfra.la libvlibapi_la_LIBADD = $(libvlibapi_la_DEPENDENCIES) libvlibapi_la_SOURCES = \ vlibapi/api.h \ diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index 9f26bec7..6ba82584 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -261,7 +261,28 @@ done: return result; } -vlib_main_t **vlib_mains; +/* + * Hand-craft a static vector w/ length 1, so vec_len(vlib_mains) =1 + * and vlib_mains[0] = &vlib_global_main from the beginning of time. + * + * The only place which should ever expand vlib_mains is start_workers() + * in threads.c. It knows about the bootstrap vector. + */ +/* *INDENT-OFF* */ +static struct +{ + vec_header_t h; + vlib_main_t *vm; +} __attribute__ ((packed)) __bootstrap_vlib_main_vector + __attribute__ ((aligned (CLIB_CACHE_LINE_BYTES))) = +{ + .h.len = 1, + .vm = &vlib_global_main, +}; +/* *INDENT-ON* */ + +vlib_main_t **vlib_mains = &__bootstrap_vlib_main_vector.vm; + /* When dubugging validate that given buffers are either known allocated or known free. */ @@ -280,7 +301,7 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm, ASSERT (os_get_cpu_number () == 0); /* smp disaster check */ - if (vlib_mains) + if (vec_len (vlib_mains) > 1) ASSERT (vm == vlib_mains[0]); is_free = expected_state == VLIB_BUFFER_KNOWN_ALLOCATED; @@ -956,7 +977,7 @@ show_buffers (vlib_main_t * vm, do { - curr_vm = vec_len (vlib_mains) ? vlib_mains[vm_index] : vm; + curr_vm = vlib_mains[vm_index]; bm = curr_vm->buffer_main; /* *INDENT-OFF* */ diff --git a/src/vlib/global_funcs.h b/src/vlib/global_funcs.h index bbdbdef5..f51ec381 100644 --- a/src/vlib/global_funcs.h +++ b/src/vlib/global_funcs.h @@ -23,7 +23,7 @@ always_inline vlib_main_t * vlib_get_main (void) { vlib_main_t *vm; - vm = vlib_mains ? vlib_mains[os_get_cpu_number ()] : &vlib_global_main; + vm = vlib_mains[os_get_cpu_number ()]; ASSERT (vm); return vm; } diff --git a/src/vlib/node_cli.c b/src/vlib/node_cli.c index 05d0f0b5..62ab2e64 100644 --- a/src/vlib/node_cli.c +++ b/src/vlib/node_cli.c @@ -248,16 +248,11 @@ show_node_runtime (vlib_main_t * vm, if (unformat (input, "max") || unformat (input, "m")) max = 1; - if (vec_len (vlib_mains) == 0) - vec_add1 (stat_vms, vm); - else + for (i = 0; i < vec_len (vlib_mains); i++) { - for (i = 0; i < vec_len (vlib_mains); i++) - { - stat_vm = vlib_mains[i]; - if (stat_vm) - vec_add1 (stat_vms, stat_vm); - } + stat_vm = vlib_mains[i]; + if (stat_vm) + vec_add1 (stat_vms, stat_vm); } /* @@ -331,7 +326,7 @@ show_node_runtime (vlib_main_t * vm, } } - if (vec_len (vlib_mains)) + if (vec_len (vlib_mains) > 1) { vlib_worker_thread_t *w = vlib_worker_threads + j; if (j > 0) @@ -404,16 +399,11 @@ clear_node_runtime (vlib_main_t * vm, vlib_main_t **stat_vms = 0, *stat_vm; vlib_node_runtime_t *r; - if (vec_len (vlib_mains) == 0) - vec_add1 (stat_vms, vm); - else + for (i = 0; i < vec_len (vlib_mains); i++) { - for (i = 0; i < vec_len (vlib_mains); i++) - { - stat_vm = vlib_mains[i]; - if (stat_vm) - vec_add1 (stat_vms, stat_vm); - } + stat_vm = vlib_mains[i]; + if (stat_vm) + vec_add1 (stat_vms, stat_vm); } vlib_worker_thread_barrier_sync (vm); diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index f49a8d6f..8ccfc438 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -201,7 +201,7 @@ vlib_get_frame_no_check (vlib_main_t * vm, uword frame_index) vlib_frame_t *f; u32 cpu_index = frame_index & VLIB_CPU_MASK; u32 offset = frame_index & VLIB_OFFSET_MASK; - vm = vlib_mains ? vlib_mains[cpu_index] : vm; + vm = vlib_mains[cpu_index]; f = vm->heap_base + offset; return f; } @@ -213,7 +213,7 @@ vlib_frame_index_no_check (vlib_main_t * vm, vlib_frame_t * f) ASSERT (((uword) f & VLIB_CPU_MASK) == 0); - vm = vlib_mains ? vlib_mains[f->cpu_index] : vm; + vm = vlib_mains[f->cpu_index]; i = ((u8 *) f - (u8 *) vm->heap_base); return i | f->cpu_index; diff --git a/src/vlib/threads.c b/src/vlib/threads.c index e3ea3c9c..4676be97 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -570,9 +570,13 @@ start_workers (vlib_main_t * vm) if (n_vlib_mains > 1) { - vec_validate (vlib_mains, tm->n_vlib_mains - 1); + /* Replace hand-crafted length-1 vector with a real vector */ + vlib_mains = 0; + + vec_validate_aligned (vlib_mains, tm->n_vlib_mains - 1, + CLIB_CACHE_LINE_BYTES); _vec_len (vlib_mains) = 0; - vec_add1 (vlib_mains, vm); + vec_add1_aligned (vlib_mains, vm, CLIB_CACHE_LINE_BYTES); vlib_worker_threads->wait_at_barrier = clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES); @@ -685,7 +689,7 @@ start_workers (vlib_main_t * vm) /* Packet trace buffers are guaranteed to be empty, nothing to do here */ clib_mem_set_heap (oldheap); - vec_add1 (vlib_mains, vm_clone); + vec_add1_aligned (vlib_mains, vm_clone, CLIB_CACHE_LINE_BYTES); vm_clone->error_main.counters = vec_dup (vlib_mains[0]->error_main.counters); @@ -805,7 +809,7 @@ vlib_worker_thread_node_runtime_update (void) ASSERT (os_get_cpu_number () == 0); - if (vec_len (vlib_mains) == 0) + if (vec_len (vlib_mains) == 1) return; vm = vlib_mains[0]; @@ -1148,7 +1152,7 @@ vlib_worker_thread_barrier_sync (vlib_main_t * vm) f64 deadline; u32 count; - if (!vlib_mains) + if (vec_len (vlib_mains) < 2) return; count = vec_len (vlib_mains) - 1; @@ -1179,7 +1183,7 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm) { f64 deadline; - if (!vlib_mains) + if (vec_len (vlib_mains) < 2) return; if (--vlib_worker_threads[0].recursion_level > 0) diff --git a/src/vlib/threads.h b/src/vlib/threads.h index 75a5a281..a032311c 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -222,30 +222,25 @@ vlib_worker_thread_barrier_check (void) } } -#define foreach_vlib_main(body) \ -do { \ - vlib_main_t ** __vlib_mains = 0, *this_vlib_main; \ - int ii; \ - \ - if (vec_len (vlib_mains) == 0) \ - vec_add1 (__vlib_mains, &vlib_global_main); \ - else \ - { \ - for (ii = 0; ii < vec_len (vlib_mains); ii++) \ - { \ - this_vlib_main = vlib_mains[ii]; \ - if (this_vlib_main) \ - vec_add1 (__vlib_mains, this_vlib_main); \ - } \ - } \ - \ - for (ii = 0; ii < vec_len (__vlib_mains); ii++) \ - { \ - this_vlib_main = __vlib_mains[ii]; \ - /* body uses this_vlib_main... */ \ - (body); \ - } \ - vec_free (__vlib_mains); \ +#define foreach_vlib_main(body) \ +do { \ + vlib_main_t ** __vlib_mains = 0, *this_vlib_main; \ + int ii; \ + \ + for (ii = 0; ii < vec_len (vlib_mains); ii++) \ + { \ + this_vlib_main = vlib_mains[ii]; \ + if (this_vlib_main) \ + vec_add1 (__vlib_mains, this_vlib_main); \ + } \ + \ + for (ii = 0; ii < vec_len (__vlib_mains); ii++) \ + { \ + this_vlib_main = __vlib_mains[ii]; \ + /* body uses this_vlib_main... */ \ + (body); \ + } \ + vec_free (__vlib_mains); \ } while (0); #define foreach_sched_policy \ diff --git a/src/vlibapi/api.h b/src/vlibapi/api.h index 2cbeb63c..87a56121 100644 --- a/src/vlibapi/api.h +++ b/src/vlibapi/api.h @@ -252,11 +252,13 @@ void vl_msg_api_queue_handler (unix_shared_memory_queue_t * q); vl_api_trace_t *vl_msg_api_trace_get (api_main_t * am, vl_api_trace_which_t which); +void vl_msg_api_barrier_sync (void) __attribute__ ((weak)); +void vl_msg_api_barrier_release (void) __attribute__ ((weak)); void vl_msg_api_free (void *); void vl_noop_handler (void *mp); -clib_error_t *vl_api_init (vlib_main_t * vm); void vl_msg_api_increment_missing_client_counter (void); void vl_msg_api_post_mortem_dump (void); +void vl_msg_api_post_mortem_dump_enable_disable (int enable); void vl_msg_api_register_pd_handler (void *handler, u16 msg_id_host_byte_order); int vl_msg_api_pd_handler (void *mp, int rv); diff --git a/src/vlibapi/api_shared.c b/src/vlibapi/api_shared.c index 69ba10c1..6774e3dd 100644 --- a/src/vlibapi/api_shared.c +++ b/src/vlibapi/api_shared.c @@ -23,11 +23,6 @@ #include #include #include -#include -#include -#include -#include -#include #include #include #include @@ -36,19 +31,14 @@ #include #include -api_main_t api_main; - -void vl_msg_api_barrier_sync (void) __attribute__ ((weak)); -void -vl_msg_api_barrier_sync (void) -{ -} - -void vl_msg_api_barrier_release (void) __attribute__ ((weak)); -void -vl_msg_api_barrier_release (void) -{ -} +/* *INDENT-OFF* */ +api_main_t api_main = + { + .region_name = "/unset", + .api_uid = -1, + .api_gid = -1, + }; +/* *INDENT-ON* */ void vl_msg_api_increment_missing_client_counter (void) @@ -57,14 +47,6 @@ vl_msg_api_increment_missing_client_counter (void) am->missing_clients++; } -typedef enum -{ - DUMP, - CUSTOM_DUMP, - REPLAY, - INITIALIZERS, -} vl_api_replay_t; - int vl_msg_api_rx_trace_enabled (api_main_t * am) { @@ -397,6 +379,16 @@ vl_msg_api_trace_configure (api_main_t * am, vl_api_trace_which_t which, return 0; } +void +vl_msg_api_barrier_sync (void) +{ +} + +void +vl_msg_api_barrier_release (void) +{ +} + always_inline void msg_handler_internal (api_main_t * am, void *the_msg, int trace_it, int do_it, int free_it) @@ -748,495 +740,15 @@ vl_noop_handler (void *mp) { } -clib_error_t * -vl_api_init (vlib_main_t * vm) -{ - static u8 once; - api_main_t *am = &api_main; - - if (once) - return 0; - - once = 1; - - am->region_name = "/unset"; - /* - * Eventually passed to fchown, -1 => "current user" - * instead of 0 => "root". A very fine disctinction at best. - */ - if (am->api_uid == 0) - am->api_uid = -1; - if (am->api_gid == 0) - am->api_gid = -1; - - return (0); -} - -void vl_msg_api_custom_dump_configure (api_main_t * am) - __attribute__ ((weak)); -void -vl_msg_api_custom_dump_configure (api_main_t * am) -{ -} - -VLIB_INIT_FUNCTION (vl_api_init); - -static void -vl_msg_api_process_file (vlib_main_t * vm, u8 * filename, - u32 first_index, u32 last_index, - vl_api_replay_t which) -{ - vl_api_trace_file_header_t *hp; - int i, fd; - struct stat statb; - size_t file_size; - u8 *msg; - u8 endian_swap_needed = 0; - api_main_t *am = &api_main; - u8 *tmpbuf = 0; - u32 nitems; - void **saved_print_handlers = 0; - - fd = open ((char *) filename, O_RDONLY); - - if (fd < 0) - { - vlib_cli_output (vm, "Couldn't open %s\n", filename); - return; - } - - if (fstat (fd, &statb) < 0) - { - vlib_cli_output (vm, "Couldn't stat %s\n", filename); - close (fd); - return; - } - - if (!(statb.st_mode & S_IFREG) || (statb.st_size < sizeof (*hp))) - { - vlib_cli_output (vm, "File not plausible: %s\n", filename); - close (fd); - return; - } - - file_size = statb.st_size; - file_size = (file_size + 4095) & ~(4096); - - hp = mmap (0, file_size, PROT_READ, MAP_PRIVATE, fd, 0); - - if (hp == (vl_api_trace_file_header_t *) MAP_FAILED) - { - vlib_cli_output (vm, "mmap failed: %s\n", filename); - close (fd); - return; - } - close (fd); - - if ((clib_arch_is_little_endian && hp->endian == VL_API_BIG_ENDIAN) - || (clib_arch_is_big_endian && hp->endian == VL_API_LITTLE_ENDIAN)) - endian_swap_needed = 1; - - if (endian_swap_needed) - nitems = ntohl (hp->nitems); - else - nitems = hp->nitems; - - if (last_index == (u32) ~ 0) - { - last_index = nitems - 1; - } - - if (first_index >= nitems || last_index >= nitems) - { - vlib_cli_output (vm, "Range (%d, %d) outside file range (0, %d)\n", - first_index, last_index, nitems - 1); - munmap (hp, file_size); - return; - } - if (hp->wrapped) - vlib_cli_output (vm, - "Note: wrapped/incomplete trace, results may vary\n"); - - if (which == CUSTOM_DUMP) - { - saved_print_handlers = (void **) vec_dup (am->msg_print_handlers); - vl_msg_api_custom_dump_configure (am); - } - - - msg = (u8 *) (hp + 1); - - for (i = 0; i < first_index; i++) - { - trace_cfg_t *cfgp; - int size; - u16 msg_id; - - size = clib_host_to_net_u32 (*(u32 *) msg); - msg += sizeof (u32); - - if (clib_arch_is_little_endian) - msg_id = ntohs (*((u16 *) msg)); - else - msg_id = *((u16 *) msg); - - cfgp = am->api_trace_cfg + msg_id; - if (!cfgp) - { - vlib_cli_output (vm, "Ugh: msg id %d no trace config\n", msg_id); - munmap (hp, file_size); - return; - } - msg += size; - } - - if (which == REPLAY) - am->replay_in_progress = 1; - - for (; i <= last_index; i++) - { - trace_cfg_t *cfgp; - u16 *msg_idp; - u16 msg_id; - int size; - - if (which == DUMP) - vlib_cli_output (vm, "---------- trace %d -----------\n", i); - - size = clib_host_to_net_u32 (*(u32 *) msg); - msg += sizeof (u32); - - if (clib_arch_is_little_endian) - msg_id = ntohs (*((u16 *) msg)); - else - msg_id = *((u16 *) msg); - - cfgp = am->api_trace_cfg + msg_id; - if (!cfgp) - { - vlib_cli_output (vm, "Ugh: msg id %d no trace config\n", msg_id); - munmap (hp, file_size); - vec_free (tmpbuf); - am->replay_in_progress = 0; - return; - } - - /* Copy the buffer (from the read-only mmap'ed file) */ - vec_validate (tmpbuf, size - 1 + sizeof (uword)); - clib_memcpy (tmpbuf + sizeof (uword), msg, size); - memset (tmpbuf, 0xf, sizeof (uword)); - - /* - * Endian swap if needed. All msg data is supposed to be - * in network byte order. All msg handlers are supposed to - * know that. The generic message dumpers don't know that. - * One could fix apigen, I suppose. - */ - if ((which == DUMP && clib_arch_is_little_endian) || endian_swap_needed) - { - void (*endian_fp) (void *); - if (msg_id >= vec_len (am->msg_endian_handlers) - || (am->msg_endian_handlers[msg_id] == 0)) - { - vlib_cli_output (vm, "Ugh: msg id %d no endian swap\n", msg_id); - munmap (hp, file_size); - vec_free (tmpbuf); - am->replay_in_progress = 0; - return; - } - endian_fp = am->msg_endian_handlers[msg_id]; - (*endian_fp) (tmpbuf + sizeof (uword)); - } - - /* msg_id always in network byte order */ - if (clib_arch_is_little_endian) - { - msg_idp = (u16 *) (tmpbuf + sizeof (uword)); - *msg_idp = msg_id; - } - - switch (which) - { - case CUSTOM_DUMP: - case DUMP: - if (msg_id < vec_len (am->msg_print_handlers) && - am->msg_print_handlers[msg_id]) - { - u8 *(*print_fp) (void *, void *); - - print_fp = (void *) am->msg_print_handlers[msg_id]; - (*print_fp) (tmpbuf + sizeof (uword), vm); - } - else - { - vlib_cli_output (vm, "Skipping msg id %d: no print fcn\n", - msg_id); - break; - } - break; - - case INITIALIZERS: - if (msg_id < vec_len (am->msg_print_handlers) && - am->msg_print_handlers[msg_id]) - { - u8 *s; - int j; - u8 *(*print_fp) (void *, void *); - - print_fp = (void *) am->msg_print_handlers[msg_id]; - - vlib_cli_output (vm, "/*"); - - (*print_fp) (tmpbuf + sizeof (uword), vm); - vlib_cli_output (vm, "*/\n"); - - s = format (0, "static u8 * vl_api_%s_%d[%d] = {", - am->msg_names[msg_id], i, - am->api_trace_cfg[msg_id].size); - - for (j = 0; j < am->api_trace_cfg[msg_id].size; j++) - { - if ((j & 7) == 0) - s = format (s, "\n "); - s = format (s, "0x%02x,", tmpbuf[sizeof (uword) + j]); - } - s = format (s, "\n};\n%c", 0); - vlib_cli_output (vm, (char *) s); - vec_free (s); - } - break; - - case REPLAY: - if (msg_id < vec_len (am->msg_print_handlers) && - am->msg_print_handlers[msg_id] && cfgp->replay_enable) - { - void (*handler) (void *); - - handler = (void *) am->msg_handlers[msg_id]; - - if (!am->is_mp_safe[msg_id]) - vl_msg_api_barrier_sync (); - (*handler) (tmpbuf + sizeof (uword)); - if (!am->is_mp_safe[msg_id]) - vl_msg_api_barrier_release (); - } - else - { - if (cfgp->replay_enable) - vlib_cli_output (vm, "Skipping msg id %d: no handler\n", - msg_id); - break; - } - break; - } - - _vec_len (tmpbuf) = 0; - msg += size; - } - - if (saved_print_handlers) - { - clib_memcpy (am->msg_print_handlers, saved_print_handlers, - vec_len (am->msg_print_handlers) * sizeof (void *)); - vec_free (saved_print_handlers); - } - - munmap (hp, file_size); - vec_free (tmpbuf); - am->replay_in_progress = 0; -} - -u8 * -format_vl_msg_api_trace_status (u8 * s, va_list * args) -{ - api_main_t *am = va_arg (*args, api_main_t *); - vl_api_trace_which_t which = va_arg (*args, vl_api_trace_which_t); - vl_api_trace_t *tp; - char *trace_name; - - switch (which) - { - case VL_API_TRACE_TX: - tp = am->tx_trace; - trace_name = "TX trace"; - break; - - case VL_API_TRACE_RX: - tp = am->rx_trace; - trace_name = "RX trace"; - break; - - default: - abort (); - } - - if (tp == 0) - { - s = format (s, "%s: not yet configured.\n", trace_name); - return s; - } - - s = format (s, "%s: used %d of %d items, %s enabled, %s wrapped\n", - trace_name, vec_len (tp->traces), tp->nitems, - tp->enabled ? "is" : "is not", tp->wrapped ? "has" : "has not"); - return s; -} static u8 post_mortem_dump_enabled; -static clib_error_t * -api_trace_command_fn (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - u32 nitems = 256 << 10; - api_main_t *am = &api_main; - vl_api_trace_which_t which = VL_API_TRACE_RX; - u8 *filename; - u32 first = 0; - u32 last = (u32) ~ 0; - FILE *fp; - int rv; - - while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) - { - if (unformat (input, "on") || unformat (input, "enable")) - { - if (unformat (input, "nitems %d", &nitems)) - ; - vl_msg_api_trace_configure (am, which, nitems); - vl_msg_api_trace_onoff (am, which, 1 /* on */ ); - } - else if (unformat (input, "off")) - { - vl_msg_api_trace_onoff (am, which, 0); - } - else if (unformat (input, "save %s", &filename)) - { - u8 *chroot_filename; - if (strstr ((char *) filename, "..") - || index ((char *) filename, '/')) - { - vlib_cli_output (vm, "illegal characters in filename '%s'", - filename); - return 0; - } - - chroot_filename = format (0, "/tmp/%s%c", filename, 0); - - vec_free (filename); - - fp = fopen ((char *) chroot_filename, "w"); - if (fp == NULL) - { - vlib_cli_output (vm, "Couldn't create %s\n", chroot_filename); - return 0; - } - rv = vl_msg_api_trace_save (am, which, fp); - fclose (fp); - if (rv == -1) - vlib_cli_output (vm, "API Trace data not present\n"); - else if (rv == -2) - vlib_cli_output (vm, "File for writing is closed\n"); - else if (rv == -10) - vlib_cli_output (vm, "Error while writing header to file\n"); - else if (rv == -11) - vlib_cli_output (vm, "Error while writing trace to file\n"); - else if (rv == -12) - vlib_cli_output (vm, - "Error while writing end of buffer trace to file\n"); - else if (rv == -13) - vlib_cli_output (vm, - "Error while writing start of buffer trace to file\n"); - else if (rv < 0) - vlib_cli_output (vm, "Unkown error while saving: %d", rv); - else - vlib_cli_output (vm, "API trace saved to %s\n", chroot_filename); - vec_free (chroot_filename); - } - else if (unformat (input, "dump %s", &filename)) - { - vl_msg_api_process_file (vm, filename, first, last, DUMP); - } - else if (unformat (input, "custom-dump %s", &filename)) - { - vl_msg_api_process_file (vm, filename, first, last, CUSTOM_DUMP); - } - else if (unformat (input, "replay %s", &filename)) - { - vl_msg_api_process_file (vm, filename, first, last, REPLAY); - } - else if (unformat (input, "initializers %s", &filename)) - { - vl_msg_api_process_file (vm, filename, first, last, INITIALIZERS); - } - else if (unformat (input, "tx")) - { - which = VL_API_TRACE_TX; - } - else if (unformat (input, "first %d", &first)) - { - ; - } - else if (unformat (input, "last %d", &last)) - { - ; - } - else if (unformat (input, "status")) - { - vlib_cli_output (vm, "%U", format_vl_msg_api_trace_status, - am, which); - } - else if (unformat (input, "free")) - { - vl_msg_api_trace_onoff (am, which, 0); - vl_msg_api_trace_free (am, which); - } - else if (unformat (input, "post-mortem-on")) - post_mortem_dump_enabled = 1; - else if (unformat (input, "post-mortem-off")) - post_mortem_dump_enabled = 0; - else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); - } - return 0; -} - -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (api_trace_command, static) = { - .path = "api trace", - .short_help = - "api trace [on|off][dump|save|replay ][status][free][post-mortem-on]", - .function = api_trace_command_fn, -}; -/* *INDENT-ON* */ - -static clib_error_t * -api_config_fn (vlib_main_t * vm, unformat_input_t * input) +void +vl_msg_api_post_mortem_dump_enable_disable (int enable) { - u32 nitems = 256 << 10; - vl_api_trace_which_t which = VL_API_TRACE_RX; - api_main_t *am = &api_main; - - while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) - { - if (unformat (input, "on") || unformat (input, "enable")) - { - if (unformat (input, "nitems %d", &nitems)) - ; - vl_msg_api_trace_configure (am, which, nitems); - vl_msg_api_trace_onoff (am, which, 1 /* on */ ); - post_mortem_dump_enabled = 1; - } - else - return clib_error_return (0, "unknown input `%U'", - format_unformat_error, input); - } - return 0; + post_mortem_dump_enabled = enable; } -VLIB_CONFIG_FUNCTION (api_config_fn, "api-trace"); - void vl_msg_api_post_mortem_dump (void) { diff --git a/src/vlibapi/node_serialize.c b/src/vlibapi/node_serialize.c index 4dc1a7d2..50e5c41c 100644 --- a/src/vlibapi/node_serialize.c +++ b/src/vlibapi/node_serialize.c @@ -73,16 +73,11 @@ vlib_node_serialize (vlib_node_main_t * nm, u8 * vector, if (vec_len (stat_vms) == 0) { - if (vec_len (vlib_mains) == 0) - vec_add1 (stat_vms, vm); - else + for (i = 0; i < vec_len (vlib_mains); i++) { - for (i = 0; i < vec_len (vlib_mains); i++) - { - stat_vm = vlib_mains[i]; - if (stat_vm) - vec_add1 (stat_vms, stat_vm); - } + stat_vm = vlib_mains[i]; + if (stat_vm) + vec_add1 (stat_vms, stat_vm); } } @@ -286,7 +281,7 @@ vlib_node_unserialize (u8 * vector) return nodes_by_thread; } -#if CLIB_DEBUG > 0 +#if TEST_CODE static clib_error_t * test_node_serialize_command_fn (vlib_main_t * vm, diff --git a/src/vlibmemory/memory_vlib.c b/src/vlibmemory/memory_vlib.c index 3a7415c0..d2e05968 100644 --- a/src/vlibmemory/memory_vlib.c +++ b/src/vlibmemory/memory_vlib.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include #include @@ -1437,6 +1439,475 @@ rpc_api_hookup (vlib_main_t * vm) VLIB_API_INIT_FUNCTION (rpc_api_hookup); +typedef enum +{ + DUMP, + CUSTOM_DUMP, + REPLAY, + INITIALIZERS, +} vl_api_replay_t; + +u8 * +format_vl_msg_api_trace_status (u8 * s, va_list * args) +{ + api_main_t *am = va_arg (*args, api_main_t *); + vl_api_trace_which_t which = va_arg (*args, vl_api_trace_which_t); + vl_api_trace_t *tp; + char *trace_name; + + switch (which) + { + case VL_API_TRACE_TX: + tp = am->tx_trace; + trace_name = "TX trace"; + break; + + case VL_API_TRACE_RX: + tp = am->rx_trace; + trace_name = "RX trace"; + break; + + default: + abort (); + } + + if (tp == 0) + { + s = format (s, "%s: not yet configured.\n", trace_name); + return s; + } + + s = format (s, "%s: used %d of %d items, %s enabled, %s wrapped\n", + trace_name, vec_len (tp->traces), tp->nitems, + tp->enabled ? "is" : "is not", tp->wrapped ? "has" : "has not"); + return s; +} + +void vl_msg_api_custom_dump_configure (api_main_t * am) + __attribute__ ((weak)); +void +vl_msg_api_custom_dump_configure (api_main_t * am) +{ +} + +static void +vl_msg_api_process_file (vlib_main_t * vm, u8 * filename, + u32 first_index, u32 last_index, + vl_api_replay_t which) +{ + vl_api_trace_file_header_t *hp; + int i, fd; + struct stat statb; + size_t file_size; + u8 *msg; + u8 endian_swap_needed = 0; + api_main_t *am = &api_main; + u8 *tmpbuf = 0; + u32 nitems; + void **saved_print_handlers = 0; + + fd = open ((char *) filename, O_RDONLY); + + if (fd < 0) + { + vlib_cli_output (vm, "Couldn't open %s\n", filename); + return; + } + + if (fstat (fd, &statb) < 0) + { + vlib_cli_output (vm, "Couldn't stat %s\n", filename); + close (fd); + return; + } + + if (!(statb.st_mode & S_IFREG) || (statb.st_size < sizeof (*hp))) + { + vlib_cli_output (vm, "File not plausible: %s\n", filename); + close (fd); + return; + } + + file_size = statb.st_size; + file_size = (file_size + 4095) & ~(4096); + + hp = mmap (0, file_size, PROT_READ, MAP_PRIVATE, fd, 0); + + if (hp == (vl_api_trace_file_header_t *) MAP_FAILED) + { + vlib_cli_output (vm, "mmap failed: %s\n", filename); + close (fd); + return; + } + close (fd); + + if ((clib_arch_is_little_endian && hp->endian == VL_API_BIG_ENDIAN) + || (clib_arch_is_big_endian && hp->endian == VL_API_LITTLE_ENDIAN)) + endian_swap_needed = 1; + + if (endian_swap_needed) + nitems = ntohl (hp->nitems); + else + nitems = hp->nitems; + + if (last_index == (u32) ~ 0) + { + last_index = nitems - 1; + } + + if (first_index >= nitems || last_index >= nitems) + { + vlib_cli_output (vm, "Range (%d, %d) outside file range (0, %d)\n", + first_index, last_index, nitems - 1); + munmap (hp, file_size); + return; + } + if (hp->wrapped) + vlib_cli_output (vm, + "Note: wrapped/incomplete trace, results may vary\n"); + + if (which == CUSTOM_DUMP) + { + saved_print_handlers = (void **) vec_dup (am->msg_print_handlers); + vl_msg_api_custom_dump_configure (am); + } + + + msg = (u8 *) (hp + 1); + + for (i = 0; i < first_index; i++) + { + trace_cfg_t *cfgp; + int size; + u16 msg_id; + + size = clib_host_to_net_u32 (*(u32 *) msg); + msg += sizeof (u32); + + if (clib_arch_is_little_endian) + msg_id = ntohs (*((u16 *) msg)); + else + msg_id = *((u16 *) msg); + + cfgp = am->api_trace_cfg + msg_id; + if (!cfgp) + { + vlib_cli_output (vm, "Ugh: msg id %d no trace config\n", msg_id); + munmap (hp, file_size); + return; + } + msg += size; + } + + if (which == REPLAY) + am->replay_in_progress = 1; + + for (; i <= last_index; i++) + { + trace_cfg_t *cfgp; + u16 *msg_idp; + u16 msg_id; + int size; + + if (which == DUMP) + vlib_cli_output (vm, "---------- trace %d -----------\n", i); + + size = clib_host_to_net_u32 (*(u32 *) msg); + msg += sizeof (u32); + + if (clib_arch_is_little_endian) + msg_id = ntohs (*((u16 *) msg)); + else + msg_id = *((u16 *) msg); + + cfgp = am->api_trace_cfg + msg_id; + if (!cfgp) + { + vlib_cli_output (vm, "Ugh: msg id %d no trace config\n", msg_id); + munmap (hp, file_size); + vec_free (tmpbuf); + am->replay_in_progress = 0; + return; + } + + /* Copy the buffer (from the read-only mmap'ed file) */ + vec_validate (tmpbuf, size - 1 + sizeof (uword)); + clib_memcpy (tmpbuf + sizeof (uword), msg, size); + memset (tmpbuf, 0xf, sizeof (uword)); + + /* + * Endian swap if needed. All msg data is supposed to be + * in network byte order. All msg handlers are supposed to + * know that. The generic message dumpers don't know that. + * One could fix apigen, I suppose. + */ + if ((which == DUMP && clib_arch_is_little_endian) || endian_swap_needed) + { + void (*endian_fp) (void *); + if (msg_id >= vec_len (am->msg_endian_handlers) + || (am->msg_endian_handlers[msg_id] == 0)) + { + vlib_cli_output (vm, "Ugh: msg id %d no endian swap\n", msg_id); + munmap (hp, file_size); + vec_free (tmpbuf); + am->replay_in_progress = 0; + return; + } + endian_fp = am->msg_endian_handlers[msg_id]; + (*endian_fp) (tmpbuf + sizeof (uword)); + } + + /* msg_id always in network byte order */ + if (clib_arch_is_little_endian) + { + msg_idp = (u16 *) (tmpbuf + sizeof (uword)); + *msg_idp = msg_id; + } + + switch (which) + { + case CUSTOM_DUMP: + case DUMP: + if (msg_id < vec_len (am->msg_print_handlers) && + am->msg_print_handlers[msg_id]) + { + u8 *(*print_fp) (void *, void *); + + print_fp = (void *) am->msg_print_handlers[msg_id]; + (*print_fp) (tmpbuf + sizeof (uword), vm); + } + else + { + vlib_cli_output (vm, "Skipping msg id %d: no print fcn\n", + msg_id); + break; + } + break; + + case INITIALIZERS: + if (msg_id < vec_len (am->msg_print_handlers) && + am->msg_print_handlers[msg_id]) + { + u8 *s; + int j; + u8 *(*print_fp) (void *, void *); + + print_fp = (void *) am->msg_print_handlers[msg_id]; + + vlib_cli_output (vm, "/*"); + + (*print_fp) (tmpbuf + sizeof (uword), vm); + vlib_cli_output (vm, "*/\n"); + + s = format (0, "static u8 * vl_api_%s_%d[%d] = {", + am->msg_names[msg_id], i, + am->api_trace_cfg[msg_id].size); + + for (j = 0; j < am->api_trace_cfg[msg_id].size; j++) + { + if ((j & 7) == 0) + s = format (s, "\n "); + s = format (s, "0x%02x,", tmpbuf[sizeof (uword) + j]); + } + s = format (s, "\n};\n%c", 0); + vlib_cli_output (vm, (char *) s); + vec_free (s); + } + break; + + case REPLAY: + if (msg_id < vec_len (am->msg_print_handlers) && + am->msg_print_handlers[msg_id] && cfgp->replay_enable) + { + void (*handler) (void *); + + handler = (void *) am->msg_handlers[msg_id]; + + if (!am->is_mp_safe[msg_id]) + vl_msg_api_barrier_sync (); + (*handler) (tmpbuf + sizeof (uword)); + if (!am->is_mp_safe[msg_id]) + vl_msg_api_barrier_release (); + } + else + { + if (cfgp->replay_enable) + vlib_cli_output (vm, "Skipping msg id %d: no handler\n", + msg_id); + break; + } + break; + } + + _vec_len (tmpbuf) = 0; + msg += size; + } + + if (saved_print_handlers) + { + clib_memcpy (am->msg_print_handlers, saved_print_handlers, + vec_len (am->msg_print_handlers) * sizeof (void *)); + vec_free (saved_print_handlers); + } + + munmap (hp, file_size); + vec_free (tmpbuf); + am->replay_in_progress = 0; +} + +static clib_error_t * +api_trace_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + u32 nitems = 256 << 10; + api_main_t *am = &api_main; + vl_api_trace_which_t which = VL_API_TRACE_RX; + u8 *filename; + u32 first = 0; + u32 last = (u32) ~ 0; + FILE *fp; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "on") || unformat (input, "enable")) + { + if (unformat (input, "nitems %d", &nitems)) + ; + vl_msg_api_trace_configure (am, which, nitems); + vl_msg_api_trace_onoff (am, which, 1 /* on */ ); + } + else if (unformat (input, "off")) + { + vl_msg_api_trace_onoff (am, which, 0); + } + else if (unformat (input, "save %s", &filename)) + { + u8 *chroot_filename; + if (strstr ((char *) filename, "..") + || index ((char *) filename, '/')) + { + vlib_cli_output (vm, "illegal characters in filename '%s'", + filename); + return 0; + } + + chroot_filename = format (0, "/tmp/%s%c", filename, 0); + + vec_free (filename); + + fp = fopen ((char *) chroot_filename, "w"); + if (fp == NULL) + { + vlib_cli_output (vm, "Couldn't create %s\n", chroot_filename); + return 0; + } + rv = vl_msg_api_trace_save (am, which, fp); + fclose (fp); + if (rv == -1) + vlib_cli_output (vm, "API Trace data not present\n"); + else if (rv == -2) + vlib_cli_output (vm, "File for writing is closed\n"); + else if (rv == -10) + vlib_cli_output (vm, "Error while writing header to file\n"); + else if (rv == -11) + vlib_cli_output (vm, "Error while writing trace to file\n"); + else if (rv == -12) + vlib_cli_output (vm, + "Error while writing end of buffer trace to file\n"); + else if (rv == -13) + vlib_cli_output (vm, + "Error while writing start of buffer trace to file\n"); + else if (rv < 0) + vlib_cli_output (vm, "Unkown error while saving: %d", rv); + else + vlib_cli_output (vm, "API trace saved to %s\n", chroot_filename); + vec_free (chroot_filename); + } + else if (unformat (input, "dump %s", &filename)) + { + vl_msg_api_process_file (vm, filename, first, last, DUMP); + } + else if (unformat (input, "custom-dump %s", &filename)) + { + vl_msg_api_process_file (vm, filename, first, last, CUSTOM_DUMP); + } + else if (unformat (input, "replay %s", &filename)) + { + vl_msg_api_process_file (vm, filename, first, last, REPLAY); + } + else if (unformat (input, "initializers %s", &filename)) + { + vl_msg_api_process_file (vm, filename, first, last, INITIALIZERS); + } + else if (unformat (input, "tx")) + { + which = VL_API_TRACE_TX; + } + else if (unformat (input, "first %d", &first)) + { + ; + } + else if (unformat (input, "last %d", &last)) + { + ; + } + else if (unformat (input, "status")) + { + vlib_cli_output (vm, "%U", format_vl_msg_api_trace_status, + am, which); + } + else if (unformat (input, "free")) + { + vl_msg_api_trace_onoff (am, which, 0); + vl_msg_api_trace_free (am, which); + } + else if (unformat (input, "post-mortem-on")) + vl_msg_api_post_mortem_dump_enable_disable (1 /* enable */ ); + else if (unformat (input, "post-mortem-off")) + vl_msg_api_post_mortem_dump_enable_disable (0 /* enable */ ); + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (api_trace_command, static) = { + .path = "api trace", + .short_help = + "api trace [on|off][dump|save|replay ][status][free][post-mortem-on]", + .function = api_trace_command_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +api_config_fn (vlib_main_t * vm, unformat_input_t * input) +{ + u32 nitems = 256 << 10; + vl_api_trace_which_t which = VL_API_TRACE_RX; + api_main_t *am = &api_main; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "on") || unformat (input, "enable")) + { + if (unformat (input, "nitems %d", &nitems)) + ; + vl_msg_api_trace_configure (am, which, nitems); + vl_msg_api_trace_onoff (am, which, 1 /* on */ ); + vl_msg_api_post_mortem_dump_enable_disable (1 /* enable */ ); + } + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + return 0; +} + +VLIB_CONFIG_FUNCTION (api_config_fn, "api-trace"); + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c index b6b4c04a..100ec613 100644 --- a/src/vnet/devices/virtio/vhost-user.c +++ b/src/vnet/devices/virtio/vhost-user.c @@ -374,8 +374,7 @@ vhost_user_rx_thread_placement () for (i = vum->input_cpu_first_index; i < vum->input_cpu_first_index + vum->input_cpu_count; i++) { - vlib_node_set_state (vlib_mains ? vlib_mains[i] : &vlib_global_main, - vhost_user_input_node.index, + vlib_node_set_state (vlib_mains[i], vhost_user_input_node.index, VLIB_NODE_STATE_DISABLED); vec_add1 (workers, i); } @@ -406,9 +405,9 @@ vhost_user_rx_thread_placement () iaq.qid = qid; iaq.vhost_iface_index = vui - vum->vhost_user_interfaces; vec_add1 (vhc->rx_queues, iaq); - vlib_node_set_state (vlib_mains ? vlib_mains[cpu_index] : - &vlib_global_main, vhost_user_input_node.index, - VLIB_NODE_STATE_POLLING); + vlib_node_set_state (vlib_mains[cpu_index], + vhost_user_input_node.index, + VLIB_NODE_STATE_POLLING); } }); /* *INDENT-ON* */ diff --git a/src/vpp-api-test.am b/src/vpp-api-test.am index f0d5df62..ceab687c 100644 --- a/src/vpp-api-test.am +++ b/src/vpp-api-test.am @@ -34,14 +34,12 @@ vpp_json_test_SOURCES = \ vat/json_test.c vpp_api_test_LDADD = \ - libvlib.la \ libvlibmemoryclient.la \ libsvm.la \ libvatplugin.la \ libvppinfra.la \ libvlibapi.la \ libvlibmemory.la \ - libvnet.la \ -lpthread -lm -lrt -ldl -lcrypto vpp_api_test_LDFLAGS = -Wl,--export-dynamic diff --git a/src/vpp/api/api.c b/src/vpp/api/api.c index 828394ed..c85dc680 100644 --- a/src/vpp/api/api.c +++ b/src/vpp/api/api.c @@ -2143,7 +2143,6 @@ vpe_api_init (vlib_main_t * vm) am->oam_events_registration_hash = hash_create (0, sizeof (uword)); am->bfd_events_registration_hash = hash_create (0, sizeof (uword)); - vl_api_init (vm); vl_set_memory_region_name ("/vpe-api"); vl_enable_disable_memory_api (vm, 1 /* enable it */ ); diff --git a/src/vpp/api/gmon.c b/src/vpp/api/gmon.c index 610f40ed..277be8c0 100644 --- a/src/vpp/api/gmon.c +++ b/src/vpp/api/gmon.c @@ -122,13 +122,8 @@ gmon_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) /* Initial wait for the world to settle down */ vlib_process_suspend (vm, 5.0); - if (vec_len (vlib_mains) == 0) - vec_add1 (gm->my_vlib_mains, &vlib_global_main); - else - { - for (i = 0; i < vec_len (vlib_mains); i++) - vec_add1 (gm->my_vlib_mains, vlib_mains[i]); - } + for (i = 0; i < vec_len (vlib_mains); i++) + vec_add1 (gm->my_vlib_mains, vlib_mains[i]); while (1) { -- cgit 1.2.3-korg From e9d52d54361296af520e1ece0c25307a2d86c018 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 9 Mar 2017 15:42:26 +0100 Subject: vlib: deduplicatee code in main and worker main loop Change-Id: Id18d59c9442602633a6310b2001a95bce8b6b232 Signed-off-by: Damjan Marion --- src/vlib/main.c | 176 +++++++++++++++++++++++++++++++++-------------------- src/vlib/main.h | 2 + src/vlib/threads.c | 76 +---------------------- src/vlib/threads.h | 4 +- 4 files changed, 116 insertions(+), 142 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/main.c b/src/vlib/main.c index 09f34bbd..91760706 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -1398,51 +1398,75 @@ dispatch_suspended_process (vlib_main_t * vm, return t; } -static void -vlib_main_loop (vlib_main_t * vm) +static_always_inline void +vlib_main_or_worker_loop (vlib_main_t * vm, int is_main) { vlib_node_main_t *nm = &vm->node_main; + vlib_thread_main_t *tm = vlib_get_thread_main (); uword i; u64 cpu_time_now; + vlib_frame_queue_main_t *fqm; /* Initialize pending node vector. */ - vec_resize (nm->pending_frames, 32); - _vec_len (nm->pending_frames) = 0; + if (is_main) + { + vec_resize (nm->pending_frames, 32); + _vec_len (nm->pending_frames) = 0; + } /* Mark time of main loop start. */ - cpu_time_now = vm->clib_time.last_cpu_time; - vm->cpu_time_main_loop_start = cpu_time_now; + if (is_main) + { + cpu_time_now = vm->clib_time.last_cpu_time; + vm->cpu_time_main_loop_start = cpu_time_now; + } + else + cpu_time_now = clib_cpu_time_now (); /* Arrange for first level of timing wheel to cover times we care most about. */ - nm->timing_wheel.min_sched_time = 10e-6; - nm->timing_wheel.max_sched_time = 10e-3; - timing_wheel_init (&nm->timing_wheel, - cpu_time_now, vm->clib_time.clocks_per_second); + if (is_main) + { + nm->timing_wheel.min_sched_time = 10e-6; + nm->timing_wheel.max_sched_time = 10e-3; + timing_wheel_init (&nm->timing_wheel, + cpu_time_now, vm->clib_time.clocks_per_second); + vec_alloc (nm->data_from_advancing_timing_wheel, 32); + } /* Pre-allocate expired nodes. */ - vec_alloc (nm->data_from_advancing_timing_wheel, 32); vec_alloc (nm->pending_interrupt_node_runtime_indices, 32); - if (!nm->polling_threshold_vector_length) - nm->polling_threshold_vector_length = 10; - if (!nm->interrupt_threshold_vector_length) - nm->interrupt_threshold_vector_length = 5; + if (is_main) + { + if (!nm->polling_threshold_vector_length) + nm->polling_threshold_vector_length = 10; + if (!nm->interrupt_threshold_vector_length) + nm->interrupt_threshold_vector_length = 5; - nm->current_process_index = ~0; + nm->current_process_index = ~0; + } /* Start all processes. */ - { - uword i; - for (i = 0; i < vec_len (nm->processes); i++) - cpu_time_now = - dispatch_process (vm, nm->processes[i], /* frame */ 0, cpu_time_now); - } + if (is_main) + { + uword i; + for (i = 0; i < vec_len (nm->processes); i++) + cpu_time_now = dispatch_process (vm, nm->processes[i], /* frame */ 0, + cpu_time_now); + } while (1) { vlib_node_runtime_t *n; + if (!is_main) + { + vlib_worker_thread_barrier_check (); + vec_foreach (fqm, tm->frame_queue_mains) + vlib_frame_queue_dequeue (vm, fqm); + } + /* Process pre-input nodes. */ vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_PRE_INPUT]) cpu_time_now = dispatch_node (vm, n, @@ -1459,7 +1483,7 @@ vlib_main_loop (vlib_main_t * vm) /* frame */ 0, cpu_time_now); - if (PREDICT_TRUE (vm->queue_signal_pending == 0)) + if (PREDICT_TRUE (is_main && vm->queue_signal_pending == 0)) vm->queue_signal_callback (vm); /* Next handle interrupts. */ @@ -1484,58 +1508,64 @@ vlib_main_loop (vlib_main_t * vm) } } - /* Check if process nodes have expired from timing wheel. */ - nm->data_from_advancing_timing_wheel - = timing_wheel_advance (&nm->timing_wheel, cpu_time_now, - nm->data_from_advancing_timing_wheel, - &nm->cpu_time_next_process_ready); - - ASSERT (nm->data_from_advancing_timing_wheel != 0); - if (PREDICT_FALSE (_vec_len (nm->data_from_advancing_timing_wheel) > 0)) + if (is_main) { - uword i; - - processes_timing_wheel_data: - for (i = 0; i < _vec_len (nm->data_from_advancing_timing_wheel); - i++) + /* Check if process nodes have expired from timing wheel. */ + nm->data_from_advancing_timing_wheel + = timing_wheel_advance (&nm->timing_wheel, cpu_time_now, + nm->data_from_advancing_timing_wheel, + &nm->cpu_time_next_process_ready); + + ASSERT (nm->data_from_advancing_timing_wheel != 0); + if (PREDICT_FALSE + (_vec_len (nm->data_from_advancing_timing_wheel) > 0)) { - u32 d = nm->data_from_advancing_timing_wheel[i]; - u32 di = vlib_timing_wheel_data_get_index (d); + uword i; - if (vlib_timing_wheel_data_is_timed_event (d)) + processes_timing_wheel_data: + for (i = 0; i < _vec_len (nm->data_from_advancing_timing_wheel); + i++) { - vlib_signal_timed_event_data_t *te = - pool_elt_at_index (nm->signal_timed_event_data_pool, di); - vlib_node_t *n = vlib_get_node (vm, te->process_node_index); - vlib_process_t *p = - vec_elt (nm->processes, n->runtime_index); - void *data; - data = - vlib_process_signal_event_helper (nm, n, p, - te->event_type_index, - te->n_data_elts, - te->n_data_elt_bytes); - if (te->n_data_bytes < sizeof (te->inline_event_data)) - clib_memcpy (data, te->inline_event_data, - te->n_data_bytes); + u32 d = nm->data_from_advancing_timing_wheel[i]; + u32 di = vlib_timing_wheel_data_get_index (d); + + if (vlib_timing_wheel_data_is_timed_event (d)) + { + vlib_signal_timed_event_data_t *te = + pool_elt_at_index (nm->signal_timed_event_data_pool, + di); + vlib_node_t *n = + vlib_get_node (vm, te->process_node_index); + vlib_process_t *p = + vec_elt (nm->processes, n->runtime_index); + void *data; + data = + vlib_process_signal_event_helper (nm, n, p, + te->event_type_index, + te->n_data_elts, + te->n_data_elt_bytes); + if (te->n_data_bytes < sizeof (te->inline_event_data)) + clib_memcpy (data, te->inline_event_data, + te->n_data_bytes); + else + { + clib_memcpy (data, te->event_data_as_vector, + te->n_data_bytes); + vec_free (te->event_data_as_vector); + } + pool_put (nm->signal_timed_event_data_pool, te); + } else { - clib_memcpy (data, te->event_data_as_vector, - te->n_data_bytes); - vec_free (te->event_data_as_vector); + cpu_time_now = clib_cpu_time_now (); + cpu_time_now = + dispatch_suspended_process (vm, di, cpu_time_now); } - pool_put (nm->signal_timed_event_data_pool, te); } - else - { - cpu_time_now = clib_cpu_time_now (); - cpu_time_now = - dispatch_suspended_process (vm, di, cpu_time_now); - } - } - /* Reset vector. */ - _vec_len (nm->data_from_advancing_timing_wheel) = 0; + /* Reset vector. */ + _vec_len (nm->data_from_advancing_timing_wheel) = 0; + } } /* Input nodes may have added work to the pending vector. @@ -1548,7 +1578,7 @@ vlib_main_loop (vlib_main_t * vm) _vec_len (nm->pending_frames) = 0; /* Pending internal nodes may resume processes. */ - if (_vec_len (nm->data_from_advancing_timing_wheel) > 0) + if (is_main && _vec_len (nm->data_from_advancing_timing_wheel) > 0) goto processes_timing_wheel_data; vlib_increment_main_loop_counter (vm); @@ -1559,6 +1589,18 @@ vlib_main_loop (vlib_main_t * vm) } } +static void +vlib_main_loop (vlib_main_t * vm) +{ + vlib_main_or_worker_loop (vm, /* is_main */ 1); +} + +void +vlib_worker_loop (vlib_main_t * vm) +{ + vlib_main_or_worker_loop (vm, /* is_main */ 0); +} + vlib_main_t vlib_global_main; static clib_error_t * diff --git a/src/vlib/main.h b/src/vlib/main.h index d9ac1445..a6d50b39 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -178,6 +178,8 @@ typedef struct vlib_main_t /* Global main structure. */ extern vlib_main_t vlib_global_main; +void vlib_worker_loop (vlib_main_t * vm); + always_inline f64 vlib_time_now (vlib_main_t * vm) { diff --git a/src/vlib/threads.c b/src/vlib/threads.c index 4676be97..07dbff33 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -1208,9 +1208,8 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm) * If so, pull the packets off the frames and put them to * the handoff node. */ -static inline int -vlib_frame_queue_dequeue_internal (vlib_main_t * vm, - vlib_frame_queue_main_t * fqm) +int +vlib_frame_queue_dequeue (vlib_main_t * vm, vlib_frame_queue_main_t * fqm) { u32 thread_id = vm->cpu_index; vlib_frame_queue_t *fq = fqm->vlib_frame_queues[thread_id]; @@ -1337,75 +1336,6 @@ vlib_frame_queue_dequeue_internal (vlib_main_t * vm, return processed; } -static_always_inline void -vlib_worker_thread_internal (vlib_main_t * vm) -{ - vlib_node_main_t *nm = &vm->node_main; - vlib_thread_main_t *tm = vlib_get_thread_main (); - u64 cpu_time_now = clib_cpu_time_now (); - vlib_frame_queue_main_t *fqm; - - vec_alloc (nm->pending_interrupt_node_runtime_indices, 32); - - while (1) - { - vlib_worker_thread_barrier_check (); - - vec_foreach (fqm, tm->frame_queue_mains) - vlib_frame_queue_dequeue_internal (vm, fqm); - - vlib_node_runtime_t *n; - vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]) - { - cpu_time_now = dispatch_node (vm, n, VLIB_NODE_TYPE_INPUT, - VLIB_NODE_STATE_POLLING, /* frame */ 0, - cpu_time_now); - } - - /* Next handle interrupts. */ - { - uword l = _vec_len (nm->pending_interrupt_node_runtime_indices); - uword i; - if (l > 0) - { - _vec_len (nm->pending_interrupt_node_runtime_indices) = 0; - for (i = 0; i < l; i++) - { - n = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT], - nm-> - pending_interrupt_node_runtime_indices - [i]); - cpu_time_now = - dispatch_node (vm, n, VLIB_NODE_TYPE_INPUT, - VLIB_NODE_STATE_INTERRUPT, - /* frame */ 0, - cpu_time_now); - } - } - } - - if (_vec_len (nm->pending_frames)) - { - int i; - cpu_time_now = clib_cpu_time_now (); - for (i = 0; i < _vec_len (nm->pending_frames); i++) - { - vlib_pending_frame_t *p; - - p = nm->pending_frames + i; - - cpu_time_now = dispatch_pending_node (vm, p, cpu_time_now); - } - _vec_len (nm->pending_frames) = 0; - } - vlib_increment_main_loop_counter (vm); - - /* Record time stamp in case there are no enabled nodes and above - calls do not update time stamp. */ - cpu_time_now = clib_cpu_time_now (); - } -} - void vlib_worker_thread_fn (void *arg) { @@ -1423,7 +1353,7 @@ vlib_worker_thread_fn (void *arg) while (tm->extern_thread_mgmt && tm->worker_thread_release == 0) vlib_worker_thread_barrier_check (); - vlib_worker_thread_internal (vm); + vlib_worker_loop (vm); } /* *INDENT-OFF* */ diff --git a/src/vlib/threads.h b/src/vlib/threads.h index a032311c..fc1633f6 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -159,8 +159,8 @@ int vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index, u32 frame_queue_index, vlib_frame_t * frame, vlib_frame_queue_msg_type_t type); -int vlib_frame_queue_dequeue (int thread_id, - vlib_main_t * vm, vlib_node_main_t * nm); +int +vlib_frame_queue_dequeue (vlib_main_t * vm, vlib_frame_queue_main_t * fqm); u64 dispatch_node (vlib_main_t * vm, vlib_node_runtime_t * node, -- cgit 1.2.3-korg From 374e2c5fc30a5bfabfd2eb6c2d3ca5797402af16 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 9 Mar 2017 20:38:15 +0100 Subject: Retire vpp_lite vpp_lite platform is not needed anymore as same efect can be achieved with following startup.conf config: plugins { plugin dpdk_plugin.so { disable } } Change-Id: I690ea8ceb1c6e1fe32e01e7da54e9958019a93bf Signed-off-by: Damjan Marion --- Makefile | 16 +- build-data/platforms/vpp.mk | 5 - build-data/platforms/vpp_lite.mk | 50 - src/configure.ac | 4 +- src/plugins/Makefile.am | 6 +- src/plugins/ixge.am | 20 + src/plugins/ixge/ixge.c | 2947 +++++++++++++++++++++++ src/plugins/ixge/ixge.h | 1293 ++++++++++ src/vlib/buffer.c | 6 + src/vnet.am | 14 +- src/vnet/devices/nic/ixge.c | 2938 ---------------------- src/vnet/devices/nic/ixge.h | 1293 ---------- src/vnet/devices/nic/sfp.c | 117 - src/vnet/devices/nic/sfp.h | 117 - src/vnet/ethernet/sfp.c | 117 + src/vnet/ethernet/sfp.h | 117 + src/vpp-api/lua/bench.lua | 4 +- src/vpp-api/lua/examples/cli/lua-cli.lua | 4 +- src/vpp-api/lua/examples/example-classifier.lua | 4 +- src/vpp-api/lua/examples/example-cli.lua | 4 +- src/vpp/vnet/main.c | 3 - test/framework.py | 4 +- 22 files changed, 4529 insertions(+), 4554 deletions(-) delete mode 100644 build-data/platforms/vpp_lite.mk create mode 100644 src/plugins/ixge.am create mode 100644 src/plugins/ixge/ixge.c create mode 100644 src/plugins/ixge/ixge.h delete mode 100644 src/vnet/devices/nic/ixge.c delete mode 100644 src/vnet/devices/nic/ixge.h delete mode 100644 src/vnet/devices/nic/sfp.c delete mode 100644 src/vnet/devices/nic/sfp.h create mode 100644 src/vnet/ethernet/sfp.c create mode 100644 src/vnet/ethernet/sfp.h (limited to 'src/vlib') diff --git a/Makefile b/Makefile index f0173cc1..1527d60a 100644 --- a/Makefile +++ b/Makefile @@ -230,18 +230,18 @@ define test endef test: bootstrap - $(call test,vpp_lite,vpp_lite,test) + $(call test,vpp,vpp,test) test-debug: bootstrap - $(call test,vpp_lite,vpp_lite_debug,test) + $(call test,vpp,vpp_debug,test) test-all: bootstrap $(eval EXTENDED_TESTS=yes) - $(call test,vpp_lite,vpp_lite,test) + $(call test,vpp,vpp,test) test-all-debug: bootstrap $(eval EXTENDED_TESTS=yes) - $(call test,vpp_lite,vpp_lite_debug,test) + $(call test,vpp,vpp_debug,test) test-help: @make -C test help @@ -262,7 +262,7 @@ test-wipe-doc: @make -C test wipe-doc test-cov: bootstrap - $(call test,vpp_lite,vpp_lite_gcov,cov) + $(call test,vpp,vpp_gcov,cov) test-wipe-cov: @make -C test wipe-cov @@ -271,10 +271,10 @@ test-checkstyle: @make -C test checkstyle retest: - $(call test,vpp_lite,vpp_lite,retest) + $(call test,vpp,vpp,retest) retest-debug: - $(call test,vpp_lite,vpp_lite_debug,retest) + $(call test,vpp,vpp_debug,retest) STARTUP_DIR ?= $(PWD) ifeq ("$(wildcard $(STARTUP_CONF))","") @@ -376,8 +376,6 @@ endef verify: install-dep $(BR)/.bootstrap.ok dpdk-install-dev $(call banner,"Building for PLATFORM=vpp using gcc") @make -C build-root PLATFORM=vpp TAG=vpp wipe-all install-packages - $(call banner,"Building for PLATFORM=vpp_lite using gcc") - @make -C build-root PLATFORM=vpp_lite TAG=vpp_lite wipe-all install-packages ifeq ($(OS_ID)-$(OS_VERSION_ID),ubuntu-16.04) $(call banner,"Installing dependencies") @sudo -E apt-get update diff --git a/build-data/platforms/vpp.mk b/build-data/platforms/vpp.mk index 401a383a..c61375d8 100644 --- a/build-data/platforms/vpp.mk +++ b/build-data/platforms/vpp.mk @@ -35,11 +35,6 @@ vpp_uses_dpdk = yes vpp_root_packages = vpp gmod -vpp_configure_args_vpp = --with-dpdk - -# Set these parameters carefully. The vlib_buffer_t is 128 bytes, i.e. -vlib_configure_args_vpp = --with-pre-data=128 - # DPDK configuration parameters # vpp_uses_dpdk_cryptodev_sw = yes # vpp_uses_dpdk_mlx5_pmd = yes diff --git a/build-data/platforms/vpp_lite.mk b/build-data/platforms/vpp_lite.mk deleted file mode 100644 index a556b487..00000000 --- a/build-data/platforms/vpp_lite.mk +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2016 Cisco and/or its affiliates. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at: -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# vector packet processor -vpp_lite_arch = native -ifeq ($(shell uname -m),x86_64) -vpp_lite_march = corei7 # Nehalem Instruction set -vpp_lite_mtune = corei7-avx # Optimize for Sandy Bridge -else -vpp_lite_march = native -vpp_lite_mtune = generic -endif -vpp_lite_native_tools = vppapigen - -vpp_lite_uses_dpdk = no - -# Uncoment to enable building unit tests -#vpp_lite_enable_tests = yes - -vpp_lite_root_packages = vpp gmod - -vlib_configure_args_vpp_lite = --with-pre-data=128 - -vnet_configure_args_vpp_lite = -vpp_configure_args_vpp_lite = - -vpp_lite_debug_TAG_CFLAGS = -g -O0 -DCLIB_DEBUG -DFORTIFY_SOURCE=2 -march=$(MARCH) \ - -fstack-protector-all -fPIC -Werror -vpp_lite_debug_TAG_LDFLAGS = -g -O0 -DCLIB_DEBUG -DFORTIFY_SOURCE=2 -march=$(MARCH) \ - -fstack-protector-all -fPIC -Werror - -vpp_lite_TAG_CFLAGS = -g -O2 -DFORTIFY_SOURCE=2 -march=$(MARCH) -mtune=$(MTUNE) \ - -fstack-protector -fPIC -Werror -vpp_lite_TAG_LDFLAGS = -g -O2 -DFORTIFY_SOURCE=2 -march=$(MARCH) -mtune=$(MTUNE) \ - -fstack-protector -fPIC -Werror - -vpp_lite_gcov_TAG_CFLAGS = -g -O0 -DCLIB_DEBUG -march=$(MARCH) \ - -fPIC -Werror -fprofile-arcs -ftest-coverage -vpp_lite_gcov_TAG_LDFLAGS = -g -O0 -DCLIB_DEBUG -march=$(MARCH) \ - -fPIC -Werror -coverage diff --git a/src/configure.ac b/src/configure.ac index c22d152e..d90740d9 100644 --- a/src/configure.ac +++ b/src/configure.ac @@ -97,7 +97,6 @@ DISABLE_ARG(papi, [Disable Python API bindings]) DISABLE_ARG(japi, [Disable Java API bindings]) # --with-X -WITH_ARG(dpdk, [Use use DPDK]) WITH_ARG(dpdk_crypto_sw,[Use DPDK cryptodev SW PMDs]) WITH_ARG(dpdk_mlx5_pmd, [Use DPDK with mlx5 PMD]) @@ -130,7 +129,6 @@ AC_ARG_WITH(pre-data, AC_SUBST(PRE_DATA_SIZE, [$with_pre_data]) AC_SUBST(APICLI, [-DVPP_API_TEST_BUILTIN=${n_with_apicli}]) -AC_DEFINE_UNQUOTED(DPDK, [${n_with_dpdk}]) AC_DEFINE_UNQUOTED(DPDK_SHARED_LIB, [${n_enable_dpdk_shared}]) AC_DEFINE_UNQUOTED(DPDK_CRYPTO_SW, [${n_with_dpdk_crypto_sw}]) AC_DEFINE_UNQUOTED(WITH_LIBSSL, [${n_with_libssl}]) @@ -147,9 +145,11 @@ AC_SUBST(AR_FLAGS) # Please keep alphabetical order PLUGIN_ENABLED(acl) +PLUGIN_ENABLED(dpdk) PLUGIN_ENABLED(flowperpkt) PLUGIN_ENABLED(ila) PLUGIN_ENABLED(ioam) +PLUGIN_ENABLED(ixge) PLUGIN_ENABLED(lb) PLUGIN_ENABLED(sixrd) PLUGIN_ENABLED(snat) diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am index 7b36049e..255e644f 100644 --- a/src/plugins/Makefile.am +++ b/src/plugins/Makefile.am @@ -33,7 +33,7 @@ if ENABLE_ACL_PLUGIN include acl.am endif -if WITH_DPDK +if ENABLE_DPDK_PLUGIN include dpdk.am endif @@ -49,6 +49,10 @@ if ENABLE_IOAM_PLUGIN include ioam.am endif +if ENABLE_IXGE_PLUGIN +include ixge.am +endif + if ENABLE_LB_PLUGIN include lb.am endif diff --git a/src/plugins/ixge.am b/src/plugins/ixge.am new file mode 100644 index 00000000..7e61344b --- /dev/null +++ b/src/plugins/ixge.am @@ -0,0 +1,20 @@ +# Copyright (c) 2016 Cisco Systems, Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +vppplugins_LTLIBRARIES += ixge_plugin.la + +ixge_plugin_la_SOURCES = ixge/ixge.c + +noinst_HEADERS += ixge/ixge.h + +# vi:syntax=automake diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c new file mode 100644 index 00000000..4eebc457 --- /dev/null +++ b/src/plugins/ixge/ixge.c @@ -0,0 +1,2947 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * WARNING! + * This driver is not intended for production use and it is unsupported. + * It is provided for educational use only. + * Please use supported DPDK driver instead. + */ + +#if __x86_64__ +#include + +#ifndef CLIB_HAVE_VEC128 +#warning HACK: ixge driver wont really work, missing u32x4 +typedef unsigned long long u32x4; +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#define IXGE_ALWAYS_POLL 0 + +#define EVENT_SET_FLAGS 0 +#define IXGE_HWBP_RACE_ELOG 0 + +#define PCI_VENDOR_ID_INTEL 0x8086 + +/* 10 GIG E (XGE) PHY IEEE 802.3 clause 45 definitions. */ +#define XGE_PHY_DEV_TYPE_PMA_PMD 1 +#define XGE_PHY_DEV_TYPE_PHY_XS 4 +#define XGE_PHY_ID1 0x2 +#define XGE_PHY_ID2 0x3 +#define XGE_PHY_CONTROL 0x0 +#define XGE_PHY_CONTROL_RESET (1 << 15) + +ixge_main_t ixge_main; +static vlib_node_registration_t ixge_input_node; +static vlib_node_registration_t ixge_process_node; + +static void +ixge_semaphore_get (ixge_device_t * xd) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + ixge_regs_t *r = xd->regs; + u32 i; + + i = 0; + while (!(r->software_semaphore & (1 << 0))) + { + if (i > 0) + vlib_process_suspend (vm, 100e-6); + i++; + } + do + { + r->software_semaphore |= 1 << 1; + } + while (!(r->software_semaphore & (1 << 1))); +} + +static void +ixge_semaphore_release (ixge_device_t * xd) +{ + ixge_regs_t *r = xd->regs; + r->software_semaphore &= ~3; +} + +static void +ixge_software_firmware_sync (ixge_device_t * xd, u32 sw_mask) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + ixge_regs_t *r = xd->regs; + u32 fw_mask = sw_mask << 5; + u32 m, done = 0; + + while (!done) + { + ixge_semaphore_get (xd); + m = r->software_firmware_sync; + done = (m & fw_mask) == 0; + if (done) + r->software_firmware_sync = m | sw_mask; + ixge_semaphore_release (xd); + if (!done) + vlib_process_suspend (vm, 10e-3); + } +} + +static void +ixge_software_firmware_sync_release (ixge_device_t * xd, u32 sw_mask) +{ + ixge_regs_t *r = xd->regs; + ixge_semaphore_get (xd); + r->software_firmware_sync &= ~sw_mask; + ixge_semaphore_release (xd); +} + +u32 +ixge_read_write_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index, + u32 v, u32 is_read) +{ + ixge_regs_t *r = xd->regs; + const u32 busy_bit = 1 << 30; + u32 x; + + ASSERT (xd->phy_index < 2); + ixge_software_firmware_sync (xd, 1 << (1 + xd->phy_index)); + + ASSERT (reg_index < (1 << 16)); + ASSERT (dev_type < (1 << 5)); + if (!is_read) + r->xge_mac.phy_data = v; + + /* Address cycle. */ + x = + reg_index | (dev_type << 16) | (xd-> + phys[xd->phy_index].mdio_address << 21); + r->xge_mac.phy_command = x | busy_bit; + /* Busy wait timed to take 28e-6 secs. No suspend. */ + while (r->xge_mac.phy_command & busy_bit) + ; + + r->xge_mac.phy_command = x | ((is_read ? 2 : 1) << 26) | busy_bit; + while (r->xge_mac.phy_command & busy_bit) + ; + + if (is_read) + v = r->xge_mac.phy_data >> 16; + + ixge_software_firmware_sync_release (xd, 1 << (1 + xd->phy_index)); + + return v; +} + +static u32 +ixge_read_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index) +{ + return ixge_read_write_phy_reg (xd, dev_type, reg_index, 0, /* is_read */ + 1); +} + +static void +ixge_write_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index, u32 v) +{ + (void) ixge_read_write_phy_reg (xd, dev_type, reg_index, v, /* is_read */ + 0); +} + +static void +ixge_i2c_put_bits (i2c_bus_t * b, int scl, int sda) +{ + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, b->private_data); + u32 v; + + v = 0; + v |= (sda != 0) << 3; + v |= (scl != 0) << 1; + xd->regs->i2c_control = v; +} + +static void +ixge_i2c_get_bits (i2c_bus_t * b, int *scl, int *sda) +{ + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, b->private_data); + u32 v; + + v = xd->regs->i2c_control; + *sda = (v & (1 << 2)) != 0; + *scl = (v & (1 << 0)) != 0; +} + +static u16 +ixge_read_eeprom (ixge_device_t * xd, u32 address) +{ + ixge_regs_t *r = xd->regs; + u32 v; + r->eeprom_read = (( /* start bit */ (1 << 0)) | (address << 2)); + /* Wait for done bit. */ + while (!((v = r->eeprom_read) & (1 << 1))) + ; + return v >> 16; +} + +static void +ixge_sfp_enable_disable_laser (ixge_device_t * xd, uword enable) +{ + u32 tx_disable_bit = 1 << 3; + if (enable) + xd->regs->sdp_control &= ~tx_disable_bit; + else + xd->regs->sdp_control |= tx_disable_bit; +} + +static void +ixge_sfp_enable_disable_10g (ixge_device_t * xd, uword enable) +{ + u32 is_10g_bit = 1 << 5; + if (enable) + xd->regs->sdp_control |= is_10g_bit; + else + xd->regs->sdp_control &= ~is_10g_bit; +} + +static clib_error_t * +ixge_sfp_phy_init_from_eeprom (ixge_device_t * xd, u16 sfp_type) +{ + u16 a, id, reg_values_addr = 0; + + a = ixge_read_eeprom (xd, 0x2b); + if (a == 0 || a == 0xffff) + return clib_error_create ("no init sequence in eeprom"); + + while (1) + { + id = ixge_read_eeprom (xd, ++a); + if (id == 0xffff) + break; + reg_values_addr = ixge_read_eeprom (xd, ++a); + if (id == sfp_type) + break; + } + if (id != sfp_type) + return clib_error_create ("failed to find id 0x%x", sfp_type); + + ixge_software_firmware_sync (xd, 1 << 3); + while (1) + { + u16 v = ixge_read_eeprom (xd, ++reg_values_addr); + if (v == 0xffff) + break; + xd->regs->core_analog_config = v; + } + ixge_software_firmware_sync_release (xd, 1 << 3); + + /* Make sure laser is off. We'll turn on the laser when + the interface is brought up. */ + ixge_sfp_enable_disable_laser (xd, /* enable */ 0); + ixge_sfp_enable_disable_10g (xd, /* is_10g */ 1); + + return 0; +} + +static void +ixge_sfp_device_up_down (ixge_device_t * xd, uword is_up) +{ + u32 v; + + if (is_up) + { + /* pma/pmd 10g serial SFI. */ + xd->regs->xge_mac.auto_negotiation_control2 &= ~(3 << 16); + xd->regs->xge_mac.auto_negotiation_control2 |= 2 << 16; + + v = xd->regs->xge_mac.auto_negotiation_control; + v &= ~(7 << 13); + v |= (0 << 13); + /* Restart autoneg. */ + v |= (1 << 12); + xd->regs->xge_mac.auto_negotiation_control = v; + + while (!(xd->regs->xge_mac.link_partner_ability[0] & 0xf0000)) + ; + + v = xd->regs->xge_mac.auto_negotiation_control; + + /* link mode 10g sfi serdes */ + v &= ~(7 << 13); + v |= (3 << 13); + + /* Restart autoneg. */ + v |= (1 << 12); + xd->regs->xge_mac.auto_negotiation_control = v; + + xd->regs->xge_mac.link_status; + } + + ixge_sfp_enable_disable_laser (xd, /* enable */ is_up); + + /* Give time for link partner to notice that we're up. */ + if (is_up && vlib_in_process_context (vlib_get_main ())) + { + vlib_process_suspend (vlib_get_main (), 300e-3); + } +} + +always_inline ixge_dma_regs_t * +get_dma_regs (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 qi) +{ + ixge_regs_t *r = xd->regs; + ASSERT (qi < 128); + if (rt == VLIB_RX) + return qi < 64 ? &r->rx_dma0[qi] : &r->rx_dma1[qi - 64]; + else + return &r->tx_dma[qi]; +} + +static clib_error_t * +ixge_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) +{ + vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index); + uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, hif->dev_instance); + ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, 0); + + if (is_up) + { + xd->regs->rx_enable |= 1; + xd->regs->tx_dma_control |= 1; + dr->control |= 1 << 25; + while (!(dr->control & (1 << 25))) + ; + } + else + { + xd->regs->rx_enable &= ~1; + xd->regs->tx_dma_control &= ~1; + } + + ixge_sfp_device_up_down (xd, is_up); + + return /* no error */ 0; +} + +static void +ixge_sfp_phy_init (ixge_device_t * xd) +{ + ixge_phy_t *phy = xd->phys + xd->phy_index; + i2c_bus_t *ib = &xd->i2c_bus; + + ib->private_data = xd->device_index; + ib->put_bits = ixge_i2c_put_bits; + ib->get_bits = ixge_i2c_get_bits; + vlib_i2c_init (ib); + + vlib_i2c_read_eeprom (ib, 0x50, 0, 128, (u8 *) & xd->sfp_eeprom); + + if (vlib_i2c_bus_timed_out (ib) || !sfp_eeprom_is_valid (&xd->sfp_eeprom)) + xd->sfp_eeprom.id = SFP_ID_unknown; + else + { + /* FIXME 5 => SR/LR eeprom ID. */ + clib_error_t *e = + ixge_sfp_phy_init_from_eeprom (xd, 5 + xd->pci_function); + if (e) + clib_error_report (e); + } + + phy->mdio_address = ~0; +} + +static void +ixge_phy_init (ixge_device_t * xd) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + ixge_phy_t *phy = xd->phys + xd->phy_index; + + switch (xd->device_id) + { + case IXGE_82599_sfp: + case IXGE_82599_sfp_em: + case IXGE_82599_sfp_fcoe: + /* others? */ + return ixge_sfp_phy_init (xd); + + default: + break; + } + + /* Probe address of phy. */ + { + u32 i, v; + + phy->mdio_address = ~0; + for (i = 0; i < 32; i++) + { + phy->mdio_address = i; + v = ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID1); + if (v != 0xffff && v != 0) + break; + } + + /* No PHY found? */ + if (i >= 32) + return; + } + + phy->id = + ((ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID1) << 16) | + ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID2)); + + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d, phy id 0x%d mdio address %d",.format_args = "i4i4i4",}; + struct + { + u32 instance, id, address; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->id = phy->id; + ed->address = phy->mdio_address; + } + + /* Reset phy. */ + ixge_write_phy_reg (xd, XGE_PHY_DEV_TYPE_PHY_XS, XGE_PHY_CONTROL, + XGE_PHY_CONTROL_RESET); + + /* Wait for self-clearning reset bit to clear. */ + do + { + vlib_process_suspend (vm, 1e-3); + } + while (ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PHY_XS, XGE_PHY_CONTROL) & + XGE_PHY_CONTROL_RESET); +} + +static u8 * +format_ixge_rx_from_hw_descriptor (u8 * s, va_list * va) +{ + ixge_rx_from_hw_descriptor_t *d = + va_arg (*va, ixge_rx_from_hw_descriptor_t *); + u32 s0 = d->status[0], s2 = d->status[2]; + u32 is_ip4, is_ip6, is_ip, is_tcp, is_udp; + uword indent = format_get_indent (s); + + s = format (s, "%s-owned", + (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE) ? "sw" : + "hw"); + s = + format (s, ", length this descriptor %d, l3 offset %d", + d->n_packet_bytes_this_descriptor, + IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s0)); + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) + s = format (s, ", end-of-packet"); + + s = format (s, "\n%U", format_white_space, indent); + + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_ETHERNET_ERROR) + s = format (s, "layer2 error"); + + if (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_LAYER2) + { + s = format (s, "layer 2 type %d", (s0 & 0x1f)); + return s; + } + + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_VLAN) + s = format (s, "vlan header 0x%x\n%U", d->vlan_tag, + format_white_space, indent); + + if ((is_ip4 = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4))) + { + s = format (s, "ip4%s", + (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4_EXT) ? " options" : + ""); + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED) + s = format (s, " checksum %s", + (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) ? + "bad" : "ok"); + } + if ((is_ip6 = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6))) + s = format (s, "ip6%s", + (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6_EXT) ? " extended" : + ""); + is_tcp = is_udp = 0; + if ((is_ip = (is_ip4 | is_ip6))) + { + is_tcp = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_TCP) != 0; + is_udp = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_UDP) != 0; + if (is_tcp) + s = format (s, ", tcp"); + if (is_udp) + s = format (s, ", udp"); + } + + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED) + s = format (s, ", tcp checksum %s", + (s2 & IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR) ? "bad" : + "ok"); + if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED) + s = format (s, ", udp checksum %s", + (s2 & IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR) ? "bad" : + "ok"); + + return s; +} + +static u8 * +format_ixge_tx_descriptor (u8 * s, va_list * va) +{ + ixge_tx_descriptor_t *d = va_arg (*va, ixge_tx_descriptor_t *); + u32 s0 = d->status0, s1 = d->status1; + uword indent = format_get_indent (s); + u32 v; + + s = format (s, "buffer 0x%Lx, %d packet bytes, %d bytes this buffer", + d->buffer_address, s1 >> 14, d->n_bytes_this_buffer); + + s = format (s, "\n%U", format_white_space, indent); + + if ((v = (s0 >> 0) & 3)) + s = format (s, "reserved 0x%x, ", v); + + if ((v = (s0 >> 2) & 3)) + s = format (s, "mac 0x%x, ", v); + + if ((v = (s0 >> 4) & 0xf) != 3) + s = format (s, "type 0x%x, ", v); + + s = format (s, "%s%s%s%s%s%s%s%s", + (s0 & (1 << 8)) ? "eop, " : "", + (s0 & (1 << 9)) ? "insert-fcs, " : "", + (s0 & (1 << 10)) ? "reserved26, " : "", + (s0 & (1 << 11)) ? "report-status, " : "", + (s0 & (1 << 12)) ? "reserved28, " : "", + (s0 & (1 << 13)) ? "is-advanced, " : "", + (s0 & (1 << 14)) ? "vlan-enable, " : "", + (s0 & (1 << 15)) ? "tx-segmentation, " : ""); + + if ((v = s1 & 0xf) != 0) + s = format (s, "status 0x%x, ", v); + + if ((v = (s1 >> 4) & 0xf)) + s = format (s, "context 0x%x, ", v); + + if ((v = (s1 >> 8) & 0x3f)) + s = format (s, "options 0x%x, ", v); + + return s; +} + +typedef struct +{ + ixge_descriptor_t before, after; + + u32 buffer_index; + + u16 device_index; + + u8 queue_index; + + u8 is_start_of_packet; + + /* Copy of VLIB buffer; packet data stored in pre_data. */ + vlib_buffer_t buffer; +} ixge_rx_dma_trace_t; + +static u8 * +format_ixge_rx_dma_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + vlib_node_t *node = va_arg (*va, vlib_node_t *); + vnet_main_t *vnm = vnet_get_main (); + ixge_rx_dma_trace_t *t = va_arg (*va, ixge_rx_dma_trace_t *); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, t->device_index); + format_function_t *f; + uword indent = format_get_indent (s); + + { + vnet_sw_interface_t *sw = + vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); + s = + format (s, "%U rx queue %d", format_vnet_sw_interface_name, vnm, sw, + t->queue_index); + } + + s = format (s, "\n%Ubefore: %U", + format_white_space, indent, + format_ixge_rx_from_hw_descriptor, &t->before); + s = format (s, "\n%Uafter : head/tail address 0x%Lx/0x%Lx", + format_white_space, indent, + t->after.rx_to_hw.head_address, t->after.rx_to_hw.tail_address); + + s = format (s, "\n%Ubuffer 0x%x: %U", + format_white_space, indent, + t->buffer_index, format_vlib_buffer, &t->buffer); + + s = format (s, "\n%U", format_white_space, indent); + + f = node->format_buffer; + if (!f || !t->is_start_of_packet) + f = format_hex_bytes; + s = format (s, "%U", f, t->buffer.pre_data, sizeof (t->buffer.pre_data)); + + return s; +} + +#define foreach_ixge_error \ + _ (none, "no error") \ + _ (tx_full_drops, "tx ring full drops") \ + _ (ip4_checksum_error, "ip4 checksum errors") \ + _ (rx_alloc_fail, "rx buf alloc from free list failed") \ + _ (rx_alloc_no_physmem, "rx buf alloc failed no physmem") + +typedef enum +{ +#define _(f,s) IXGE_ERROR_##f, + foreach_ixge_error +#undef _ + IXGE_N_ERROR, +} ixge_error_t; + +always_inline void +ixge_rx_next_and_error_from_status_x1 (ixge_device_t * xd, + u32 s00, u32 s02, + u8 * next0, u8 * error0, u32 * flags0) +{ + u8 is0_ip4, is0_ip6, n0, e0; + u32 f0; + + e0 = IXGE_ERROR_none; + n0 = IXGE_RX_NEXT_ETHERNET_INPUT; + + is0_ip4 = s02 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; + n0 = is0_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n0; + + e0 = (is0_ip4 && (s02 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) + ? IXGE_ERROR_ip4_checksum_error : e0); + + is0_ip6 = s00 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; + n0 = is0_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n0; + + n0 = (xd->per_interface_next_index != ~0) ? + xd->per_interface_next_index : n0; + + /* Check for error. */ + n0 = e0 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n0; + + f0 = ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED + | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) + ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0); + + f0 |= ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR + | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) + ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT); + + *error0 = e0; + *next0 = n0; + *flags0 = f0; +} + +always_inline void +ixge_rx_next_and_error_from_status_x2 (ixge_device_t * xd, + u32 s00, u32 s02, + u32 s10, u32 s12, + u8 * next0, u8 * error0, u32 * flags0, + u8 * next1, u8 * error1, u32 * flags1) +{ + u8 is0_ip4, is0_ip6, n0, e0; + u8 is1_ip4, is1_ip6, n1, e1; + u32 f0, f1; + + e0 = e1 = IXGE_ERROR_none; + n0 = n1 = IXGE_RX_NEXT_IP4_INPUT; + + is0_ip4 = s02 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; + is1_ip4 = s12 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; + + n0 = is0_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n0; + n1 = is1_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n1; + + e0 = (is0_ip4 && (s02 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) + ? IXGE_ERROR_ip4_checksum_error : e0); + e1 = (is1_ip4 && (s12 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) + ? IXGE_ERROR_ip4_checksum_error : e1); + + is0_ip6 = s00 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; + is1_ip6 = s10 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; + + n0 = is0_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n0; + n1 = is1_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n1; + + n0 = (xd->per_interface_next_index != ~0) ? + xd->per_interface_next_index : n0; + n1 = (xd->per_interface_next_index != ~0) ? + xd->per_interface_next_index : n1; + + /* Check for error. */ + n0 = e0 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n0; + n1 = e1 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n1; + + *error0 = e0; + *error1 = e1; + + *next0 = n0; + *next1 = n1; + + f0 = ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED + | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) + ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0); + f1 = ((s12 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED + | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) + ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0); + + f0 |= ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR + | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) + ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT); + f1 |= ((s12 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR + | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) + ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT); + + *flags0 = f0; + *flags1 = f1; +} + +static void +ixge_rx_trace (ixge_main_t * xm, + ixge_device_t * xd, + ixge_dma_queue_t * dq, + ixge_descriptor_t * before_descriptors, + u32 * before_buffers, + ixge_descriptor_t * after_descriptors, uword n_descriptors) +{ + vlib_main_t *vm = xm->vlib_main; + vlib_node_runtime_t *node = dq->rx.node; + ixge_rx_from_hw_descriptor_t *bd; + ixge_rx_to_hw_descriptor_t *ad; + u32 *b, n_left, is_sop, next_index_sop; + + n_left = n_descriptors; + b = before_buffers; + bd = &before_descriptors->rx_from_hw; + ad = &after_descriptors->rx_to_hw; + is_sop = dq->rx.is_start_of_packet; + next_index_sop = dq->rx.saved_start_of_packet_next_index; + + while (n_left >= 2) + { + u32 bi0, bi1, flags0, flags1; + vlib_buffer_t *b0, *b1; + ixge_rx_dma_trace_t *t0, *t1; + u8 next0, error0, next1, error1; + + bi0 = b[0]; + bi1 = b[1]; + n_left -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + ixge_rx_next_and_error_from_status_x2 (xd, + bd[0].status[0], bd[0].status[2], + bd[1].status[0], bd[1].status[2], + &next0, &error0, &flags0, + &next1, &error1, &flags1); + + next_index_sop = is_sop ? next0 : next_index_sop; + vlib_trace_buffer (vm, node, next_index_sop, b0, /* follow_chain */ 0); + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->is_start_of_packet = is_sop; + is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + next_index_sop = is_sop ? next1 : next_index_sop; + vlib_trace_buffer (vm, node, next_index_sop, b1, /* follow_chain */ 0); + t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0])); + t1->is_start_of_packet = is_sop; + is_sop = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t0->queue_index = dq->queue_index; + t1->queue_index = dq->queue_index; + t0->device_index = xd->device_index; + t1->device_index = xd->device_index; + t0->before.rx_from_hw = bd[0]; + t1->before.rx_from_hw = bd[1]; + t0->after.rx_to_hw = ad[0]; + t1->after.rx_to_hw = ad[1]; + t0->buffer_index = bi0; + t1->buffer_index = bi1; + memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); + memcpy (&t1->buffer, b1, sizeof (b1[0]) - sizeof (b0->pre_data)); + memcpy (t0->buffer.pre_data, b0->data + b0->current_data, + sizeof (t0->buffer.pre_data)); + memcpy (t1->buffer.pre_data, b1->data + b1->current_data, + sizeof (t1->buffer.pre_data)); + + b += 2; + bd += 2; + ad += 2; + } + + while (n_left >= 1) + { + u32 bi0, flags0; + vlib_buffer_t *b0; + ixge_rx_dma_trace_t *t0; + u8 next0, error0; + + bi0 = b[0]; + n_left -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + ixge_rx_next_and_error_from_status_x1 (xd, + bd[0].status[0], bd[0].status[2], + &next0, &error0, &flags0); + + next_index_sop = is_sop ? next0 : next_index_sop; + vlib_trace_buffer (vm, node, next_index_sop, b0, /* follow_chain */ 0); + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->is_start_of_packet = is_sop; + is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t0->queue_index = dq->queue_index; + t0->device_index = xd->device_index; + t0->before.rx_from_hw = bd[0]; + t0->after.rx_to_hw = ad[0]; + t0->buffer_index = bi0; + memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); + memcpy (t0->buffer.pre_data, b0->data + b0->current_data, + sizeof (t0->buffer.pre_data)); + + b += 1; + bd += 1; + ad += 1; + } +} + +typedef struct +{ + ixge_tx_descriptor_t descriptor; + + u32 buffer_index; + + u16 device_index; + + u8 queue_index; + + u8 is_start_of_packet; + + /* Copy of VLIB buffer; packet data stored in pre_data. */ + vlib_buffer_t buffer; +} ixge_tx_dma_trace_t; + +static u8 * +format_ixge_tx_dma_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *); + ixge_tx_dma_trace_t *t = va_arg (*va, ixge_tx_dma_trace_t *); + vnet_main_t *vnm = vnet_get_main (); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, t->device_index); + format_function_t *f; + uword indent = format_get_indent (s); + + { + vnet_sw_interface_t *sw = + vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); + s = + format (s, "%U tx queue %d", format_vnet_sw_interface_name, vnm, sw, + t->queue_index); + } + + s = format (s, "\n%Udescriptor: %U", + format_white_space, indent, + format_ixge_tx_descriptor, &t->descriptor); + + s = format (s, "\n%Ubuffer 0x%x: %U", + format_white_space, indent, + t->buffer_index, format_vlib_buffer, &t->buffer); + + s = format (s, "\n%U", format_white_space, indent); + + f = format_ethernet_header_with_length; + if (!f || !t->is_start_of_packet) + f = format_hex_bytes; + s = format (s, "%U", f, t->buffer.pre_data, sizeof (t->buffer.pre_data)); + + return s; +} + +typedef struct +{ + vlib_node_runtime_t *node; + + u32 is_start_of_packet; + + u32 n_bytes_in_packet; + + ixge_tx_descriptor_t *start_of_packet_descriptor; +} ixge_tx_state_t; + +static void +ixge_tx_trace (ixge_main_t * xm, + ixge_device_t * xd, + ixge_dma_queue_t * dq, + ixge_tx_state_t * tx_state, + ixge_tx_descriptor_t * descriptors, + u32 * buffers, uword n_descriptors) +{ + vlib_main_t *vm = xm->vlib_main; + vlib_node_runtime_t *node = tx_state->node; + ixge_tx_descriptor_t *d; + u32 *b, n_left, is_sop; + + n_left = n_descriptors; + b = buffers; + d = descriptors; + is_sop = tx_state->is_start_of_packet; + + while (n_left >= 2) + { + u32 bi0, bi1; + vlib_buffer_t *b0, *b1; + ixge_tx_dma_trace_t *t0, *t1; + + bi0 = b[0]; + bi1 = b[1]; + n_left -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->is_start_of_packet = is_sop; + is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0])); + t1->is_start_of_packet = is_sop; + is_sop = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t0->queue_index = dq->queue_index; + t1->queue_index = dq->queue_index; + t0->device_index = xd->device_index; + t1->device_index = xd->device_index; + t0->descriptor = d[0]; + t1->descriptor = d[1]; + t0->buffer_index = bi0; + t1->buffer_index = bi1; + memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); + memcpy (&t1->buffer, b1, sizeof (b1[0]) - sizeof (b0->pre_data)); + memcpy (t0->buffer.pre_data, b0->data + b0->current_data, + sizeof (t0->buffer.pre_data)); + memcpy (t1->buffer.pre_data, b1->data + b1->current_data, + sizeof (t1->buffer.pre_data)); + + b += 2; + d += 2; + } + + while (n_left >= 1) + { + u32 bi0; + vlib_buffer_t *b0; + ixge_tx_dma_trace_t *t0; + + bi0 = b[0]; + n_left -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->is_start_of_packet = is_sop; + is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + t0->queue_index = dq->queue_index; + t0->device_index = xd->device_index; + t0->descriptor = d[0]; + t0->buffer_index = bi0; + memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); + memcpy (t0->buffer.pre_data, b0->data + b0->current_data, + sizeof (t0->buffer.pre_data)); + + b += 1; + d += 1; + } +} + +always_inline uword +ixge_ring_sub (ixge_dma_queue_t * q, u32 i0, u32 i1) +{ + i32 d = i1 - i0; + ASSERT (i0 < q->n_descriptors); + ASSERT (i1 < q->n_descriptors); + return d < 0 ? q->n_descriptors + d : d; +} + +always_inline uword +ixge_ring_add (ixge_dma_queue_t * q, u32 i0, u32 i1) +{ + u32 d = i0 + i1; + ASSERT (i0 < q->n_descriptors); + ASSERT (i1 < q->n_descriptors); + d -= d >= q->n_descriptors ? q->n_descriptors : 0; + return d; +} + +always_inline uword +ixge_tx_descriptor_matches_template (ixge_main_t * xm, + ixge_tx_descriptor_t * d) +{ + u32 cmp; + + cmp = ((d->status0 & xm->tx_descriptor_template_mask.status0) + ^ xm->tx_descriptor_template.status0); + if (cmp) + return 0; + cmp = ((d->status1 & xm->tx_descriptor_template_mask.status1) + ^ xm->tx_descriptor_template.status1); + if (cmp) + return 0; + + return 1; +} + +static uword +ixge_tx_no_wrap (ixge_main_t * xm, + ixge_device_t * xd, + ixge_dma_queue_t * dq, + u32 * buffers, + u32 start_descriptor_index, + u32 n_descriptors, ixge_tx_state_t * tx_state) +{ + vlib_main_t *vm = xm->vlib_main; + ixge_tx_descriptor_t *d, *d_sop; + u32 n_left = n_descriptors; + u32 *to_free = vec_end (xm->tx_buffers_pending_free); + u32 *to_tx = + vec_elt_at_index (dq->descriptor_buffer_indices, start_descriptor_index); + u32 is_sop = tx_state->is_start_of_packet; + u32 len_sop = tx_state->n_bytes_in_packet; + u16 template_status = xm->tx_descriptor_template.status0; + u32 descriptor_prefetch_rotor = 0; + + ASSERT (start_descriptor_index + n_descriptors <= dq->n_descriptors); + d = &dq->descriptors[start_descriptor_index].tx; + d_sop = is_sop ? d : tx_state->start_of_packet_descriptor; + + while (n_left >= 4) + { + vlib_buffer_t *b0, *b1; + u32 bi0, fi0, len0; + u32 bi1, fi1, len1; + u8 is_eop0, is_eop1; + + /* Prefetch next iteration. */ + vlib_prefetch_buffer_with_index (vm, buffers[2], LOAD); + vlib_prefetch_buffer_with_index (vm, buffers[3], LOAD); + + if ((descriptor_prefetch_rotor & 0x3) == 0) + CLIB_PREFETCH (d + 4, CLIB_CACHE_LINE_BYTES, STORE); + + descriptor_prefetch_rotor += 2; + + bi0 = buffers[0]; + bi1 = buffers[1]; + + to_free[0] = fi0 = to_tx[0]; + to_tx[0] = bi0; + to_free += fi0 != 0; + + to_free[0] = fi1 = to_tx[1]; + to_tx[1] = bi1; + to_free += fi1 != 0; + + buffers += 2; + n_left -= 2; + to_tx += 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + is_eop0 = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + is_eop1 = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + len0 = b0->current_length; + len1 = b1->current_length; + + ASSERT (ixge_tx_descriptor_matches_template (xm, d + 0)); + ASSERT (ixge_tx_descriptor_matches_template (xm, d + 1)); + + d[0].buffer_address = + vlib_get_buffer_data_physical_address (vm, bi0) + b0->current_data; + d[1].buffer_address = + vlib_get_buffer_data_physical_address (vm, bi1) + b1->current_data; + + d[0].n_bytes_this_buffer = len0; + d[1].n_bytes_this_buffer = len1; + + d[0].status0 = + template_status | (is_eop0 << + IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); + d[1].status0 = + template_status | (is_eop1 << + IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); + + len_sop = (is_sop ? 0 : len_sop) + len0; + d_sop[0].status1 = + IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); + d += 1; + d_sop = is_eop0 ? d : d_sop; + + is_sop = is_eop0; + + len_sop = (is_sop ? 0 : len_sop) + len1; + d_sop[0].status1 = + IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); + d += 1; + d_sop = is_eop1 ? d : d_sop; + + is_sop = is_eop1; + } + + while (n_left > 0) + { + vlib_buffer_t *b0; + u32 bi0, fi0, len0; + u8 is_eop0; + + bi0 = buffers[0]; + + to_free[0] = fi0 = to_tx[0]; + to_tx[0] = bi0; + to_free += fi0 != 0; + + buffers += 1; + n_left -= 1; + to_tx += 1; + + b0 = vlib_get_buffer (vm, bi0); + + is_eop0 = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; + + len0 = b0->current_length; + + ASSERT (ixge_tx_descriptor_matches_template (xm, d + 0)); + + d[0].buffer_address = + vlib_get_buffer_data_physical_address (vm, bi0) + b0->current_data; + + d[0].n_bytes_this_buffer = len0; + + d[0].status0 = + template_status | (is_eop0 << + IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); + + len_sop = (is_sop ? 0 : len_sop) + len0; + d_sop[0].status1 = + IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); + d += 1; + d_sop = is_eop0 ? d : d_sop; + + is_sop = is_eop0; + } + + if (tx_state->node->flags & VLIB_NODE_FLAG_TRACE) + { + to_tx = + vec_elt_at_index (dq->descriptor_buffer_indices, + start_descriptor_index); + ixge_tx_trace (xm, xd, dq, tx_state, + &dq->descriptors[start_descriptor_index].tx, to_tx, + n_descriptors); + } + + _vec_len (xm->tx_buffers_pending_free) = + to_free - xm->tx_buffers_pending_free; + + /* When we are done d_sop can point to end of ring. Wrap it if so. */ + { + ixge_tx_descriptor_t *d_start = &dq->descriptors[0].tx; + + ASSERT (d_sop - d_start <= dq->n_descriptors); + d_sop = d_sop - d_start == dq->n_descriptors ? d_start : d_sop; + } + + tx_state->is_start_of_packet = is_sop; + tx_state->start_of_packet_descriptor = d_sop; + tx_state->n_bytes_in_packet = len_sop; + + return n_descriptors; +} + +static uword +ixge_interface_tx (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * f) +{ + ixge_main_t *xm = &ixge_main; + vnet_interface_output_runtime_t *rd = (void *) node->runtime_data; + ixge_device_t *xd = vec_elt_at_index (xm->devices, rd->dev_instance); + ixge_dma_queue_t *dq; + u32 *from, n_left_tx, n_descriptors_to_tx, n_tail_drop; + u32 queue_index = 0; /* fixme parameter */ + ixge_tx_state_t tx_state; + + tx_state.node = node; + tx_state.is_start_of_packet = 1; + tx_state.start_of_packet_descriptor = 0; + tx_state.n_bytes_in_packet = 0; + + from = vlib_frame_vector_args (f); + + dq = vec_elt_at_index (xd->dma_queues[VLIB_TX], queue_index); + + dq->head_index = dq->tx.head_index_write_back[0]; + + /* Since head == tail means ring is empty we can send up to dq->n_descriptors - 1. */ + n_left_tx = dq->n_descriptors - 1; + n_left_tx -= ixge_ring_sub (dq, dq->head_index, dq->tail_index); + + _vec_len (xm->tx_buffers_pending_free) = 0; + + n_descriptors_to_tx = f->n_vectors; + n_tail_drop = 0; + if (PREDICT_FALSE (n_descriptors_to_tx > n_left_tx)) + { + i32 i, n_ok, i_eop, i_sop; + + i_sop = i_eop = ~0; + for (i = n_left_tx - 1; i >= 0; i--) + { + vlib_buffer_t *b = vlib_get_buffer (vm, from[i]); + if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + if (i_sop != ~0 && i_eop != ~0) + break; + i_eop = i; + i_sop = i + 1; + } + } + if (i == 0) + n_ok = 0; + else + n_ok = i_eop + 1; + + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d, ring full to tx %d head %d tail %d",.format_args = + "i2i2i2i2",}; + struct + { + u16 instance, to_tx, head, tail; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->to_tx = n_descriptors_to_tx; + ed->head = dq->head_index; + ed->tail = dq->tail_index; + } + + if (n_ok < n_descriptors_to_tx) + { + n_tail_drop = n_descriptors_to_tx - n_ok; + vec_add (xm->tx_buffers_pending_free, from + n_ok, n_tail_drop); + vlib_error_count (vm, ixge_input_node.index, + IXGE_ERROR_tx_full_drops, n_tail_drop); + } + + n_descriptors_to_tx = n_ok; + } + + dq->tx.n_buffers_on_ring += n_descriptors_to_tx; + + /* Process from tail to end of descriptor ring. */ + if (n_descriptors_to_tx > 0 && dq->tail_index < dq->n_descriptors) + { + u32 n = + clib_min (dq->n_descriptors - dq->tail_index, n_descriptors_to_tx); + n = ixge_tx_no_wrap (xm, xd, dq, from, dq->tail_index, n, &tx_state); + from += n; + n_descriptors_to_tx -= n; + dq->tail_index += n; + ASSERT (dq->tail_index <= dq->n_descriptors); + if (dq->tail_index == dq->n_descriptors) + dq->tail_index = 0; + } + + if (n_descriptors_to_tx > 0) + { + u32 n = + ixge_tx_no_wrap (xm, xd, dq, from, 0, n_descriptors_to_tx, &tx_state); + from += n; + ASSERT (n == n_descriptors_to_tx); + dq->tail_index += n; + ASSERT (dq->tail_index <= dq->n_descriptors); + if (dq->tail_index == dq->n_descriptors) + dq->tail_index = 0; + } + + /* We should only get full packets. */ + ASSERT (tx_state.is_start_of_packet); + + /* Report status when last descriptor is done. */ + { + u32 i = dq->tail_index == 0 ? dq->n_descriptors - 1 : dq->tail_index - 1; + ixge_tx_descriptor_t *d = &dq->descriptors[i].tx; + d->status0 |= IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS; + } + + /* Give new descriptors to hardware. */ + { + ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_TX, queue_index); + + CLIB_MEMORY_BARRIER (); + + dr->tail_index = dq->tail_index; + } + + /* Free any buffers that are done. */ + { + u32 n = _vec_len (xm->tx_buffers_pending_free); + if (n > 0) + { + vlib_buffer_free_no_next (vm, xm->tx_buffers_pending_free, n); + _vec_len (xm->tx_buffers_pending_free) = 0; + ASSERT (dq->tx.n_buffers_on_ring >= n); + dq->tx.n_buffers_on_ring -= (n - n_tail_drop); + } + } + + return f->n_vectors; +} + +static uword +ixge_rx_queue_no_wrap (ixge_main_t * xm, + ixge_device_t * xd, + ixge_dma_queue_t * dq, + u32 start_descriptor_index, u32 n_descriptors) +{ + vlib_main_t *vm = xm->vlib_main; + vlib_node_runtime_t *node = dq->rx.node; + ixge_descriptor_t *d; + static ixge_descriptor_t *d_trace_save; + static u32 *d_trace_buffers; + u32 n_descriptors_left = n_descriptors; + u32 *to_rx = + vec_elt_at_index (dq->descriptor_buffer_indices, start_descriptor_index); + u32 *to_add; + u32 bi_sop = dq->rx.saved_start_of_packet_buffer_index; + u32 bi_last = dq->rx.saved_last_buffer_index; + u32 next_index_sop = dq->rx.saved_start_of_packet_next_index; + u32 is_sop = dq->rx.is_start_of_packet; + u32 next_index, n_left_to_next, *to_next; + u32 n_packets = 0; + u32 n_bytes = 0; + u32 n_trace = vlib_get_trace_count (vm, node); + vlib_buffer_t *b_last, b_dummy; + + ASSERT (start_descriptor_index + n_descriptors <= dq->n_descriptors); + d = &dq->descriptors[start_descriptor_index]; + + b_last = bi_last != ~0 ? vlib_get_buffer (vm, bi_last) : &b_dummy; + next_index = dq->rx.next_index; + + if (n_trace > 0) + { + u32 n = clib_min (n_trace, n_descriptors); + if (d_trace_save) + { + _vec_len (d_trace_save) = 0; + _vec_len (d_trace_buffers) = 0; + } + vec_add (d_trace_save, (ixge_descriptor_t *) d, n); + vec_add (d_trace_buffers, to_rx, n); + } + + { + uword l = vec_len (xm->rx_buffers_to_add); + + if (l < n_descriptors_left) + { + u32 n_to_alloc = 2 * dq->n_descriptors - l; + u32 n_allocated; + + vec_resize (xm->rx_buffers_to_add, n_to_alloc); + + _vec_len (xm->rx_buffers_to_add) = l; + n_allocated = vlib_buffer_alloc_from_free_list + (vm, xm->rx_buffers_to_add + l, n_to_alloc, + xm->vlib_buffer_free_list_index); + _vec_len (xm->rx_buffers_to_add) += n_allocated; + + /* Handle transient allocation failure */ + if (PREDICT_FALSE (l + n_allocated <= n_descriptors_left)) + { + if (n_allocated == 0) + vlib_error_count (vm, ixge_input_node.index, + IXGE_ERROR_rx_alloc_no_physmem, 1); + else + vlib_error_count (vm, ixge_input_node.index, + IXGE_ERROR_rx_alloc_fail, 1); + + n_descriptors_left = l + n_allocated; + } + n_descriptors = n_descriptors_left; + } + + /* Add buffers from end of vector going backwards. */ + to_add = vec_end (xm->rx_buffers_to_add) - 1; + } + + while (n_descriptors_left > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_descriptors_left >= 4 && n_left_to_next >= 2) + { + vlib_buffer_t *b0, *b1; + u32 bi0, fi0, len0, l3_offset0, s20, s00, flags0; + u32 bi1, fi1, len1, l3_offset1, s21, s01, flags1; + u8 is_eop0, error0, next0; + u8 is_eop1, error1, next1; + ixge_descriptor_t d0, d1; + + vlib_prefetch_buffer_with_index (vm, to_rx[2], STORE); + vlib_prefetch_buffer_with_index (vm, to_rx[3], STORE); + + CLIB_PREFETCH (d + 2, 32, STORE); + + d0.as_u32x4 = d[0].as_u32x4; + d1.as_u32x4 = d[1].as_u32x4; + + s20 = d0.rx_from_hw.status[2]; + s21 = d1.rx_from_hw.status[2]; + + s00 = d0.rx_from_hw.status[0]; + s01 = d1.rx_from_hw.status[0]; + + if (! + ((s20 & s21) & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE)) + goto found_hw_owned_descriptor_x2; + + bi0 = to_rx[0]; + bi1 = to_rx[1]; + + ASSERT (to_add - 1 >= xm->rx_buffers_to_add); + fi0 = to_add[0]; + fi1 = to_add[-1]; + + to_rx[0] = fi0; + to_rx[1] = fi1; + to_rx += 2; + to_add -= 2; + + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, bi0)); + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, bi1)); + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, fi0)); + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, fi1)); + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + /* + * Turn this on if you run into + * "bad monkey" contexts, and you want to know exactly + * which nodes they've visited... See main.c... + */ + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1); + + CLIB_PREFETCH (b0->data, CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (b1->data, CLIB_CACHE_LINE_BYTES, LOAD); + + is_eop0 = (s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; + is_eop1 = (s21 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; + + ixge_rx_next_and_error_from_status_x2 (xd, s00, s20, s01, s21, + &next0, &error0, &flags0, + &next1, &error1, &flags1); + + next0 = is_sop ? next0 : next_index_sop; + next1 = is_eop0 ? next1 : next0; + next_index_sop = next1; + + b0->flags |= flags0 | (!is_eop0 << VLIB_BUFFER_LOG2_NEXT_PRESENT); + b1->flags |= flags1 | (!is_eop1 << VLIB_BUFFER_LOG2_NEXT_PRESENT); + + vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + vnet_buffer (b1)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0; + + b0->error = node->errors[error0]; + b1->error = node->errors[error1]; + + len0 = d0.rx_from_hw.n_packet_bytes_this_descriptor; + len1 = d1.rx_from_hw.n_packet_bytes_this_descriptor; + n_bytes += len0 + len1; + n_packets += is_eop0 + is_eop1; + + /* Give new buffers to hardware. */ + d0.rx_to_hw.tail_address = + vlib_get_buffer_data_physical_address (vm, fi0); + d1.rx_to_hw.tail_address = + vlib_get_buffer_data_physical_address (vm, fi1); + d0.rx_to_hw.head_address = d[0].rx_to_hw.tail_address; + d1.rx_to_hw.head_address = d[1].rx_to_hw.tail_address; + d[0].as_u32x4 = d0.as_u32x4; + d[1].as_u32x4 = d1.as_u32x4; + + d += 2; + n_descriptors_left -= 2; + + /* Point to either l2 or l3 header depending on next. */ + l3_offset0 = (is_sop && (next0 != IXGE_RX_NEXT_ETHERNET_INPUT)) + ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s00) : 0; + l3_offset1 = (is_eop0 && (next1 != IXGE_RX_NEXT_ETHERNET_INPUT)) + ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s01) : 0; + + b0->current_length = len0 - l3_offset0; + b1->current_length = len1 - l3_offset1; + b0->current_data = l3_offset0; + b1->current_data = l3_offset1; + + b_last->next_buffer = is_sop ? ~0 : bi0; + b0->next_buffer = is_eop0 ? ~0 : bi1; + bi_last = bi1; + b_last = b1; + + if (CLIB_DEBUG > 0) + { + u32 bi_sop0 = is_sop ? bi0 : bi_sop; + u32 bi_sop1 = is_eop0 ? bi1 : bi_sop0; + + if (is_eop0) + { + u8 *msg = vlib_validate_buffer (vm, bi_sop0, + /* follow_buffer_next */ 1); + ASSERT (!msg); + } + if (is_eop1) + { + u8 *msg = vlib_validate_buffer (vm, bi_sop1, + /* follow_buffer_next */ 1); + ASSERT (!msg); + } + } + if (0) /* "Dave" version */ + { + u32 bi_sop0 = is_sop ? bi0 : bi_sop; + u32 bi_sop1 = is_eop0 ? bi1 : bi_sop0; + + if (is_eop0) + { + to_next[0] = bi_sop0; + to_next++; + n_left_to_next--; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi_sop0, next0); + } + if (is_eop1) + { + to_next[0] = bi_sop1; + to_next++; + n_left_to_next--; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi_sop1, next1); + } + is_sop = is_eop1; + bi_sop = bi_sop1; + } + if (1) /* "Eliot" version */ + { + /* Speculatively enqueue to cached next. */ + u8 saved_is_sop = is_sop; + u32 bi_sop_save = bi_sop; + + bi_sop = saved_is_sop ? bi0 : bi_sop; + to_next[0] = bi_sop; + to_next += is_eop0; + n_left_to_next -= is_eop0; + + bi_sop = is_eop0 ? bi1 : bi_sop; + to_next[0] = bi_sop; + to_next += is_eop1; + n_left_to_next -= is_eop1; + + is_sop = is_eop1; + + if (PREDICT_FALSE + (!(next0 == next_index && next1 == next_index))) + { + /* Undo speculation. */ + to_next -= is_eop0 + is_eop1; + n_left_to_next += is_eop0 + is_eop1; + + /* Re-do both descriptors being careful about where we enqueue. */ + bi_sop = saved_is_sop ? bi0 : bi_sop_save; + if (is_eop0) + { + if (next0 != next_index) + vlib_set_next_frame_buffer (vm, node, next0, bi_sop); + else + { + to_next[0] = bi_sop; + to_next += 1; + n_left_to_next -= 1; + } + } + + bi_sop = is_eop0 ? bi1 : bi_sop; + if (is_eop1) + { + if (next1 != next_index) + vlib_set_next_frame_buffer (vm, node, next1, bi_sop); + else + { + to_next[0] = bi_sop; + to_next += 1; + n_left_to_next -= 1; + } + } + + /* Switch cached next index when next for both packets is the same. */ + if (is_eop0 && is_eop1 && next0 == next1) + { + vlib_put_next_frame (vm, node, next_index, + n_left_to_next); + next_index = next0; + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + } + } + } + } + + /* Bail out of dual loop and proceed with single loop. */ + found_hw_owned_descriptor_x2: + + while (n_descriptors_left > 0 && n_left_to_next > 0) + { + vlib_buffer_t *b0; + u32 bi0, fi0, len0, l3_offset0, s20, s00, flags0; + u8 is_eop0, error0, next0; + ixge_descriptor_t d0; + + d0.as_u32x4 = d[0].as_u32x4; + + s20 = d0.rx_from_hw.status[2]; + s00 = d0.rx_from_hw.status[0]; + + if (!(s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE)) + goto found_hw_owned_descriptor_x1; + + bi0 = to_rx[0]; + ASSERT (to_add >= xm->rx_buffers_to_add); + fi0 = to_add[0]; + + to_rx[0] = fi0; + to_rx += 1; + to_add -= 1; + + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, bi0)); + ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == + vlib_buffer_is_known (vm, fi0)); + + b0 = vlib_get_buffer (vm, bi0); + + /* + * Turn this on if you run into + * "bad monkey" contexts, and you want to know exactly + * which nodes they've visited... + */ + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + + is_eop0 = (s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; + ixge_rx_next_and_error_from_status_x1 + (xd, s00, s20, &next0, &error0, &flags0); + + next0 = is_sop ? next0 : next_index_sop; + next_index_sop = next0; + + b0->flags |= flags0 | (!is_eop0 << VLIB_BUFFER_LOG2_NEXT_PRESENT); + + vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; + vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; + + b0->error = node->errors[error0]; + + len0 = d0.rx_from_hw.n_packet_bytes_this_descriptor; + n_bytes += len0; + n_packets += is_eop0; + + /* Give new buffer to hardware. */ + d0.rx_to_hw.tail_address = + vlib_get_buffer_data_physical_address (vm, fi0); + d0.rx_to_hw.head_address = d0.rx_to_hw.tail_address; + d[0].as_u32x4 = d0.as_u32x4; + + d += 1; + n_descriptors_left -= 1; + + /* Point to either l2 or l3 header depending on next. */ + l3_offset0 = (is_sop && (next0 != IXGE_RX_NEXT_ETHERNET_INPUT)) + ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s00) : 0; + b0->current_length = len0 - l3_offset0; + b0->current_data = l3_offset0; + + b_last->next_buffer = is_sop ? ~0 : bi0; + bi_last = bi0; + b_last = b0; + + bi_sop = is_sop ? bi0 : bi_sop; + + if (CLIB_DEBUG > 0 && is_eop0) + { + u8 *msg = + vlib_validate_buffer (vm, bi_sop, /* follow_buffer_next */ 1); + ASSERT (!msg); + } + + if (0) /* "Dave" version */ + { + if (is_eop0) + { + to_next[0] = bi_sop; + to_next++; + n_left_to_next--; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi_sop, next0); + } + } + if (1) /* "Eliot" version */ + { + if (PREDICT_TRUE (next0 == next_index)) + { + to_next[0] = bi_sop; + to_next += is_eop0; + n_left_to_next -= is_eop0; + } + else + { + if (next0 != next_index && is_eop0) + vlib_set_next_frame_buffer (vm, node, next0, bi_sop); + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + next_index = next0; + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + } + } + is_sop = is_eop0; + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + +found_hw_owned_descriptor_x1: + if (n_descriptors_left > 0) + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + + _vec_len (xm->rx_buffers_to_add) = (to_add + 1) - xm->rx_buffers_to_add; + + { + u32 n_done = n_descriptors - n_descriptors_left; + + if (n_trace > 0 && n_done > 0) + { + u32 n = clib_min (n_trace, n_done); + ixge_rx_trace (xm, xd, dq, + d_trace_save, + d_trace_buffers, + &dq->descriptors[start_descriptor_index], n); + vlib_set_trace_count (vm, node, n_trace - n); + } + if (d_trace_save) + { + _vec_len (d_trace_save) = 0; + _vec_len (d_trace_buffers) = 0; + } + + /* Don't keep a reference to b_last if we don't have to. + Otherwise we can over-write a next_buffer pointer after already haven + enqueued a packet. */ + if (is_sop) + { + b_last->next_buffer = ~0; + bi_last = ~0; + } + + dq->rx.n_descriptors_done_this_call = n_done; + dq->rx.n_descriptors_done_total += n_done; + dq->rx.is_start_of_packet = is_sop; + dq->rx.saved_start_of_packet_buffer_index = bi_sop; + dq->rx.saved_last_buffer_index = bi_last; + dq->rx.saved_start_of_packet_next_index = next_index_sop; + dq->rx.next_index = next_index; + dq->rx.n_bytes += n_bytes; + + return n_packets; + } +} + +static uword +ixge_rx_queue (ixge_main_t * xm, + ixge_device_t * xd, + vlib_node_runtime_t * node, u32 queue_index) +{ + ixge_dma_queue_t *dq = + vec_elt_at_index (xd->dma_queues[VLIB_RX], queue_index); + ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, dq->queue_index); + uword n_packets = 0; + u32 hw_head_index, sw_head_index; + + /* One time initialization. */ + if (!dq->rx.node) + { + dq->rx.node = node; + dq->rx.is_start_of_packet = 1; + dq->rx.saved_start_of_packet_buffer_index = ~0; + dq->rx.saved_last_buffer_index = ~0; + } + + dq->rx.next_index = node->cached_next_index; + + dq->rx.n_descriptors_done_total = 0; + dq->rx.n_descriptors_done_this_call = 0; + dq->rx.n_bytes = 0; + + /* Fetch head from hardware and compare to where we think we are. */ + hw_head_index = dr->head_index; + sw_head_index = dq->head_index; + + if (hw_head_index == sw_head_index) + goto done; + + if (hw_head_index < sw_head_index) + { + u32 n_tried = dq->n_descriptors - sw_head_index; + n_packets += ixge_rx_queue_no_wrap (xm, xd, dq, sw_head_index, n_tried); + sw_head_index = + ixge_ring_add (dq, sw_head_index, + dq->rx.n_descriptors_done_this_call); + + if (dq->rx.n_descriptors_done_this_call != n_tried) + goto done; + } + if (hw_head_index >= sw_head_index) + { + u32 n_tried = hw_head_index - sw_head_index; + n_packets += ixge_rx_queue_no_wrap (xm, xd, dq, sw_head_index, n_tried); + sw_head_index = + ixge_ring_add (dq, sw_head_index, + dq->rx.n_descriptors_done_this_call); + } + +done: + dq->head_index = sw_head_index; + dq->tail_index = + ixge_ring_add (dq, dq->tail_index, dq->rx.n_descriptors_done_total); + + /* Give tail back to hardware. */ + CLIB_MEMORY_BARRIER (); + + dr->tail_index = dq->tail_index; + + vlib_increment_combined_counter (vnet_main. + interface_main.combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, + 0 /* cpu_index */ , + xd->vlib_sw_if_index, n_packets, + dq->rx.n_bytes); + + return n_packets; +} + +static void +ixge_interrupt (ixge_main_t * xm, ixge_device_t * xd, u32 i) +{ + vlib_main_t *vm = xm->vlib_main; + ixge_regs_t *r = xd->regs; + + if (i != 20) + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d, %s",.format_args = "i1t1",.n_enum_strings = + 16,.enum_strings = + { + "flow director", + "rx miss", + "pci exception", + "mailbox", + "link status change", + "linksec key exchange", + "manageability event", + "reserved23", + "sdp0", + "sdp1", + "sdp2", + "sdp3", + "ecc", "descriptor handler error", "tcp timer", "other",},}; + struct + { + u8 instance; + u8 index; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->index = i - 16; + } + else + { + u32 v = r->xge_mac.link_status; + uword is_up = (v & (1 << 30)) != 0; + + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d, link status change 0x%x",.format_args = "i4i4",}; + struct + { + u32 instance, link_status; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->link_status = v; + xd->link_status_at_last_link_change = v; + + vlib_process_signal_event (vm, ixge_process_node.index, + EVENT_SET_FLAGS, + ((is_up << 31) | xd->vlib_hw_if_index)); + } +} + +always_inline u32 +clean_block (u32 * b, u32 * t, u32 n_left) +{ + u32 *t0 = t; + + while (n_left >= 4) + { + u32 bi0, bi1, bi2, bi3; + + t[0] = bi0 = b[0]; + b[0] = 0; + t += bi0 != 0; + + t[0] = bi1 = b[1]; + b[1] = 0; + t += bi1 != 0; + + t[0] = bi2 = b[2]; + b[2] = 0; + t += bi2 != 0; + + t[0] = bi3 = b[3]; + b[3] = 0; + t += bi3 != 0; + + b += 4; + n_left -= 4; + } + + while (n_left > 0) + { + u32 bi0; + + t[0] = bi0 = b[0]; + b[0] = 0; + t += bi0 != 0; + b += 1; + n_left -= 1; + } + + return t - t0; +} + +static void +ixge_tx_queue (ixge_main_t * xm, ixge_device_t * xd, u32 queue_index) +{ + vlib_main_t *vm = xm->vlib_main; + ixge_dma_queue_t *dq = + vec_elt_at_index (xd->dma_queues[VLIB_TX], queue_index); + u32 n_clean, *b, *t, *t0; + i32 n_hw_owned_descriptors; + i32 first_to_clean, last_to_clean; + u64 hwbp_race = 0; + + /* Handle case where head write back pointer update + * arrives after the interrupt during high PCI bus loads. + */ + while ((dq->head_index == dq->tx.head_index_write_back[0]) && + dq->tx.n_buffers_on_ring && (dq->head_index != dq->tail_index)) + { + hwbp_race++; + if (IXGE_HWBP_RACE_ELOG && (hwbp_race == 1)) + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d tx head index race: head %4d, tail %4d, buffs %4d",.format_args + = "i4i4i4i4",}; + struct + { + u32 instance, head_index, tail_index, n_buffers_on_ring; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->head_index = dq->head_index; + ed->tail_index = dq->tail_index; + ed->n_buffers_on_ring = dq->tx.n_buffers_on_ring; + } + } + + dq->head_index = dq->tx.head_index_write_back[0]; + n_hw_owned_descriptors = ixge_ring_sub (dq, dq->head_index, dq->tail_index); + ASSERT (dq->tx.n_buffers_on_ring >= n_hw_owned_descriptors); + n_clean = dq->tx.n_buffers_on_ring - n_hw_owned_descriptors; + + if (IXGE_HWBP_RACE_ELOG && hwbp_race) + { + ELOG_TYPE_DECLARE (e) = + { + .function = (char *) __FUNCTION__,.format = + "ixge %d tx head index race: head %4d, hw_owned %4d, n_clean %4d, retries %d",.format_args + = "i4i4i4i4i4",}; + struct + { + u32 instance, head_index, n_hw_owned_descriptors, n_clean, retries; + } *ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->instance = xd->device_index; + ed->head_index = dq->head_index; + ed->n_hw_owned_descriptors = n_hw_owned_descriptors; + ed->n_clean = n_clean; + ed->retries = hwbp_race; + } + + /* + * This function used to wait until hardware owned zero descriptors. + * At high PPS rates, that doesn't happen until the TX ring is + * completely full of descriptors which need to be cleaned up. + * That, in turn, causes TX ring-full drops and/or long RX service + * interruptions. + */ + if (n_clean == 0) + return; + + /* Clean the n_clean descriptors prior to the reported hardware head */ + last_to_clean = dq->head_index - 1; + last_to_clean = (last_to_clean < 0) ? last_to_clean + dq->n_descriptors : + last_to_clean; + + first_to_clean = (last_to_clean) - (n_clean - 1); + first_to_clean = (first_to_clean < 0) ? first_to_clean + dq->n_descriptors : + first_to_clean; + + vec_resize (xm->tx_buffers_pending_free, dq->n_descriptors - 1); + t0 = t = xm->tx_buffers_pending_free; + b = dq->descriptor_buffer_indices + first_to_clean; + + /* Wrap case: clean from first to end, then start to last */ + if (first_to_clean > last_to_clean) + { + t += clean_block (b, t, (dq->n_descriptors - 1) - first_to_clean); + first_to_clean = 0; + b = dq->descriptor_buffer_indices; + } + + /* Typical case: clean from first to last */ + if (first_to_clean <= last_to_clean) + t += clean_block (b, t, (last_to_clean - first_to_clean) + 1); + + if (t > t0) + { + u32 n = t - t0; + vlib_buffer_free_no_next (vm, t0, n); + ASSERT (dq->tx.n_buffers_on_ring >= n); + dq->tx.n_buffers_on_ring -= n; + _vec_len (xm->tx_buffers_pending_free) = 0; + } +} + +/* RX queue interrupts 0 thru 7; TX 8 thru 15. */ +always_inline uword +ixge_interrupt_is_rx_queue (uword i) +{ + return i < 8; +} + +always_inline uword +ixge_interrupt_is_tx_queue (uword i) +{ + return i >= 8 && i < 16; +} + +always_inline uword +ixge_tx_queue_to_interrupt (uword i) +{ + return 8 + i; +} + +always_inline uword +ixge_rx_queue_to_interrupt (uword i) +{ + return 0 + i; +} + +always_inline uword +ixge_interrupt_rx_queue (uword i) +{ + ASSERT (ixge_interrupt_is_rx_queue (i)); + return i - 0; +} + +always_inline uword +ixge_interrupt_tx_queue (uword i) +{ + ASSERT (ixge_interrupt_is_tx_queue (i)); + return i - 8; +} + +static uword +ixge_device_input (ixge_main_t * xm, + ixge_device_t * xd, vlib_node_runtime_t * node) +{ + ixge_regs_t *r = xd->regs; + u32 i, s; + uword n_rx_packets = 0; + + s = r->interrupt.status_write_1_to_set; + if (s) + r->interrupt.status_write_1_to_clear = s; + + /* *INDENT-OFF* */ + foreach_set_bit (i, s, ({ + if (ixge_interrupt_is_rx_queue (i)) + n_rx_packets += ixge_rx_queue (xm, xd, node, ixge_interrupt_rx_queue (i)); + + else if (ixge_interrupt_is_tx_queue (i)) + ixge_tx_queue (xm, xd, ixge_interrupt_tx_queue (i)); + + else + ixge_interrupt (xm, xd, i); + })); + /* *INDENT-ON* */ + + return n_rx_packets; +} + +static uword +ixge_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) +{ + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd; + uword n_rx_packets = 0; + + if (node->state == VLIB_NODE_STATE_INTERRUPT) + { + uword i; + + /* Loop over devices with interrupts. */ + /* *INDENT-OFF* */ + foreach_set_bit (i, node->runtime_data[0], ({ + xd = vec_elt_at_index (xm->devices, i); + n_rx_packets += ixge_device_input (xm, xd, node); + + /* Re-enable interrupts since we're going to stay in interrupt mode. */ + if (! (node->flags & VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE)) + xd->regs->interrupt.enable_write_1_to_set = ~0; + })); + /* *INDENT-ON* */ + + /* Clear mask of devices with pending interrupts. */ + node->runtime_data[0] = 0; + } + else + { + /* Poll all devices for input/interrupts. */ + vec_foreach (xd, xm->devices) + { + n_rx_packets += ixge_device_input (xm, xd, node); + + /* Re-enable interrupts when switching out of polling mode. */ + if (node->flags & + VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE) + xd->regs->interrupt.enable_write_1_to_set = ~0; + } + } + + return n_rx_packets; +} + +static char *ixge_error_strings[] = { +#define _(n,s) s, + foreach_ixge_error +#undef _ +}; + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (ixge_input_node, static) = { + .function = ixge_input, + .type = VLIB_NODE_TYPE_INPUT, + .name = "ixge-input", + + /* Will be enabled if/when hardware is detected. */ + .state = VLIB_NODE_STATE_DISABLED, + + .format_buffer = format_ethernet_header_with_length, + .format_trace = format_ixge_rx_dma_trace, + + .n_errors = IXGE_N_ERROR, + .error_strings = ixge_error_strings, + + .n_next_nodes = IXGE_RX_N_NEXT, + .next_nodes = { + [IXGE_RX_NEXT_DROP] = "error-drop", + [IXGE_RX_NEXT_ETHERNET_INPUT] = "ethernet-input", + [IXGE_RX_NEXT_IP4_INPUT] = "ip4-input", + [IXGE_RX_NEXT_IP6_INPUT] = "ip6-input", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH_CLONE (ixge_input) +CLIB_MULTIARCH_SELECT_FN (ixge_input) +/* *INDENT-ON* */ + +static u8 * +format_ixge_device_name (u8 * s, va_list * args) +{ + u32 i = va_arg (*args, u32); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, i); + return format (s, "TenGigabitEthernet%U", + format_vlib_pci_handle, &xd->pci_device.bus_address); +} + +#define IXGE_COUNTER_IS_64_BIT (1 << 0) +#define IXGE_COUNTER_NOT_CLEAR_ON_READ (1 << 1) + +static u8 ixge_counter_flags[] = { +#define _(a,f) 0, +#define _64(a,f) IXGE_COUNTER_IS_64_BIT, + foreach_ixge_counter +#undef _ +#undef _64 +}; + +static void +ixge_update_counters (ixge_device_t * xd) +{ + /* Byte offset for counter registers. */ + static u32 reg_offsets[] = { +#define _(a,f) (a) / sizeof (u32), +#define _64(a,f) _(a,f) + foreach_ixge_counter +#undef _ +#undef _64 + }; + volatile u32 *r = (volatile u32 *) xd->regs; + int i; + + for (i = 0; i < ARRAY_LEN (xd->counters); i++) + { + u32 o = reg_offsets[i]; + xd->counters[i] += r[o]; + if (ixge_counter_flags[i] & IXGE_COUNTER_NOT_CLEAR_ON_READ) + r[o] = 0; + if (ixge_counter_flags[i] & IXGE_COUNTER_IS_64_BIT) + xd->counters[i] += (u64) r[o + 1] << (u64) 32; + } +} + +static u8 * +format_ixge_device_id (u8 * s, va_list * args) +{ + u32 device_id = va_arg (*args, u32); + char *t = 0; + switch (device_id) + { +#define _(f,n) case n: t = #f; break; + foreach_ixge_pci_device_id; +#undef _ + default: + t = 0; + break; + } + if (t == 0) + s = format (s, "unknown 0x%x", device_id); + else + s = format (s, "%s", t); + return s; +} + +static u8 * +format_ixge_link_status (u8 * s, va_list * args) +{ + ixge_device_t *xd = va_arg (*args, ixge_device_t *); + u32 v = xd->link_status_at_last_link_change; + + s = format (s, "%s", (v & (1 << 30)) ? "up" : "down"); + + { + char *modes[] = { + "1g", "10g parallel", "10g serial", "autoneg", + }; + char *speeds[] = { + "unknown", "100m", "1g", "10g", + }; + s = format (s, ", mode %s, speed %s", + modes[(v >> 26) & 3], speeds[(v >> 28) & 3]); + } + + return s; +} + +static u8 * +format_ixge_device (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + CLIB_UNUSED (int verbose) = va_arg (*args, int); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, dev_instance); + ixge_phy_t *phy = xd->phys + xd->phy_index; + uword indent = format_get_indent (s); + + ixge_update_counters (xd); + xd->link_status_at_last_link_change = xd->regs->xge_mac.link_status; + + s = format (s, "Intel 8259X: id %U\n%Ulink %U", + format_ixge_device_id, xd->device_id, + format_white_space, indent + 2, format_ixge_link_status, xd); + + { + + s = format (s, "\n%UPCIe %U", format_white_space, indent + 2, + format_vlib_pci_link_speed, &xd->pci_device); + } + + s = format (s, "\n%U", format_white_space, indent + 2); + if (phy->mdio_address != ~0) + s = format (s, "PHY address %d, id 0x%x", phy->mdio_address, phy->id); + else if (xd->sfp_eeprom.id == SFP_ID_sfp) + s = format (s, "SFP %U", format_sfp_eeprom, &xd->sfp_eeprom); + else + s = format (s, "PHY not found"); + + /* FIXME */ + { + ixge_dma_queue_t *dq = vec_elt_at_index (xd->dma_queues[VLIB_RX], 0); + ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, 0); + u32 hw_head_index = dr->head_index; + u32 sw_head_index = dq->head_index; + u32 nitems; + + nitems = ixge_ring_sub (dq, hw_head_index, sw_head_index); + s = format (s, "\n%U%d unprocessed, %d total buffers on rx queue 0 ring", + format_white_space, indent + 2, nitems, dq->n_descriptors); + + s = format (s, "\n%U%d buffers in driver rx cache", + format_white_space, indent + 2, + vec_len (xm->rx_buffers_to_add)); + + s = format (s, "\n%U%d buffers on tx queue 0 ring", + format_white_space, indent + 2, + xd->dma_queues[VLIB_TX][0].tx.n_buffers_on_ring); + } + { + u32 i; + u64 v; + static char *names[] = { +#define _(a,f) #f, +#define _64(a,f) _(a,f) + foreach_ixge_counter +#undef _ +#undef _64 + }; + + for (i = 0; i < ARRAY_LEN (names); i++) + { + v = xd->counters[i] - xd->counters_last_clear[i]; + if (v != 0) + s = format (s, "\n%U%-40U%16Ld", + format_white_space, indent + 2, + format_c_identifier, names[i], v); + } + } + + return s; +} + +static void +ixge_clear_hw_interface_counters (u32 instance) +{ + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd = vec_elt_at_index (xm->devices, instance); + ixge_update_counters (xd); + memcpy (xd->counters_last_clear, xd->counters, sizeof (xd->counters)); +} + +/* + * Dynamically redirect all pkts from a specific interface + * to the specified node + */ +static void +ixge_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index, + u32 node_index) +{ + ixge_main_t *xm = &ixge_main; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + ixge_device_t *xd = vec_elt_at_index (xm->devices, hw->dev_instance); + + /* Shut off redirection */ + if (node_index == ~0) + { + xd->per_interface_next_index = node_index; + return; + } + + xd->per_interface_next_index = + vlib_node_add_next (xm->vlib_main, ixge_input_node.index, node_index); +} + + +/* *INDENT-OFF* */ +VNET_DEVICE_CLASS (ixge_device_class) = { + .name = "ixge", + .tx_function = ixge_interface_tx, + .format_device_name = format_ixge_device_name, + .format_device = format_ixge_device, + .format_tx_trace = format_ixge_tx_dma_trace, + .clear_counters = ixge_clear_hw_interface_counters, + .admin_up_down_function = ixge_interface_admin_up_down, + .rx_redirect_to_node = ixge_set_interface_next_node, + .flatten_output_chains = 1, +}; +/* *INDENT-ON* */ + +#define IXGE_N_BYTES_IN_RX_BUFFER (2048) // DAW-HACK: Set Rx buffer size so all packets < ETH_MTU_SIZE fit in the buffer (i.e. sop & eop for all descriptors). + +static clib_error_t * +ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + ixge_dma_queue_t *dq; + clib_error_t *error = 0; + + vec_validate (xd->dma_queues[rt], queue_index); + dq = vec_elt_at_index (xd->dma_queues[rt], queue_index); + + if (!xm->n_descriptors_per_cache_line) + xm->n_descriptors_per_cache_line = + CLIB_CACHE_LINE_BYTES / sizeof (dq->descriptors[0]); + + if (!xm->n_bytes_in_rx_buffer) + xm->n_bytes_in_rx_buffer = IXGE_N_BYTES_IN_RX_BUFFER; + xm->n_bytes_in_rx_buffer = round_pow2 (xm->n_bytes_in_rx_buffer, 1024); + if (!xm->vlib_buffer_free_list_index) + { + xm->vlib_buffer_free_list_index = + vlib_buffer_get_or_create_free_list (vm, xm->n_bytes_in_rx_buffer, + "ixge rx"); + ASSERT (xm->vlib_buffer_free_list_index != 0); + } + + if (!xm->n_descriptors[rt]) + xm->n_descriptors[rt] = 4 * VLIB_FRAME_SIZE; + + dq->queue_index = queue_index; + dq->n_descriptors = + round_pow2 (xm->n_descriptors[rt], xm->n_descriptors_per_cache_line); + dq->head_index = dq->tail_index = 0; + + dq->descriptors = vlib_physmem_alloc_aligned (vm, &error, + dq->n_descriptors * + sizeof (dq->descriptors[0]), + 128 /* per chip spec */ ); + if (error) + return error; + + memset (dq->descriptors, 0, + dq->n_descriptors * sizeof (dq->descriptors[0])); + vec_resize (dq->descriptor_buffer_indices, dq->n_descriptors); + + if (rt == VLIB_RX) + { + u32 n_alloc, i; + + n_alloc = vlib_buffer_alloc_from_free_list + (vm, dq->descriptor_buffer_indices, + vec_len (dq->descriptor_buffer_indices), + xm->vlib_buffer_free_list_index); + ASSERT (n_alloc == vec_len (dq->descriptor_buffer_indices)); + for (i = 0; i < n_alloc; i++) + { + vlib_buffer_t *b = + vlib_get_buffer (vm, dq->descriptor_buffer_indices[i]); + dq->descriptors[i].rx_to_hw.tail_address = + vlib_physmem_virtual_to_physical (vm, b->data); + } + } + else + { + u32 i; + + dq->tx.head_index_write_back = + vlib_physmem_alloc (vm, &error, CLIB_CACHE_LINE_BYTES); + + for (i = 0; i < dq->n_descriptors; i++) + dq->descriptors[i].tx = xm->tx_descriptor_template; + + vec_validate (xm->tx_buffers_pending_free, dq->n_descriptors - 1); + } + + { + ixge_dma_regs_t *dr = get_dma_regs (xd, rt, queue_index); + u64 a; + + a = vlib_physmem_virtual_to_physical (vm, dq->descriptors); + dr->descriptor_address[0] = a & 0xFFFFFFFF; + dr->descriptor_address[1] = a >> (u64) 32; + dr->n_descriptor_bytes = dq->n_descriptors * sizeof (dq->descriptors[0]); + dq->head_index = dq->tail_index = 0; + + if (rt == VLIB_RX) + { + ASSERT ((xm->n_bytes_in_rx_buffer / 1024) < 32); + dr->rx_split_control = + ( /* buffer size */ ((xm->n_bytes_in_rx_buffer / 1024) << 0) + | ( /* lo free descriptor threshold (units of 64 descriptors) */ + (1 << 22)) | ( /* descriptor type: advanced one buffer */ + (1 << 25)) | ( /* drop if no descriptors available */ + (1 << 28))); + + /* Give hardware all but last 16 cache lines' worth of descriptors. */ + dq->tail_index = dq->n_descriptors - + 16 * xm->n_descriptors_per_cache_line; + } + else + { + /* Make sure its initialized before hardware can get to it. */ + dq->tx.head_index_write_back[0] = dq->head_index; + + a = + vlib_physmem_virtual_to_physical (vm, dq->tx.head_index_write_back); + dr->tx.head_index_write_back_address[0] = /* enable bit */ 1 | a; + dr->tx.head_index_write_back_address[1] = (u64) a >> (u64) 32; + } + + /* DMA on 82599 does not work with [13] rx data write relaxed ordering + and [12] undocumented set. */ + if (rt == VLIB_RX) + dr->dca_control &= ~((1 << 13) | (1 << 12)); + + CLIB_MEMORY_BARRIER (); + + if (rt == VLIB_TX) + { + xd->regs->tx_dma_control |= (1 << 0); + dr->control |= ((32 << 0) /* prefetch threshold */ + | (64 << 8) /* host threshold */ + | (0 << 16) /* writeback threshold */ ); + } + + /* Enable this queue and wait for hardware to initialize + before adding to tail. */ + if (rt == VLIB_TX) + { + dr->control |= 1 << 25; + while (!(dr->control & (1 << 25))) + ; + } + + /* Set head/tail indices and enable DMA. */ + dr->head_index = dq->head_index; + dr->tail_index = dq->tail_index; + } + + return error; +} + +static u32 +ixge_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags) +{ + ixge_device_t *xd; + ixge_regs_t *r; + u32 old; + ixge_main_t *xm = &ixge_main; + + xd = vec_elt_at_index (xm->devices, hw->dev_instance); + r = xd->regs; + + old = r->filter_control; + + if (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL) + r->filter_control = old | (1 << 9) /* unicast promiscuous */ ; + else + r->filter_control = old & ~(1 << 9); + + return old; +} + +static void +ixge_device_init (ixge_main_t * xm) +{ + vnet_main_t *vnm = vnet_get_main (); + ixge_device_t *xd; + + /* Reset chip(s). */ + vec_foreach (xd, xm->devices) + { + ixge_regs_t *r = xd->regs; + const u32 reset_bit = (1 << 26) | (1 << 3); + + r->control |= reset_bit; + + /* No need to suspend. Timed to take ~1e-6 secs */ + while (r->control & reset_bit) + ; + + /* Software loaded. */ + r->extended_control |= (1 << 28); + + ixge_phy_init (xd); + + /* Register ethernet interface. */ + { + u8 addr8[6]; + u32 i, addr32[2]; + clib_error_t *error; + + addr32[0] = r->rx_ethernet_address0[0][0]; + addr32[1] = r->rx_ethernet_address0[0][1]; + for (i = 0; i < 6; i++) + addr8[i] = addr32[i / 4] >> ((i % 4) * 8); + + error = ethernet_register_interface + (vnm, ixge_device_class.index, xd->device_index, + /* ethernet address */ addr8, + &xd->vlib_hw_if_index, ixge_flag_change); + if (error) + clib_error_report (error); + } + + { + vnet_sw_interface_t *sw = + vnet_get_hw_sw_interface (vnm, xd->vlib_hw_if_index); + xd->vlib_sw_if_index = sw->sw_if_index; + } + + ixge_dma_init (xd, VLIB_RX, /* queue_index */ 0); + + xm->n_descriptors[VLIB_TX] = 20 * VLIB_FRAME_SIZE; + + ixge_dma_init (xd, VLIB_TX, /* queue_index */ 0); + + /* RX/TX queue 0 gets mapped to interrupt bits 0 & 8. */ + r->interrupt.queue_mapping[0] = (( /* valid bit */ (1 << 7) | + ixge_rx_queue_to_interrupt (0)) << 0); + + r->interrupt.queue_mapping[0] |= (( /* valid bit */ (1 << 7) | + ixge_tx_queue_to_interrupt (0)) << 8); + + /* No use in getting too many interrupts. + Limit them to one every 3/4 ring size at line rate + min sized packets. + No need for this since kernel/vlib main loop provides adequate interrupt + limiting scheme. */ + if (0) + { + f64 line_rate_max_pps = + 10e9 / (8 * (64 + /* interframe padding */ 20)); + ixge_throttle_queue_interrupt (r, 0, + .75 * xm->n_descriptors[VLIB_RX] / + line_rate_max_pps); + } + + /* Accept all multicast and broadcast packets. Should really add them + to the dst_ethernet_address register array. */ + r->filter_control |= (1 << 10) | (1 << 8); + + /* Enable frames up to size in mac frame size register. */ + r->xge_mac.control |= 1 << 2; + r->xge_mac.rx_max_frame_size = (9216 + 14) << 16; + + /* Enable all interrupts. */ + if (!IXGE_ALWAYS_POLL) + r->interrupt.enable_write_1_to_set = ~0; + } +} + +static uword +ixge_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + vnet_main_t *vnm = vnet_get_main (); + ixge_main_t *xm = &ixge_main; + ixge_device_t *xd; + uword event_type, *event_data = 0; + f64 timeout, link_debounce_deadline; + + ixge_device_init (xm); + + /* Clear all counters. */ + vec_foreach (xd, xm->devices) + { + ixge_update_counters (xd); + memset (xd->counters, 0, sizeof (xd->counters)); + } + + timeout = 30.0; + link_debounce_deadline = 1e70; + + while (1) + { + /* 36 bit stat counters could overflow in ~50 secs. + We poll every 30 secs to be conservative. */ + vlib_process_wait_for_event_or_clock (vm, timeout); + + event_type = vlib_process_get_events (vm, &event_data); + + switch (event_type) + { + case EVENT_SET_FLAGS: + /* 1 ms */ + link_debounce_deadline = vlib_time_now (vm) + 1e-3; + timeout = 1e-3; + break; + + case ~0: + /* No events found: timer expired. */ + if (vlib_time_now (vm) > link_debounce_deadline) + { + vec_foreach (xd, xm->devices) + { + ixge_regs_t *r = xd->regs; + u32 v = r->xge_mac.link_status; + uword is_up = (v & (1 << 30)) != 0; + + vnet_hw_interface_set_flags + (vnm, xd->vlib_hw_if_index, + is_up ? VNET_HW_INTERFACE_FLAG_LINK_UP : 0); + } + link_debounce_deadline = 1e70; + timeout = 30.0; + } + break; + + default: + ASSERT (0); + } + + if (event_data) + _vec_len (event_data) = 0; + + /* Query stats every 30 secs. */ + { + f64 now = vlib_time_now (vm); + if (now - xm->time_last_stats_update > 30) + { + xm->time_last_stats_update = now; + vec_foreach (xd, xm->devices) ixge_update_counters (xd); + } + } + } + + return 0; +} + +static vlib_node_registration_t ixge_process_node = { + .function = ixge_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "ixge-process", +}; + +clib_error_t * +ixge_init (vlib_main_t * vm) +{ + ixge_main_t *xm = &ixge_main; + clib_error_t *error; + + xm->vlib_main = vm; + memset (&xm->tx_descriptor_template, 0, + sizeof (xm->tx_descriptor_template)); + memset (&xm->tx_descriptor_template_mask, 0, + sizeof (xm->tx_descriptor_template_mask)); + xm->tx_descriptor_template.status0 = + (IXGE_TX_DESCRIPTOR_STATUS0_ADVANCED | + IXGE_TX_DESCRIPTOR_STATUS0_IS_ADVANCED | + IXGE_TX_DESCRIPTOR_STATUS0_INSERT_FCS); + xm->tx_descriptor_template_mask.status0 = 0xffff; + xm->tx_descriptor_template_mask.status1 = 0x00003fff; + + xm->tx_descriptor_template_mask.status0 &= + ~(IXGE_TX_DESCRIPTOR_STATUS0_IS_END_OF_PACKET + | IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS); + xm->tx_descriptor_template_mask.status1 &= + ~(IXGE_TX_DESCRIPTOR_STATUS1_DONE); + + error = vlib_call_init_function (vm, pci_bus_init); + + return error; +} + +VLIB_INIT_FUNCTION (ixge_init); + + +static void +ixge_pci_intr_handler (vlib_pci_device_t * dev) +{ + ixge_main_t *xm = &ixge_main; + vlib_main_t *vm = xm->vlib_main; + + vlib_node_set_interrupt_pending (vm, ixge_input_node.index); + + /* Let node know which device is interrupting. */ + { + vlib_node_runtime_t *rt = + vlib_node_get_runtime (vm, ixge_input_node.index); + rt->runtime_data[0] |= 1 << dev->private_data; + } +} + +static clib_error_t * +ixge_pci_init (vlib_main_t * vm, vlib_pci_device_t * dev) +{ + ixge_main_t *xm = &ixge_main; + clib_error_t *error; + void *r; + ixge_device_t *xd; + + /* Device found: make sure we have dma memory. */ + if (unix_physmem_is_fake (vm)) + return clib_error_return (0, "no physical memory available"); + + error = vlib_pci_map_resource (dev, 0, &r); + if (error) + return error; + + vec_add2 (xm->devices, xd, 1); + + if (vec_len (xm->devices) == 1) + { + ixge_input_node.function = ixge_input_multiarch_select (); + } + + xd->pci_device = dev[0]; + xd->device_id = xd->pci_device.config0.header.device_id; + xd->regs = r; + xd->device_index = xd - xm->devices; + xd->pci_function = dev->bus_address.function; + xd->per_interface_next_index = ~0; + + + /* Chip found so enable node. */ + { + vlib_node_set_state (vm, ixge_input_node.index, + (IXGE_ALWAYS_POLL + ? VLIB_NODE_STATE_POLLING + : VLIB_NODE_STATE_INTERRUPT)); + + dev->private_data = xd->device_index; + } + + if (vec_len (xm->devices) == 1) + { + vlib_register_node (vm, &ixge_process_node); + xm->process_node_index = ixge_process_node.index; + } + + error = vlib_pci_bus_master_enable (dev); + + if (error) + return error; + + return vlib_pci_intr_enable (dev); +} + +/* *INDENT-OFF* */ +PCI_REGISTER_DEVICE (ixge_pci_device_registration,static) = { + .init_function = ixge_pci_init, + .interrupt_handler = ixge_pci_intr_handler, + .supported_devices = { +#define _(t,i) { .vendor_id = PCI_VENDOR_ID_INTEL, .device_id = i, }, + foreach_ixge_pci_device_id +#undef _ + { 0 }, + }, +}; +/* *INDENT-ON* */ + +void +ixge_set_next_node (ixge_rx_next_t next, char *name) +{ + vlib_node_registration_t *r = &ixge_input_node; + + switch (next) + { + case IXGE_RX_NEXT_IP4_INPUT: + case IXGE_RX_NEXT_IP6_INPUT: + case IXGE_RX_NEXT_ETHERNET_INPUT: + r->next_nodes[next] = name; + break; + + default: + clib_warning ("%s: illegal next %d\n", __FUNCTION__, next); + break; + } +} +#endif + +/* *INDENT-OFF* */ +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, + .default_disabled = 1, +}; + +/* *INDENT-ON* */ +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/ixge/ixge.h b/src/plugins/ixge/ixge.h new file mode 100644 index 00000000..779603b3 --- /dev/null +++ b/src/plugins/ixge/ixge.h @@ -0,0 +1,1293 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_ixge_h +#define included_ixge_h + +#include +#include +#include +#include +#include +#include + +typedef volatile struct +{ + /* [31:7] 128 byte aligned. */ + u32 descriptor_address[2]; + u32 n_descriptor_bytes; + + /* [5] rx/tx descriptor dca enable + [6] rx packet head dca enable + [7] rx packet tail dca enable + [9] rx/tx descriptor relaxed order + [11] rx/tx descriptor write back relaxed order + [13] rx/tx data write/read relaxed order + [15] rx head data write relaxed order + [31:24] apic id for cpu's cache. */ + u32 dca_control; + + u32 head_index; + + /* [4:0] tail buffer size (in 1k byte units) + [13:8] head buffer size (in 64 byte units) + [24:22] lo free descriptors threshold (units of 64 descriptors) + [27:25] descriptor type 0 = legacy, 1 = advanced one buffer (e.g. tail), + 2 = advanced header splitting (head + tail), 5 = advanced header + splitting (head only). + [28] drop if no descriptors available. */ + u32 rx_split_control; + + u32 tail_index; + CLIB_PAD_FROM_TO (0x1c, 0x28); + + /* [7:0] rx/tx prefetch threshold + [15:8] rx/tx host threshold + [24:16] rx/tx write back threshold + [25] rx/tx enable + [26] tx descriptor writeback flush + [30] rx strip vlan enable */ + u32 control; + + u32 rx_coallesce_control; + + union + { + struct + { + /* packets bytes lo hi */ + u32 stats[3]; + + u32 unused; + } rx; + + struct + { + u32 unused[2]; + + /* [0] enables head write back. */ + u32 head_index_write_back_address[2]; + } tx; + }; +} ixge_dma_regs_t; + +/* Only advanced descriptors are supported. */ +typedef struct +{ + u64 tail_address; + u64 head_address; +} ixge_rx_to_hw_descriptor_t; + +typedef struct +{ + u32 status[3]; + u16 n_packet_bytes_this_descriptor; + u16 vlan_tag; +} ixge_rx_from_hw_descriptor_t; + +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_LAYER2 (1 << (4 + 11)) +/* Valid if not layer2. */ +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4 (1 << (4 + 0)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4_EXT (1 << (4 + 1)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6 (1 << (4 + 2)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6_EXT (1 << (4 + 3)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_TCP (1 << (4 + 4)) +#define IXGE_RX_DESCRIPTOR_STATUS0_IS_UDP (1 << (4 + 5)) +#define IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET(s) (((s) >> 21) & 0x3ff) + +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE (1 << (0 + 0)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET (1 << (0 + 1)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_VLAN (1 << (0 + 3)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED (1 << (0 + 4)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED (1 << (0 + 5)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED (1 << (0 + 6)) +#define IXGE_RX_DESCRIPTOR_STATUS2_NOT_UNICAST (1 << (0 + 7)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IS_DOUBLE_VLAN (1 << (0 + 9)) +#define IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR (1 << (0 + 10)) +#define IXGE_RX_DESCRIPTOR_STATUS2_ETHERNET_ERROR (1 << (20 + 9)) +#define IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR (1 << (20 + 10)) +#define IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR (1 << (20 + 11)) + +/* For layer2 packets stats0 bottom 3 bits give ether type index from filter. */ +#define IXGE_RX_DESCRIPTOR_STATUS0_LAYER2_ETHERNET_TYPE(s) ((s) & 7) + +typedef struct +{ + u64 buffer_address; + u16 n_bytes_this_buffer; + u16 status0; + u32 status1; +#define IXGE_TX_DESCRIPTOR_STATUS0_ADVANCED (3 << 4) +#define IXGE_TX_DESCRIPTOR_STATUS0_IS_ADVANCED (1 << (8 + 5)) +#define IXGE_TX_DESCRIPTOR_STATUS0_LOG2_REPORT_STATUS (8 + 3) +#define IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS (1 << IXGE_TX_DESCRIPTOR_STATUS0_LOG2_REPORT_STATUS) +#define IXGE_TX_DESCRIPTOR_STATUS0_INSERT_FCS (1 << (8 + 1)) +#define IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET (8 + 0) +#define IXGE_TX_DESCRIPTOR_STATUS0_IS_END_OF_PACKET (1 << IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET) +#define IXGE_TX_DESCRIPTOR_STATUS1_DONE (1 << 0) +#define IXGE_TX_DESCRIPTOR_STATUS1_CONTEXT(i) (/* valid */ (1 << 7) | ((i) << 4)) +#define IXGE_TX_DESCRIPTOR_STATUS1_IPSEC_OFFLOAD (1 << (8 + 2)) +#define IXGE_TX_DESCRIPTOR_STATUS1_INSERT_TCP_UDP_CHECKSUM (1 << (8 + 1)) +#define IXGE_TX_DESCRIPTOR_STATUS1_INSERT_IP4_CHECKSUM (1 << (8 + 0)) +#define IXGE_TX_DESCRIPTOR_STATUS0_N_BYTES_THIS_BUFFER(l) ((l) << 0) +#define IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET(l) ((l) << 14) +} ixge_tx_descriptor_t; + +typedef struct +{ + struct + { + u8 checksum_start_offset; + u8 checksum_insert_offset; + u16 checksum_end_offset; + } ip, tcp; + u32 status0; + + u8 status1; + + /* Byte offset after UDP/TCP header. */ + u8 payload_offset; + + u16 max_tcp_segment_size; +} __attribute__ ((packed)) ixge_tx_context_descriptor_t; + +typedef union +{ + ixge_rx_to_hw_descriptor_t rx_to_hw; + ixge_rx_from_hw_descriptor_t rx_from_hw; + ixge_tx_descriptor_t tx; + u32x4 as_u32x4; +} ixge_descriptor_t; + +typedef volatile struct +{ + /* [2] pcie master disable + [3] mac reset + [26] global device reset */ + u32 control; + u32 control_alias; + /* [3:2] device id (0 or 1 for dual port chips) + [7] link is up + [17:10] num vfs + [18] io active + [19] pcie master enable status */ + u32 status_read_only; + CLIB_PAD_FROM_TO (0xc, 0x18); + /* [14] pf reset done + [17] relaxed ordering disable + [26] extended vlan enable + [28] driver loaded */ + u32 extended_control; + CLIB_PAD_FROM_TO (0x1c, 0x20); + + /* software definable pins. + sdp_data [7:0] + sdp_is_output [15:8] + sdp_is_native [23:16] + sdp_function [31:24]. + */ + u32 sdp_control; + CLIB_PAD_FROM_TO (0x24, 0x28); + + /* [0] i2c clock in + [1] i2c clock out + [2] i2c data in + [3] i2c data out */ + u32 i2c_control; + CLIB_PAD_FROM_TO (0x2c, 0x4c); + u32 tcp_timer; + + CLIB_PAD_FROM_TO (0x50, 0x200); + + u32 led_control; + + CLIB_PAD_FROM_TO (0x204, 0x600); + u32 core_spare; + CLIB_PAD_FROM_TO (0x604, 0x700); + + struct + { + u32 vflr_events_clear[4]; + u32 mailbox_interrupt_status[4]; + u32 mailbox_interrupt_enable[4]; + CLIB_PAD_FROM_TO (0x730, 0x800); + } pf_foo; + + struct + { + u32 status_write_1_to_clear; + CLIB_PAD_FROM_TO (0x804, 0x808); + u32 status_write_1_to_set; + CLIB_PAD_FROM_TO (0x80c, 0x810); + u32 status_auto_clear_enable; + CLIB_PAD_FROM_TO (0x814, 0x820); + + /* [11:3] minimum inter-interrupt interval + (2e-6 units; 20e-6 units for fast ethernet). + [15] low-latency interrupt moderation enable + [20:16] low-latency interrupt credit + [27:21] interval counter + [31] write disable for credit and counter (write only). */ + u32 throttle0[24]; + + u32 enable_write_1_to_set; + CLIB_PAD_FROM_TO (0x884, 0x888); + u32 enable_write_1_to_clear; + CLIB_PAD_FROM_TO (0x88c, 0x890); + u32 enable_auto_clear; + u32 msi_to_eitr_select; + /* [3:0] spd 0-3 interrupt detection enable + [4] msi-x enable + [5] other clear disable (makes other bits in status not clear on read) + etc. */ + u32 control; + CLIB_PAD_FROM_TO (0x89c, 0x900); + + /* Defines interrupt mapping for 128 rx + 128 tx queues. + 64 x 4 8 bit entries. + For register [i]: + [5:0] bit in interrupt status for rx queue 2*i + 0 + [7] valid bit + [13:8] bit for tx queue 2*i + 0 + [15] valid bit + similar for rx 2*i + 1 and tx 2*i + 1. */ + u32 queue_mapping[64]; + + /* tcp timer [7:0] and other interrupts [15:8] */ + u32 misc_mapping; + CLIB_PAD_FROM_TO (0xa04, 0xa90); + + /* 64 interrupts determined by mappings. */ + u32 status1_write_1_to_clear[4]; + u32 enable1_write_1_to_set[4]; + u32 enable1_write_1_to_clear[4]; + CLIB_PAD_FROM_TO (0xac0, 0xad0); + u32 status1_enable_auto_clear[4]; + CLIB_PAD_FROM_TO (0xae0, 0x1000); + } interrupt; + + ixge_dma_regs_t rx_dma0[64]; + + CLIB_PAD_FROM_TO (0x2000, 0x2140); + u32 dcb_rx_packet_plane_t4_config[8]; + u32 dcb_rx_packet_plane_t4_status[8]; + CLIB_PAD_FROM_TO (0x2180, 0x2300); + + /* reg i defines mapping for 4 rx queues starting at 4*i + 0. */ + u32 rx_queue_stats_mapping[32]; + u32 rx_queue_stats_control; + + CLIB_PAD_FROM_TO (0x2384, 0x2410); + u32 fc_user_descriptor_ptr[2]; + u32 fc_buffer_control; + CLIB_PAD_FROM_TO (0x241c, 0x2420); + u32 fc_rx_dma; + CLIB_PAD_FROM_TO (0x2424, 0x2430); + u32 dcb_packet_plane_control; + CLIB_PAD_FROM_TO (0x2434, 0x2f00); + + u32 rx_dma_control; + u32 pf_queue_drop_enable; + CLIB_PAD_FROM_TO (0x2f08, 0x2f20); + u32 rx_dma_descriptor_cache_config; + CLIB_PAD_FROM_TO (0x2f24, 0x3000); + + /* 1 bit. */ + u32 rx_enable; + CLIB_PAD_FROM_TO (0x3004, 0x3008); + /* [15:0] ether type (little endian) + [31:16] opcode (big endian) */ + u32 flow_control_control; + CLIB_PAD_FROM_TO (0x300c, 0x3020); + /* 3 bit traffic class for each of 8 priorities. */ + u32 rx_priority_to_traffic_class; + CLIB_PAD_FROM_TO (0x3024, 0x3028); + u32 rx_coallesce_data_buffer_control; + CLIB_PAD_FROM_TO (0x302c, 0x3190); + u32 rx_packet_buffer_flush_detect; + CLIB_PAD_FROM_TO (0x3194, 0x3200); + u32 flow_control_tx_timers[4]; /* 2 timer values */ + CLIB_PAD_FROM_TO (0x3210, 0x3220); + u32 flow_control_rx_threshold_lo[8]; + CLIB_PAD_FROM_TO (0x3240, 0x3260); + u32 flow_control_rx_threshold_hi[8]; + CLIB_PAD_FROM_TO (0x3280, 0x32a0); + u32 flow_control_refresh_threshold; + CLIB_PAD_FROM_TO (0x32a4, 0x3c00); + /* For each of 8 traffic classes (units of bytes). */ + u32 rx_packet_buffer_size[8]; + CLIB_PAD_FROM_TO (0x3c20, 0x3d00); + u32 flow_control_config; + CLIB_PAD_FROM_TO (0x3d04, 0x4200); + + struct + { + u32 pcs_config; + CLIB_PAD_FROM_TO (0x4204, 0x4208); + u32 link_control; + u32 link_status; + u32 pcs_debug[2]; + u32 auto_negotiation; + u32 link_partner_ability; + u32 auto_negotiation_tx_next_page; + u32 auto_negotiation_link_partner_next_page; + CLIB_PAD_FROM_TO (0x4228, 0x4240); + } gige_mac; + + struct + { + /* [0] tx crc enable + [2] enable frames up to max frame size register [31:16] + [10] pad frames < 64 bytes if specified by user + [15] loopback enable + [16] mdc hi speed + [17] turn off mdc between mdio packets */ + u32 control; + + /* [5] rx symbol error (all bits clear on read) + [6] rx illegal symbol + [7] rx idle error + [8] rx local fault + [9] rx remote fault */ + u32 status; + + u32 pause_and_pace_control; + CLIB_PAD_FROM_TO (0x424c, 0x425c); + u32 phy_command; + u32 phy_data; + CLIB_PAD_FROM_TO (0x4264, 0x4268); + + /* [31:16] max frame size in bytes. */ + u32 rx_max_frame_size; + CLIB_PAD_FROM_TO (0x426c, 0x4288); + + /* [0] + [2] pcs receive link up? (latch lo) + [7] local fault + [1] + [0] pcs 10g base r capable + [1] pcs 10g base x capable + [2] pcs 10g base w capable + [10] rx local fault + [11] tx local fault + [15:14] 2 => device present at this address (else not present) */ + u32 xgxs_status[2]; + + u32 base_x_pcs_status; + + /* [0] pass unrecognized flow control frames + [1] discard pause frames + [2] rx priority flow control enable (only in dcb mode) + [3] rx flow control enable. */ + u32 flow_control; + + /* [3:0] tx lanes change polarity + [7:4] rx lanes change polarity + [11:8] swizzle tx lanes + [15:12] swizzle rx lanes + 4 x 2 bit tx lane swap + 4 x 2 bit rx lane swap. */ + u32 serdes_control; + + u32 fifo_control; + + /* [0] force link up + [1] autoneg ack2 bit to transmit + [6:2] autoneg selector field to transmit + [8:7] 10g pma/pmd type 0 => xaui, 1 kx4, 2 cx4 + [9] 1g pma/pmd type 0 => sfi, 1 => kx/bx + [10] disable 10g on without main power + [11] restart autoneg on transition to dx power state + [12] restart autoneg + [15:13] link mode: + 0 => 1g no autoneg + 1 => 10g kx4 parallel link no autoneg + 2 => 1g bx autoneg + 3 => 10g sfi serdes + 4 => kx4/kx/kr + 5 => xgmii 1g/100m + 6 => kx4/kx/kr 1g an + 7 kx4/kx/kr sgmii. + [16] kr support + [17] fec requested + [18] fec ability + etc. */ + u32 auto_negotiation_control; + + /* [0] signal detect 1g/100m + [1] fec signal detect + [2] 10g serial pcs fec block lock + [3] 10g serial high error rate + [4] 10g serial pcs block lock + [5] kx/kx4/kr autoneg next page received + [6] kx/kx4/kr backplane autoneg next page received + [7] link status clear to read + [11:8] 10g signal detect (4 lanes) (for serial just lane 0) + [12] 10g serial signal detect + [16:13] 10g parallel lane sync status + [17] 10g parallel align status + [18] 1g sync status + [19] kx/kx4/kr backplane autoneg is idle + [20] 1g autoneg enabled + [21] 1g pcs enabled for sgmii + [22] 10g xgxs enabled + [23] 10g serial fec enabled (forward error detection) + [24] 10g kr pcs enabled + [25] sgmii enabled + [27:26] mac link mode + 0 => 1g + 1 => 10g parallel + 2 => 10g serial + 3 => autoneg + [29:28] link speed + 1 => 100m + 2 => 1g + 3 => 10g + [30] link is up + [31] kx/kx4/kr backplane autoneg completed successfully. */ + u32 link_status; + + /* [17:16] pma/pmd for 10g serial + 0 => kr, 2 => sfi + [18] disable dme pages */ + u32 auto_negotiation_control2; + + CLIB_PAD_FROM_TO (0x42ac, 0x42b0); + u32 link_partner_ability[2]; + CLIB_PAD_FROM_TO (0x42b8, 0x42d0); + u32 manageability_control; + u32 link_partner_next_page[2]; + CLIB_PAD_FROM_TO (0x42dc, 0x42e0); + u32 kr_pcs_control; + u32 kr_pcs_status; + u32 fec_status[2]; + CLIB_PAD_FROM_TO (0x42f0, 0x4314); + u32 sgmii_control; + CLIB_PAD_FROM_TO (0x4318, 0x4324); + u32 link_status2; + CLIB_PAD_FROM_TO (0x4328, 0x4900); + } xge_mac; + + u32 tx_dcb_control; + u32 tx_dcb_descriptor_plane_queue_select; + u32 tx_dcb_descriptor_plane_t1_config; + u32 tx_dcb_descriptor_plane_t1_status; + CLIB_PAD_FROM_TO (0x4910, 0x4950); + + /* For each TC in units of 1k bytes. */ + u32 tx_packet_buffer_thresholds[8]; + CLIB_PAD_FROM_TO (0x4970, 0x4980); + struct + { + u32 mmw; + u32 config; + u32 status; + u32 rate_drift; + } dcb_tx_rate_scheduler; + CLIB_PAD_FROM_TO (0x4990, 0x4a80); + u32 tx_dma_control; + CLIB_PAD_FROM_TO (0x4a84, 0x4a88); + u32 tx_dma_tcp_flags_control[2]; + CLIB_PAD_FROM_TO (0x4a90, 0x4b00); + u32 pf_mailbox[64]; + CLIB_PAD_FROM_TO (0x4c00, 0x5000); + + /* RX */ + u32 checksum_control; + CLIB_PAD_FROM_TO (0x5004, 0x5008); + u32 rx_filter_control; + CLIB_PAD_FROM_TO (0x500c, 0x5010); + u32 management_vlan_tag[8]; + u32 management_udp_tcp_ports[8]; + CLIB_PAD_FROM_TO (0x5050, 0x5078); + /* little endian. */ + u32 extended_vlan_ether_type; + CLIB_PAD_FROM_TO (0x507c, 0x5080); + /* [1] store/dma bad packets + [8] accept all multicast + [9] accept all unicast + [10] accept all broadcast. */ + u32 filter_control; + CLIB_PAD_FROM_TO (0x5084, 0x5088); + /* [15:0] vlan ethernet type (0x8100) little endian + [28] cfi bit expected + [29] drop packets with unexpected cfi bit + [30] vlan filter enable. */ + u32 vlan_control; + CLIB_PAD_FROM_TO (0x508c, 0x5090); + /* [1:0] hi bit of ethernet address for 12 bit index into multicast table + 0 => 47, 1 => 46, 2 => 45, 3 => 43. + [2] enable multicast filter + */ + u32 multicast_control; + CLIB_PAD_FROM_TO (0x5094, 0x5100); + u32 fcoe_rx_control; + CLIB_PAD_FROM_TO (0x5104, 0x5108); + u32 fc_flt_context; + CLIB_PAD_FROM_TO (0x510c, 0x5110); + u32 fc_filter_control; + CLIB_PAD_FROM_TO (0x5114, 0x5120); + u32 rx_message_type_lo; + CLIB_PAD_FROM_TO (0x5124, 0x5128); + /* [15:0] ethernet type (little endian) + [18:16] matche pri in vlan tag + [19] priority match enable + [25:20] virtualization pool + [26] pool enable + [27] is fcoe + [30] ieee 1588 timestamp enable + [31] filter enable. + (See ethernet_type_queue_select.) */ + u32 ethernet_type_queue_filter[8]; + CLIB_PAD_FROM_TO (0x5148, 0x5160); + /* [7:0] l2 ethernet type and + [15:8] l2 ethernet type or */ + u32 management_decision_filters1[8]; + u32 vf_vm_tx_switch_loopback_enable[2]; + u32 rx_time_sync_control; + CLIB_PAD_FROM_TO (0x518c, 0x5190); + u32 management_ethernet_type_filters[4]; + u32 rx_timestamp_attributes_lo; + u32 rx_timestamp_hi; + u32 rx_timestamp_attributes_hi; + CLIB_PAD_FROM_TO (0x51ac, 0x51b0); + u32 pf_virtual_control; + CLIB_PAD_FROM_TO (0x51b4, 0x51d8); + u32 fc_offset_parameter; + CLIB_PAD_FROM_TO (0x51dc, 0x51e0); + u32 vf_rx_enable[2]; + u32 rx_timestamp_lo; + CLIB_PAD_FROM_TO (0x51ec, 0x5200); + /* 12 bits determined by multicast_control + lookup bits in this vector. */ + u32 multicast_enable[128]; + + /* [0] ethernet address [31:0] + [1] [15:0] ethernet address [47:32] + [31] valid bit. + Index 0 is read from eeprom after reset. */ + u32 rx_ethernet_address0[16][2]; + + CLIB_PAD_FROM_TO (0x5480, 0x5800); + u32 wake_up_control; + CLIB_PAD_FROM_TO (0x5804, 0x5808); + u32 wake_up_filter_control; + CLIB_PAD_FROM_TO (0x580c, 0x5818); + u32 multiple_rx_queue_command_82598; + CLIB_PAD_FROM_TO (0x581c, 0x5820); + u32 management_control; + u32 management_filter_control; + CLIB_PAD_FROM_TO (0x5828, 0x5838); + u32 wake_up_ip4_address_valid; + CLIB_PAD_FROM_TO (0x583c, 0x5840); + u32 wake_up_ip4_address_table[4]; + u32 management_control_to_host; + CLIB_PAD_FROM_TO (0x5854, 0x5880); + u32 wake_up_ip6_address_table[4]; + + /* unicast_and broadcast_and vlan_and ip_address_and + etc. */ + u32 management_decision_filters[8]; + + u32 management_ip4_or_ip6_address_filters[4][4]; + CLIB_PAD_FROM_TO (0x58f0, 0x5900); + u32 wake_up_packet_length; + CLIB_PAD_FROM_TO (0x5904, 0x5910); + u32 management_ethernet_address_filters[4][2]; + CLIB_PAD_FROM_TO (0x5930, 0x5a00); + u32 wake_up_packet_memory[32]; + CLIB_PAD_FROM_TO (0x5a80, 0x5c00); + u32 redirection_table_82598[32]; + u32 rss_random_keys_82598[10]; + CLIB_PAD_FROM_TO (0x5ca8, 0x6000); + + ixge_dma_regs_t tx_dma[128]; + + u32 pf_vm_vlan_insert[64]; + u32 tx_dma_tcp_max_alloc_size_requests; + CLIB_PAD_FROM_TO (0x8104, 0x8110); + u32 vf_tx_enable[2]; + CLIB_PAD_FROM_TO (0x8118, 0x8120); + /* [0] dcb mode enable + [1] virtualization mode enable + [3:2] number of tcs/qs per pool. */ + u32 multiple_tx_queues_command; + CLIB_PAD_FROM_TO (0x8124, 0x8200); + u32 pf_vf_anti_spoof[8]; + u32 pf_dma_tx_switch_control; + CLIB_PAD_FROM_TO (0x8224, 0x82e0); + u32 tx_strict_low_latency_queues[4]; + CLIB_PAD_FROM_TO (0x82f0, 0x8600); + u32 tx_queue_stats_mapping_82599[32]; + u32 tx_queue_packet_counts[32]; + u32 tx_queue_byte_counts[32][2]; + + struct + { + u32 control; + u32 status; + u32 buffer_almost_full; + CLIB_PAD_FROM_TO (0x880c, 0x8810); + u32 buffer_min_ifg; + CLIB_PAD_FROM_TO (0x8814, 0x8900); + } tx_security; + + struct + { + u32 index; + u32 salt; + u32 key[4]; + CLIB_PAD_FROM_TO (0x8918, 0x8a00); + } tx_ipsec; + + struct + { + u32 capabilities; + u32 control; + u32 tx_sci[2]; + u32 sa; + u32 sa_pn[2]; + u32 key[2][4]; + /* untagged packets, encrypted packets, protected packets, + encrypted bytes, protected bytes */ + u32 stats[5]; + CLIB_PAD_FROM_TO (0x8a50, 0x8c00); + } tx_link_security; + + struct + { + u32 control; + u32 timestamp_value[2]; + u32 system_time[2]; + u32 increment_attributes; + u32 time_adjustment_offset[2]; + u32 aux_control; + u32 target_time[2][2]; + CLIB_PAD_FROM_TO (0x8c34, 0x8c3c); + u32 aux_time_stamp[2][2]; + CLIB_PAD_FROM_TO (0x8c4c, 0x8d00); + } tx_timesync; + + struct + { + u32 control; + u32 status; + CLIB_PAD_FROM_TO (0x8d08, 0x8e00); + } rx_security; + + struct + { + u32 index; + u32 ip_address[4]; + u32 spi; + u32 ip_index; + u32 key[4]; + u32 salt; + u32 mode; + CLIB_PAD_FROM_TO (0x8e34, 0x8f00); + } rx_ipsec; + + struct + { + u32 capabilities; + u32 control; + u32 sci[2]; + u32 sa[2]; + u32 sa_pn[2]; + u32 key[2][4]; + /* see datasheet */ + u32 stats[17]; + CLIB_PAD_FROM_TO (0x8f84, 0x9000); + } rx_link_security; + + /* 4 wake up, 2 management, 2 wake up. */ + u32 flexible_filters[8][16][4]; + CLIB_PAD_FROM_TO (0x9800, 0xa000); + + /* 4096 bits. */ + u32 vlan_filter[128]; + + /* [0] ethernet address [31:0] + [1] [15:0] ethernet address [47:32] + [31] valid bit. + Index 0 is read from eeprom after reset. */ + u32 rx_ethernet_address1[128][2]; + + /* select one of 64 pools for each rx address. */ + u32 rx_ethernet_address_pool_select[128][2]; + CLIB_PAD_FROM_TO (0xaa00, 0xc800); + u32 tx_priority_to_traffic_class; + CLIB_PAD_FROM_TO (0xc804, 0xcc00); + + /* In bytes units of 1k. Total packet buffer is 160k. */ + u32 tx_packet_buffer_size[8]; + + CLIB_PAD_FROM_TO (0xcc20, 0xcd10); + u32 tx_manageability_tc_mapping; + CLIB_PAD_FROM_TO (0xcd14, 0xcd20); + u32 dcb_tx_packet_plane_t2_config[8]; + u32 dcb_tx_packet_plane_t2_status[8]; + CLIB_PAD_FROM_TO (0xcd60, 0xce00); + + u32 tx_flow_control_status; + CLIB_PAD_FROM_TO (0xce04, 0xd000); + + ixge_dma_regs_t rx_dma1[64]; + + struct + { + /* Bigendian ip4 src/dst address. */ + u32 src_address[128]; + u32 dst_address[128]; + + /* TCP/UDP ports [15:0] src [31:16] dst; bigendian. */ + u32 tcp_udp_port[128]; + + /* [1:0] protocol tcp, udp, sctp, other + [4:2] match priority (highest wins) + [13:8] pool + [25] src address match disable + [26] dst address match disable + [27] src port match disable + [28] dst port match disable + [29] protocol match disable + [30] pool match disable + [31] enable. */ + u32 control[128]; + + /* [12] size bypass + [19:13] must be 0x80 + [20] low-latency interrupt + [27:21] rx queue. */ + u32 interrupt[128]; + } ip4_filters; + + CLIB_PAD_FROM_TO (0xea00, 0xeb00); + /* 4 bit rss output index indexed by 7 bit hash. + 128 8 bit fields = 32 registers. */ + u32 redirection_table_82599[32]; + + u32 rss_random_key_82599[10]; + CLIB_PAD_FROM_TO (0xeba8, 0xec00); + /* [15:0] reserved + [22:16] rx queue index + [29] low-latency interrupt on match + [31] enable */ + u32 ethernet_type_queue_select[8]; + CLIB_PAD_FROM_TO (0xec20, 0xec30); + u32 syn_packet_queue_filter; + CLIB_PAD_FROM_TO (0xec34, 0xec60); + u32 immediate_interrupt_rx_vlan_priority; + CLIB_PAD_FROM_TO (0xec64, 0xec70); + u32 rss_queues_per_traffic_class; + CLIB_PAD_FROM_TO (0xec74, 0xec90); + u32 lli_size_threshold; + CLIB_PAD_FROM_TO (0xec94, 0xed00); + + struct + { + u32 control; + CLIB_PAD_FROM_TO (0xed04, 0xed10); + u32 table[8]; + CLIB_PAD_FROM_TO (0xed30, 0xee00); + } fcoe_redirection; + + struct + { + /* [1:0] packet buffer allocation 0 => disabled, else 64k*2^(f-1) + [3] packet buffer initialization done + [4] perfetch match mode + [5] report status in rss field of rx descriptors + [7] report status always + [14:8] drop queue + [20:16] flex 2 byte packet offset (units of 2 bytes) + [27:24] max linked list length + [31:28] full threshold. */ + u32 control; + CLIB_PAD_FROM_TO (0xee04, 0xee0c); + + u32 data[8]; + + /* [1:0] 0 => no action, 1 => add, 2 => remove, 3 => query. + [2] valid filter found by query command + [3] filter update override + [4] ip6 adress table + [6:5] l4 protocol reserved, udp, tcp, sctp + [7] is ip6 + [8] clear head/tail + [9] packet drop action + [10] matched packet generates low-latency interrupt + [11] last in linked list + [12] collision + [15] rx queue enable + [22:16] rx queue + [29:24] pool. */ + u32 command; + + CLIB_PAD_FROM_TO (0xee30, 0xee3c); + /* ip4 dst/src address, tcp ports, udp ports. + set bits mean bit is ignored. */ + u32 ip4_masks[4]; + u32 filter_length; + u32 usage_stats; + u32 failed_usage_stats; + u32 filters_match_stats; + u32 filters_miss_stats; + CLIB_PAD_FROM_TO (0xee60, 0xee68); + /* Lookup, signature. */ + u32 hash_keys[2]; + /* [15:0] ip6 src address 1 bit per byte + [31:16] ip6 dst address. */ + u32 ip6_mask; + /* [0] vlan id + [1] vlan priority + [2] pool + [3] ip protocol + [4] flex + [5] dst ip6. */ + u32 other_mask; + CLIB_PAD_FROM_TO (0xee78, 0xf000); + } flow_director; + + struct + { + u32 l2_control[64]; + u32 vlan_pool_filter[64]; + u32 vlan_pool_filter_bitmap[128]; + u32 dst_ethernet_address[128]; + u32 mirror_rule[4]; + u32 mirror_rule_vlan[8]; + u32 mirror_rule_pool[8]; + CLIB_PAD_FROM_TO (0xf650, 0x10010); + } pf_bar; + + u32 eeprom_flash_control; + /* [0] start + [1] done + [15:2] address + [31:16] read data. */ + u32 eeprom_read; + CLIB_PAD_FROM_TO (0x10018, 0x1001c); + u32 flash_access; + CLIB_PAD_FROM_TO (0x10020, 0x10114); + u32 flash_data; + u32 flash_control; + u32 flash_read_data; + CLIB_PAD_FROM_TO (0x10120, 0x1013c); + u32 flash_opcode; + u32 software_semaphore; + CLIB_PAD_FROM_TO (0x10144, 0x10148); + u32 firmware_semaphore; + CLIB_PAD_FROM_TO (0x1014c, 0x10160); + u32 software_firmware_sync; + CLIB_PAD_FROM_TO (0x10164, 0x10200); + u32 general_rx_control; + CLIB_PAD_FROM_TO (0x10204, 0x11000); + + struct + { + u32 control; + CLIB_PAD_FROM_TO (0x11004, 0x11010); + /* [3:0] enable counters + [7:4] leaky bucket counter mode + [29] reset + [30] stop + [31] start. */ + u32 counter_control; + /* [7:0],[15:8],[23:16],[31:24] event for counters 0-3. + event codes: + 0x0 bad tlp + 0x10 reqs that reached timeout + etc. */ + u32 counter_event; + CLIB_PAD_FROM_TO (0x11018, 0x11020); + u32 counters_clear_on_read[4]; + u32 counter_config[4]; + struct + { + u32 address; + u32 data; + } indirect_access; + CLIB_PAD_FROM_TO (0x11048, 0x11050); + u32 extended_control; + CLIB_PAD_FROM_TO (0x11054, 0x11064); + u32 mirrored_revision_id; + CLIB_PAD_FROM_TO (0x11068, 0x11070); + u32 dca_requester_id_information; + + /* [0] global disable + [4:1] mode: 0 => legacy, 1 => dca 1.0. */ + u32 dca_control; + CLIB_PAD_FROM_TO (0x11078, 0x110b0); + /* [0] pci completion abort + [1] unsupported i/o address + [2] wrong byte enable + [3] pci timeout */ + u32 pcie_interrupt_status; + CLIB_PAD_FROM_TO (0x110b4, 0x110b8); + u32 pcie_interrupt_enable; + CLIB_PAD_FROM_TO (0x110bc, 0x110c0); + u32 msi_x_pba_clear[8]; + CLIB_PAD_FROM_TO (0x110e0, 0x12300); + } pcie; + + u32 interrupt_throttle1[128 - 24]; + CLIB_PAD_FROM_TO (0x124a0, 0x14f00); + + u32 core_analog_config; + CLIB_PAD_FROM_TO (0x14f04, 0x14f10); + u32 core_common_config; + CLIB_PAD_FROM_TO (0x14f14, 0x15f14); + + u32 link_sec_software_firmware_interface; +} ixge_regs_t; + +typedef union +{ + struct + { + /* Addresses bigendian. */ + union + { + struct + { + ip6_address_t src_address; + u32 unused[1]; + } ip6; + struct + { + u32 unused[3]; + ip4_address_t src_address, dst_address; + } ip4; + }; + + /* [15:0] src port (little endian). + [31:16] dst port. */ + u32 tcp_udp_ports; + + /* [15:0] vlan (cfi bit set to 0). + [31:16] flex bytes. bigendian. */ + u32 vlan_and_flex_word; + + /* [14:0] hash + [15] bucket valid + [31:16] signature (signature filers)/sw-index (perfect match). */ + u32 hash; + }; + + u32 as_u32[8]; +} ixge_flow_director_key_t; + +always_inline void +ixge_throttle_queue_interrupt (ixge_regs_t * r, + u32 queue_interrupt_index, + f64 inter_interrupt_interval_in_secs) +{ + volatile u32 *tr = + (queue_interrupt_index < ARRAY_LEN (r->interrupt.throttle0) + ? &r->interrupt.throttle0[queue_interrupt_index] + : &r->interrupt_throttle1[queue_interrupt_index]); + ASSERT (queue_interrupt_index < 128); + u32 v; + i32 i, mask = (1 << 9) - 1; + + i = flt_round_nearest (inter_interrupt_interval_in_secs / 2e-6); + i = i < 1 ? 1 : i; + i = i >= mask ? mask : i; + + v = tr[0]; + v &= ~(mask << 3); + v |= i << 3; + tr[0] = v; +} + +#define foreach_ixge_counter \ + _ (0x40d0, rx_total_packets) \ + _64 (0x40c0, rx_total_bytes) \ + _ (0x41b0, rx_good_packets_before_filtering) \ + _64 (0x41b4, rx_good_bytes_before_filtering) \ + _ (0x2f50, rx_dma_good_packets) \ + _64 (0x2f54, rx_dma_good_bytes) \ + _ (0x2f5c, rx_dma_duplicated_good_packets) \ + _64 (0x2f60, rx_dma_duplicated_good_bytes) \ + _ (0x2f68, rx_dma_good_loopback_packets) \ + _64 (0x2f6c, rx_dma_good_loopback_bytes) \ + _ (0x2f74, rx_dma_good_duplicated_loopback_packets) \ + _64 (0x2f78, rx_dma_good_duplicated_loopback_bytes) \ + _ (0x4074, rx_good_packets) \ + _64 (0x4088, rx_good_bytes) \ + _ (0x407c, rx_multicast_packets) \ + _ (0x4078, rx_broadcast_packets) \ + _ (0x405c, rx_64_byte_packets) \ + _ (0x4060, rx_65_127_byte_packets) \ + _ (0x4064, rx_128_255_byte_packets) \ + _ (0x4068, rx_256_511_byte_packets) \ + _ (0x406c, rx_512_1023_byte_packets) \ + _ (0x4070, rx_gt_1023_byte_packets) \ + _ (0x4000, rx_crc_errors) \ + _ (0x4120, rx_ip_checksum_errors) \ + _ (0x4004, rx_illegal_symbol_errors) \ + _ (0x4008, rx_error_symbol_errors) \ + _ (0x4034, rx_mac_local_faults) \ + _ (0x4038, rx_mac_remote_faults) \ + _ (0x4040, rx_length_errors) \ + _ (0x41a4, rx_xons) \ + _ (0x41a8, rx_xoffs) \ + _ (0x40a4, rx_undersize_packets) \ + _ (0x40a8, rx_fragments) \ + _ (0x40ac, rx_oversize_packets) \ + _ (0x40b0, rx_jabbers) \ + _ (0x40b4, rx_management_packets) \ + _ (0x40b8, rx_management_drops) \ + _ (0x3fa0, rx_missed_packets_pool_0) \ + _ (0x40d4, tx_total_packets) \ + _ (0x4080, tx_good_packets) \ + _64 (0x4090, tx_good_bytes) \ + _ (0x40f0, tx_multicast_packets) \ + _ (0x40f4, tx_broadcast_packets) \ + _ (0x87a0, tx_dma_good_packets) \ + _64 (0x87a4, tx_dma_good_bytes) \ + _ (0x40d8, tx_64_byte_packets) \ + _ (0x40dc, tx_65_127_byte_packets) \ + _ (0x40e0, tx_128_255_byte_packets) \ + _ (0x40e4, tx_256_511_byte_packets) \ + _ (0x40e8, tx_512_1023_byte_packets) \ + _ (0x40ec, tx_gt_1023_byte_packets) \ + _ (0x4010, tx_undersize_drops) \ + _ (0x8780, switch_security_violation_packets) \ + _ (0x5118, fc_crc_errors) \ + _ (0x241c, fc_rx_drops) \ + _ (0x2424, fc_last_error_count) \ + _ (0x2428, fcoe_rx_packets) \ + _ (0x242c, fcoe_rx_dwords) \ + _ (0x8784, fcoe_tx_packets) \ + _ (0x8788, fcoe_tx_dwords) \ + _ (0x1030, queue_0_rx_count) \ + _ (0x1430, queue_0_drop_count) \ + _ (0x1070, queue_1_rx_count) \ + _ (0x1470, queue_1_drop_count) \ + _ (0x10b0, queue_2_rx_count) \ + _ (0x14b0, queue_2_drop_count) \ + _ (0x10f0, queue_3_rx_count) \ + _ (0x14f0, queue_3_drop_count) \ + _ (0x1130, queue_4_rx_count) \ + _ (0x1530, queue_4_drop_count) \ + _ (0x1170, queue_5_rx_count) \ + _ (0x1570, queue_5_drop_count) \ + _ (0x11b0, queue_6_rx_count) \ + _ (0x15b0, queue_6_drop_count) \ + _ (0x11f0, queue_7_rx_count) \ + _ (0x15f0, queue_7_drop_count) \ + _ (0x1230, queue_8_rx_count) \ + _ (0x1630, queue_8_drop_count) \ + _ (0x1270, queue_9_rx_count) \ + _ (0x1270, queue_9_drop_count) + + + + +typedef enum +{ +#define _(a,f) IXGE_COUNTER_##f, +#define _64(a,f) _(a,f) + foreach_ixge_counter +#undef _ +#undef _64 + IXGE_N_COUNTER, +} ixge_counter_type_t; + +typedef struct +{ + u32 mdio_address; + + /* 32 bit ID read from ID registers. */ + u32 id; +} ixge_phy_t; + +typedef struct +{ + /* Cache aligned descriptors. */ + ixge_descriptor_t *descriptors; + + /* Number of descriptors in table. */ + u32 n_descriptors; + + /* Software head and tail pointers into descriptor ring. */ + u32 head_index, tail_index; + + /* Index into dma_queues vector. */ + u32 queue_index; + + /* Buffer indices corresponding to each active descriptor. */ + u32 *descriptor_buffer_indices; + + union + { + struct + { + u32 *volatile head_index_write_back; + + u32 n_buffers_on_ring; + } tx; + + struct + { + /* Buffer indices to use to replenish each descriptor. */ + u32 *replenish_buffer_indices; + + vlib_node_runtime_t *node; + u32 next_index; + + u32 saved_start_of_packet_buffer_index; + + u32 saved_start_of_packet_next_index; + u32 saved_last_buffer_index; + + u32 is_start_of_packet; + + u32 n_descriptors_done_total; + + u32 n_descriptors_done_this_call; + + u32 n_bytes; + } rx; + }; +} ixge_dma_queue_t; + +#define foreach_ixge_pci_device_id \ + _ (82598, 0x10b6) \ + _ (82598_bx, 0x1508) \ + _ (82598af_dual_port, 0x10c6) \ + _ (82598af_single_port, 0x10c7) \ + _ (82598at, 0x10c8) \ + _ (82598at2, 0x150b) \ + _ (82598eb_sfp_lom, 0x10db) \ + _ (82598eb_cx4, 0x10dd) \ + _ (82598_cx4_dual_port, 0x10ec) \ + _ (82598_da_dual_port, 0x10f1) \ + _ (82598_sr_dual_port_em, 0x10e1) \ + _ (82598eb_xf_lr, 0x10f4) \ + _ (82599_kx4, 0x10f7) \ + _ (82599_kx4_mezz, 0x1514) \ + _ (82599_kr, 0x1517) \ + _ (82599_combo_backplane, 0x10f8) \ + _ (82599_cx4, 0x10f9) \ + _ (82599_sfp, 0x10fb) \ + _ (82599_backplane_fcoe, 0x152a) \ + _ (82599_sfp_fcoe, 0x1529) \ + _ (82599_sfp_em, 0x1507) \ + _ (82599_xaui_lom, 0x10fc) \ + _ (82599_t3_lom, 0x151c) \ + _ (x540t, 0x1528) + +typedef enum +{ +#define _(f,n) IXGE_##f = n, + foreach_ixge_pci_device_id +#undef _ +} ixge_pci_device_id_t; + +typedef struct +{ + /* registers */ + ixge_regs_t *regs; + + /* Specific next index when using dynamic redirection */ + u32 per_interface_next_index; + + /* PCI bus info. */ + vlib_pci_device_t pci_device; + + /* From PCI config space header. */ + ixge_pci_device_id_t device_id; + + u16 device_index; + + /* 0 or 1. */ + u16 pci_function; + + /* VLIB interface for this instance. */ + u32 vlib_hw_if_index, vlib_sw_if_index; + + ixge_dma_queue_t *dma_queues[VLIB_N_RX_TX]; + + /* Phy index (0 or 1) and address on MDI bus. */ + u32 phy_index; + ixge_phy_t phys[2]; + + /* Value of link_status register at last link change. */ + u32 link_status_at_last_link_change; + + i2c_bus_t i2c_bus; + sfp_eeprom_t sfp_eeprom; + + /* Counters. */ + u64 counters[IXGE_N_COUNTER], counters_last_clear[IXGE_N_COUNTER]; +} ixge_device_t; + +typedef struct +{ + vlib_main_t *vlib_main; + + /* Vector of devices. */ + ixge_device_t *devices; + + /* Descriptor ring sizes. */ + u32 n_descriptors[VLIB_N_RX_TX]; + + /* RX buffer size. Must be at least 1k; will be rounded to + next largest 1k size. */ + u32 n_bytes_in_rx_buffer; + + u32 n_descriptors_per_cache_line; + + u32 vlib_buffer_free_list_index; + + u32 process_node_index; + + /* Template and mask for initializing/validating TX descriptors. */ + ixge_tx_descriptor_t tx_descriptor_template, tx_descriptor_template_mask; + + /* Vector of buffers for which TX is done and can be freed. */ + u32 *tx_buffers_pending_free; + + u32 *rx_buffers_to_add; + + f64 time_last_stats_update; +} ixge_main_t; + +ixge_main_t ixge_main; +vnet_device_class_t ixge_device_class; + +typedef enum +{ + IXGE_RX_NEXT_IP4_INPUT, + IXGE_RX_NEXT_IP6_INPUT, + IXGE_RX_NEXT_ETHERNET_INPUT, + IXGE_RX_NEXT_DROP, + IXGE_RX_N_NEXT, +} ixge_rx_next_t; + +void ixge_set_next_node (ixge_rx_next_t, char *); + +#endif /* included_ixge_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index 6ba82584..a517a597 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -44,6 +44,7 @@ */ #include +#include uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, @@ -583,6 +584,11 @@ alloc_from_free_list (vlib_main_t * vm, dst = alloc_buffers; + /* wait with buffer memory allocation as long as possible + in case external buffer manager takes over */ + if (PREDICT_FALSE (vm->os_physmem_alloc_aligned == 0)) + unix_physmem_init (vm, 0 /* fail_if_physical_memory_not_present */ ); + n_filled = fill_free_list (vm, free_list, n_alloc_buffers); if (n_filled == 0) return 0; diff --git a/src/vnet.am b/src/vnet.am index ca24c9c5..bc9655cc 100644 --- a/src/vnet.am +++ b/src/vnet.am @@ -114,14 +114,16 @@ libvnet_la_SOURCES += \ vnet/ethernet/init.c \ vnet/ethernet/interface.c \ vnet/ethernet/node.c \ - vnet/ethernet/pg.c + vnet/ethernet/pg.c \ + vnet/ethernet/sfp.c nobase_include_HEADERS += \ vnet/ethernet/arp_packet.h \ vnet/ethernet/error.def \ vnet/ethernet/ethernet.h \ vnet/ethernet/packet.h \ - vnet/ethernet/types.def + vnet/ethernet/types.def \ + vnet/ethernet/sfp.h ######################################## # Layer 2 protocol: Ethernet bridging @@ -792,14 +794,6 @@ nobase_include_HEADERS += \ vnet/pg/pg.h \ vnet/pg/edit.h -if !WITH_DPDK -libvnet_la_SOURCES += \ - vnet/devices/nic/ixge.c \ - vnet/devices/nic/ixge.h \ - vnet/devices/nic/sfp.c \ - vnet/devices/nic/sfp.h -endif - ######################################## # virtio ######################################## diff --git a/src/vnet/devices/nic/ixge.c b/src/vnet/devices/nic/ixge.c deleted file mode 100644 index d4c4c6b7..00000000 --- a/src/vnet/devices/nic/ixge.c +++ /dev/null @@ -1,2938 +0,0 @@ -/* - * Copyright (c) 2016 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * WARNING! - * This driver is not intended for production use and it is unsupported. - * It is provided for educational use only. - * Please use supported DPDK driver instead. - */ - -#if __x86_64__ -#include - -#ifndef CLIB_HAVE_VEC128 -#warning HACK: ixge driver wont really work, missing u32x4 -typedef unsigned long long u32x4; -#endif - -#include -#include -#include -#include -#include -#include - -#define IXGE_ALWAYS_POLL 0 - -#define EVENT_SET_FLAGS 0 -#define IXGE_HWBP_RACE_ELOG 0 - -#define PCI_VENDOR_ID_INTEL 0x8086 - -/* 10 GIG E (XGE) PHY IEEE 802.3 clause 45 definitions. */ -#define XGE_PHY_DEV_TYPE_PMA_PMD 1 -#define XGE_PHY_DEV_TYPE_PHY_XS 4 -#define XGE_PHY_ID1 0x2 -#define XGE_PHY_ID2 0x3 -#define XGE_PHY_CONTROL 0x0 -#define XGE_PHY_CONTROL_RESET (1 << 15) - -ixge_main_t ixge_main; -static vlib_node_registration_t ixge_input_node; -static vlib_node_registration_t ixge_process_node; - -static void -ixge_semaphore_get (ixge_device_t * xd) -{ - ixge_main_t *xm = &ixge_main; - vlib_main_t *vm = xm->vlib_main; - ixge_regs_t *r = xd->regs; - u32 i; - - i = 0; - while (!(r->software_semaphore & (1 << 0))) - { - if (i > 0) - vlib_process_suspend (vm, 100e-6); - i++; - } - do - { - r->software_semaphore |= 1 << 1; - } - while (!(r->software_semaphore & (1 << 1))); -} - -static void -ixge_semaphore_release (ixge_device_t * xd) -{ - ixge_regs_t *r = xd->regs; - r->software_semaphore &= ~3; -} - -static void -ixge_software_firmware_sync (ixge_device_t * xd, u32 sw_mask) -{ - ixge_main_t *xm = &ixge_main; - vlib_main_t *vm = xm->vlib_main; - ixge_regs_t *r = xd->regs; - u32 fw_mask = sw_mask << 5; - u32 m, done = 0; - - while (!done) - { - ixge_semaphore_get (xd); - m = r->software_firmware_sync; - done = (m & fw_mask) == 0; - if (done) - r->software_firmware_sync = m | sw_mask; - ixge_semaphore_release (xd); - if (!done) - vlib_process_suspend (vm, 10e-3); - } -} - -static void -ixge_software_firmware_sync_release (ixge_device_t * xd, u32 sw_mask) -{ - ixge_regs_t *r = xd->regs; - ixge_semaphore_get (xd); - r->software_firmware_sync &= ~sw_mask; - ixge_semaphore_release (xd); -} - -u32 -ixge_read_write_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index, - u32 v, u32 is_read) -{ - ixge_regs_t *r = xd->regs; - const u32 busy_bit = 1 << 30; - u32 x; - - ASSERT (xd->phy_index < 2); - ixge_software_firmware_sync (xd, 1 << (1 + xd->phy_index)); - - ASSERT (reg_index < (1 << 16)); - ASSERT (dev_type < (1 << 5)); - if (!is_read) - r->xge_mac.phy_data = v; - - /* Address cycle. */ - x = - reg_index | (dev_type << 16) | (xd-> - phys[xd->phy_index].mdio_address << 21); - r->xge_mac.phy_command = x | busy_bit; - /* Busy wait timed to take 28e-6 secs. No suspend. */ - while (r->xge_mac.phy_command & busy_bit) - ; - - r->xge_mac.phy_command = x | ((is_read ? 2 : 1) << 26) | busy_bit; - while (r->xge_mac.phy_command & busy_bit) - ; - - if (is_read) - v = r->xge_mac.phy_data >> 16; - - ixge_software_firmware_sync_release (xd, 1 << (1 + xd->phy_index)); - - return v; -} - -static u32 -ixge_read_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index) -{ - return ixge_read_write_phy_reg (xd, dev_type, reg_index, 0, /* is_read */ - 1); -} - -static void -ixge_write_phy_reg (ixge_device_t * xd, u32 dev_type, u32 reg_index, u32 v) -{ - (void) ixge_read_write_phy_reg (xd, dev_type, reg_index, v, /* is_read */ - 0); -} - -static void -ixge_i2c_put_bits (i2c_bus_t * b, int scl, int sda) -{ - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd = vec_elt_at_index (xm->devices, b->private_data); - u32 v; - - v = 0; - v |= (sda != 0) << 3; - v |= (scl != 0) << 1; - xd->regs->i2c_control = v; -} - -static void -ixge_i2c_get_bits (i2c_bus_t * b, int *scl, int *sda) -{ - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd = vec_elt_at_index (xm->devices, b->private_data); - u32 v; - - v = xd->regs->i2c_control; - *sda = (v & (1 << 2)) != 0; - *scl = (v & (1 << 0)) != 0; -} - -static u16 -ixge_read_eeprom (ixge_device_t * xd, u32 address) -{ - ixge_regs_t *r = xd->regs; - u32 v; - r->eeprom_read = (( /* start bit */ (1 << 0)) | (address << 2)); - /* Wait for done bit. */ - while (!((v = r->eeprom_read) & (1 << 1))) - ; - return v >> 16; -} - -static void -ixge_sfp_enable_disable_laser (ixge_device_t * xd, uword enable) -{ - u32 tx_disable_bit = 1 << 3; - if (enable) - xd->regs->sdp_control &= ~tx_disable_bit; - else - xd->regs->sdp_control |= tx_disable_bit; -} - -static void -ixge_sfp_enable_disable_10g (ixge_device_t * xd, uword enable) -{ - u32 is_10g_bit = 1 << 5; - if (enable) - xd->regs->sdp_control |= is_10g_bit; - else - xd->regs->sdp_control &= ~is_10g_bit; -} - -static clib_error_t * -ixge_sfp_phy_init_from_eeprom (ixge_device_t * xd, u16 sfp_type) -{ - u16 a, id, reg_values_addr = 0; - - a = ixge_read_eeprom (xd, 0x2b); - if (a == 0 || a == 0xffff) - return clib_error_create ("no init sequence in eeprom"); - - while (1) - { - id = ixge_read_eeprom (xd, ++a); - if (id == 0xffff) - break; - reg_values_addr = ixge_read_eeprom (xd, ++a); - if (id == sfp_type) - break; - } - if (id != sfp_type) - return clib_error_create ("failed to find id 0x%x", sfp_type); - - ixge_software_firmware_sync (xd, 1 << 3); - while (1) - { - u16 v = ixge_read_eeprom (xd, ++reg_values_addr); - if (v == 0xffff) - break; - xd->regs->core_analog_config = v; - } - ixge_software_firmware_sync_release (xd, 1 << 3); - - /* Make sure laser is off. We'll turn on the laser when - the interface is brought up. */ - ixge_sfp_enable_disable_laser (xd, /* enable */ 0); - ixge_sfp_enable_disable_10g (xd, /* is_10g */ 1); - - return 0; -} - -static void -ixge_sfp_device_up_down (ixge_device_t * xd, uword is_up) -{ - u32 v; - - if (is_up) - { - /* pma/pmd 10g serial SFI. */ - xd->regs->xge_mac.auto_negotiation_control2 &= ~(3 << 16); - xd->regs->xge_mac.auto_negotiation_control2 |= 2 << 16; - - v = xd->regs->xge_mac.auto_negotiation_control; - v &= ~(7 << 13); - v |= (0 << 13); - /* Restart autoneg. */ - v |= (1 << 12); - xd->regs->xge_mac.auto_negotiation_control = v; - - while (!(xd->regs->xge_mac.link_partner_ability[0] & 0xf0000)) - ; - - v = xd->regs->xge_mac.auto_negotiation_control; - - /* link mode 10g sfi serdes */ - v &= ~(7 << 13); - v |= (3 << 13); - - /* Restart autoneg. */ - v |= (1 << 12); - xd->regs->xge_mac.auto_negotiation_control = v; - - xd->regs->xge_mac.link_status; - } - - ixge_sfp_enable_disable_laser (xd, /* enable */ is_up); - - /* Give time for link partner to notice that we're up. */ - if (is_up && vlib_in_process_context (vlib_get_main ())) - { - vlib_process_suspend (vlib_get_main (), 300e-3); - } -} - -always_inline ixge_dma_regs_t * -get_dma_regs (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 qi) -{ - ixge_regs_t *r = xd->regs; - ASSERT (qi < 128); - if (rt == VLIB_RX) - return qi < 64 ? &r->rx_dma0[qi] : &r->rx_dma1[qi - 64]; - else - return &r->tx_dma[qi]; -} - -static clib_error_t * -ixge_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) -{ - vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index); - uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd = vec_elt_at_index (xm->devices, hif->dev_instance); - ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, 0); - - if (is_up) - { - xd->regs->rx_enable |= 1; - xd->regs->tx_dma_control |= 1; - dr->control |= 1 << 25; - while (!(dr->control & (1 << 25))) - ; - } - else - { - xd->regs->rx_enable &= ~1; - xd->regs->tx_dma_control &= ~1; - } - - ixge_sfp_device_up_down (xd, is_up); - - return /* no error */ 0; -} - -static void -ixge_sfp_phy_init (ixge_device_t * xd) -{ - ixge_phy_t *phy = xd->phys + xd->phy_index; - i2c_bus_t *ib = &xd->i2c_bus; - - ib->private_data = xd->device_index; - ib->put_bits = ixge_i2c_put_bits; - ib->get_bits = ixge_i2c_get_bits; - vlib_i2c_init (ib); - - vlib_i2c_read_eeprom (ib, 0x50, 0, 128, (u8 *) & xd->sfp_eeprom); - - if (vlib_i2c_bus_timed_out (ib) || !sfp_eeprom_is_valid (&xd->sfp_eeprom)) - xd->sfp_eeprom.id = SFP_ID_unknown; - else - { - /* FIXME 5 => SR/LR eeprom ID. */ - clib_error_t *e = - ixge_sfp_phy_init_from_eeprom (xd, 5 + xd->pci_function); - if (e) - clib_error_report (e); - } - - phy->mdio_address = ~0; -} - -static void -ixge_phy_init (ixge_device_t * xd) -{ - ixge_main_t *xm = &ixge_main; - vlib_main_t *vm = xm->vlib_main; - ixge_phy_t *phy = xd->phys + xd->phy_index; - - switch (xd->device_id) - { - case IXGE_82599_sfp: - case IXGE_82599_sfp_em: - case IXGE_82599_sfp_fcoe: - /* others? */ - return ixge_sfp_phy_init (xd); - - default: - break; - } - - /* Probe address of phy. */ - { - u32 i, v; - - phy->mdio_address = ~0; - for (i = 0; i < 32; i++) - { - phy->mdio_address = i; - v = ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID1); - if (v != 0xffff && v != 0) - break; - } - - /* No PHY found? */ - if (i >= 32) - return; - } - - phy->id = - ((ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID1) << 16) | - ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PMA_PMD, XGE_PHY_ID2)); - - { - ELOG_TYPE_DECLARE (e) = - { - .function = (char *) __FUNCTION__,.format = - "ixge %d, phy id 0x%d mdio address %d",.format_args = "i4i4i4",}; - struct - { - u32 instance, id, address; - } *ed; - ed = ELOG_DATA (&vm->elog_main, e); - ed->instance = xd->device_index; - ed->id = phy->id; - ed->address = phy->mdio_address; - } - - /* Reset phy. */ - ixge_write_phy_reg (xd, XGE_PHY_DEV_TYPE_PHY_XS, XGE_PHY_CONTROL, - XGE_PHY_CONTROL_RESET); - - /* Wait for self-clearning reset bit to clear. */ - do - { - vlib_process_suspend (vm, 1e-3); - } - while (ixge_read_phy_reg (xd, XGE_PHY_DEV_TYPE_PHY_XS, XGE_PHY_CONTROL) & - XGE_PHY_CONTROL_RESET); -} - -static u8 * -format_ixge_rx_from_hw_descriptor (u8 * s, va_list * va) -{ - ixge_rx_from_hw_descriptor_t *d = - va_arg (*va, ixge_rx_from_hw_descriptor_t *); - u32 s0 = d->status[0], s2 = d->status[2]; - u32 is_ip4, is_ip6, is_ip, is_tcp, is_udp; - uword indent = format_get_indent (s); - - s = format (s, "%s-owned", - (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE) ? "sw" : - "hw"); - s = - format (s, ", length this descriptor %d, l3 offset %d", - d->n_packet_bytes_this_descriptor, - IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s0)); - if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) - s = format (s, ", end-of-packet"); - - s = format (s, "\n%U", format_white_space, indent); - - if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_ETHERNET_ERROR) - s = format (s, "layer2 error"); - - if (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_LAYER2) - { - s = format (s, "layer 2 type %d", (s0 & 0x1f)); - return s; - } - - if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_VLAN) - s = format (s, "vlan header 0x%x\n%U", d->vlan_tag, - format_white_space, indent); - - if ((is_ip4 = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4))) - { - s = format (s, "ip4%s", - (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4_EXT) ? " options" : - ""); - if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED) - s = format (s, " checksum %s", - (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) ? - "bad" : "ok"); - } - if ((is_ip6 = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6))) - s = format (s, "ip6%s", - (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6_EXT) ? " extended" : - ""); - is_tcp = is_udp = 0; - if ((is_ip = (is_ip4 | is_ip6))) - { - is_tcp = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_TCP) != 0; - is_udp = (s0 & IXGE_RX_DESCRIPTOR_STATUS0_IS_UDP) != 0; - if (is_tcp) - s = format (s, ", tcp"); - if (is_udp) - s = format (s, ", udp"); - } - - if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED) - s = format (s, ", tcp checksum %s", - (s2 & IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR) ? "bad" : - "ok"); - if (s2 & IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED) - s = format (s, ", udp checksum %s", - (s2 & IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR) ? "bad" : - "ok"); - - return s; -} - -static u8 * -format_ixge_tx_descriptor (u8 * s, va_list * va) -{ - ixge_tx_descriptor_t *d = va_arg (*va, ixge_tx_descriptor_t *); - u32 s0 = d->status0, s1 = d->status1; - uword indent = format_get_indent (s); - u32 v; - - s = format (s, "buffer 0x%Lx, %d packet bytes, %d bytes this buffer", - d->buffer_address, s1 >> 14, d->n_bytes_this_buffer); - - s = format (s, "\n%U", format_white_space, indent); - - if ((v = (s0 >> 0) & 3)) - s = format (s, "reserved 0x%x, ", v); - - if ((v = (s0 >> 2) & 3)) - s = format (s, "mac 0x%x, ", v); - - if ((v = (s0 >> 4) & 0xf) != 3) - s = format (s, "type 0x%x, ", v); - - s = format (s, "%s%s%s%s%s%s%s%s", - (s0 & (1 << 8)) ? "eop, " : "", - (s0 & (1 << 9)) ? "insert-fcs, " : "", - (s0 & (1 << 10)) ? "reserved26, " : "", - (s0 & (1 << 11)) ? "report-status, " : "", - (s0 & (1 << 12)) ? "reserved28, " : "", - (s0 & (1 << 13)) ? "is-advanced, " : "", - (s0 & (1 << 14)) ? "vlan-enable, " : "", - (s0 & (1 << 15)) ? "tx-segmentation, " : ""); - - if ((v = s1 & 0xf) != 0) - s = format (s, "status 0x%x, ", v); - - if ((v = (s1 >> 4) & 0xf)) - s = format (s, "context 0x%x, ", v); - - if ((v = (s1 >> 8) & 0x3f)) - s = format (s, "options 0x%x, ", v); - - return s; -} - -typedef struct -{ - ixge_descriptor_t before, after; - - u32 buffer_index; - - u16 device_index; - - u8 queue_index; - - u8 is_start_of_packet; - - /* Copy of VLIB buffer; packet data stored in pre_data. */ - vlib_buffer_t buffer; -} ixge_rx_dma_trace_t; - -static u8 * -format_ixge_rx_dma_trace (u8 * s, va_list * va) -{ - CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); - vlib_node_t *node = va_arg (*va, vlib_node_t *); - vnet_main_t *vnm = vnet_get_main (); - ixge_rx_dma_trace_t *t = va_arg (*va, ixge_rx_dma_trace_t *); - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd = vec_elt_at_index (xm->devices, t->device_index); - format_function_t *f; - uword indent = format_get_indent (s); - - { - vnet_sw_interface_t *sw = - vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); - s = - format (s, "%U rx queue %d", format_vnet_sw_interface_name, vnm, sw, - t->queue_index); - } - - s = format (s, "\n%Ubefore: %U", - format_white_space, indent, - format_ixge_rx_from_hw_descriptor, &t->before); - s = format (s, "\n%Uafter : head/tail address 0x%Lx/0x%Lx", - format_white_space, indent, - t->after.rx_to_hw.head_address, t->after.rx_to_hw.tail_address); - - s = format (s, "\n%Ubuffer 0x%x: %U", - format_white_space, indent, - t->buffer_index, format_vlib_buffer, &t->buffer); - - s = format (s, "\n%U", format_white_space, indent); - - f = node->format_buffer; - if (!f || !t->is_start_of_packet) - f = format_hex_bytes; - s = format (s, "%U", f, t->buffer.pre_data, sizeof (t->buffer.pre_data)); - - return s; -} - -#define foreach_ixge_error \ - _ (none, "no error") \ - _ (tx_full_drops, "tx ring full drops") \ - _ (ip4_checksum_error, "ip4 checksum errors") \ - _ (rx_alloc_fail, "rx buf alloc from free list failed") \ - _ (rx_alloc_no_physmem, "rx buf alloc failed no physmem") - -typedef enum -{ -#define _(f,s) IXGE_ERROR_##f, - foreach_ixge_error -#undef _ - IXGE_N_ERROR, -} ixge_error_t; - -always_inline void -ixge_rx_next_and_error_from_status_x1 (ixge_device_t * xd, - u32 s00, u32 s02, - u8 * next0, u8 * error0, u32 * flags0) -{ - u8 is0_ip4, is0_ip6, n0, e0; - u32 f0; - - e0 = IXGE_ERROR_none; - n0 = IXGE_RX_NEXT_ETHERNET_INPUT; - - is0_ip4 = s02 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; - n0 = is0_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n0; - - e0 = (is0_ip4 && (s02 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) - ? IXGE_ERROR_ip4_checksum_error : e0); - - is0_ip6 = s00 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; - n0 = is0_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n0; - - n0 = (xd->per_interface_next_index != ~0) ? - xd->per_interface_next_index : n0; - - /* Check for error. */ - n0 = e0 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n0; - - f0 = ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED - | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) - ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0); - - f0 |= ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR - | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) - ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT); - - *error0 = e0; - *next0 = n0; - *flags0 = f0; -} - -always_inline void -ixge_rx_next_and_error_from_status_x2 (ixge_device_t * xd, - u32 s00, u32 s02, - u32 s10, u32 s12, - u8 * next0, u8 * error0, u32 * flags0, - u8 * next1, u8 * error1, u32 * flags1) -{ - u8 is0_ip4, is0_ip6, n0, e0; - u8 is1_ip4, is1_ip6, n1, e1; - u32 f0, f1; - - e0 = e1 = IXGE_ERROR_none; - n0 = n1 = IXGE_RX_NEXT_IP4_INPUT; - - is0_ip4 = s02 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; - is1_ip4 = s12 & IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED; - - n0 = is0_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n0; - n1 = is1_ip4 ? IXGE_RX_NEXT_IP4_INPUT : n1; - - e0 = (is0_ip4 && (s02 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) - ? IXGE_ERROR_ip4_checksum_error : e0); - e1 = (is1_ip4 && (s12 & IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR) - ? IXGE_ERROR_ip4_checksum_error : e1); - - is0_ip6 = s00 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; - is1_ip6 = s10 & IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6; - - n0 = is0_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n0; - n1 = is1_ip6 ? IXGE_RX_NEXT_IP6_INPUT : n1; - - n0 = (xd->per_interface_next_index != ~0) ? - xd->per_interface_next_index : n0; - n1 = (xd->per_interface_next_index != ~0) ? - xd->per_interface_next_index : n1; - - /* Check for error. */ - n0 = e0 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n0; - n1 = e1 != IXGE_ERROR_none ? IXGE_RX_NEXT_DROP : n1; - - *error0 = e0; - *error1 = e1; - - *next0 = n0; - *next1 = n1; - - f0 = ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED - | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) - ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0); - f1 = ((s12 & (IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED - | IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED)) - ? IP_BUFFER_L4_CHECKSUM_COMPUTED : 0); - - f0 |= ((s02 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR - | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) - ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT); - f1 |= ((s12 & (IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR - | IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR)) - ? 0 : IP_BUFFER_L4_CHECKSUM_CORRECT); - - *flags0 = f0; - *flags1 = f1; -} - -static void -ixge_rx_trace (ixge_main_t * xm, - ixge_device_t * xd, - ixge_dma_queue_t * dq, - ixge_descriptor_t * before_descriptors, - u32 * before_buffers, - ixge_descriptor_t * after_descriptors, uword n_descriptors) -{ - vlib_main_t *vm = xm->vlib_main; - vlib_node_runtime_t *node = dq->rx.node; - ixge_rx_from_hw_descriptor_t *bd; - ixge_rx_to_hw_descriptor_t *ad; - u32 *b, n_left, is_sop, next_index_sop; - - n_left = n_descriptors; - b = before_buffers; - bd = &before_descriptors->rx_from_hw; - ad = &after_descriptors->rx_to_hw; - is_sop = dq->rx.is_start_of_packet; - next_index_sop = dq->rx.saved_start_of_packet_next_index; - - while (n_left >= 2) - { - u32 bi0, bi1, flags0, flags1; - vlib_buffer_t *b0, *b1; - ixge_rx_dma_trace_t *t0, *t1; - u8 next0, error0, next1, error1; - - bi0 = b[0]; - bi1 = b[1]; - n_left -= 2; - - b0 = vlib_get_buffer (vm, bi0); - b1 = vlib_get_buffer (vm, bi1); - - ixge_rx_next_and_error_from_status_x2 (xd, - bd[0].status[0], bd[0].status[2], - bd[1].status[0], bd[1].status[2], - &next0, &error0, &flags0, - &next1, &error1, &flags1); - - next_index_sop = is_sop ? next0 : next_index_sop; - vlib_trace_buffer (vm, node, next_index_sop, b0, /* follow_chain */ 0); - t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); - t0->is_start_of_packet = is_sop; - is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - - next_index_sop = is_sop ? next1 : next_index_sop; - vlib_trace_buffer (vm, node, next_index_sop, b1, /* follow_chain */ 0); - t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0])); - t1->is_start_of_packet = is_sop; - is_sop = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - - t0->queue_index = dq->queue_index; - t1->queue_index = dq->queue_index; - t0->device_index = xd->device_index; - t1->device_index = xd->device_index; - t0->before.rx_from_hw = bd[0]; - t1->before.rx_from_hw = bd[1]; - t0->after.rx_to_hw = ad[0]; - t1->after.rx_to_hw = ad[1]; - t0->buffer_index = bi0; - t1->buffer_index = bi1; - memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); - memcpy (&t1->buffer, b1, sizeof (b1[0]) - sizeof (b0->pre_data)); - memcpy (t0->buffer.pre_data, b0->data + b0->current_data, - sizeof (t0->buffer.pre_data)); - memcpy (t1->buffer.pre_data, b1->data + b1->current_data, - sizeof (t1->buffer.pre_data)); - - b += 2; - bd += 2; - ad += 2; - } - - while (n_left >= 1) - { - u32 bi0, flags0; - vlib_buffer_t *b0; - ixge_rx_dma_trace_t *t0; - u8 next0, error0; - - bi0 = b[0]; - n_left -= 1; - - b0 = vlib_get_buffer (vm, bi0); - - ixge_rx_next_and_error_from_status_x1 (xd, - bd[0].status[0], bd[0].status[2], - &next0, &error0, &flags0); - - next_index_sop = is_sop ? next0 : next_index_sop; - vlib_trace_buffer (vm, node, next_index_sop, b0, /* follow_chain */ 0); - t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); - t0->is_start_of_packet = is_sop; - is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - - t0->queue_index = dq->queue_index; - t0->device_index = xd->device_index; - t0->before.rx_from_hw = bd[0]; - t0->after.rx_to_hw = ad[0]; - t0->buffer_index = bi0; - memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); - memcpy (t0->buffer.pre_data, b0->data + b0->current_data, - sizeof (t0->buffer.pre_data)); - - b += 1; - bd += 1; - ad += 1; - } -} - -typedef struct -{ - ixge_tx_descriptor_t descriptor; - - u32 buffer_index; - - u16 device_index; - - u8 queue_index; - - u8 is_start_of_packet; - - /* Copy of VLIB buffer; packet data stored in pre_data. */ - vlib_buffer_t buffer; -} ixge_tx_dma_trace_t; - -static u8 * -format_ixge_tx_dma_trace (u8 * s, va_list * va) -{ - CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); - CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *); - ixge_tx_dma_trace_t *t = va_arg (*va, ixge_tx_dma_trace_t *); - vnet_main_t *vnm = vnet_get_main (); - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd = vec_elt_at_index (xm->devices, t->device_index); - format_function_t *f; - uword indent = format_get_indent (s); - - { - vnet_sw_interface_t *sw = - vnet_get_sw_interface (vnm, xd->vlib_sw_if_index); - s = - format (s, "%U tx queue %d", format_vnet_sw_interface_name, vnm, sw, - t->queue_index); - } - - s = format (s, "\n%Udescriptor: %U", - format_white_space, indent, - format_ixge_tx_descriptor, &t->descriptor); - - s = format (s, "\n%Ubuffer 0x%x: %U", - format_white_space, indent, - t->buffer_index, format_vlib_buffer, &t->buffer); - - s = format (s, "\n%U", format_white_space, indent); - - f = format_ethernet_header_with_length; - if (!f || !t->is_start_of_packet) - f = format_hex_bytes; - s = format (s, "%U", f, t->buffer.pre_data, sizeof (t->buffer.pre_data)); - - return s; -} - -typedef struct -{ - vlib_node_runtime_t *node; - - u32 is_start_of_packet; - - u32 n_bytes_in_packet; - - ixge_tx_descriptor_t *start_of_packet_descriptor; -} ixge_tx_state_t; - -static void -ixge_tx_trace (ixge_main_t * xm, - ixge_device_t * xd, - ixge_dma_queue_t * dq, - ixge_tx_state_t * tx_state, - ixge_tx_descriptor_t * descriptors, - u32 * buffers, uword n_descriptors) -{ - vlib_main_t *vm = xm->vlib_main; - vlib_node_runtime_t *node = tx_state->node; - ixge_tx_descriptor_t *d; - u32 *b, n_left, is_sop; - - n_left = n_descriptors; - b = buffers; - d = descriptors; - is_sop = tx_state->is_start_of_packet; - - while (n_left >= 2) - { - u32 bi0, bi1; - vlib_buffer_t *b0, *b1; - ixge_tx_dma_trace_t *t0, *t1; - - bi0 = b[0]; - bi1 = b[1]; - n_left -= 2; - - b0 = vlib_get_buffer (vm, bi0); - b1 = vlib_get_buffer (vm, bi1); - - t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); - t0->is_start_of_packet = is_sop; - is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - - t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0])); - t1->is_start_of_packet = is_sop; - is_sop = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - - t0->queue_index = dq->queue_index; - t1->queue_index = dq->queue_index; - t0->device_index = xd->device_index; - t1->device_index = xd->device_index; - t0->descriptor = d[0]; - t1->descriptor = d[1]; - t0->buffer_index = bi0; - t1->buffer_index = bi1; - memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); - memcpy (&t1->buffer, b1, sizeof (b1[0]) - sizeof (b0->pre_data)); - memcpy (t0->buffer.pre_data, b0->data + b0->current_data, - sizeof (t0->buffer.pre_data)); - memcpy (t1->buffer.pre_data, b1->data + b1->current_data, - sizeof (t1->buffer.pre_data)); - - b += 2; - d += 2; - } - - while (n_left >= 1) - { - u32 bi0; - vlib_buffer_t *b0; - ixge_tx_dma_trace_t *t0; - - bi0 = b[0]; - n_left -= 1; - - b0 = vlib_get_buffer (vm, bi0); - - t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); - t0->is_start_of_packet = is_sop; - is_sop = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - - t0->queue_index = dq->queue_index; - t0->device_index = xd->device_index; - t0->descriptor = d[0]; - t0->buffer_index = bi0; - memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data)); - memcpy (t0->buffer.pre_data, b0->data + b0->current_data, - sizeof (t0->buffer.pre_data)); - - b += 1; - d += 1; - } -} - -always_inline uword -ixge_ring_sub (ixge_dma_queue_t * q, u32 i0, u32 i1) -{ - i32 d = i1 - i0; - ASSERT (i0 < q->n_descriptors); - ASSERT (i1 < q->n_descriptors); - return d < 0 ? q->n_descriptors + d : d; -} - -always_inline uword -ixge_ring_add (ixge_dma_queue_t * q, u32 i0, u32 i1) -{ - u32 d = i0 + i1; - ASSERT (i0 < q->n_descriptors); - ASSERT (i1 < q->n_descriptors); - d -= d >= q->n_descriptors ? q->n_descriptors : 0; - return d; -} - -always_inline uword -ixge_tx_descriptor_matches_template (ixge_main_t * xm, - ixge_tx_descriptor_t * d) -{ - u32 cmp; - - cmp = ((d->status0 & xm->tx_descriptor_template_mask.status0) - ^ xm->tx_descriptor_template.status0); - if (cmp) - return 0; - cmp = ((d->status1 & xm->tx_descriptor_template_mask.status1) - ^ xm->tx_descriptor_template.status1); - if (cmp) - return 0; - - return 1; -} - -static uword -ixge_tx_no_wrap (ixge_main_t * xm, - ixge_device_t * xd, - ixge_dma_queue_t * dq, - u32 * buffers, - u32 start_descriptor_index, - u32 n_descriptors, ixge_tx_state_t * tx_state) -{ - vlib_main_t *vm = xm->vlib_main; - ixge_tx_descriptor_t *d, *d_sop; - u32 n_left = n_descriptors; - u32 *to_free = vec_end (xm->tx_buffers_pending_free); - u32 *to_tx = - vec_elt_at_index (dq->descriptor_buffer_indices, start_descriptor_index); - u32 is_sop = tx_state->is_start_of_packet; - u32 len_sop = tx_state->n_bytes_in_packet; - u16 template_status = xm->tx_descriptor_template.status0; - u32 descriptor_prefetch_rotor = 0; - - ASSERT (start_descriptor_index + n_descriptors <= dq->n_descriptors); - d = &dq->descriptors[start_descriptor_index].tx; - d_sop = is_sop ? d : tx_state->start_of_packet_descriptor; - - while (n_left >= 4) - { - vlib_buffer_t *b0, *b1; - u32 bi0, fi0, len0; - u32 bi1, fi1, len1; - u8 is_eop0, is_eop1; - - /* Prefetch next iteration. */ - vlib_prefetch_buffer_with_index (vm, buffers[2], LOAD); - vlib_prefetch_buffer_with_index (vm, buffers[3], LOAD); - - if ((descriptor_prefetch_rotor & 0x3) == 0) - CLIB_PREFETCH (d + 4, CLIB_CACHE_LINE_BYTES, STORE); - - descriptor_prefetch_rotor += 2; - - bi0 = buffers[0]; - bi1 = buffers[1]; - - to_free[0] = fi0 = to_tx[0]; - to_tx[0] = bi0; - to_free += fi0 != 0; - - to_free[0] = fi1 = to_tx[1]; - to_tx[1] = bi1; - to_free += fi1 != 0; - - buffers += 2; - n_left -= 2; - to_tx += 2; - - b0 = vlib_get_buffer (vm, bi0); - b1 = vlib_get_buffer (vm, bi1); - - is_eop0 = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - is_eop1 = (b1->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - - len0 = b0->current_length; - len1 = b1->current_length; - - ASSERT (ixge_tx_descriptor_matches_template (xm, d + 0)); - ASSERT (ixge_tx_descriptor_matches_template (xm, d + 1)); - - d[0].buffer_address = - vlib_get_buffer_data_physical_address (vm, bi0) + b0->current_data; - d[1].buffer_address = - vlib_get_buffer_data_physical_address (vm, bi1) + b1->current_data; - - d[0].n_bytes_this_buffer = len0; - d[1].n_bytes_this_buffer = len1; - - d[0].status0 = - template_status | (is_eop0 << - IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); - d[1].status0 = - template_status | (is_eop1 << - IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); - - len_sop = (is_sop ? 0 : len_sop) + len0; - d_sop[0].status1 = - IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); - d += 1; - d_sop = is_eop0 ? d : d_sop; - - is_sop = is_eop0; - - len_sop = (is_sop ? 0 : len_sop) + len1; - d_sop[0].status1 = - IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); - d += 1; - d_sop = is_eop1 ? d : d_sop; - - is_sop = is_eop1; - } - - while (n_left > 0) - { - vlib_buffer_t *b0; - u32 bi0, fi0, len0; - u8 is_eop0; - - bi0 = buffers[0]; - - to_free[0] = fi0 = to_tx[0]; - to_tx[0] = bi0; - to_free += fi0 != 0; - - buffers += 1; - n_left -= 1; - to_tx += 1; - - b0 = vlib_get_buffer (vm, bi0); - - is_eop0 = (b0->flags & VLIB_BUFFER_NEXT_PRESENT) == 0; - - len0 = b0->current_length; - - ASSERT (ixge_tx_descriptor_matches_template (xm, d + 0)); - - d[0].buffer_address = - vlib_get_buffer_data_physical_address (vm, bi0) + b0->current_data; - - d[0].n_bytes_this_buffer = len0; - - d[0].status0 = - template_status | (is_eop0 << - IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET); - - len_sop = (is_sop ? 0 : len_sop) + len0; - d_sop[0].status1 = - IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET (len_sop); - d += 1; - d_sop = is_eop0 ? d : d_sop; - - is_sop = is_eop0; - } - - if (tx_state->node->flags & VLIB_NODE_FLAG_TRACE) - { - to_tx = - vec_elt_at_index (dq->descriptor_buffer_indices, - start_descriptor_index); - ixge_tx_trace (xm, xd, dq, tx_state, - &dq->descriptors[start_descriptor_index].tx, to_tx, - n_descriptors); - } - - _vec_len (xm->tx_buffers_pending_free) = - to_free - xm->tx_buffers_pending_free; - - /* When we are done d_sop can point to end of ring. Wrap it if so. */ - { - ixge_tx_descriptor_t *d_start = &dq->descriptors[0].tx; - - ASSERT (d_sop - d_start <= dq->n_descriptors); - d_sop = d_sop - d_start == dq->n_descriptors ? d_start : d_sop; - } - - tx_state->is_start_of_packet = is_sop; - tx_state->start_of_packet_descriptor = d_sop; - tx_state->n_bytes_in_packet = len_sop; - - return n_descriptors; -} - -static uword -ixge_interface_tx (vlib_main_t * vm, - vlib_node_runtime_t * node, vlib_frame_t * f) -{ - ixge_main_t *xm = &ixge_main; - vnet_interface_output_runtime_t *rd = (void *) node->runtime_data; - ixge_device_t *xd = vec_elt_at_index (xm->devices, rd->dev_instance); - ixge_dma_queue_t *dq; - u32 *from, n_left_tx, n_descriptors_to_tx, n_tail_drop; - u32 queue_index = 0; /* fixme parameter */ - ixge_tx_state_t tx_state; - - tx_state.node = node; - tx_state.is_start_of_packet = 1; - tx_state.start_of_packet_descriptor = 0; - tx_state.n_bytes_in_packet = 0; - - from = vlib_frame_vector_args (f); - - dq = vec_elt_at_index (xd->dma_queues[VLIB_TX], queue_index); - - dq->head_index = dq->tx.head_index_write_back[0]; - - /* Since head == tail means ring is empty we can send up to dq->n_descriptors - 1. */ - n_left_tx = dq->n_descriptors - 1; - n_left_tx -= ixge_ring_sub (dq, dq->head_index, dq->tail_index); - - _vec_len (xm->tx_buffers_pending_free) = 0; - - n_descriptors_to_tx = f->n_vectors; - n_tail_drop = 0; - if (PREDICT_FALSE (n_descriptors_to_tx > n_left_tx)) - { - i32 i, n_ok, i_eop, i_sop; - - i_sop = i_eop = ~0; - for (i = n_left_tx - 1; i >= 0; i--) - { - vlib_buffer_t *b = vlib_get_buffer (vm, from[i]); - if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) - { - if (i_sop != ~0 && i_eop != ~0) - break; - i_eop = i; - i_sop = i + 1; - } - } - if (i == 0) - n_ok = 0; - else - n_ok = i_eop + 1; - - { - ELOG_TYPE_DECLARE (e) = - { - .function = (char *) __FUNCTION__,.format = - "ixge %d, ring full to tx %d head %d tail %d",.format_args = - "i2i2i2i2",}; - struct - { - u16 instance, to_tx, head, tail; - } *ed; - ed = ELOG_DATA (&vm->elog_main, e); - ed->instance = xd->device_index; - ed->to_tx = n_descriptors_to_tx; - ed->head = dq->head_index; - ed->tail = dq->tail_index; - } - - if (n_ok < n_descriptors_to_tx) - { - n_tail_drop = n_descriptors_to_tx - n_ok; - vec_add (xm->tx_buffers_pending_free, from + n_ok, n_tail_drop); - vlib_error_count (vm, ixge_input_node.index, - IXGE_ERROR_tx_full_drops, n_tail_drop); - } - - n_descriptors_to_tx = n_ok; - } - - dq->tx.n_buffers_on_ring += n_descriptors_to_tx; - - /* Process from tail to end of descriptor ring. */ - if (n_descriptors_to_tx > 0 && dq->tail_index < dq->n_descriptors) - { - u32 n = - clib_min (dq->n_descriptors - dq->tail_index, n_descriptors_to_tx); - n = ixge_tx_no_wrap (xm, xd, dq, from, dq->tail_index, n, &tx_state); - from += n; - n_descriptors_to_tx -= n; - dq->tail_index += n; - ASSERT (dq->tail_index <= dq->n_descriptors); - if (dq->tail_index == dq->n_descriptors) - dq->tail_index = 0; - } - - if (n_descriptors_to_tx > 0) - { - u32 n = - ixge_tx_no_wrap (xm, xd, dq, from, 0, n_descriptors_to_tx, &tx_state); - from += n; - ASSERT (n == n_descriptors_to_tx); - dq->tail_index += n; - ASSERT (dq->tail_index <= dq->n_descriptors); - if (dq->tail_index == dq->n_descriptors) - dq->tail_index = 0; - } - - /* We should only get full packets. */ - ASSERT (tx_state.is_start_of_packet); - - /* Report status when last descriptor is done. */ - { - u32 i = dq->tail_index == 0 ? dq->n_descriptors - 1 : dq->tail_index - 1; - ixge_tx_descriptor_t *d = &dq->descriptors[i].tx; - d->status0 |= IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS; - } - - /* Give new descriptors to hardware. */ - { - ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_TX, queue_index); - - CLIB_MEMORY_BARRIER (); - - dr->tail_index = dq->tail_index; - } - - /* Free any buffers that are done. */ - { - u32 n = _vec_len (xm->tx_buffers_pending_free); - if (n > 0) - { - vlib_buffer_free_no_next (vm, xm->tx_buffers_pending_free, n); - _vec_len (xm->tx_buffers_pending_free) = 0; - ASSERT (dq->tx.n_buffers_on_ring >= n); - dq->tx.n_buffers_on_ring -= (n - n_tail_drop); - } - } - - return f->n_vectors; -} - -static uword -ixge_rx_queue_no_wrap (ixge_main_t * xm, - ixge_device_t * xd, - ixge_dma_queue_t * dq, - u32 start_descriptor_index, u32 n_descriptors) -{ - vlib_main_t *vm = xm->vlib_main; - vlib_node_runtime_t *node = dq->rx.node; - ixge_descriptor_t *d; - static ixge_descriptor_t *d_trace_save; - static u32 *d_trace_buffers; - u32 n_descriptors_left = n_descriptors; - u32 *to_rx = - vec_elt_at_index (dq->descriptor_buffer_indices, start_descriptor_index); - u32 *to_add; - u32 bi_sop = dq->rx.saved_start_of_packet_buffer_index; - u32 bi_last = dq->rx.saved_last_buffer_index; - u32 next_index_sop = dq->rx.saved_start_of_packet_next_index; - u32 is_sop = dq->rx.is_start_of_packet; - u32 next_index, n_left_to_next, *to_next; - u32 n_packets = 0; - u32 n_bytes = 0; - u32 n_trace = vlib_get_trace_count (vm, node); - vlib_buffer_t *b_last, b_dummy; - - ASSERT (start_descriptor_index + n_descriptors <= dq->n_descriptors); - d = &dq->descriptors[start_descriptor_index]; - - b_last = bi_last != ~0 ? vlib_get_buffer (vm, bi_last) : &b_dummy; - next_index = dq->rx.next_index; - - if (n_trace > 0) - { - u32 n = clib_min (n_trace, n_descriptors); - if (d_trace_save) - { - _vec_len (d_trace_save) = 0; - _vec_len (d_trace_buffers) = 0; - } - vec_add (d_trace_save, (ixge_descriptor_t *) d, n); - vec_add (d_trace_buffers, to_rx, n); - } - - { - uword l = vec_len (xm->rx_buffers_to_add); - - if (l < n_descriptors_left) - { - u32 n_to_alloc = 2 * dq->n_descriptors - l; - u32 n_allocated; - - vec_resize (xm->rx_buffers_to_add, n_to_alloc); - - _vec_len (xm->rx_buffers_to_add) = l; - n_allocated = vlib_buffer_alloc_from_free_list - (vm, xm->rx_buffers_to_add + l, n_to_alloc, - xm->vlib_buffer_free_list_index); - _vec_len (xm->rx_buffers_to_add) += n_allocated; - - /* Handle transient allocation failure */ - if (PREDICT_FALSE (l + n_allocated <= n_descriptors_left)) - { - if (n_allocated == 0) - vlib_error_count (vm, ixge_input_node.index, - IXGE_ERROR_rx_alloc_no_physmem, 1); - else - vlib_error_count (vm, ixge_input_node.index, - IXGE_ERROR_rx_alloc_fail, 1); - - n_descriptors_left = l + n_allocated; - } - n_descriptors = n_descriptors_left; - } - - /* Add buffers from end of vector going backwards. */ - to_add = vec_end (xm->rx_buffers_to_add) - 1; - } - - while (n_descriptors_left > 0) - { - vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); - - while (n_descriptors_left >= 4 && n_left_to_next >= 2) - { - vlib_buffer_t *b0, *b1; - u32 bi0, fi0, len0, l3_offset0, s20, s00, flags0; - u32 bi1, fi1, len1, l3_offset1, s21, s01, flags1; - u8 is_eop0, error0, next0; - u8 is_eop1, error1, next1; - ixge_descriptor_t d0, d1; - - vlib_prefetch_buffer_with_index (vm, to_rx[2], STORE); - vlib_prefetch_buffer_with_index (vm, to_rx[3], STORE); - - CLIB_PREFETCH (d + 2, 32, STORE); - - d0.as_u32x4 = d[0].as_u32x4; - d1.as_u32x4 = d[1].as_u32x4; - - s20 = d0.rx_from_hw.status[2]; - s21 = d1.rx_from_hw.status[2]; - - s00 = d0.rx_from_hw.status[0]; - s01 = d1.rx_from_hw.status[0]; - - if (! - ((s20 & s21) & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE)) - goto found_hw_owned_descriptor_x2; - - bi0 = to_rx[0]; - bi1 = to_rx[1]; - - ASSERT (to_add - 1 >= xm->rx_buffers_to_add); - fi0 = to_add[0]; - fi1 = to_add[-1]; - - to_rx[0] = fi0; - to_rx[1] = fi1; - to_rx += 2; - to_add -= 2; - - ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == - vlib_buffer_is_known (vm, bi0)); - ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == - vlib_buffer_is_known (vm, bi1)); - ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == - vlib_buffer_is_known (vm, fi0)); - ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == - vlib_buffer_is_known (vm, fi1)); - - b0 = vlib_get_buffer (vm, bi0); - b1 = vlib_get_buffer (vm, bi1); - - /* - * Turn this on if you run into - * "bad monkey" contexts, and you want to know exactly - * which nodes they've visited... See main.c... - */ - VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); - VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1); - - CLIB_PREFETCH (b0->data, CLIB_CACHE_LINE_BYTES, LOAD); - CLIB_PREFETCH (b1->data, CLIB_CACHE_LINE_BYTES, LOAD); - - is_eop0 = (s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; - is_eop1 = (s21 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; - - ixge_rx_next_and_error_from_status_x2 (xd, s00, s20, s01, s21, - &next0, &error0, &flags0, - &next1, &error1, &flags1); - - next0 = is_sop ? next0 : next_index_sop; - next1 = is_eop0 ? next1 : next0; - next_index_sop = next1; - - b0->flags |= flags0 | (!is_eop0 << VLIB_BUFFER_LOG2_NEXT_PRESENT); - b1->flags |= flags1 | (!is_eop1 << VLIB_BUFFER_LOG2_NEXT_PRESENT); - - vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; - vnet_buffer (b1)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; - vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; - vnet_buffer (b1)->sw_if_index[VLIB_TX] = (u32) ~ 0; - - b0->error = node->errors[error0]; - b1->error = node->errors[error1]; - - len0 = d0.rx_from_hw.n_packet_bytes_this_descriptor; - len1 = d1.rx_from_hw.n_packet_bytes_this_descriptor; - n_bytes += len0 + len1; - n_packets += is_eop0 + is_eop1; - - /* Give new buffers to hardware. */ - d0.rx_to_hw.tail_address = - vlib_get_buffer_data_physical_address (vm, fi0); - d1.rx_to_hw.tail_address = - vlib_get_buffer_data_physical_address (vm, fi1); - d0.rx_to_hw.head_address = d[0].rx_to_hw.tail_address; - d1.rx_to_hw.head_address = d[1].rx_to_hw.tail_address; - d[0].as_u32x4 = d0.as_u32x4; - d[1].as_u32x4 = d1.as_u32x4; - - d += 2; - n_descriptors_left -= 2; - - /* Point to either l2 or l3 header depending on next. */ - l3_offset0 = (is_sop && (next0 != IXGE_RX_NEXT_ETHERNET_INPUT)) - ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s00) : 0; - l3_offset1 = (is_eop0 && (next1 != IXGE_RX_NEXT_ETHERNET_INPUT)) - ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s01) : 0; - - b0->current_length = len0 - l3_offset0; - b1->current_length = len1 - l3_offset1; - b0->current_data = l3_offset0; - b1->current_data = l3_offset1; - - b_last->next_buffer = is_sop ? ~0 : bi0; - b0->next_buffer = is_eop0 ? ~0 : bi1; - bi_last = bi1; - b_last = b1; - - if (CLIB_DEBUG > 0) - { - u32 bi_sop0 = is_sop ? bi0 : bi_sop; - u32 bi_sop1 = is_eop0 ? bi1 : bi_sop0; - - if (is_eop0) - { - u8 *msg = vlib_validate_buffer (vm, bi_sop0, - /* follow_buffer_next */ 1); - ASSERT (!msg); - } - if (is_eop1) - { - u8 *msg = vlib_validate_buffer (vm, bi_sop1, - /* follow_buffer_next */ 1); - ASSERT (!msg); - } - } - if (0) /* "Dave" version */ - { - u32 bi_sop0 = is_sop ? bi0 : bi_sop; - u32 bi_sop1 = is_eop0 ? bi1 : bi_sop0; - - if (is_eop0) - { - to_next[0] = bi_sop0; - to_next++; - n_left_to_next--; - - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, - to_next, n_left_to_next, - bi_sop0, next0); - } - if (is_eop1) - { - to_next[0] = bi_sop1; - to_next++; - n_left_to_next--; - - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, - to_next, n_left_to_next, - bi_sop1, next1); - } - is_sop = is_eop1; - bi_sop = bi_sop1; - } - if (1) /* "Eliot" version */ - { - /* Speculatively enqueue to cached next. */ - u8 saved_is_sop = is_sop; - u32 bi_sop_save = bi_sop; - - bi_sop = saved_is_sop ? bi0 : bi_sop; - to_next[0] = bi_sop; - to_next += is_eop0; - n_left_to_next -= is_eop0; - - bi_sop = is_eop0 ? bi1 : bi_sop; - to_next[0] = bi_sop; - to_next += is_eop1; - n_left_to_next -= is_eop1; - - is_sop = is_eop1; - - if (PREDICT_FALSE - (!(next0 == next_index && next1 == next_index))) - { - /* Undo speculation. */ - to_next -= is_eop0 + is_eop1; - n_left_to_next += is_eop0 + is_eop1; - - /* Re-do both descriptors being careful about where we enqueue. */ - bi_sop = saved_is_sop ? bi0 : bi_sop_save; - if (is_eop0) - { - if (next0 != next_index) - vlib_set_next_frame_buffer (vm, node, next0, bi_sop); - else - { - to_next[0] = bi_sop; - to_next += 1; - n_left_to_next -= 1; - } - } - - bi_sop = is_eop0 ? bi1 : bi_sop; - if (is_eop1) - { - if (next1 != next_index) - vlib_set_next_frame_buffer (vm, node, next1, bi_sop); - else - { - to_next[0] = bi_sop; - to_next += 1; - n_left_to_next -= 1; - } - } - - /* Switch cached next index when next for both packets is the same. */ - if (is_eop0 && is_eop1 && next0 == next1) - { - vlib_put_next_frame (vm, node, next_index, - n_left_to_next); - next_index = next0; - vlib_get_next_frame (vm, node, next_index, - to_next, n_left_to_next); - } - } - } - } - - /* Bail out of dual loop and proceed with single loop. */ - found_hw_owned_descriptor_x2: - - while (n_descriptors_left > 0 && n_left_to_next > 0) - { - vlib_buffer_t *b0; - u32 bi0, fi0, len0, l3_offset0, s20, s00, flags0; - u8 is_eop0, error0, next0; - ixge_descriptor_t d0; - - d0.as_u32x4 = d[0].as_u32x4; - - s20 = d0.rx_from_hw.status[2]; - s00 = d0.rx_from_hw.status[0]; - - if (!(s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE)) - goto found_hw_owned_descriptor_x1; - - bi0 = to_rx[0]; - ASSERT (to_add >= xm->rx_buffers_to_add); - fi0 = to_add[0]; - - to_rx[0] = fi0; - to_rx += 1; - to_add -= 1; - - ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == - vlib_buffer_is_known (vm, bi0)); - ASSERT (VLIB_BUFFER_KNOWN_ALLOCATED == - vlib_buffer_is_known (vm, fi0)); - - b0 = vlib_get_buffer (vm, bi0); - - /* - * Turn this on if you run into - * "bad monkey" contexts, and you want to know exactly - * which nodes they've visited... - */ - VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); - - is_eop0 = (s20 & IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET) != 0; - ixge_rx_next_and_error_from_status_x1 - (xd, s00, s20, &next0, &error0, &flags0); - - next0 = is_sop ? next0 : next_index_sop; - next_index_sop = next0; - - b0->flags |= flags0 | (!is_eop0 << VLIB_BUFFER_LOG2_NEXT_PRESENT); - - vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index; - vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0; - - b0->error = node->errors[error0]; - - len0 = d0.rx_from_hw.n_packet_bytes_this_descriptor; - n_bytes += len0; - n_packets += is_eop0; - - /* Give new buffer to hardware. */ - d0.rx_to_hw.tail_address = - vlib_get_buffer_data_physical_address (vm, fi0); - d0.rx_to_hw.head_address = d0.rx_to_hw.tail_address; - d[0].as_u32x4 = d0.as_u32x4; - - d += 1; - n_descriptors_left -= 1; - - /* Point to either l2 or l3 header depending on next. */ - l3_offset0 = (is_sop && (next0 != IXGE_RX_NEXT_ETHERNET_INPUT)) - ? IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET (s00) : 0; - b0->current_length = len0 - l3_offset0; - b0->current_data = l3_offset0; - - b_last->next_buffer = is_sop ? ~0 : bi0; - bi_last = bi0; - b_last = b0; - - bi_sop = is_sop ? bi0 : bi_sop; - - if (CLIB_DEBUG > 0 && is_eop0) - { - u8 *msg = - vlib_validate_buffer (vm, bi_sop, /* follow_buffer_next */ 1); - ASSERT (!msg); - } - - if (0) /* "Dave" version */ - { - if (is_eop0) - { - to_next[0] = bi_sop; - to_next++; - n_left_to_next--; - - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, - to_next, n_left_to_next, - bi_sop, next0); - } - } - if (1) /* "Eliot" version */ - { - if (PREDICT_TRUE (next0 == next_index)) - { - to_next[0] = bi_sop; - to_next += is_eop0; - n_left_to_next -= is_eop0; - } - else - { - if (next0 != next_index && is_eop0) - vlib_set_next_frame_buffer (vm, node, next0, bi_sop); - - vlib_put_next_frame (vm, node, next_index, n_left_to_next); - next_index = next0; - vlib_get_next_frame (vm, node, next_index, - to_next, n_left_to_next); - } - } - is_sop = is_eop0; - } - vlib_put_next_frame (vm, node, next_index, n_left_to_next); - } - -found_hw_owned_descriptor_x1: - if (n_descriptors_left > 0) - vlib_put_next_frame (vm, node, next_index, n_left_to_next); - - _vec_len (xm->rx_buffers_to_add) = (to_add + 1) - xm->rx_buffers_to_add; - - { - u32 n_done = n_descriptors - n_descriptors_left; - - if (n_trace > 0 && n_done > 0) - { - u32 n = clib_min (n_trace, n_done); - ixge_rx_trace (xm, xd, dq, - d_trace_save, - d_trace_buffers, - &dq->descriptors[start_descriptor_index], n); - vlib_set_trace_count (vm, node, n_trace - n); - } - if (d_trace_save) - { - _vec_len (d_trace_save) = 0; - _vec_len (d_trace_buffers) = 0; - } - - /* Don't keep a reference to b_last if we don't have to. - Otherwise we can over-write a next_buffer pointer after already haven - enqueued a packet. */ - if (is_sop) - { - b_last->next_buffer = ~0; - bi_last = ~0; - } - - dq->rx.n_descriptors_done_this_call = n_done; - dq->rx.n_descriptors_done_total += n_done; - dq->rx.is_start_of_packet = is_sop; - dq->rx.saved_start_of_packet_buffer_index = bi_sop; - dq->rx.saved_last_buffer_index = bi_last; - dq->rx.saved_start_of_packet_next_index = next_index_sop; - dq->rx.next_index = next_index; - dq->rx.n_bytes += n_bytes; - - return n_packets; - } -} - -static uword -ixge_rx_queue (ixge_main_t * xm, - ixge_device_t * xd, - vlib_node_runtime_t * node, u32 queue_index) -{ - ixge_dma_queue_t *dq = - vec_elt_at_index (xd->dma_queues[VLIB_RX], queue_index); - ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, dq->queue_index); - uword n_packets = 0; - u32 hw_head_index, sw_head_index; - - /* One time initialization. */ - if (!dq->rx.node) - { - dq->rx.node = node; - dq->rx.is_start_of_packet = 1; - dq->rx.saved_start_of_packet_buffer_index = ~0; - dq->rx.saved_last_buffer_index = ~0; - } - - dq->rx.next_index = node->cached_next_index; - - dq->rx.n_descriptors_done_total = 0; - dq->rx.n_descriptors_done_this_call = 0; - dq->rx.n_bytes = 0; - - /* Fetch head from hardware and compare to where we think we are. */ - hw_head_index = dr->head_index; - sw_head_index = dq->head_index; - - if (hw_head_index == sw_head_index) - goto done; - - if (hw_head_index < sw_head_index) - { - u32 n_tried = dq->n_descriptors - sw_head_index; - n_packets += ixge_rx_queue_no_wrap (xm, xd, dq, sw_head_index, n_tried); - sw_head_index = - ixge_ring_add (dq, sw_head_index, - dq->rx.n_descriptors_done_this_call); - - if (dq->rx.n_descriptors_done_this_call != n_tried) - goto done; - } - if (hw_head_index >= sw_head_index) - { - u32 n_tried = hw_head_index - sw_head_index; - n_packets += ixge_rx_queue_no_wrap (xm, xd, dq, sw_head_index, n_tried); - sw_head_index = - ixge_ring_add (dq, sw_head_index, - dq->rx.n_descriptors_done_this_call); - } - -done: - dq->head_index = sw_head_index; - dq->tail_index = - ixge_ring_add (dq, dq->tail_index, dq->rx.n_descriptors_done_total); - - /* Give tail back to hardware. */ - CLIB_MEMORY_BARRIER (); - - dr->tail_index = dq->tail_index; - - vlib_increment_combined_counter (vnet_main. - interface_main.combined_sw_if_counters + - VNET_INTERFACE_COUNTER_RX, - 0 /* cpu_index */ , - xd->vlib_sw_if_index, n_packets, - dq->rx.n_bytes); - - return n_packets; -} - -static void -ixge_interrupt (ixge_main_t * xm, ixge_device_t * xd, u32 i) -{ - vlib_main_t *vm = xm->vlib_main; - ixge_regs_t *r = xd->regs; - - if (i != 20) - { - ELOG_TYPE_DECLARE (e) = - { - .function = (char *) __FUNCTION__,.format = - "ixge %d, %s",.format_args = "i1t1",.n_enum_strings = - 16,.enum_strings = - { - "flow director", - "rx miss", - "pci exception", - "mailbox", - "link status change", - "linksec key exchange", - "manageability event", - "reserved23", - "sdp0", - "sdp1", - "sdp2", - "sdp3", - "ecc", "descriptor handler error", "tcp timer", "other",},}; - struct - { - u8 instance; - u8 index; - } *ed; - ed = ELOG_DATA (&vm->elog_main, e); - ed->instance = xd->device_index; - ed->index = i - 16; - } - else - { - u32 v = r->xge_mac.link_status; - uword is_up = (v & (1 << 30)) != 0; - - ELOG_TYPE_DECLARE (e) = - { - .function = (char *) __FUNCTION__,.format = - "ixge %d, link status change 0x%x",.format_args = "i4i4",}; - struct - { - u32 instance, link_status; - } *ed; - ed = ELOG_DATA (&vm->elog_main, e); - ed->instance = xd->device_index; - ed->link_status = v; - xd->link_status_at_last_link_change = v; - - vlib_process_signal_event (vm, ixge_process_node.index, - EVENT_SET_FLAGS, - ((is_up << 31) | xd->vlib_hw_if_index)); - } -} - -always_inline u32 -clean_block (u32 * b, u32 * t, u32 n_left) -{ - u32 *t0 = t; - - while (n_left >= 4) - { - u32 bi0, bi1, bi2, bi3; - - t[0] = bi0 = b[0]; - b[0] = 0; - t += bi0 != 0; - - t[0] = bi1 = b[1]; - b[1] = 0; - t += bi1 != 0; - - t[0] = bi2 = b[2]; - b[2] = 0; - t += bi2 != 0; - - t[0] = bi3 = b[3]; - b[3] = 0; - t += bi3 != 0; - - b += 4; - n_left -= 4; - } - - while (n_left > 0) - { - u32 bi0; - - t[0] = bi0 = b[0]; - b[0] = 0; - t += bi0 != 0; - b += 1; - n_left -= 1; - } - - return t - t0; -} - -static void -ixge_tx_queue (ixge_main_t * xm, ixge_device_t * xd, u32 queue_index) -{ - vlib_main_t *vm = xm->vlib_main; - ixge_dma_queue_t *dq = - vec_elt_at_index (xd->dma_queues[VLIB_TX], queue_index); - u32 n_clean, *b, *t, *t0; - i32 n_hw_owned_descriptors; - i32 first_to_clean, last_to_clean; - u64 hwbp_race = 0; - - /* Handle case where head write back pointer update - * arrives after the interrupt during high PCI bus loads. - */ - while ((dq->head_index == dq->tx.head_index_write_back[0]) && - dq->tx.n_buffers_on_ring && (dq->head_index != dq->tail_index)) - { - hwbp_race++; - if (IXGE_HWBP_RACE_ELOG && (hwbp_race == 1)) - { - ELOG_TYPE_DECLARE (e) = - { - .function = (char *) __FUNCTION__,.format = - "ixge %d tx head index race: head %4d, tail %4d, buffs %4d",.format_args - = "i4i4i4i4",}; - struct - { - u32 instance, head_index, tail_index, n_buffers_on_ring; - } *ed; - ed = ELOG_DATA (&vm->elog_main, e); - ed->instance = xd->device_index; - ed->head_index = dq->head_index; - ed->tail_index = dq->tail_index; - ed->n_buffers_on_ring = dq->tx.n_buffers_on_ring; - } - } - - dq->head_index = dq->tx.head_index_write_back[0]; - n_hw_owned_descriptors = ixge_ring_sub (dq, dq->head_index, dq->tail_index); - ASSERT (dq->tx.n_buffers_on_ring >= n_hw_owned_descriptors); - n_clean = dq->tx.n_buffers_on_ring - n_hw_owned_descriptors; - - if (IXGE_HWBP_RACE_ELOG && hwbp_race) - { - ELOG_TYPE_DECLARE (e) = - { - .function = (char *) __FUNCTION__,.format = - "ixge %d tx head index race: head %4d, hw_owned %4d, n_clean %4d, retries %d",.format_args - = "i4i4i4i4i4",}; - struct - { - u32 instance, head_index, n_hw_owned_descriptors, n_clean, retries; - } *ed; - ed = ELOG_DATA (&vm->elog_main, e); - ed->instance = xd->device_index; - ed->head_index = dq->head_index; - ed->n_hw_owned_descriptors = n_hw_owned_descriptors; - ed->n_clean = n_clean; - ed->retries = hwbp_race; - } - - /* - * This function used to wait until hardware owned zero descriptors. - * At high PPS rates, that doesn't happen until the TX ring is - * completely full of descriptors which need to be cleaned up. - * That, in turn, causes TX ring-full drops and/or long RX service - * interruptions. - */ - if (n_clean == 0) - return; - - /* Clean the n_clean descriptors prior to the reported hardware head */ - last_to_clean = dq->head_index - 1; - last_to_clean = (last_to_clean < 0) ? last_to_clean + dq->n_descriptors : - last_to_clean; - - first_to_clean = (last_to_clean) - (n_clean - 1); - first_to_clean = (first_to_clean < 0) ? first_to_clean + dq->n_descriptors : - first_to_clean; - - vec_resize (xm->tx_buffers_pending_free, dq->n_descriptors - 1); - t0 = t = xm->tx_buffers_pending_free; - b = dq->descriptor_buffer_indices + first_to_clean; - - /* Wrap case: clean from first to end, then start to last */ - if (first_to_clean > last_to_clean) - { - t += clean_block (b, t, (dq->n_descriptors - 1) - first_to_clean); - first_to_clean = 0; - b = dq->descriptor_buffer_indices; - } - - /* Typical case: clean from first to last */ - if (first_to_clean <= last_to_clean) - t += clean_block (b, t, (last_to_clean - first_to_clean) + 1); - - if (t > t0) - { - u32 n = t - t0; - vlib_buffer_free_no_next (vm, t0, n); - ASSERT (dq->tx.n_buffers_on_ring >= n); - dq->tx.n_buffers_on_ring -= n; - _vec_len (xm->tx_buffers_pending_free) = 0; - } -} - -/* RX queue interrupts 0 thru 7; TX 8 thru 15. */ -always_inline uword -ixge_interrupt_is_rx_queue (uword i) -{ - return i < 8; -} - -always_inline uword -ixge_interrupt_is_tx_queue (uword i) -{ - return i >= 8 && i < 16; -} - -always_inline uword -ixge_tx_queue_to_interrupt (uword i) -{ - return 8 + i; -} - -always_inline uword -ixge_rx_queue_to_interrupt (uword i) -{ - return 0 + i; -} - -always_inline uword -ixge_interrupt_rx_queue (uword i) -{ - ASSERT (ixge_interrupt_is_rx_queue (i)); - return i - 0; -} - -always_inline uword -ixge_interrupt_tx_queue (uword i) -{ - ASSERT (ixge_interrupt_is_tx_queue (i)); - return i - 8; -} - -static uword -ixge_device_input (ixge_main_t * xm, - ixge_device_t * xd, vlib_node_runtime_t * node) -{ - ixge_regs_t *r = xd->regs; - u32 i, s; - uword n_rx_packets = 0; - - s = r->interrupt.status_write_1_to_set; - if (s) - r->interrupt.status_write_1_to_clear = s; - - /* *INDENT-OFF* */ - foreach_set_bit (i, s, ({ - if (ixge_interrupt_is_rx_queue (i)) - n_rx_packets += ixge_rx_queue (xm, xd, node, ixge_interrupt_rx_queue (i)); - - else if (ixge_interrupt_is_tx_queue (i)) - ixge_tx_queue (xm, xd, ixge_interrupt_tx_queue (i)); - - else - ixge_interrupt (xm, xd, i); - })); - /* *INDENT-ON* */ - - return n_rx_packets; -} - -static uword -ixge_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) -{ - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd; - uword n_rx_packets = 0; - - if (node->state == VLIB_NODE_STATE_INTERRUPT) - { - uword i; - - /* Loop over devices with interrupts. */ - /* *INDENT-OFF* */ - foreach_set_bit (i, node->runtime_data[0], ({ - xd = vec_elt_at_index (xm->devices, i); - n_rx_packets += ixge_device_input (xm, xd, node); - - /* Re-enable interrupts since we're going to stay in interrupt mode. */ - if (! (node->flags & VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE)) - xd->regs->interrupt.enable_write_1_to_set = ~0; - })); - /* *INDENT-ON* */ - - /* Clear mask of devices with pending interrupts. */ - node->runtime_data[0] = 0; - } - else - { - /* Poll all devices for input/interrupts. */ - vec_foreach (xd, xm->devices) - { - n_rx_packets += ixge_device_input (xm, xd, node); - - /* Re-enable interrupts when switching out of polling mode. */ - if (node->flags & - VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE) - xd->regs->interrupt.enable_write_1_to_set = ~0; - } - } - - return n_rx_packets; -} - -static char *ixge_error_strings[] = { -#define _(n,s) s, - foreach_ixge_error -#undef _ -}; - -/* *INDENT-OFF* */ -VLIB_REGISTER_NODE (ixge_input_node, static) = { - .function = ixge_input, - .type = VLIB_NODE_TYPE_INPUT, - .name = "ixge-input", - - /* Will be enabled if/when hardware is detected. */ - .state = VLIB_NODE_STATE_DISABLED, - - .format_buffer = format_ethernet_header_with_length, - .format_trace = format_ixge_rx_dma_trace, - - .n_errors = IXGE_N_ERROR, - .error_strings = ixge_error_strings, - - .n_next_nodes = IXGE_RX_N_NEXT, - .next_nodes = { - [IXGE_RX_NEXT_DROP] = "error-drop", - [IXGE_RX_NEXT_ETHERNET_INPUT] = "ethernet-input", - [IXGE_RX_NEXT_IP4_INPUT] = "ip4-input", - [IXGE_RX_NEXT_IP6_INPUT] = "ip6-input", - }, -}; - -VLIB_NODE_FUNCTION_MULTIARCH_CLONE (ixge_input) -CLIB_MULTIARCH_SELECT_FN (ixge_input) -/* *INDENT-ON* */ - -static u8 * -format_ixge_device_name (u8 * s, va_list * args) -{ - u32 i = va_arg (*args, u32); - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd = vec_elt_at_index (xm->devices, i); - return format (s, "TenGigabitEthernet%U", - format_vlib_pci_handle, &xd->pci_device.bus_address); -} - -#define IXGE_COUNTER_IS_64_BIT (1 << 0) -#define IXGE_COUNTER_NOT_CLEAR_ON_READ (1 << 1) - -static u8 ixge_counter_flags[] = { -#define _(a,f) 0, -#define _64(a,f) IXGE_COUNTER_IS_64_BIT, - foreach_ixge_counter -#undef _ -#undef _64 -}; - -static void -ixge_update_counters (ixge_device_t * xd) -{ - /* Byte offset for counter registers. */ - static u32 reg_offsets[] = { -#define _(a,f) (a) / sizeof (u32), -#define _64(a,f) _(a,f) - foreach_ixge_counter -#undef _ -#undef _64 - }; - volatile u32 *r = (volatile u32 *) xd->regs; - int i; - - for (i = 0; i < ARRAY_LEN (xd->counters); i++) - { - u32 o = reg_offsets[i]; - xd->counters[i] += r[o]; - if (ixge_counter_flags[i] & IXGE_COUNTER_NOT_CLEAR_ON_READ) - r[o] = 0; - if (ixge_counter_flags[i] & IXGE_COUNTER_IS_64_BIT) - xd->counters[i] += (u64) r[o + 1] << (u64) 32; - } -} - -static u8 * -format_ixge_device_id (u8 * s, va_list * args) -{ - u32 device_id = va_arg (*args, u32); - char *t = 0; - switch (device_id) - { -#define _(f,n) case n: t = #f; break; - foreach_ixge_pci_device_id; -#undef _ - default: - t = 0; - break; - } - if (t == 0) - s = format (s, "unknown 0x%x", device_id); - else - s = format (s, "%s", t); - return s; -} - -static u8 * -format_ixge_link_status (u8 * s, va_list * args) -{ - ixge_device_t *xd = va_arg (*args, ixge_device_t *); - u32 v = xd->link_status_at_last_link_change; - - s = format (s, "%s", (v & (1 << 30)) ? "up" : "down"); - - { - char *modes[] = { - "1g", "10g parallel", "10g serial", "autoneg", - }; - char *speeds[] = { - "unknown", "100m", "1g", "10g", - }; - s = format (s, ", mode %s, speed %s", - modes[(v >> 26) & 3], speeds[(v >> 28) & 3]); - } - - return s; -} - -static u8 * -format_ixge_device (u8 * s, va_list * args) -{ - u32 dev_instance = va_arg (*args, u32); - CLIB_UNUSED (int verbose) = va_arg (*args, int); - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd = vec_elt_at_index (xm->devices, dev_instance); - ixge_phy_t *phy = xd->phys + xd->phy_index; - uword indent = format_get_indent (s); - - ixge_update_counters (xd); - xd->link_status_at_last_link_change = xd->regs->xge_mac.link_status; - - s = format (s, "Intel 8259X: id %U\n%Ulink %U", - format_ixge_device_id, xd->device_id, - format_white_space, indent + 2, format_ixge_link_status, xd); - - { - - s = format (s, "\n%UPCIe %U", format_white_space, indent + 2, - format_vlib_pci_link_speed, &xd->pci_device); - } - - s = format (s, "\n%U", format_white_space, indent + 2); - if (phy->mdio_address != ~0) - s = format (s, "PHY address %d, id 0x%x", phy->mdio_address, phy->id); - else if (xd->sfp_eeprom.id == SFP_ID_sfp) - s = format (s, "SFP %U", format_sfp_eeprom, &xd->sfp_eeprom); - else - s = format (s, "PHY not found"); - - /* FIXME */ - { - ixge_dma_queue_t *dq = vec_elt_at_index (xd->dma_queues[VLIB_RX], 0); - ixge_dma_regs_t *dr = get_dma_regs (xd, VLIB_RX, 0); - u32 hw_head_index = dr->head_index; - u32 sw_head_index = dq->head_index; - u32 nitems; - - nitems = ixge_ring_sub (dq, hw_head_index, sw_head_index); - s = format (s, "\n%U%d unprocessed, %d total buffers on rx queue 0 ring", - format_white_space, indent + 2, nitems, dq->n_descriptors); - - s = format (s, "\n%U%d buffers in driver rx cache", - format_white_space, indent + 2, - vec_len (xm->rx_buffers_to_add)); - - s = format (s, "\n%U%d buffers on tx queue 0 ring", - format_white_space, indent + 2, - xd->dma_queues[VLIB_TX][0].tx.n_buffers_on_ring); - } - { - u32 i; - u64 v; - static char *names[] = { -#define _(a,f) #f, -#define _64(a,f) _(a,f) - foreach_ixge_counter -#undef _ -#undef _64 - }; - - for (i = 0; i < ARRAY_LEN (names); i++) - { - v = xd->counters[i] - xd->counters_last_clear[i]; - if (v != 0) - s = format (s, "\n%U%-40U%16Ld", - format_white_space, indent + 2, - format_c_identifier, names[i], v); - } - } - - return s; -} - -static void -ixge_clear_hw_interface_counters (u32 instance) -{ - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd = vec_elt_at_index (xm->devices, instance); - ixge_update_counters (xd); - memcpy (xd->counters_last_clear, xd->counters, sizeof (xd->counters)); -} - -/* - * Dynamically redirect all pkts from a specific interface - * to the specified node - */ -static void -ixge_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index, - u32 node_index) -{ - ixge_main_t *xm = &ixge_main; - vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); - ixge_device_t *xd = vec_elt_at_index (xm->devices, hw->dev_instance); - - /* Shut off redirection */ - if (node_index == ~0) - { - xd->per_interface_next_index = node_index; - return; - } - - xd->per_interface_next_index = - vlib_node_add_next (xm->vlib_main, ixge_input_node.index, node_index); -} - - -/* *INDENT-OFF* */ -VNET_DEVICE_CLASS (ixge_device_class) = { - .name = "ixge", - .tx_function = ixge_interface_tx, - .format_device_name = format_ixge_device_name, - .format_device = format_ixge_device, - .format_tx_trace = format_ixge_tx_dma_trace, - .clear_counters = ixge_clear_hw_interface_counters, - .admin_up_down_function = ixge_interface_admin_up_down, - .rx_redirect_to_node = ixge_set_interface_next_node, - .flatten_output_chains = 1, -}; -/* *INDENT-ON* */ - -#define IXGE_N_BYTES_IN_RX_BUFFER (2048) // DAW-HACK: Set Rx buffer size so all packets < ETH_MTU_SIZE fit in the buffer (i.e. sop & eop for all descriptors). - -static clib_error_t * -ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) -{ - ixge_main_t *xm = &ixge_main; - vlib_main_t *vm = xm->vlib_main; - ixge_dma_queue_t *dq; - clib_error_t *error = 0; - - vec_validate (xd->dma_queues[rt], queue_index); - dq = vec_elt_at_index (xd->dma_queues[rt], queue_index); - - if (!xm->n_descriptors_per_cache_line) - xm->n_descriptors_per_cache_line = - CLIB_CACHE_LINE_BYTES / sizeof (dq->descriptors[0]); - - if (!xm->n_bytes_in_rx_buffer) - xm->n_bytes_in_rx_buffer = IXGE_N_BYTES_IN_RX_BUFFER; - xm->n_bytes_in_rx_buffer = round_pow2 (xm->n_bytes_in_rx_buffer, 1024); - if (!xm->vlib_buffer_free_list_index) - { - xm->vlib_buffer_free_list_index = - vlib_buffer_get_or_create_free_list (vm, xm->n_bytes_in_rx_buffer, - "ixge rx"); - ASSERT (xm->vlib_buffer_free_list_index != 0); - } - - if (!xm->n_descriptors[rt]) - xm->n_descriptors[rt] = 4 * VLIB_FRAME_SIZE; - - dq->queue_index = queue_index; - dq->n_descriptors = - round_pow2 (xm->n_descriptors[rt], xm->n_descriptors_per_cache_line); - dq->head_index = dq->tail_index = 0; - - dq->descriptors = vlib_physmem_alloc_aligned (vm, &error, - dq->n_descriptors * - sizeof (dq->descriptors[0]), - 128 /* per chip spec */ ); - if (error) - return error; - - memset (dq->descriptors, 0, - dq->n_descriptors * sizeof (dq->descriptors[0])); - vec_resize (dq->descriptor_buffer_indices, dq->n_descriptors); - - if (rt == VLIB_RX) - { - u32 n_alloc, i; - - n_alloc = vlib_buffer_alloc_from_free_list - (vm, dq->descriptor_buffer_indices, - vec_len (dq->descriptor_buffer_indices), - xm->vlib_buffer_free_list_index); - ASSERT (n_alloc == vec_len (dq->descriptor_buffer_indices)); - for (i = 0; i < n_alloc; i++) - { - vlib_buffer_t *b = - vlib_get_buffer (vm, dq->descriptor_buffer_indices[i]); - dq->descriptors[i].rx_to_hw.tail_address = - vlib_physmem_virtual_to_physical (vm, b->data); - } - } - else - { - u32 i; - - dq->tx.head_index_write_back = - vlib_physmem_alloc (vm, &error, CLIB_CACHE_LINE_BYTES); - - for (i = 0; i < dq->n_descriptors; i++) - dq->descriptors[i].tx = xm->tx_descriptor_template; - - vec_validate (xm->tx_buffers_pending_free, dq->n_descriptors - 1); - } - - { - ixge_dma_regs_t *dr = get_dma_regs (xd, rt, queue_index); - u64 a; - - a = vlib_physmem_virtual_to_physical (vm, dq->descriptors); - dr->descriptor_address[0] = a & 0xFFFFFFFF; - dr->descriptor_address[1] = a >> (u64) 32; - dr->n_descriptor_bytes = dq->n_descriptors * sizeof (dq->descriptors[0]); - dq->head_index = dq->tail_index = 0; - - if (rt == VLIB_RX) - { - ASSERT ((xm->n_bytes_in_rx_buffer / 1024) < 32); - dr->rx_split_control = - ( /* buffer size */ ((xm->n_bytes_in_rx_buffer / 1024) << 0) - | ( /* lo free descriptor threshold (units of 64 descriptors) */ - (1 << 22)) | ( /* descriptor type: advanced one buffer */ - (1 << 25)) | ( /* drop if no descriptors available */ - (1 << 28))); - - /* Give hardware all but last 16 cache lines' worth of descriptors. */ - dq->tail_index = dq->n_descriptors - - 16 * xm->n_descriptors_per_cache_line; - } - else - { - /* Make sure its initialized before hardware can get to it. */ - dq->tx.head_index_write_back[0] = dq->head_index; - - a = - vlib_physmem_virtual_to_physical (vm, dq->tx.head_index_write_back); - dr->tx.head_index_write_back_address[0] = /* enable bit */ 1 | a; - dr->tx.head_index_write_back_address[1] = (u64) a >> (u64) 32; - } - - /* DMA on 82599 does not work with [13] rx data write relaxed ordering - and [12] undocumented set. */ - if (rt == VLIB_RX) - dr->dca_control &= ~((1 << 13) | (1 << 12)); - - CLIB_MEMORY_BARRIER (); - - if (rt == VLIB_TX) - { - xd->regs->tx_dma_control |= (1 << 0); - dr->control |= ((32 << 0) /* prefetch threshold */ - | (64 << 8) /* host threshold */ - | (0 << 16) /* writeback threshold */ ); - } - - /* Enable this queue and wait for hardware to initialize - before adding to tail. */ - if (rt == VLIB_TX) - { - dr->control |= 1 << 25; - while (!(dr->control & (1 << 25))) - ; - } - - /* Set head/tail indices and enable DMA. */ - dr->head_index = dq->head_index; - dr->tail_index = dq->tail_index; - } - - return error; -} - -static u32 -ixge_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hw, u32 flags) -{ - ixge_device_t *xd; - ixge_regs_t *r; - u32 old; - ixge_main_t *xm = &ixge_main; - - xd = vec_elt_at_index (xm->devices, hw->dev_instance); - r = xd->regs; - - old = r->filter_control; - - if (flags & ETHERNET_INTERFACE_FLAG_ACCEPT_ALL) - r->filter_control = old | (1 << 9) /* unicast promiscuous */ ; - else - r->filter_control = old & ~(1 << 9); - - return old; -} - -static void -ixge_device_init (ixge_main_t * xm) -{ - vnet_main_t *vnm = vnet_get_main (); - ixge_device_t *xd; - - /* Reset chip(s). */ - vec_foreach (xd, xm->devices) - { - ixge_regs_t *r = xd->regs; - const u32 reset_bit = (1 << 26) | (1 << 3); - - r->control |= reset_bit; - - /* No need to suspend. Timed to take ~1e-6 secs */ - while (r->control & reset_bit) - ; - - /* Software loaded. */ - r->extended_control |= (1 << 28); - - ixge_phy_init (xd); - - /* Register ethernet interface. */ - { - u8 addr8[6]; - u32 i, addr32[2]; - clib_error_t *error; - - addr32[0] = r->rx_ethernet_address0[0][0]; - addr32[1] = r->rx_ethernet_address0[0][1]; - for (i = 0; i < 6; i++) - addr8[i] = addr32[i / 4] >> ((i % 4) * 8); - - error = ethernet_register_interface - (vnm, ixge_device_class.index, xd->device_index, - /* ethernet address */ addr8, - &xd->vlib_hw_if_index, ixge_flag_change); - if (error) - clib_error_report (error); - } - - { - vnet_sw_interface_t *sw = - vnet_get_hw_sw_interface (vnm, xd->vlib_hw_if_index); - xd->vlib_sw_if_index = sw->sw_if_index; - } - - ixge_dma_init (xd, VLIB_RX, /* queue_index */ 0); - - xm->n_descriptors[VLIB_TX] = 20 * VLIB_FRAME_SIZE; - - ixge_dma_init (xd, VLIB_TX, /* queue_index */ 0); - - /* RX/TX queue 0 gets mapped to interrupt bits 0 & 8. */ - r->interrupt.queue_mapping[0] = (( /* valid bit */ (1 << 7) | - ixge_rx_queue_to_interrupt (0)) << 0); - - r->interrupt.queue_mapping[0] |= (( /* valid bit */ (1 << 7) | - ixge_tx_queue_to_interrupt (0)) << 8); - - /* No use in getting too many interrupts. - Limit them to one every 3/4 ring size at line rate - min sized packets. - No need for this since kernel/vlib main loop provides adequate interrupt - limiting scheme. */ - if (0) - { - f64 line_rate_max_pps = - 10e9 / (8 * (64 + /* interframe padding */ 20)); - ixge_throttle_queue_interrupt (r, 0, - .75 * xm->n_descriptors[VLIB_RX] / - line_rate_max_pps); - } - - /* Accept all multicast and broadcast packets. Should really add them - to the dst_ethernet_address register array. */ - r->filter_control |= (1 << 10) | (1 << 8); - - /* Enable frames up to size in mac frame size register. */ - r->xge_mac.control |= 1 << 2; - r->xge_mac.rx_max_frame_size = (9216 + 14) << 16; - - /* Enable all interrupts. */ - if (!IXGE_ALWAYS_POLL) - r->interrupt.enable_write_1_to_set = ~0; - } -} - -static uword -ixge_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) -{ - vnet_main_t *vnm = vnet_get_main (); - ixge_main_t *xm = &ixge_main; - ixge_device_t *xd; - uword event_type, *event_data = 0; - f64 timeout, link_debounce_deadline; - - ixge_device_init (xm); - - /* Clear all counters. */ - vec_foreach (xd, xm->devices) - { - ixge_update_counters (xd); - memset (xd->counters, 0, sizeof (xd->counters)); - } - - timeout = 30.0; - link_debounce_deadline = 1e70; - - while (1) - { - /* 36 bit stat counters could overflow in ~50 secs. - We poll every 30 secs to be conservative. */ - vlib_process_wait_for_event_or_clock (vm, timeout); - - event_type = vlib_process_get_events (vm, &event_data); - - switch (event_type) - { - case EVENT_SET_FLAGS: - /* 1 ms */ - link_debounce_deadline = vlib_time_now (vm) + 1e-3; - timeout = 1e-3; - break; - - case ~0: - /* No events found: timer expired. */ - if (vlib_time_now (vm) > link_debounce_deadline) - { - vec_foreach (xd, xm->devices) - { - ixge_regs_t *r = xd->regs; - u32 v = r->xge_mac.link_status; - uword is_up = (v & (1 << 30)) != 0; - - vnet_hw_interface_set_flags - (vnm, xd->vlib_hw_if_index, - is_up ? VNET_HW_INTERFACE_FLAG_LINK_UP : 0); - } - link_debounce_deadline = 1e70; - timeout = 30.0; - } - break; - - default: - ASSERT (0); - } - - if (event_data) - _vec_len (event_data) = 0; - - /* Query stats every 30 secs. */ - { - f64 now = vlib_time_now (vm); - if (now - xm->time_last_stats_update > 30) - { - xm->time_last_stats_update = now; - vec_foreach (xd, xm->devices) ixge_update_counters (xd); - } - } - } - - return 0; -} - -static vlib_node_registration_t ixge_process_node = { - .function = ixge_process, - .type = VLIB_NODE_TYPE_PROCESS, - .name = "ixge-process", -}; - -clib_error_t * -ixge_init (vlib_main_t * vm) -{ - ixge_main_t *xm = &ixge_main; - clib_error_t *error; - - xm->vlib_main = vm; - memset (&xm->tx_descriptor_template, 0, - sizeof (xm->tx_descriptor_template)); - memset (&xm->tx_descriptor_template_mask, 0, - sizeof (xm->tx_descriptor_template_mask)); - xm->tx_descriptor_template.status0 = - (IXGE_TX_DESCRIPTOR_STATUS0_ADVANCED | - IXGE_TX_DESCRIPTOR_STATUS0_IS_ADVANCED | - IXGE_TX_DESCRIPTOR_STATUS0_INSERT_FCS); - xm->tx_descriptor_template_mask.status0 = 0xffff; - xm->tx_descriptor_template_mask.status1 = 0x00003fff; - - xm->tx_descriptor_template_mask.status0 &= - ~(IXGE_TX_DESCRIPTOR_STATUS0_IS_END_OF_PACKET - | IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS); - xm->tx_descriptor_template_mask.status1 &= - ~(IXGE_TX_DESCRIPTOR_STATUS1_DONE); - - error = vlib_call_init_function (vm, pci_bus_init); - - return error; -} - -VLIB_INIT_FUNCTION (ixge_init); - - -static void -ixge_pci_intr_handler (vlib_pci_device_t * dev) -{ - ixge_main_t *xm = &ixge_main; - vlib_main_t *vm = xm->vlib_main; - - vlib_node_set_interrupt_pending (vm, ixge_input_node.index); - - /* Let node know which device is interrupting. */ - { - vlib_node_runtime_t *rt = - vlib_node_get_runtime (vm, ixge_input_node.index); - rt->runtime_data[0] |= 1 << dev->private_data; - } -} - -static clib_error_t * -ixge_pci_init (vlib_main_t * vm, vlib_pci_device_t * dev) -{ - ixge_main_t *xm = &ixge_main; - clib_error_t *error; - void *r; - ixge_device_t *xd; - - /* Device found: make sure we have dma memory. */ - if (unix_physmem_is_fake (vm)) - return clib_error_return (0, "no physical memory available"); - - error = vlib_pci_map_resource (dev, 0, &r); - if (error) - return error; - - vec_add2 (xm->devices, xd, 1); - - if (vec_len (xm->devices) == 1) - { - ixge_input_node.function = ixge_input_multiarch_select (); - } - - xd->pci_device = dev[0]; - xd->device_id = xd->pci_device.config0.header.device_id; - xd->regs = r; - xd->device_index = xd - xm->devices; - xd->pci_function = dev->bus_address.function; - xd->per_interface_next_index = ~0; - - - /* Chip found so enable node. */ - { - vlib_node_set_state (vm, ixge_input_node.index, - (IXGE_ALWAYS_POLL - ? VLIB_NODE_STATE_POLLING - : VLIB_NODE_STATE_INTERRUPT)); - - dev->private_data = xd->device_index; - } - - if (vec_len (xm->devices) == 1) - { - vlib_register_node (vm, &ixge_process_node); - xm->process_node_index = ixge_process_node.index; - } - - error = vlib_pci_bus_master_enable (dev); - - if (error) - return error; - - return vlib_pci_intr_enable (dev); -} - -/* *INDENT-OFF* */ -PCI_REGISTER_DEVICE (ixge_pci_device_registration,static) = { - .init_function = ixge_pci_init, - .interrupt_handler = ixge_pci_intr_handler, - .supported_devices = { -#define _(t,i) { .vendor_id = PCI_VENDOR_ID_INTEL, .device_id = i, }, - foreach_ixge_pci_device_id -#undef _ - { 0 }, - }, -}; -/* *INDENT-ON* */ - -void -ixge_set_next_node (ixge_rx_next_t next, char *name) -{ - vlib_node_registration_t *r = &ixge_input_node; - - switch (next) - { - case IXGE_RX_NEXT_IP4_INPUT: - case IXGE_RX_NEXT_IP6_INPUT: - case IXGE_RX_NEXT_ETHERNET_INPUT: - r->next_nodes[next] = name; - break; - - default: - clib_warning ("%s: illegal next %d\n", __FUNCTION__, next); - break; - } -} -#endif - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/devices/nic/ixge.h b/src/vnet/devices/nic/ixge.h deleted file mode 100644 index a8e652dc..00000000 --- a/src/vnet/devices/nic/ixge.h +++ /dev/null @@ -1,1293 +0,0 @@ -/* - * Copyright (c) 2016 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef included_ixge_h -#define included_ixge_h - -#include -#include -#include -#include -#include -#include - -typedef volatile struct -{ - /* [31:7] 128 byte aligned. */ - u32 descriptor_address[2]; - u32 n_descriptor_bytes; - - /* [5] rx/tx descriptor dca enable - [6] rx packet head dca enable - [7] rx packet tail dca enable - [9] rx/tx descriptor relaxed order - [11] rx/tx descriptor write back relaxed order - [13] rx/tx data write/read relaxed order - [15] rx head data write relaxed order - [31:24] apic id for cpu's cache. */ - u32 dca_control; - - u32 head_index; - - /* [4:0] tail buffer size (in 1k byte units) - [13:8] head buffer size (in 64 byte units) - [24:22] lo free descriptors threshold (units of 64 descriptors) - [27:25] descriptor type 0 = legacy, 1 = advanced one buffer (e.g. tail), - 2 = advanced header splitting (head + tail), 5 = advanced header - splitting (head only). - [28] drop if no descriptors available. */ - u32 rx_split_control; - - u32 tail_index; - CLIB_PAD_FROM_TO (0x1c, 0x28); - - /* [7:0] rx/tx prefetch threshold - [15:8] rx/tx host threshold - [24:16] rx/tx write back threshold - [25] rx/tx enable - [26] tx descriptor writeback flush - [30] rx strip vlan enable */ - u32 control; - - u32 rx_coallesce_control; - - union - { - struct - { - /* packets bytes lo hi */ - u32 stats[3]; - - u32 unused; - } rx; - - struct - { - u32 unused[2]; - - /* [0] enables head write back. */ - u32 head_index_write_back_address[2]; - } tx; - }; -} ixge_dma_regs_t; - -/* Only advanced descriptors are supported. */ -typedef struct -{ - u64 tail_address; - u64 head_address; -} ixge_rx_to_hw_descriptor_t; - -typedef struct -{ - u32 status[3]; - u16 n_packet_bytes_this_descriptor; - u16 vlan_tag; -} ixge_rx_from_hw_descriptor_t; - -#define IXGE_RX_DESCRIPTOR_STATUS0_IS_LAYER2 (1 << (4 + 11)) -/* Valid if not layer2. */ -#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4 (1 << (4 + 0)) -#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP4_EXT (1 << (4 + 1)) -#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6 (1 << (4 + 2)) -#define IXGE_RX_DESCRIPTOR_STATUS0_IS_IP6_EXT (1 << (4 + 3)) -#define IXGE_RX_DESCRIPTOR_STATUS0_IS_TCP (1 << (4 + 4)) -#define IXGE_RX_DESCRIPTOR_STATUS0_IS_UDP (1 << (4 + 5)) -#define IXGE_RX_DESCRIPTOR_STATUS0_L3_OFFSET(s) (((s) >> 21) & 0x3ff) - -#define IXGE_RX_DESCRIPTOR_STATUS2_IS_OWNED_BY_SOFTWARE (1 << (0 + 0)) -#define IXGE_RX_DESCRIPTOR_STATUS2_IS_END_OF_PACKET (1 << (0 + 1)) -#define IXGE_RX_DESCRIPTOR_STATUS2_IS_VLAN (1 << (0 + 3)) -#define IXGE_RX_DESCRIPTOR_STATUS2_IS_UDP_CHECKSUMMED (1 << (0 + 4)) -#define IXGE_RX_DESCRIPTOR_STATUS2_IS_TCP_CHECKSUMMED (1 << (0 + 5)) -#define IXGE_RX_DESCRIPTOR_STATUS2_IS_IP4_CHECKSUMMED (1 << (0 + 6)) -#define IXGE_RX_DESCRIPTOR_STATUS2_NOT_UNICAST (1 << (0 + 7)) -#define IXGE_RX_DESCRIPTOR_STATUS2_IS_DOUBLE_VLAN (1 << (0 + 9)) -#define IXGE_RX_DESCRIPTOR_STATUS2_UDP_CHECKSUM_ERROR (1 << (0 + 10)) -#define IXGE_RX_DESCRIPTOR_STATUS2_ETHERNET_ERROR (1 << (20 + 9)) -#define IXGE_RX_DESCRIPTOR_STATUS2_TCP_CHECKSUM_ERROR (1 << (20 + 10)) -#define IXGE_RX_DESCRIPTOR_STATUS2_IP4_CHECKSUM_ERROR (1 << (20 + 11)) - -/* For layer2 packets stats0 bottom 3 bits give ether type index from filter. */ -#define IXGE_RX_DESCRIPTOR_STATUS0_LAYER2_ETHERNET_TYPE(s) ((s) & 7) - -typedef struct -{ - u64 buffer_address; - u16 n_bytes_this_buffer; - u16 status0; - u32 status1; -#define IXGE_TX_DESCRIPTOR_STATUS0_ADVANCED (3 << 4) -#define IXGE_TX_DESCRIPTOR_STATUS0_IS_ADVANCED (1 << (8 + 5)) -#define IXGE_TX_DESCRIPTOR_STATUS0_LOG2_REPORT_STATUS (8 + 3) -#define IXGE_TX_DESCRIPTOR_STATUS0_REPORT_STATUS (1 << IXGE_TX_DESCRIPTOR_STATUS0_LOG2_REPORT_STATUS) -#define IXGE_TX_DESCRIPTOR_STATUS0_INSERT_FCS (1 << (8 + 1)) -#define IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET (8 + 0) -#define IXGE_TX_DESCRIPTOR_STATUS0_IS_END_OF_PACKET (1 << IXGE_TX_DESCRIPTOR_STATUS0_LOG2_IS_END_OF_PACKET) -#define IXGE_TX_DESCRIPTOR_STATUS1_DONE (1 << 0) -#define IXGE_TX_DESCRIPTOR_STATUS1_CONTEXT(i) (/* valid */ (1 << 7) | ((i) << 4)) -#define IXGE_TX_DESCRIPTOR_STATUS1_IPSEC_OFFLOAD (1 << (8 + 2)) -#define IXGE_TX_DESCRIPTOR_STATUS1_INSERT_TCP_UDP_CHECKSUM (1 << (8 + 1)) -#define IXGE_TX_DESCRIPTOR_STATUS1_INSERT_IP4_CHECKSUM (1 << (8 + 0)) -#define IXGE_TX_DESCRIPTOR_STATUS0_N_BYTES_THIS_BUFFER(l) ((l) << 0) -#define IXGE_TX_DESCRIPTOR_STATUS1_N_BYTES_IN_PACKET(l) ((l) << 14) -} ixge_tx_descriptor_t; - -typedef struct -{ - struct - { - u8 checksum_start_offset; - u8 checksum_insert_offset; - u16 checksum_end_offset; - } ip, tcp; - u32 status0; - - u8 status1; - - /* Byte offset after UDP/TCP header. */ - u8 payload_offset; - - u16 max_tcp_segment_size; -} __attribute__ ((packed)) ixge_tx_context_descriptor_t; - -typedef union -{ - ixge_rx_to_hw_descriptor_t rx_to_hw; - ixge_rx_from_hw_descriptor_t rx_from_hw; - ixge_tx_descriptor_t tx; - u32x4 as_u32x4; -} ixge_descriptor_t; - -typedef volatile struct -{ - /* [2] pcie master disable - [3] mac reset - [26] global device reset */ - u32 control; - u32 control_alias; - /* [3:2] device id (0 or 1 for dual port chips) - [7] link is up - [17:10] num vfs - [18] io active - [19] pcie master enable status */ - u32 status_read_only; - CLIB_PAD_FROM_TO (0xc, 0x18); - /* [14] pf reset done - [17] relaxed ordering disable - [26] extended vlan enable - [28] driver loaded */ - u32 extended_control; - CLIB_PAD_FROM_TO (0x1c, 0x20); - - /* software definable pins. - sdp_data [7:0] - sdp_is_output [15:8] - sdp_is_native [23:16] - sdp_function [31:24]. - */ - u32 sdp_control; - CLIB_PAD_FROM_TO (0x24, 0x28); - - /* [0] i2c clock in - [1] i2c clock out - [2] i2c data in - [3] i2c data out */ - u32 i2c_control; - CLIB_PAD_FROM_TO (0x2c, 0x4c); - u32 tcp_timer; - - CLIB_PAD_FROM_TO (0x50, 0x200); - - u32 led_control; - - CLIB_PAD_FROM_TO (0x204, 0x600); - u32 core_spare; - CLIB_PAD_FROM_TO (0x604, 0x700); - - struct - { - u32 vflr_events_clear[4]; - u32 mailbox_interrupt_status[4]; - u32 mailbox_interrupt_enable[4]; - CLIB_PAD_FROM_TO (0x730, 0x800); - } pf_foo; - - struct - { - u32 status_write_1_to_clear; - CLIB_PAD_FROM_TO (0x804, 0x808); - u32 status_write_1_to_set; - CLIB_PAD_FROM_TO (0x80c, 0x810); - u32 status_auto_clear_enable; - CLIB_PAD_FROM_TO (0x814, 0x820); - - /* [11:3] minimum inter-interrupt interval - (2e-6 units; 20e-6 units for fast ethernet). - [15] low-latency interrupt moderation enable - [20:16] low-latency interrupt credit - [27:21] interval counter - [31] write disable for credit and counter (write only). */ - u32 throttle0[24]; - - u32 enable_write_1_to_set; - CLIB_PAD_FROM_TO (0x884, 0x888); - u32 enable_write_1_to_clear; - CLIB_PAD_FROM_TO (0x88c, 0x890); - u32 enable_auto_clear; - u32 msi_to_eitr_select; - /* [3:0] spd 0-3 interrupt detection enable - [4] msi-x enable - [5] other clear disable (makes other bits in status not clear on read) - etc. */ - u32 control; - CLIB_PAD_FROM_TO (0x89c, 0x900); - - /* Defines interrupt mapping for 128 rx + 128 tx queues. - 64 x 4 8 bit entries. - For register [i]: - [5:0] bit in interrupt status for rx queue 2*i + 0 - [7] valid bit - [13:8] bit for tx queue 2*i + 0 - [15] valid bit - similar for rx 2*i + 1 and tx 2*i + 1. */ - u32 queue_mapping[64]; - - /* tcp timer [7:0] and other interrupts [15:8] */ - u32 misc_mapping; - CLIB_PAD_FROM_TO (0xa04, 0xa90); - - /* 64 interrupts determined by mappings. */ - u32 status1_write_1_to_clear[4]; - u32 enable1_write_1_to_set[4]; - u32 enable1_write_1_to_clear[4]; - CLIB_PAD_FROM_TO (0xac0, 0xad0); - u32 status1_enable_auto_clear[4]; - CLIB_PAD_FROM_TO (0xae0, 0x1000); - } interrupt; - - ixge_dma_regs_t rx_dma0[64]; - - CLIB_PAD_FROM_TO (0x2000, 0x2140); - u32 dcb_rx_packet_plane_t4_config[8]; - u32 dcb_rx_packet_plane_t4_status[8]; - CLIB_PAD_FROM_TO (0x2180, 0x2300); - - /* reg i defines mapping for 4 rx queues starting at 4*i + 0. */ - u32 rx_queue_stats_mapping[32]; - u32 rx_queue_stats_control; - - CLIB_PAD_FROM_TO (0x2384, 0x2410); - u32 fc_user_descriptor_ptr[2]; - u32 fc_buffer_control; - CLIB_PAD_FROM_TO (0x241c, 0x2420); - u32 fc_rx_dma; - CLIB_PAD_FROM_TO (0x2424, 0x2430); - u32 dcb_packet_plane_control; - CLIB_PAD_FROM_TO (0x2434, 0x2f00); - - u32 rx_dma_control; - u32 pf_queue_drop_enable; - CLIB_PAD_FROM_TO (0x2f08, 0x2f20); - u32 rx_dma_descriptor_cache_config; - CLIB_PAD_FROM_TO (0x2f24, 0x3000); - - /* 1 bit. */ - u32 rx_enable; - CLIB_PAD_FROM_TO (0x3004, 0x3008); - /* [15:0] ether type (little endian) - [31:16] opcode (big endian) */ - u32 flow_control_control; - CLIB_PAD_FROM_TO (0x300c, 0x3020); - /* 3 bit traffic class for each of 8 priorities. */ - u32 rx_priority_to_traffic_class; - CLIB_PAD_FROM_TO (0x3024, 0x3028); - u32 rx_coallesce_data_buffer_control; - CLIB_PAD_FROM_TO (0x302c, 0x3190); - u32 rx_packet_buffer_flush_detect; - CLIB_PAD_FROM_TO (0x3194, 0x3200); - u32 flow_control_tx_timers[4]; /* 2 timer values */ - CLIB_PAD_FROM_TO (0x3210, 0x3220); - u32 flow_control_rx_threshold_lo[8]; - CLIB_PAD_FROM_TO (0x3240, 0x3260); - u32 flow_control_rx_threshold_hi[8]; - CLIB_PAD_FROM_TO (0x3280, 0x32a0); - u32 flow_control_refresh_threshold; - CLIB_PAD_FROM_TO (0x32a4, 0x3c00); - /* For each of 8 traffic classes (units of bytes). */ - u32 rx_packet_buffer_size[8]; - CLIB_PAD_FROM_TO (0x3c20, 0x3d00); - u32 flow_control_config; - CLIB_PAD_FROM_TO (0x3d04, 0x4200); - - struct - { - u32 pcs_config; - CLIB_PAD_FROM_TO (0x4204, 0x4208); - u32 link_control; - u32 link_status; - u32 pcs_debug[2]; - u32 auto_negotiation; - u32 link_partner_ability; - u32 auto_negotiation_tx_next_page; - u32 auto_negotiation_link_partner_next_page; - CLIB_PAD_FROM_TO (0x4228, 0x4240); - } gige_mac; - - struct - { - /* [0] tx crc enable - [2] enable frames up to max frame size register [31:16] - [10] pad frames < 64 bytes if specified by user - [15] loopback enable - [16] mdc hi speed - [17] turn off mdc between mdio packets */ - u32 control; - - /* [5] rx symbol error (all bits clear on read) - [6] rx illegal symbol - [7] rx idle error - [8] rx local fault - [9] rx remote fault */ - u32 status; - - u32 pause_and_pace_control; - CLIB_PAD_FROM_TO (0x424c, 0x425c); - u32 phy_command; - u32 phy_data; - CLIB_PAD_FROM_TO (0x4264, 0x4268); - - /* [31:16] max frame size in bytes. */ - u32 rx_max_frame_size; - CLIB_PAD_FROM_TO (0x426c, 0x4288); - - /* [0] - [2] pcs receive link up? (latch lo) - [7] local fault - [1] - [0] pcs 10g base r capable - [1] pcs 10g base x capable - [2] pcs 10g base w capable - [10] rx local fault - [11] tx local fault - [15:14] 2 => device present at this address (else not present) */ - u32 xgxs_status[2]; - - u32 base_x_pcs_status; - - /* [0] pass unrecognized flow control frames - [1] discard pause frames - [2] rx priority flow control enable (only in dcb mode) - [3] rx flow control enable. */ - u32 flow_control; - - /* [3:0] tx lanes change polarity - [7:4] rx lanes change polarity - [11:8] swizzle tx lanes - [15:12] swizzle rx lanes - 4 x 2 bit tx lane swap - 4 x 2 bit rx lane swap. */ - u32 serdes_control; - - u32 fifo_control; - - /* [0] force link up - [1] autoneg ack2 bit to transmit - [6:2] autoneg selector field to transmit - [8:7] 10g pma/pmd type 0 => xaui, 1 kx4, 2 cx4 - [9] 1g pma/pmd type 0 => sfi, 1 => kx/bx - [10] disable 10g on without main power - [11] restart autoneg on transition to dx power state - [12] restart autoneg - [15:13] link mode: - 0 => 1g no autoneg - 1 => 10g kx4 parallel link no autoneg - 2 => 1g bx autoneg - 3 => 10g sfi serdes - 4 => kx4/kx/kr - 5 => xgmii 1g/100m - 6 => kx4/kx/kr 1g an - 7 kx4/kx/kr sgmii. - [16] kr support - [17] fec requested - [18] fec ability - etc. */ - u32 auto_negotiation_control; - - /* [0] signal detect 1g/100m - [1] fec signal detect - [2] 10g serial pcs fec block lock - [3] 10g serial high error rate - [4] 10g serial pcs block lock - [5] kx/kx4/kr autoneg next page received - [6] kx/kx4/kr backplane autoneg next page received - [7] link status clear to read - [11:8] 10g signal detect (4 lanes) (for serial just lane 0) - [12] 10g serial signal detect - [16:13] 10g parallel lane sync status - [17] 10g parallel align status - [18] 1g sync status - [19] kx/kx4/kr backplane autoneg is idle - [20] 1g autoneg enabled - [21] 1g pcs enabled for sgmii - [22] 10g xgxs enabled - [23] 10g serial fec enabled (forward error detection) - [24] 10g kr pcs enabled - [25] sgmii enabled - [27:26] mac link mode - 0 => 1g - 1 => 10g parallel - 2 => 10g serial - 3 => autoneg - [29:28] link speed - 1 => 100m - 2 => 1g - 3 => 10g - [30] link is up - [31] kx/kx4/kr backplane autoneg completed successfully. */ - u32 link_status; - - /* [17:16] pma/pmd for 10g serial - 0 => kr, 2 => sfi - [18] disable dme pages */ - u32 auto_negotiation_control2; - - CLIB_PAD_FROM_TO (0x42ac, 0x42b0); - u32 link_partner_ability[2]; - CLIB_PAD_FROM_TO (0x42b8, 0x42d0); - u32 manageability_control; - u32 link_partner_next_page[2]; - CLIB_PAD_FROM_TO (0x42dc, 0x42e0); - u32 kr_pcs_control; - u32 kr_pcs_status; - u32 fec_status[2]; - CLIB_PAD_FROM_TO (0x42f0, 0x4314); - u32 sgmii_control; - CLIB_PAD_FROM_TO (0x4318, 0x4324); - u32 link_status2; - CLIB_PAD_FROM_TO (0x4328, 0x4900); - } xge_mac; - - u32 tx_dcb_control; - u32 tx_dcb_descriptor_plane_queue_select; - u32 tx_dcb_descriptor_plane_t1_config; - u32 tx_dcb_descriptor_plane_t1_status; - CLIB_PAD_FROM_TO (0x4910, 0x4950); - - /* For each TC in units of 1k bytes. */ - u32 tx_packet_buffer_thresholds[8]; - CLIB_PAD_FROM_TO (0x4970, 0x4980); - struct - { - u32 mmw; - u32 config; - u32 status; - u32 rate_drift; - } dcb_tx_rate_scheduler; - CLIB_PAD_FROM_TO (0x4990, 0x4a80); - u32 tx_dma_control; - CLIB_PAD_FROM_TO (0x4a84, 0x4a88); - u32 tx_dma_tcp_flags_control[2]; - CLIB_PAD_FROM_TO (0x4a90, 0x4b00); - u32 pf_mailbox[64]; - CLIB_PAD_FROM_TO (0x4c00, 0x5000); - - /* RX */ - u32 checksum_control; - CLIB_PAD_FROM_TO (0x5004, 0x5008); - u32 rx_filter_control; - CLIB_PAD_FROM_TO (0x500c, 0x5010); - u32 management_vlan_tag[8]; - u32 management_udp_tcp_ports[8]; - CLIB_PAD_FROM_TO (0x5050, 0x5078); - /* little endian. */ - u32 extended_vlan_ether_type; - CLIB_PAD_FROM_TO (0x507c, 0x5080); - /* [1] store/dma bad packets - [8] accept all multicast - [9] accept all unicast - [10] accept all broadcast. */ - u32 filter_control; - CLIB_PAD_FROM_TO (0x5084, 0x5088); - /* [15:0] vlan ethernet type (0x8100) little endian - [28] cfi bit expected - [29] drop packets with unexpected cfi bit - [30] vlan filter enable. */ - u32 vlan_control; - CLIB_PAD_FROM_TO (0x508c, 0x5090); - /* [1:0] hi bit of ethernet address for 12 bit index into multicast table - 0 => 47, 1 => 46, 2 => 45, 3 => 43. - [2] enable multicast filter - */ - u32 multicast_control; - CLIB_PAD_FROM_TO (0x5094, 0x5100); - u32 fcoe_rx_control; - CLIB_PAD_FROM_TO (0x5104, 0x5108); - u32 fc_flt_context; - CLIB_PAD_FROM_TO (0x510c, 0x5110); - u32 fc_filter_control; - CLIB_PAD_FROM_TO (0x5114, 0x5120); - u32 rx_message_type_lo; - CLIB_PAD_FROM_TO (0x5124, 0x5128); - /* [15:0] ethernet type (little endian) - [18:16] matche pri in vlan tag - [19] priority match enable - [25:20] virtualization pool - [26] pool enable - [27] is fcoe - [30] ieee 1588 timestamp enable - [31] filter enable. - (See ethernet_type_queue_select.) */ - u32 ethernet_type_queue_filter[8]; - CLIB_PAD_FROM_TO (0x5148, 0x5160); - /* [7:0] l2 ethernet type and - [15:8] l2 ethernet type or */ - u32 management_decision_filters1[8]; - u32 vf_vm_tx_switch_loopback_enable[2]; - u32 rx_time_sync_control; - CLIB_PAD_FROM_TO (0x518c, 0x5190); - u32 management_ethernet_type_filters[4]; - u32 rx_timestamp_attributes_lo; - u32 rx_timestamp_hi; - u32 rx_timestamp_attributes_hi; - CLIB_PAD_FROM_TO (0x51ac, 0x51b0); - u32 pf_virtual_control; - CLIB_PAD_FROM_TO (0x51b4, 0x51d8); - u32 fc_offset_parameter; - CLIB_PAD_FROM_TO (0x51dc, 0x51e0); - u32 vf_rx_enable[2]; - u32 rx_timestamp_lo; - CLIB_PAD_FROM_TO (0x51ec, 0x5200); - /* 12 bits determined by multicast_control - lookup bits in this vector. */ - u32 multicast_enable[128]; - - /* [0] ethernet address [31:0] - [1] [15:0] ethernet address [47:32] - [31] valid bit. - Index 0 is read from eeprom after reset. */ - u32 rx_ethernet_address0[16][2]; - - CLIB_PAD_FROM_TO (0x5480, 0x5800); - u32 wake_up_control; - CLIB_PAD_FROM_TO (0x5804, 0x5808); - u32 wake_up_filter_control; - CLIB_PAD_FROM_TO (0x580c, 0x5818); - u32 multiple_rx_queue_command_82598; - CLIB_PAD_FROM_TO (0x581c, 0x5820); - u32 management_control; - u32 management_filter_control; - CLIB_PAD_FROM_TO (0x5828, 0x5838); - u32 wake_up_ip4_address_valid; - CLIB_PAD_FROM_TO (0x583c, 0x5840); - u32 wake_up_ip4_address_table[4]; - u32 management_control_to_host; - CLIB_PAD_FROM_TO (0x5854, 0x5880); - u32 wake_up_ip6_address_table[4]; - - /* unicast_and broadcast_and vlan_and ip_address_and - etc. */ - u32 management_decision_filters[8]; - - u32 management_ip4_or_ip6_address_filters[4][4]; - CLIB_PAD_FROM_TO (0x58f0, 0x5900); - u32 wake_up_packet_length; - CLIB_PAD_FROM_TO (0x5904, 0x5910); - u32 management_ethernet_address_filters[4][2]; - CLIB_PAD_FROM_TO (0x5930, 0x5a00); - u32 wake_up_packet_memory[32]; - CLIB_PAD_FROM_TO (0x5a80, 0x5c00); - u32 redirection_table_82598[32]; - u32 rss_random_keys_82598[10]; - CLIB_PAD_FROM_TO (0x5ca8, 0x6000); - - ixge_dma_regs_t tx_dma[128]; - - u32 pf_vm_vlan_insert[64]; - u32 tx_dma_tcp_max_alloc_size_requests; - CLIB_PAD_FROM_TO (0x8104, 0x8110); - u32 vf_tx_enable[2]; - CLIB_PAD_FROM_TO (0x8118, 0x8120); - /* [0] dcb mode enable - [1] virtualization mode enable - [3:2] number of tcs/qs per pool. */ - u32 multiple_tx_queues_command; - CLIB_PAD_FROM_TO (0x8124, 0x8200); - u32 pf_vf_anti_spoof[8]; - u32 pf_dma_tx_switch_control; - CLIB_PAD_FROM_TO (0x8224, 0x82e0); - u32 tx_strict_low_latency_queues[4]; - CLIB_PAD_FROM_TO (0x82f0, 0x8600); - u32 tx_queue_stats_mapping_82599[32]; - u32 tx_queue_packet_counts[32]; - u32 tx_queue_byte_counts[32][2]; - - struct - { - u32 control; - u32 status; - u32 buffer_almost_full; - CLIB_PAD_FROM_TO (0x880c, 0x8810); - u32 buffer_min_ifg; - CLIB_PAD_FROM_TO (0x8814, 0x8900); - } tx_security; - - struct - { - u32 index; - u32 salt; - u32 key[4]; - CLIB_PAD_FROM_TO (0x8918, 0x8a00); - } tx_ipsec; - - struct - { - u32 capabilities; - u32 control; - u32 tx_sci[2]; - u32 sa; - u32 sa_pn[2]; - u32 key[2][4]; - /* untagged packets, encrypted packets, protected packets, - encrypted bytes, protected bytes */ - u32 stats[5]; - CLIB_PAD_FROM_TO (0x8a50, 0x8c00); - } tx_link_security; - - struct - { - u32 control; - u32 timestamp_value[2]; - u32 system_time[2]; - u32 increment_attributes; - u32 time_adjustment_offset[2]; - u32 aux_control; - u32 target_time[2][2]; - CLIB_PAD_FROM_TO (0x8c34, 0x8c3c); - u32 aux_time_stamp[2][2]; - CLIB_PAD_FROM_TO (0x8c4c, 0x8d00); - } tx_timesync; - - struct - { - u32 control; - u32 status; - CLIB_PAD_FROM_TO (0x8d08, 0x8e00); - } rx_security; - - struct - { - u32 index; - u32 ip_address[4]; - u32 spi; - u32 ip_index; - u32 key[4]; - u32 salt; - u32 mode; - CLIB_PAD_FROM_TO (0x8e34, 0x8f00); - } rx_ipsec; - - struct - { - u32 capabilities; - u32 control; - u32 sci[2]; - u32 sa[2]; - u32 sa_pn[2]; - u32 key[2][4]; - /* see datasheet */ - u32 stats[17]; - CLIB_PAD_FROM_TO (0x8f84, 0x9000); - } rx_link_security; - - /* 4 wake up, 2 management, 2 wake up. */ - u32 flexible_filters[8][16][4]; - CLIB_PAD_FROM_TO (0x9800, 0xa000); - - /* 4096 bits. */ - u32 vlan_filter[128]; - - /* [0] ethernet address [31:0] - [1] [15:0] ethernet address [47:32] - [31] valid bit. - Index 0 is read from eeprom after reset. */ - u32 rx_ethernet_address1[128][2]; - - /* select one of 64 pools for each rx address. */ - u32 rx_ethernet_address_pool_select[128][2]; - CLIB_PAD_FROM_TO (0xaa00, 0xc800); - u32 tx_priority_to_traffic_class; - CLIB_PAD_FROM_TO (0xc804, 0xcc00); - - /* In bytes units of 1k. Total packet buffer is 160k. */ - u32 tx_packet_buffer_size[8]; - - CLIB_PAD_FROM_TO (0xcc20, 0xcd10); - u32 tx_manageability_tc_mapping; - CLIB_PAD_FROM_TO (0xcd14, 0xcd20); - u32 dcb_tx_packet_plane_t2_config[8]; - u32 dcb_tx_packet_plane_t2_status[8]; - CLIB_PAD_FROM_TO (0xcd60, 0xce00); - - u32 tx_flow_control_status; - CLIB_PAD_FROM_TO (0xce04, 0xd000); - - ixge_dma_regs_t rx_dma1[64]; - - struct - { - /* Bigendian ip4 src/dst address. */ - u32 src_address[128]; - u32 dst_address[128]; - - /* TCP/UDP ports [15:0] src [31:16] dst; bigendian. */ - u32 tcp_udp_port[128]; - - /* [1:0] protocol tcp, udp, sctp, other - [4:2] match priority (highest wins) - [13:8] pool - [25] src address match disable - [26] dst address match disable - [27] src port match disable - [28] dst port match disable - [29] protocol match disable - [30] pool match disable - [31] enable. */ - u32 control[128]; - - /* [12] size bypass - [19:13] must be 0x80 - [20] low-latency interrupt - [27:21] rx queue. */ - u32 interrupt[128]; - } ip4_filters; - - CLIB_PAD_FROM_TO (0xea00, 0xeb00); - /* 4 bit rss output index indexed by 7 bit hash. - 128 8 bit fields = 32 registers. */ - u32 redirection_table_82599[32]; - - u32 rss_random_key_82599[10]; - CLIB_PAD_FROM_TO (0xeba8, 0xec00); - /* [15:0] reserved - [22:16] rx queue index - [29] low-latency interrupt on match - [31] enable */ - u32 ethernet_type_queue_select[8]; - CLIB_PAD_FROM_TO (0xec20, 0xec30); - u32 syn_packet_queue_filter; - CLIB_PAD_FROM_TO (0xec34, 0xec60); - u32 immediate_interrupt_rx_vlan_priority; - CLIB_PAD_FROM_TO (0xec64, 0xec70); - u32 rss_queues_per_traffic_class; - CLIB_PAD_FROM_TO (0xec74, 0xec90); - u32 lli_size_threshold; - CLIB_PAD_FROM_TO (0xec94, 0xed00); - - struct - { - u32 control; - CLIB_PAD_FROM_TO (0xed04, 0xed10); - u32 table[8]; - CLIB_PAD_FROM_TO (0xed30, 0xee00); - } fcoe_redirection; - - struct - { - /* [1:0] packet buffer allocation 0 => disabled, else 64k*2^(f-1) - [3] packet buffer initialization done - [4] perfetch match mode - [5] report status in rss field of rx descriptors - [7] report status always - [14:8] drop queue - [20:16] flex 2 byte packet offset (units of 2 bytes) - [27:24] max linked list length - [31:28] full threshold. */ - u32 control; - CLIB_PAD_FROM_TO (0xee04, 0xee0c); - - u32 data[8]; - - /* [1:0] 0 => no action, 1 => add, 2 => remove, 3 => query. - [2] valid filter found by query command - [3] filter update override - [4] ip6 adress table - [6:5] l4 protocol reserved, udp, tcp, sctp - [7] is ip6 - [8] clear head/tail - [9] packet drop action - [10] matched packet generates low-latency interrupt - [11] last in linked list - [12] collision - [15] rx queue enable - [22:16] rx queue - [29:24] pool. */ - u32 command; - - CLIB_PAD_FROM_TO (0xee30, 0xee3c); - /* ip4 dst/src address, tcp ports, udp ports. - set bits mean bit is ignored. */ - u32 ip4_masks[4]; - u32 filter_length; - u32 usage_stats; - u32 failed_usage_stats; - u32 filters_match_stats; - u32 filters_miss_stats; - CLIB_PAD_FROM_TO (0xee60, 0xee68); - /* Lookup, signature. */ - u32 hash_keys[2]; - /* [15:0] ip6 src address 1 bit per byte - [31:16] ip6 dst address. */ - u32 ip6_mask; - /* [0] vlan id - [1] vlan priority - [2] pool - [3] ip protocol - [4] flex - [5] dst ip6. */ - u32 other_mask; - CLIB_PAD_FROM_TO (0xee78, 0xf000); - } flow_director; - - struct - { - u32 l2_control[64]; - u32 vlan_pool_filter[64]; - u32 vlan_pool_filter_bitmap[128]; - u32 dst_ethernet_address[128]; - u32 mirror_rule[4]; - u32 mirror_rule_vlan[8]; - u32 mirror_rule_pool[8]; - CLIB_PAD_FROM_TO (0xf650, 0x10010); - } pf_bar; - - u32 eeprom_flash_control; - /* [0] start - [1] done - [15:2] address - [31:16] read data. */ - u32 eeprom_read; - CLIB_PAD_FROM_TO (0x10018, 0x1001c); - u32 flash_access; - CLIB_PAD_FROM_TO (0x10020, 0x10114); - u32 flash_data; - u32 flash_control; - u32 flash_read_data; - CLIB_PAD_FROM_TO (0x10120, 0x1013c); - u32 flash_opcode; - u32 software_semaphore; - CLIB_PAD_FROM_TO (0x10144, 0x10148); - u32 firmware_semaphore; - CLIB_PAD_FROM_TO (0x1014c, 0x10160); - u32 software_firmware_sync; - CLIB_PAD_FROM_TO (0x10164, 0x10200); - u32 general_rx_control; - CLIB_PAD_FROM_TO (0x10204, 0x11000); - - struct - { - u32 control; - CLIB_PAD_FROM_TO (0x11004, 0x11010); - /* [3:0] enable counters - [7:4] leaky bucket counter mode - [29] reset - [30] stop - [31] start. */ - u32 counter_control; - /* [7:0],[15:8],[23:16],[31:24] event for counters 0-3. - event codes: - 0x0 bad tlp - 0x10 reqs that reached timeout - etc. */ - u32 counter_event; - CLIB_PAD_FROM_TO (0x11018, 0x11020); - u32 counters_clear_on_read[4]; - u32 counter_config[4]; - struct - { - u32 address; - u32 data; - } indirect_access; - CLIB_PAD_FROM_TO (0x11048, 0x11050); - u32 extended_control; - CLIB_PAD_FROM_TO (0x11054, 0x11064); - u32 mirrored_revision_id; - CLIB_PAD_FROM_TO (0x11068, 0x11070); - u32 dca_requester_id_information; - - /* [0] global disable - [4:1] mode: 0 => legacy, 1 => dca 1.0. */ - u32 dca_control; - CLIB_PAD_FROM_TO (0x11078, 0x110b0); - /* [0] pci completion abort - [1] unsupported i/o address - [2] wrong byte enable - [3] pci timeout */ - u32 pcie_interrupt_status; - CLIB_PAD_FROM_TO (0x110b4, 0x110b8); - u32 pcie_interrupt_enable; - CLIB_PAD_FROM_TO (0x110bc, 0x110c0); - u32 msi_x_pba_clear[8]; - CLIB_PAD_FROM_TO (0x110e0, 0x12300); - } pcie; - - u32 interrupt_throttle1[128 - 24]; - CLIB_PAD_FROM_TO (0x124a0, 0x14f00); - - u32 core_analog_config; - CLIB_PAD_FROM_TO (0x14f04, 0x14f10); - u32 core_common_config; - CLIB_PAD_FROM_TO (0x14f14, 0x15f14); - - u32 link_sec_software_firmware_interface; -} ixge_regs_t; - -typedef union -{ - struct - { - /* Addresses bigendian. */ - union - { - struct - { - ip6_address_t src_address; - u32 unused[1]; - } ip6; - struct - { - u32 unused[3]; - ip4_address_t src_address, dst_address; - } ip4; - }; - - /* [15:0] src port (little endian). - [31:16] dst port. */ - u32 tcp_udp_ports; - - /* [15:0] vlan (cfi bit set to 0). - [31:16] flex bytes. bigendian. */ - u32 vlan_and_flex_word; - - /* [14:0] hash - [15] bucket valid - [31:16] signature (signature filers)/sw-index (perfect match). */ - u32 hash; - }; - - u32 as_u32[8]; -} ixge_flow_director_key_t; - -always_inline void -ixge_throttle_queue_interrupt (ixge_regs_t * r, - u32 queue_interrupt_index, - f64 inter_interrupt_interval_in_secs) -{ - volatile u32 *tr = - (queue_interrupt_index < ARRAY_LEN (r->interrupt.throttle0) - ? &r->interrupt.throttle0[queue_interrupt_index] - : &r->interrupt_throttle1[queue_interrupt_index]); - ASSERT (queue_interrupt_index < 128); - u32 v; - i32 i, mask = (1 << 9) - 1; - - i = flt_round_nearest (inter_interrupt_interval_in_secs / 2e-6); - i = i < 1 ? 1 : i; - i = i >= mask ? mask : i; - - v = tr[0]; - v &= ~(mask << 3); - v |= i << 3; - tr[0] = v; -} - -#define foreach_ixge_counter \ - _ (0x40d0, rx_total_packets) \ - _64 (0x40c0, rx_total_bytes) \ - _ (0x41b0, rx_good_packets_before_filtering) \ - _64 (0x41b4, rx_good_bytes_before_filtering) \ - _ (0x2f50, rx_dma_good_packets) \ - _64 (0x2f54, rx_dma_good_bytes) \ - _ (0x2f5c, rx_dma_duplicated_good_packets) \ - _64 (0x2f60, rx_dma_duplicated_good_bytes) \ - _ (0x2f68, rx_dma_good_loopback_packets) \ - _64 (0x2f6c, rx_dma_good_loopback_bytes) \ - _ (0x2f74, rx_dma_good_duplicated_loopback_packets) \ - _64 (0x2f78, rx_dma_good_duplicated_loopback_bytes) \ - _ (0x4074, rx_good_packets) \ - _64 (0x4088, rx_good_bytes) \ - _ (0x407c, rx_multicast_packets) \ - _ (0x4078, rx_broadcast_packets) \ - _ (0x405c, rx_64_byte_packets) \ - _ (0x4060, rx_65_127_byte_packets) \ - _ (0x4064, rx_128_255_byte_packets) \ - _ (0x4068, rx_256_511_byte_packets) \ - _ (0x406c, rx_512_1023_byte_packets) \ - _ (0x4070, rx_gt_1023_byte_packets) \ - _ (0x4000, rx_crc_errors) \ - _ (0x4120, rx_ip_checksum_errors) \ - _ (0x4004, rx_illegal_symbol_errors) \ - _ (0x4008, rx_error_symbol_errors) \ - _ (0x4034, rx_mac_local_faults) \ - _ (0x4038, rx_mac_remote_faults) \ - _ (0x4040, rx_length_errors) \ - _ (0x41a4, rx_xons) \ - _ (0x41a8, rx_xoffs) \ - _ (0x40a4, rx_undersize_packets) \ - _ (0x40a8, rx_fragments) \ - _ (0x40ac, rx_oversize_packets) \ - _ (0x40b0, rx_jabbers) \ - _ (0x40b4, rx_management_packets) \ - _ (0x40b8, rx_management_drops) \ - _ (0x3fa0, rx_missed_packets_pool_0) \ - _ (0x40d4, tx_total_packets) \ - _ (0x4080, tx_good_packets) \ - _64 (0x4090, tx_good_bytes) \ - _ (0x40f0, tx_multicast_packets) \ - _ (0x40f4, tx_broadcast_packets) \ - _ (0x87a0, tx_dma_good_packets) \ - _64 (0x87a4, tx_dma_good_bytes) \ - _ (0x40d8, tx_64_byte_packets) \ - _ (0x40dc, tx_65_127_byte_packets) \ - _ (0x40e0, tx_128_255_byte_packets) \ - _ (0x40e4, tx_256_511_byte_packets) \ - _ (0x40e8, tx_512_1023_byte_packets) \ - _ (0x40ec, tx_gt_1023_byte_packets) \ - _ (0x4010, tx_undersize_drops) \ - _ (0x8780, switch_security_violation_packets) \ - _ (0x5118, fc_crc_errors) \ - _ (0x241c, fc_rx_drops) \ - _ (0x2424, fc_last_error_count) \ - _ (0x2428, fcoe_rx_packets) \ - _ (0x242c, fcoe_rx_dwords) \ - _ (0x8784, fcoe_tx_packets) \ - _ (0x8788, fcoe_tx_dwords) \ - _ (0x1030, queue_0_rx_count) \ - _ (0x1430, queue_0_drop_count) \ - _ (0x1070, queue_1_rx_count) \ - _ (0x1470, queue_1_drop_count) \ - _ (0x10b0, queue_2_rx_count) \ - _ (0x14b0, queue_2_drop_count) \ - _ (0x10f0, queue_3_rx_count) \ - _ (0x14f0, queue_3_drop_count) \ - _ (0x1130, queue_4_rx_count) \ - _ (0x1530, queue_4_drop_count) \ - _ (0x1170, queue_5_rx_count) \ - _ (0x1570, queue_5_drop_count) \ - _ (0x11b0, queue_6_rx_count) \ - _ (0x15b0, queue_6_drop_count) \ - _ (0x11f0, queue_7_rx_count) \ - _ (0x15f0, queue_7_drop_count) \ - _ (0x1230, queue_8_rx_count) \ - _ (0x1630, queue_8_drop_count) \ - _ (0x1270, queue_9_rx_count) \ - _ (0x1270, queue_9_drop_count) - - - - -typedef enum -{ -#define _(a,f) IXGE_COUNTER_##f, -#define _64(a,f) _(a,f) - foreach_ixge_counter -#undef _ -#undef _64 - IXGE_N_COUNTER, -} ixge_counter_type_t; - -typedef struct -{ - u32 mdio_address; - - /* 32 bit ID read from ID registers. */ - u32 id; -} ixge_phy_t; - -typedef struct -{ - /* Cache aligned descriptors. */ - ixge_descriptor_t *descriptors; - - /* Number of descriptors in table. */ - u32 n_descriptors; - - /* Software head and tail pointers into descriptor ring. */ - u32 head_index, tail_index; - - /* Index into dma_queues vector. */ - u32 queue_index; - - /* Buffer indices corresponding to each active descriptor. */ - u32 *descriptor_buffer_indices; - - union - { - struct - { - u32 *volatile head_index_write_back; - - u32 n_buffers_on_ring; - } tx; - - struct - { - /* Buffer indices to use to replenish each descriptor. */ - u32 *replenish_buffer_indices; - - vlib_node_runtime_t *node; - u32 next_index; - - u32 saved_start_of_packet_buffer_index; - - u32 saved_start_of_packet_next_index; - u32 saved_last_buffer_index; - - u32 is_start_of_packet; - - u32 n_descriptors_done_total; - - u32 n_descriptors_done_this_call; - - u32 n_bytes; - } rx; - }; -} ixge_dma_queue_t; - -#define foreach_ixge_pci_device_id \ - _ (82598, 0x10b6) \ - _ (82598_bx, 0x1508) \ - _ (82598af_dual_port, 0x10c6) \ - _ (82598af_single_port, 0x10c7) \ - _ (82598at, 0x10c8) \ - _ (82598at2, 0x150b) \ - _ (82598eb_sfp_lom, 0x10db) \ - _ (82598eb_cx4, 0x10dd) \ - _ (82598_cx4_dual_port, 0x10ec) \ - _ (82598_da_dual_port, 0x10f1) \ - _ (82598_sr_dual_port_em, 0x10e1) \ - _ (82598eb_xf_lr, 0x10f4) \ - _ (82599_kx4, 0x10f7) \ - _ (82599_kx4_mezz, 0x1514) \ - _ (82599_kr, 0x1517) \ - _ (82599_combo_backplane, 0x10f8) \ - _ (82599_cx4, 0x10f9) \ - _ (82599_sfp, 0x10fb) \ - _ (82599_backplane_fcoe, 0x152a) \ - _ (82599_sfp_fcoe, 0x1529) \ - _ (82599_sfp_em, 0x1507) \ - _ (82599_xaui_lom, 0x10fc) \ - _ (82599_t3_lom, 0x151c) \ - _ (x540t, 0x1528) - -typedef enum -{ -#define _(f,n) IXGE_##f = n, - foreach_ixge_pci_device_id -#undef _ -} ixge_pci_device_id_t; - -typedef struct -{ - /* registers */ - ixge_regs_t *regs; - - /* Specific next index when using dynamic redirection */ - u32 per_interface_next_index; - - /* PCI bus info. */ - vlib_pci_device_t pci_device; - - /* From PCI config space header. */ - ixge_pci_device_id_t device_id; - - u16 device_index; - - /* 0 or 1. */ - u16 pci_function; - - /* VLIB interface for this instance. */ - u32 vlib_hw_if_index, vlib_sw_if_index; - - ixge_dma_queue_t *dma_queues[VLIB_N_RX_TX]; - - /* Phy index (0 or 1) and address on MDI bus. */ - u32 phy_index; - ixge_phy_t phys[2]; - - /* Value of link_status register at last link change. */ - u32 link_status_at_last_link_change; - - i2c_bus_t i2c_bus; - sfp_eeprom_t sfp_eeprom; - - /* Counters. */ - u64 counters[IXGE_N_COUNTER], counters_last_clear[IXGE_N_COUNTER]; -} ixge_device_t; - -typedef struct -{ - vlib_main_t *vlib_main; - - /* Vector of devices. */ - ixge_device_t *devices; - - /* Descriptor ring sizes. */ - u32 n_descriptors[VLIB_N_RX_TX]; - - /* RX buffer size. Must be at least 1k; will be rounded to - next largest 1k size. */ - u32 n_bytes_in_rx_buffer; - - u32 n_descriptors_per_cache_line; - - u32 vlib_buffer_free_list_index; - - u32 process_node_index; - - /* Template and mask for initializing/validating TX descriptors. */ - ixge_tx_descriptor_t tx_descriptor_template, tx_descriptor_template_mask; - - /* Vector of buffers for which TX is done and can be freed. */ - u32 *tx_buffers_pending_free; - - u32 *rx_buffers_to_add; - - f64 time_last_stats_update; -} ixge_main_t; - -ixge_main_t ixge_main; -vnet_device_class_t ixge_device_class; - -typedef enum -{ - IXGE_RX_NEXT_IP4_INPUT, - IXGE_RX_NEXT_IP6_INPUT, - IXGE_RX_NEXT_ETHERNET_INPUT, - IXGE_RX_NEXT_DROP, - IXGE_RX_N_NEXT, -} ixge_rx_next_t; - -void ixge_set_next_node (ixge_rx_next_t, char *); - -#endif /* included_ixge_h */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/devices/nic/sfp.c b/src/vnet/devices/nic/sfp.c deleted file mode 100644 index 9e9c008d..00000000 --- a/src/vnet/devices/nic/sfp.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2016 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -static u8 * -format_space_terminated (u8 * s, va_list * args) -{ - u32 l = va_arg (*args, u32); - u8 *v = va_arg (*args, u8 *); - u8 *p; - - for (p = v + l - 1; p >= v && p[0] == ' '; p--) - ; - vec_add (s, v, clib_min (p - v + 1, l)); - return s; -} - -static u8 * -format_sfp_id (u8 * s, va_list * args) -{ - u32 id = va_arg (*args, u32); - char *t = 0; - switch (id) - { -#define _(f) case SFP_ID_##f: t = #f; break; - foreach_sfp_id -#undef _ - default: - return format (s, "unknown 0x%x", id); - } - return format (s, "%s", t); -} - -static u8 * -format_sfp_compatibility (u8 * s, va_list * args) -{ - u32 c = va_arg (*args, u32); - char *t = 0; - switch (c) - { -#define _(a,b,f) case SFP_COMPATIBILITY_##f: t = #f; break; - foreach_sfp_compatibility -#undef _ - default: - return format (s, "unknown 0x%x", c); - } - return format (s, "%s", t); -} - -u32 -sfp_is_comatible (sfp_eeprom_t * e, sfp_compatibility_t c) -{ - static struct - { - u8 byte, bit; - } t[] = - { -#define _(a,b,f) { .byte = a, .bit = b, }, - foreach_sfp_compatibility -#undef _ - }; - - ASSERT (c < ARRAY_LEN (t)); - return (e->compatibility[t[c].byte] & (1 << t[c].bit)) != 0; -} - -u8 * -format_sfp_eeprom (u8 * s, va_list * args) -{ - sfp_eeprom_t *e = va_arg (*args, sfp_eeprom_t *); - uword indent = format_get_indent (s); - int i; - - if (e->id != SFP_ID_sfp) - s = format (s, "id %U, ", format_sfp_id, e->id); - - s = format (s, "compatibility:"); - for (i = 0; i < SFP_N_COMPATIBILITY; i++) - if (sfp_is_comatible (e, i)) - s = format (s, " %U", format_sfp_compatibility, i); - - s = format (s, "\n%Uvendor: %U, part %U", - format_white_space, indent, - format_space_terminated, sizeof (e->vendor_name), - e->vendor_name, format_space_terminated, - sizeof (e->vendor_part_number), e->vendor_part_number); - s = - format (s, "\n%Urevision: %U, serial: %U, date code: %U", - format_white_space, indent, format_space_terminated, - sizeof (e->vendor_revision), e->vendor_revision, - format_space_terminated, sizeof (e->vendor_serial_number), - e->vendor_serial_number, format_space_terminated, - sizeof (e->vendor_date_code), e->vendor_date_code); - - return s; -} - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/devices/nic/sfp.h b/src/vnet/devices/nic/sfp.h deleted file mode 100644 index a1ac7997..00000000 --- a/src/vnet/devices/nic/sfp.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2016 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef included_vnet_optics_sfp_h -#define included_vnet_optics_sfp_h - -#include - -#define foreach_sfp_id \ - _ (unknown) \ - _ (gbic) \ - _ (on_motherboard) \ - _ (sfp) - -typedef enum -{ -#define _(f) SFP_ID_##f, - foreach_sfp_id -#undef _ -} sfp_id_t; - -typedef struct -{ - u8 id; - u8 extended_id; - u8 connector_type; - u8 compatibility[8]; - u8 encoding; - u8 nominal_bit_rate_100mbits_per_sec; - u8 reserved13; - u8 link_length[5]; - u8 reserved19; - u8 vendor_name[16]; - u8 reserved36; - u8 vendor_oui[3]; - u8 vendor_part_number[16]; - u8 vendor_revision[4]; - /* 16 bit value network byte order. */ - u8 laser_wavelength_in_nm[2]; - u8 reserved62; - u8 checksum_0_to_62; - - u8 options[2]; - u8 max_bit_rate_margin_percent; - u8 min_bit_rate_margin_percent; - u8 vendor_serial_number[16]; - u8 vendor_date_code[8]; - u8 reserved92[3]; - u8 checksum_63_to_94; - u8 vendor_specific[32]; - u8 reserved128[384]; - - /* Vendor specific data follows. */ - u8 vendor_specific1[0]; -} sfp_eeprom_t; - -always_inline uword -sfp_eeprom_is_valid (sfp_eeprom_t * e) -{ - int i; - u8 sum = 0; - for (i = 0; i < 63; i++) - sum += ((u8 *) e)[i]; - return sum == e->checksum_0_to_62; -} - -/* _ (byte_index, bit_index, name) */ -#define foreach_sfp_compatibility \ - _ (0, 4, 10g_base_sr) \ - _ (0, 5, 10g_base_lr) \ - _ (1, 2, oc48_long_reach) \ - _ (1, 1, oc48_intermediate_reach) \ - _ (1, 0, oc48_short_reach) \ - _ (2, 6, oc12_long_reach) \ - _ (2, 5, oc12_intermediate_reach) \ - _ (2, 4, oc12_short_reach) \ - _ (2, 2, oc3_long_reach) \ - _ (2, 1, oc3_intermediate_reach) \ - _ (2, 0, oc3_short_reach) \ - _ (3, 3, 1g_base_t) \ - _ (3, 2, 1g_base_cx) \ - _ (3, 1, 1g_base_lx) \ - _ (3, 0, 1g_base_sx) - -typedef enum -{ -#define _(a,b,f) SFP_COMPATIBILITY_##f, - foreach_sfp_compatibility -#undef _ - SFP_N_COMPATIBILITY, -} sfp_compatibility_t; - -u32 sfp_is_comatible (sfp_eeprom_t * e, sfp_compatibility_t c); - -format_function_t format_sfp_eeprom; - -#endif /* included_vnet_optics_sfp_h */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vnet/ethernet/sfp.c b/src/vnet/ethernet/sfp.c new file mode 100644 index 00000000..624740e3 --- /dev/null +++ b/src/vnet/ethernet/sfp.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +static u8 * +format_space_terminated (u8 * s, va_list * args) +{ + u32 l = va_arg (*args, u32); + u8 *v = va_arg (*args, u8 *); + u8 *p; + + for (p = v + l - 1; p >= v && p[0] == ' '; p--) + ; + vec_add (s, v, clib_min (p - v + 1, l)); + return s; +} + +static u8 * +format_sfp_id (u8 * s, va_list * args) +{ + u32 id = va_arg (*args, u32); + char *t = 0; + switch (id) + { +#define _(f) case SFP_ID_##f: t = #f; break; + foreach_sfp_id +#undef _ + default: + return format (s, "unknown 0x%x", id); + } + return format (s, "%s", t); +} + +static u8 * +format_sfp_compatibility (u8 * s, va_list * args) +{ + u32 c = va_arg (*args, u32); + char *t = 0; + switch (c) + { +#define _(a,b,f) case SFP_COMPATIBILITY_##f: t = #f; break; + foreach_sfp_compatibility +#undef _ + default: + return format (s, "unknown 0x%x", c); + } + return format (s, "%s", t); +} + +u32 +sfp_is_comatible (sfp_eeprom_t * e, sfp_compatibility_t c) +{ + static struct + { + u8 byte, bit; + } t[] = + { +#define _(a,b,f) { .byte = a, .bit = b, }, + foreach_sfp_compatibility +#undef _ + }; + + ASSERT (c < ARRAY_LEN (t)); + return (e->compatibility[t[c].byte] & (1 << t[c].bit)) != 0; +} + +u8 * +format_sfp_eeprom (u8 * s, va_list * args) +{ + sfp_eeprom_t *e = va_arg (*args, sfp_eeprom_t *); + uword indent = format_get_indent (s); + int i; + + if (e->id != SFP_ID_sfp) + s = format (s, "id %U, ", format_sfp_id, e->id); + + s = format (s, "compatibility:"); + for (i = 0; i < SFP_N_COMPATIBILITY; i++) + if (sfp_is_comatible (e, i)) + s = format (s, " %U", format_sfp_compatibility, i); + + s = format (s, "\n%Uvendor: %U, part %U", + format_white_space, indent, + format_space_terminated, sizeof (e->vendor_name), + e->vendor_name, format_space_terminated, + sizeof (e->vendor_part_number), e->vendor_part_number); + s = + format (s, "\n%Urevision: %U, serial: %U, date code: %U", + format_white_space, indent, format_space_terminated, + sizeof (e->vendor_revision), e->vendor_revision, + format_space_terminated, sizeof (e->vendor_serial_number), + e->vendor_serial_number, format_space_terminated, + sizeof (e->vendor_date_code), e->vendor_date_code); + + return s; +} + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vnet/ethernet/sfp.h b/src/vnet/ethernet/sfp.h new file mode 100644 index 00000000..a1ac7997 --- /dev/null +++ b/src/vnet/ethernet/sfp.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_vnet_optics_sfp_h +#define included_vnet_optics_sfp_h + +#include + +#define foreach_sfp_id \ + _ (unknown) \ + _ (gbic) \ + _ (on_motherboard) \ + _ (sfp) + +typedef enum +{ +#define _(f) SFP_ID_##f, + foreach_sfp_id +#undef _ +} sfp_id_t; + +typedef struct +{ + u8 id; + u8 extended_id; + u8 connector_type; + u8 compatibility[8]; + u8 encoding; + u8 nominal_bit_rate_100mbits_per_sec; + u8 reserved13; + u8 link_length[5]; + u8 reserved19; + u8 vendor_name[16]; + u8 reserved36; + u8 vendor_oui[3]; + u8 vendor_part_number[16]; + u8 vendor_revision[4]; + /* 16 bit value network byte order. */ + u8 laser_wavelength_in_nm[2]; + u8 reserved62; + u8 checksum_0_to_62; + + u8 options[2]; + u8 max_bit_rate_margin_percent; + u8 min_bit_rate_margin_percent; + u8 vendor_serial_number[16]; + u8 vendor_date_code[8]; + u8 reserved92[3]; + u8 checksum_63_to_94; + u8 vendor_specific[32]; + u8 reserved128[384]; + + /* Vendor specific data follows. */ + u8 vendor_specific1[0]; +} sfp_eeprom_t; + +always_inline uword +sfp_eeprom_is_valid (sfp_eeprom_t * e) +{ + int i; + u8 sum = 0; + for (i = 0; i < 63; i++) + sum += ((u8 *) e)[i]; + return sum == e->checksum_0_to_62; +} + +/* _ (byte_index, bit_index, name) */ +#define foreach_sfp_compatibility \ + _ (0, 4, 10g_base_sr) \ + _ (0, 5, 10g_base_lr) \ + _ (1, 2, oc48_long_reach) \ + _ (1, 1, oc48_intermediate_reach) \ + _ (1, 0, oc48_short_reach) \ + _ (2, 6, oc12_long_reach) \ + _ (2, 5, oc12_intermediate_reach) \ + _ (2, 4, oc12_short_reach) \ + _ (2, 2, oc3_long_reach) \ + _ (2, 1, oc3_intermediate_reach) \ + _ (2, 0, oc3_short_reach) \ + _ (3, 3, 1g_base_t) \ + _ (3, 2, 1g_base_cx) \ + _ (3, 1, 1g_base_lx) \ + _ (3, 0, 1g_base_sx) + +typedef enum +{ +#define _(a,b,f) SFP_COMPATIBILITY_##f, + foreach_sfp_compatibility +#undef _ + SFP_N_COMPATIBILITY, +} sfp_compatibility_t; + +u32 sfp_is_comatible (sfp_eeprom_t * e, sfp_compatibility_t c); + +format_function_t format_sfp_eeprom; + +#endif /* included_vnet_optics_sfp_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vpp-api/lua/bench.lua b/src/vpp-api/lua/bench.lua index 8e5a0b4b..c7231b90 100644 --- a/src/vpp-api/lua/bench.lua +++ b/src/vpp-api/lua/bench.lua @@ -53,9 +53,9 @@ function do_bench() end root_dir = "/home/ubuntu/vpp" -pneum_path = root_dir .. "/build-root/install-vpp_lite_debug-native/vpp-api/lib64/libpneum.so" +pneum_path = root_dir .. "/build-root/install-vpp_debug-native/vpp-api/lib64/libpneum.so" vpp:init({ pneum_path = pneum_path }) -vpp:json_api(root_dir .. "/build-root/install-vpp_lite_debug-native/vpp/vpp-api/vpe.api.json") +vpp:json_api(root_dir .. "/build-root/install-vpp_debug-native/vpp/vpp-api/vpe.api.json") vpp:connect("lua-bench") local n_tests = 10 diff --git a/src/vpp-api/lua/examples/cli/lua-cli.lua b/src/vpp-api/lua/examples/cli/lua-cli.lua index b3a24d7d..4a27af53 100644 --- a/src/vpp-api/lua/examples/cli/lua-cli.lua +++ b/src/vpp-api/lua/examples/cli/lua-cli.lua @@ -557,12 +557,12 @@ end function init_vpp(vpp) local root_dir = "/home/ubuntu/vpp" - local pneum_path = root_dir .. "/build-root/install-vpp_lite_debug-native/vpp-api/lib64/libpneum.so" + local pneum_path = root_dir .. "/build-root/install-vpp_debug-native/vpp-api/lib64/libpneum.so" vpp:init({ pneum_path = pneum_path }) vpp:init({ pneum_path = pneum_path }) - vpp:json_api(root_dir .. "/build-root/install-vpp_lite_debug-native/vpp/vpp-api/vpe.api.json") + vpp:json_api(root_dir .. "/build-root/install-vpp_debug-native/vpp/vpp-api/vpe.api.json") diff --git a/src/vpp-api/lua/examples/example-classifier.lua b/src/vpp-api/lua/examples/example-classifier.lua index ec9c3d3e..b1270757 100644 --- a/src/vpp-api/lua/examples/example-classifier.lua +++ b/src/vpp-api/lua/examples/example-classifier.lua @@ -20,12 +20,12 @@ local vpp = require "vpp-lapi" local bit = require("bit") root_dir = "/home/ubuntu/vpp" -pneum_path = root_dir .. "/build-root/install-vpp_lite_debug-native/vpp-api/lib64/libpneum.so" +pneum_path = root_dir .. "/build-root/install-vpp_debug-native/vpp-api/lib64/libpneum.so" vpp:init({ pneum_path = pneum_path }) -vpp:json_api(root_dir .. "/build-root/install-vpp_lite_debug-native/vpp/vpp-api/vpe.api.json") +vpp:json_api(root_dir .. "/build-root/install-vpp_debug-native/vpp/vpp-api/vpe.api.json") vpp:connect("aytest") diff --git a/src/vpp-api/lua/examples/example-cli.lua b/src/vpp-api/lua/examples/example-cli.lua index 8b84989f..85425caf 100644 --- a/src/vpp-api/lua/examples/example-cli.lua +++ b/src/vpp-api/lua/examples/example-cli.lua @@ -18,11 +18,11 @@ vpp = require "vpp-lapi" root_dir = "/home/ubuntu/vpp" -pneum_path = root_dir .. "/build-root/install-vpp_lite_debug-native/vpp-api/lib64/libpneum.so" +pneum_path = root_dir .. "/build-root/install-vpp_debug-native/vpp-api/lib64/libpneum.so" vpp:init({ pneum_path = pneum_path }) -vpp:json_api(root_dir .. "/build-root/install-vpp_lite_debug-native/vpp/vpp-api/vpe.api.json") +vpp:json_api(root_dir .. "/build-root/install-vpp_debug-native/vpp/vpp-api/vpe.api.json") vpp:connect("aytest") diff --git a/src/vpp/vnet/main.c b/src/vpp/vnet/main.c index a566d956..d6a12325 100644 --- a/src/vpp/vnet/main.c +++ b/src/vpp/vnet/main.c @@ -199,9 +199,6 @@ defaulted: { vm->init_functions_called = hash_create (0, /* value bytes */ 0); vpe_main_init (vm); -#if DPDK == 0 - unix_physmem_init (vm, 0 /* fail_if_physical_memory_not_present */ ); -#endif return vlib_unix_main (argc, argv); } else diff --git a/test/framework.py b/test/framework.py index 6b1799a5..a0284e37 100644 --- a/test/framework.py +++ b/test/framework.py @@ -157,7 +157,9 @@ class VppTestCase(unittest.TestCase): cls.vpp_cmdline = [cls.vpp_bin, "unix", "{", "nodaemon", debug_cli, coredump_size, "}", "api-trace", "{", "on", "}", - "api-segment", "{", "prefix", cls.shm_prefix, "}"] + "api-segment", "{", "prefix", cls.shm_prefix, "}", + "plugins", "{", "plugin", "dpdk_plugin.so", "{", + "disable", "}", "}"] if cls.plugin_path is not None: cls.vpp_cmdline.extend(["plugin_path", cls.plugin_path]) cls.logger.info("vpp_cmdline: %s" % cls.vpp_cmdline) -- cgit 1.2.3-korg From 20e272c8fce5571122ca149526ee8ddf6f43ee3f Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Tue, 14 Mar 2017 11:10:00 +0100 Subject: vlib: poll pre_input nodes only on main thread Change-Id: I61464fd1610a9754693f31edd72f9fa1b6926511 Signed-off-by: Damjan Marion --- src/vlib/main.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/main.c b/src/vlib/main.c index 91760706..58e88fcc 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -1468,12 +1468,13 @@ vlib_main_or_worker_loop (vlib_main_t * vm, int is_main) } /* Process pre-input nodes. */ - vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_PRE_INPUT]) - cpu_time_now = dispatch_node (vm, n, - VLIB_NODE_TYPE_PRE_INPUT, - VLIB_NODE_STATE_POLLING, - /* frame */ 0, - cpu_time_now); + if (is_main) + vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_PRE_INPUT]) + cpu_time_now = dispatch_node (vm, n, + VLIB_NODE_TYPE_PRE_INPUT, + VLIB_NODE_STATE_POLLING, + /* frame */ 0, + cpu_time_now); /* Next process input nodes. */ vec_foreach (n, nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]) -- cgit 1.2.3-korg From 9d676afbb779da5186cb3869925ef6d7d3d04db1 Mon Sep 17 00:00:00 2001 From: Neale Ranns Date: Wed, 15 Mar 2017 01:28:31 -0700 Subject: No vector allocation during buffer copy Change-Id: I7e8556af833ca0e00fadc96dcd2077ff1104541b Signed-off-by: Neale Ranns --- src/vlib/buffer_funcs.h | 5 +---- src/vnet/fib/mpls_fib.h | 4 +--- test/test_ip_mcast.py | 47 ++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 40 insertions(+), 16 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index f346676b..394c336a 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -476,7 +476,6 @@ vlib_buffer_copy (vlib_main_t * vm, vlib_buffer_t * b) { vlib_buffer_t *s, *d, *fd; uword n_alloc, n_buffers = 1; - u32 *new_buffers = 0; u32 flag_mask = VLIB_BUFFER_NEXT_PRESENT | VLIB_BUFFER_TOTAL_LENGTH_VALID; int i; @@ -486,8 +485,8 @@ vlib_buffer_copy (vlib_main_t * vm, vlib_buffer_t * b) n_buffers++; s = vlib_get_buffer (vm, s->next_buffer); } + u32 new_buffers[n_buffers]; - vec_validate (new_buffers, n_buffers - 1); n_alloc = vlib_buffer_alloc (vm, new_buffers, n_buffers); /* No guarantee that we'll get all the buffers we asked for */ @@ -495,7 +494,6 @@ vlib_buffer_copy (vlib_main_t * vm, vlib_buffer_t * b) { if (n_alloc > 0) vlib_buffer_free (vm, new_buffers, n_alloc); - vec_free (new_buffers); return 0; } @@ -526,7 +524,6 @@ vlib_buffer_copy (vlib_main_t * vm, vlib_buffer_t * b) d->flags = s->flags & flag_mask; } - vec_free (new_buffers); return fd; } diff --git a/src/vnet/fib/mpls_fib.h b/src/vnet/fib/mpls_fib.h index e2ef9253..779decaa 100644 --- a/src/vnet/fib/mpls_fib.h +++ b/src/vnet/fib/mpls_fib.h @@ -28,9 +28,7 @@ static inline mpls_fib_t* mpls_fib_get (fib_node_index_t index) { - if (!pool_is_free_index(mpls_main.fibs, index)) - return (&(pool_elt_at_index(mpls_main.fibs, index)->mpls)); - return (NULL); + return (&(pool_elt_at_index(mpls_main.fibs, index)->mpls)); } extern u32 mpls_fib_table_find_or_create_and_lock(u32 table_id); diff --git a/test/test_ip_mcast.py b/test/test_ip_mcast.py index 957bab5a..864cb803 100644 --- a/test/test_ip_mcast.py +++ b/test/test_ip_mcast.py @@ -8,7 +8,7 @@ from vpp_ip_route import VppIpMRoute, VppMRoutePath, VppMFibSignal from scapy.packet import Raw from scapy.layers.l2 import Ether -from scapy.layers.inet import IP, UDP, getmacbyip +from scapy.layers.inet import IP, UDP, getmacbyip, ICMP from scapy.layers.inet6 import IPv6, getmacbyip6 from util import ppp @@ -70,16 +70,17 @@ class TestIPMcast(VppTestCase): i.resolve_arp() i.resolve_ndp() - def create_stream_ip4(self, src_if, src_ip, dst_ip): + def create_stream_ip4(self, src_if, src_ip, dst_ip, payload_size=0): pkts = [] + # default to small packet sizes + p = (Ether(dst=src_if.local_mac, src=src_if.remote_mac) / + IP(src=src_ip, dst=dst_ip) / + UDP(sport=1234, dport=1234)) + if not payload_size: + payload_size = 64 - len(p) + p = p / Raw('\xa5' * payload_size) + for i in range(0, N_PKTS_IN_STREAM): - info = self.create_packet_info(src_if, src_if) - payload = self.info_to_payload(info) - p = (Ether(dst=src_if.local_mac, src=src_if.remote_mac) / - IP(src=src_ip, dst=dst_ip) / - UDP(sport=1234, dport=1234) / - Raw(payload)) - info.data = p.copy() pkts.append(p) return pkts @@ -237,6 +238,7 @@ class TestIPMcast(VppTestCase): # # a stream that matches the route for (1.1.1.1,232.1.1.1) + # small packets # self.vapi.cli("clear trace") tx = self.create_stream_ip4(self.pg0, "1.1.1.1", "232.1.1.1") @@ -254,6 +256,33 @@ class TestIPMcast(VppTestCase): self.verify_capture_ip4(self.pg6, tx) self.verify_capture_ip4(self.pg7, tx) + # no replications on Pg0 + self.pg0.assert_nothing_captured( + remark="IP multicast packets forwarded on PG0") + self.pg3.assert_nothing_captured( + remark="IP multicast packets forwarded on PG3") + + # + # a stream that matches the route for (1.1.1.1,232.1.1.1) + # large packets + # + self.vapi.cli("clear trace") + tx = self.create_stream_ip4(self.pg0, "1.1.1.1", "232.1.1.1", + payload_size=1024) + self.pg0.add_stream(tx) + + self.pg_enable_capture(self.pg_interfaces) + self.pg_start() + + # We expect replications on Pg1->7 + self.verify_capture_ip4(self.pg1, tx) + self.verify_capture_ip4(self.pg2, tx) + self.verify_capture_ip4(self.pg3, tx) + self.verify_capture_ip4(self.pg4, tx) + self.verify_capture_ip4(self.pg5, tx) + self.verify_capture_ip4(self.pg6, tx) + self.verify_capture_ip4(self.pg7, tx) + # no replications on Pg0 self.pg0.assert_nothing_captured( remark="IP multicast packets forwarded on PG0") -- cgit 1.2.3-korg From e9f929b52ddb741ec1e4cb2d92c6be1e798933a0 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 16 Mar 2017 11:32:09 +0100 Subject: vlib: make runtime_data thread-local Change-Id: I4aa3e7e42fb81211de1aed07dc7befee87a1e18b Signed-off-by: Damjan Marion --- src/vlib/init.h | 1 + src/vlib/main.h | 1 + src/vlib/node.c | 4 +- src/vlib/node.h | 81 +++++++++++++++++++++------------------- src/vlib/threads.c | 61 ++++++++++++++++++++++++++++-- src/vnet/gre/node.c | 26 ++++++++++--- src/vnet/hdlc/node.c | 27 +++++++++----- src/vnet/l2/l2_input_classify.c | 15 ++++++++ src/vnet/l2/l2_output_classify.c | 16 ++++++++ src/vnet/l2tp/l2tp.c | 10 +++++ src/vnet/mpls/node.c | 13 +++++++ src/vnet/ppp/node.c | 27 +++++++++----- src/vnet/tcp/tcp_syn_filter4.c | 23 +++++++----- src/vnet/udp/udp_local.c | 74 ++++++++++++++++++++++-------------- 14 files changed, 273 insertions(+), 106 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/init.h b/src/vlib/init.h index 4fa5b304..12db3f90 100644 --- a/src/vlib/init.h +++ b/src/vlib/init.h @@ -109,6 +109,7 @@ static void __vlib_add_##tag##_function_##x (void) \ } #define VLIB_INIT_FUNCTION(x) VLIB_DECLARE_INIT_FUNCTION(x,init) +#define VLIB_WORKER_INIT_FUNCTION(x) VLIB_DECLARE_INIT_FUNCTION(x,worker_init) #define VLIB_MAIN_LOOP_ENTER_FUNCTION(x) \ VLIB_DECLARE_INIT_FUNCTION(x,main_loop_enter) diff --git a/src/vlib/main.h b/src/vlib/main.h index a6d50b39..98bc823d 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -162,6 +162,7 @@ typedef struct vlib_main_t /* List of init functions to call, setup by constructors */ _vlib_init_function_list_elt_t *init_function_registrations; + _vlib_init_function_list_elt_t *worker_init_function_registrations; _vlib_init_function_list_elt_t *main_loop_enter_function_registrations; _vlib_init_function_list_elt_t *main_loop_exit_function_registrations; _vlib_init_function_list_elt_t *api_init_function_registrations; diff --git a/src/vlib/node.c b/src/vlib/node.c index c419a13a..dc0a4de5 100644 --- a/src/vlib/node.c +++ b/src/vlib/node.c @@ -434,9 +434,7 @@ register_node (vlib_main_t * vm, vlib_node_registration_t * r) rt->errors[i] = vlib_error_set (n->index, i); STATIC_ASSERT_SIZEOF (vlib_node_runtime_t, 128); - ASSERT (vec_len (n->runtime_data) <= - sizeof (vlib_node_runtime_t) - - STRUCT_OFFSET_OF (vlib_node_runtime_t, runtime_data)); + ASSERT (vec_len (n->runtime_data) <= VLIB_NODE_RUNTIME_DATA_SIZE); if (vec_len (n->runtime_data) > 0) clib_memcpy (rt->runtime_data, n->runtime_data, diff --git a/src/vlib/node.h b/src/vlib/node.h index b624e9d6..2a532cc3 100644 --- a/src/vlib/node.h +++ b/src/vlib/node.h @@ -411,65 +411,68 @@ typedef struct typedef struct vlib_node_runtime_t { - CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); - /* Node function to call. */ - vlib_node_function_t *function; + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); /**< cacheline mark */ - /* Vector of errors for this node. */ - vlib_error_t *errors; + vlib_node_function_t *function; /**< Node function to call. */ - /* Number of clock cycles. */ - u32 clocks_since_last_overflow; + vlib_error_t *errors; /**< Vector of errors for this node. */ - /* Maximum clock cycle for an invocation. */ - u32 max_clock; + u32 clocks_since_last_overflow; /**< Number of clock cycles. */ - /* Number of vectors in the recorded max_clock. */ - u32 max_clock_n; + u32 max_clock; /**< Maximum clock cycle for an + invocation. */ - /* Number of calls. */ - u32 calls_since_last_overflow; + u32 max_clock_n; /**< Number of vectors in the recorded + max_clock. */ - /* Number of vector elements processed by this node. */ - u32 vectors_since_last_overflow; + u32 calls_since_last_overflow; /**< Number of calls. */ - /* Start of next frames for this node. */ - u32 next_frame_index; + u32 vectors_since_last_overflow; /**< Number of vector elements + processed by this node. */ - /* Node index. */ - u32 node_index; + u32 next_frame_index; /**< Start of next frames for this + node. */ - /* For input nodes: decremented on each main loop interation until it reaches zero - and function is called. Allows some input nodes to be called - more than others. */ - u32 input_main_loops_per_call; + u32 node_index; /**< Node index. */ - /* Saved main loop counter of last dispatch of this node. */ - u32 main_loop_count_last_dispatch; + u32 input_main_loops_per_call; /**< For input nodes: decremented + on each main loop interation until + it reaches zero and function is + called. Allows some input nodes to + be called more than others. */ + + u32 main_loop_count_last_dispatch; /**< Saved main loop counter of last + dispatch of this node. */ u32 main_loop_vector_stats[2]; - /* Copy of main node flags. */ - u16 flags; + u16 flags; /**< Copy of main node flags. */ - /* Input node state. */ - u16 state; + u16 state; /**< Input node state. */ u16 n_next_nodes; - /* Next frame index that vector arguments were last enqueued to - last time this node ran. Set to zero before first run - of this node. */ - u16 cached_next_index; - - /* CPU this node runs on */ - u16 cpu_index; - - /* Function dependent node-runtime. */ - u8 runtime_data[0]; + u16 cached_next_index; /**< Next frame index that vector + arguments were last enqueued to + last time this node ran. Set to + zero before first run of this + node. */ + + u16 cpu_index; /**< CPU this node runs on */ + + u8 runtime_data[0]; /**< Function dependent + node-runtime data. This data is + thread local, and it is not + cloned from main thread. It needs + to be initialized for each thread + before it is used unless + runtime_data template exists in + vlib_node_t. */ } vlib_node_runtime_t; +#define VLIB_NODE_RUNTIME_DATA_SIZE (sizeof (vlib_node_runtime_t) - STRUCT_OFFSET_OF (vlib_node_runtime_t, runtime_data)) + typedef struct { /* Number of allocated frames for this scalar/vector size. */ diff --git a/src/vlib/threads.c b/src/vlib/threads.c index 07dbff33..3756c3fa 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -633,6 +633,8 @@ start_workers (vlib_main_t * vm) vm_clone->cpu_index = worker_thread_index; vm_clone->heap_base = w->thread_mheap; vm_clone->mbuf_alloc_list = 0; + vm_clone->init_functions_called = + hash_create (0, /* value bytes */ 0); memset (&vm_clone->random_buffer, 0, sizeof (vm_clone->random_buffer)); @@ -674,11 +676,33 @@ start_workers (vlib_main_t * vm) } nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] = vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]); + vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) + { + vlib_node_t *n = vlib_get_node (vm, rt->node_index); + rt->cpu_index = vm_clone->cpu_index; + /* copy initial runtime_data from node */ + if (n->runtime_data_bytes > 0) + clib_memcpy (rt->runtime_data, n->runtime_data, + VLIB_NODE_RUNTIME_DATA_SIZE); + else if (CLIB_DEBUG > 0) + memset (rt->runtime_data, 0xfe, + VLIB_NODE_RUNTIME_DATA_SIZE); + } nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] = vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]); vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) + { + vlib_node_t *n = vlib_get_node (vm, rt->node_index); rt->cpu_index = vm_clone->cpu_index; + /* copy initial runtime_data from node */ + if (n->runtime_data_bytes > 0) + clib_memcpy (rt->runtime_data, n->runtime_data, + VLIB_NODE_RUNTIME_DATA_SIZE); + else if (CLIB_DEBUG > 0) + memset (rt->runtime_data, 0xfe, + VLIB_NODE_RUNTIME_DATA_SIZE); + } nm_clone->processes = vec_dup (nm->processes); @@ -926,26 +950,51 @@ vlib_worker_thread_node_runtime_update (void) clib_mem_free (old_nodes_clone[j]); vec_free (old_nodes_clone); - vec_free (nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]); + /* re-clone internal nodes */ + old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]; nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] = vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]); - /* clone input node runtime */ - old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]; + vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]) + { + vlib_node_t *n = vlib_get_node (vm, rt->node_index); + rt->cpu_index = vm_clone->cpu_index; + /* copy runtime_data, will be overwritten later for existing rt */ + clib_memcpy (rt->runtime_data, n->runtime_data, + VLIB_NODE_RUNTIME_DATA_SIZE); + } + + for (j = 0; j < vec_len (old_rt); j++) + { + rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index); + rt->state = old_rt[j].state; + clib_memcpy (rt->runtime_data, old_rt[j].runtime_data, + VLIB_NODE_RUNTIME_DATA_SIZE); + } + vec_free (old_rt); + + /* re-clone input nodes */ + old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]; nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] = vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]); vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) { + vlib_node_t *n = vlib_get_node (vm, rt->node_index); rt->cpu_index = vm_clone->cpu_index; + /* copy runtime_data, will be overwritten later for existing rt */ + clib_memcpy (rt->runtime_data, n->runtime_data, + VLIB_NODE_RUNTIME_DATA_SIZE); } for (j = 0; j < vec_len (old_rt); j++) { rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index); rt->state = old_rt[j].state; + clib_memcpy (rt->runtime_data, old_rt[j].runtime_data, + VLIB_NODE_RUNTIME_DATA_SIZE); } vec_free (old_rt); @@ -1342,6 +1391,7 @@ vlib_worker_thread_fn (void *arg) vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg; vlib_thread_main_t *tm = vlib_get_thread_main (); vlib_main_t *vm = vlib_get_main (); + clib_error_t *e; ASSERT (vm->cpu_index == os_get_cpu_number ()); @@ -1349,6 +1399,11 @@ vlib_worker_thread_fn (void *arg) clib_time_init (&vm->clib_time); clib_mem_set_heap (w->thread_mheap); + e = vlib_call_init_exit_functions + (vm, vm->worker_init_function_registrations, 1 /* call_once */ ); + if (e) + clib_error_report (e); + /* Wait until the dpdk init sequence is complete */ while (tm->extern_thread_mgmt && tm->worker_thread_release == 0) vlib_worker_thread_barrier_check (); diff --git a/src/vnet/gre/node.c b/src/vnet/gre/node.c index 86f7a6ee..dd16db5e 100644 --- a/src/vnet/gre/node.c +++ b/src/vnet/gre/node.c @@ -448,7 +448,6 @@ gre_register_input_protocol (vlib_main_t * vm, { gre_main_t * em = &gre_main; gre_protocol_info_t * pi; - gre_input_runtime_t * rt; u16 * n; { @@ -464,10 +463,13 @@ gre_register_input_protocol (vlib_main_t * vm, node_index); /* Setup gre protocol -> next index sparse vector mapping. */ - rt = vlib_node_get_runtime_data (vm, gre_input_node.index); - n = sparse_vec_validate (rt->next_by_protocol, - clib_host_to_net_u16 (protocol)); - n[0] = pi->next_index; + foreach_vlib_main ({ + gre_input_runtime_t * rt; + rt = vlib_node_get_runtime_data (this_vlib_main, gre_input_node.index); + n = sparse_vec_validate (rt->next_by_protocol, + clib_host_to_net_u16 (protocol)); + n[0] = pi->next_index; + }); } static void @@ -529,3 +531,17 @@ static clib_error_t * gre_input_init (vlib_main_t * vm) } VLIB_INIT_FUNCTION (gre_input_init); + +static clib_error_t * gre_input_worker_init (vlib_main_t * vm) +{ + gre_input_runtime_t * rt; + + rt = vlib_node_get_runtime_data (vm, gre_input_node.index); + + rt->next_by_protocol = sparse_vec_new + (/* elt bytes */ sizeof (rt->next_by_protocol[0]), + /* bits in index */ BITS (((gre_header_t *) 0)->protocol)); + return 0; +} + +VLIB_WORKER_INIT_FUNCTION (gre_input_worker_init); diff --git a/src/vnet/hdlc/node.c b/src/vnet/hdlc/node.c index 4fe0296a..57e04c85 100644 --- a/src/vnet/hdlc/node.c +++ b/src/vnet/hdlc/node.c @@ -285,18 +285,9 @@ VLIB_REGISTER_NODE (hdlc_input_node) = { .unformat_buffer = unformat_hdlc_header, }; -static clib_error_t * hdlc_input_init (vlib_main_t * vm) +static clib_error_t * hdlc_input_runtime_init (vlib_main_t * vm) { hdlc_input_runtime_t * rt; - - { - clib_error_t * error = vlib_call_init_function (vm, hdlc_init); - if (error) - clib_error_report (error); - } - - hdlc_setup_node (vm, hdlc_input_node.index); - rt = vlib_node_get_runtime_data (vm, hdlc_input_node.index); rt->next_by_protocol = sparse_vec_new @@ -313,7 +304,23 @@ static clib_error_t * hdlc_input_init (vlib_main_t * vm) return 0; } +static clib_error_t * hdlc_input_init (vlib_main_t * vm) +{ + + { + clib_error_t * error = vlib_call_init_function (vm, hdlc_init); + if (error) + clib_error_report (error); + } + + hdlc_setup_node (vm, hdlc_input_node.index); + hdlc_input_runtime_init (vm); + + return 0; +} + VLIB_INIT_FUNCTION (hdlc_input_init); +VLIB_WORKER_INIT_FUNCTION (hdlc_input_runtime_init); void hdlc_register_input_protocol (vlib_main_t * vm, diff --git a/src/vnet/l2/l2_input_classify.c b/src/vnet/l2/l2_input_classify.c index 497df192..485b9abd 100644 --- a/src/vnet/l2/l2_input_classify.c +++ b/src/vnet/l2/l2_input_classify.c @@ -505,6 +505,21 @@ l2_input_classify_init (vlib_main_t * vm) VLIB_INIT_FUNCTION (l2_input_classify_init); +clib_error_t * +l2_input_classify_worker_init (vlib_main_t * vm) +{ + l2_input_classify_main_t *cm = &l2_input_classify_main; + l2_input_classify_runtime_t *rt; + + rt = vlib_node_get_runtime_data (vm, l2_input_classify_node.index); + + rt->l2cm = cm; + rt->vcm = cm->vnet_classify_main; + + return 0; +} + +VLIB_WORKER_INIT_FUNCTION (l2_input_classify_worker_init); /** Enable/disable l2 input classification on a specific interface. */ void diff --git a/src/vnet/l2/l2_output_classify.c b/src/vnet/l2/l2_output_classify.c index 832be1a1..c1bdaddc 100644 --- a/src/vnet/l2/l2_output_classify.c +++ b/src/vnet/l2/l2_output_classify.c @@ -505,6 +505,22 @@ l2_output_classify_init (vlib_main_t * vm) VLIB_INIT_FUNCTION (l2_output_classify_init); +clib_error_t * +l2_output_classify_worker_init (vlib_main_t * vm) +{ + l2_output_classify_main_t *cm = &l2_output_classify_main; + l2_output_classify_runtime_t *rt; + + rt = vlib_node_get_runtime_data (vm, l2_output_classify_node.index); + + rt->l2cm = cm; + rt->vcm = cm->vnet_classify_main; + + return 0; +} + +VLIB_WORKER_INIT_FUNCTION (l2_output_classify_worker_init); + /** Enable/disable l2 input classification on a specific interface. */ void vnet_l2_output_classify_enable_disable (u32 sw_if_index, int enable_disable) diff --git a/src/vnet/l2tp/l2tp.c b/src/vnet/l2tp/l2tp.c index 2d323397..cb94d7e7 100644 --- a/src/vnet/l2tp/l2tp.c +++ b/src/vnet/l2tp/l2tp.c @@ -747,6 +747,16 @@ l2tp_init (vlib_main_t * vm) VLIB_INIT_FUNCTION (l2tp_init); +clib_error_t * +l2tp_worker_init (vlib_main_t * vm) +{ + l2tp_encap_init (vm); + + return 0; +} + +VLIB_WORKER_INIT_FUNCTION (l2tp_worker_init); + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/mpls/node.c b/src/vnet/mpls/node.c index 18100912..03bfaf56 100644 --- a/src/vnet/mpls/node.c +++ b/src/vnet/mpls/node.c @@ -301,3 +301,16 @@ static clib_error_t * mpls_input_init (vlib_main_t * vm) } VLIB_INIT_FUNCTION (mpls_input_init); + +static clib_error_t * mpls_input_worker_init (vlib_main_t * vm) +{ + mpls_input_runtime_t * rt; + rt = vlib_node_get_runtime_data (vm, mpls_input_node.index); + rt->last_label = (u32) ~0; + rt->last_inner_fib_index = 0; + rt->last_outer_fib_index = 0; + rt->mpls_main = &mpls_main; + return 0; +} + +VLIB_WORKER_INIT_FUNCTION (mpls_input_worker_init); diff --git a/src/vnet/ppp/node.c b/src/vnet/ppp/node.c index 4f1f6a71..2f6e0c33 100644 --- a/src/vnet/ppp/node.c +++ b/src/vnet/ppp/node.c @@ -295,18 +295,10 @@ VLIB_REGISTER_NODE (ppp_input_node) = { /* *INDENT-ON* */ static clib_error_t * -ppp_input_init (vlib_main_t * vm) +ppp_input_runtime_init (vlib_main_t * vm) { ppp_input_runtime_t *rt; - { - clib_error_t *error = vlib_call_init_function (vm, ppp_init); - if (error) - clib_error_report (error); - } - - ppp_setup_node (vm, ppp_input_node.index); - rt = vlib_node_get_runtime_data (vm, ppp_input_node.index); rt->next_by_protocol = sparse_vec_new @@ -323,7 +315,24 @@ ppp_input_init (vlib_main_t * vm) return 0; } +static clib_error_t * +ppp_input_init (vlib_main_t * vm) +{ + + { + clib_error_t *error = vlib_call_init_function (vm, ppp_init); + if (error) + clib_error_report (error); + } + + ppp_setup_node (vm, ppp_input_node.index); + ppp_input_runtime_init (vm); + + return 0; +} + VLIB_INIT_FUNCTION (ppp_input_init); +VLIB_WORKER_INIT_FUNCTION (ppp_input_runtime_init); void ppp_register_input_protocol (vlib_main_t * vm, diff --git a/src/vnet/tcp/tcp_syn_filter4.c b/src/vnet/tcp/tcp_syn_filter4.c index c7605a30..9b2a8ac7 100644 --- a/src/vnet/tcp/tcp_syn_filter4.c +++ b/src/vnet/tcp/tcp_syn_filter4.c @@ -450,18 +450,21 @@ syn_filter_enable_disable (u32 sw_if_index, int enable_disable) if (enable_disable) { - vlib_main_t *vm = vlib_get_main (); syn_filter4_runtime_t *rt; - rt = vlib_node_get_runtime_data (vm, syn_filter4_node.index); - vec_validate (rt->syn_counts, 1023); - /* - * Given perfect disperson / optimal hashing results: - * Allow 128k (successful) syns/sec. 1024, buckets each of which - * absorb 128 syns before filtering. Reset table once a second. - * Reality bites, lets try resetting once every 100ms. - */ - rt->reset_interval = 0.1; /* reset interval in seconds */ + /* *INDENT-OFF* */ + foreach_vlib_main ({ + rt = vlib_node_get_runtime_data (this_vlib_main, syn_filter4_node.index); + vec_validate (rt->syn_counts, 1023); + /* + * Given perfect disperson / optimal hashing results: + * Allow 128k (successful) syns/sec. 1024, buckets each of which + * absorb 128 syns before filtering. Reset table once a second. + * Reality bites, lets try resetting once every 100ms. + */ + rt->reset_interval = 0.1; /* reset interval in seconds */ + }); + /* *INDENT-ON* */ } rv = vnet_feature_enable_disable ("ip4-local", "syn-filter-4", diff --git a/src/vnet/udp/udp_local.c b/src/vnet/udp/udp_local.c index 6b239f73..3a60b29b 100644 --- a/src/vnet/udp/udp_local.c +++ b/src/vnet/udp/udp_local.c @@ -520,11 +520,15 @@ udp_register_dst_port (vlib_main_t * vm, : udp6_input_node.index, node_index); /* Setup udp protocol -> next index sparse vector mapping. */ - rt = vlib_node_get_runtime_data - (vm, is_ip4 ? udp4_input_node.index : udp6_input_node.index); - n = sparse_vec_validate (rt->next_by_dst_port, - clib_host_to_net_u16 (dst_port)); - n[0] = pi->next_index; + /* *INDENT-OFF* */ + foreach_vlib_main({ + rt = vlib_node_get_runtime_data + (this_vlib_main, is_ip4 ? udp4_input_node.index : udp6_input_node.index); + n = sparse_vec_validate (rt->next_by_dst_port, + clib_host_to_net_u16 (dst_port)); + n[0] = pi->next_index; + }); + /* *INDENT-ON* */ } void @@ -541,11 +545,15 @@ udp_unregister_dst_port (vlib_main_t * vm, udp_dst_port_t dst_port, u8 is_ip4) return; /* Kill the mapping. Don't bother killing the pi, it may be back. */ - rt = vlib_node_get_runtime_data - (vm, is_ip4 ? udp4_input_node.index : udp6_input_node.index); - n = sparse_vec_validate (rt->next_by_dst_port, - clib_host_to_net_u16 (dst_port)); - n[0] = SPARSE_VEC_INVALID_INDEX; + /* *INDENT-OFF* */ + foreach_vlib_main({ + rt = vlib_node_get_runtime_data + (this_vlib_main, is_ip4 ? udp4_input_node.index : udp6_input_node.index); + n = sparse_vec_validate (rt->next_by_dst_port, + clib_host_to_net_u16 (dst_port)); + n[0] = SPARSE_VEC_INVALID_INDEX; + }); + /* *INDENT-ON* */ } void @@ -604,10 +612,27 @@ udp_setup_node (vlib_main_t * vm, u32 node_index) pn->unformat_edit = unformat_pg_udp_header; } +static void +udp_local_node_runtime_init (vlib_main_t * vm) +{ + udp_input_runtime_t *rt; + + rt = vlib_node_get_runtime_data (vm, udp4_input_node.index); + rt->next_by_dst_port = sparse_vec_new + ( /* elt bytes */ sizeof (rt->next_by_dst_port[0]), + /* bits in index */ BITS (((udp_header_t *) 0)->dst_port)); + rt->punt_unknown = 0; + + rt = vlib_node_get_runtime_data (vm, udp6_input_node.index); + rt->next_by_dst_port = sparse_vec_new + ( /* elt bytes */ sizeof (rt->next_by_dst_port[0]), + /* bits in index */ BITS (((udp_header_t *) 0)->dst_port)); + rt->punt_unknown = 0; +} + clib_error_t * udp_local_init (vlib_main_t * vm) { - udp_input_runtime_t *rt; udp_main_t *um = &udp_main; int i; @@ -628,27 +653,13 @@ udp_local_init (vlib_main_t * vm) udp_setup_node (vm, udp4_input_node.index); udp_setup_node (vm, udp6_input_node.index); - rt = vlib_node_get_runtime_data (vm, udp4_input_node.index); - - rt->next_by_dst_port = sparse_vec_new - ( /* elt bytes */ sizeof (rt->next_by_dst_port[0]), - /* bits in index */ BITS (((udp_header_t *) 0)->dst_port)); - - rt->punt_unknown = 0; + udp_local_node_runtime_init (vm); #define _(n,s) add_dst_port (um, UDP_DST_PORT_##s, #s, 1 /* is_ip4 */); foreach_udp4_dst_port #undef _ - rt = vlib_node_get_runtime_data (vm, udp6_input_node.index); - - rt->next_by_dst_port = sparse_vec_new - ( /* elt bytes */ sizeof (rt->next_by_dst_port[0]), - /* bits in index */ BITS (((udp_header_t *) 0)->dst_port)); - - rt->punt_unknown = 0; - #define _(n,s) add_dst_port (um, UDP_DST_PORT_##s, #s, 0 /* is_ip4 */); - foreach_udp6_dst_port + foreach_udp6_dst_port #undef _ ip4_register_protocol (IP_PROTOCOL_UDP, udp4_input_node.index); /* Note: ip6 differs from ip4, UDP is hotwired to ip6-udp-lookup */ @@ -657,6 +668,15 @@ udp_local_init (vlib_main_t * vm) VLIB_INIT_FUNCTION (udp_local_init); +clib_error_t * +udp_local_worker_init (vlib_main_t * vm) +{ + udp_local_node_runtime_init (vm); + return 0; +} + +VLIB_WORKER_INIT_FUNCTION (udp_local_worker_init); + /* * fd.io coding-style-patch-verification: ON * -- cgit 1.2.3-korg From b6f93a1d1acf0f6ad2cdac0f0ea72842f36776a1 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 16 Mar 2017 17:46:41 +0100 Subject: vlib: additional runtime_data checks Change-Id: I9b6ed9741fae89bdefa6f601398eb63a21155069 Signed-off-by: Damjan Marion --- src/vlib/threads.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/threads.c b/src/vlib/threads.c index 3756c3fa..40789f59 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -681,9 +681,10 @@ start_workers (vlib_main_t * vm) vlib_node_t *n = vlib_get_node (vm, rt->node_index); rt->cpu_index = vm_clone->cpu_index; /* copy initial runtime_data from node */ - if (n->runtime_data_bytes > 0) + if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, n->runtime_data, - VLIB_NODE_RUNTIME_DATA_SIZE); + clib_min (VLIB_NODE_RUNTIME_DATA_SIZE, + n->runtime_data_bytes)); else if (CLIB_DEBUG > 0) memset (rt->runtime_data, 0xfe, VLIB_NODE_RUNTIME_DATA_SIZE); @@ -696,9 +697,10 @@ start_workers (vlib_main_t * vm) vlib_node_t *n = vlib_get_node (vm, rt->node_index); rt->cpu_index = vm_clone->cpu_index; /* copy initial runtime_data from node */ - if (n->runtime_data_bytes > 0) + if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, n->runtime_data, - VLIB_NODE_RUNTIME_DATA_SIZE); + clib_min (VLIB_NODE_RUNTIME_DATA_SIZE, + n->runtime_data_bytes)); else if (CLIB_DEBUG > 0) memset (rt->runtime_data, 0xfe, VLIB_NODE_RUNTIME_DATA_SIZE); @@ -961,8 +963,10 @@ vlib_worker_thread_node_runtime_update (void) vlib_node_t *n = vlib_get_node (vm, rt->node_index); rt->cpu_index = vm_clone->cpu_index; /* copy runtime_data, will be overwritten later for existing rt */ - clib_memcpy (rt->runtime_data, n->runtime_data, - VLIB_NODE_RUNTIME_DATA_SIZE); + if (n->runtime_data && n->runtime_data_bytes > 0) + clib_memcpy (rt->runtime_data, n->runtime_data, + clib_min (VLIB_NODE_RUNTIME_DATA_SIZE, + n->runtime_data_bytes)); } for (j = 0; j < vec_len (old_rt); j++) @@ -985,8 +989,10 @@ vlib_worker_thread_node_runtime_update (void) vlib_node_t *n = vlib_get_node (vm, rt->node_index); rt->cpu_index = vm_clone->cpu_index; /* copy runtime_data, will be overwritten later for existing rt */ - clib_memcpy (rt->runtime_data, n->runtime_data, - VLIB_NODE_RUNTIME_DATA_SIZE); + if (n->runtime_data && n->runtime_data_bytes > 0) + clib_memcpy (rt->runtime_data, n->runtime_data, + clib_min (VLIB_NODE_RUNTIME_DATA_SIZE, + n->runtime_data_bytes)); } for (j = 0; j < vec_len (old_rt); j++) -- cgit 1.2.3-korg From 6aa75af2411a328caa23077778365ea8ba97dcc0 Mon Sep 17 00:00:00 2001 From: Steven Date: Fri, 24 Feb 2017 10:03:22 -0800 Subject: vlib: fix potential crash in dispatch_node ELOG_DATA call dispatch_node may be invoked from vlib main or worker threads. The call to ELOG_DATA in dispatch_node passes the parameter &vm->elog_main. It works fine when dispatch_node is invoked from the main thread. It does bad thing when it is invoked from the worker thread. While we are at it, make two additional enhancements to the same area. 1. Use ELOG_TRACK_DATA instead of ELOG_DATA to enhance g2 viewer presentation. 2. Since ELOG_DATA is in the data path, it could get very chatty. Make the call to ELOG_TRACK_DATA conditional compile. Change-Id: I80ca0eea10bc1e5d0d5549f9844dd9a34dbb65a2 Signed-off-by: Steven --- src/vlib/main.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/main.c b/src/vlib/main.c index 58e88fcc..605771c8 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -1017,6 +1017,7 @@ dispatch_node (vlib_main_t * vm, && (node->flags & VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE))) { +#ifdef DISPATCH_NODE_ELOG_REQUIRED ELOG_TYPE_DECLARE (e) = { .function = (char *) __FUNCTION__,.format = @@ -1028,6 +1029,8 @@ dispatch_node (vlib_main_t * vm, { u32 node_name, vector_length, is_polling; } *ed; + vlib_worker_thread_t *w = vlib_worker_threads + vm->cpu_index; +#endif if (dispatch_state == VLIB_NODE_STATE_INTERRUPT && v >= nm->polling_threshold_vector_length) @@ -1045,10 +1048,13 @@ dispatch_node (vlib_main_t * vm, nm->input_node_counts_by_state[VLIB_NODE_STATE_INTERRUPT] -= 1; nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] += 1; - ed = ELOG_DATA (&vm->elog_main, e); +#ifdef DISPATCH_NODE_ELOG_REQUIRED + ed = ELOG_TRACK_DATA (&vlib_global_main.elog_main, e, + w->elog_track); ed->node_name = n->name_elog_string; ed->vector_length = v; ed->is_polling = 1; +#endif } else if (dispatch_state == VLIB_NODE_STATE_POLLING && v <= nm->interrupt_threshold_vector_length) @@ -1073,10 +1079,13 @@ dispatch_node (vlib_main_t * vm, { node->flags |= VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE; - ed = ELOG_DATA (&vm->elog_main, e); +#ifdef DISPATCH_NODE_ELOG_REQUIRED + ed = ELOG_TRACK_DATA (&vlib_global_main.elog_main, e, + w->elog_track); ed->node_name = n->name_elog_string; ed->vector_length = v; ed->is_polling = 0; +#endif } } } -- cgit 1.2.3-korg From 1bd01099a6512b6119bbf337b36222a6f0770d49 Mon Sep 17 00:00:00 2001 From: Neale Ranns Date: Wed, 15 Mar 2017 15:41:17 -0400 Subject: 64 bit per-thread counters after: TenGigabitEthernet5/0/1-output active 107522 17375708 0 7.22e0 161.60 TenGigabitEthernet5/0/1-tx active 107522 17375708 0 6.93e1 161.60 ip4-input-no-checksum active 107522 17375708 0 2.52e1 161.60 ip4-lookup active 107522 17375708 0 3.10e1 161.60 ip4-rewrite active 107522 17375708 0 2.52e1 161.60 before TenGigabitEthernet5/0/1-output active 433575 110995200 0 6.95e0 256.00 TenGigabitEthernet5/0/1-tx active 433575 110995200 0 7.14e1 256.00 ip4-input-no-checksum active 433575 110995200 0 2.66e1 256.00 ip4-lookup active 433575 110995200 0 3.29e1 256.00 ip4-rewrite active 433575 110995200 0 2.59e1 256.00 Change-Id: I46405bd22189f48a39f06e3443bb7e13f410b539 Signed-off-by: Neale Ranns --- src/vlib/counter.c | 66 +++++++-------- src/vlib/counter.h | 207 ++++++++++++++++------------------------------ src/vnet/ip/ip4_forward.c | 36 ++++---- src/vnet/map/map.c | 2 +- src/vnet/map/map_api.c | 2 +- src/vnet/rewrite.c | 2 +- src/vpp/api/api.c | 2 +- src/vpp/stats/stats.c | 16 ++-- 8 files changed, 132 insertions(+), 201 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/counter.c b/src/vlib/counter.c index 9f66e04d..62f4bd66 100644 --- a/src/vlib/counter.c +++ b/src/vlib/counter.c @@ -42,56 +42,36 @@ void vlib_clear_simple_counters (vlib_simple_counter_main_t * cm) { + counter_t *my_counters; uword i, j; - u16 *my_minis; - for (i = 0; i < vec_len (cm->minis); i++) + for (i = 0; i < vec_len (cm->counters); i++) { - my_minis = cm->minis[i]; + my_counters = cm->counters[i]; - for (j = 0; j < vec_len (my_minis); j++) + for (j = 0; j < vec_len (my_counters); j++) { - cm->maxi[j] += my_minis[j]; - my_minis[j] = 0; + my_counters[j] = 0; } } - - j = vec_len (cm->maxi); - if (j > 0) - vec_validate (cm->value_at_last_clear, j - 1); - for (i = 0; i < j; i++) - cm->value_at_last_clear[i] = cm->maxi[i]; } void vlib_clear_combined_counters (vlib_combined_counter_main_t * cm) { + vlib_counter_t *my_counters; uword i, j; - vlib_mini_counter_t *my_minis; - for (i = 0; i < vec_len (cm->minis); i++) + for (i = 0; i < vec_len (cm->counters); i++) { - my_minis = cm->minis[i]; + my_counters = cm->counters[i]; - for (j = 0; j < vec_len (my_minis); j++) + for (j = 0; j < vec_len (my_counters); j++) { - cm->maxi[j].packets += my_minis[j].packets; - cm->maxi[j].bytes += my_minis[j].bytes; - my_minis[j].packets = 0; - my_minis[j].bytes = 0; + my_counters[j].packets = 0; + my_counters[j].bytes = 0; } } - - j = vec_len (cm->maxi); - if (j > 0) - vec_validate (cm->value_at_last_clear, j - 1); - - for (i = 0; i < j; i++) - { - vlib_counter_t *c = vec_elt_at_index (cm->value_at_last_clear, i); - - c[0] = cm->maxi[i]; - } } void @@ -100,10 +80,9 @@ vlib_validate_simple_counter (vlib_simple_counter_main_t * cm, u32 index) vlib_thread_main_t *tm = vlib_get_thread_main (); int i; - vec_validate (cm->minis, tm->n_vlib_mains - 1); + vec_validate (cm->counters, tm->n_vlib_mains - 1); for (i = 0; i < tm->n_vlib_mains; i++) - vec_validate_aligned (cm->minis[i], index, CLIB_CACHE_LINE_BYTES); - vec_validate_aligned (cm->maxi, index, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (cm->counters[i], index, CLIB_CACHE_LINE_BYTES); } void @@ -112,10 +91,23 @@ vlib_validate_combined_counter (vlib_combined_counter_main_t * cm, u32 index) vlib_thread_main_t *tm = vlib_get_thread_main (); int i; - vec_validate (cm->minis, tm->n_vlib_mains - 1); + vec_validate (cm->counters, tm->n_vlib_mains - 1); for (i = 0; i < tm->n_vlib_mains; i++) - vec_validate_aligned (cm->minis[i], index, CLIB_CACHE_LINE_BYTES); - vec_validate_aligned (cm->maxi, index, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (cm->counters[i], index, CLIB_CACHE_LINE_BYTES); +} + +u32 +vlib_combined_counter_n_counters (const vlib_combined_counter_main_t * cm) +{ + ASSERT (cm->counters); + return (vec_len (cm->counters[0])); +} + +u32 +vlib_simple_counter_n_counters (const vlib_simple_counter_main_t * cm) +{ + ASSERT (cm->counters); + return (vec_len (cm->counters[0])); } void diff --git a/src/vlib/counter.h b/src/vlib/counter.h index abfa89ee..17a85217 100644 --- a/src/vlib/counter.h +++ b/src/vlib/counter.h @@ -44,59 +44,48 @@ Optimized thread-safe counters. - Each vlib_[simple|combined]_counter_main_t consists of a single - vector of thread-safe / atomically-updated u64 counters [the - "maxi" vector], and a (u16 **) per-thread vector [the "minis" - vector] of narrow, per-thread counters. - - The idea is to drastically reduce the number of atomic operations. - In the case of packet counts, we divide the number of atomic ops - by 2**16, etc. + Each vlib_[simple|combined]_counter_main_t consists of a per-thread + vector of per-object counters. + + The idea is to drastically eliminate atomic operations. */ +/** 64bit counters */ +typedef u64 counter_t; + /** A collection of simple counters */ typedef struct { - u16 **minis; /**< Per-thread u16 non-atomic counters */ - u64 *maxi; /**< Shared wide counters */ - u64 *value_at_last_clear; /**< Counter values as of last clear. */ - u64 *value_at_last_serialize; /**< Values as of last serialize. */ + counter_t **counters; /**< Per-thread u64 non-atomic counters */ + counter_t *value_at_last_serialize; /**< Values as of last serialize. */ u32 last_incremental_serialize_index; /**< Last counter index serialized incrementally. */ char *name; /**< The counter collection's name. */ } vlib_simple_counter_main_t; +/** The number of counters (not the number of per-thread counters) */ +u32 vlib_simple_counter_n_counters (const vlib_simple_counter_main_t * cm); + /** Increment a simple counter @param cm - (vlib_simple_counter_main_t *) simple counter main pointer @param cpu_index - (u32) the current cpu index @param index - (u32) index of the counter to increment - @param increment - (u32) quantitiy to add to the counter + @param increment - (u64) quantitiy to add to the counter */ always_inline void vlib_increment_simple_counter (vlib_simple_counter_main_t * cm, - u32 cpu_index, u32 index, u32 increment) + u32 cpu_index, u32 index, u64 increment) { - u16 *my_minis; - u16 *mini; - u32 old, new; - - my_minis = cm->minis[cpu_index]; - mini = vec_elt_at_index (my_minis, index); - old = mini[0]; - new = old + increment; - mini[0] = new; + counter_t *my_counters; - if (PREDICT_FALSE (mini[0] != new)) - { - __sync_fetch_and_add (&cm->maxi[index], new); - my_minis[index] = 0; - } + my_counters = cm->counters[cpu_index]; + my_counters[index] += increment; } /** Get the value of a simple counter - Scrapes the entire set of mini counters. Innacurate unless + Scrapes the entire set of per-thread counters. Innacurate unless worker threads which might increment the counter are barrier-synchronized @@ -104,30 +93,21 @@ vlib_increment_simple_counter (vlib_simple_counter_main_t * cm, @param index - (u32) index of the counter to fetch @returns - (u64) current counter value */ -always_inline u64 +always_inline counter_t vlib_get_simple_counter (vlib_simple_counter_main_t * cm, u32 index) { - u16 *my_minis, *mini; - u64 v; + counter_t *my_counters; + counter_t v; int i; - ASSERT (index < vec_len (cm->maxi)); + ASSERT (index < vlib_simple_counter_n_counters (cm)); v = 0; - for (i = 0; i < vec_len (cm->minis); i++) + for (i = 0; i < vec_len (cm->counters); i++) { - my_minis = cm->minis[i]; - mini = vec_elt_at_index (my_minis, index); - v += mini[0]; - } - - v += cm->maxi[index]; - - if (index < vec_len (cm->value_at_last_clear)) - { - ASSERT (v >= cm->value_at_last_clear[index]); - v -= cm->value_at_last_clear[index]; + my_counters = cm->counters[i]; + v += my_counters[index]; } return v; @@ -142,29 +122,24 @@ vlib_get_simple_counter (vlib_simple_counter_main_t * cm, u32 index) always_inline void vlib_zero_simple_counter (vlib_simple_counter_main_t * cm, u32 index) { - u16 *my_minis; + counter_t *my_counters; int i; - ASSERT (index < vec_len (cm->maxi)); + ASSERT (index < vlib_simple_counter_n_counters (cm)); - for (i = 0; i < vec_len (cm->minis); i++) + for (i = 0; i < vec_len (cm->counters); i++) { - my_minis = cm->minis[i]; - my_minis[index] = 0; + my_counters = cm->counters[i]; + my_counters[index] = 0; } - - cm->maxi[index] = 0; - - if (index < vec_len (cm->value_at_last_clear)) - cm->value_at_last_clear[index] = 0; } /** Combined counter to hold both packets and byte differences. */ typedef struct { - u64 packets; /**< packet counter */ - u64 bytes; /**< byte counter */ + counter_t packets; /**< packet counter */ + counter_t bytes; /**< byte counter */ } vlib_counter_t; /** Add two combined counters, results in the first counter @@ -201,24 +176,19 @@ vlib_counter_zero (vlib_counter_t * a) a->packets = a->bytes = 0; } -/** Mini combined counter */ -typedef struct -{ - u16 packets; /**< Packet count */ - i16 bytes; /**< Byte count */ -} vlib_mini_counter_t; - /** A collection of combined counters */ typedef struct { - vlib_mini_counter_t **minis; /**< Per-thread u16 non-atomic counter pairs */ - vlib_counter_t *maxi; /**< Shared wide counter pairs */ - vlib_counter_t *value_at_last_clear; /**< Counter values as of last clear. */ + vlib_counter_t **counters; /**< Per-thread u64 non-atomic counter pairs */ vlib_counter_t *value_at_last_serialize; /**< Counter values as of last serialize. */ u32 last_incremental_serialize_index; /**< Last counter index serialized incrementally. */ char *name; /**< The counter collection's name. */ } vlib_combined_counter_main_t; +/** The number of counters (not the number of per-thread counters) */ +u32 vlib_combined_counter_n_counters (const vlib_combined_counter_main_t * + cm); + /** Clear a collection of simple counters @param cm - (vlib_simple_counter_main_t *) collection to clear */ @@ -233,62 +203,41 @@ void vlib_clear_combined_counters (vlib_combined_counter_main_t * cm); @param cm - (vlib_combined_counter_main_t *) comined counter main pointer @param cpu_index - (u32) the current cpu index @param index - (u32) index of the counter to increment - @param packet_increment - (u32) number of packets to add to the counter - @param byte_increment - (u32) number of bytes to add to the counter + @param packet_increment - (u64) number of packets to add to the counter + @param byte_increment - (u64) number of bytes to add to the counter */ always_inline void vlib_increment_combined_counter (vlib_combined_counter_main_t * cm, u32 cpu_index, - u32 index, - u32 packet_increment, u32 byte_increment) + u32 index, u64 n_packets, u64 n_bytes) { - vlib_mini_counter_t *my_minis, *mini; - u32 old_packets, new_packets; - i32 old_bytes, new_bytes; - - /* Use this CPU's mini counter array */ - my_minis = cm->minis[cpu_index]; - - mini = vec_elt_at_index (my_minis, index); - old_packets = mini->packets; - old_bytes = mini->bytes; - - new_packets = old_packets + packet_increment; - new_bytes = old_bytes + byte_increment; - - mini->packets = new_packets; - mini->bytes = new_bytes; - - /* Bytes always overflow before packets.. */ - if (PREDICT_FALSE (mini->bytes != new_bytes)) - { - vlib_counter_t *maxi = vec_elt_at_index (cm->maxi, index); + vlib_counter_t *my_counters; - __sync_fetch_and_add (&maxi->packets, new_packets); - __sync_fetch_and_add (&maxi->bytes, new_bytes); + /* Use this CPU's counter array */ + my_counters = cm->counters[cpu_index]; - mini->packets = 0; - mini->bytes = 0; - } + my_counters[index].packets += n_packets; + my_counters[index].bytes += n_bytes; } -#define vlib_prefetch_combined_counter(_cm, _cpu_index, _index) \ -{ \ - vlib_mini_counter_t *_cpu_minis; \ - \ - /* \ - * This CPU's mini index is assumed to already be in cache \ - */ \ - _cpu_minis = (_cm)->minis[(_cpu_index)]; \ - CLIB_PREFETCH(_cpu_minis + (_index), \ - sizeof(*_cpu_minis), \ - STORE); \ +/** Pre-fetch a per-thread combined counter for the given object index */ +always_inline void +vlib_prefetch_combined_counter (const vlib_combined_counter_main_t * cm, + u32 cpu_index, u32 index) +{ + vlib_counter_t *cpu_counters; + + /* + * This CPU's index is assumed to already be in cache + */ + cpu_counters = cm->counters[cpu_index]; + CLIB_PREFETCH (cpu_counters + index, CLIB_CACHE_LINE_BYTES, STORE); } /** Get the value of a combined counter, never called in the speed path - Scrapes the entire set of mini counters. Innacurate unless + Scrapes the entire set of per-thread counters. Innacurate unless worker threads which might increment the counter are barrier-synchronized @@ -298,35 +247,27 @@ vlib_increment_combined_counter (vlib_combined_counter_main_t * cm, */ static inline void -vlib_get_combined_counter (vlib_combined_counter_main_t * cm, +vlib_get_combined_counter (const vlib_combined_counter_main_t * cm, u32 index, vlib_counter_t * result) { - vlib_mini_counter_t *my_minis, *mini; - vlib_counter_t *maxi; + vlib_counter_t *my_counters, *counter; int i; result->packets = 0; result->bytes = 0; - for (i = 0; i < vec_len (cm->minis); i++) + for (i = 0; i < vec_len (cm->counters); i++) { - my_minis = cm->minis[i]; + my_counters = cm->counters[i]; - mini = vec_elt_at_index (my_minis, index); - result->packets += mini->packets; - result->bytes += mini->bytes; + counter = vec_elt_at_index (my_counters, index); + result->packets += counter->packets; + result->bytes += counter->bytes; } - - maxi = vec_elt_at_index (cm->maxi, index); - result->packets += maxi->packets; - result->bytes += maxi->bytes; - - if (index < vec_len (cm->value_at_last_clear)) - vlib_counter_sub (result, &cm->value_at_last_clear[index]); } /** Clear a combined counter - Clears the set of per-thread u16 counters, and the shared vlib_counter_t + Clears the set of per-thread counters. @param cm - (vlib_combined_counter_main_t *) combined counter main pointer @param index - (u32) index of the counter to clear @@ -334,21 +275,17 @@ vlib_get_combined_counter (vlib_combined_counter_main_t * cm, always_inline void vlib_zero_combined_counter (vlib_combined_counter_main_t * cm, u32 index) { - vlib_mini_counter_t *mini, *my_minis; + vlib_counter_t *my_counters, *counter; int i; - for (i = 0; i < vec_len (cm->minis); i++) + for (i = 0; i < vec_len (cm->counters); i++) { - my_minis = cm->minis[i]; + my_counters = cm->counters[i]; - mini = vec_elt_at_index (my_minis, index); - mini->packets = 0; - mini->bytes = 0; + counter = vec_elt_at_index (my_counters, index); + counter->packets = 0; + counter->bytes = 0; } - - vlib_counter_zero (&cm->maxi[index]); - if (index < vec_len (cm->value_at_last_clear)) - vlib_counter_zero (&cm->value_at_last_clear[index]); } /** validate a simple counter diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index 0dad61d4..7352c2e7 100644 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -2375,6 +2375,17 @@ ip4_rewrite_inline (vlib_main_t * vm, adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX]; adj_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_TX]; + /* + * pre-fetch the per-adjacency counters + */ + if (do_counters) + { + vlib_prefetch_combined_counter (&adjacency_counters, + cpu_index, adj_index0); + vlib_prefetch_combined_counter (&adjacency_counters, + cpu_index, adj_index1); + } + /* We should never rewrite a pkt using the MISS adjacency */ ASSERT (adj_index0 && adj_index1); @@ -2480,17 +2491,6 @@ ip4_rewrite_inline (vlib_main_t * vm, rewrite_header.max_l3_packet_bytes ? IP4_ERROR_MTU_EXCEEDED : error1); - /* - * pre-fetch the per-adjacency counters - */ - if (do_counters) - { - vlib_prefetch_combined_counter (&adjacency_counters, - cpu_index, adj_index0); - vlib_prefetch_combined_counter (&adjacency_counters, - cpu_index, adj_index1); - } - /* Don't adjust the buffer for ttl issue; icmp-error node wants * to see the IP headerr */ if (PREDICT_TRUE (error0 == IP4_ERROR_NONE)) @@ -2624,8 +2624,9 @@ ip4_rewrite_inline (vlib_main_t * vm, p0->flags &= ~VNET_BUFFER_LOCALLY_ORIGINATED; } - vlib_prefetch_combined_counter (&adjacency_counters, - cpu_index, adj_index0); + if (do_counters) + vlib_prefetch_combined_counter (&adjacency_counters, + cpu_index, adj_index0); /* Guess we are only writing on simple Ethernet header. */ vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t)); @@ -2641,10 +2642,11 @@ ip4_rewrite_inline (vlib_main_t * vm, rw_len0 = adj0[0].rewrite_header.data_bytes; vnet_buffer (p0)->ip.save_rewrite_length = rw_len0; - vlib_increment_combined_counter - (&adjacency_counters, - cpu_index, - adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); + if (do_counters) + vlib_increment_combined_counter + (&adjacency_counters, + cpu_index, adj_index0, 1, + vlib_buffer_length_in_chain (vm, p0) + rw_len0); /* Check MTU of outgoing interface. */ error0 = (vlib_buffer_length_in_chain (vm, p0) diff --git a/src/vnet/map/map.c b/src/vnet/map/map.c index 7006b1db..99305afa 100644 --- a/src/vnet/map/map.c +++ b/src/vnet/map/map.c @@ -1304,7 +1304,7 @@ show_map_stats_command_fn (vlib_main_t * vm, unformat_input_t * input, { which = cm - mm->domain_counters; - for (i = 0; i < vec_len (cm->maxi); i++) + for (i = 0; i < vlib_combined_counter_n_counters (cm); i++) { vlib_get_combined_counter (cm, i, &v); total_pkts[which] += v.packets; diff --git a/src/vnet/map/map_api.c b/src/vnet/map/map_api.c index 7febeb3d..d618e7a6 100644 --- a/src/vnet/map/map_api.c +++ b/src/vnet/map/map_api.c @@ -211,7 +211,7 @@ vl_api_map_summary_stats_t_handler (vl_api_map_summary_stats_t * mp) { which = cm - mm->domain_counters; - for (i = 0; i < vec_len (cm->maxi); i++) + for (i = 0; i < vlib_combined_counter_n_counters (cm); i++) { vlib_get_combined_counter (cm, i, &v); total_pkts[which] += v.packets; diff --git a/src/vnet/rewrite.c b/src/vnet/rewrite.c index c4a171c1..47fb74df 100644 --- a/src/vnet/rewrite.c +++ b/src/vnet/rewrite.c @@ -79,7 +79,7 @@ format_vnet_rewrite (u8 * s, va_list * args) if (NULL != si) s = format (s, "%U: ", format_vnet_sw_interface_name, vnm, si); else - s = format (s, "DELETED"); + s = format (s, "DELETED:%d", rw->sw_if_index); } /* Format rewrite string. */ diff --git a/src/vpp/api/api.c b/src/vpp/api/api.c index d8301fa6..8df40406 100644 --- a/src/vpp/api/api.c +++ b/src/vpp/api/api.c @@ -849,7 +849,7 @@ vl_api_vnet_get_summary_stats_t_handler (vl_api_vnet_get_summary_stats_t * mp) { which = cm - im->combined_sw_if_counters; - for (i = 0; i < vec_len (cm->maxi); i++) + for (i = 0; i < vlib_combined_counter_n_counters (cm); i++) { vlib_get_combined_counter (cm, i, &v); total_pkts[which] += v.packets; diff --git a/src/vpp/stats/stats.c b/src/vpp/stats/stats.c index c46d441a..1927da0b 100644 --- a/src/vpp/stats/stats.c +++ b/src/vpp/stats/stats.c @@ -134,7 +134,7 @@ do_simple_interface_counters (stats_main_t * sm) vlib_simple_counter_main_t *cm; u32 items_this_message = 0; u64 v, *vp = 0; - int i; + int i, n_counts; /* * Prevent interface registration from expanding / moving the vectors... @@ -144,13 +144,13 @@ do_simple_interface_counters (stats_main_t * sm) vec_foreach (cm, im->sw_if_counters) { - - for (i = 0; i < vec_len (cm->maxi); i++) + n_counts = vlib_simple_counter_n_counters (cm); + for (i = 0; i < n_counts; i++) { if (mp == 0) { items_this_message = clib_min (SIMPLE_COUNTER_BATCH_SIZE, - vec_len (cm->maxi) - i); + n_counts - i); mp = vl_msg_api_alloc_as_if_client (sizeof (*mp) + items_this_message * sizeof (v)); @@ -189,19 +189,19 @@ do_combined_interface_counters (stats_main_t * sm) vlib_combined_counter_main_t *cm; u32 items_this_message = 0; vlib_counter_t v, *vp = 0; - int i; + int i, n_counts; vnet_interface_counter_lock (im); vec_foreach (cm, im->combined_sw_if_counters) { - - for (i = 0; i < vec_len (cm->maxi); i++) + n_counts = vlib_combined_counter_n_counters (cm); + for (i = 0; i < n_counts; i++) { if (mp == 0) { items_this_message = clib_min (COMBINED_COUNTER_BATCH_SIZE, - vec_len (cm->maxi) - i); + n_counts - i); mp = vl_msg_api_alloc_as_if_client (sizeof (*mp) + items_this_message * sizeof (v)); -- cgit 1.2.3-korg From eb743fad56b32cb20ad2d2cadc4760f9c25be5e1 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 20 Mar 2017 16:34:15 +0100 Subject: vnet: add device-input threadplacement infra This change adds two new debug CLI command: - "show interface placmenet" to display which thread (main or worker) is responsible for processing interface rx queue vpp# show interface placement Thread 0 (vpp_main): node af-packet-input: host-vpp1 queue 0 Thread 1 (vpp_wk_0): node af-packet-input: host-virbr0 queue 0 Thread 2 (vpp_wk_1): node af-packet-input: host-vpp2 queue 0 host-lxcbr0 queue 0 - "set interface placmenet" to assign thread (main or worker) which process specific interface rx queue vpp# set interface placement host-vpp1 queue 0 main Change-Id: Id4dd00cf2b05e10fae2125ac7cb4411b446c5e9c Signed-off-by: Damjan Marion --- src/vlib/threads.c | 14 +- src/vnet/devices/af_packet/af_packet.c | 54 +------- src/vnet/devices/af_packet/af_packet.h | 6 - src/vnet/devices/af_packet/node.c | 23 ++-- src/vnet/devices/devices.c | 240 +++++++++++++++++++++++++++++++++ src/vnet/devices/devices.h | 45 +++++++ src/vnet/interface.h | 6 + 7 files changed, 310 insertions(+), 78 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/threads.c b/src/vlib/threads.c index 40789f59..ef3a24d3 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -685,9 +685,6 @@ start_workers (vlib_main_t * vm) clib_memcpy (rt->runtime_data, n->runtime_data, clib_min (VLIB_NODE_RUNTIME_DATA_SIZE, n->runtime_data_bytes)); - else if (CLIB_DEBUG > 0) - memset (rt->runtime_data, 0xfe, - VLIB_NODE_RUNTIME_DATA_SIZE); } nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] = @@ -701,9 +698,6 @@ start_workers (vlib_main_t * vm) clib_memcpy (rt->runtime_data, n->runtime_data, clib_min (VLIB_NODE_RUNTIME_DATA_SIZE, n->runtime_data_bytes)); - else if (CLIB_DEBUG > 0) - memset (rt->runtime_data, 0xfe, - VLIB_NODE_RUNTIME_DATA_SIZE); } nm_clone->processes = vec_dup (nm->processes); @@ -1405,15 +1399,15 @@ vlib_worker_thread_fn (void *arg) clib_time_init (&vm->clib_time); clib_mem_set_heap (w->thread_mheap); + /* Wait until the dpdk init sequence is complete */ + while (tm->extern_thread_mgmt && tm->worker_thread_release == 0) + vlib_worker_thread_barrier_check (); + e = vlib_call_init_exit_functions (vm, vm->worker_init_function_registrations, 1 /* call_once */ ); if (e) clib_error_report (e); - /* Wait until the dpdk init sequence is complete */ - while (tm->extern_thread_mgmt && tm->worker_thread_release == 0) - vlib_worker_thread_barrier_check (); - vlib_worker_loop (vm); } diff --git a/src/vnet/devices/af_packet/af_packet.c b/src/vnet/devices/af_packet/af_packet.c index e491ba47..5fdc59f2 100644 --- a/src/vnet/devices/af_packet/af_packet.c +++ b/src/vnet/devices/af_packet/af_packet.c @@ -67,15 +67,16 @@ af_packet_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, static clib_error_t * af_packet_fd_read_ready (unix_file_t * uf) { - vlib_main_t *vm = vlib_get_main (); af_packet_main_t *apm = &af_packet_main; + vnet_main_t *vnm = vnet_get_main (); u32 idx = uf->private_data; + af_packet_if_t *apif = pool_elt_at_index (apm->interfaces, idx); apm->pending_input_bitmap = clib_bitmap_set (apm->pending_input_bitmap, idx, 1); /* Schedule the rx node */ - vlib_node_set_interrupt_pending (vm, af_packet_input_node.index); + vnet_device_input_set_interrupt_pending (vnm, apif->hw_if_index, 0); return 0; } @@ -171,31 +172,6 @@ error: return ret; } -static void -af_packet_worker_thread_enable () -{ - /* If worker threads are enabled, switch to polling mode */ - foreach_vlib_main (( - { - vlib_node_set_state (this_vlib_main, - af_packet_input_node.index, - VLIB_NODE_STATE_POLLING); - })); - -} - -static void -af_packet_worker_thread_disable () -{ - foreach_vlib_main (( - { - vlib_node_set_state (this_vlib_main, - af_packet_input_node.index, - VLIB_NODE_STATE_INTERRUPT); - })); - -} - int af_packet_create_if (vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set, u32 * sw_if_index) @@ -298,6 +274,9 @@ af_packet_create_if (vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set, sw = vnet_get_hw_sw_interface (vnm, apif->hw_if_index); apif->sw_if_index = sw->sw_if_index; + vnet_set_device_input_node (apif->hw_if_index, af_packet_input_node.index); + vnet_device_input_assign_thread (apif->hw_if_index, 0, /* queue */ + ~0 /* any cpu */ ); vnet_hw_interface_set_flags (vnm, apif->hw_if_index, VNET_HW_INTERFACE_FLAG_LINK_UP); @@ -307,9 +286,6 @@ af_packet_create_if (vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set, if (sw_if_index) *sw_if_index = apif->sw_if_index; - if (tm->n_vlib_mains > 1 && pool_elts (apm->interfaces) == 1) - af_packet_worker_thread_enable (); - return 0; error: @@ -323,7 +299,6 @@ int af_packet_delete_if (vlib_main_t * vm, u8 * host_if_name) { vnet_main_t *vnm = vnet_get_main (); - vlib_thread_main_t *tm = vlib_get_thread_main (); af_packet_main_t *apm = &af_packet_main; af_packet_if_t *apif; uword *p; @@ -373,8 +348,6 @@ af_packet_delete_if (vlib_main_t * vm, u8 * host_if_name) ethernet_delete_interface (vnm, apif->hw_if_index); pool_put (apm->interfaces, apif); - if (tm->n_vlib_mains > 1 && pool_elts (apm->interfaces) == 0) - af_packet_worker_thread_disable (); return 0; } @@ -384,24 +357,9 @@ af_packet_init (vlib_main_t * vm) { af_packet_main_t *apm = &af_packet_main; vlib_thread_main_t *tm = vlib_get_thread_main (); - vlib_thread_registration_t *tr; - uword *p; memset (apm, 0, sizeof (af_packet_main_t)); - apm->input_cpu_first_index = 0; - apm->input_cpu_count = 1; - - /* find out which cpus will be used for input */ - p = hash_get_mem (tm->thread_registrations_by_name, "workers"); - tr = p ? (vlib_thread_registration_t *) p[0] : 0; - - if (tr && tr->count > 0) - { - apm->input_cpu_first_index = tr->first_index; - apm->input_cpu_count = tr->count; - } - mhash_init_vec_string (&apm->if_index_by_host_if_name, sizeof (uword)); vec_validate_aligned (apm->rx_buffers, tm->n_vlib_mains - 1, diff --git a/src/vnet/devices/af_packet/af_packet.h b/src/vnet/devices/af_packet/af_packet.h index e00e5cb4..50ec2378 100644 --- a/src/vnet/devices/af_packet/af_packet.h +++ b/src/vnet/devices/af_packet/af_packet.h @@ -51,12 +51,6 @@ typedef struct /* hash of host interface names */ mhash_t if_index_by_host_if_name; - - /* first cpu index */ - u32 input_cpu_first_index; - - /* total cpu count */ - u32 input_cpu_count; } af_packet_main_t; af_packet_main_t af_packet_main; diff --git a/src/vnet/devices/af_packet/node.c b/src/vnet/devices/af_packet/node.c index ab7fd800..ba337f3f 100644 --- a/src/vnet/devices/af_packet/node.c +++ b/src/vnet/devices/af_packet/node.c @@ -246,20 +246,18 @@ static uword af_packet_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { - int i; u32 n_rx_packets = 0; - u32 cpu_index = os_get_cpu_number (); af_packet_main_t *apm = &af_packet_main; - af_packet_if_t *apif; + vnet_device_input_runtime_t *rt = (void *) node->runtime_data; + vnet_device_and_queue_t *dq; - for (i = 0; i < vec_len (apm->interfaces); i++) - { - apif = vec_elt_at_index (apm->interfaces, i); - if (apif->is_admin_up && - (i % apm->input_cpu_count) == - (cpu_index - apm->input_cpu_first_index)) - n_rx_packets += af_packet_device_input_fn (vm, node, frame, apif); - } + vec_foreach (dq, rt->devices_and_queues) + { + af_packet_if_t *apif; + apif = vec_elt_at_index (apm->interfaces, dq->dev_instance); + if (apif->is_admin_up) + n_rx_packets += af_packet_device_input_fn (vm, node, frame, apif); + } return n_rx_packets; } @@ -271,9 +269,6 @@ VLIB_REGISTER_NODE (af_packet_input_node) = { .sibling_of = "device-input", .format_trace = format_af_packet_input_trace, .type = VLIB_NODE_TYPE_INPUT, - /** - * default state is INTERRUPT mode, switch to POLLING if worker threads are enabled - */ .state = VLIB_NODE_STATE_INTERRUPT, .n_errors = AF_PACKET_INPUT_N_ERROR, .error_strings = af_packet_input_error_strings, diff --git a/src/vnet/devices/devices.c b/src/vnet/devices/devices.c index 38f3002b..41645220 100644 --- a/src/vnet/devices/devices.c +++ b/src/vnet/devices/devices.c @@ -32,6 +32,7 @@ device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, VLIB_REGISTER_NODE (device_input_node) = { .function = device_input_fn, .name = "device-input", + .runtime_data_bytes = sizeof (vnet_device_input_runtime_t), .type = VLIB_NODE_TYPE_INPUT, .state = VLIB_NODE_STATE_DISABLED, .n_next_nodes = VNET_DEVICE_INPUT_N_NEXT_NODES, @@ -83,18 +84,257 @@ VNET_FEATURE_INIT (ethernet_input, static) = { }; /* *INDENT-ON* */ +static int +vnet_device_queue_sort (void *a1, void *a2) +{ + vnet_device_and_queue_t *dq1 = a1; + vnet_device_and_queue_t *dq2 = a2; + + if (dq1->dev_instance > dq2->dev_instance) + return 1; + else if (dq1->dev_instance < dq2->dev_instance) + return -1; + else if (dq1->queue_id > dq2->queue_id) + return 1; + else if (dq1->queue_id < dq2->queue_id) + return -1; + else + return 0; +} + +void +vnet_device_input_assign_thread (u32 hw_if_index, + u16 queue_id, uword cpu_index) +{ + vnet_main_t *vnm = vnet_get_main (); + vnet_device_main_t *vdm = &vnet_device_main; + vlib_main_t *vm; + vnet_device_input_runtime_t *rt; + vnet_device_and_queue_t *dq; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + + ASSERT (hw->input_node_index > 0); + + if (vdm->first_worker_cpu_index == 0) + cpu_index = 0; + + if (cpu_index != 0 && + (cpu_index < vdm->first_worker_cpu_index || + cpu_index > vdm->last_worker_cpu_index)) + { + cpu_index = vdm->next_worker_cpu_index++; + if (vdm->next_worker_cpu_index > vdm->last_worker_cpu_index) + vdm->next_worker_cpu_index = vdm->first_worker_cpu_index; + } + + vm = vlib_mains[cpu_index]; + rt = vlib_node_get_runtime_data (vm, hw->input_node_index); + + vec_add2 (rt->devices_and_queues, dq, 1); + dq->hw_if_index = hw_if_index; + dq->dev_instance = hw->dev_instance; + dq->queue_id = queue_id; + + vec_sort_with_function (rt->devices_and_queues, vnet_device_queue_sort); + vec_validate (hw->input_node_cpu_index_by_queue, queue_id); + hw->input_node_cpu_index_by_queue[queue_id] = cpu_index; +} + +static int +vnet_device_input_unassign_thread (u32 hw_if_index, u16 queue_id, + uword cpu_index) +{ + vnet_main_t *vnm = vnet_get_main (); + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + vnet_device_input_runtime_t *rt; + vnet_device_and_queue_t *dq; + uword old_cpu_index; + + if (hw->input_node_cpu_index_by_queue == 0) + return VNET_API_ERROR_INVALID_INTERFACE; + + if (vec_len (hw->input_node_cpu_index_by_queue) < queue_id + 1) + return VNET_API_ERROR_INVALID_INTERFACE; + + old_cpu_index = hw->input_node_cpu_index_by_queue[queue_id]; + + if (old_cpu_index == cpu_index) + return 0; + + rt = + vlib_node_get_runtime_data (vlib_mains[old_cpu_index], + hw->input_node_index); + + vec_foreach (dq, rt->devices_and_queues) + if (dq->hw_if_index == hw_if_index && dq->queue_id == queue_id) + { + vec_del1 (rt->devices_and_queues, dq - rt->devices_and_queues); + goto deleted; + } + + return VNET_API_ERROR_INVALID_INTERFACE; + +deleted: + vec_sort_with_function (rt->devices_and_queues, vnet_device_queue_sort); + + return 0; +} + +static clib_error_t * +show_device_placement_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u8 *s = 0; + vnet_main_t *vnm = vnet_get_main (); + vnet_device_input_runtime_t *rt; + vnet_device_and_queue_t *dq; + vlib_node_t *pn = vlib_get_node_by_name (vm, (u8 *) "device-input"); + uword si; + int index = 0; + + /* *INDENT-OFF* */ + foreach_vlib_main (({ + clib_bitmap_foreach (si, pn->sibling_bitmap, + ({ + rt = vlib_node_get_runtime_data (this_vlib_main, si); + + if (vec_len (rt->devices_and_queues)) + s = format (s, " node %U:\n", format_vlib_node_name, vm, si); + + vec_foreach (dq, rt->devices_and_queues) + { + s = format (s, " %U queue %u\n", + format_vnet_sw_if_index_name, vnm, dq->hw_if_index, + dq->queue_id); + } + })); + if (vec_len (s) > 0) + { + vlib_cli_output(vm, "Thread %u (%v):\n%v", index, + vlib_worker_threads[index].name, s); + vec_reset_length (s); + } + index++; + })); + /* *INDENT-ON* */ + + vec_free (s); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (memif_delete_command, static) = { + .path = "show interface placement", + .short_help = "show interface placement", + .function = show_device_placement_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +set_device_placement (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + clib_error_t *error = 0; + unformat_input_t _line_input, *line_input = &_line_input; + vnet_main_t *vnm = vnet_get_main (); + vnet_device_main_t *vdm = &vnet_device_main; + u32 hw_if_index = (u32) ~ 0; + u32 queue_id = (u32) 0; + u32 cpu_index = (u32) ~ 0; + int rv; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat + (line_input, "%U", unformat_vnet_hw_interface, vnm, &hw_if_index)) + ; + else if (unformat (line_input, "queue %d", &queue_id)) + ; + else if (unformat (line_input, "main", &cpu_index)) + cpu_index = 0; + else if (unformat (line_input, "worker %d", &cpu_index)) + cpu_index += vdm->first_worker_cpu_index; + else + { + error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + unformat_free (line_input); + return error; + } + } + + unformat_free (line_input); + + if (hw_if_index == (u32) ~ 0) + return clib_error_return (0, "please specify valid interface name"); + + if (cpu_index > vdm->last_worker_cpu_index) + return clib_error_return (0, + "please specify valid worker thread or main"); + + rv = vnet_device_input_unassign_thread (hw_if_index, queue_id, cpu_index); + + if (rv) + return clib_error_return (0, "not found"); + + vnet_device_input_assign_thread (hw_if_index, queue_id, cpu_index); + + return 0; +} + +/*? + * This command is used to assign a given interface, and optionally a + * given queue, to a different thread. If the 'queue' is not provided, + * it defaults to 0. + * + * @cliexpar + * Example of how to display the interface placement: + * @cliexstart{show interface placement} + * Thread 1 (vpp_wk_0): + * GigabitEthernet0/8/0 queue 0 + * GigabitEthernet0/9/0 queue 0 + * Thread 2 (vpp_wk_1): + * GigabitEthernet0/8/0 queue 1 + * GigabitEthernet0/9/0 queue 1 + * @cliexend + * Example of how to assign a interface and queue to a thread: + * @cliexcmd{set interface placement GigabitEthernet0/8/0 queue 1 thread 1} +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cmd_set_dpdk_if_placement,static) = { + .path = "set interface placement", + .short_help = "set interface placement [queue ] [thread | main]", + .function = set_device_placement, +}; +/* *INDENT-ON* */ + static clib_error_t * vnet_device_init (vlib_main_t * vm) { vnet_device_main_t *vdm = &vnet_device_main; vlib_thread_main_t *tm = vlib_get_thread_main (); + vlib_thread_registration_t *tr; + uword *p; vec_validate_aligned (vdm->workers, tm->n_vlib_mains - 1, CLIB_CACHE_LINE_BYTES); + + p = hash_get_mem (tm->thread_registrations_by_name, "workers"); + tr = p ? (vlib_thread_registration_t *) p[0] : 0; + if (tr && tr->count > 0) + { + vdm->first_worker_cpu_index = tr->first_index; + vdm->next_worker_cpu_index = tr->first_index; + vdm->last_worker_cpu_index = tr->first_index + tr->count - 1; + } return 0; } VLIB_INIT_FUNCTION (vnet_device_init); + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/devices/devices.h b/src/vnet/devices/devices.h index a5cbc35e..bbb29fe3 100644 --- a/src/vnet/devices/devices.h +++ b/src/vnet/devices/devices.h @@ -50,12 +50,38 @@ typedef struct typedef struct { vnet_device_per_worker_data_t *workers; + uword first_worker_cpu_index; + uword last_worker_cpu_index; + uword next_worker_cpu_index; } vnet_device_main_t; +typedef struct +{ + u32 hw_if_index; + u32 dev_instance; + u16 queue_id; +} vnet_device_and_queue_t; + +typedef struct +{ + vnet_device_and_queue_t *devices_and_queues; +} vnet_device_input_runtime_t; + extern vnet_device_main_t vnet_device_main; extern vlib_node_registration_t device_input_node; extern const u32 device_input_next_node_advance[]; +static inline void +vnet_set_device_input_node (u32 hw_if_index, u32 node_index) +{ + vnet_main_t *vnm = vnet_get_main (); + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + hw->input_node_index = node_index; +} + +void vnet_device_input_assign_thread (u32 hw_if_index, u16 queue_id, + uword cpu_index); + static inline u64 vnet_get_aggregate_rx_packets (void) { @@ -78,6 +104,25 @@ vnet_device_increment_rx_packets (u32 cpu_index, u64 count) pwd->aggregate_rx_packets += count; } +static_always_inline vnet_device_and_queue_t * +vnet_get_device_and_queue (vlib_main_t * vm, vlib_node_runtime_t * node) +{ + vnet_device_input_runtime_t *rt = (void *) node->runtime_data; + return rt->devices_and_queues; +} + +static_always_inline void +vnet_device_input_set_interrupt_pending (vnet_main_t * vnm, u32 hw_if_index, + u16 queue_id) +{ + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + + ASSERT (queue_id < vec_len (hw->input_node_cpu_index_by_queue)); + u32 cpu_index = hw->input_node_cpu_index_by_queue[queue_id]; + vlib_node_set_interrupt_pending (vlib_mains[cpu_index], + hw->input_node_index); +} + #endif /* included_vnet_vnet_device_h */ /* diff --git a/src/vnet/interface.h b/src/vnet/interface.h index ef8f9118..a1ea2d61 100644 --- a/src/vnet/interface.h +++ b/src/vnet/interface.h @@ -464,6 +464,12 @@ typedef struct vnet_hw_interface_t #define VNET_HW_INTERFACE_BOND_INFO_NONE ((uword *) 0) #define VNET_HW_INTERFACE_BOND_INFO_SLAVE ((uword *) ~0) + /* Input node */ + u32 input_node_index; + + /* input node cpu index by queue */ + u32 *input_node_cpu_index_by_queue; + } vnet_hw_interface_t; extern vnet_device_class_t vnet_local_interface_device_class; -- cgit 1.2.3-korg From 7312cc7785a9d1198519e1091a645fecc019a6b8 Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 15 Mar 2017 21:18:55 -0700 Subject: vhost: support interrupt mode vhost currently supports only polling mode. This patch is to add interrupt mode. When the interface is configured for interrupt mode, our input node does not get called unless there is a packet in the vring. If a particular CPU has one interface configured for polling mode and another in interrupt, the input node is set to polling for that CPU. This diffs also includes two crashes in vlib's dispatch_node. One is included in https://gerrit.fd.io/r/#/c/5516. The other crash is in the ASSERT. The ASSERT can become true when the caller of dispatch_node is in a loop. The first call converted the node to polling. The second call thereafter will hit the ASSERT. Change-Id: If17b6d48b20d7d8605c6a161459828637173cd32 Signed-off-by: Steven --- src/vat/api_format.c | 67 ++++++- src/vlib/main.c | 13 +- src/vnet/devices/virtio/vhost-user.c | 333 ++++++++++++++++++++++++++++++++--- src/vnet/devices/virtio/vhost-user.h | 11 ++ 4 files changed, 392 insertions(+), 32 deletions(-) (limited to 'src/vlib') diff --git a/src/vat/api_format.c b/src/vat/api_format.c index 37b7f93e..3b57ac61 100644 --- a/src/vat/api_format.c +++ b/src/vat/api_format.c @@ -403,6 +403,46 @@ api_unformat_sw_if_index (unformat_input_t * input, va_list * args) } #endif /* VPP_API_TEST_BUILTIN */ +#define VHOST_USER_POLLING_MODE 0 +#define VHOST_USER_INTERRUPT_MODE 1 +#define VHOST_USER_ADAPTIVE_MODE 2 + +static u8 * +api_format_vhost_user_operation_mode (u8 * s, va_list * va) +{ + int operation_mode = va_arg (*va, int); + + switch (operation_mode) + { + case VHOST_USER_POLLING_MODE: + s = format (s, "%-9s", "polling"); + break; + case VHOST_USER_INTERRUPT_MODE: + s = format (s, "%-9s", "interrupt"); + break; + default: + s = format (s, "%-9s", "invalid"); + } + return s; +} + +static uword +api_unformat_vhost_user_operation_mode (unformat_input_t * input, + va_list * args) +{ + u8 *operation_mode = va_arg (*args, u8 *); + uword rc = 1; + + if (unformat (input, "interrupt")) + *operation_mode = VHOST_USER_INTERRUPT_MODE; + else if (unformat (input, "polling")) + *operation_mode = VHOST_USER_POLLING_MODE; + else + rc = 0; + + return rc; +} + static uword unformat_policer_rate_type (unformat_input_t * input, va_list * args) { @@ -11174,6 +11214,7 @@ api_create_vhost_user_if (vat_main_t * vam) u8 use_custom_mac = 0; u8 *tag = 0; int ret; + u8 operation_mode = VHOST_USER_POLLING_MODE; /* Shut up coverity */ memset (hwaddr, 0, sizeof (hwaddr)); @@ -11192,6 +11233,10 @@ api_create_vhost_user_if (vat_main_t * vam) is_server = 1; else if (unformat (i, "tag %s", &tag)) ; + else if (unformat (i, "mode %U", + api_unformat_vhost_user_operation_mode, + &operation_mode)) + ; else break; } @@ -11211,6 +11256,7 @@ api_create_vhost_user_if (vat_main_t * vam) M (CREATE_VHOST_USER_IF, mp); + mp->operation_mode = operation_mode; mp->is_server = is_server; clib_memcpy (mp->sock_filename, file_name, vec_len (file_name)); vec_free (file_name); @@ -11242,6 +11288,7 @@ api_modify_vhost_user_if (vat_main_t * vam) u8 sw_if_index_set = 0; u32 sw_if_index = (u32) ~ 0; int ret; + u8 operation_mode = VHOST_USER_POLLING_MODE; while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) { @@ -11257,6 +11304,10 @@ api_modify_vhost_user_if (vat_main_t * vam) ; else if (unformat (i, "server")) is_server = 1; + else if (unformat (i, "mode %U", + api_unformat_vhost_user_operation_mode, + &operation_mode)) + ; else break; } @@ -11282,6 +11333,7 @@ api_modify_vhost_user_if (vat_main_t * vam) M (MODIFY_VHOST_USER_IF, mp); + mp->operation_mode = operation_mode; mp->sw_if_index = ntohl (sw_if_index); mp->is_server = is_server; clib_memcpy (mp->sock_filename, file_name, vec_len (file_name)); @@ -11337,11 +11389,12 @@ static void vl_api_sw_interface_vhost_user_details_t_handler { vat_main_t *vam = &vat_main; - print (vam->ofp, "%-25s %3" PRIu32 " %6" PRIu32 " %8x %6d %7d %s", + print (vam->ofp, "%-25s %3" PRIu32 " %6" PRIu32 " %8x %6d %7d %U %s", (char *) mp->interface_name, ntohl (mp->sw_if_index), ntohl (mp->virtio_net_hdr_sz), clib_net_to_host_u64 (mp->features), mp->is_server, - ntohl (mp->num_regions), (char *) mp->sock_filename); + ntohl (mp->num_regions), api_format_vhost_user_operation_mode, + mp->operation_mode, (char *) mp->sock_filename); print (vam->ofp, " Status: '%s'", strerror (ntohl (mp->sock_errno))); } @@ -11370,6 +11423,7 @@ static void vl_api_sw_interface_vhost_user_details_t_handler_json vat_json_object_add_string_copy (node, "sock_filename", mp->sock_filename); vat_json_object_add_uint (node, "num_regions", ntohl (mp->num_regions)); vat_json_object_add_uint (node, "sock_errno", ntohl (mp->sock_errno)); + vat_json_object_add_uint (node, "mode", mp->operation_mode); } static int @@ -11379,7 +11433,8 @@ api_sw_interface_vhost_user_dump (vat_main_t * vam) vl_api_control_ping_t *mp_ping; int ret; print (vam->ofp, - "Interface name idx hdr_sz features server regions filename"); + "Interface name idx hdr_sz features server regions mode" + " filename"); /* Get list of vhost-user interfaces */ M (SW_INTERFACE_VHOST_USER_DUMP, mp); @@ -18492,10 +18547,12 @@ _(l2_interface_vlan_tag_rewrite, \ "[translate-2-[1|2]] [push_dot1q 0] tag1 tag2 ") \ _(create_vhost_user_if, \ "socket [server] [renumber ] " \ - "[mac ]") \ + "[mac ] " \ + "[mode ]") \ _(modify_vhost_user_if, \ " | sw_if_index socket \n" \ - "[server] [renumber ]") \ + "[server] [renumber ] " \ + "[mode ]") \ _(delete_vhost_user_if, " | sw_if_index ") \ _(sw_interface_vhost_user_dump, "") \ _(show_version, "") \ diff --git a/src/vlib/main.c b/src/vlib/main.c index 605771c8..50f0b162 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -1032,15 +1032,14 @@ dispatch_node (vlib_main_t * vm, vlib_worker_thread_t *w = vlib_worker_threads + vm->cpu_index; #endif - if (dispatch_state == VLIB_NODE_STATE_INTERRUPT - && v >= nm->polling_threshold_vector_length) + if ((dispatch_state == VLIB_NODE_STATE_INTERRUPT + && v >= nm->polling_threshold_vector_length) && + !(node->flags & + VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE)) { vlib_node_t *n = vlib_get_node (vm, node->node_index); n->state = VLIB_NODE_STATE_POLLING; node->state = VLIB_NODE_STATE_POLLING; - ASSERT (! - (node->flags & - VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE)); node->flags &= ~VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE; node->flags |= @@ -1445,6 +1444,10 @@ vlib_main_or_worker_loop (vlib_main_t * vm, int is_main) /* Pre-allocate expired nodes. */ vec_alloc (nm->pending_interrupt_node_runtime_indices, 32); + if (!nm->polling_threshold_vector_length) + nm->polling_threshold_vector_length = 10; + if (!nm->interrupt_threshold_vector_length) + nm->interrupt_threshold_vector_length = 5; if (is_main) { diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c index 3cbeca9b..5a5beb15 100644 --- a/src/vnet/devices/virtio/vhost-user.c +++ b/src/vnet/devices/virtio/vhost-user.c @@ -368,6 +368,8 @@ vhost_user_rx_thread_placement () vhost_user_intf_t *vui; vhost_cpu_t *vhc; u32 *workers = 0; + u32 cpu_index; + vlib_main_t *vm; //Let's list all workers cpu indexes u32 i; @@ -398,19 +400,59 @@ vhost_user_rx_thread_placement () continue; i %= vec_len (vui_workers); - u32 cpu_index = vui_workers[i]; + cpu_index = vui_workers[i]; i++; vhc = &vum->cpus[cpu_index]; iaq.qid = qid; iaq.vhost_iface_index = vui - vum->vhost_user_interfaces; vec_add1 (vhc->rx_queues, iaq); - vlib_node_set_state (vlib_mains[cpu_index], - vhost_user_input_node.index, - VLIB_NODE_STATE_POLLING); } }); /* *INDENT-ON* */ + + vec_foreach (vhc, vum->cpus) + { + vhost_iface_and_queue_t *vhiq; + u8 mode = VHOST_USER_INTERRUPT_MODE; + + vec_foreach (vhiq, vhc->rx_queues) + { + vui = &vum->vhost_user_interfaces[vhiq->vhost_iface_index]; + if (vui->operation_mode == VHOST_USER_POLLING_MODE) + { + /* At least one interface is polling, cpu is set to polling */ + mode = VHOST_USER_POLLING_MODE; + break; + } + } + vhc->operation_mode = mode; + } + + for (cpu_index = vum->input_cpu_first_index; + cpu_index < vum->input_cpu_first_index + vum->input_cpu_count; + cpu_index++) + { + vlib_node_state_t state = VLIB_NODE_STATE_POLLING; + + vhc = &vum->cpus[cpu_index]; + vm = vlib_mains ? vlib_mains[cpu_index] : &vlib_global_main; + switch (vhc->operation_mode) + { + case VHOST_USER_INTERRUPT_MODE: + state = VLIB_NODE_STATE_INTERRUPT; + break; + case VHOST_USER_POLLING_MODE: + state = VLIB_NODE_STATE_POLLING; + break; + default: + clib_warning ("BUG: bad operation mode %d", vhc->operation_mode); + break; + } + vlib_node_set_state (vm, vhost_user_input_node.index, state); + } + + vec_free (workers); } static int @@ -485,12 +527,68 @@ vhost_user_update_iface_state (vhost_user_intf_t * vui) vhost_user_tx_thread_placement (vui); } +static void +vhost_user_set_interrupt_pending (vhost_user_intf_t * vui, u32 ifq) +{ + vhost_user_main_t *vum = &vhost_user_main; + vhost_cpu_t *vhc; + u32 cpu_index; + vhost_iface_and_queue_t *vhiq; + vlib_main_t *vm; + u32 ifq2; + u8 done = 0; + + if (vhost_user_intf_ready (vui)) + { + vec_foreach (vhc, vum->cpus) + { + if (vhc->operation_mode == VHOST_USER_POLLING_MODE) + continue; + + vec_foreach (vhiq, vhc->rx_queues) + { + /* + * Match the interface and the virtqueue number + */ + if ((vhiq->vhost_iface_index == (ifq >> 8)) && + (VHOST_VRING_IDX_TX (vhiq->qid) == (ifq & 0xff))) + { + cpu_index = vhc - vum->cpus; + vm = vlib_mains ? vlib_mains[cpu_index] : &vlib_global_main; + /* + * Convert RX virtqueue number in the lower byte to vring + * queue index for the input node process. Top bytes contain + * the interface, lower byte contains the queue index. + */ + ifq2 = ((ifq >> 8) << 8) | vhiq->qid; + vhc->pending_input_bitmap = + clib_bitmap_set (vhc->pending_input_bitmap, ifq2, 1); + vlib_node_set_interrupt_pending (vm, + vhost_user_input_node.index); + done = 1; + break; + } + } + if (done) + break; + } + } +} + static clib_error_t * vhost_user_callfd_read_ready (unix_file_t * uf) { __attribute__ ((unused)) int n; u8 buff[8]; + vhost_user_intf_t *vui = + pool_elt_at_index (vhost_user_main.vhost_user_interfaces, + uf->private_data >> 8); + n = read (uf->file_descriptor, ((char *) &buff), 8); + DBG_SOCK ("if %d CALL queue %d", uf->private_data >> 8, + uf->private_data & 0xff); + vhost_user_set_interrupt_pending (vui, uf->private_data); + return 0; } @@ -503,13 +601,20 @@ vhost_user_kickfd_read_ready (unix_file_t * uf) pool_elt_at_index (vhost_user_main.vhost_user_interfaces, uf->private_data >> 8); u32 qid = uf->private_data & 0xff; + n = read (uf->file_descriptor, ((char *) &buff), 8); DBG_SOCK ("if %d KICK queue %d", uf->private_data >> 8, qid); vlib_worker_thread_barrier_sync (vlib_get_main ()); - vui->vrings[qid].started = 1; - vhost_user_update_iface_state (vui); + if (!vui->vrings[qid].started || + (vhost_user_intf_ready (vui) != vui->is_up)) + { + vui->vrings[qid].started = 1; + vhost_user_update_iface_state (vui); + } vlib_worker_thread_barrier_release (vlib_get_main ()); + + vhost_user_set_interrupt_pending (vui, uf->private_data); return 0; } @@ -907,8 +1012,12 @@ vhost_user_socket_read (unix_file_t * uf) vui->vrings[msg.state.index].last_avail_idx = vui->vrings[msg.state.index].used->idx; - /* tell driver that we don't want interrupts */ - vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY; + if (vui->operation_mode == VHOST_USER_POLLING_MODE) + /* tell driver that we don't want interrupts */ + vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY; + else + /* tell driver that we want interrupts */ + vui->vrings[msg.state.index].used->flags = 0; break; case VHOST_USER_SET_OWNER: @@ -1811,7 +1920,8 @@ vhost_user_if_input (vlib_main_t * vm, vhost_user_log_dirty_ring (vui, txvq, idx); /* interrupt (call) handling */ - if ((txvq->callfd_idx != ~0) && !(txvq->avail->flags & 1)) + if ((txvq->callfd_idx != ~0) && + !(txvq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { txvq->n_since_last_int += n_rx_packets; @@ -1837,16 +1947,33 @@ vhost_user_input (vlib_main_t * vm, vhost_user_main_t *vum = &vhost_user_main; uword n_rx_packets = 0; u32 cpu_index = os_get_cpu_number (); + vhost_iface_and_queue_t *vhiq; + vhost_user_intf_t *vui; + vhost_cpu_t *vhc; + vhc = &vum->cpus[cpu_index]; + if (PREDICT_TRUE (vhc->operation_mode == VHOST_USER_POLLING_MODE)) + { + vec_foreach (vhiq, vum->cpus[cpu_index].rx_queues) + { + vui = &vum->vhost_user_interfaces[vhiq->vhost_iface_index]; + n_rx_packets += vhost_user_if_input (vm, vum, vui, vhiq->qid, node); + } + } + else + { + int i; - vhost_iface_and_queue_t *vhiq; - vec_foreach (vhiq, vum->cpus[cpu_index].rx_queues) - { - vhost_user_intf_t *vui = - &vum->vhost_user_interfaces[vhiq->vhost_iface_index]; - n_rx_packets += vhost_user_if_input (vm, vum, vui, vhiq->qid, node); - } + /* *INDENT-OFF* */ + clib_bitmap_foreach (i, vhc->pending_input_bitmap, ({ + int qid = i & 0xff; + clib_bitmap_set (vhc->pending_input_bitmap, i, 0); + vui = pool_elt_at_index (vum->vhost_user_interfaces, i >> 8); + n_rx_packets += vhost_user_if_input (vm, vum, vui, qid, node); + })); + /* *INDENT-ON* */ + } return n_rx_packets; } @@ -2241,7 +2368,8 @@ done: } /* interrupt (call) handling */ - if ((rxvq->callfd_idx != ~0) && !(rxvq->avail->flags & 1)) + if ((rxvq->callfd_idx != ~0) && + !(rxvq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { rxvq->n_since_last_int += frame->n_vectors - n_left; @@ -2595,6 +2723,95 @@ vhost_user_vui_init (vnet_main_t * vnm, vhost_user_tx_thread_placement (vui); } +static uword +vhost_user_send_interrupt_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + vhost_user_intf_t *vui; + f64 timeout = 3153600000.0 /* 100 years */ ; + uword event_type, *event_data = 0; + vhost_user_main_t *vum = &vhost_user_main; + vhost_iface_and_queue_t *vhiq; + vhost_cpu_t *vhc; + f64 now, poll_time_remaining; + + while (1) + { + poll_time_remaining = + vlib_process_wait_for_event_or_clock (vm, timeout); + event_type = vlib_process_get_events (vm, &event_data); + vec_reset_length (event_data); + + /* + * Use the remaining timeout if it is less than coalesce time to avoid + * resetting the existing timer in the middle of expiration + */ + timeout = poll_time_remaining; + if (vlib_process_suspend_time_is_zero (timeout) || + (timeout > vum->coalesce_time)) + timeout = vum->coalesce_time; + + now = vlib_time_now (vm); + switch (event_type) + { + case VHOST_USER_EVENT_START_TIMER: + if (!vlib_process_suspend_time_is_zero (poll_time_remaining)) + break; + /* fall through */ + + case ~0: + vec_foreach (vhc, vum->cpus) + { + u32 cpu_index = vhc - vum->cpus; + f64 next_timeout; + + next_timeout = timeout; + vec_foreach (vhiq, vum->cpus[cpu_index].rx_queues) + { + vui = &vum->vhost_user_interfaces[vhiq->vhost_iface_index]; + vhost_user_vring_t *rxvq = + &vui->vrings[VHOST_VRING_IDX_RX (vhiq->qid)]; + vhost_user_vring_t *txvq = + &vui->vrings[VHOST_VRING_IDX_TX (vhiq->qid)]; + + if (txvq->n_since_last_int) + { + if (now >= txvq->int_deadline) + vhost_user_send_call (vm, txvq); + else + next_timeout = txvq->int_deadline - now; + } + + if (rxvq->n_since_last_int) + { + if (now >= rxvq->int_deadline) + vhost_user_send_call (vm, rxvq); + else + next_timeout = rxvq->int_deadline - now; + } + + if ((next_timeout < timeout) && (next_timeout > 0.0)) + timeout = next_timeout; + } + } + break; + + default: + clib_warning ("BUG: unhandled event type %d", event_type); + break; + } + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (vhost_user_send_interrupt_node,static) = { + .function = vhost_user_send_interrupt_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "vhost-user-send-interrupt-process", +}; +/* *INDENT-ON* */ + int vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, const char *sock_filename, @@ -2608,8 +2825,10 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_idx = ~0; int rv = 0; int server_sock_fd = -1; + vhost_user_main_t *vum = &vhost_user_main; - if (operation_mode != VHOST_USER_POLLING_MODE) + if ((operation_mode != VHOST_USER_POLLING_MODE) && + (operation_mode != VHOST_USER_INTERRUPT_MODE)) return VNET_API_ERROR_UNIMPLEMENTED; if (sock_filename == NULL || !(strlen (sock_filename) > 0)) @@ -2640,6 +2859,15 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, // Process node must connect vlib_process_signal_event (vm, vhost_user_process_node.index, 0, 0); + + if ((operation_mode == VHOST_USER_INTERRUPT_MODE) && + !vum->interrupt_mode && (vum->coalesce_time > 0.0) && + (vum->coalesce_frames > 0)) + { + vum->interrupt_mode = 1; + vlib_process_signal_event (vm, vhost_user_send_interrupt_node.index, + VHOST_USER_EVENT_START_TIMER, 0); + } return rv; } @@ -2658,7 +2886,8 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, int rv = 0; vnet_hw_interface_t *hwif; - if (operation_mode != VHOST_USER_POLLING_MODE) + if ((operation_mode != VHOST_USER_POLLING_MODE) && + (operation_mode != VHOST_USER_INTERRUPT_MODE)) return VNET_API_ERROR_UNIMPLEMENTED; if (!(hwif = vnet_get_sup_hw_interface (vnm, sw_if_index)) || hwif->dev_class_index != vhost_user_dev_class.index) @@ -2682,9 +2911,34 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, // Process node must connect vlib_process_signal_event (vm, vhost_user_process_node.index, 0, 0); + + if ((operation_mode == VHOST_USER_INTERRUPT_MODE) && + !vum->interrupt_mode && (vum->coalesce_time > 0.0) && + (vum->coalesce_frames > 0)) + { + vum->interrupt_mode = 1; + vlib_process_signal_event (vm, vhost_user_send_interrupt_node.index, + VHOST_USER_EVENT_START_TIMER, 0); + } return rv; } +static uword +unformat_vhost_user_operation_mode (unformat_input_t * input, va_list * args) +{ + u8 *operation_mode = va_arg (*args, u8 *); + uword rc = 1; + + if (unformat (input, "interrupt")) + *operation_mode = VHOST_USER_INTERRUPT_MODE; + else if (unformat (input, "polling")) + *operation_mode = VHOST_USER_POLLING_MODE; + else + rc = 0; + + return rc; +} + clib_error_t * vhost_user_connect_command_fn (vlib_main_t * vm, unformat_input_t * input, @@ -2722,6 +2976,9 @@ vhost_user_connect_command_fn (vlib_main_t * vm, { renumber = 1; } + else if (unformat (line_input, "mode %U", + unformat_vhost_user_operation_mode, &operation_mode)) + ; else { error = clib_error_return (0, "unknown input `%U'", @@ -2851,6 +3108,25 @@ vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm, return rv; } +static u8 * +format_vhost_user_operation_mode (u8 * s, va_list * va) +{ + int operation_mode = va_arg (*va, int); + + switch (operation_mode) + { + case VHOST_USER_POLLING_MODE: + s = format (s, "%s", "polling"); + break; + case VHOST_USER_INTERRUPT_MODE: + s = format (s, "%s", "interrupt"); + break; + default: + s = format (s, "%s", "invalid"); + } + return s; +} + clib_error_t * show_vhost_user_command_fn (vlib_main_t * vm, unformat_input_t * input, @@ -2959,14 +3235,22 @@ show_vhost_user_command_fn (vlib_main_t * vm, (vui->unix_server_index != ~0) ? "server" : "client", strerror (vui->sock_errno)); + vlib_cli_output (vm, " configured mode: %U\n", + format_vhost_user_operation_mode, vui->operation_mode); vlib_cli_output (vm, " rx placement: "); vec_foreach (vhc, vum->cpus) { vec_foreach (vhiq, vhc->rx_queues) { if (vhiq->vhost_iface_index == vui - vum->vhost_user_interfaces) - vlib_cli_output (vm, " thread %d on vring %d\n", - vhc - vum->cpus, VHOST_VRING_IDX_TX (vhiq->qid)); + { + vlib_cli_output (vm, " thread %d on vring %d\n", + vhc - vum->cpus, + VHOST_VRING_IDX_TX (vhiq->qid)); + vlib_cli_output (vm, " mode: %U\n", + format_vhost_user_operation_mode, + vhc->operation_mode); + } } } @@ -3096,6 +3380,9 @@ done: * in the name to be specified. If instance already exists, name will be used * anyway and multiple instances will have the same name. Use with caution. * + * - mode [interrupt | polling] - Optional parameter specifying + * the input thread polling policy. + * * @cliexpar * Example of how to create a vhost interface with VPP as the client and all features enabled: * @cliexstart{create vhost-user socket /tmp/vhost1.sock} @@ -3112,7 +3399,9 @@ done: /* *INDENT-OFF* */ VLIB_CLI_COMMAND (vhost_user_connect_command, static) = { .path = "create vhost-user", - .short_help = "create vhost-user socket [server] [feature-mask ] [hwaddr ] [renumber ]", + .short_help = "create vhost-user socket [server] " + "[feature-mask ] [hwaddr ] [renumber ] " + "[mode {interrupt | polling}]", .function = vhost_user_connect_command_fn, }; /* *INDENT-ON* */ diff --git a/src/vnet/devices/virtio/vhost-user.h b/src/vnet/devices/virtio/vhost-user.h index 6b928f05..67f18b8e 100644 --- a/src/vnet/devices/virtio/vhost-user.h +++ b/src/vnet/devices/virtio/vhost-user.h @@ -216,6 +216,8 @@ typedef struct #define VHOST_USER_INTERRUPT_MODE 1 #define VHOST_USER_ADAPTIVE_MODE 2 +#define VHOST_USER_EVENT_START_TIMER 1 + typedef struct { CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); @@ -298,6 +300,12 @@ typedef struct /* This is here so it doesn't end-up * using stack or registers. */ vhost_trace_t *current_trace; + + /* bitmap of pending rx interfaces */ + uword *pending_input_bitmap; + + /* The operation mode computed per cpu based on interface setting */ + u8 operation_mode; } vhost_cpu_t; typedef struct @@ -320,6 +328,9 @@ typedef struct /** Pseudo random iterator */ u32 random; + + /* Node is in interrupt mode */ + u8 interrupt_mode; } vhost_user_main_t; typedef struct -- cgit 1.2.3-korg From 1bfb0ddace3ebb9010275e4bdd847c8c493ff4b3 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 22 Mar 2017 11:08:39 +0100 Subject: vlib: add description field in plugin registration Change-Id: I88b322a5d602f3d6d3310e971479180a89430e0e Signed-off-by: Damjan Marion --- src/examples/sample-plugin/sample/sample.c | 1 + src/plugins/acl/acl.c | 1 + src/plugins/dpdk/main.c | 1 + src/plugins/flowperpkt/flowperpkt.c | 1 + src/plugins/ila/ila.c | 1 + src/plugins/ioam/encap/ip6_ioam_trace.c | 1 + src/plugins/ixge/ixge.c | 1 + src/plugins/lb/lb.c | 1 + src/plugins/sixrd/sixrd.c | 3 ++- src/plugins/snat/snat.c | 1 + src/vlib/unix/plugin.c | 14 +++++++++----- src/vlib/unix/plugin.h | 1 + 12 files changed, 21 insertions(+), 6 deletions(-) (limited to 'src/vlib') diff --git a/src/examples/sample-plugin/sample/sample.c b/src/examples/sample-plugin/sample/sample.c index 01852746..2f8ac4c9 100644 --- a/src/examples/sample-plugin/sample/sample.c +++ b/src/examples/sample-plugin/sample/sample.c @@ -61,6 +61,7 @@ _(SAMPLE_MACSWAP_ENABLE_DISABLE, sample_macswap_enable_disable) /* *INDENT-OFF* */ VLIB_PLUGIN_REGISTER () = { .version = SAMPLE_PLUGIN_BUILD_VER, + .description = "Sample of VPP Plugin", }; /* *INDENT-ON* */ diff --git a/src/plugins/acl/acl.c b/src/plugins/acl/acl.c index 476fbc33..4a4dd434 100644 --- a/src/plugins/acl/acl.c +++ b/src/plugins/acl/acl.c @@ -80,6 +80,7 @@ _(MACIP_ACL_INTERFACE_GET, macip_acl_interface_get) /* *INDENT-OFF* */ VLIB_PLUGIN_REGISTER () = { .version = VPP_BUILD_VER, + .description = "Access Control Lists", }; /* *INDENT-ON* */ diff --git a/src/plugins/dpdk/main.c b/src/plugins/dpdk/main.c index 8073a50a..7ee2a785 100644 --- a/src/plugins/dpdk/main.c +++ b/src/plugins/dpdk/main.c @@ -91,5 +91,6 @@ VLIB_INIT_FUNCTION (dpdk_main_init); /* *INDENT-OFF* */ VLIB_PLUGIN_REGISTER () = { .version = VPP_BUILD_VER, + .description = "Data Plane Development Kit (DPDK)", }; /* *INDENT-ON* */ diff --git a/src/plugins/flowperpkt/flowperpkt.c b/src/plugins/flowperpkt/flowperpkt.c index 587972f7..3e5fc8b0 100644 --- a/src/plugins/flowperpkt/flowperpkt.c +++ b/src/plugins/flowperpkt/flowperpkt.c @@ -448,6 +448,7 @@ _(FLOWPERPKT_TX_INTERFACE_ADD_DEL, flowperpkt_tx_interface_add_del) /* *INDENT-OFF* */ VLIB_PLUGIN_REGISTER () = { .version = VPP_BUILD_VER, + .description = "Flow per Packet", }; /* *INDENT-ON* */ diff --git a/src/plugins/ila/ila.c b/src/plugins/ila/ila.c index 52c7ea55..edbf3017 100644 --- a/src/plugins/ila/ila.c +++ b/src/plugins/ila/ila.c @@ -825,6 +825,7 @@ ila_interface (u32 sw_if_index, u8 disable) /* *INDENT-OFF* */ VLIB_PLUGIN_REGISTER () = { .version = VPP_BUILD_VER, + .description = "Identifier-locator addressing for IPv6", }; /* *INDENT-ON* */ diff --git a/src/plugins/ioam/encap/ip6_ioam_trace.c b/src/plugins/ioam/encap/ip6_ioam_trace.c index 57d3ec5d..299ee88f 100644 --- a/src/plugins/ioam/encap/ip6_ioam_trace.c +++ b/src/plugins/ioam/encap/ip6_ioam_trace.c @@ -411,6 +411,7 @@ VLIB_CLI_COMMAND (ip6_show_ioam_trace_cmd, static) = { /* *INDENT-OFF* */ VLIB_PLUGIN_REGISTER () = { .version = VPP_BUILD_VER, + .description = "Inbound OAM", }; /* *INDENT-ON* */ diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c index 4eebc457..f3c5cc09 100644 --- a/src/plugins/ixge/ixge.c +++ b/src/plugins/ixge/ixge.c @@ -2935,6 +2935,7 @@ ixge_set_next_node (ixge_rx_next_t next, char *name) VLIB_PLUGIN_REGISTER () = { .version = VPP_BUILD_VER, .default_disabled = 1, + .description = "Intel 82599 Family Native Driver (experimental)", }; /* *INDENT-ON* */ diff --git a/src/plugins/lb/lb.c b/src/plugins/lb/lb.c index dc3f5be1..add81236 100644 --- a/src/plugins/lb/lb.c +++ b/src/plugins/lb/lb.c @@ -734,6 +734,7 @@ int lb_vip_del(u32 vip_index) /* *INDENT-OFF* */ VLIB_PLUGIN_REGISTER () = { .version = VPP_BUILD_VER, + .description = "Load Balancer", }; /* *INDENT-ON* */ diff --git a/src/plugins/sixrd/sixrd.c b/src/plugins/sixrd/sixrd.c index 67a9a3ad..98387525 100644 --- a/src/plugins/sixrd/sixrd.c +++ b/src/plugins/sixrd/sixrd.c @@ -356,8 +356,9 @@ VLIB_CLI_COMMAND(show_sixrd_stats_command, static) = { }; /* *INDENT-OFF* */ -VLIB_PLUGIN_REGISTER () = { +VLIB_PLUGIN_REGISTER () ={ .version = VPP_BUILD_VER, + .description = "IPv6 Rapid Deployment on IPv4 Infrastructure (RFC5969)", }; /* *INDENT-ON* */ diff --git a/src/plugins/snat/snat.c b/src/plugins/snat/snat.c index d42303f6..70b6a6e2 100644 --- a/src/plugins/snat/snat.c +++ b/src/plugins/snat/snat.c @@ -107,6 +107,7 @@ VNET_FEATURE_INIT (ip4_snat_out2in_fast, static) = { /* *INDENT-OFF* */ VLIB_PLUGIN_REGISTER () = { .version = VPP_BUILD_VER, + .description = "Network Address Translation", }; /* *INDENT-ON* */ diff --git a/src/vlib/unix/plugin.c b/src/vlib/unix/plugin.c index e9846e3b..9b341cc8 100644 --- a/src/vlib/unix/plugin.c +++ b/src/vlib/unix/plugin.c @@ -105,13 +105,13 @@ load_one_plugin (plugin_main_t * pm, plugin_info_t * pi, int from_early_init) } if (reg->default_disabled && pc->is_enabled == 0) { - clib_warning ("Plugin disabled: %s (default)", pi->name); + clib_warning ("Plugin disabled (default): %s", pi->name); goto error; } } else if (reg->default_disabled) { - clib_warning ("Plugin disabled: %s (default)", pi->name); + clib_warning ("Plugin disabled (default): %s", pi->name); goto error; } @@ -184,7 +184,10 @@ load_one_plugin (plugin_main_t * pm, plugin_info_t * pi, int from_early_init) (char *) pi->name, reg->early_init); } - clib_warning ("Loaded plugin: %s", pi->name); + if (reg->description) + clib_warning ("Loaded plugin: %s (%s)", pi->name, reg->description); + else + clib_warning ("Loaded plugin: %s", pi->name); return 0; error: @@ -374,7 +377,7 @@ vlib_plugins_show_cmd_fn (vlib_main_t * vm, plugin_info_t *pi; s = format (s, " Plugin path is: %s\n\n", pm->plugin_path); - s = format (s, " %-41s%s\n", "Plugin", "Version"); + s = format (s, " %-41s%-33s%s\n", "Plugin", "Version", "Description"); /* *INDENT-OFF* */ hash_foreach_mem (key, value, pm->plugin_by_name_hash, @@ -382,7 +385,8 @@ vlib_plugins_show_cmd_fn (vlib_main_t * vm, if (key != 0) { pi = vec_elt_at_index (pm->plugin_info, value); - s = format (s, "%3d. %-40s %s\n", index, key, pi->version); + s = format (s, "%3d. %-40s %-32s %s\n", index, key, pi->version, + pi->reg->description ? pi->reg->description : ""); index++; } }); diff --git a/src/vlib/unix/plugin.h b/src/vlib/unix/plugin.h index 01fec356..d9801ec4 100644 --- a/src/vlib/unix/plugin.h +++ b/src/vlib/unix/plugin.h @@ -62,6 +62,7 @@ typedef CLIB_PACKED(struct { const char version[32]; const char version_required[32]; const char *early_init; + const char *description; }) vlib_plugin_registration_t; /* *INDENT-ON* */ -- cgit 1.2.3-korg From 9a332e1639fbfb4eb4ddf47b1681e05493ae6da3 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Tue, 28 Mar 2017 15:11:20 +0200 Subject: vlib: inline dispatch_node(...) (again) Worker main loop is now shared code with main thread main loop so no need to export functions anymore. Change-Id: I99ee2eee981c1b88ca31d20eabeb6c21d030a34d Signed-off-by: Damjan Marion --- src/vlib/main.c | 4 ++-- src/vlib/threads.h | 9 --------- 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/main.c b/src/vlib/main.c index 50f0b162..55a731f1 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -917,7 +917,7 @@ vlib_dump_context_trace (vlib_main_t * vm, u32 bi) } -/* static_always_inline */ u64 +static_always_inline u64 dispatch_node (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_node_type_t type, @@ -1093,7 +1093,7 @@ dispatch_node (vlib_main_t * vm, return t; } -/* static */ u64 +static u64 dispatch_pending_node (vlib_main_t * vm, vlib_pending_frame_t * p, u64 last_time_stamp) { diff --git a/src/vlib/threads.h b/src/vlib/threads.h index fc1633f6..39f64e1d 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -162,15 +162,6 @@ int vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index, int vlib_frame_queue_dequeue (vlib_main_t * vm, vlib_frame_queue_main_t * fqm); -u64 dispatch_node (vlib_main_t * vm, - vlib_node_runtime_t * node, - vlib_node_type_t type, - vlib_node_state_t dispatch_state, - vlib_frame_t * frame, u64 last_time_stamp); - -u64 dispatch_pending_node (vlib_main_t * vm, - vlib_pending_frame_t * p, u64 last_time_stamp); - void vlib_worker_thread_node_runtime_update (void); void vlib_create_worker_threads (vlib_main_t * vm, int n, -- cgit 1.2.3-korg From ce359db3b68528ce576862129b2a7709681ad2c6 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 16 Mar 2017 16:15:38 +0100 Subject: vlib: extend foreach_vlib_main macro to assert if workers are not parked Change-Id: I6ff7b65a400734a47bc0a7d03faf86ef1cf4f8c8 Signed-off-by: Damjan Marion --- src/vlib/main.h | 3 +++ src/vlib/threads.h | 36 ++++++++++++++++++++++++------------ 2 files changed, 27 insertions(+), 12 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/main.h b/src/vlib/main.h index 98bc823d..0197b4f3 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -174,6 +174,9 @@ typedef struct vlib_main_t volatile u32 api_queue_nonempty; void (*queue_signal_callback) (struct vlib_main_t *); u8 **argv; + + /* debugging */ + volatile int parked_at_barrier; } vlib_main_t; /* Global main structure. */ diff --git a/src/vlib/threads.h b/src/vlib/threads.h index 39f64e1d..eca4fc26 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -201,18 +201,6 @@ typedef enum void vlib_worker_thread_fork_fixup (vlib_fork_fixup_t which); -static inline void -vlib_worker_thread_barrier_check (void) -{ - if (PREDICT_FALSE (*vlib_worker_threads->wait_at_barrier)) - { - clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, 1); - while (*vlib_worker_threads->wait_at_barrier) - ; - clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, -1); - } -} - #define foreach_vlib_main(body) \ do { \ vlib_main_t ** __vlib_mains = 0, *this_vlib_main; \ @@ -221,6 +209,8 @@ do { \ for (ii = 0; ii < vec_len (vlib_mains); ii++) \ { \ this_vlib_main = vlib_mains[ii]; \ + ASSERT (ii == 0 || \ + this_vlib_main->parked_at_barrier == 1); \ if (this_vlib_main) \ vec_add1 (__vlib_mains, this_vlib_main); \ } \ @@ -320,6 +310,8 @@ typedef struct extern vlib_thread_main_t vlib_thread_main; +#include + #define VLIB_REGISTER_THREAD(x,...) \ __VA_ARGS__ vlib_thread_registration_t x; \ static void __vlib_add_thread_registration_##x (void) \ @@ -356,6 +348,26 @@ vlib_get_current_worker_index () return os_get_cpu_number () - 1; } +static inline void +vlib_worker_thread_barrier_check (void) +{ + if (PREDICT_FALSE (*vlib_worker_threads->wait_at_barrier)) + { + vlib_main_t *vm; + clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, 1); + if (CLIB_DEBUG > 0) + { + vm = vlib_get_main (); + vm->parked_at_barrier = 1; + } + while (*vlib_worker_threads->wait_at_barrier) + ; + if (CLIB_DEBUG > 0) + vm->parked_at_barrier = 0; + clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, -1); + } +} + always_inline vlib_main_t * vlib_get_worker_vlib_main (u32 worker_index) { -- cgit 1.2.3-korg From 903fd513e32a37e55aec0cfb4cf30e000680e0c3 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Sat, 1 Apr 2017 11:07:40 -0400 Subject: Clean up event log merge code Fix a decade-old ridiculous qsort function bug. Managed to subtract floating-point numbers as if they were integers, leading to manufactured time-paradoxes. That completely confuses g2, leading to the summary disappearance of entire tracks' worth of data at high zoom levels. Add a manual alignment tweak parameter to elog_merge, users can dial-out time paradoxes caused by NTP-grade clock synchronization. The event-logger has a precision of O(100ns), whereas NTP synchronization is O(1ms). Change-Id: I69dedabaa314f69f9df74ec9ee66e21e6c87f703 Signed-off-by: Dave Barach --- src/tools/g2/clib.c | 7 +- src/vlib/main.c | 2 +- src/vppinfra/elog.c | 81 ++++++++++++++----- src/vppinfra/elog.h | 206 ++++++++++++++++++++++++++++++++++++----------- src/vppinfra/test_elog.c | 59 +++++++++++++- 5 files changed, 282 insertions(+), 73 deletions(-) (limited to 'src/vlib') diff --git a/src/tools/g2/clib.c b/src/tools/g2/clib.c index 6454c84d..845026b6 100644 --- a/src/tools/g2/clib.c +++ b/src/tools/g2/clib.c @@ -130,8 +130,6 @@ int read_clib_file(char *clib_file) finalize_events(); - em->events = elog_get_events (em); - cpel_event_init(vec_len(em->events)); starttime = em->events[0].time; @@ -152,3 +150,8 @@ int read_clib_file(char *clib_file) return(0); } + +unsigned int vl(void *a) +{ + return vec_len (a); +} diff --git a/src/vlib/main.c b/src/vlib/main.c index 55a731f1..a2726860 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -701,7 +701,7 @@ elog_save_buffer (vlib_main_t * vm, elog_buffer_capacity (em), chroot_file); vlib_worker_thread_barrier_sync (vm); - error = elog_write_file (em, chroot_file); + error = elog_write_file (em, chroot_file, 1 /* flush ring */ ); vlib_worker_thread_barrier_release (vm); vec_free (chroot_file); return error; diff --git a/src/vppinfra/elog.c b/src/vppinfra/elog.c index e9f06d09..12e3f5d4 100644 --- a/src/vppinfra/elog.c +++ b/src/vppinfra/elog.c @@ -77,6 +77,7 @@ new_event_type (elog_main_t * em, uword i) em->event_type_by_format = hash_create_vec ( /* size */ 0, sizeof (u8), sizeof (uword)); + t->type_index_plus_one = i + 1; hash_set_mem (em->event_type_by_format, t->format, i); } @@ -400,14 +401,15 @@ void elog_time_now (elog_time_stamp_t * et) { u64 cpu_time_now, os_time_now_nsec; + struct timespec ts; #ifdef CLIB_UNIX { #include - struct timespec ts; syscall (SYS_clock_gettime, CLOCK_REALTIME, &ts); cpu_time_now = clib_cpu_time_now (); - os_time_now_nsec = 1e9 * ts.tv_sec + ts.tv_nsec; + /* Subtract 3/30/2017's worth of seconds to retain precision */ + os_time_now_nsec = 1e9 * (ts.tv_sec - 1490885108) + ts.tv_nsec; } #else cpu_time_now = clib_cpu_time_now (); @@ -600,11 +602,21 @@ elog_cmp (void *a1, void *a2) elog_event_t *e1 = a1; elog_event_t *e2 = a2; - return e1->time - e2->time; + if (e1->time < e2->time) + return -1; + + if (e1->time > e2->time) + return 1; + + return 0; } +/* + * merge two event logs. Complicated and cranky. + */ void -elog_merge (elog_main_t * dst, u8 * dst_tag, elog_main_t * src, u8 * src_tag) +elog_merge (elog_main_t * dst, u8 * dst_tag, elog_main_t * src, u8 * src_tag, + f64 align_tweak) { elog_event_t *e; uword l; @@ -615,6 +627,7 @@ elog_merge (elog_main_t * dst, u8 * dst_tag, elog_main_t * src, u8 * src_tag) memset (&newt, 0, sizeof (newt)); + /* Acquire src and dst events */ elog_get_events (src); elog_get_events (dst); @@ -622,7 +635,7 @@ elog_merge (elog_main_t * dst, u8 * dst_tag, elog_main_t * src, u8 * src_tag) vec_append (dst->string_table, src->string_table); l = vec_len (dst->events); - vec_add (dst->events, src->events, vec_len (src->events)); + vec_append (dst->events, src->events); /* Prepend the supplied tag (if any) to all dst track names */ if (dst_tag) @@ -638,6 +651,9 @@ elog_merge (elog_main_t * dst, u8 * dst_tag, elog_main_t * src, u8 * src_tag) } } + /* + * Remember where we started allocating new tracks while merging + */ track_offset_for_src_tracks = vec_len (dst->tracks); /* Copy / tag source tracks */ @@ -688,10 +704,18 @@ elog_merge (elog_main_t * dst, u8 * dst_tag, elog_main_t * src, u8 * src_tag) (elog_time_stamp_diff_cpu (&src->init_time, &dst->init_time) * .5 * (dst->nsec_per_cpu_clock + src->nsec_per_cpu_clock)); - /* Heuristic to see if src/dst came from same time source. - If frequencies are "the same" and os clock and cpu clock agree - to within 100e-9 secs about time difference between src/dst - init_time, then we use cpu clock. Otherwise we use OS clock. */ + /* + * Heuristic to see if src/dst came from same time source. + * If frequencies are "the same" and os clock and cpu clock agree + * to within 100e-9 secs about time difference between src/dst + * init_time, then we use cpu clock. Otherwise we use OS clock. + * + * When merging event logs from different systems, time paradoxes + * at the O(1ms) level are to be expected. Hence, the "align_tweak" + * parameter. If two events logged on different processors are known + * to occur in a specific order - and with a reasonably-estimated + * interval - supply a non-zero "align_tweak" parameter + */ if (fabs (src->nsec_per_cpu_clock - dst->nsec_per_cpu_clock) < 1e-2 && fabs (dt_os_nsec - dt_clock_nsec) < 100) dt_event = dt_clock_nsec; @@ -699,23 +723,45 @@ elog_merge (elog_main_t * dst, u8 * dst_tag, elog_main_t * src, u8 * src_tag) /* Convert to seconds. */ dt_event *= 1e-9; + /* + * Move the earlier set of events later, to avoid creating + * events which preceed the Big Bang (aka have negative timestamps). + * + * Not to any scale, we have something like the following picture: + * + * DST capture start point + * ^ + * +--- dt_event --+ + * v + * SRC capture start point + * + * In this case dt_event is positive, src started after dst, + * to put src events onto a common timebase we have to move them + * forward in time. Naturally, the opposite case is + * possible, too: dt_event will be negative, and so we have to + * move dst events forward in time by the |dt_event|. + * In both cases, we add align_tweak. + */ if (dt_event > 0) { /* Src started after dst. */ for (e = dst->events + l; e < vec_end (dst->events); e++) - e->time += dt_event; + e->time += dt_event + align_tweak; } else { /* Dst started after src. */ + dt_event = -dt_event; for (e = dst->events + 0; e < dst->events + l; e++) - e->time += dt_event; + e->time += dt_event + align_tweak; } } /* Sort events by increasing time. */ vec_sort_with_function (dst->events, elog_cmp); + dst->n_total_events = vec_len (dst->events); + /* Recreate the event ring or the results won't serialize */ { int i; @@ -731,12 +777,7 @@ elog_merge (elog_main_t * dst, u8 * dst_tag, elog_main_t * src, u8 * src_tag) ed = dst->event_ring + i; ed[0] = es[0]; - - /* Invert elog_peek_events calculation */ - ed->time_cycles = - (es->time / dst->cpu_timer.seconds_per_clock) + dst->init_time.cpu; } - dst->n_total_events = vec_len (dst->events); } } @@ -990,6 +1031,7 @@ void serialize_elog_main (serialize_main_t * m, va_list * va) { elog_main_t *em = va_arg (*va, elog_main_t *); + int flush_ring = va_arg (*va, int); elog_event_t *e; serialize_magic (m, elog_serialize_magic, strlen (elog_serialize_magic)); @@ -1005,8 +1047,11 @@ serialize_elog_main (serialize_main_t * m, va_list * va) vec_serialize (m, em->string_table, serialize_vec_8); /* Free old events (cached) in case they have changed. */ - vec_free (em->events); - elog_get_events (em); + if (flush_ring) + { + vec_free (em->events); + elog_get_events (em); + } serialize_integer (m, vec_len (em->events), sizeof (u32)); diff --git a/src/vppinfra/elog.h b/src/vppinfra/elog.h index 9756fb83..359868dd 100644 --- a/src/vppinfra/elog.h +++ b/src/vppinfra/elog.h @@ -35,7 +35,16 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -/* High speed event logging with much thanks to Dave Barach. */ +/* High speed event logger */ + +/** \file + The fine-grained event logger allows lightweight, thread-safe + event logging at minimum cost. In typical operation, logging + a single event costs around 80ns on x86_64. It's appropriate + for at-least per-frame event-logging in vector packet processing. + + See https://wiki.fd.io/view/VPP/elog for more information. +*/ #ifndef included_clib_elog_h #define included_clib_elog_h @@ -50,38 +59,38 @@ typedef struct { union { - /* Absolute time stamp in CPU clock cycles. */ + /** Absolute time stamp in CPU clock cycles. */ u64 time_cycles; - /* Absolute time as floating point number in seconds. */ + /** Absolute time as floating point number in seconds. */ f64 time; }; - /* Event type index. */ + /** Event type index. */ u16 type; - /* Track for this event. Tracks allow events to be sorted and + /** Track for this event. Tracks allow events to be sorted and displayed by track. Think of 2 dimensional display with time and track being the x and y axes. */ u16 track; - /* 20-bytes of data follows and pads to 32 bytes. */ + /** 20-bytes of data follows, pads to 32 bytes. */ u8 data[20]; } elog_event_t; typedef struct { - /* Type index plus one assigned to this type. + /** Type index plus one assigned to this type. This is used to mark type as seen. */ u32 type_index_plus_one; - /* String table as a vector constructed when type is registered. */ + /** String table as a vector constructed when type is registered. */ char **enum_strings_vector; - /* Format string. (example: "my-event (%d,%d)"). */ + /** Format string. (example: "my-event (%d,%d)"). */ char *format; - /* Specifies how arguments to format are parsed from event data. + /** Specifies how arguments to format are parsed from event data. String of characters '0' '1' or '2' '3' to specify log2 size of data (e.g. for u8, u16, u32 or u64), 's' means a null-terminated C string @@ -90,97 +99,110 @@ typedef struct 'f' is a double. */ char *format_args; - /* Function name generating event. */ + /** Function name generating event. */ char *function; - /* Number of elements in string enum table. */ + /** Number of elements in string enum table. */ u32 n_enum_strings; - /* String table for enum/number to string formatting. */ + /** String table for enum/number to string formatting. */ char *enum_strings[]; } elog_event_type_t; typedef struct { - /* Track name vector. */ + /** Track name vector. */ char *name; - /* Set to one when track has been added to + /** Set to one when track has been added to main structure. */ u32 track_index_plus_one; } elog_track_t; typedef struct { - /* CPU cycle counter. */ + /** CPU cycle counter. */ u64 cpu; - /* OS timer in nano secs since epoch Jan 1 1970. */ + /** OS timer in nano secs since epoch 3/30/2017, see elog_time_now() */ u64 os_nsec; } elog_time_stamp_t; typedef struct { - /* Total number of events in buffer. */ + /** Total number of events in buffer. */ u32 n_total_events; - /* When count reaches limit logging is disabled. This is + /** When count reaches limit logging is disabled. This is used for event triggers. */ u32 n_total_events_disable_limit; - /* Dummy event to use when logger is disabled. */ + /** Dummy event to use when logger is disabled. */ elog_event_t dummy_event; - /* Power of 2 number of elements in ring. */ + /** Power of 2 number of elements in ring. */ uword event_ring_size; - /* Vector of events (circular buffer). Power of 2 size. - Used when events are being collected. */ + /** Vector of events (circular buffer). Power of 2 size. + Used when events are being collected. */ elog_event_t *event_ring; - /* Vector of event types. */ + /** Vector of event types. */ elog_event_type_t *event_types; - /* Hash table mapping type format to type index. */ + /** Hash table mapping type format to type index. */ uword *event_type_by_format; - /* Events may refer to strings in string table. */ + /** Events may refer to strings in string table. */ char *string_table; - /* Vector of tracks. */ + /** Vector of tracks. */ elog_track_t *tracks; - /* Default track. */ + /** Default track. */ elog_track_t default_track; - /* Place holder for CPU clock frequency. */ + /** Place holder for CPU clock frequency. */ clib_time_t cpu_timer; + /** Timestamps */ elog_time_stamp_t init_time, serialize_time; - /* SMP lock, non-zero means locking required */ + /** SMP lock, non-zero means locking required */ uword *lock; - /* Use serialize_time and init_time to give estimate for - cpu clock frequency. */ + /** Use serialize_time and init_time to give estimate for + cpu clock frequency. */ f64 nsec_per_cpu_clock; - /* Vector of events converted to generic form after collection. */ + /** Vector of events converted to generic form after collection. */ elog_event_t *events; } elog_main_t; +/** @brief Return number of events in the event-log buffer + @param em elog_main_t * + @return number of events in the buffer +*/ + always_inline uword elog_n_events_in_buffer (elog_main_t * em) { return clib_min (em->n_total_events, em->event_ring_size); } +/** @brief Return number of events which can fit in the event buffer + @param em elog_main_t * + @return number of events which can fit in the buffer +*/ always_inline uword elog_buffer_capacity (elog_main_t * em) { return em->event_ring_size; } +/** @brief Reset the event buffer + @param em elog_main_t * +*/ always_inline void elog_reset_buffer (elog_main_t * em) { @@ -188,6 +210,9 @@ elog_reset_buffer (elog_main_t * em) em->n_total_events_disable_limit = ~0; } +/** @brief Enable or disable event logging + @param em elog_main_t * +*/ always_inline void elog_enable_disable (elog_main_t * em, int is_enabled) { @@ -195,18 +220,27 @@ elog_enable_disable (elog_main_t * em, int is_enabled) em->n_total_events_disable_limit = is_enabled ? ~0 : 0; } -/* Disable logging after specified number of ievents have been logged. +/** @brief disable logging after specified number of ievents have been logged. + This is used as a "debug trigger" when a certain event has occurred. Events will be logged both before and after the "event" but the - event will not be lost as long as N < RING_SIZE. */ + event will not be lost as long as N < RING_SIZE. + + @param em elog_main_t * + @param n uword number of events before disabling event logging +*/ always_inline void elog_disable_after_events (elog_main_t * em, uword n) { em->n_total_events_disable_limit = em->n_total_events + n; } -/* Signal a trigger. We do this when we encounter an event that we want to save - context around (before and after). */ +/* @brief mid-buffer logic-analyzer trigger + + Currently, only midpoint triggering is supported, but it's pretty obvious + how to generalize the scheme. + @param em elog_main_t * +*/ always_inline void elog_disable_trigger (elog_main_t * em) { @@ -214,18 +248,44 @@ elog_disable_trigger (elog_main_t * em) em->n_total_events + vec_len (em->event_ring) / 2; } -/* External function to register types/tracks. */ +/** @brief register an event type + @param em elog_main_t * + @param t elog_event_type_t * event to register + @return type index + @warning Typically not called directly +*/ + word elog_event_type_register (elog_main_t * em, elog_event_type_t * t); + +/** @brief register an event track + @param em elog_main_t * + @param t elog_track_t * track to register + @return track index + @note this function is often called directly +*/ word elog_track_register (elog_main_t * em, elog_track_t * t); +/** @brief event logging enabled predicate + @param em elog_main_t * + @return 1 if enabled, 0 if not enabled +*/ always_inline uword elog_is_enabled (elog_main_t * em) { return em->n_total_events < em->n_total_events_disable_limit; } -/* Add an event to the log. Returns a pointer to the - data for caller to write into. */ +/** @brief Allocate an event to be filled in by the caller + + Not normally called directly; this function underlies the + ELOG_DATA and ELOG_TRACK_DATA macros + + @param em elog_main_t * + @param type elog_event_type_t * type + @param track elog_track_t * track + @param cpu_time u64 current cpu tick value + @returns event to be filled in +*/ always_inline void * elog_event_data_inline (elog_main_t * em, elog_event_type_t * type, @@ -274,7 +334,17 @@ void *elog_event_data (elog_main_t * em, elog_event_type_t * type, elog_track_t * track, u64 cpu_time); -/* Non-inline version. */ +/** @brief Allocate an event to be filled in by the caller, non-inline + + Not normally called directly; this function underlies the + ELOG_DATA and ELOG_TRACK_DATA macros + + @param em elog_main_t * + @param type elog_event_type_t * type + @param track elog_track_t * track + @param cpu_time u64 current cpu tick value + @returns event to be filled in +*/ always_inline void * elog_event_data_not_inline (elog_main_t * em, elog_event_type_t * type, @@ -286,7 +356,11 @@ elog_event_data_not_inline (elog_main_t * em, return elog_event_data (em, type, track, cpu_time); } -/* Most common forms: log a single 32 bit datum, w / w-out track */ +/** @brief Log a single-datum event + @param em elog_main_t * + @param type elog_event_type_t * type + @param data u32 single datum to capture +*/ always_inline void elog (elog_main_t * em, elog_event_type_t * type, u32 data) { @@ -297,7 +371,11 @@ elog (elog_main_t * em, elog_event_type_t * type, u32 data) d[0] = data; } -/* Inline version of above. */ +/** @brief Log a single-datum event, inline version + @param em elog_main_t * + @param type elog_event_type_t * type + @param data u32 single datum to capture +*/ always_inline void elog_inline (elog_main_t * em, elog_event_type_t * type, u32 data) { @@ -308,6 +386,12 @@ elog_inline (elog_main_t * em, elog_event_type_t * type, u32 data) d[0] = data; } +/** @brief Log a single-datum event to a specific track, non-inline version + @param em elog_main_t * + @param type elog_event_type_t * type + @param type elog_event_track_t * track + @param data u32 single datum to capture +*/ always_inline void elog_track (elog_main_t * em, elog_event_type_t * type, elog_track_t * track, u32 data) @@ -319,6 +403,12 @@ elog_track (elog_main_t * em, elog_event_type_t * type, elog_track_t * track, d[0] = data; } +/** @brief Log a single-datum event to a specific track + @param em elog_main_t * + @param type elog_event_type_t * type + @param type elog_event_track_t * track + @param data u32 single datum to capture +*/ always_inline void elog_track_inline (elog_main_t * em, elog_event_type_t * type, elog_track_t * track, u32 data) @@ -392,19 +482,37 @@ elog_data_inline (elog_main_t * em, elog_event_type_t * type, #define ELOG_DATA(em,f) elog_data ((em), &__ELOG_TYPE_VAR (f), &(em)->default_track) #define ELOG_DATA_INLINE(em,f) elog_data_inline ((em), &__ELOG_TYPE_VAR (f), &(em)->default_track) +/** @brief add a string to the event-log string table + + Often combined with hashing and the T4 elog format specifier to + display complex strings in offline tooling + + @param em elog_main_t * + @param format char * + @param VARARGS + @return u32 index to add to event log +*/ u32 elog_string (elog_main_t * em, char *format, ...); + void elog_time_now (elog_time_stamp_t * et); -/* Convert ievents to events and return them as a vector. - Sets em->events to resulting vector. */ +/** @brief convert event ring events to events, and return them as a vector. + @param em elog_main_t * + @return event vector with timestamps in f64 seconds + @note sets em->events to resulting vector. +*/ elog_event_t *elog_get_events (elog_main_t * em); -/* Convert ievents to events and return them as a vector with no side effects. */ +/** @brief convert event ring events to events, and return them as a vector. + @param em elog_main_t * + @return event vector with timestamps in f64 seconds + @note no side effects +*/ elog_event_t *elog_peek_events (elog_main_t * em); /* Merge two logs, add supplied track tags. */ void elog_merge (elog_main_t * dst, u8 * dst_tag, - elog_main_t * src, u8 * src_tag); + elog_main_t * src, u8 * src_tag, f64 align_tweak); /* 2 arguments elog_main_t and elog_event_t to format event or track name. */ u8 *format_elog_event (u8 * s, va_list * va); @@ -418,7 +526,7 @@ void elog_alloc (elog_main_t * em, u32 n_events); #ifdef CLIB_UNIX always_inline clib_error_t * -elog_write_file (elog_main_t * em, char *unix_file) +elog_write_file (elog_main_t * em, char *unix_file, int flush_ring) { serialize_main_t m; clib_error_t *error; @@ -426,7 +534,7 @@ elog_write_file (elog_main_t * em, char *unix_file) error = serialize_open_unix_file (&m, unix_file); if (error) return error; - error = serialize (&m, serialize_elog_main, em); + error = serialize (&m, serialize_elog_main, em, flush_ring); if (!error) serialize_close (&m); return error; diff --git a/src/vppinfra/test_elog.c b/src/vppinfra/test_elog.c index 89905adb..1cf5ba1f 100644 --- a/src/vppinfra/test_elog.c +++ b/src/vppinfra/test_elog.c @@ -52,6 +52,8 @@ test_elog_main (unformat_input_t * input) f64 min_sample_time; char *dump_file, *load_file, *merge_file, **merge_files; u8 *tag, **tags; + f64 align_tweak; + f64 *align_tweaks; n_iter = 100; max_events = 100000; @@ -61,6 +63,7 @@ test_elog_main (unformat_input_t * input) load_file = 0; merge_files = 0; tags = 0; + align_tweaks = 0; min_sample_time = 2; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { @@ -83,6 +86,8 @@ test_elog_main (unformat_input_t * input) ; else if (unformat (input, "sample-time %f", &min_sample_time)) ; + else if (unformat (input, "align-tweak %f", &align_tweak)) + vec_add1 (align_tweaks, align_tweak); else { error = clib_error_create ("unknown input `%U'\n", @@ -102,9 +107,15 @@ test_elog_main (unformat_input_t * input) { uword i; elog_main_t *ems; - vec_clone (ems, merge_files); + /* Supply default tags as needed */ + if (vec_len (tags) < vec_len (ems)) + { + for (i = vec_len (tags); i < vec_len (ems); i++) + vec_add1 (tags, format (0, "F%d%c", i, 0)); + } + elog_init (em, max_events); for (i = 0; i < vec_len (ems); i++) { @@ -113,7 +124,10 @@ test_elog_main (unformat_input_t * input) goto done; if (i > 0) { - elog_merge (em, tags[0], &ems[i], tags[i]); + align_tweak = 0.0; + if (i <= vec_len (align_tweaks)) + align_tweak = align_tweaks[i - 1]; + elog_merge (em, tags[0], &ems[i], tags[i], align_tweak); tags[0] = 0; } } @@ -217,7 +231,8 @@ test_elog_main (unformat_input_t * input) #ifdef CLIB_UNIX if (dump_file) { - if ((error = elog_write_file (em, dump_file))) + if ((error = + elog_write_file (em, dump_file, 0 /* do not flush ring */ ))) goto done; } #endif @@ -246,6 +261,8 @@ main (int argc, char *argv[]) unformat_input_t i; int r; + clib_mem_init (0, 3ULL << 30); + unformat_init_command_line (&i, argv); r = test_elog_main (&i); unformat_free (&i); @@ -253,6 +270,42 @@ main (int argc, char *argv[]) } #endif +/** + * @brief GDB callable function: vl - Return vector length of vector + * + * @param *p - void - address of vector + * + * @return length - u32 + * + */ +u32 +vl (void *p) +{ + return vec_len (p); +} + +/** + * @brief GDB callable function: pe - call pool_elts - number of elements in a pool + * + * @param *v - void - address of pool + * + * @return number - uword + * + */ +#include +uword +pe (void *v) +{ + return (pool_elts (v)); +} + +#include +uword +he (void *v) +{ + return (hash_elts (v)); +} + /* * fd.io coding-style-patch-verification: ON * -- cgit 1.2.3-korg From 2c2b6407129461e183bc8ad5964cddc98ed1626c Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Tue, 28 Mar 2017 14:16:15 +0200 Subject: vlib: make vlib_node_interrupt_pending(...) thread safe Change-Id: I24577bd32ae23fbe8515cc8d960eab5448ce3b5c Signed-off-by: Damjan Marion --- src/vlib/main.c | 23 ++++++++++++++++++----- src/vlib/node.h | 2 ++ src/vlib/node_funcs.h | 2 ++ 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/main.c b/src/vlib/main.c index a2726860..b22203f0 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -1414,6 +1414,7 @@ vlib_main_or_worker_loop (vlib_main_t * vm, int is_main) uword i; u64 cpu_time_now; vlib_frame_queue_main_t *fqm; + u32 *last_node_runtime_indices = 0; /* Initialize pending node vector. */ if (is_main) @@ -1442,8 +1443,13 @@ vlib_main_or_worker_loop (vlib_main_t * vm, int is_main) vec_alloc (nm->data_from_advancing_timing_wheel, 32); } - /* Pre-allocate expired nodes. */ + /* Pre-allocate interupt runtime indices and lock. */ vec_alloc (nm->pending_interrupt_node_runtime_indices, 32); + vec_alloc (last_node_runtime_indices, 32); + if (!is_main) + clib_spinlock_init (&nm->pending_interrupt_lock); + + /* Pre-allocate expired nodes. */ if (!nm->polling_threshold_vector_length) nm->polling_threshold_vector_length = 10; if (!nm->interrupt_threshold_vector_length) @@ -1505,13 +1511,20 @@ vlib_main_or_worker_loop (vlib_main_t * vm, int is_main) uword i; if (l > 0) { - _vec_len (nm->pending_interrupt_node_runtime_indices) = 0; + u32 *tmp; + if (!is_main) + clib_spinlock_lock (&nm->pending_interrupt_lock); + tmp = nm->pending_interrupt_node_runtime_indices; + nm->pending_interrupt_node_runtime_indices = + last_node_runtime_indices; + last_node_runtime_indices = tmp; + _vec_len (last_node_runtime_indices) = 0; + if (!is_main) + clib_spinlock_unlock (&nm->pending_interrupt_lock); for (i = 0; i < l; i++) { n = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT], - nm-> - pending_interrupt_node_runtime_indices - [i]); + last_node_runtime_indices[i]); cpu_time_now = dispatch_node (vm, n, VLIB_NODE_TYPE_INPUT, VLIB_NODE_STATE_INTERRUPT, diff --git a/src/vlib/node.h b/src/vlib/node.h index 2a532cc3..fc7e7da2 100644 --- a/src/vlib/node.h +++ b/src/vlib/node.h @@ -42,6 +42,7 @@ #include #include +#include #include #include /* for vlib_trace_filter_t */ @@ -644,6 +645,7 @@ typedef struct /* Node runtime indices for input nodes with pending interrupts. */ u32 *pending_interrupt_node_runtime_indices; + clib_spinlock_t pending_interrupt_lock; /* Input nodes are switched from/to interrupt to/from polling mode when average vector length goes above/below polling/interrupt diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index 8ccfc438..1f7d94e1 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -183,7 +183,9 @@ vlib_node_set_interrupt_pending (vlib_main_t * vm, u32 node_index) vlib_node_main_t *nm = &vm->node_main; vlib_node_t *n = vec_elt (nm->nodes, node_index); ASSERT (n->type == VLIB_NODE_TYPE_INPUT); + clib_spinlock_lock_if_init (&nm->pending_interrupt_lock); vec_add1 (nm->pending_interrupt_node_runtime_indices, n->runtime_index); + clib_spinlock_unlock_if_init (&nm->pending_interrupt_lock); } always_inline vlib_process_t * -- cgit 1.2.3-korg From 586afd762bfa149f5ca167bd5fd5a0cd59ce94fe Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 5 Apr 2017 19:18:20 +0200 Subject: Use thread local storage for thread index This patch deprecates stack-based thread identification, Also removes requirement that thread stacks are adjacent. Finally, possibly annoying for some folks, it renames all occurences of cpu_index and cpu_number with thread index. Using word "cpu" is misleading here as thread can be migrated ti different CPU, and also it is not related to linux cpu index. Change-Id: I68cdaf661e701d2336fc953dcb9978d10a70f7c1 Signed-off-by: Damjan Marion --- src/examples/srv6-sample-localsid/node.c | 4 +- src/plugins/dpdk/buffer.c | 2 +- src/plugins/dpdk/device/device.c | 8 +- src/plugins/dpdk/device/dpdk_priv.h | 8 +- src/plugins/dpdk/device/init.c | 2 +- src/plugins/dpdk/device/node.c | 32 +++--- src/plugins/dpdk/hqos/hqos.c | 16 +-- src/plugins/dpdk/ipsec/cli.c | 8 +- src/plugins/dpdk/ipsec/crypto_node.c | 4 +- src/plugins/dpdk/ipsec/esp.h | 4 +- src/plugins/dpdk/ipsec/esp_decrypt.c | 4 +- src/plugins/dpdk/ipsec/esp_encrypt.c | 5 +- src/plugins/dpdk/ipsec/ipsec.c | 2 +- src/plugins/dpdk/ipsec/ipsec.h | 4 +- src/plugins/dpdk/main.c | 2 +- src/plugins/flowperpkt/l2_node.c | 2 +- src/plugins/flowperpkt/node.c | 2 +- src/plugins/ioam/export-common/ioam_export.h | 6 +- .../ioam/ip6/ioam_cache_tunnel_select_node.c | 16 +-- src/plugins/ixge/ixge.c | 2 +- src/plugins/lb/lb.c | 8 +- src/plugins/lb/node.c | 22 ++-- src/plugins/lb/refcount.c | 8 +- src/plugins/lb/refcount.h | 4 +- src/plugins/memif/node.c | 35 +++--- src/plugins/snat/in2out.c | 110 +++++++++--------- src/plugins/snat/out2in.c | 102 ++++++++--------- src/plugins/snat/snat.h | 10 +- src/vlib/buffer.c | 6 +- src/vlib/buffer_funcs.h | 4 +- src/vlib/cli.c | 6 +- src/vlib/counter.h | 16 +-- src/vlib/error.c | 2 +- src/vlib/global_funcs.h | 2 +- src/vlib/main.c | 14 +-- src/vlib/main.h | 2 +- src/vlib/node.c | 2 +- src/vlib/node.h | 6 +- src/vlib/node_funcs.h | 8 +- src/vlib/threads.c | 69 ++++------- src/vlib/threads.h | 21 ++-- src/vlib/unix/cj.c | 7 +- src/vlib/unix/cj.h | 2 +- src/vlib/unix/main.c | 43 +++---- src/vnet/adj/adj_l2.c | 4 +- src/vnet/adj/adj_midchain.c | 8 +- src/vnet/adj/adj_nsh.c | 4 +- src/vnet/classify/vnet_classify.c | 16 +-- src/vnet/cop/ip4_whitelist.c | 8 +- src/vnet/cop/ip6_whitelist.c | 8 +- src/vnet/devices/af_packet/node.c | 20 ++-- src/vnet/devices/devices.c | 61 +++++----- src/vnet/devices/devices.h | 18 +-- src/vnet/devices/netmap/node.c | 24 ++-- src/vnet/devices/ssvm/node.c | 6 +- src/vnet/devices/virtio/vhost-user.c | 127 +++++++++++---------- src/vnet/dpo/lookup_dpo.c | 20 ++-- src/vnet/dpo/replicate_dpo.c | 12 +- src/vnet/ethernet/arp.c | 2 +- src/vnet/ethernet/interface.c | 7 +- src/vnet/ethernet/node.c | 14 +-- src/vnet/gre/node.c | 8 +- src/vnet/interface.h | 2 +- src/vnet/interface_output.c | 53 ++++----- src/vnet/ip/ip4_forward.c | 34 +++--- src/vnet/ip/ip4_input.c | 8 +- src/vnet/ip/ip6_forward.c | 24 ++-- src/vnet/ip/ip6_input.c | 8 +- src/vnet/ip/ip6_neighbor.c | 4 +- src/vnet/ipsec/esp.h | 8 +- src/vnet/ipsec/esp_decrypt.c | 13 ++- src/vnet/ipsec/esp_encrypt.c | 13 ++- src/vnet/ipsec/ikev2.c | 64 ++++++----- src/vnet/ipsec/ipsec.h | 12 +- src/vnet/ipsec/ipsec_if.c | 2 +- src/vnet/l2/l2_bvi.h | 2 +- src/vnet/l2/l2_input.c | 14 +-- src/vnet/l2/l2_output.c | 6 +- src/vnet/l2tp/decap.c | 2 +- src/vnet/l2tp/encap.c | 2 +- src/vnet/l2tp/l2tp.c | 6 +- src/vnet/lisp-gpe/decap.c | 16 +-- src/vnet/lldp/lldp_input.c | 2 +- src/vnet/map/ip4_map.c | 14 +-- src/vnet/map/ip4_map_t.c | 12 +- src/vnet/map/ip6_map.c | 19 +-- src/vnet/map/ip6_map_t.c | 12 +- src/vnet/mpls/mpls_input.c | 8 +- src/vnet/mpls/mpls_lookup.c | 20 ++-- src/vnet/mpls/mpls_output.c | 10 +- src/vnet/pg/input.c | 4 +- src/vnet/replication.c | 20 ++-- src/vnet/replication.h | 2 +- src/vnet/session/node.c | 2 +- src/vnet/sr/sr_localsid.c | 44 +++---- src/vnet/tcp/builtin_client.c | 2 +- src/vnet/tcp/tcp.c | 8 +- src/vnet/tcp/tcp_debug.h | 2 +- src/vnet/tcp/tcp_input.c | 10 +- src/vnet/tcp/tcp_output.c | 20 ++-- src/vnet/udp/udp_input.c | 2 +- src/vnet/unix/tapcli.c | 2 +- src/vnet/unix/tuntap.c | 4 +- src/vnet/vxlan-gpe/decap.c | 10 +- src/vnet/vxlan-gpe/encap.c | 12 +- src/vnet/vxlan/decap.c | 10 +- src/vnet/vxlan/encap.c | 12 +- src/vpp/stats/stats.c | 14 +-- src/vpp/stats/stats.h | 2 +- 109 files changed, 790 insertions(+), 791 deletions(-) (limited to 'src/vlib') diff --git a/src/examples/srv6-sample-localsid/node.c b/src/examples/srv6-sample-localsid/node.c index 7bae9cd7..e83e2352 100644 --- a/src/examples/srv6-sample-localsid/node.c +++ b/src/examples/srv6-sample-localsid/node.c @@ -114,7 +114,7 @@ srv6_localsid_sample_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_fram from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -168,7 +168,7 @@ srv6_localsid_sample_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_fram /* This increments the SRv6 per LocalSID counters.*/ vlib_increment_combined_counter (((next0 == SRV6_SAMPLE_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : &(sm->sr_ls_valid_counters)), - cpu_index, + thread_index, ls0 - sm->localsids, 1, vlib_buffer_length_in_chain (vm, b0)); diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c index 2765c292..c80b3fa8 100644 --- a/src/plugins/dpdk/buffer.c +++ b/src/plugins/dpdk/buffer.c @@ -132,7 +132,7 @@ dpdk_buffer_delete_free_list (vlib_main_t * vm, u32 free_list_index) u32 merge_index; int i; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); f = vlib_buffer_get_free_list (vm, free_list_index); diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index 50b26689..91661246 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -243,7 +243,7 @@ static_always_inline ASSERT (ring->tx_tail == 0); n_retry = 16; - queue_id = vm->cpu_index; + queue_id = vm->thread_index; do { @@ -266,7 +266,7 @@ static_always_inline { /* no wrap, transmit in one burst */ dpdk_device_hqos_per_worker_thread_t *hqos = - &xd->hqos_wt[vm->cpu_index]; + &xd->hqos_wt[vm->thread_index]; ASSERT (hqos->swq != NULL); @@ -332,7 +332,7 @@ dpdk_buffer_recycle (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t * b, u32 bi, struct rte_mbuf **mbp) { dpdk_main_t *dm = &dpdk_main; - u32 my_cpu = vm->cpu_index; + u32 my_cpu = vm->thread_index; struct rte_mbuf *mb_new; if (PREDICT_FALSE (b->flags & VLIB_BUFFER_RECYCLE) == 0) @@ -376,7 +376,7 @@ dpdk_interface_tx (vlib_main_t * vm, tx_ring_hdr_t *ring; u32 n_on_ring; - my_cpu = vm->cpu_index; + my_cpu = vm->thread_index; queue_id = my_cpu; diff --git a/src/plugins/dpdk/device/dpdk_priv.h b/src/plugins/dpdk/device/dpdk_priv.h index dd40ff48..52b4ca4b 100644 --- a/src/plugins/dpdk/device/dpdk_priv.h +++ b/src/plugins/dpdk/device/dpdk_priv.h @@ -79,7 +79,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now) { vlib_simple_counter_main_t *cm; vnet_main_t *vnm = vnet_get_main (); - u32 my_cpu = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u64 rxerrors, last_rxerrors; /* only update counters for PMD interfaces */ @@ -96,7 +96,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now) cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_RX_NO_BUF); - vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index, xd->stats.rx_nombuf - xd->last_stats.rx_nombuf); } @@ -107,7 +107,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now) cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_RX_MISS); - vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index, xd->stats.imissed - xd->last_stats.imissed); } @@ -119,7 +119,7 @@ dpdk_update_counters (dpdk_device_t * xd, f64 now) cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_RX_ERROR); - vlib_increment_simple_counter (cm, my_cpu, xd->vlib_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, xd->vlib_sw_if_index, rxerrors - last_rxerrors); } diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 538db6cb..7eaf8da7 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -324,7 +324,7 @@ dpdk_port_setup (dpdk_main_t * dm, dpdk_device_t * xd) int rv; int j; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); if (xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) { diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index e740fd18..b10e0fad 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -283,7 +283,7 @@ dpdk_buffer_init_from_template (void *d0, void *d1, void *d2, void *d3, */ static_always_inline u32 dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, - vlib_node_runtime_t * node, u32 cpu_index, u16 queue_id, + vlib_node_runtime_t * node, u32 thread_index, u16 queue_id, int maybe_multiseg) { u32 n_buffers; @@ -294,7 +294,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, uword n_rx_bytes = 0; u32 n_trace, trace_cnt __attribute__ ((unused)); vlib_buffer_free_list_t *fl; - vlib_buffer_t *bt = vec_elt_at_index (dm->buffer_templates, cpu_index); + vlib_buffer_t *bt = vec_elt_at_index (dm->buffer_templates, thread_index); if ((xd->flags & DPDK_DEVICE_FLAG_ADMIN_UP) == 0) return 0; @@ -306,7 +306,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, return 0; } - vec_reset_length (xd->d_trace_buffers[cpu_index]); + vec_reset_length (xd->d_trace_buffers[thread_index]); trace_cnt = n_trace = vlib_get_trace_count (vm, node); if (n_trace > 0) @@ -318,7 +318,7 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, { struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index++]; vlib_buffer_t *b = vlib_buffer_from_rte_mbuf (mb); - vec_add1 (xd->d_trace_buffers[cpu_index], + vec_add1 (xd->d_trace_buffers[thread_index], vlib_get_buffer_index (vm, b)); } } @@ -546,20 +546,22 @@ dpdk_device_input (dpdk_main_t * dm, dpdk_device_t * xd, vlib_put_next_frame (vm, node, next_index, n_left_to_next); } - if (PREDICT_FALSE (vec_len (xd->d_trace_buffers[cpu_index]) > 0)) + if (PREDICT_FALSE (vec_len (xd->d_trace_buffers[thread_index]) > 0)) { - dpdk_rx_trace (dm, node, xd, queue_id, xd->d_trace_buffers[cpu_index], - vec_len (xd->d_trace_buffers[cpu_index])); - vlib_set_trace_count (vm, node, n_trace - - vec_len (xd->d_trace_buffers[cpu_index])); + dpdk_rx_trace (dm, node, xd, queue_id, + xd->d_trace_buffers[thread_index], + vec_len (xd->d_trace_buffers[thread_index])); + vlib_set_trace_count (vm, node, + n_trace - + vec_len (xd->d_trace_buffers[thread_index])); } vlib_increment_combined_counter (vnet_get_main ()->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, xd->vlib_sw_if_index, mb_index, n_rx_bytes); + thread_index, xd->vlib_sw_if_index, mb_index, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, mb_index); + vnet_device_increment_rx_packets (thread_index, mb_index); return mb_index; } @@ -630,19 +632,19 @@ dpdk_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * f) dpdk_device_t *xd; uword n_rx_packets = 0; dpdk_device_and_queue_t *dq; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); /* * Poll all devices on this cpu for input/interrupts. */ /* *INDENT-OFF* */ - vec_foreach (dq, dm->devices_by_cpu[cpu_index]) + vec_foreach (dq, dm->devices_by_cpu[thread_index]) { xd = vec_elt_at_index(dm->devices, dq->device); if (xd->flags & DPDK_DEVICE_FLAG_MAYBE_MULTISEG) - n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id, /* maybe_multiseg */ 1); + n_rx_packets += dpdk_device_input (dm, xd, node, thread_index, dq->queue_id, /* maybe_multiseg */ 1); else - n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id, /* maybe_multiseg */ 0); + n_rx_packets += dpdk_device_input (dm, xd, node, thread_index, dq->queue_id, /* maybe_multiseg */ 0); } /* *INDENT-ON* */ diff --git a/src/plugins/dpdk/hqos/hqos.c b/src/plugins/dpdk/hqos/hqos.c index a288fca7..8b251beb 100644 --- a/src/plugins/dpdk/hqos/hqos.c +++ b/src/plugins/dpdk/hqos/hqos.c @@ -397,7 +397,7 @@ static_always_inline void dpdk_hqos_thread_internal_hqos_dbg_bypass (vlib_main_t * vm) { dpdk_main_t *dm = &dpdk_main; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; u32 dev_pos; dev_pos = 0; @@ -405,12 +405,12 @@ dpdk_hqos_thread_internal_hqos_dbg_bypass (vlib_main_t * vm) { vlib_worker_thread_barrier_check (); - u32 n_devs = vec_len (dm->devices_by_hqos_cpu[cpu_index]); + u32 n_devs = vec_len (dm->devices_by_hqos_cpu[thread_index]); if (dev_pos >= n_devs) dev_pos = 0; dpdk_device_and_queue_t *dq = - vec_elt_at_index (dm->devices_by_hqos_cpu[cpu_index], dev_pos); + vec_elt_at_index (dm->devices_by_hqos_cpu[thread_index], dev_pos); dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device); dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht; @@ -479,7 +479,7 @@ static_always_inline void dpdk_hqos_thread_internal (vlib_main_t * vm) { dpdk_main_t *dm = &dpdk_main; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; u32 dev_pos; dev_pos = 0; @@ -487,7 +487,7 @@ dpdk_hqos_thread_internal (vlib_main_t * vm) { vlib_worker_thread_barrier_check (); - u32 n_devs = vec_len (dm->devices_by_hqos_cpu[cpu_index]); + u32 n_devs = vec_len (dm->devices_by_hqos_cpu[thread_index]); if (PREDICT_FALSE (n_devs == 0)) { dev_pos = 0; @@ -497,7 +497,7 @@ dpdk_hqos_thread_internal (vlib_main_t * vm) dev_pos = 0; dpdk_device_and_queue_t *dq = - vec_elt_at_index (dm->devices_by_hqos_cpu[cpu_index], dev_pos); + vec_elt_at_index (dm->devices_by_hqos_cpu[thread_index], dev_pos); dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device); dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht; @@ -586,7 +586,7 @@ dpdk_hqos_thread (vlib_worker_thread_t * w) vm = vlib_get_main (); - ASSERT (vm->cpu_index == os_get_cpu_number ()); + ASSERT (vm->thread_index == vlib_get_thread_index ()); clib_time_init (&vm->clib_time); clib_mem_set_heap (w->thread_mheap); @@ -595,7 +595,7 @@ dpdk_hqos_thread (vlib_worker_thread_t * w) while (tm->worker_thread_release == 0) vlib_worker_thread_barrier_check (); - if (vec_len (dm->devices_by_hqos_cpu[vm->cpu_index]) == 0) + if (vec_len (dm->devices_by_hqos_cpu[vm->thread_index]) == 0) return clib_error ("current I/O TX thread does not have any devices assigned to it"); diff --git a/src/plugins/dpdk/ipsec/cli.c b/src/plugins/dpdk/ipsec/cli.c index cd0a6037..3ae8c9b8 100644 --- a/src/plugins/dpdk/ipsec/cli.c +++ b/src/plugins/dpdk/ipsec/cli.c @@ -42,8 +42,8 @@ dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display) for (i = 0; i < tm->n_vlib_mains; i++) { uword key, data; - u32 cpu_index = vlib_mains[i]->cpu_index; - crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index]; + u32 thread_index = vlib_mains[i]->thread_index; + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; u8 *s = 0; if (skip_master) @@ -57,7 +57,7 @@ dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display) i32 last_cdev = -1; crypto_qp_data_t *qpd; - s = format (s, "%u\t", cpu_index); + s = format (s, "%u\t", thread_index); /* *INDENT-OFF* */ vec_foreach (qpd, cwm->qp_data) @@ -95,7 +95,7 @@ dpdk_ipsec_show_mapping (vlib_main_t * vm, u16 detail_display) cap.sym.auth.algo = p_key->auth_algo; check_algo_is_supported (&cap, auth_str); vlib_cli_output (vm, "%u\t%10s\t%15s\t%3s\t%u\t%u\n", - vlib_mains[i]->cpu_index, cipher_str, auth_str, + vlib_mains[i]->thread_index, cipher_str, auth_str, p_key->is_outbound ? "out" : "in", cwm->qp_data[data].dev_id, cwm->qp_data[data].qp_id); diff --git a/src/plugins/dpdk/ipsec/crypto_node.c b/src/plugins/dpdk/ipsec/crypto_node.c index dc3452b2..a3c45902 100644 --- a/src/plugins/dpdk/ipsec/crypto_node.c +++ b/src/plugins/dpdk/ipsec/crypto_node.c @@ -171,9 +171,9 @@ static uword dpdk_crypto_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); dpdk_crypto_main_t *dcm = &dpdk_crypto_main; - crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index]; + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; crypto_qp_data_t *qpd; u32 n_deq = 0; diff --git a/src/plugins/dpdk/ipsec/esp.h b/src/plugins/dpdk/ipsec/esp.h index 320295b1..56f0c756 100644 --- a/src/plugins/dpdk/ipsec/esp.h +++ b/src/plugins/dpdk/ipsec/esp.h @@ -170,9 +170,9 @@ static_always_inline int create_sym_sess (ipsec_sa_t * sa, crypto_sa_session_t * sa_sess, u8 is_outbound) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); dpdk_crypto_main_t *dcm = &dpdk_crypto_main; - crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index]; + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; struct rte_crypto_sym_xform cipher_xform = { 0 }; struct rte_crypto_sym_xform auth_xform = { 0 }; struct rte_crypto_sym_xform *xfs; diff --git a/src/plugins/dpdk/ipsec/esp_decrypt.c b/src/plugins/dpdk/ipsec/esp_decrypt.c index 286e03f8..bab76e3b 100644 --- a/src/plugins/dpdk/ipsec/esp_decrypt.c +++ b/src/plugins/dpdk/ipsec/esp_decrypt.c @@ -88,7 +88,7 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm, { u32 n_left_from, *from, *to_next, next_index; ipsec_main_t *im = &ipsec_main; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); dpdk_crypto_main_t * dcm = &dpdk_crypto_main; dpdk_esp_main_t * em = &dpdk_esp_main; u32 i; @@ -104,7 +104,7 @@ dpdk_esp_decrypt_node_fn (vlib_main_t * vm, return n_left_from; } - crypto_worker_main_t *cwm = vec_elt_at_index(dcm->workers_main, cpu_index); + crypto_worker_main_t *cwm = vec_elt_at_index(dcm->workers_main, thread_index); u32 n_qps = vec_len(cwm->qp_data); struct rte_crypto_op ** cops_to_enq[n_qps]; u32 n_cop_qp[n_qps], * bi_to_enq[n_qps]; diff --git a/src/plugins/dpdk/ipsec/esp_encrypt.c b/src/plugins/dpdk/ipsec/esp_encrypt.c index 5b03de73..f996d7df 100644 --- a/src/plugins/dpdk/ipsec/esp_encrypt.c +++ b/src/plugins/dpdk/ipsec/esp_encrypt.c @@ -93,7 +93,7 @@ dpdk_esp_encrypt_node_fn (vlib_main_t * vm, { u32 n_left_from, *from, *to_next, next_index; ipsec_main_t *im = &ipsec_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); dpdk_crypto_main_t *dcm = &dpdk_crypto_main; dpdk_esp_main_t *em = &dpdk_esp_main; u32 i; @@ -111,7 +111,8 @@ dpdk_esp_encrypt_node_fn (vlib_main_t * vm, return n_left_from; } - crypto_worker_main_t *cwm = vec_elt_at_index (dcm->workers_main, cpu_index); + crypto_worker_main_t *cwm = + vec_elt_at_index (dcm->workers_main, thread_index); u32 n_qps = vec_len (cwm->qp_data); struct rte_crypto_op **cops_to_enq[n_qps]; u32 n_cop_qp[n_qps], *bi_to_enq[n_qps]; diff --git a/src/plugins/dpdk/ipsec/ipsec.c b/src/plugins/dpdk/ipsec/ipsec.c index b0aaaaec..5d8f4fba 100644 --- a/src/plugins/dpdk/ipsec/ipsec.c +++ b/src/plugins/dpdk/ipsec/ipsec.c @@ -289,7 +289,7 @@ dpdk_ipsec_process (vlib_main_t * vm, vlib_node_runtime_t * rt, if (!map) { clib_warning ("unable to create hash table for worker %u", - vlib_mains[i]->cpu_index); + vlib_mains[i]->thread_index); goto error; } cwm->algo_qp_map = map; diff --git a/src/plugins/dpdk/ipsec/ipsec.h b/src/plugins/dpdk/ipsec/ipsec.h index 28bffc80..f0f793c0 100644 --- a/src/plugins/dpdk/ipsec/ipsec.h +++ b/src/plugins/dpdk/ipsec/ipsec.h @@ -95,8 +95,8 @@ static_always_inline void crypto_alloc_cops () { dpdk_crypto_main_t *dcm = &dpdk_crypto_main; - u32 cpu_index = os_get_cpu_number (); - crypto_worker_main_t *cwm = &dcm->workers_main[cpu_index]; + u32 thread_index = vlib_get_thread_index (); + crypto_worker_main_t *cwm = &dcm->workers_main[thread_index]; unsigned socket_id = rte_socket_id (); crypto_qp_data_t *qpd; diff --git a/src/plugins/dpdk/main.c b/src/plugins/dpdk/main.c index 7ee2a785..942b8b2d 100644 --- a/src/plugins/dpdk/main.c +++ b/src/plugins/dpdk/main.c @@ -39,7 +39,7 @@ rte_delay_us_override (unsigned us) * thread then do not intercept. (Must not be called from an * independent pthread). */ - if (os_get_cpu_number () == 0) + if (vlib_get_thread_index () == 0) { /* * We're in the vlib main thread or a vlib process. Make sure diff --git a/src/plugins/flowperpkt/l2_node.c b/src/plugins/flowperpkt/l2_node.c index 1c2f681e..fdaf81d1 100644 --- a/src/plugins/flowperpkt/l2_node.c +++ b/src/plugins/flowperpkt/l2_node.c @@ -102,7 +102,7 @@ add_to_flow_record_l2 (vlib_main_t * vm, u8 * src_mac, u8 * dst_mac, u16 ethertype, u64 timestamp, u16 length, int do_flush) { - u32 my_cpu_number = vm->cpu_index; + u32 my_cpu_number = vm->thread_index; flow_report_main_t *frm = &flow_report_main; ip4_header_t *ip; udp_header_t *udp; diff --git a/src/plugins/flowperpkt/node.c b/src/plugins/flowperpkt/node.c index f77f087d..0277682d 100644 --- a/src/plugins/flowperpkt/node.c +++ b/src/plugins/flowperpkt/node.c @@ -101,7 +101,7 @@ add_to_flow_record_ipv4 (vlib_main_t * vm, u32 src_address, u32 dst_address, u8 tos, u64 timestamp, u16 length, int do_flush) { - u32 my_cpu_number = vm->cpu_index; + u32 my_cpu_number = vm->thread_index; flow_report_main_t *frm = &flow_report_main; ip4_header_t *ip; udp_header_t *udp; diff --git a/src/plugins/ioam/export-common/ioam_export.h b/src/plugins/ioam/export-common/ioam_export.h index 2bf3fd54..9de0d13b 100644 --- a/src/plugins/ioam/export-common/ioam_export.h +++ b/src/plugins/ioam/export-common/ioam_export.h @@ -477,8 +477,8 @@ do { \ from = vlib_frame_vector_args (F); \ n_left_from = (F)->n_vectors; \ next_index = (N)->cached_next_index; \ - while (__sync_lock_test_and_set ((EM)->lockp[(VM)->cpu_index], 1)); \ - my_buf = ioam_export_get_my_buffer (EM, (VM)->cpu_index); \ + while (__sync_lock_test_and_set ((EM)->lockp[(VM)->thread_index], 1)); \ + my_buf = ioam_export_get_my_buffer (EM, (VM)->thread_index); \ my_buf->touched_at = vlib_time_now (VM); \ while (n_left_from > 0) \ { \ @@ -620,7 +620,7 @@ do { \ } \ vlib_node_increment_counter (VM, export_node.index, \ EXPORT_ERROR_RECORDED, pkts_recorded); \ - *(EM)->lockp[(VM)->cpu_index] = 0; \ + *(EM)->lockp[(VM)->thread_index] = 0; \ } while(0) #endif /* __included_ioam_export_h__ */ diff --git a/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c b/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c index a56dc040..0cf742c9 100644 --- a/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c +++ b/src/plugins/ioam/ip6/ioam_cache_tunnel_select_node.c @@ -396,7 +396,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, clib_net_to_host_u32 (tcp0->seq_number) + 1, no_of_responses, now, - vm->cpu_index, &pool_index0)) + vm->thread_index, &pool_index0)) { cache_ts_added++; } @@ -419,7 +419,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, e2e = (ioam_e2e_cache_option_t *) ((u8 *) hbh0 + cm->rewrite_pool_index_offset); - e2e->pool_id = (u8) vm->cpu_index; + e2e->pool_id = (u8) vm->thread_index; e2e->pool_index = pool_index0; ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *) ((u8 *) e2e + @@ -455,7 +455,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, clib_net_to_host_u32 (tcp1->seq_number) + 1, no_of_responses, now, - vm->cpu_index, &pool_index1)) + vm->thread_index, &pool_index1)) { cache_ts_added++; } @@ -479,7 +479,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, e2e = (ioam_e2e_cache_option_t *) ((u8 *) hbh1 + cm->rewrite_pool_index_offset); - e2e->pool_id = (u8) vm->cpu_index; + e2e->pool_id = (u8) vm->thread_index; e2e->pool_index = pool_index1; ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *) ((u8 *) e2e + @@ -562,7 +562,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, clib_net_to_host_u32 (tcp0->seq_number) + 1, no_of_responses, now, - vm->cpu_index, &pool_index0)) + vm->thread_index, &pool_index0)) { cache_ts_added++; } @@ -585,7 +585,7 @@ ip6_reset_ts_hbh_node_fn (vlib_main_t * vm, e2e = (ioam_e2e_cache_option_t *) ((u8 *) hbh0 + cm->rewrite_pool_index_offset); - e2e->pool_id = (u8) vm->cpu_index; + e2e->pool_id = (u8) vm->thread_index; e2e->pool_index = pool_index0; ioam_e2e_id_rewrite_handler ((ioam_e2e_id_option_t *) ((u8 *) e2e + @@ -701,7 +701,7 @@ expired_cache_ts_timer_callback (u32 * expired_timers) ioam_cache_main_t *cm = &ioam_cache_main; int i; u32 pool_index; - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 count = 0; for (i = 0; i < vec_len (expired_timers); i++) @@ -724,7 +724,7 @@ ioam_cache_ts_timer_tick_node_fn (vlib_main_t * vm, vlib_frame_t * f) { ioam_cache_main_t *cm = &ioam_cache_main; - u32 my_thread_index = os_get_cpu_number (); + u32 my_thread_index = vlib_get_thread_index (); struct timespec ts, tsrem; tw_timer_expire_timers_16t_2w_512sl (&cm->timer_wheels[my_thread_index], diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c index f3c5cc09..08f5b692 100644 --- a/src/plugins/ixge/ixge.c +++ b/src/plugins/ixge/ixge.c @@ -1887,7 +1887,7 @@ done: vlib_increment_combined_counter (vnet_main. interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - 0 /* cpu_index */ , + 0 /* thread_index */ , xd->vlib_sw_if_index, n_packets, dq->rx.n_bytes); diff --git a/src/plugins/lb/lb.c b/src/plugins/lb/lb.c index add81236..addc2a42 100644 --- a/src/plugins/lb/lb.c +++ b/src/plugins/lb/lb.c @@ -63,11 +63,11 @@ u8 *format_lb_main (u8 * s, va_list * args) s = format(s, " #vips: %u\n", pool_elts(lbm->vips)); s = format(s, " #ass: %u\n", pool_elts(lbm->ass) - 1); - u32 cpu_index; - for(cpu_index = 0; cpu_index < tm->n_vlib_mains; cpu_index++ ) { - lb_hash_t *h = lbm->per_cpu[cpu_index].sticky_ht; + u32 thread_index; + for(thread_index = 0; thread_index < tm->n_vlib_mains; thread_index++ ) { + lb_hash_t *h = lbm->per_cpu[thread_index].sticky_ht; if (h) { - s = format(s, "core %d\n", cpu_index); + s = format(s, "core %d\n", thread_index); s = format(s, " timeout: %ds\n", h->timeout); s = format(s, " usage: %d / %d\n", lb_hash_elts(h, lb_hash_time_now(vlib_get_main())), lb_hash_size(h)); } diff --git a/src/plugins/lb/node.c b/src/plugins/lb/node.c index 8b763c53..3171148b 100644 --- a/src/plugins/lb/node.c +++ b/src/plugins/lb/node.c @@ -60,10 +60,10 @@ format_lb_trace (u8 * s, va_list * args) return s; } -lb_hash_t *lb_get_sticky_table(u32 cpu_index) +lb_hash_t *lb_get_sticky_table(u32 thread_index) { lb_main_t *lbm = &lb_main; - lb_hash_t *sticky_ht = lbm->per_cpu[cpu_index].sticky_ht; + lb_hash_t *sticky_ht = lbm->per_cpu[thread_index].sticky_ht; //Check if size changed if (PREDICT_FALSE(sticky_ht && (lbm->per_cpu_sticky_buckets != lb_hash_nbuckets(sticky_ht)))) { @@ -71,8 +71,8 @@ lb_hash_t *lb_get_sticky_table(u32 cpu_index) lb_hash_bucket_t *b; u32 i; lb_hash_foreach_entry(sticky_ht, b, i) { - vlib_refcount_add(&lbm->as_refcount, cpu_index, b->value[i], -1); - vlib_refcount_add(&lbm->as_refcount, cpu_index, 0, 1); + vlib_refcount_add(&lbm->as_refcount, thread_index, b->value[i], -1); + vlib_refcount_add(&lbm->as_refcount, thread_index, 0, 1); } lb_hash_free(sticky_ht); @@ -81,8 +81,8 @@ lb_hash_t *lb_get_sticky_table(u32 cpu_index) //Create if necessary if (PREDICT_FALSE(sticky_ht == NULL)) { - lbm->per_cpu[cpu_index].sticky_ht = lb_hash_alloc(lbm->per_cpu_sticky_buckets, lbm->flow_timeout); - sticky_ht = lbm->per_cpu[cpu_index].sticky_ht; + lbm->per_cpu[thread_index].sticky_ht = lb_hash_alloc(lbm->per_cpu_sticky_buckets, lbm->flow_timeout); + sticky_ht = lbm->per_cpu[thread_index].sticky_ht; clib_warning("Regenerated sticky table %p", sticky_ht); } @@ -153,10 +153,10 @@ lb_node_fn (vlib_main_t * vm, { lb_main_t *lbm = &lb_main; u32 n_left_from, *from, next_index, *to_next, n_left_to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 lb_time = lb_hash_time_now(vm); - lb_hash_t *sticky_ht = lb_get_sticky_table(cpu_index); + lb_hash_t *sticky_ht = lb_get_sticky_table(thread_index); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; next_index = node->cached_next_index; @@ -240,9 +240,9 @@ lb_node_fn (vlib_main_t * vm, //Configuration may be changed, vectors resized, etc... //Dereference previously used - vlib_refcount_add(&lbm->as_refcount, cpu_index, + vlib_refcount_add(&lbm->as_refcount, thread_index, lb_hash_available_value(sticky_ht, hash0, available_index0), -1); - vlib_refcount_add(&lbm->as_refcount, cpu_index, + vlib_refcount_add(&lbm->as_refcount, thread_index, asindex0, 1); //Add sticky entry @@ -260,7 +260,7 @@ lb_node_fn (vlib_main_t * vm, } vlib_increment_simple_counter(&lbm->vip_counters[counter], - cpu_index, + thread_index, vnet_buffer (p0)->ip.adj_index[VLIB_TX], 1); diff --git a/src/plugins/lb/refcount.c b/src/plugins/lb/refcount.c index 22415c88..6f01ab5a 100644 --- a/src/plugins/lb/refcount.c +++ b/src/plugins/lb/refcount.c @@ -31,10 +31,10 @@ u64 vlib_refcount_get(vlib_refcount_t *r, u32 index) { u64 count = 0; vlib_thread_main_t *tm = vlib_get_thread_main (); - u32 cpu_index; - for (cpu_index = 0; cpu_index < tm->n_vlib_mains; cpu_index++) { - if (r->per_cpu[cpu_index].length > index) - count += r->per_cpu[cpu_index].counters[index]; + u32 thread_index; + for (thread_index = 0; thread_index < tm->n_vlib_mains; thread_index++) { + if (r->per_cpu[thread_index].length > index) + count += r->per_cpu[thread_index].counters[index]; } return count; } diff --git a/src/plugins/lb/refcount.h b/src/plugins/lb/refcount.h index 8c26e7be..dcfcb3fe 100644 --- a/src/plugins/lb/refcount.h +++ b/src/plugins/lb/refcount.h @@ -45,9 +45,9 @@ typedef struct { void __vlib_refcount_resize(vlib_refcount_per_cpu_t *per_cpu, u32 size); static_always_inline -void vlib_refcount_add(vlib_refcount_t *r, u32 cpu_index, u32 counter_index, i32 v) +void vlib_refcount_add(vlib_refcount_t *r, u32 thread_index, u32 counter_index, i32 v) { - vlib_refcount_per_cpu_t *per_cpu = &r->per_cpu[cpu_index]; + vlib_refcount_per_cpu_t *per_cpu = &r->per_cpu[thread_index]; if (PREDICT_FALSE(counter_index >= per_cpu->length)) __vlib_refcount_resize(per_cpu, clib_max(counter_index + 16, per_cpu->length * 2)); diff --git a/src/plugins/memif/node.c b/src/plugins/memif/node.c index 659d5dfb..cee1f3d1 100644 --- a/src/plugins/memif/node.c +++ b/src/plugins/memif/node.c @@ -94,7 +94,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, u32 n_rx_bytes = 0; u32 *to_next = 0; u32 n_free_bufs; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 bi0, bi1; vlib_buffer_t *b0, *b1; u16 ring_size = 1 << mif->log2_ring_size; @@ -105,14 +105,15 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (mif->per_interface_next_index != ~0) next_index = mif->per_interface_next_index; - n_free_bufs = vec_len (nm->rx_buffers[cpu_index]); + n_free_bufs = vec_len (nm->rx_buffers[thread_index]); if (PREDICT_FALSE (n_free_bufs < ring_size)) { - vec_validate (nm->rx_buffers[cpu_index], ring_size + n_free_bufs - 1); + vec_validate (nm->rx_buffers[thread_index], + ring_size + n_free_bufs - 1); n_free_bufs += - vlib_buffer_alloc (vm, &nm->rx_buffers[cpu_index][n_free_bufs], + vlib_buffer_alloc (vm, &nm->rx_buffers[thread_index][n_free_bufs], ring_size); - _vec_len (nm->rx_buffers[cpu_index]) = n_free_bufs; + _vec_len (nm->rx_buffers[thread_index]) = n_free_bufs; } head = ring->head; @@ -158,15 +159,15 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, CLIB_CACHE_LINE_BYTES, LOAD); } /* get empty buffer */ - u32 last_buf = vec_len (nm->rx_buffers[cpu_index]) - 1; - bi0 = nm->rx_buffers[cpu_index][last_buf]; - bi1 = nm->rx_buffers[cpu_index][last_buf - 1]; - _vec_len (nm->rx_buffers[cpu_index]) -= 2; + u32 last_buf = vec_len (nm->rx_buffers[thread_index]) - 1; + bi0 = nm->rx_buffers[thread_index][last_buf]; + bi1 = nm->rx_buffers[thread_index][last_buf - 1]; + _vec_len (nm->rx_buffers[thread_index]) -= 2; if (last_buf > 4) { - memif_prefetch (vm, nm->rx_buffers[cpu_index][last_buf - 2]); - memif_prefetch (vm, nm->rx_buffers[cpu_index][last_buf - 3]); + memif_prefetch (vm, nm->rx_buffers[thread_index][last_buf - 2]); + memif_prefetch (vm, nm->rx_buffers[thread_index][last_buf - 3]); } /* enqueue buffer */ @@ -256,9 +257,9 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, while (num_slots && n_left_to_next) { /* get empty buffer */ - u32 last_buf = vec_len (nm->rx_buffers[cpu_index]) - 1; - bi0 = nm->rx_buffers[cpu_index][last_buf]; - _vec_len (nm->rx_buffers[cpu_index]) = last_buf; + u32 last_buf = vec_len (nm->rx_buffers[thread_index]) - 1; + bi0 = nm->rx_buffers[thread_index][last_buf]; + _vec_len (nm->rx_buffers[thread_index]) = last_buf; /* enqueue buffer */ to_next[0] = bi0; @@ -315,7 +316,7 @@ memif_device_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, ring->tail = head; vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters - + VNET_INTERFACE_COUNTER_RX, cpu_index, + + VNET_INTERFACE_COUNTER_RX, thread_index, mif->hw_if_index, n_rx_packets, n_rx_bytes); @@ -327,7 +328,7 @@ memif_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { u32 n_rx_packets = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); memif_main_t *nm = &memif_main; memif_if_t *mif; @@ -337,7 +338,7 @@ memif_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, if (mif->flags & MEMIF_IF_FLAG_ADMIN_UP && mif->flags & MEMIF_IF_FLAG_CONNECTED && (mif->if_index % nm->input_cpu_count) == - (cpu_index - nm->input_cpu_first_index)) + (thread_index - nm->input_cpu_first_index)) { if (mif->flags & MEMIF_IF_FLAG_IS_SLAVE) n_rx_packets += diff --git a/src/plugins/snat/in2out.c b/src/plugins/snat/in2out.c index b4961365..e5ee965f 100644 --- a/src/plugins/snat/in2out.c +++ b/src/plugins/snat/in2out.c @@ -212,7 +212,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, snat_session_t ** sessionp, vlib_node_runtime_t * node, u32 next0, - u32 cpu_index) + u32 thread_index) { snat_user_t *u; snat_user_key_t user_key; @@ -246,27 +246,27 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, if (clib_bihash_search_8_8 (&sm->user_hash, &kv0, &value0)) { /* no, make a new one */ - pool_get (sm->per_thread_data[cpu_index].users, u); + pool_get (sm->per_thread_data[thread_index].users, u); memset (u, 0, sizeof (*u)); u->addr = ip0->src_address; u->fib_index = rx_fib_index0; - pool_get (sm->per_thread_data[cpu_index].list_pool, per_user_list_head_elt); + pool_get (sm->per_thread_data[thread_index].list_pool, per_user_list_head_elt); u->sessions_per_user_list_head_index = per_user_list_head_elt - - sm->per_thread_data[cpu_index].list_pool; + sm->per_thread_data[thread_index].list_pool; - clib_dlist_init (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, u->sessions_per_user_list_head_index); - kv0.value = u - sm->per_thread_data[cpu_index].users; + kv0.value = u - sm->per_thread_data[thread_index].users; /* add user */ clib_bihash_add_del_8_8 (&sm->user_hash, &kv0, 1 /* is_add */); } else { - u = pool_elt_at_index (sm->per_thread_data[cpu_index].users, + u = pool_elt_at_index (sm->per_thread_data[thread_index].users, value0.value); } @@ -276,25 +276,25 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, /* Remove the oldest dynamic translation */ do { oldest_per_user_translation_list_index = - clib_dlist_remove_head (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove_head (sm->per_thread_data[thread_index].list_pool, u->sessions_per_user_list_head_index); ASSERT (oldest_per_user_translation_list_index != ~0); /* add it back to the end of the LRU list */ - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, u->sessions_per_user_list_head_index, oldest_per_user_translation_list_index); /* Get the list element */ oldest_per_user_translation_list_elt = - pool_elt_at_index (sm->per_thread_data[cpu_index].list_pool, + pool_elt_at_index (sm->per_thread_data[thread_index].list_pool, oldest_per_user_translation_list_index); /* Get the session index from the list element */ session_index = oldest_per_user_translation_list_elt->value; /* Get the session */ - s = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, session_index); } while (snat_is_session_static (s)); @@ -346,7 +346,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, } /* Create a new session */ - pool_get (sm->per_thread_data[cpu_index].sessions, s); + pool_get (sm->per_thread_data[thread_index].sessions, s); memset (s, 0, sizeof (*s)); s->outside_address_index = address_index; @@ -362,22 +362,22 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, } /* Create list elts */ - pool_get (sm->per_thread_data[cpu_index].list_pool, + pool_get (sm->per_thread_data[thread_index].list_pool, per_user_translation_list_elt); - clib_dlist_init (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool); + sm->per_thread_data[thread_index].list_pool); per_user_translation_list_elt->value = - s - sm->per_thread_data[cpu_index].sessions; + s - sm->per_thread_data[thread_index].sessions; s->per_user_index = per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool; + sm->per_thread_data[thread_index].list_pool; s->per_user_list_head_index = u->sessions_per_user_list_head_index; - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s->per_user_list_head_index, per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool); + sm->per_thread_data[thread_index].list_pool); } s->in2out = *key0; @@ -388,12 +388,12 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, /* Add to translation hashes */ kv0.key = s->in2out.as_u64; - kv0.value = s - sm->per_thread_data[cpu_index].sessions; + kv0.value = s - sm->per_thread_data[thread_index].sessions; if (clib_bihash_add_del_8_8 (&sm->in2out, &kv0, 1 /* is_add */)) clib_warning ("in2out key add failed"); kv0.key = s->out2in.as_u64; - kv0.value = s - sm->per_thread_data[cpu_index].sessions; + kv0.value = s - sm->per_thread_data[thread_index].sessions; if (clib_bihash_add_del_8_8 (&sm->out2in, &kv0, 1 /* is_add */)) clib_warning ("out2in key add failed"); @@ -403,7 +403,7 @@ static u32 slow_path (snat_main_t *sm, vlib_buffer_t *b0, worker_by_out_key.port = s->out2in.port; worker_by_out_key.fib_index = s->out2in.fib_index; kv0.key = worker_by_out_key.as_u64; - kv0.value = cpu_index; + kv0.value = thread_index; clib_bihash_add_del_8_8 (&sm->worker_by_out, &kv0, 1); /* log NAT event */ @@ -465,7 +465,7 @@ snat_in2out_error_t icmp_get_key(icmp46_header_t *icmp0, * * @param[in,out] sm SNAT main * @param[in,out] node SNAT node runtime - * @param[in] cpu_index CPU index + * @param[in] thread_index thread index * @param[in,out] b0 buffer containing packet to be translated * @param[out] p_key address and port before NAT translation * @param[out] p_value address and port after NAT translation @@ -473,7 +473,7 @@ snat_in2out_error_t icmp_get_key(icmp46_header_t *icmp0, * @param d optional parameter */ u32 icmp_match_in2out_slow(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d) @@ -524,13 +524,13 @@ u32 icmp_match_in2out_slow(snat_main_t *sm, vlib_node_runtime_t *node, } next0 = slow_path (sm, b0, ip0, rx_fib_index0, &key0, - &s0, node, next0, cpu_index); + &s0, node, next0, thread_index); if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP)) goto out; } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); out: @@ -548,7 +548,7 @@ out: * * @param[in] sm SNAT main * @param[in,out] node SNAT node runtime - * @param[in] cpu_index CPU index + * @param[in] thread_index thread index * @param[in,out] b0 buffer containing packet to be translated * @param[out] p_key address and port before NAT translation * @param[out] p_value address and port after NAT translation @@ -556,7 +556,7 @@ out: * @param d optional parameter */ u32 icmp_match_in2out_fast(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d) @@ -624,7 +624,7 @@ static inline u32 icmp_in2out (snat_main_t *sm, u32 rx_fib_index0, vlib_node_runtime_t * node, u32 next0, - u32 cpu_index, + u32 thread_index, void *d) { snat_session_key_t key0, sm0; @@ -641,7 +641,7 @@ static inline u32 icmp_in2out (snat_main_t *sm, echo0 = (icmp_echo_header_t *)(icmp0+1); - next0_tmp = sm->icmp_match_in2out_cb(sm, node, cpu_index, b0, + next0_tmp = sm->icmp_match_in2out_cb(sm, node, thread_index, b0, &key0, &sm0, &dont_translate, d); if (next0_tmp != ~0) next0 = next0_tmp; @@ -847,11 +847,11 @@ static inline u32 icmp_in2out_slow_path (snat_main_t *sm, vlib_node_runtime_t * node, u32 next0, f64 now, - u32 cpu_index, + u32 thread_index, snat_session_t ** p_s0) { next0 = icmp_in2out(sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, cpu_index, p_s0); + next0, thread_index, p_s0); snat_session_t * s0 = *p_s0; if (PREDICT_TRUE(next0 != SNAT_IN2OUT_NEXT_DROP && s0)) { @@ -862,9 +862,9 @@ static inline u32 icmp_in2out_slow_path (snat_main_t *sm, /* Per-user LRU list maintenance for dynamic translations */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -884,7 +884,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, snat_runtime_t * rt = (snat_runtime_t *)node->runtime_data; f64 now = vlib_time_now (vm); u32 stats_node_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); stats_node_index = is_slow_path ? snat_in2out_slowpath_node.index : snat_in2out_node.index; @@ -977,7 +977,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, { next0 = icmp_in2out_slow_path (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, - node, next0, now, cpu_index, &s0); + node, next0, now, thread_index, &s0); goto trace00; } } @@ -1006,7 +1006,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, goto trace00; next0 = slow_path (sm, b0, ip0, rx_fib_index0, &key0, - &s0, node, next0, cpu_index); + &s0, node, next0, thread_index); if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP)) goto trace00; } @@ -1017,7 +1017,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); old_addr0 = ip0->src_address.as_u32; @@ -1063,9 +1063,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -1081,7 +1081,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, t->next_index = next0; t->session_index = ~0; if (s0) - t->session_index = s0 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP; @@ -1117,7 +1117,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, { next1 = icmp_in2out_slow_path (sm, b1, ip1, icmp1, sw_if_index1, rx_fib_index1, node, - next1, now, cpu_index, &s1); + next1, now, thread_index, &s1); goto trace01; } } @@ -1146,7 +1146,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, goto trace01; next1 = slow_path (sm, b1, ip1, rx_fib_index1, &key1, - &s1, node, next1, cpu_index); + &s1, node, next1, thread_index); if (PREDICT_FALSE (next1 == SNAT_IN2OUT_NEXT_DROP)) goto trace01; } @@ -1157,7 +1157,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, } } else - s1 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s1 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value1.value); old_addr1 = ip1->src_address.as_u32; @@ -1203,9 +1203,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s1)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s1->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s1->per_user_list_head_index, s1->per_user_index); } @@ -1220,7 +1220,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, t->next_index = next1; t->session_index = ~0; if (s1) - t->session_index = s1 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s1 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next1 != SNAT_IN2OUT_NEXT_DROP; @@ -1292,7 +1292,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, { next0 = icmp_in2out_slow_path (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, now, cpu_index, &s0); + next0, now, thread_index, &s0); goto trace0; } } @@ -1321,7 +1321,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, goto trace0; next0 = slow_path (sm, b0, ip0, rx_fib_index0, &key0, - &s0, node, next0, cpu_index); + &s0, node, next0, thread_index); if (PREDICT_FALSE (next0 == SNAT_IN2OUT_NEXT_DROP)) goto trace0; @@ -1333,7 +1333,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); old_addr0 = ip0->src_address.as_u32; @@ -1379,9 +1379,9 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -1397,7 +1397,7 @@ snat_in2out_node_fn_inline (vlib_main_t * vm, t->next_index = next0; t->session_index = ~0; if (s0) - t->session_index = s0 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next0 != SNAT_IN2OUT_NEXT_DROP; @@ -2010,7 +2010,7 @@ snat_in2out_worker_handoff_fn (vlib_main_t * vm, u32 n_left_to_next_worker = 0, *to_next_worker = 0; u32 next_worker_index = 0; u32 current_worker_index = ~0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ASSERT (vec_len (sm->workers)); @@ -2048,7 +2048,7 @@ snat_in2out_worker_handoff_fn (vlib_main_t * vm, next_worker_index = sm->worker_in2out_cb(ip0, rx_fib_index0); - if (PREDICT_FALSE (next_worker_index != cpu_index)) + if (PREDICT_FALSE (next_worker_index != thread_index)) { do_handoff = 1; diff --git a/src/plugins/snat/out2in.c b/src/plugins/snat/out2in.c index 656e42db..5d308d78 100644 --- a/src/plugins/snat/out2in.c +++ b/src/plugins/snat/out2in.c @@ -129,7 +129,7 @@ create_session_for_static_mapping (snat_main_t *sm, snat_session_key_t in2out, snat_session_key_t out2in, vlib_node_runtime_t * node, - u32 cpu_index) + u32 thread_index) { snat_user_t *u; snat_user_key_t user_key; @@ -146,36 +146,36 @@ create_session_for_static_mapping (snat_main_t *sm, if (clib_bihash_search_8_8 (&sm->user_hash, &kv0, &value0)) { /* no, make a new one */ - pool_get (sm->per_thread_data[cpu_index].users, u); + pool_get (sm->per_thread_data[thread_index].users, u); memset (u, 0, sizeof (*u)); u->addr = in2out.addr; u->fib_index = in2out.fib_index; - pool_get (sm->per_thread_data[cpu_index].list_pool, + pool_get (sm->per_thread_data[thread_index].list_pool, per_user_list_head_elt); u->sessions_per_user_list_head_index = per_user_list_head_elt - - sm->per_thread_data[cpu_index].list_pool; + sm->per_thread_data[thread_index].list_pool; - clib_dlist_init (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, u->sessions_per_user_list_head_index); - kv0.value = u - sm->per_thread_data[cpu_index].users; + kv0.value = u - sm->per_thread_data[thread_index].users; /* add user */ clib_bihash_add_del_8_8 (&sm->user_hash, &kv0, 1 /* is_add */); /* add non-traslated packets worker lookup */ - kv0.value = cpu_index; + kv0.value = thread_index; clib_bihash_add_del_8_8 (&sm->worker_by_in, &kv0, 1); } else { - u = pool_elt_at_index (sm->per_thread_data[cpu_index].users, + u = pool_elt_at_index (sm->per_thread_data[thread_index].users, value0.value); } - pool_get (sm->per_thread_data[cpu_index].sessions, s); + pool_get (sm->per_thread_data[thread_index].sessions, s); memset (s, 0, sizeof (*s)); s->outside_address_index = ~0; @@ -183,22 +183,22 @@ create_session_for_static_mapping (snat_main_t *sm, u->nstaticsessions++; /* Create list elts */ - pool_get (sm->per_thread_data[cpu_index].list_pool, + pool_get (sm->per_thread_data[thread_index].list_pool, per_user_translation_list_elt); - clib_dlist_init (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_init (sm->per_thread_data[thread_index].list_pool, per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool); + sm->per_thread_data[thread_index].list_pool); per_user_translation_list_elt->value = - s - sm->per_thread_data[cpu_index].sessions; + s - sm->per_thread_data[thread_index].sessions; s->per_user_index = - per_user_translation_list_elt - sm->per_thread_data[cpu_index].list_pool; + per_user_translation_list_elt - sm->per_thread_data[thread_index].list_pool; s->per_user_list_head_index = u->sessions_per_user_list_head_index; - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s->per_user_list_head_index, per_user_translation_list_elt - - sm->per_thread_data[cpu_index].list_pool); + sm->per_thread_data[thread_index].list_pool); s->in2out = in2out; s->out2in = out2in; @@ -206,12 +206,12 @@ create_session_for_static_mapping (snat_main_t *sm, /* Add to translation hashes */ kv0.key = s->in2out.as_u64; - kv0.value = s - sm->per_thread_data[cpu_index].sessions; + kv0.value = s - sm->per_thread_data[thread_index].sessions; if (clib_bihash_add_del_8_8 (&sm->in2out, &kv0, 1 /* is_add */)) clib_warning ("in2out key add failed"); kv0.key = s->out2in.as_u64; - kv0.value = s - sm->per_thread_data[cpu_index].sessions; + kv0.value = s - sm->per_thread_data[thread_index].sessions; if (clib_bihash_add_del_8_8 (&sm->out2in, &kv0, 1 /* is_add */)) clib_warning ("out2in key add failed"); @@ -298,7 +298,7 @@ is_interface_addr(snat_main_t *sm, vlib_node_runtime_t *node, u32 sw_if_index0, * * @param[in,out] sm SNAT main * @param[in,out] node SNAT node runtime - * @param[in] cpu_index CPU index + * @param[in] thread_index thread index * @param[in,out] b0 buffer containing packet to be translated * @param[out] p_key address and port before NAT translation * @param[out] p_value address and port after NAT translation @@ -306,7 +306,7 @@ is_interface_addr(snat_main_t *sm, vlib_node_runtime_t *node, u32 sw_if_index0, * @param d optional parameter */ u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d) @@ -366,7 +366,7 @@ u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, /* Create session initiated by host from external network */ s0 = create_session_for_static_mapping(sm, b0, sm0, key0, - node, cpu_index); + node, thread_index); if (!s0) { @@ -375,7 +375,7 @@ u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); out: @@ -393,7 +393,7 @@ out: * * @param[in] sm SNAT main * @param[in,out] node SNAT node runtime - * @param[in] cpu_index CPU index + * @param[in] thread_index thread index * @param[in,out] b0 buffer containing packet to be translated * @param[out] p_key address and port before NAT translation * @param[out] p_value address and port after NAT translation @@ -401,7 +401,7 @@ out: * @param d optional parameter */ u32 icmp_match_out2in_fast(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d) @@ -460,7 +460,7 @@ static inline u32 icmp_out2in (snat_main_t *sm, u32 rx_fib_index0, vlib_node_runtime_t * node, u32 next0, - u32 cpu_index, + u32 thread_index, void *d) { snat_session_key_t key0, sm0; @@ -477,7 +477,7 @@ static inline u32 icmp_out2in (snat_main_t *sm, echo0 = (icmp_echo_header_t *)(icmp0+1); - next0_tmp = sm->icmp_match_out2in_cb(sm, node, cpu_index, b0, + next0_tmp = sm->icmp_match_out2in_cb(sm, node, thread_index, b0, &key0, &sm0, &dont_translate, d); if (next0_tmp != ~0) next0 = next0_tmp; @@ -589,11 +589,11 @@ static inline u32 icmp_out2in_slow_path (snat_main_t *sm, u32 rx_fib_index0, vlib_node_runtime_t * node, u32 next0, f64 now, - u32 cpu_index, + u32 thread_index, snat_session_t ** p_s0) { next0 = icmp_out2in(sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, cpu_index, p_s0); + next0, thread_index, p_s0); snat_session_t * s0 = *p_s0; if (PREDICT_TRUE(next0 != SNAT_OUT2IN_NEXT_DROP && s0)) { @@ -604,9 +604,9 @@ static inline u32 icmp_out2in_slow_path (snat_main_t *sm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -624,7 +624,7 @@ snat_out2in_node_fn (vlib_main_t * vm, u32 pkts_processed = 0; snat_main_t * sm = &snat_main; f64 now = vlib_time_now (vm); - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -712,7 +712,7 @@ snat_out2in_node_fn (vlib_main_t * vm, { next0 = icmp_out2in_slow_path (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, now, cpu_index, &s0); + next0, now, thread_index, &s0); goto trace0; } @@ -743,7 +743,7 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Create session initiated by host from external network */ s0 = create_session_for_static_mapping(sm, b0, sm0, key0, node, - cpu_index); + thread_index); if (!s0) { b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; @@ -752,7 +752,7 @@ snat_out2in_node_fn (vlib_main_t * vm, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); old_addr0 = ip0->dst_address.as_u32; @@ -796,9 +796,9 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -813,7 +813,7 @@ snat_out2in_node_fn (vlib_main_t * vm, t->next_index = next0; t->session_index = ~0; if (s0) - t->session_index = s0 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next0 != SNAT_OUT2IN_NEXT_DROP; @@ -847,7 +847,7 @@ snat_out2in_node_fn (vlib_main_t * vm, { next1 = icmp_out2in_slow_path (sm, b1, ip1, icmp1, sw_if_index1, rx_fib_index1, node, - next1, now, cpu_index, &s1); + next1, now, thread_index, &s1); goto trace1; } @@ -878,7 +878,7 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Create session initiated by host from external network */ s1 = create_session_for_static_mapping(sm, b1, sm1, key1, node, - cpu_index); + thread_index); if (!s1) { b1->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; @@ -887,7 +887,7 @@ snat_out2in_node_fn (vlib_main_t * vm, } } else - s1 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s1 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value1.value); old_addr1 = ip1->dst_address.as_u32; @@ -931,9 +931,9 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s1)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s1->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s1->per_user_list_head_index, s1->per_user_index); } @@ -948,7 +948,7 @@ snat_out2in_node_fn (vlib_main_t * vm, t->next_index = next1; t->session_index = ~0; if (s1) - t->session_index = s1 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s1 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next1 != SNAT_OUT2IN_NEXT_DROP; @@ -1016,7 +1016,7 @@ snat_out2in_node_fn (vlib_main_t * vm, { next0 = icmp_out2in_slow_path (sm, b0, ip0, icmp0, sw_if_index0, rx_fib_index0, node, - next0, now, cpu_index, &s0); + next0, now, thread_index, &s0); goto trace00; } @@ -1048,7 +1048,7 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Create session initiated by host from external network */ s0 = create_session_for_static_mapping(sm, b0, sm0, key0, node, - cpu_index); + thread_index); if (!s0) { b0->error = node->errors[SNAT_OUT2IN_ERROR_NO_TRANSLATION]; @@ -1057,7 +1057,7 @@ snat_out2in_node_fn (vlib_main_t * vm, } } else - s0 = pool_elt_at_index (sm->per_thread_data[cpu_index].sessions, + s0 = pool_elt_at_index (sm->per_thread_data[thread_index].sessions, value0.value); old_addr0 = ip0->dst_address.as_u32; @@ -1101,9 +1101,9 @@ snat_out2in_node_fn (vlib_main_t * vm, /* Per-user LRU list maintenance for dynamic translation */ if (!snat_is_session_static (s0)) { - clib_dlist_remove (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_remove (sm->per_thread_data[thread_index].list_pool, s0->per_user_index); - clib_dlist_addtail (sm->per_thread_data[cpu_index].list_pool, + clib_dlist_addtail (sm->per_thread_data[thread_index].list_pool, s0->per_user_list_head_index, s0->per_user_index); } @@ -1118,7 +1118,7 @@ snat_out2in_node_fn (vlib_main_t * vm, t->next_index = next0; t->session_index = ~0; if (s0) - t->session_index = s0 - sm->per_thread_data[cpu_index].sessions; + t->session_index = s0 - sm->per_thread_data[thread_index].sessions; } pkts_processed += next0 != SNAT_OUT2IN_NEXT_DROP; @@ -1599,7 +1599,7 @@ snat_out2in_worker_handoff_fn (vlib_main_t * vm, u32 n_left_to_next_worker = 0, *to_next_worker = 0; u32 next_worker_index = 0; u32 current_worker_index = ~0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ASSERT (vec_len (sm->workers)); @@ -1637,7 +1637,7 @@ snat_out2in_worker_handoff_fn (vlib_main_t * vm, next_worker_index = sm->worker_out2in_cb(ip0, rx_fib_index0); - if (PREDICT_FALSE (next_worker_index != cpu_index)) + if (PREDICT_FALSE (next_worker_index != thread_index)) { do_handoff = 1; diff --git a/src/plugins/snat/snat.h b/src/plugins/snat/snat.h index 017825c0..f4e1c5c0 100644 --- a/src/plugins/snat/snat.h +++ b/src/plugins/snat/snat.h @@ -221,7 +221,7 @@ struct snat_main_s; typedef u32 snat_icmp_match_function_t (struct snat_main_s *sm, vlib_node_runtime_t *node, - u32 cpu_index, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, @@ -402,22 +402,22 @@ typedef struct { } tcp_udp_header_t; u32 icmp_match_in2out_fast(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d); u32 icmp_match_in2out_slow(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d); u32 icmp_match_out2in_fast(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d); u32 icmp_match_out2in_slow(snat_main_t *sm, vlib_node_runtime_t *node, - u32 cpu_index, vlib_buffer_t *b0, + u32 thread_index, vlib_buffer_t *b0, snat_session_key_t *p_key, snat_session_key_t *p_value, u8 *p_dont_translate, void *d); diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index a517a597..be3b41ef 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -299,7 +299,7 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm, if (CLIB_DEBUG == 0) return; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); /* smp disaster check */ if (vec_len (vlib_mains) > 1) @@ -355,7 +355,7 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, vlib_buffer_free_list_t *f; int i; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); if (!is_default && pool_elts (bm->buffer_free_list_pool) == 0) { @@ -474,7 +474,7 @@ vlib_buffer_delete_free_list_internal (vlib_main_t * vm, u32 free_list_index) u32 merge_index; int i; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); f = vlib_buffer_get_free_list (vm, free_list_index); diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 394c336a..328660a3 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -209,7 +209,7 @@ always_inline vlib_buffer_known_state_t vlib_buffer_is_known (vlib_main_t * vm, u32 buffer_index) { vlib_buffer_main_t *bm = vm->buffer_main; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); uword *p = hash_get (bm->buffer_known_hash, buffer_index); return p ? p[0] : VLIB_BUFFER_UNKNOWN; @@ -221,7 +221,7 @@ vlib_buffer_set_known_state (vlib_main_t * vm, vlib_buffer_known_state_t state) { vlib_buffer_main_t *bm = vm->buffer_main; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); hash_set (bm->buffer_known_hash, buffer_index, state); } diff --git a/src/vlib/cli.c b/src/vlib/cli.c index f853f655..3cc95076 100644 --- a/src/vlib/cli.c +++ b/src/vlib/cli.c @@ -709,7 +709,7 @@ test_heap_validate (vlib_main_t * vm, unformat_input_t * input, { /* *INDENT-OFF* */ foreach_vlib_main({ - heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + heap = clib_per_cpu_mheaps[this_vlib_main->thread_index]; mheap = mheap_header(heap); mheap->flags |= MHEAP_FLAG_VALIDATE; // Turn off small object cache because it delays detection of errors @@ -722,7 +722,7 @@ test_heap_validate (vlib_main_t * vm, unformat_input_t * input, { /* *INDENT-OFF* */ foreach_vlib_main({ - heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + heap = clib_per_cpu_mheaps[this_vlib_main->thread_index]; mheap = mheap_header(heap); mheap->flags &= ~MHEAP_FLAG_VALIDATE; mheap->flags |= MHEAP_FLAG_SMALL_OBJECT_CACHE; @@ -733,7 +733,7 @@ test_heap_validate (vlib_main_t * vm, unformat_input_t * input, { /* *INDENT-OFF* */ foreach_vlib_main({ - heap = clib_per_cpu_mheaps[this_vlib_main->cpu_index]; + heap = clib_per_cpu_mheaps[this_vlib_main->thread_index]; mheap = mheap_header(heap); mheap_validate(heap); }); diff --git a/src/vlib/counter.h b/src/vlib/counter.h index 17a85217..60e2055d 100644 --- a/src/vlib/counter.h +++ b/src/vlib/counter.h @@ -70,17 +70,17 @@ u32 vlib_simple_counter_n_counters (const vlib_simple_counter_main_t * cm); /** Increment a simple counter @param cm - (vlib_simple_counter_main_t *) simple counter main pointer - @param cpu_index - (u32) the current cpu index + @param thread_index - (u32) the current cpu index @param index - (u32) index of the counter to increment @param increment - (u64) quantitiy to add to the counter */ always_inline void vlib_increment_simple_counter (vlib_simple_counter_main_t * cm, - u32 cpu_index, u32 index, u64 increment) + u32 thread_index, u32 index, u64 increment) { counter_t *my_counters; - my_counters = cm->counters[cpu_index]; + my_counters = cm->counters[thread_index]; my_counters[index] += increment; } @@ -201,7 +201,7 @@ void vlib_clear_combined_counters (vlib_combined_counter_main_t * cm); /** Increment a combined counter @param cm - (vlib_combined_counter_main_t *) comined counter main pointer - @param cpu_index - (u32) the current cpu index + @param thread_index - (u32) the current cpu index @param index - (u32) index of the counter to increment @param packet_increment - (u64) number of packets to add to the counter @param byte_increment - (u64) number of bytes to add to the counter @@ -209,13 +209,13 @@ void vlib_clear_combined_counters (vlib_combined_counter_main_t * cm); always_inline void vlib_increment_combined_counter (vlib_combined_counter_main_t * cm, - u32 cpu_index, + u32 thread_index, u32 index, u64 n_packets, u64 n_bytes) { vlib_counter_t *my_counters; /* Use this CPU's counter array */ - my_counters = cm->counters[cpu_index]; + my_counters = cm->counters[thread_index]; my_counters[index].packets += n_packets; my_counters[index].bytes += n_bytes; @@ -224,14 +224,14 @@ vlib_increment_combined_counter (vlib_combined_counter_main_t * cm, /** Pre-fetch a per-thread combined counter for the given object index */ always_inline void vlib_prefetch_combined_counter (const vlib_combined_counter_main_t * cm, - u32 cpu_index, u32 index) + u32 thread_index, u32 index) { vlib_counter_t *cpu_counters; /* * This CPU's index is assumed to already be in cache */ - cpu_counters = cm->counters[cpu_index]; + cpu_counters = cm->counters[thread_index]; CLIB_PREFETCH (cpu_counters + index, CLIB_CACHE_LINE_BYTES, STORE); } diff --git a/src/vlib/error.c b/src/vlib/error.c index a2c23176..e4ed4ee3 100644 --- a/src/vlib/error.c +++ b/src/vlib/error.c @@ -149,7 +149,7 @@ vlib_register_errors (vlib_main_t * vm, vlib_node_t *n = vlib_get_node (vm, node_index); uword l; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); /* Free up any previous error strings. */ if (n->n_errors > 0) diff --git a/src/vlib/global_funcs.h b/src/vlib/global_funcs.h index f51ec381..9dd01fbf 100644 --- a/src/vlib/global_funcs.h +++ b/src/vlib/global_funcs.h @@ -23,7 +23,7 @@ always_inline vlib_main_t * vlib_get_main (void) { vlib_main_t *vm; - vm = vlib_mains[os_get_cpu_number ()]; + vm = vlib_mains[vlib_get_thread_index ()]; ASSERT (vm); return vm; } diff --git a/src/vlib/main.c b/src/vlib/main.c index b22203f0..422d3e26 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -136,18 +136,18 @@ vlib_frame_alloc_to_node (vlib_main_t * vm, u32 to_node_index, else { f = clib_mem_alloc_aligned_no_fail (n, VLIB_FRAME_ALIGN); - f->cpu_index = vm->cpu_index; + f->thread_index = vm->thread_index; fi = vlib_frame_index_no_check (vm, f); } /* Poison frame when debugging. */ if (CLIB_DEBUG > 0) { - u32 save_cpu_index = f->cpu_index; + u32 save_thread_index = f->thread_index; memset (f, 0xfe, n); - f->cpu_index = save_cpu_index; + f->thread_index = save_thread_index; } /* Insert magic number. */ @@ -517,7 +517,7 @@ vlib_put_next_frame (vlib_main_t * vm, * a dangling frame reference. Each thread has its own copy of * the next_frames vector. */ - if (0 && r->cpu_index != next_runtime->cpu_index) + if (0 && r->thread_index != next_runtime->thread_index) { nf->frame_index = ~0; nf->flags &= ~(VLIB_FRAME_PENDING | VLIB_FRAME_IS_ALLOCATED); @@ -866,7 +866,7 @@ vlib_elog_main_loop_event (vlib_main_t * vm, : evm->node_call_elog_event_types, node_index), /* track */ - (vm->cpu_index ? &vlib_worker_threads[vm->cpu_index]. + (vm->thread_index ? &vlib_worker_threads[vm->thread_index]. elog_track : &em->default_track), /* data to log */ n_vectors); } @@ -963,7 +963,7 @@ dispatch_node (vlib_main_t * vm, vm->cpu_time_last_node_dispatch = last_time_stamp; - if (1 /* || vm->cpu_index == node->cpu_index */ ) + if (1 /* || vm->thread_index == node->thread_index */ ) { vlib_main_t *stat_vm; @@ -1029,7 +1029,7 @@ dispatch_node (vlib_main_t * vm, { u32 node_name, vector_length, is_polling; } *ed; - vlib_worker_thread_t *w = vlib_worker_threads + vm->cpu_index; + vlib_worker_thread_t *w = vlib_worker_threads + vm->thread_index; #endif if ((dispatch_state == VLIB_NODE_STATE_INTERRUPT diff --git a/src/vlib/main.h b/src/vlib/main.h index 0197b4f3..329bf073 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -156,7 +156,7 @@ typedef struct vlib_main_t uword *init_functions_called; /* to compare with node runtime */ - u32 cpu_index; + u32 thread_index; void **mbuf_alloc_list; diff --git a/src/vlib/node.c b/src/vlib/node.c index dc0a4de5..bbd3a42e 100644 --- a/src/vlib/node.c +++ b/src/vlib/node.c @@ -99,7 +99,7 @@ vlib_node_runtime_update (vlib_main_t * vm, u32 node_index, u32 next_index) vlib_pending_frame_t *pf; i32 i, j, n_insert; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); vlib_worker_thread_barrier_sync (vm); diff --git a/src/vlib/node.h b/src/vlib/node.h index fc7e7da2..1e2f4c38 100644 --- a/src/vlib/node.h +++ b/src/vlib/node.h @@ -344,8 +344,8 @@ typedef struct vlib_frame_t /* Number of vector elements currently in frame. */ u16 n_vectors; - /* Owner cpuid / heap id */ - u16 cpu_index; + /* Owner thread / heap id */ + u16 thread_index; /* Scalar and vector arguments to next node. */ u8 arguments[0]; @@ -459,7 +459,7 @@ typedef struct vlib_node_runtime_t zero before first run of this node. */ - u16 cpu_index; /**< CPU this node runs on */ + u16 thread_index; /**< thread this node runs on */ u8 runtime_data[0]; /**< Function dependent node-runtime data. This data is diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index 1f7d94e1..54e36874 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -201,9 +201,9 @@ always_inline vlib_frame_t * vlib_get_frame_no_check (vlib_main_t * vm, uword frame_index) { vlib_frame_t *f; - u32 cpu_index = frame_index & VLIB_CPU_MASK; + u32 thread_index = frame_index & VLIB_CPU_MASK; u32 offset = frame_index & VLIB_OFFSET_MASK; - vm = vlib_mains[cpu_index]; + vm = vlib_mains[thread_index]; f = vm->heap_base + offset; return f; } @@ -215,10 +215,10 @@ vlib_frame_index_no_check (vlib_main_t * vm, vlib_frame_t * f) ASSERT (((uword) f & VLIB_CPU_MASK) == 0); - vm = vlib_mains[f->cpu_index]; + vm = vlib_mains[f->thread_index]; i = ((u8 *) f - (u8 *) vm->heap_base); - return i | f->cpu_index; + return i | f->thread_index; } always_inline vlib_frame_t * diff --git a/src/vlib/threads.c b/src/vlib/threads.c index ef3a24d3..4a111f8d 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -35,27 +35,12 @@ vl (void *p) vlib_worker_thread_t *vlib_worker_threads; vlib_thread_main_t vlib_thread_main; +__thread uword vlib_thread_index = 0; + uword os_get_cpu_number (void) { - void *sp; - uword n; - u32 len; - - len = vec_len (vlib_thread_stacks); - if (len == 0) - return 0; - - /* Get any old stack address. */ - sp = &sp; - - n = ((uword) sp - (uword) vlib_thread_stacks[0]) - >> VLIB_LOG2_THREAD_STACK_SIZE; - - /* "processes" have their own stacks, and they always run in thread 0 */ - n = n >= len ? 0 : n; - - return n; + return vlib_thread_index; } uword @@ -275,21 +260,6 @@ vlib_thread_init (vlib_main_t * vm) return 0; } -vlib_worker_thread_t * -vlib_alloc_thread (vlib_main_t * vm) -{ - vlib_worker_thread_t *w; - - if (vec_len (vlib_worker_threads) >= vec_len (vlib_thread_stacks)) - { - clib_warning ("out of worker threads... Quitting..."); - exit (1); - } - vec_add2 (vlib_worker_threads, w, 1); - w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; - return w; -} - vlib_frame_queue_t * vlib_frame_queue_alloc (int nelts) { @@ -427,7 +397,7 @@ vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index, f64 b4 = vlib_time_now_ticks (vm, before); vlib_worker_thread_barrier_check (vm, b4); /* Bad idea. Dequeue -> enqueue -> dequeue -> trouble */ - // vlib_frame_queue_dequeue (vm->cpu_index, vm, nm); + // vlib_frame_queue_dequeue (vm->thread_index, vm, nm); } elt = fq->elts + (new_tail & (fq->nelts - 1)); @@ -497,6 +467,8 @@ vlib_worker_thread_bootstrap_fn (void *arg) w->lwp = syscall (SYS_gettid); w->thread_id = pthread_self (); + vlib_thread_index = w - vlib_worker_threads; + rv = (void *) clib_calljmp ((uword (*)(uword)) w->thread_function, (uword) arg, w->thread_stack + VLIB_THREAD_STACK_SIZE); @@ -610,7 +582,9 @@ start_workers (vlib_main_t * vm) mheap_alloc (0 /* use VM */ , tr->mheap_size); else w->thread_mheap = main_heap; - w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; + + w->thread_stack = + vlib_thread_stack_init (w - vlib_worker_threads); w->thread_function = tr->function; w->thread_function_arg = w; w->instance_id = k; @@ -630,7 +604,7 @@ start_workers (vlib_main_t * vm) vm_clone = clib_mem_alloc (sizeof (*vm_clone)); clib_memcpy (vm_clone, vlib_mains[0], sizeof (*vm_clone)); - vm_clone->cpu_index = worker_thread_index; + vm_clone->thread_index = worker_thread_index; vm_clone->heap_base = w->thread_mheap; vm_clone->mbuf_alloc_list = 0; vm_clone->init_functions_called = @@ -679,7 +653,7 @@ start_workers (vlib_main_t * vm) vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) { vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->cpu_index = vm_clone->cpu_index; + rt->thread_index = vm_clone->thread_index; /* copy initial runtime_data from node */ if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, n->runtime_data, @@ -692,7 +666,7 @@ start_workers (vlib_main_t * vm) vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) { vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->cpu_index = vm_clone->cpu_index; + rt->thread_index = vm_clone->thread_index; /* copy initial runtime_data from node */ if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, n->runtime_data, @@ -756,7 +730,8 @@ start_workers (vlib_main_t * vm) mheap_alloc (0 /* use VM */ , tr->mheap_size); else w->thread_mheap = main_heap; - w->thread_stack = vlib_thread_stacks[w - vlib_worker_threads]; + w->thread_stack = + vlib_thread_stack_init (w - vlib_worker_threads); w->thread_function = tr->function; w->thread_function_arg = w; w->instance_id = j; @@ -827,7 +802,7 @@ vlib_worker_thread_node_runtime_update (void) uword n_calls, uword n_vectors, uword n_clocks); - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); if (vec_len (vlib_mains) == 1) return; @@ -835,7 +810,7 @@ vlib_worker_thread_node_runtime_update (void) vm = vlib_mains[0]; nm = &vm->node_main; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); ASSERT (*vlib_worker_threads->wait_at_barrier == 1); /* @@ -955,7 +930,7 @@ vlib_worker_thread_node_runtime_update (void) vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]) { vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->cpu_index = vm_clone->cpu_index; + rt->thread_index = vm_clone->thread_index; /* copy runtime_data, will be overwritten later for existing rt */ if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, n->runtime_data, @@ -981,7 +956,7 @@ vlib_worker_thread_node_runtime_update (void) vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) { vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->cpu_index = vm_clone->cpu_index; + rt->thread_index = vm_clone->thread_index; /* copy runtime_data, will be overwritten later for existing rt */ if (n->runtime_data && n->runtime_data_bytes > 0) clib_memcpy (rt->runtime_data, n->runtime_data, @@ -1180,7 +1155,7 @@ vlib_worker_thread_fork_fixup (vlib_fork_fixup_t which) if (vlib_mains == 0) return; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); vlib_worker_thread_barrier_sync (vm); switch (which) @@ -1212,7 +1187,7 @@ vlib_worker_thread_barrier_sync (vlib_main_t * vm) vlib_worker_threads[0].barrier_sync_count++; - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT; @@ -1260,7 +1235,7 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm) int vlib_frame_queue_dequeue (vlib_main_t * vm, vlib_frame_queue_main_t * fqm) { - u32 thread_id = vm->cpu_index; + u32 thread_id = vm->thread_index; vlib_frame_queue_t *fq = fqm->vlib_frame_queues[thread_id]; vlib_frame_queue_elt_t *elt; u32 *from, *to; @@ -1393,7 +1368,7 @@ vlib_worker_thread_fn (void *arg) vlib_main_t *vm = vlib_get_main (); clib_error_t *e; - ASSERT (vm->cpu_index == os_get_cpu_number ()); + ASSERT (vm->thread_index == vlib_get_thread_index ()); vlib_worker_thread_init (w); clib_time_init (&vm->clib_time); diff --git a/src/vlib/threads.h b/src/vlib/threads.h index eca4fc26..101d3d4a 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -153,8 +153,6 @@ typedef struct /* Called early, in thread 0's context */ clib_error_t *vlib_thread_init (vlib_main_t * vm); -vlib_worker_thread_t *vlib_alloc_thread (vlib_main_t * vm); - int vlib_frame_queue_enqueue (vlib_main_t * vm, u32 node_runtime_index, u32 frame_queue_index, vlib_frame_t * frame, vlib_frame_queue_msg_type_t type); @@ -183,12 +181,19 @@ u32 vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts); void vlib_worker_thread_barrier_sync (vlib_main_t * vm); void vlib_worker_thread_barrier_release (vlib_main_t * vm); +extern __thread uword vlib_thread_index; +static_always_inline uword +vlib_get_thread_index (void) +{ + return vlib_thread_index; +} + always_inline void vlib_smp_unsafe_warning (void) { if (CLIB_DEBUG > 0) { - if (os_get_cpu_number ()) + if (vlib_get_thread_index ()) fformat (stderr, "%s: SMP unsafe warning...\n", __FUNCTION__); } } @@ -331,21 +336,21 @@ vlib_num_workers () } always_inline u32 -vlib_get_worker_cpu_index (u32 worker_index) +vlib_get_worker_thread_index (u32 worker_index) { return worker_index + 1; } always_inline u32 -vlib_get_worker_index (u32 cpu_index) +vlib_get_worker_index (u32 thread_index) { - return cpu_index - 1; + return thread_index - 1; } always_inline u32 vlib_get_current_worker_index () { - return os_get_cpu_number () - 1; + return vlib_get_thread_index () - 1; } static inline void @@ -467,6 +472,8 @@ vlib_get_worker_handoff_queue_elt (u32 frame_queue_index, return elt; } +u8 *vlib_thread_stack_init (uword thread_index); + int vlib_thread_cb_register (struct vlib_main_t *vm, vlib_thread_callbacks_t * cb); diff --git a/src/vlib/unix/cj.c b/src/vlib/unix/cj.c index 33ba163a..7c1e9475 100644 --- a/src/vlib/unix/cj.c +++ b/src/vlib/unix/cj.c @@ -48,7 +48,7 @@ cj_log (u32 type, void *data0, void *data1) r = (cj_record_t *) & (cjm->records[new_tail & (cjm->num_records - 1)]); r->time = vlib_time_now (cjm->vlib_main); - r->cpu = os_get_cpu_number (); + r->thread_index = vlib_get_thread_index (); r->type = type; r->data[0] = pointer_to_uword (data0); r->data[1] = pointer_to_uword (data1); @@ -133,7 +133,8 @@ static inline void cj_dump_one_record (cj_record_t * r) { fprintf (stderr, "[%d]: %10.6f T%02d %llx %llx\n", - r->cpu, r->time, r->type, (long long unsigned int) r->data[0], + r->thread_index, r->time, r->type, + (long long unsigned int) r->data[0], (long long unsigned int) r->data[1]); } @@ -161,7 +162,7 @@ cj_dump_internal (u8 filter0_enable, u64 filter0, index = (cjm->tail + 1) & (cjm->num_records - 1); r = &(cjm->records[index]); - if (r->cpu != (u32) ~ 0) + if (r->thread_index != (u32) ~ 0) { /* Yes, dump from tail + 1 to the end */ for (i = index; i < cjm->num_records; i++) diff --git a/src/vlib/unix/cj.h b/src/vlib/unix/cj.h index 67626afe..d0a1d46e 100644 --- a/src/vlib/unix/cj.h +++ b/src/vlib/unix/cj.h @@ -23,7 +23,7 @@ typedef struct { f64 time; - u32 cpu; + u32 thread_index; u32 type; u64 data[2]; } cj_record_t; diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index 6b96cc0d..db5ddd64 100644 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -510,13 +510,28 @@ thread0 (uword arg) return i; } +u8 * +vlib_thread_stack_init (uword thread_index) +{ + vec_validate (vlib_thread_stacks, thread_index); + vlib_thread_stacks[thread_index] = clib_mem_alloc_aligned + (VLIB_THREAD_STACK_SIZE, VLIB_THREAD_STACK_SIZE); + + /* + * Disallow writes to the bottom page of the stack, to + * catch stack overflows. + */ + if (mprotect (vlib_thread_stacks[thread_index], + clib_mem_get_page_size (), PROT_READ) < 0) + clib_unix_warning ("thread stack"); + return vlib_thread_stacks[thread_index]; +} + int vlib_unix_main (int argc, char *argv[]) { vlib_main_t *vm = &vlib_global_main; /* one and only time for this! */ - vlib_thread_main_t *tm = &vlib_thread_main; unformat_input_t input; - u8 *thread_stacks; clib_error_t *e; int i; @@ -548,29 +563,9 @@ vlib_unix_main (int argc, char *argv[]) } unformat_free (&input); - /* - * allocate n x VLIB_THREAD_STACK_SIZE stacks, aligned to a - * VLIB_THREAD_STACK_SIZE boundary - * See also: os_get_cpu_number() in vlib/vlib/threads.c - */ - thread_stacks = clib_mem_alloc_aligned - ((uword) tm->n_thread_stacks * VLIB_THREAD_STACK_SIZE, - VLIB_THREAD_STACK_SIZE); - - vec_validate (vlib_thread_stacks, tm->n_thread_stacks - 1); - for (i = 0; i < vec_len (vlib_thread_stacks); i++) - { - vlib_thread_stacks[i] = thread_stacks; - - /* - * Disallow writes to the bottom page of the stack, to - * catch stack overflows. - */ - if (mprotect (thread_stacks, clib_mem_get_page_size (), PROT_READ) < 0) - clib_unix_warning ("thread stack"); + vlib_thread_stack_init (0); - thread_stacks += VLIB_THREAD_STACK_SIZE; - } + vlib_thread_index = 0; i = clib_calljmp (thread0, (uword) vm, (void *) (vlib_thread_stacks[0] + diff --git a/src/vnet/adj/adj_l2.c b/src/vnet/adj/adj_l2.c index f68e54e0..20d70dd4 100644 --- a/src/vnet/adj/adj_l2.c +++ b/src/vnet/adj/adj_l2.c @@ -52,7 +52,7 @@ adj_l2_rewrite_inline (vlib_main_t * vm, { u32 * from = vlib_frame_vector_args (frame); u32 n_left_from, n_left_to_next, * to_next, next_index; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); ethernet_main_t * em = ðernet_main; n_left_from = frame->n_vectors; @@ -93,7 +93,7 @@ adj_l2_rewrite_inline (vlib_main_t * vm, vnet_buffer(p0)->sw_if_index[VLIB_TX] = adj0->rewrite_header.sw_if_index; vlib_increment_combined_counter(&adjacency_counters, - cpu_index, + thread_index, adj_index0, /* packet increment */ 0, /* byte increment */ rw_len0); diff --git a/src/vnet/adj/adj_midchain.c b/src/vnet/adj/adj_midchain.c index e8087f08..5756de43 100644 --- a/src/vnet/adj/adj_midchain.c +++ b/src/vnet/adj/adj_midchain.c @@ -49,7 +49,7 @@ adj_midchain_tx_inline (vlib_main_t * vm, u32 next_index; vnet_main_t *vnm = vnet_get_main (); vnet_interface_main_t *im = &vnm->interface_main; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; /* Vector of buffer / pkt indices we're supposed to process */ from = vlib_frame_vector_args (frame); @@ -124,13 +124,13 @@ adj_midchain_tx_inline (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, adj0->rewrite_header.sw_if_index, 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, adj1->rewrite_header.sw_if_index, 1, vlib_buffer_length_in_chain (vm, b1)); @@ -181,7 +181,7 @@ adj_midchain_tx_inline (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, adj0->rewrite_header.sw_if_index, 1, vlib_buffer_length_in_chain (vm, b0)); diff --git a/src/vnet/adj/adj_nsh.c b/src/vnet/adj/adj_nsh.c index 9a0f9d8b..128570b0 100644 --- a/src/vnet/adj/adj_nsh.c +++ b/src/vnet/adj/adj_nsh.c @@ -53,7 +53,7 @@ adj_nsh_rewrite_inline (vlib_main_t * vm, { u32 * from = vlib_frame_vector_args (frame); u32 n_left_from, n_left_to_next, * to_next, next_index; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); n_left_from = frame->n_vectors; next_index = node->cached_next_index; @@ -94,7 +94,7 @@ adj_nsh_rewrite_inline (vlib_main_t * vm, vnet_buffer(p0)->ip.save_rewrite_length = rw_len0; vlib_increment_combined_counter(&adjacency_counters, - cpu_index, + thread_index, adj_index0, /* packet increment */ 0, /* byte increment */ rw_len0); diff --git a/src/vnet/classify/vnet_classify.c b/src/vnet/classify/vnet_classify.c index 98842a48..70a189b0 100644 --- a/src/vnet/classify/vnet_classify.c +++ b/src/vnet/classify/vnet_classify.c @@ -251,12 +251,12 @@ static inline void make_working_copy vnet_classify_entry_##size##_t * working_copy##size = 0; foreach_size_in_u32x4; #undef _ - u32 cpu_number = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); - if (cpu_number >= vec_len (t->working_copies)) + if (thread_index >= vec_len (t->working_copies)) { oldheap = clib_mem_set_heap (t->mheap); - vec_validate (t->working_copies, cpu_number); + vec_validate (t->working_copies, thread_index); clib_mem_set_heap (oldheap); } @@ -265,7 +265,7 @@ static inline void make_working_copy * updates from multiple threads will not result in sporadic, spurious * lookup failures. */ - working_copy = t->working_copies[cpu_number]; + working_copy = t->working_copies[thread_index]; t->saved_bucket.as_u64 = b->as_u64; oldheap = clib_mem_set_heap (t->mheap); @@ -290,7 +290,7 @@ static inline void make_working_copy default: abort(); } - t->working_copies[cpu_number] = working_copy; + t->working_copies[thread_index] = working_copy; } _vec_len(working_copy) = (1<log2_pages)*t->entries_per_page; @@ -318,7 +318,7 @@ static inline void make_working_copy working_bucket.offset = vnet_classify_get_offset (t, working_copy); CLIB_MEMORY_BARRIER(); b->as_u64 = working_bucket.as_u64; - t->working_copies[cpu_number] = working_copy; + t->working_copies[thread_index] = working_copy; } static vnet_classify_entry_t * @@ -387,7 +387,7 @@ int vnet_classify_add_del (vnet_classify_table_t * t, int i; u64 hash, new_hash; u32 new_log2_pages; - u32 cpu_number = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u8 * key_minus_skip; ASSERT ((add_v->flags & VNET_CLASSIFY_ENTRY_FREE) == 0); @@ -498,7 +498,7 @@ int vnet_classify_add_del (vnet_classify_table_t * t, new_log2_pages = t->saved_bucket.log2_pages + 1; expand_again: - working_copy = t->working_copies[cpu_number]; + working_copy = t->working_copies[thread_index]; new_v = split_and_rehash (t, working_copy, new_log2_pages); if (new_v == 0) diff --git a/src/vnet/cop/ip4_whitelist.c b/src/vnet/cop/ip4_whitelist.c index 6ef3d7d7..1b5e336b 100644 --- a/src/vnet/cop/ip4_whitelist.c +++ b/src/vnet/cop/ip4_whitelist.c @@ -60,7 +60,7 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm, cop_feature_type_t next_index; cop_main_t *cm = &cop_main; vlib_combined_counter_main_t * vcm = &load_balance_main.lbm_via_counters; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -177,12 +177,12 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm, dpo1 = load_balance_get_bucket_i(lb1, 0); vlib_increment_combined_counter - (vcm, cpu_index, lb_index0, 1, + (vcm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, b0) + sizeof(ethernet_header_t)); vlib_increment_combined_counter - (vcm, cpu_index, lb_index1, 1, + (vcm, thread_index, lb_index1, 1, vlib_buffer_length_in_chain (vm, b1) + sizeof(ethernet_header_t)); @@ -273,7 +273,7 @@ ip4_cop_whitelist_node_fn (vlib_main_t * vm, dpo0 = load_balance_get_bucket_i(lb0, 0); vlib_increment_combined_counter - (vcm, cpu_index, lb_index0, 1, + (vcm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, b0) + sizeof(ethernet_header_t)); diff --git a/src/vnet/cop/ip6_whitelist.c b/src/vnet/cop/ip6_whitelist.c index c2e16ccf..f3fe62e3 100644 --- a/src/vnet/cop/ip6_whitelist.c +++ b/src/vnet/cop/ip6_whitelist.c @@ -61,7 +61,7 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm, cop_main_t *cm = &cop_main; ip6_main_t * im6 = &ip6_main; vlib_combined_counter_main_t * vcm = &load_balance_main.lbm_via_counters; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -153,12 +153,12 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm, dpo1 = load_balance_get_bucket_i(lb1, 0); vlib_increment_combined_counter - (vcm, cpu_index, lb_index0, 1, + (vcm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, b0) + sizeof(ethernet_header_t)); vlib_increment_combined_counter - (vcm, cpu_index, lb_index1, 1, + (vcm, thread_index, lb_index1, 1, vlib_buffer_length_in_chain (vm, b1) + sizeof(ethernet_header_t)); @@ -233,7 +233,7 @@ ip6_cop_whitelist_node_fn (vlib_main_t * vm, dpo0 = load_balance_get_bucket_i(lb0, 0); vlib_increment_combined_counter - (vcm, cpu_index, lb_index0, 1, + (vcm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, b0) + sizeof(ethernet_header_t)); diff --git a/src/vnet/devices/af_packet/node.c b/src/vnet/devices/af_packet/node.c index ba337f3f..76980102 100644 --- a/src/vnet/devices/af_packet/node.c +++ b/src/vnet/devices/af_packet/node.c @@ -124,7 +124,7 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, u32 frame_num = apif->rx_req->tp_frame_nr; u8 *block_start = apif->rx_ring + block * block_size; uword n_trace = vlib_get_trace_count (vm, node); - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); u32 min_bufs = apif->rx_req->tp_frame_size / n_buffer_bytes; @@ -132,15 +132,15 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, if (apif->per_interface_next_index != ~0) next_index = apif->per_interface_next_index; - n_free_bufs = vec_len (apm->rx_buffers[cpu_index]); + n_free_bufs = vec_len (apm->rx_buffers[thread_index]); if (PREDICT_FALSE (n_free_bufs < VLIB_FRAME_SIZE)) { - vec_validate (apm->rx_buffers[cpu_index], + vec_validate (apm->rx_buffers[thread_index], VLIB_FRAME_SIZE + n_free_bufs - 1); n_free_bufs += - vlib_buffer_alloc (vm, &apm->rx_buffers[cpu_index][n_free_bufs], + vlib_buffer_alloc (vm, &apm->rx_buffers[thread_index][n_free_bufs], VLIB_FRAME_SIZE); - _vec_len (apm->rx_buffers[cpu_index]) = n_free_bufs; + _vec_len (apm->rx_buffers[thread_index]) = n_free_bufs; } rx_frame = apif->next_rx_frame; @@ -163,11 +163,11 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, { /* grab free buffer */ u32 last_empty_buffer = - vec_len (apm->rx_buffers[cpu_index]) - 1; + vec_len (apm->rx_buffers[thread_index]) - 1; prev_bi0 = bi0; - bi0 = apm->rx_buffers[cpu_index][last_empty_buffer]; + bi0 = apm->rx_buffers[thread_index][last_empty_buffer]; b0 = vlib_get_buffer (vm, bi0); - _vec_len (apm->rx_buffers[cpu_index]) = last_empty_buffer; + _vec_len (apm->rx_buffers[thread_index]) = last_empty_buffer; n_free_bufs--; /* copy data */ @@ -236,9 +236,9 @@ af_packet_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (vnet_get_main ()->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), apif->hw_if_index, n_rx_packets, n_rx_bytes); + vlib_get_thread_index (), apif->hw_if_index, n_rx_packets, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, n_rx_packets); + vnet_device_increment_rx_packets (thread_index, n_rx_packets); return n_rx_packets; } diff --git a/src/vnet/devices/devices.c b/src/vnet/devices/devices.c index 41645220..5e5e812c 100644 --- a/src/vnet/devices/devices.c +++ b/src/vnet/devices/devices.c @@ -104,7 +104,7 @@ vnet_device_queue_sort (void *a1, void *a2) void vnet_device_input_assign_thread (u32 hw_if_index, - u16 queue_id, uword cpu_index) + u16 queue_id, uword thread_index) { vnet_main_t *vnm = vnet_get_main (); vnet_device_main_t *vdm = &vnet_device_main; @@ -115,19 +115,19 @@ vnet_device_input_assign_thread (u32 hw_if_index, ASSERT (hw->input_node_index > 0); - if (vdm->first_worker_cpu_index == 0) - cpu_index = 0; + if (vdm->first_worker_thread_index == 0) + thread_index = 0; - if (cpu_index != 0 && - (cpu_index < vdm->first_worker_cpu_index || - cpu_index > vdm->last_worker_cpu_index)) + if (thread_index != 0 && + (thread_index < vdm->first_worker_thread_index || + thread_index > vdm->last_worker_thread_index)) { - cpu_index = vdm->next_worker_cpu_index++; - if (vdm->next_worker_cpu_index > vdm->last_worker_cpu_index) - vdm->next_worker_cpu_index = vdm->first_worker_cpu_index; + thread_index = vdm->next_worker_thread_index++; + if (vdm->next_worker_thread_index > vdm->last_worker_thread_index) + vdm->next_worker_thread_index = vdm->first_worker_thread_index; } - vm = vlib_mains[cpu_index]; + vm = vlib_mains[thread_index]; rt = vlib_node_get_runtime_data (vm, hw->input_node_index); vec_add2 (rt->devices_and_queues, dq, 1); @@ -136,33 +136,33 @@ vnet_device_input_assign_thread (u32 hw_if_index, dq->queue_id = queue_id; vec_sort_with_function (rt->devices_and_queues, vnet_device_queue_sort); - vec_validate (hw->input_node_cpu_index_by_queue, queue_id); - hw->input_node_cpu_index_by_queue[queue_id] = cpu_index; + vec_validate (hw->input_node_thread_index_by_queue, queue_id); + hw->input_node_thread_index_by_queue[queue_id] = thread_index; } static int vnet_device_input_unassign_thread (u32 hw_if_index, u16 queue_id, - uword cpu_index) + uword thread_index) { vnet_main_t *vnm = vnet_get_main (); vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); vnet_device_input_runtime_t *rt; vnet_device_and_queue_t *dq; - uword old_cpu_index; + uword old_thread_index; - if (hw->input_node_cpu_index_by_queue == 0) + if (hw->input_node_thread_index_by_queue == 0) return VNET_API_ERROR_INVALID_INTERFACE; - if (vec_len (hw->input_node_cpu_index_by_queue) < queue_id + 1) + if (vec_len (hw->input_node_thread_index_by_queue) < queue_id + 1) return VNET_API_ERROR_INVALID_INTERFACE; - old_cpu_index = hw->input_node_cpu_index_by_queue[queue_id]; + old_thread_index = hw->input_node_thread_index_by_queue[queue_id]; - if (old_cpu_index == cpu_index) + if (old_thread_index == thread_index) return 0; rt = - vlib_node_get_runtime_data (vlib_mains[old_cpu_index], + vlib_node_get_runtime_data (vlib_mains[old_thread_index], hw->input_node_index); vec_foreach (dq, rt->devices_and_queues) @@ -240,7 +240,7 @@ set_device_placement (vlib_main_t * vm, unformat_input_t * input, vnet_device_main_t *vdm = &vnet_device_main; u32 hw_if_index = (u32) ~ 0; u32 queue_id = (u32) 0; - u32 cpu_index = (u32) ~ 0; + u32 thread_index = (u32) ~ 0; int rv; if (!unformat_user (input, unformat_line_input, line_input)) @@ -253,10 +253,10 @@ set_device_placement (vlib_main_t * vm, unformat_input_t * input, ; else if (unformat (line_input, "queue %d", &queue_id)) ; - else if (unformat (line_input, "main", &cpu_index)) - cpu_index = 0; - else if (unformat (line_input, "worker %d", &cpu_index)) - cpu_index += vdm->first_worker_cpu_index; + else if (unformat (line_input, "main", &thread_index)) + thread_index = 0; + else if (unformat (line_input, "worker %d", &thread_index)) + thread_index += vdm->first_worker_thread_index; else { error = clib_error_return (0, "parse error: '%U'", @@ -271,16 +271,17 @@ set_device_placement (vlib_main_t * vm, unformat_input_t * input, if (hw_if_index == (u32) ~ 0) return clib_error_return (0, "please specify valid interface name"); - if (cpu_index > vdm->last_worker_cpu_index) + if (thread_index > vdm->last_worker_thread_index) return clib_error_return (0, "please specify valid worker thread or main"); - rv = vnet_device_input_unassign_thread (hw_if_index, queue_id, cpu_index); + rv = + vnet_device_input_unassign_thread (hw_if_index, queue_id, thread_index); if (rv) return clib_error_return (0, "not found"); - vnet_device_input_assign_thread (hw_if_index, queue_id, cpu_index); + vnet_device_input_assign_thread (hw_if_index, queue_id, thread_index); return 0; } @@ -326,9 +327,9 @@ vnet_device_init (vlib_main_t * vm) tr = p ? (vlib_thread_registration_t *) p[0] : 0; if (tr && tr->count > 0) { - vdm->first_worker_cpu_index = tr->first_index; - vdm->next_worker_cpu_index = tr->first_index; - vdm->last_worker_cpu_index = tr->first_index + tr->count - 1; + vdm->first_worker_thread_index = tr->first_index; + vdm->next_worker_thread_index = tr->first_index; + vdm->last_worker_thread_index = tr->first_index + tr->count - 1; } return 0; } diff --git a/src/vnet/devices/devices.h b/src/vnet/devices/devices.h index bbb29fe3..966f8302 100644 --- a/src/vnet/devices/devices.h +++ b/src/vnet/devices/devices.h @@ -50,9 +50,9 @@ typedef struct typedef struct { vnet_device_per_worker_data_t *workers; - uword first_worker_cpu_index; - uword last_worker_cpu_index; - uword next_worker_cpu_index; + uword first_worker_thread_index; + uword last_worker_thread_index; + uword next_worker_thread_index; } vnet_device_main_t; typedef struct @@ -80,7 +80,7 @@ vnet_set_device_input_node (u32 hw_if_index, u32 node_index) } void vnet_device_input_assign_thread (u32 hw_if_index, u16 queue_id, - uword cpu_index); + uword thread_index); static inline u64 vnet_get_aggregate_rx_packets (void) @@ -95,12 +95,12 @@ vnet_get_aggregate_rx_packets (void) } static inline void -vnet_device_increment_rx_packets (u32 cpu_index, u64 count) +vnet_device_increment_rx_packets (u32 thread_index, u64 count) { vnet_device_main_t *vdm = &vnet_device_main; vnet_device_per_worker_data_t *pwd; - pwd = vec_elt_at_index (vdm->workers, cpu_index); + pwd = vec_elt_at_index (vdm->workers, thread_index); pwd->aggregate_rx_packets += count; } @@ -117,9 +117,9 @@ vnet_device_input_set_interrupt_pending (vnet_main_t * vnm, u32 hw_if_index, { vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); - ASSERT (queue_id < vec_len (hw->input_node_cpu_index_by_queue)); - u32 cpu_index = hw->input_node_cpu_index_by_queue[queue_id]; - vlib_node_set_interrupt_pending (vlib_mains[cpu_index], + ASSERT (queue_id < vec_len (hw->input_node_thread_index_by_queue)); + u32 thread_index = hw->input_node_thread_index_by_queue[queue_id]; + vlib_node_set_interrupt_pending (vlib_mains[thread_index], hw->input_node_index); } diff --git a/src/vnet/devices/netmap/node.c b/src/vnet/devices/netmap/node.c index 68ea7832..e120eeae 100644 --- a/src/vnet/devices/netmap/node.c +++ b/src/vnet/devices/netmap/node.c @@ -98,22 +98,22 @@ netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, u32 n_free_bufs; struct netmap_ring *ring; int cur_ring; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 n_buffer_bytes = vlib_buffer_free_list_buffer_size (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); if (nif->per_interface_next_index != ~0) next_index = nif->per_interface_next_index; - n_free_bufs = vec_len (nm->rx_buffers[cpu_index]); + n_free_bufs = vec_len (nm->rx_buffers[thread_index]); if (PREDICT_FALSE (n_free_bufs < VLIB_FRAME_SIZE)) { - vec_validate (nm->rx_buffers[cpu_index], + vec_validate (nm->rx_buffers[thread_index], VLIB_FRAME_SIZE + n_free_bufs - 1); n_free_bufs += - vlib_buffer_alloc (vm, &nm->rx_buffers[cpu_index][n_free_bufs], + vlib_buffer_alloc (vm, &nm->rx_buffers[thread_index][n_free_bufs], VLIB_FRAME_SIZE); - _vec_len (nm->rx_buffers[cpu_index]) = n_free_bufs; + _vec_len (nm->rx_buffers[thread_index]) = n_free_bufs; } cur_ring = nif->first_rx_ring; @@ -163,11 +163,11 @@ netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_buffer_t *b0; /* grab free buffer */ u32 last_empty_buffer = - vec_len (nm->rx_buffers[cpu_index]) - 1; + vec_len (nm->rx_buffers[thread_index]) - 1; prev_bi0 = bi0; - bi0 = nm->rx_buffers[cpu_index][last_empty_buffer]; + bi0 = nm->rx_buffers[thread_index][last_empty_buffer]; b0 = vlib_get_buffer (vm, bi0); - _vec_len (nm->rx_buffers[cpu_index]) = last_empty_buffer; + _vec_len (nm->rx_buffers[thread_index]) = last_empty_buffer; n_free_bufs--; /* copy data */ @@ -247,9 +247,9 @@ netmap_device_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (vnet_get_main ()->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), nif->hw_if_index, n_rx_packets, n_rx_bytes); + vlib_get_thread_index (), nif->hw_if_index, n_rx_packets, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, n_rx_packets); + vnet_device_increment_rx_packets (thread_index, n_rx_packets); return n_rx_packets; } @@ -260,7 +260,7 @@ netmap_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, { int i; u32 n_rx_packets = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); netmap_main_t *nm = &netmap_main; netmap_if_t *nmi; @@ -269,7 +269,7 @@ netmap_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, nmi = vec_elt_at_index (nm->interfaces, i); if (nmi->is_admin_up && (i % nm->input_cpu_count) == - (cpu_index - nm->input_cpu_first_index)) + (thread_index - nm->input_cpu_first_index)) n_rx_packets += netmap_device_input_fn (vm, node, frame, nmi); } diff --git a/src/vnet/devices/ssvm/node.c b/src/vnet/devices/ssvm/node.c index a6c9dfd7..539b4161 100644 --- a/src/vnet/devices/ssvm/node.c +++ b/src/vnet/devices/ssvm/node.c @@ -89,7 +89,7 @@ ssvm_eth_device_input (ssvm_eth_main_t * em, ethernet_header_t *eh0; u16 type0; u32 n_rx_bytes = 0, l3_offset0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 trace_cnt __attribute__ ((unused)) = vlib_get_trace_count (vm, node); volatile u32 *lock; u32 *elt_indices; @@ -284,10 +284,10 @@ out: vlib_increment_combined_counter (vnet_get_main ()->interface_main.combined_sw_if_counters - + VNET_INTERFACE_COUNTER_RX, cpu_index, + + VNET_INTERFACE_COUNTER_RX, thread_index, intfc->vlib_hw_if_index, rx_queue_index, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, rx_queue_index); + vnet_device_increment_rx_packets (thread_index, rx_queue_index); return rx_queue_index; } diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c index 00807dc0..5e720f65 100644 --- a/src/vnet/devices/virtio/vhost-user.c +++ b/src/vnet/devices/virtio/vhost-user.c @@ -331,7 +331,7 @@ vhost_user_tx_thread_placement (vhost_user_intf_t * vui) { //Let's try to assign one queue to each thread u32 qid = 0; - u32 cpu_index = 0; + u32 thread_index = 0; vui->use_tx_spinlock = 0; while (1) { @@ -341,20 +341,21 @@ vhost_user_tx_thread_placement (vhost_user_intf_t * vui) if (!rxvq->started || !rxvq->enabled) continue; - vui->per_cpu_tx_qid[cpu_index] = qid; - cpu_index++; - if (cpu_index == vlib_get_thread_main ()->n_vlib_mains) + vui->per_cpu_tx_qid[thread_index] = qid; + thread_index++; + if (thread_index == vlib_get_thread_main ()->n_vlib_mains) return; } //We need to loop, meaning the spinlock has to be used vui->use_tx_spinlock = 1; - if (cpu_index == 0) + if (thread_index == 0) { //Could not find a single valid one - for (cpu_index = 0; - cpu_index < vlib_get_thread_main ()->n_vlib_mains; cpu_index++) + for (thread_index = 0; + thread_index < vlib_get_thread_main ()->n_vlib_mains; + thread_index++) { - vui->per_cpu_tx_qid[cpu_index] = 0; + vui->per_cpu_tx_qid[thread_index] = 0; } return; } @@ -368,7 +369,7 @@ vhost_user_rx_thread_placement () vhost_user_intf_t *vui; vhost_cpu_t *vhc; u32 *workers = 0; - u32 cpu_index; + u32 thread_index; vlib_main_t *vm; //Let's list all workers cpu indexes @@ -400,9 +401,9 @@ vhost_user_rx_thread_placement () continue; i %= vec_len (vui_workers); - cpu_index = vui_workers[i]; + thread_index = vui_workers[i]; i++; - vhc = &vum->cpus[cpu_index]; + vhc = &vum->cpus[thread_index]; iaq.qid = qid; iaq.vhost_iface_index = vui - vum->vhost_user_interfaces; @@ -429,14 +430,14 @@ vhost_user_rx_thread_placement () vhc->operation_mode = mode; } - for (cpu_index = vum->input_cpu_first_index; - cpu_index < vum->input_cpu_first_index + vum->input_cpu_count; - cpu_index++) + for (thread_index = vum->input_cpu_first_index; + thread_index < vum->input_cpu_first_index + vum->input_cpu_count; + thread_index++) { vlib_node_state_t state = VLIB_NODE_STATE_POLLING; - vhc = &vum->cpus[cpu_index]; - vm = vlib_mains ? vlib_mains[cpu_index] : &vlib_global_main; + vhc = &vum->cpus[thread_index]; + vm = vlib_mains ? vlib_mains[thread_index] : &vlib_global_main; switch (vhc->operation_mode) { case VHOST_USER_INTERRUPT_MODE: @@ -532,7 +533,7 @@ vhost_user_set_interrupt_pending (vhost_user_intf_t * vui, u32 ifq) { vhost_user_main_t *vum = &vhost_user_main; vhost_cpu_t *vhc; - u32 cpu_index; + u32 thread_index; vhost_iface_and_queue_t *vhiq; vlib_main_t *vm; u32 ifq2; @@ -553,8 +554,8 @@ vhost_user_set_interrupt_pending (vhost_user_intf_t * vui, u32 ifq) if ((vhiq->vhost_iface_index == (ifq >> 8)) && (VHOST_VRING_IDX_TX (vhiq->qid) == (ifq & 0xff))) { - cpu_index = vhc - vum->cpus; - vm = vlib_mains ? vlib_mains[cpu_index] : &vlib_global_main; + thread_index = vhc - vum->cpus; + vm = vlib_mains ? vlib_mains[thread_index] : &vlib_global_main; /* * Convert RX virtqueue number in the lower byte to vring * queue index for the input node process. Top bytes contain @@ -1592,7 +1593,7 @@ vhost_user_if_input (vlib_main_t * vm, u32 n_trace = vlib_get_trace_count (vm, node); u16 qsz_mask; u32 map_hint = 0; - u16 cpu_index = os_get_cpu_number (); + u16 thread_index = vlib_get_thread_index (); u16 copy_len = 0; { @@ -1651,32 +1652,32 @@ vhost_user_if_input (vlib_main_t * vm, * in the loop and come back later. This is not an issue as for big packet, * processing cost really comes from the memory copy. */ - if (PREDICT_FALSE (vum->cpus[cpu_index].rx_buffers_len < n_left + 1)) + if (PREDICT_FALSE (vum->cpus[thread_index].rx_buffers_len < n_left + 1)) { - u32 curr_len = vum->cpus[cpu_index].rx_buffers_len; - vum->cpus[cpu_index].rx_buffers_len += + u32 curr_len = vum->cpus[thread_index].rx_buffers_len; + vum->cpus[thread_index].rx_buffers_len += vlib_buffer_alloc_from_free_list (vm, - vum->cpus[cpu_index].rx_buffers + + vum->cpus[thread_index].rx_buffers + curr_len, VHOST_USER_RX_BUFFERS_N - curr_len, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); if (PREDICT_FALSE - (vum->cpus[cpu_index].rx_buffers_len < + (vum->cpus[thread_index].rx_buffers_len < VHOST_USER_RX_BUFFER_STARVATION)) { /* In case of buffer starvation, discard some packets from the queue * and log the event. * We keep doing best effort for the remaining packets. */ - u32 flush = (n_left + 1 > vum->cpus[cpu_index].rx_buffers_len) ? - n_left + 1 - vum->cpus[cpu_index].rx_buffers_len : 1; + u32 flush = (n_left + 1 > vum->cpus[thread_index].rx_buffers_len) ? + n_left + 1 - vum->cpus[thread_index].rx_buffers_len : 1; flush = vhost_user_rx_discard_packet (vm, vui, txvq, flush); n_left -= flush; vlib_increment_simple_counter (vnet_main. interface_main.sw_if_counters + VNET_INTERFACE_COUNTER_DROP, - os_get_cpu_number (), + vlib_get_thread_index (), vui->sw_if_index, flush); vlib_error_count (vm, vhost_user_input_node.index, @@ -1696,7 +1697,7 @@ vhost_user_if_input (vlib_main_t * vm, u32 desc_data_offset; vring_desc_t *desc_table = txvq->desc; - if (PREDICT_FALSE (vum->cpus[cpu_index].rx_buffers_len <= 1)) + if (PREDICT_FALSE (vum->cpus[thread_index].rx_buffers_len <= 1)) { /* Not enough rx_buffers * Note: We yeld on 1 so we don't need to do an additional @@ -1707,17 +1708,18 @@ vhost_user_if_input (vlib_main_t * vm, } desc_current = txvq->avail->ring[txvq->last_avail_idx & qsz_mask]; - vum->cpus[cpu_index].rx_buffers_len--; - bi_current = (vum->cpus[cpu_index].rx_buffers) - [vum->cpus[cpu_index].rx_buffers_len]; + vum->cpus[thread_index].rx_buffers_len--; + bi_current = (vum->cpus[thread_index].rx_buffers) + [vum->cpus[thread_index].rx_buffers_len]; b_head = b_current = vlib_get_buffer (vm, bi_current); to_next[0] = bi_current; //We do that now so we can forget about bi_current to_next++; n_left_to_next--; vlib_prefetch_buffer_with_index (vm, - (vum->cpus[cpu_index].rx_buffers) - [vum->cpus[cpu_index]. + (vum-> + cpus[thread_index].rx_buffers) + [vum->cpus[thread_index]. rx_buffers_len - 1], LOAD); /* Just preset the used descriptor id and length for later */ @@ -1791,7 +1793,7 @@ vhost_user_if_input (vlib_main_t * vm, (b_current->current_length == VLIB_BUFFER_DATA_SIZE)) { if (PREDICT_FALSE - (vum->cpus[cpu_index].rx_buffers_len == 0)) + (vum->cpus[thread_index].rx_buffers_len == 0)) { /* Cancel speculation */ to_next--; @@ -1805,17 +1807,18 @@ vhost_user_if_input (vlib_main_t * vm, * but valid. */ vhost_user_input_rewind_buffers (vm, - &vum->cpus[cpu_index], + &vum->cpus + [thread_index], b_head); n_left = 0; goto stop; } /* Get next output */ - vum->cpus[cpu_index].rx_buffers_len--; + vum->cpus[thread_index].rx_buffers_len--; u32 bi_next = - (vum->cpus[cpu_index].rx_buffers)[vum->cpus - [cpu_index].rx_buffers_len]; + (vum->cpus[thread_index].rx_buffers)[vum->cpus + [thread_index].rx_buffers_len]; b_current->next_buffer = bi_next; b_current->flags |= VLIB_BUFFER_NEXT_PRESENT; bi_current = bi_next; @@ -1823,7 +1826,7 @@ vhost_user_if_input (vlib_main_t * vm, } /* Prepare a copy order executed later for the data */ - vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len]; + vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len]; copy_len++; u32 desc_data_l = desc_table[desc_current].len - desc_data_offset; @@ -1880,7 +1883,7 @@ vhost_user_if_input (vlib_main_t * vm, if (PREDICT_FALSE (copy_len >= VHOST_USER_RX_COPY_THRESHOLD)) { if (PREDICT_FALSE - (vhost_user_input_copy (vui, vum->cpus[cpu_index].copy, + (vhost_user_input_copy (vui, vum->cpus[thread_index].copy, copy_len, &map_hint))) { clib_warning @@ -1905,7 +1908,7 @@ vhost_user_if_input (vlib_main_t * vm, /* Do the memory copies */ if (PREDICT_FALSE - (vhost_user_input_copy (vui, vum->cpus[cpu_index].copy, + (vhost_user_input_copy (vui, vum->cpus[thread_index].copy, copy_len, &map_hint))) { clib_warning ("Memory mapping error on interface hw_if_index=%d " @@ -1933,9 +1936,9 @@ vhost_user_if_input (vlib_main_t * vm, vlib_increment_combined_counter (vnet_main.interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), vui->sw_if_index, n_rx_packets, n_rx_bytes); + vlib_get_thread_index (), vui->sw_if_index, n_rx_packets, n_rx_bytes); - vnet_device_increment_rx_packets (cpu_index, n_rx_packets); + vnet_device_increment_rx_packets (thread_index, n_rx_packets); return n_rx_packets; } @@ -1946,15 +1949,15 @@ vhost_user_input (vlib_main_t * vm, { vhost_user_main_t *vum = &vhost_user_main; uword n_rx_packets = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); vhost_iface_and_queue_t *vhiq; vhost_user_intf_t *vui; vhost_cpu_t *vhc; - vhc = &vum->cpus[cpu_index]; + vhc = &vum->cpus[thread_index]; if (PREDICT_TRUE (vhc->operation_mode == VHOST_USER_POLLING_MODE)) { - vec_foreach (vhiq, vum->cpus[cpu_index].rx_queues) + vec_foreach (vhiq, vum->cpus[thread_index].rx_queues) { vui = &vum->vhost_user_interfaces[vhiq->vhost_iface_index]; n_rx_packets += vhost_user_if_input (vm, vum, vui, vhiq->qid, node); @@ -2096,7 +2099,7 @@ vhost_user_tx (vlib_main_t * vm, vhost_user_vring_t *rxvq; u16 qsz_mask; u8 error; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 map_hint = 0; u8 retry = 8; u16 copy_len; @@ -2116,7 +2119,7 @@ vhost_user_tx (vlib_main_t * vm, qid = VHOST_VRING_IDX_RX (*vec_elt_at_index - (vui->per_cpu_tx_qid, os_get_cpu_number ())); + (vui->per_cpu_tx_qid, vlib_get_thread_index ())); rxvq = &vui->vrings[qid]; if (PREDICT_FALSE (vui->use_tx_spinlock)) vhost_user_vring_lock (vui, qid); @@ -2143,10 +2146,10 @@ retry: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - vum->cpus[cpu_index].current_trace = + vum->cpus[thread_index].current_trace = vlib_add_trace (vm, node, b0, - sizeof (*vum->cpus[cpu_index].current_trace)); - vhost_user_tx_trace (vum->cpus[cpu_index].current_trace, + sizeof (*vum->cpus[thread_index].current_trace)); + vhost_user_tx_trace (vum->cpus[thread_index].current_trace, vui, qid / 2, b0, rxvq); } @@ -2188,14 +2191,14 @@ retry: { // Get a header from the header array virtio_net_hdr_mrg_rxbuf_t *hdr = - &vum->cpus[cpu_index].tx_headers[tx_headers_len]; + &vum->cpus[thread_index].tx_headers[tx_headers_len]; tx_headers_len++; hdr->hdr.flags = 0; hdr->hdr.gso_type = 0; hdr->num_buffers = 1; //This is local, no need to check // Prepare a copy order executed later for the header - vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len]; + vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len]; copy_len++; cpy->len = vui->virtio_net_hdr_sz; cpy->dst = buffer_map_addr; @@ -2220,7 +2223,7 @@ retry: else if (vui->virtio_net_hdr_sz == 12) //MRG is available { virtio_net_hdr_mrg_rxbuf_t *hdr = - &vum->cpus[cpu_index].tx_headers[tx_headers_len - 1]; + &vum->cpus[thread_index].tx_headers[tx_headers_len - 1]; //Move from available to used buffer rxvq->used->ring[rxvq->last_used_idx & qsz_mask].id = @@ -2282,7 +2285,7 @@ retry: } { - vhost_copy_t *cpy = &vum->cpus[cpu_index].copy[copy_len]; + vhost_copy_t *cpy = &vum->cpus[thread_index].copy[copy_len]; copy_len++; cpy->len = bytes_left; cpy->len = (cpy->len > buffer_len) ? buffer_len : cpy->len; @@ -2325,8 +2328,8 @@ retry: if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) { - vum->cpus[cpu_index].current_trace->hdr = - vum->cpus[cpu_index].tx_headers[tx_headers_len - 1]; + vum->cpus[thread_index].current_trace->hdr = + vum->cpus[thread_index].tx_headers[tx_headers_len - 1]; } n_left--; //At the end for error counting when 'goto done' is invoked @@ -2336,7 +2339,7 @@ retry: done: //Do the memory copies if (PREDICT_FALSE - (vhost_user_tx_copy (vui, vum->cpus[cpu_index].copy, + (vhost_user_tx_copy (vui, vum->cpus[thread_index].copy, copy_len, &map_hint))) { clib_warning ("Memory mapping error on interface hw_if_index=%d " @@ -2386,7 +2389,7 @@ done3: vlib_increment_simple_counter (vnet_main.interface_main.sw_if_counters + VNET_INTERFACE_COUNTER_DROP, - os_get_cpu_number (), vui->sw_if_index, n_left); + vlib_get_thread_index (), vui->sw_if_index, n_left); } vlib_buffer_free (vm, vlib_frame_args (frame), frame->n_vectors); @@ -2773,11 +2776,11 @@ vhost_user_send_interrupt_process (vlib_main_t * vm, case ~0: vec_foreach (vhc, vum->cpus) { - u32 cpu_index = vhc - vum->cpus; + u32 thread_index = vhc - vum->cpus; f64 next_timeout; next_timeout = timeout; - vec_foreach (vhiq, vum->cpus[cpu_index].rx_queues) + vec_foreach (vhiq, vum->cpus[thread_index].rx_queues) { vui = &vum->vhost_user_interfaces[vhiq->vhost_iface_index]; vhost_user_vring_t *rxvq = diff --git a/src/vnet/dpo/lookup_dpo.c b/src/vnet/dpo/lookup_dpo.c index e94e871c..97ad0a44 100644 --- a/src/vnet/dpo/lookup_dpo.c +++ b/src/vnet/dpo/lookup_dpo.c @@ -266,7 +266,7 @@ lookup_dpo_ip4_inline (vlib_main_t * vm, int table_from_interface) { u32 n_left_from, next_index, * from, * to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; from = vlib_frame_vector_args (from_frame); @@ -407,10 +407,10 @@ lookup_dpo_ip4_inline (vlib_main_t * vm, vnet_buffer(b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, b1)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -511,7 +511,7 @@ lookup_dpo_ip4_inline (vlib_main_t * vm, vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -606,7 +606,7 @@ lookup_dpo_ip6_inline (vlib_main_t * vm, { vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; u32 n_left_from, next_index, * from, * to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -749,10 +749,10 @@ lookup_dpo_ip6_inline (vlib_main_t * vm, vnet_buffer(b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, b1)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -853,7 +853,7 @@ lookup_dpo_ip6_inline (vlib_main_t * vm, vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -930,7 +930,7 @@ lookup_dpo_mpls_inline (vlib_main_t * vm, int table_from_interface) { u32 n_left_from, next_index, * from, * to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; from = vlib_frame_vector_args (from_frame); @@ -994,7 +994,7 @@ lookup_dpo_mpls_inline (vlib_main_t * vm, vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) diff --git a/src/vnet/dpo/replicate_dpo.c b/src/vnet/dpo/replicate_dpo.c index a9f334be..e25ceae9 100644 --- a/src/vnet/dpo/replicate_dpo.c +++ b/src/vnet/dpo/replicate_dpo.c @@ -627,7 +627,7 @@ replicate_inline (vlib_main_t * vm, vlib_combined_counter_main_t * cm = &replicate_main.repm_counters; replicate_main_t * rm = &replicate_main; u32 n_left_from, * from, * to_next, next_index; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -657,12 +657,12 @@ replicate_inline (vlib_main_t * vm, rep0 = replicate_get(repi0); vlib_increment_combined_counter( - cm, cpu_index, repi0, 1, + cm, thread_index, repi0, 1, vlib_buffer_length_in_chain(vm, b0)); - vec_validate (rm->clones[cpu_index], rep0->rep_n_buckets - 1); + vec_validate (rm->clones[thread_index], rep0->rep_n_buckets - 1); - num_cloned = vlib_buffer_clone (vm, bi0, rm->clones[cpu_index], rep0->rep_n_buckets, 128); + num_cloned = vlib_buffer_clone (vm, bi0, rm->clones[thread_index], rep0->rep_n_buckets, 128); if (num_cloned != rep0->rep_n_buckets) { @@ -673,7 +673,7 @@ replicate_inline (vlib_main_t * vm, for (bucket = 0; bucket < num_cloned; bucket++) { - ci0 = rm->clones[cpu_index][bucket]; + ci0 = rm->clones[thread_index][bucket]; c0 = vlib_get_buffer(vm, ci0); to_next[0] = ci0; @@ -700,7 +700,7 @@ replicate_inline (vlib_main_t * vm, vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); } } - vec_reset_length (rm->clones[cpu_index]); + vec_reset_length (rm->clones[thread_index]); } vlib_put_next_frame (vm, node, next_index, n_left_to_next); diff --git a/src/vnet/ethernet/arp.c b/src/vnet/ethernet/arp.c index ee757505..c74a097e 100644 --- a/src/vnet/ethernet/arp.c +++ b/src/vnet/ethernet/arp.c @@ -1771,7 +1771,7 @@ set_ip4_over_ethernet_rpc_callback (vnet_arp_set_ip4_over_ethernet_rpc_args_t * a) { vnet_main_t *vm = vnet_get_main (); - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); if (a->flags & ETHERNET_ARP_ARGS_REMOVE) vnet_arp_unset_ip4_over_ethernet_internal (vm, a); diff --git a/src/vnet/ethernet/interface.c b/src/vnet/ethernet/interface.c index 9894e3c8..335e3f9f 100644 --- a/src/vnet/ethernet/interface.c +++ b/src/vnet/ethernet/interface.c @@ -362,7 +362,7 @@ simulated_ethernet_interface_tx (vlib_main_t * vm, u32 next_index = VNET_SIMULATED_ETHERNET_TX_NEXT_ETHERNET_INPUT; u32 i, next_node_index, bvi_flag, sw_if_index; u32 n_pkts = 0, n_bytes = 0; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; vnet_main_t *vnm = vnet_get_main (); vnet_interface_main_t *im = &vnm->interface_main; vlib_node_main_t *nm = &vm->node_main; @@ -420,8 +420,9 @@ simulated_ethernet_interface_tx (vlib_main_t * vm, /* increment TX interface stat */ vlib_increment_combined_counter (im->combined_sw_if_counters + - VNET_INTERFACE_COUNTER_TX, cpu_index, - sw_if_index, n_pkts, n_bytes); + VNET_INTERFACE_COUNTER_TX, + thread_index, sw_if_index, n_pkts, + n_bytes); } return n_left_from; diff --git a/src/vnet/ethernet/node.c b/src/vnet/ethernet/node.c index b699e381..f7787ed2 100755 --- a/src/vnet/ethernet/node.c +++ b/src/vnet/ethernet/node.c @@ -291,7 +291,7 @@ ethernet_input_inline (vlib_main_t * vm, vlib_node_runtime_t *error_node; u32 n_left_from, next_index, *from, *to_next; u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 cached_sw_if_index = ~0; u32 cached_is_l2 = 0; /* shut up gcc */ vnet_hw_interface_t *hi = NULL; /* used for main interface only */ @@ -510,7 +510,7 @@ ethernet_input_inline (vlib_main_t * vm, interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, new_sw_if_index0, 1, len0); if (new_sw_if_index1 != old_sw_if_index1 @@ -519,7 +519,7 @@ ethernet_input_inline (vlib_main_t * vm, interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, new_sw_if_index1, 1, len1); @@ -530,7 +530,7 @@ ethernet_input_inline (vlib_main_t * vm, vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = stats_n_bytes = 0; @@ -696,13 +696,13 @@ ethernet_input_inline (vlib_main_t * vm, vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, new_sw_if_index0, 1, len0); + thread_index, new_sw_if_index0, 1, len0); if (stats_n_packets > 0) { vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = stats_n_bytes = 0; } @@ -734,7 +734,7 @@ ethernet_input_inline (vlib_main_t * vm, vlib_increment_combined_counter (vnm->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vnet/gre/node.c b/src/vnet/gre/node.c index 2683586e..acf15f24 100644 --- a/src/vnet/gre/node.c +++ b/src/vnet/gre/node.c @@ -75,7 +75,7 @@ gre_input (vlib_main_t * vm, u64 cached_tunnel_key6[4]; u32 cached_tunnel_sw_if_index = 0, tunnel_sw_if_index = 0; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 len; vnet_interface_main_t *im = &gm->vnet_main->interface_main; @@ -257,7 +257,7 @@ gre_input (vlib_main_t * vm, len = vlib_buffer_length_in_chain (vm, b0); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, tunnel_sw_if_index, 1 /* packets */, len /* bytes */); @@ -324,7 +324,7 @@ drop0: len = vlib_buffer_length_in_chain (vm, b1); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, tunnel_sw_if_index, 1 /* packets */, len /* bytes */); @@ -502,7 +502,7 @@ drop1: len = vlib_buffer_length_in_chain (vm, b0); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, tunnel_sw_if_index, 1 /* packets */, len /* bytes */); diff --git a/src/vnet/interface.h b/src/vnet/interface.h index a1ea2d61..08f08b10 100644 --- a/src/vnet/interface.h +++ b/src/vnet/interface.h @@ -468,7 +468,7 @@ typedef struct vnet_hw_interface_t u32 input_node_index; /* input node cpu index by queue */ - u32 *input_node_cpu_index_by_queue; + u32 *input_node_thread_index_by_queue; } vnet_hw_interface_t; diff --git a/src/vnet/interface_output.c b/src/vnet/interface_output.c index 03f2cdca..663dc309 100644 --- a/src/vnet/interface_output.c +++ b/src/vnet/interface_output.c @@ -196,7 +196,7 @@ slow_path (vlib_main_t * vm, */ static_always_inline void incr_output_stats (vnet_main_t * vnm, - u32 cpu_index, + u32 thread_index, u32 length, u32 sw_if_index, u32 * last_sw_if_index, u32 * n_packets, u32 * n_bytes) @@ -216,7 +216,7 @@ incr_output_stats (vnet_main_t * vnm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, *last_sw_if_index, *n_packets, *n_bytes); } @@ -240,7 +240,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, u32 n_left_to_tx, *from, *from_end, *to_tx; u32 n_bytes, n_buffers, n_packets; u32 last_sw_if_index; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; n_buffers = frame->n_vectors; @@ -266,7 +266,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_TX_ERROR); - vlib_increment_simple_counter (cm, cpu_index, + vlib_increment_simple_counter (cm, thread_index, rt->sw_if_index, n_buffers); return vlib_error_drop_buffers (vm, node, from, /* buffer stride */ 1, @@ -341,18 +341,18 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, from += 1; to_tx += n_buffers; n_left_to_tx -= n_buffers; - incr_output_stats (vnm, cpu_index, n_slow_bytes, + incr_output_stats (vnm, thread_index, n_slow_bytes, vnet_buffer (b)->sw_if_index[VLIB_TX], &last_sw_if_index, &n_packets, &n_bytes); } } else { - incr_output_stats (vnm, cpu_index, + incr_output_stats (vnm, thread_index, vlib_buffer_length_in_chain (vm, b0), vnet_buffer (b0)->sw_if_index[VLIB_TX], &last_sw_if_index, &n_packets, &n_bytes); - incr_output_stats (vnm, cpu_index, + incr_output_stats (vnm, thread_index, vlib_buffer_length_in_chain (vm, b0), vnet_buffer (b1)->sw_if_index[VLIB_TX], &last_sw_if_index, &n_packets, &n_bytes); @@ -396,7 +396,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, to_tx += n_buffers; n_left_to_tx -= n_buffers; } - incr_output_stats (vnm, cpu_index, + incr_output_stats (vnm, thread_index, vlib_buffer_length_in_chain (vm, b0), vnet_buffer (b0)->sw_if_index[VLIB_TX], &last_sw_if_index, &n_packets, &n_bytes); @@ -408,7 +408,7 @@ vnet_interface_output_node_flatten (vlib_main_t * vm, } /* Final update of interface stats. */ - incr_output_stats (vnm, cpu_index, 0, ~0, /* ~0 will flush stats */ + incr_output_stats (vnm, thread_index, 0, ~0, /* ~0 will flush stats */ &last_sw_if_index, &n_packets, &n_bytes); return n_buffers; @@ -428,7 +428,7 @@ vnet_interface_output_node (vlib_main_t * vm, u32 n_left_to_tx, *from, *from_end, *to_tx; u32 n_bytes, n_buffers, n_packets; u32 n_bytes_b0, n_bytes_b1, n_bytes_b2, n_bytes_b3; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; vnet_interface_main_t *im = &vnm->interface_main; u32 next_index = VNET_INTERFACE_OUTPUT_NEXT_TX; u32 current_config_index = ~0; @@ -458,7 +458,7 @@ vnet_interface_output_node (vlib_main_t * vm, cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, VNET_INTERFACE_COUNTER_TX_ERROR); - vlib_increment_simple_counter (cm, cpu_index, + vlib_increment_simple_counter (cm, thread_index, rt->sw_if_index, n_buffers); return vlib_error_drop_buffers (vm, node, from, @@ -558,7 +558,7 @@ vnet_interface_output_node (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif0, 1, + thread_index, tx_swif0, 1, n_bytes_b0); } @@ -567,7 +567,7 @@ vnet_interface_output_node (vlib_main_t * vm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif1, 1, + thread_index, tx_swif1, 1, n_bytes_b1); } @@ -576,7 +576,7 @@ vnet_interface_output_node (vlib_main_t * vm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif2, 1, + thread_index, tx_swif2, 1, n_bytes_b2); } if (PREDICT_FALSE (tx_swif3 != rt->sw_if_index)) @@ -584,7 +584,7 @@ vnet_interface_output_node (vlib_main_t * vm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif3, 1, + thread_index, tx_swif3, 1, n_bytes_b3); } } @@ -623,7 +623,7 @@ vnet_interface_output_node (vlib_main_t * vm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, tx_swif0, 1, + thread_index, tx_swif0, 1, n_bytes_b0); } } @@ -634,7 +634,7 @@ vnet_interface_output_node (vlib_main_t * vm, /* Update main interface stats. */ vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, + thread_index, rt->sw_if_index, n_packets, n_bytes); return n_buffers; } @@ -893,7 +893,7 @@ process_drop_punt (vlib_main_t * vm, u32 current_sw_if_index, n_errors_current_sw_if_index; u64 current_counter; vlib_simple_counter_main_t *cm; - u32 cpu_index = vm->cpu_index; + u32 thread_index = vm->thread_index; static vlib_error_t memory[VNET_ERROR_N_DISPOSITION]; static char memory_init[VNET_ERROR_N_DISPOSITION]; @@ -965,19 +965,19 @@ process_drop_punt (vlib_main_t * vm, current_counter -= 2; n_errors_current_sw_if_index -= 2; - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1); /* Increment super-interface drop/punt counters for sub-interfaces. */ sw_if0 = vnet_get_sw_interface (vnm, sw_if_index0); vlib_increment_simple_counter - (cm, cpu_index, sw_if0->sup_sw_if_index, + (cm, thread_index, sw_if0->sup_sw_if_index, sw_if0->sup_sw_if_index != sw_if_index0); sw_if1 = vnet_get_sw_interface (vnm, sw_if_index1); vlib_increment_simple_counter - (cm, cpu_index, sw_if1->sup_sw_if_index, + (cm, thread_index, sw_if1->sup_sw_if_index, sw_if1->sup_sw_if_index != sw_if_index1); em->counters[current_counter_index] = current_counter; @@ -1013,11 +1013,12 @@ process_drop_punt (vlib_main_t * vm, sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; /* Increment drop/punt counters. */ - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); /* Increment super-interface drop/punt counters for sub-interfaces. */ sw_if0 = vnet_get_sw_interface (vnm, sw_if_index0); - vlib_increment_simple_counter (cm, cpu_index, sw_if0->sup_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, + sw_if0->sup_sw_if_index, sw_if0->sup_sw_if_index != sw_if_index0); if (PREDICT_FALSE (e0 != current_error)) @@ -1041,12 +1042,12 @@ process_drop_punt (vlib_main_t * vm, { vnet_sw_interface_t *si; - vlib_increment_simple_counter (cm, cpu_index, current_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, current_sw_if_index, n_errors_current_sw_if_index); si = vnet_get_sw_interface (vnm, current_sw_if_index); if (si->sup_sw_if_index != current_sw_if_index) - vlib_increment_simple_counter (cm, cpu_index, si->sup_sw_if_index, + vlib_increment_simple_counter (cm, thread_index, si->sup_sw_if_index, n_errors_current_sw_if_index); } diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index ee1703e7..fdfe7f63 100644 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -75,7 +75,7 @@ ip4_lookup_inline (vlib_main_t * vm, vlib_combined_counter_main_t *cm = &load_balance_main.lbm_to_counters; u32 n_left_from, n_left_to_next, *from, *to_next; ip_lookup_next_t next; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -292,19 +292,19 @@ ip4_lookup_inline (vlib_main_t * vm, vnet_buffer (p3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lb_index0, 1, + (cm, thread_index, lb_index0, 1, vlib_buffer_length_in_chain (vm, p0) + sizeof (ethernet_header_t)); vlib_increment_combined_counter - (cm, cpu_index, lb_index1, 1, + (cm, thread_index, lb_index1, 1, vlib_buffer_length_in_chain (vm, p1) + sizeof (ethernet_header_t)); vlib_increment_combined_counter - (cm, cpu_index, lb_index2, 1, + (cm, thread_index, lb_index2, 1, vlib_buffer_length_in_chain (vm, p2) + sizeof (ethernet_header_t)); vlib_increment_combined_counter - (cm, cpu_index, lb_index3, 1, + (cm, thread_index, lb_index3, 1, vlib_buffer_length_in_chain (vm, p3) + sizeof (ethernet_header_t)); @@ -392,7 +392,7 @@ ip4_lookup_inline (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); from += 1; to_next += 1; @@ -479,7 +479,7 @@ ip4_load_balance (vlib_main_t * vm, vlib_combined_counter_main_t *cm = &load_balance_main.lbm_via_counters; u32 n_left_from, n_left_to_next, *from, *to_next; ip_lookup_next_t next; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -584,9 +584,9 @@ ip4_load_balance (vlib_main_t * vm, vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); vlib_validate_buffer_enqueue_x2 (vm, node, next, to_next, n_left_to_next, @@ -639,7 +639,7 @@ ip4_load_balance (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_validate_buffer_enqueue_x1 (vm, node, next, to_next, n_left_to_next, @@ -2330,7 +2330,7 @@ ip4_rewrite_inline (vlib_main_t * vm, n_left_from = frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -2379,9 +2379,9 @@ ip4_rewrite_inline (vlib_main_t * vm, if (do_counters) { vlib_prefetch_combined_counter (&adjacency_counters, - cpu_index, adj_index0); + thread_index, adj_index0); vlib_prefetch_combined_counter (&adjacency_counters, - cpu_index, adj_index1); + thread_index, adj_index1); } ip0 = vlib_buffer_get_current (p0); @@ -2527,13 +2527,13 @@ ip4_rewrite_inline (vlib_main_t * vm, { vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index1, 1, vlib_buffer_length_in_chain (vm, p1) + rw_len1); } @@ -2618,7 +2618,7 @@ ip4_rewrite_inline (vlib_main_t * vm, if (do_counters) vlib_prefetch_combined_counter (&adjacency_counters, - cpu_index, adj_index0); + thread_index, adj_index0); /* Guess we are only writing on simple Ethernet header. */ vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t)); @@ -2637,7 +2637,7 @@ ip4_rewrite_inline (vlib_main_t * vm, if (do_counters) vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index0, 1, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); /* Check MTU of outgoing interface. */ diff --git a/src/vnet/ip/ip4_input.c b/src/vnet/ip/ip4_input.c index ba200a9f..3b08f4b0 100644 --- a/src/vnet/ip/ip4_input.c +++ b/src/vnet/ip/ip4_input.c @@ -85,7 +85,7 @@ ip4_input_inline (vlib_main_t * vm, vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip4_input_node.index); vlib_simple_counter_main_t *cm; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -178,8 +178,8 @@ ip4_input_inline (vlib_main_t * vm, vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0); vnet_feature_arc_start (arc1, sw_if_index1, &next1, p1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1); /* Punt packets with options or wrong version. */ if (PREDICT_FALSE (ip0->ip_version_and_header_length != 0x45)) @@ -299,7 +299,7 @@ ip4_input_inline (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0; vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); /* Punt packets with options or wrong version. */ if (PREDICT_FALSE (ip0->ip_version_and_header_length != 0x45)) diff --git a/src/vnet/ip/ip6_forward.c b/src/vnet/ip/ip6_forward.c index c120f12c..c2fc4f87 100644 --- a/src/vnet/ip/ip6_forward.c +++ b/src/vnet/ip/ip6_forward.c @@ -74,7 +74,7 @@ ip6_lookup_inline (vlib_main_t * vm, vlib_combined_counter_main_t *cm = &load_balance_main.lbm_to_counters; u32 n_left_from, n_left_to_next, *from, *to_next; ip_lookup_next_t next; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -185,9 +185,9 @@ ip6_lookup_inline (vlib_main_t * vm, vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); from += 2; to_next += 2; @@ -291,7 +291,7 @@ ip6_lookup_inline (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); from += 1; to_next += 1; @@ -703,7 +703,7 @@ ip6_load_balance (vlib_main_t * vm, vlib_combined_counter_main_t *cm = &load_balance_main.lbm_via_counters; u32 n_left_from, n_left_to_next, *from, *to_next; ip_lookup_next_t next; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ip6_main_t *im = &ip6_main; from = vlib_frame_vector_args (frame); @@ -824,9 +824,9 @@ ip6_load_balance (vlib_main_t * vm, vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); vlib_validate_buffer_enqueue_x2 (vm, node, next, to_next, n_left_to_next, @@ -886,7 +886,7 @@ ip6_load_balance (vlib_main_t * vm, } vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_validate_buffer_enqueue_x1 (vm, node, next, to_next, n_left_to_next, @@ -1897,7 +1897,7 @@ ip6_rewrite_inline (vlib_main_t * vm, n_left_from = frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -2019,11 +2019,11 @@ ip6_rewrite_inline (vlib_main_t * vm, { vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index0, 1, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index1, 1, + thread_index, adj_index1, 1, vlib_buffer_length_in_chain (vm, p1) + rw_len1); } @@ -2156,7 +2156,7 @@ ip6_rewrite_inline (vlib_main_t * vm, { vlib_increment_combined_counter (&adjacency_counters, - cpu_index, adj_index0, 1, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); } diff --git a/src/vnet/ip/ip6_input.c b/src/vnet/ip/ip6_input.c index 20306088..ffdc4727 100644 --- a/src/vnet/ip/ip6_input.c +++ b/src/vnet/ip/ip6_input.c @@ -82,7 +82,7 @@ ip6_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip6_input_node.index); vlib_simple_counter_main_t *cm; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -171,8 +171,8 @@ ip6_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0); vnet_feature_arc_start (arc1, sw_if_index1, &next1, p1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1); error0 = error1 = IP6_ERROR_NONE; @@ -270,7 +270,7 @@ ip6_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0; vnet_feature_arc_start (arc0, sw_if_index0, &next0, p0); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); error0 = IP6_ERROR_NONE; /* Version != 6? Drop it. */ diff --git a/src/vnet/ip/ip6_neighbor.c b/src/vnet/ip/ip6_neighbor.c index 5d1fb6f8..2af546df 100644 --- a/src/vnet/ip/ip6_neighbor.c +++ b/src/vnet/ip/ip6_neighbor.c @@ -581,7 +581,7 @@ vnet_set_ip6_ethernet_neighbor (vlib_main_t * vm, u32 next_index; pending_resolution_t *pr, *mc; - if (os_get_cpu_number ()) + if (vlib_get_thread_index ()) { set_unset_ip6_neighbor_rpc (vm, sw_if_index, a, link_layer_address, 1 /* set new neighbor */ , is_static, @@ -722,7 +722,7 @@ vnet_unset_ip6_ethernet_neighbor (vlib_main_t * vm, uword *p; int rv = 0; - if (os_get_cpu_number ()) + if (vlib_get_thread_index ()) { set_unset_ip6_neighbor_rpc (vm, sw_if_index, a, link_layer_address, 0 /* unset */ , 0, 0); diff --git a/src/vnet/ipsec/esp.h b/src/vnet/ipsec/esp.h index 50cac806..799003b9 100644 --- a/src/vnet/ipsec/esp.h +++ b/src/vnet/ipsec/esp.h @@ -282,8 +282,8 @@ hmac_calc (ipsec_integ_alg_t alg, u8 * data, int data_len, u8 * signature, u8 use_esn, u32 seq_hi) { esp_main_t *em = &esp_main; - u32 cpu_index = os_get_cpu_number (); - HMAC_CTX *ctx = &(em->per_thread_data[cpu_index].hmac_ctx); + u32 thread_index = vlib_get_thread_index (); + HMAC_CTX *ctx = &(em->per_thread_data[thread_index].hmac_ctx); const EVP_MD *md = NULL; unsigned int len; @@ -292,10 +292,10 @@ hmac_calc (ipsec_integ_alg_t alg, if (PREDICT_FALSE (em->esp_integ_algs[alg].md == 0)) return 0; - if (PREDICT_FALSE (alg != em->per_thread_data[cpu_index].last_integ_alg)) + if (PREDICT_FALSE (alg != em->per_thread_data[thread_index].last_integ_alg)) { md = em->esp_integ_algs[alg].md; - em->per_thread_data[cpu_index].last_integ_alg = alg; + em->per_thread_data[thread_index].last_integ_alg = alg; } HMAC_Init (ctx, key, key_len, md); diff --git a/src/vnet/ipsec/esp_decrypt.c b/src/vnet/ipsec/esp_decrypt.c index 7289b260..925d2b45 100644 --- a/src/vnet/ipsec/esp_decrypt.c +++ b/src/vnet/ipsec/esp_decrypt.c @@ -85,8 +85,8 @@ esp_decrypt_aes_cbc (ipsec_crypto_alg_t alg, u8 * in, u8 * out, size_t in_len, u8 * key, u8 * iv) { esp_main_t *em = &esp_main; - u32 cpu_index = os_get_cpu_number (); - EVP_CIPHER_CTX *ctx = &(em->per_thread_data[cpu_index].decrypt_ctx); + u32 thread_index = vlib_get_thread_index (); + EVP_CIPHER_CTX *ctx = &(em->per_thread_data[thread_index].decrypt_ctx); const EVP_CIPHER *cipher = NULL; int out_len; @@ -95,10 +95,11 @@ esp_decrypt_aes_cbc (ipsec_crypto_alg_t alg, if (PREDICT_FALSE (em->esp_crypto_algs[alg].type == 0)) return; - if (PREDICT_FALSE (alg != em->per_thread_data[cpu_index].last_decrypt_alg)) + if (PREDICT_FALSE + (alg != em->per_thread_data[thread_index].last_decrypt_alg)) { cipher = em->esp_crypto_algs[alg].type; - em->per_thread_data[cpu_index].last_decrypt_alg = alg; + em->per_thread_data[thread_index].last_decrypt_alg = alg; } EVP_DecryptInit_ex (ctx, cipher, NULL, key, iv); @@ -117,11 +118,11 @@ esp_decrypt_node_fn (vlib_main_t * vm, u32 *recycle = 0; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ipsec_alloc_empty_buffers (vm, im); - u32 *empty_buffers = im->empty_buffers[cpu_index]; + u32 *empty_buffers = im->empty_buffers[thread_index]; if (PREDICT_FALSE (vec_len (empty_buffers) < n_left_from)) { diff --git a/src/vnet/ipsec/esp_encrypt.c b/src/vnet/ipsec/esp_encrypt.c index 44ae2297..b2bc4e0b 100644 --- a/src/vnet/ipsec/esp_encrypt.c +++ b/src/vnet/ipsec/esp_encrypt.c @@ -88,8 +88,8 @@ esp_encrypt_aes_cbc (ipsec_crypto_alg_t alg, u8 * in, u8 * out, size_t in_len, u8 * key, u8 * iv) { esp_main_t *em = &esp_main; - u32 cpu_index = os_get_cpu_number (); - EVP_CIPHER_CTX *ctx = &(em->per_thread_data[cpu_index].encrypt_ctx); + u32 thread_index = vlib_get_thread_index (); + EVP_CIPHER_CTX *ctx = &(em->per_thread_data[thread_index].encrypt_ctx); const EVP_CIPHER *cipher = NULL; int out_len; @@ -98,10 +98,11 @@ esp_encrypt_aes_cbc (ipsec_crypto_alg_t alg, if (PREDICT_FALSE (em->esp_crypto_algs[alg].type == IPSEC_CRYPTO_ALG_NONE)) return; - if (PREDICT_FALSE (alg != em->per_thread_data[cpu_index].last_encrypt_alg)) + if (PREDICT_FALSE + (alg != em->per_thread_data[thread_index].last_encrypt_alg)) { cipher = em->esp_crypto_algs[alg].type; - em->per_thread_data[cpu_index].last_encrypt_alg = alg; + em->per_thread_data[thread_index].last_encrypt_alg = alg; } EVP_EncryptInit_ex (ctx, cipher, NULL, key, iv); @@ -119,11 +120,11 @@ esp_encrypt_node_fn (vlib_main_t * vm, n_left_from = from_frame->n_vectors; ipsec_main_t *im = &ipsec_main; u32 *recycle = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); ipsec_alloc_empty_buffers (vm, im); - u32 *empty_buffers = im->empty_buffers[cpu_index]; + u32 *empty_buffers = im->empty_buffers[thread_index]; if (PREDICT_FALSE (vec_len (empty_buffers) < n_left_from)) { diff --git a/src/vnet/ipsec/ikev2.c b/src/vnet/ipsec/ikev2.c index 2c1074d8..3f9978a7 100644 --- a/src/vnet/ipsec/ikev2.c +++ b/src/vnet/ipsec/ikev2.c @@ -303,16 +303,16 @@ static void ikev2_delete_sa (ikev2_sa_t * sa) { ikev2_main_t *km = &ikev2_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); uword *p; ikev2_sa_free_all_vec (sa); - p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi, sa->rspi); + p = hash_get (km->per_thread_data[thread_index].sa_by_rspi, sa->rspi); if (p) { - hash_unset (km->per_thread_data[cpu_index].sa_by_rspi, sa->rspi); - pool_put (km->per_thread_data[cpu_index].sas, sa); + hash_unset (km->per_thread_data[thread_index].sa_by_rspi, sa->rspi); + pool_put (km->per_thread_data[thread_index].sas, sa); } } @@ -776,29 +776,31 @@ ikev2_initial_contact_cleanup (ikev2_sa_t * sa) ikev2_sa_t *tmp; u32 i, *delete = 0; ikev2_child_sa_t *c; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); if (!sa->initial_contact) return; /* find old IKE SAs with the same authenticated identity */ /* *INDENT-OFF* */ - pool_foreach (tmp, km->per_thread_data[cpu_index].sas, ({ + pool_foreach (tmp, km->per_thread_data[thread_index].sas, ({ if (tmp->i_id.type != sa->i_id.type || vec_len(tmp->i_id.data) != vec_len(sa->i_id.data) || memcmp(sa->i_id.data, tmp->i_id.data, vec_len(sa->i_id.data))) continue; if (sa->rspi != tmp->rspi) - vec_add1(delete, tmp - km->per_thread_data[cpu_index].sas); + vec_add1(delete, tmp - km->per_thread_data[thread_index].sas); })); /* *INDENT-ON* */ for (i = 0; i < vec_len (delete); i++) { - tmp = pool_elt_at_index (km->per_thread_data[cpu_index].sas, delete[i]); - vec_foreach (c, tmp->childs) - ikev2_delete_tunnel_interface (km->vnet_main, tmp, c); + tmp = + pool_elt_at_index (km->per_thread_data[thread_index].sas, delete[i]); + vec_foreach (c, + tmp->childs) ikev2_delete_tunnel_interface (km->vnet_main, + tmp, c); ikev2_delete_sa (tmp); } @@ -1922,10 +1924,10 @@ ikev2_retransmit_sa_init (ike_header_t * ike, { ikev2_main_t *km = &ikev2_main; ikev2_sa_t *sa; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); /* *INDENT-OFF* */ - pool_foreach (sa, km->per_thread_data[cpu_index].sas, ({ + pool_foreach (sa, km->per_thread_data[thread_index].sas, ({ if (sa->ispi == clib_net_to_host_u64(ike->ispi) && sa->iaddr.as_u32 == iaddr.as_u32 && sa->raddr.as_u32 == raddr.as_u32) @@ -2036,7 +2038,7 @@ ikev2_node_fn (vlib_main_t * vm, u32 n_left_from, *from, *to_next; ikev2_next_t next_index; ikev2_main_t *km = &ikev2_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -2134,11 +2136,14 @@ ikev2_node_fn (vlib_main_t * vm, if (sa0->state == IKEV2_STATE_SA_INIT) { /* add SA to the pool */ - pool_get (km->per_thread_data[cpu_index].sas, sa0); + pool_get (km->per_thread_data[thread_index].sas, + sa0); clib_memcpy (sa0, &sa, sizeof (*sa0)); - hash_set (km->per_thread_data[cpu_index].sa_by_rspi, + hash_set (km-> + per_thread_data[thread_index].sa_by_rspi, sa0->rspi, - sa0 - km->per_thread_data[cpu_index].sas); + sa0 - + km->per_thread_data[thread_index].sas); } else { @@ -2169,11 +2174,11 @@ ikev2_node_fn (vlib_main_t * vm, if (sa0->state == IKEV2_STATE_SA_INIT) { /* add SA to the pool */ - pool_get (km->per_thread_data[cpu_index].sas, sa0); + pool_get (km->per_thread_data[thread_index].sas, sa0); clib_memcpy (sa0, &sa, sizeof (*sa0)); - hash_set (km->per_thread_data[cpu_index].sa_by_rspi, + hash_set (km->per_thread_data[thread_index].sa_by_rspi, sa0->rspi, - sa0 - km->per_thread_data[cpu_index].sas); + sa0 - km->per_thread_data[thread_index].sas); } else { @@ -2184,12 +2189,13 @@ ikev2_node_fn (vlib_main_t * vm, else if (ike0->exchange == IKEV2_EXCHANGE_IKE_AUTH) { uword *p; - p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi, + p = hash_get (km->per_thread_data[thread_index].sa_by_rspi, clib_net_to_host_u64 (ike0->rspi)); if (p) { - sa0 = pool_elt_at_index (km->per_thread_data[cpu_index].sas, - p[0]); + sa0 = + pool_elt_at_index (km->per_thread_data[thread_index].sas, + p[0]); r = ikev2_retransmit_resp (sa0, ike0); if (r == 1) @@ -2240,12 +2246,13 @@ ikev2_node_fn (vlib_main_t * vm, else if (ike0->exchange == IKEV2_EXCHANGE_INFORMATIONAL) { uword *p; - p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi, + p = hash_get (km->per_thread_data[thread_index].sa_by_rspi, clib_net_to_host_u64 (ike0->rspi)); if (p) { - sa0 = pool_elt_at_index (km->per_thread_data[cpu_index].sas, - p[0]); + sa0 = + pool_elt_at_index (km->per_thread_data[thread_index].sas, + p[0]); r = ikev2_retransmit_resp (sa0, ike0); if (r == 1) @@ -2305,12 +2312,13 @@ ikev2_node_fn (vlib_main_t * vm, else if (ike0->exchange == IKEV2_EXCHANGE_CREATE_CHILD_SA) { uword *p; - p = hash_get (km->per_thread_data[cpu_index].sa_by_rspi, + p = hash_get (km->per_thread_data[thread_index].sa_by_rspi, clib_net_to_host_u64 (ike0->rspi)); if (p) { - sa0 = pool_elt_at_index (km->per_thread_data[cpu_index].sas, - p[0]); + sa0 = + pool_elt_at_index (km->per_thread_data[thread_index].sas, + p[0]); r = ikev2_retransmit_resp (sa0, ike0); if (r == 1) diff --git a/src/vnet/ipsec/ipsec.h b/src/vnet/ipsec/ipsec.h index 58f0f145..c884e360 100644 --- a/src/vnet/ipsec/ipsec.h +++ b/src/vnet/ipsec/ipsec.h @@ -324,21 +324,21 @@ int ipsec_set_interface_key (vnet_main_t * vnm, u32 hw_if_index, always_inline void ipsec_alloc_empty_buffers (vlib_main_t * vm, ipsec_main_t * im) { - u32 cpu_index = os_get_cpu_number (); - uword l = vec_len (im->empty_buffers[cpu_index]); + u32 thread_index = vlib_get_thread_index (); + uword l = vec_len (im->empty_buffers[thread_index]); uword n_alloc = 0; if (PREDICT_FALSE (l < VLIB_FRAME_SIZE)) { - if (!im->empty_buffers[cpu_index]) + if (!im->empty_buffers[thread_index]) { - vec_alloc (im->empty_buffers[cpu_index], 2 * VLIB_FRAME_SIZE); + vec_alloc (im->empty_buffers[thread_index], 2 * VLIB_FRAME_SIZE); } - n_alloc = vlib_buffer_alloc (vm, im->empty_buffers[cpu_index] + l, + n_alloc = vlib_buffer_alloc (vm, im->empty_buffers[thread_index] + l, 2 * VLIB_FRAME_SIZE - l); - _vec_len (im->empty_buffers[cpu_index]) = l + n_alloc; + _vec_len (im->empty_buffers[thread_index]) = l + n_alloc; } } diff --git a/src/vnet/ipsec/ipsec_if.c b/src/vnet/ipsec/ipsec_if.c index dc882004..ed124894 100644 --- a/src/vnet/ipsec/ipsec_if.c +++ b/src/vnet/ipsec/ipsec_if.c @@ -99,7 +99,7 @@ static int ipsec_add_del_tunnel_if_rpc_callback (ipsec_add_del_tunnel_args_t * a) { vnet_main_t *vnm = vnet_get_main (); - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); return ipsec_add_del_tunnel_if_internal (vnm, a); } diff --git a/src/vnet/l2/l2_bvi.h b/src/vnet/l2/l2_bvi.h index dd1130a6..e21a1616 100644 --- a/src/vnet/l2/l2_bvi.h +++ b/src/vnet/l2/l2_bvi.h @@ -97,7 +97,7 @@ l2_to_bvi (vlib_main_t * vlib_main, vlib_increment_combined_counter (vnet_main->interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - vlib_main->cpu_index, + vlib_main->thread_index, vnet_buffer (b0)->sw_if_index[VLIB_RX], 1, vlib_buffer_length_in_chain (vlib_main, b0)); return TO_BVI_ERR_OK; diff --git a/src/vnet/l2/l2_input.c b/src/vnet/l2/l2_input.c index 041ff38d..e5d6878a 100644 --- a/src/vnet/l2/l2_input.c +++ b/src/vnet/l2/l2_input.c @@ -117,7 +117,7 @@ typedef enum static_always_inline void classify_and_dispatch (vlib_main_t * vm, vlib_node_runtime_t * node, - u32 cpu_index, + u32 thread_index, l2input_main_t * msm, vlib_buffer_t * b0, u32 * next0) { /* @@ -237,7 +237,7 @@ l2input_node_inline (vlib_main_t * vm, u32 n_left_from, *from, *to_next; l2input_next_t next_index; l2input_main_t *msm = &l2input_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; /* number of packets to process */ @@ -350,10 +350,10 @@ l2input_node_inline (vlib_main_t * vm, vlib_node_increment_counter (vm, l2input_node.index, L2INPUT_ERROR_L2INPUT, 4); - classify_and_dispatch (vm, node, cpu_index, msm, b0, &next0); - classify_and_dispatch (vm, node, cpu_index, msm, b1, &next1); - classify_and_dispatch (vm, node, cpu_index, msm, b2, &next2); - classify_and_dispatch (vm, node, cpu_index, msm, b3, &next3); + classify_and_dispatch (vm, node, thread_index, msm, b0, &next0); + classify_and_dispatch (vm, node, thread_index, msm, b1, &next1); + classify_and_dispatch (vm, node, thread_index, msm, b2, &next2); + classify_and_dispatch (vm, node, thread_index, msm, b3, &next3); /* verify speculative enqueues, maybe switch current next frame */ /* if next0==next1==next_index then nothing special needs to be done */ @@ -393,7 +393,7 @@ l2input_node_inline (vlib_main_t * vm, vlib_node_increment_counter (vm, l2input_node.index, L2INPUT_ERROR_L2INPUT, 1); - classify_and_dispatch (vm, node, cpu_index, msm, b0, &next0); + classify_and_dispatch (vm, node, thread_index, msm, b0, &next0); /* verify speculative enqueue, maybe switch current next frame */ vlib_validate_buffer_enqueue_x1 (vm, node, next_index, diff --git a/src/vnet/l2/l2_output.c b/src/vnet/l2/l2_output.c index 00f22571..e17b2a16 100644 --- a/src/vnet/l2/l2_output.c +++ b/src/vnet/l2/l2_output.c @@ -643,11 +643,11 @@ l2output_create_output_node_mapping (vlib_main_t * vlib_main, vnet_main_t * vnet hw0 = vnet_get_sup_hw_interface (vnet_main, sw_if_index); - uword cpu_number; + uword thread_index; - cpu_number = os_get_cpu_number (); + thread_index = vlib_get_thread_index (); - if (cpu_number) + if (thread_index) { u32 oldflags; diff --git a/src/vnet/l2tp/decap.c b/src/vnet/l2tp/decap.c index e8986935..46104129 100644 --- a/src/vnet/l2tp/decap.c +++ b/src/vnet/l2tp/decap.c @@ -149,7 +149,7 @@ last_stage (vlib_main_t * vm, vlib_node_runtime_t * node, u32 bi) /* per-mapping byte stats include the ethernet header */ vlib_increment_combined_counter (&lm->counter_main, - os_get_cpu_number (), + vlib_get_thread_index (), counter_index, 1 /* packet_increment */ , vlib_buffer_length_in_chain (vm, b) + sizeof (ethernet_header_t)); diff --git a/src/vnet/l2tp/encap.c b/src/vnet/l2tp/encap.c index ed7a9580..dcdfde4b 100644 --- a/src/vnet/l2tp/encap.c +++ b/src/vnet/l2tp/encap.c @@ -124,7 +124,7 @@ last_stage (vlib_main_t * vm, vlib_node_runtime_t * node, u32 bi) /* per-mapping byte stats include the ethernet header */ vlib_increment_combined_counter (&lm->counter_main, - os_get_cpu_number (), + vlib_get_thread_index (), counter_index, 1 /* packet_increment */ , vlib_buffer_length_in_chain (vm, b)); diff --git a/src/vnet/l2tp/l2tp.c b/src/vnet/l2tp/l2tp.c index cb94d7e7..3dedc447 100644 --- a/src/vnet/l2tp/l2tp.c +++ b/src/vnet/l2tp/l2tp.c @@ -157,7 +157,7 @@ test_counters_command_fn (vlib_main_t * vm, u32 session_index; u32 counter_index; u32 nincr = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); /* *INDENT-OFF* */ pool_foreach (session, lm->sessions, @@ -167,11 +167,11 @@ test_counters_command_fn (vlib_main_t * vm, session_index_to_counter_index (session_index, SESSION_COUNTER_USER_TO_NETWORK); vlib_increment_combined_counter (&lm->counter_main, - cpu_index, + thread_index, counter_index, 1/*pkt*/, 1111 /*bytes*/); vlib_increment_combined_counter (&lm->counter_main, - cpu_index, + thread_index, counter_index+1, 1/*pkt*/, 2222 /*bytes*/); nincr++; diff --git a/src/vnet/lisp-gpe/decap.c b/src/vnet/lisp-gpe/decap.c index d887a95f..68769710 100644 --- a/src/vnet/lisp-gpe/decap.c +++ b/src/vnet/lisp-gpe/decap.c @@ -103,7 +103,7 @@ next_index_to_iface (lisp_gpe_main_t * lgm, u32 next_index) } static_always_inline void -incr_decap_stats (vnet_main_t * vnm, u32 cpu_index, u32 length, +incr_decap_stats (vnet_main_t * vnm, u32 thread_index, u32 length, u32 sw_if_index, u32 * last_sw_if_index, u32 * n_packets, u32 * n_bytes) { @@ -122,7 +122,7 @@ incr_decap_stats (vnet_main_t * vnm, u32 cpu_index, u32 length, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, *last_sw_if_index, + thread_index, *last_sw_if_index, *n_packets, *n_bytes); } *last_sw_if_index = sw_if_index; @@ -150,11 +150,11 @@ static uword lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, u8 is_v4) { - u32 n_left_from, next_index, *from, *to_next, cpu_index; + u32 n_left_from, next_index, *from, *to_next, thread_index; u32 n_bytes = 0, n_packets = 0, last_sw_if_index = ~0, drops = 0; lisp_gpe_main_t *lgm = vnet_lisp_gpe_get_main (); - cpu_index = os_get_cpu_number (); + thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -267,7 +267,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (si0) { - incr_decap_stats (lgm->vnet_main, cpu_index, + incr_decap_stats (lgm->vnet_main, thread_index, vlib_buffer_length_in_chain (vm, b0), si0[0], &last_sw_if_index, &n_packets, &n_bytes); vnet_buffer (b0)->sw_if_index[VLIB_RX] = si0[0]; @@ -282,7 +282,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (si1) { - incr_decap_stats (lgm->vnet_main, cpu_index, + incr_decap_stats (lgm->vnet_main, thread_index, vlib_buffer_length_in_chain (vm, b1), si1[0], &last_sw_if_index, &n_packets, &n_bytes); vnet_buffer (b1)->sw_if_index[VLIB_RX] = si1[0]; @@ -397,7 +397,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (si0) { - incr_decap_stats (lgm->vnet_main, cpu_index, + incr_decap_stats (lgm->vnet_main, thread_index, vlib_buffer_length_in_chain (vm, b0), si0[0], &last_sw_if_index, &n_packets, &n_bytes); vnet_buffer (b0)->sw_if_index[VLIB_RX] = si0[0]; @@ -430,7 +430,7 @@ lisp_gpe_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } /* flush iface stats */ - incr_decap_stats (lgm->vnet_main, cpu_index, 0, ~0, &last_sw_if_index, + incr_decap_stats (lgm->vnet_main, thread_index, 0, ~0, &last_sw_if_index, &n_packets, &n_bytes); vlib_node_increment_counter (vm, lisp_gpe_ip4_input_node.index, LISP_GPE_ERROR_NO_TUNNEL, drops); diff --git a/src/vnet/lldp/lldp_input.c b/src/vnet/lldp/lldp_input.c index 762743d0..e88f6fdb 100644 --- a/src/vnet/lldp/lldp_input.c +++ b/src/vnet/lldp/lldp_input.c @@ -35,7 +35,7 @@ typedef struct static void lldp_rpc_update_peer_cb (const lldp_intf_update_t * a) { - ASSERT (os_get_cpu_number () == 0); + ASSERT (vlib_get_thread_index () == 0); lldp_intf_t *n = lldp_get_intf (&lldp_main, a->hw_if_index); if (!n) diff --git a/src/vnet/map/ip4_map.c b/src/vnet/map/ip4_map.c index 1a20d704..e39b6f14 100644 --- a/src/vnet/map/ip4_map.c +++ b/src/vnet/map/ip4_map.c @@ -248,7 +248,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) next_index = node->cached_next_index; map_main_t *mm = &map_main; vlib_combined_counter_main_t *cm = mm->domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -377,7 +377,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip40) ? IP4_MAP_NEXT_IP6_REWRITE : next0; vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, map_domain_index0, 1, clib_net_to_host_u16 (ip6h0->payload_length) + @@ -409,7 +409,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip41) ? IP4_MAP_NEXT_IP6_REWRITE : next1; vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, map_domain_index1, 1, clib_net_to_host_u16 (ip6h1->payload_length) + @@ -520,7 +520,7 @@ ip4_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip40) ? IP4_MAP_NEXT_IP6_REWRITE : next0; vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, map_domain_index0, 1, clib_net_to_host_u16 (ip6h0->payload_length) + @@ -564,7 +564,7 @@ ip4_map_reass (vlib_main_t * vm, next_index = node->cached_next_index; map_main_t *mm = &map_main; vlib_combined_counter_main_t *cm = mm->domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 *fragments_to_drop = NULL; u32 *fragments_to_loopback = NULL; @@ -694,8 +694,8 @@ ip4_map_reass (vlib_main_t * vm, { if (error0 == MAP_ERROR_NONE) vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, map_domain_index0, - 1, + thread_index, + map_domain_index0, 1, clib_net_to_host_u16 (ip60->payload_length) + 40); next0 = diff --git a/src/vnet/map/ip4_map_t.c b/src/vnet/map/ip4_map_t.c index b63d76bf..5f2bcbf9 100644 --- a/src/vnet/map/ip4_map_t.c +++ b/src/vnet/map/ip4_map_t.c @@ -477,7 +477,7 @@ ip4_map_t_icmp (vlib_main_t * vm, n_left_from = frame->n_vectors; next_index = node->cached_next_index; vlib_combined_counter_main_t *cm = map_main.domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -520,7 +520,7 @@ ip4_map_t_icmp (vlib_main_t * vm, if (PREDICT_TRUE (error0 == MAP_ERROR_NONE)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, vnet_buffer (p0)->map_t. map_domain_index, 1, len0); } @@ -1051,7 +1051,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) n_left_from = frame->n_vectors; next_index = node->cached_next_index; vlib_combined_counter_main_t *cm = map_main.domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -1158,7 +1158,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error0 == MAP_ERROR_NONE && next0 != IP4_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, vnet_buffer (p0)->map_t. map_domain_index, 1, clib_net_to_host_u16 (ip40-> @@ -1169,7 +1169,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error1 == MAP_ERROR_NONE && next1 != IP4_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, vnet_buffer (p1)->map_t. map_domain_index, 1, clib_net_to_host_u16 (ip41-> @@ -1252,7 +1252,7 @@ ip4_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error0 == MAP_ERROR_NONE && next0 != IP4_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_TX, - cpu_index, + thread_index, vnet_buffer (p0)->map_t. map_domain_index, 1, clib_net_to_host_u16 (ip40-> diff --git a/src/vnet/map/ip6_map.c b/src/vnet/map/ip6_map.c index f7eb768f..63ada962 100644 --- a/src/vnet/map/ip6_map.c +++ b/src/vnet/map/ip6_map.c @@ -172,7 +172,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vlib_node_get_runtime (vm, ip6_map_node.index); map_main_t *mm = &map_main; vlib_combined_counter_main_t *cm = mm->domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -319,7 +319,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) IP6_MAP_NEXT_IP4_REWRITE : next0; } vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, map_domain_index0, 1, clib_net_to_host_u16 (ip40->length)); @@ -352,7 +352,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) IP6_MAP_NEXT_IP4_REWRITE : next1; } vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, map_domain_index1, 1, clib_net_to_host_u16 (ip41->length)); @@ -505,7 +505,7 @@ ip6_map (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) IP6_MAP_NEXT_IP4_REWRITE : next0; } vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, map_domain_index0, 1, clib_net_to_host_u16 (ip40->length)); @@ -820,7 +820,7 @@ ip6_map_ip4_reass (vlib_main_t * vm, vlib_node_get_runtime (vm, ip6_map_ip4_reass_node.index); map_main_t *mm = &map_main; vlib_combined_counter_main_t *cm = mm->domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 *fragments_to_drop = NULL; u32 *fragments_to_loopback = NULL; @@ -958,8 +958,8 @@ ip6_map_ip4_reass (vlib_main_t * vm, { if (error0 == MAP_ERROR_NONE) vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, map_domain_index0, - 1, + thread_index, + map_domain_index0, 1, clib_net_to_host_u16 (ip40->length)); next0 = @@ -1015,7 +1015,7 @@ ip6_map_icmp_relay (vlib_main_t * vm, vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip6_map_icmp_relay_node.index); map_main_t *mm = &map_main; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u16 *fragment_ids, *fid; from = vlib_frame_vector_args (frame); @@ -1143,7 +1143,8 @@ ip6_map_icmp_relay (vlib_main_t * vm, ip_csum_t sum = ip_incremental_checksum (0, new_icmp40, nlen - 20); new_icmp40->checksum = ~ip_csum_fold (sum); - vlib_increment_simple_counter (&mm->icmp_relayed, cpu_index, 0, 1); + vlib_increment_simple_counter (&mm->icmp_relayed, thread_index, 0, + 1); error: if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED)) diff --git a/src/vnet/map/ip6_map_t.c b/src/vnet/map/ip6_map_t.c index eb3996c2..99151678 100644 --- a/src/vnet/map/ip6_map_t.c +++ b/src/vnet/map/ip6_map_t.c @@ -448,7 +448,7 @@ ip6_map_t_icmp (vlib_main_t * vm, n_left_from = frame->n_vectors; next_index = node->cached_next_index; vlib_combined_counter_main_t *cm = map_main.domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -493,7 +493,7 @@ ip6_map_t_icmp (vlib_main_t * vm, if (PREDICT_TRUE (error0 == MAP_ERROR_NONE)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, vnet_buffer (p0)-> map_t.map_domain_index, 1, len0); @@ -1051,7 +1051,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) vlib_node_runtime_t *error_node = vlib_node_get_runtime (vm, ip6_map_t_node.index); vlib_combined_counter_main_t *cm = map_main.domain_counters; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; @@ -1218,7 +1218,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error0 == MAP_ERROR_NONE && next0 != IP6_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, vnet_buffer (p0)-> map_t.map_domain_index, 1, clib_net_to_host_u16 @@ -1229,7 +1229,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error1 == MAP_ERROR_NONE && next1 != IP6_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, vnet_buffer (p1)-> map_t.map_domain_index, 1, clib_net_to_host_u16 @@ -1403,7 +1403,7 @@ ip6_map_t (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) (error0 == MAP_ERROR_NONE && next0 != IP6_MAPT_NEXT_MAPT_ICMP)) { vlib_increment_combined_counter (cm + MAP_DOMAIN_COUNTER_RX, - cpu_index, + thread_index, vnet_buffer (p0)-> map_t.map_domain_index, 1, clib_net_to_host_u16 diff --git a/src/vnet/mpls/mpls_input.c b/src/vnet/mpls/mpls_input.c index 893c4511..1b9bdd05 100644 --- a/src/vnet/mpls/mpls_input.c +++ b/src/vnet/mpls/mpls_input.c @@ -76,7 +76,7 @@ mpls_input_inline (vlib_main_t * vm, u32 n_left_from, next_index, * from, * to_next; mpls_input_runtime_t * rt; mpls_main_t * mm; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); vlib_simple_counter_main_t * cm; vnet_main_t * vnm = vnet_get_main(); @@ -151,7 +151,7 @@ mpls_input_inline (vlib_main_t * vm, next0 = MPLS_INPUT_NEXT_LOOKUP; vnet_feature_arc_start(mm->input_feature_arc_index, sw_if_index0, &next0, b0); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); } if (PREDICT_FALSE(h1[3] == 0)) @@ -164,7 +164,7 @@ mpls_input_inline (vlib_main_t * vm, next1 = MPLS_INPUT_NEXT_LOOKUP; vnet_feature_arc_start(mm->input_feature_arc_index, sw_if_index1, &next1, b1); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index1, 1); } if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) @@ -215,7 +215,7 @@ mpls_input_inline (vlib_main_t * vm, { next0 = MPLS_INPUT_NEXT_LOOKUP; vnet_feature_arc_start(mm->input_feature_arc_index, sw_if_index0, &next0, b0); - vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, thread_index, sw_if_index0, 1); } if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) diff --git a/src/vnet/mpls/mpls_lookup.c b/src/vnet/mpls/mpls_lookup.c index 475bb204..ace6a70f 100644 --- a/src/vnet/mpls/mpls_lookup.c +++ b/src/vnet/mpls/mpls_lookup.c @@ -67,7 +67,7 @@ mpls_lookup (vlib_main_t * vm, vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; u32 n_left_from, next_index, * from, * to_next; mpls_main_t * mm = &mpls_main; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -220,16 +220,16 @@ mpls_lookup (vlib_main_t * vm, vnet_buffer (b3)->ip.adj_index[VLIB_TX] = dpo3->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, b1)); vlib_increment_combined_counter - (cm, cpu_index, lbi2, 1, + (cm, thread_index, lbi2, 1, vlib_buffer_length_in_chain (vm, b2)); vlib_increment_combined_counter - (cm, cpu_index, lbi3, 1, + (cm, thread_index, lbi3, 1, vlib_buffer_length_in_chain (vm, b3)); /* @@ -351,7 +351,7 @@ mpls_lookup (vlib_main_t * vm, vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, b0)); /* @@ -440,7 +440,7 @@ mpls_load_balance (vlib_main_t * vm, { vlib_combined_counter_main_t * cm = &load_balance_main.lbm_via_counters; u32 n_left_from, n_left_to_next, * from, * to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 next; from = vlib_frame_vector_args (frame); @@ -536,10 +536,10 @@ mpls_load_balance (vlib_main_t * vm, vnet_buffer (p1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_increment_combined_counter - (cm, cpu_index, lbi1, 1, + (cm, thread_index, lbi1, 1, vlib_buffer_length_in_chain (vm, p1)); if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) @@ -597,7 +597,7 @@ mpls_load_balance (vlib_main_t * vm, vnet_buffer (p0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; vlib_increment_combined_counter - (cm, cpu_index, lbi0, 1, + (cm, thread_index, lbi0, 1, vlib_buffer_length_in_chain (vm, p0)); vlib_validate_buffer_enqueue_x1 (vm, node, next, diff --git a/src/vnet/mpls/mpls_output.c b/src/vnet/mpls/mpls_output.c index 08018fd1..d90dec21 100644 --- a/src/vnet/mpls/mpls_output.c +++ b/src/vnet/mpls/mpls_output.c @@ -64,12 +64,12 @@ mpls_output_inline (vlib_main_t * vm, vlib_frame_t * from_frame, int is_midchain) { - u32 n_left_from, next_index, * from, * to_next, cpu_index; + u32 n_left_from, next_index, * from, * to_next, thread_index; vlib_node_runtime_t * error_node; u32 n_left_to_next; mpls_main_t *mm; - cpu_index = os_get_cpu_number(); + thread_index = vlib_get_thread_index(); error_node = vlib_node_get_runtime (vm, mpls_output_node.index); from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -137,13 +137,13 @@ mpls_output_inline (vlib_main_t * vm, /* Bump the adj counters for packet and bytes */ vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index1, 1, vlib_buffer_length_in_chain (vm, p1) + rw_len1); @@ -245,7 +245,7 @@ mpls_output_inline (vlib_main_t * vm, vlib_increment_combined_counter (&adjacency_counters, - cpu_index, + thread_index, adj_index0, 1, vlib_buffer_length_in_chain (vm, p0) + rw_len0); diff --git a/src/vnet/pg/input.c b/src/vnet/pg/input.c index 2649798b..597ae060 100644 --- a/src/vnet/pg/input.c +++ b/src/vnet/pg/input.c @@ -893,7 +893,7 @@ pg_generate_set_lengths (pg_main_t * pg, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), + vlib_get_thread_index (), si->sw_if_index, n_buffers, length_sum); } @@ -1266,7 +1266,7 @@ pg_stream_fill_helper (pg_main_t * pg, l += vlib_buffer_index_length_in_chain (vm, buffers[i]); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number (), + vlib_get_thread_index (), si->sw_if_index, n_alloc, l); s->current_replay_packet_index += n_alloc; s->current_replay_packet_index %= diff --git a/src/vnet/replication.c b/src/vnet/replication.c index 86d922b5..233a8c2f 100644 --- a/src/vnet/replication.c +++ b/src/vnet/replication.c @@ -31,16 +31,16 @@ replication_prep (vlib_main_t * vm, { replication_main_t *rm = &replication_main; replication_context_t *ctx; - uword cpu_number = vm->cpu_index; + uword thread_index = vm->thread_index; ip4_header_t *ip; u32 ctx_id; /* Allocate a context, reserve context 0 */ - if (PREDICT_FALSE (rm->contexts[cpu_number] == 0)) - pool_get_aligned (rm->contexts[cpu_number], ctx, CLIB_CACHE_LINE_BYTES); + if (PREDICT_FALSE (rm->contexts[thread_index] == 0)) + pool_get_aligned (rm->contexts[thread_index], ctx, CLIB_CACHE_LINE_BYTES); - pool_get_aligned (rm->contexts[cpu_number], ctx, CLIB_CACHE_LINE_BYTES); - ctx_id = ctx - rm->contexts[cpu_number]; + pool_get_aligned (rm->contexts[thread_index], ctx, CLIB_CACHE_LINE_BYTES); + ctx_id = ctx - rm->contexts[thread_index]; /* Save state from vlib buffer */ ctx->saved_free_list_index = b0->free_list_index; @@ -94,11 +94,11 @@ replication_recycle (vlib_main_t * vm, vlib_buffer_t * b0, u32 is_last) { replication_main_t *rm = &replication_main; replication_context_t *ctx; - uword cpu_number = vm->cpu_index; + uword thread_index = vm->thread_index; ip4_header_t *ip; /* Get access to the replication context */ - ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->recycle_count); + ctx = pool_elt_at_index (rm->contexts[thread_index], b0->recycle_count); /* Restore vnet buffer state */ clib_memcpy (vnet_buffer (b0), ctx->vnet_buffer, @@ -133,7 +133,7 @@ replication_recycle (vlib_main_t * vm, vlib_buffer_t * b0, u32 is_last) b0->flags &= ~VLIB_BUFFER_RECYCLE; /* Free context back to its pool */ - pool_put (rm->contexts[cpu_number], ctx); + pool_put (rm->contexts[thread_index], ctx); } return ctx; @@ -160,7 +160,7 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl) replication_main_t *rm = &replication_main; replication_context_t *ctx; u32 feature_node_index = 0; - uword cpu_number = vm->cpu_index; + uword thread_index = vm->thread_index; /* * All buffers in the list are destined to the same recycle node. @@ -172,7 +172,7 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl) { bi0 = fl->buffers[0]; b0 = vlib_get_buffer (vm, bi0); - ctx = pool_elt_at_index (rm->contexts[cpu_number], b0->recycle_count); + ctx = pool_elt_at_index (rm->contexts[thread_index], b0->recycle_count); feature_node_index = ctx->recycle_node_index; } diff --git a/src/vnet/replication.h b/src/vnet/replication.h index 5dc554c9..ce4b3ff1 100644 --- a/src/vnet/replication.h +++ b/src/vnet/replication.h @@ -100,7 +100,7 @@ replication_get_ctx (vlib_buffer_t * b0) replication_main_t *rm = &replication_main; return replication_is_recycled (b0) ? - pool_elt_at_index (rm->contexts[os_get_cpu_number ()], + pool_elt_at_index (rm->contexts[vlib_get_thread_index ()], b0->recycle_count) : 0; } diff --git a/src/vnet/session/node.c b/src/vnet/session/node.c index b86e87d9..dd211c51 100644 --- a/src/vnet/session/node.c +++ b/src/vnet/session/node.c @@ -311,7 +311,7 @@ session_queue_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, unix_shared_memory_queue_t *q; application_t *app; int n_tx_packets = 0; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; int i, rv; f64 now = vlib_time_now (vm); diff --git a/src/vnet/sr/sr_localsid.c b/src/vnet/sr/sr_localsid.c index 2e3d56de..6d72a506 100755 --- a/src/vnet/sr/sr_localsid.c +++ b/src/vnet/sr/sr_localsid.c @@ -887,7 +887,7 @@ sr_localsid_d_fn (vlib_main_t * vm, vlib_node_runtime_t * node, from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -974,26 +974,26 @@ sr_localsid_d_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (((next0 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b0)); + &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter (((next1 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls1 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b1)); + &(sm->sr_ls_valid_counters)), thread_index, ls1 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b1)); vlib_increment_combined_counter (((next2 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls2 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b2)); + &(sm->sr_ls_valid_counters)), thread_index, ls2 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b2)); vlib_increment_combined_counter (((next3 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls3 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b3)); + &(sm->sr_ls_valid_counters)), thread_index, ls3 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b3)); vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next, n_left_to_next, bi0, bi1, bi2, bi3, @@ -1062,8 +1062,8 @@ sr_localsid_d_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (((next0 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b0)); + &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b0)); vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, bi0, next0); @@ -1103,7 +1103,7 @@ sr_localsid_fn (vlib_main_t * vm, vlib_node_runtime_t * node, from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; next_index = node->cached_next_index; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); while (n_left_from > 0) { @@ -1205,26 +1205,26 @@ sr_localsid_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (((next0 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b0)); + &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter (((next1 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls1 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b1)); + &(sm->sr_ls_valid_counters)), thread_index, ls1 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b1)); vlib_increment_combined_counter (((next2 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls2 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b2)); + &(sm->sr_ls_valid_counters)), thread_index, ls2 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b2)); vlib_increment_combined_counter (((next3 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls3 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b3)); + &(sm->sr_ls_valid_counters)), thread_index, ls3 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b3)); vlib_validate_buffer_enqueue_x4 (vm, node, next_index, to_next, n_left_to_next, bi0, bi1, bi2, bi3, @@ -1295,8 +1295,8 @@ sr_localsid_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_increment_combined_counter (((next0 == SR_LOCALSID_NEXT_ERROR) ? &(sm->sr_ls_invalid_counters) : - &(sm->sr_ls_valid_counters)), cpu_index, ls0 - sm->localsids, 1, - vlib_buffer_length_in_chain (vm, b0)); + &(sm->sr_ls_valid_counters)), thread_index, ls0 - sm->localsids, + 1, vlib_buffer_length_in_chain (vm, b0)); vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, bi0, next0); diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index e3705060..c1567aa0 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -174,7 +174,7 @@ tclient_thread_fn (void *arg) pthread_sigmask (SIG_SETMASK, &s, 0); } - clib_per_cpu_mheaps[os_get_cpu_number ()] = clib_per_cpu_mheaps[0]; + clib_per_cpu_mheaps[vlib_get_thread_index ()] = clib_per_cpu_mheaps[0]; while (1) { diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index b2a371e2..b6c34828 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -646,10 +646,10 @@ const static transport_proto_vft_t tcp6_proto = { void tcp_timer_keep_handler (u32 conn_index) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; - tc = tcp_connection_get (conn_index, cpu_index); + tc = tcp_connection_get (conn_index, thread_index); tc->timers[TCP_TIMER_KEEP] = TCP_TIMER_HANDLE_INVALID; tcp_connection_close (tc); @@ -675,10 +675,10 @@ tcp_timer_establish_handler (u32 conn_index) void tcp_timer_waitclose_handler (u32 conn_index) { - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; - tc = tcp_connection_get (conn_index, cpu_index); + tc = tcp_connection_get (conn_index, thread_index); tc->timers[TCP_TIMER_WAITCLOSE] = TCP_TIMER_HANDLE_INVALID; /* Session didn't come back with a close(). Send FIN either way diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index 0090e15e..eaca672c 100644 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -343,7 +343,7 @@ typedef enum _tcp_dbg_evt } \ else \ { \ - u32 _thread_index = os_get_cpu_number (); \ + u32 _thread_index = vlib_get_thread_index (); \ _tc = tcp_connection_get (_tc_index, _thread_index); \ } \ ELOG_TYPE_DECLARE (_e) = \ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index a8224dc2..7e9fa47b 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1142,7 +1142,7 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index, errors = 0; + u32 my_thread_index = vm->thread_index, errors = 0; tcp_main_t *tm = vnet_get_tcp_main (); from = vlib_frame_vector_args (from_frame); @@ -1332,7 +1332,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { tcp_main_t *tm = vnet_get_tcp_main (); u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index, errors = 0; + u32 my_thread_index = vm->thread_index, errors = 0; u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; from = vlib_frame_vector_args (from_frame); @@ -1634,7 +1634,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { tcp_main_t *tm = vnet_get_tcp_main (); u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index, errors = 0; + u32 my_thread_index = vm->thread_index, errors = 0; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1989,7 +1989,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; tcp_main_t *tm = vnet_get_tcp_main (); u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; @@ -2243,7 +2243,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; tcp_main_t *tm = vnet_get_tcp_main (); from = vlib_frame_vector_args (from_frame); diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index ea157bd7..e18bfad7 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -387,8 +387,8 @@ tcp_make_options (tcp_connection_t * tc, tcp_options_t * opts, #define tcp_get_free_buffer_index(tm, bidx) \ do { \ u32 *my_tx_buffers, n_free_buffers; \ - u32 cpu_index = os_get_cpu_number(); \ - my_tx_buffers = tm->tx_buffers[cpu_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ { \ n_free_buffers = 32; /* TODO config or macro */ \ @@ -396,7 +396,7 @@ do { \ _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \ tm->vlib_main, my_tx_buffers, n_free_buffers, \ VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \ - tm->tx_buffers[cpu_index] = my_tx_buffers; \ + tm->tx_buffers[thread_index] = my_tx_buffers; \ } \ /* buffer shortage */ \ if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \ @@ -408,8 +408,8 @@ do { \ #define tcp_return_buffer(tm) \ do { \ u32 *my_tx_buffers; \ - u32 cpu_index = os_get_cpu_number(); \ - my_tx_buffers = tm->tx_buffers[cpu_index]; \ + u32 thread_index = vlib_get_thread_index(); \ + my_tx_buffers = tm->tx_buffers[thread_index]; \ _vec_len (my_tx_buffers) +=1; \ } while (0) @@ -942,7 +942,7 @@ tcp_send_ack (tcp_connection_t * tc) void tcp_timer_delack_handler (u32 index) { - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; tc = tcp_connection_get (index, thread_index); @@ -1022,7 +1022,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; u32 bi, snd_space, n_bytes; @@ -1152,7 +1152,7 @@ tcp_timer_persist_handler (u32 index) { tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; u32 bi, n_bytes; @@ -1313,7 +1313,7 @@ tcp46_output_inline (vlib_main_t * vm, vlib_frame_t * from_frame, int is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; @@ -1524,7 +1524,7 @@ tcp46_send_reset_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, u8 is_ip4) { u32 n_left_from, next_index, *from, *to_next; - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; from = vlib_frame_vector_args (from_frame); n_left_from = from_frame->n_vectors; diff --git a/src/vnet/udp/udp_input.c b/src/vnet/udp/udp_input.c index 4b22109b..810278e6 100644 --- a/src/vnet/udp/udp_input.c +++ b/src/vnet/udp/udp_input.c @@ -70,7 +70,7 @@ udp4_uri_input_node_fn (vlib_main_t * vm, udp4_uri_input_next_t next_index; udp_uri_main_t *um = vnet_get_udp_main (); session_manager_main_t *smm = vnet_get_session_manager_main (); - u32 my_thread_index = vm->cpu_index; + u32 my_thread_index = vm->thread_index; u8 my_enqueue_epoch; u32 *session_indices_to_enqueue; static u32 serial_number; diff --git a/src/vnet/unix/tapcli.c b/src/vnet/unix/tapcli.c index fb1a8bac..0fc62f6c 100644 --- a/src/vnet/unix/tapcli.c +++ b/src/vnet/unix/tapcli.c @@ -366,7 +366,7 @@ static uword tapcli_rx_iface(vlib_main_t * vm, vlib_increment_combined_counter ( vnet_main.interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number(), ti->sw_if_index, + vlib_get_thread_index(), ti->sw_if_index, 1, n_bytes_in_packet); if (PREDICT_FALSE(n_trace > 0)) { diff --git a/src/vnet/unix/tuntap.c b/src/vnet/unix/tuntap.c index 2cfcc92f..ac674653 100644 --- a/src/vnet/unix/tuntap.c +++ b/src/vnet/unix/tuntap.c @@ -189,7 +189,7 @@ tuntap_tx (vlib_main_t * vm, /* Update tuntap interface output stats. */ vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - vm->cpu_index, + vm->thread_index, tm->sw_if_index, n_packets, n_bytes); @@ -297,7 +297,7 @@ tuntap_rx (vlib_main_t * vm, vlib_increment_combined_counter (vnet_main.interface_main.combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - os_get_cpu_number(), + vlib_get_thread_index(), tm->sw_if_index, 1, n_bytes_in_packet); diff --git a/src/vnet/vxlan-gpe/decap.c b/src/vnet/vxlan-gpe/decap.c index 22ab4b62..d4fe4231 100644 --- a/src/vnet/vxlan-gpe/decap.c +++ b/src/vnet/vxlan-gpe/decap.c @@ -115,7 +115,7 @@ vxlan_gpe_input (vlib_main_t * vm, vxlan4_gpe_tunnel_key_t last_key4; vxlan6_gpe_tunnel_key_t last_key6; u32 pkts_decapsulated = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; if (is_ip4) @@ -342,7 +342,7 @@ vxlan_gpe_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; stats_sw_if_index = sw_if_index0; @@ -427,7 +427,7 @@ vxlan_gpe_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len1; stats_sw_if_index = sw_if_index1; @@ -588,7 +588,7 @@ vxlan_gpe_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; stats_sw_if_index = sw_if_index0; @@ -615,7 +615,7 @@ vxlan_gpe_input (vlib_main_t * vm, if (stats_n_packets) { vlib_increment_combined_counter ( - im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, cpu_index, + im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vnet/vxlan-gpe/encap.c b/src/vnet/vxlan-gpe/encap.c index 3a486e56..67ed94b4 100644 --- a/src/vnet/vxlan-gpe/encap.c +++ b/src/vnet/vxlan-gpe/encap.c @@ -151,7 +151,7 @@ vxlan_gpe_encap (vlib_main_t * vm, vnet_main_t * vnm = ngm->vnet_main; vnet_interface_main_t * im = &vnm->interface_main; u32 pkts_encapsulated = 0; - u32 cpu_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; from = vlib_frame_vector_args (from_frame); @@ -253,7 +253,7 @@ vxlan_gpe_encap (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_sw_if_index = sw_if_index0; stats_n_packets = 2; stats_n_bytes = len0 + len1; @@ -262,10 +262,10 @@ vxlan_gpe_encap (vlib_main_t * vm, { vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, sw_if_index0, 1, len0); + thread_index, sw_if_index0, 1, len0); vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, sw_if_index1, 1, len1); + thread_index, sw_if_index1, 1, len1); } } @@ -335,7 +335,7 @@ vxlan_gpe_encap (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter ( im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; stats_sw_if_index = sw_if_index0; @@ -359,7 +359,7 @@ vxlan_gpe_encap (vlib_main_t * vm, if (stats_n_packets) { vlib_increment_combined_counter ( - im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, cpu_index, + im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vnet/vxlan/decap.c b/src/vnet/vxlan/decap.c index 514b2c99..2acb1f6f 100644 --- a/src/vnet/vxlan/decap.c +++ b/src/vnet/vxlan/decap.c @@ -81,7 +81,7 @@ vxlan_input (vlib_main_t * vm, vxlan4_tunnel_key_t last_key4; vxlan6_tunnel_key_t last_key6; u32 pkts_decapsulated = 0; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; if (is_ip4) @@ -314,7 +314,7 @@ vxlan_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; @@ -468,7 +468,7 @@ vxlan_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len1; @@ -674,7 +674,7 @@ vxlan_input (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; @@ -711,7 +711,7 @@ vxlan_input (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vnet/vxlan/encap.c b/src/vnet/vxlan/encap.c index 5b63064a..4cfbbc23 100644 --- a/src/vnet/vxlan/encap.c +++ b/src/vnet/vxlan/encap.c @@ -77,7 +77,7 @@ vxlan_encap_inline (vlib_main_t * vm, vnet_interface_main_t * im = &vnm->interface_main; u32 pkts_encapsulated = 0; u16 old_l0 = 0, old_l1 = 0; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index(); u32 stats_sw_if_index, stats_n_packets, stats_n_bytes; u32 sw_if_index0 = 0, sw_if_index1 = 0; u32 next0 = 0, next1 = 0; @@ -301,7 +301,7 @@ vxlan_encap_inline (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_sw_if_index = sw_if_index0; stats_n_packets = 2; @@ -311,10 +311,10 @@ vxlan_encap_inline (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, sw_if_index0, 1, len0); + thread_index, sw_if_index0, 1, len0); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, sw_if_index1, 1, len1); + thread_index, sw_if_index1, 1, len1); } } @@ -464,7 +464,7 @@ vxlan_encap_inline (vlib_main_t * vm, if (stats_n_packets) vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); stats_n_packets = 1; stats_n_bytes = len0; @@ -496,7 +496,7 @@ vxlan_encap_inline (vlib_main_t * vm, { vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX, - cpu_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); + thread_index, stats_sw_if_index, stats_n_packets, stats_n_bytes); node->runtime_data[0] = stats_sw_if_index; } diff --git a/src/vpp/stats/stats.c b/src/vpp/stats/stats.c index 042d02e2..4309cd51 100644 --- a/src/vpp/stats/stats.c +++ b/src/vpp/stats/stats.c @@ -66,14 +66,14 @@ _(VNET_IP6_NBR_COUNTERS, vnet_ip6_nbr_counters) void dslock (stats_main_t * sm, int release_hint, int tag) { - u32 thread_id; + u32 thread_index; data_structure_lock_t *l = sm->data_structure_lock; if (PREDICT_FALSE (l == 0)) return; - thread_id = os_get_cpu_number (); - if (l->lock && l->thread_id == thread_id) + thread_index = vlib_get_thread_index (); + if (l->lock && l->thread_index == thread_index) { l->count++; return; @@ -85,7 +85,7 @@ dslock (stats_main_t * sm, int release_hint, int tag) while (__sync_lock_test_and_set (&l->lock, 1)) /* zzzz */ ; l->tag = tag; - l->thread_id = thread_id; + l->thread_index = thread_index; l->count = 1; } @@ -99,14 +99,14 @@ stats_dslock_with_hint (int hint, int tag) void dsunlock (stats_main_t * sm) { - u32 thread_id; + u32 thread_index; data_structure_lock_t *l = sm->data_structure_lock; if (PREDICT_FALSE (l == 0)) return; - thread_id = os_get_cpu_number (); - ASSERT (l->lock && l->thread_id == thread_id); + thread_index = vlib_get_thread_index (); + ASSERT (l->lock && l->thread_index == thread_index); l->count--; if (l->count == 0) { diff --git a/src/vpp/stats/stats.h b/src/vpp/stats/stats.h index 118115be..024dc78e 100644 --- a/src/vpp/stats/stats.h +++ b/src/vpp/stats/stats.h @@ -30,7 +30,7 @@ typedef struct { volatile u32 lock; volatile u32 release_hint; - u32 thread_id; + u32 thread_index; u32 count; int tag; } data_structure_lock_t; -- cgit 1.2.3-korg From 153646e89c3be70c68348bdd497f8edd2b212a9c Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 5 Apr 2017 18:15:45 +0200 Subject: Common device-input interrupt infra Change-Id: I23b588eb56a3f5690158449a1f9bc8053cd3d251 Signed-off-by: Damjan Marion --- src/vlib/node_funcs.h | 14 ++++ src/vnet/devices/af_packet/af_packet.c | 7 +- src/vnet/devices/af_packet/node.c | 2 +- src/vnet/devices/devices.c | 135 +++++++++++++++++++++++++++++---- src/vnet/devices/devices.h | 60 ++++++++++++--- src/vnet/interface.c | 2 + src/vnet/interface.h | 3 + 7 files changed, 194 insertions(+), 29 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index 54e36874..4d7cc192 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -177,6 +177,20 @@ vlib_node_set_state (vlib_main_t * vm, u32 node_index, r->state = new_state; } +/** \brief Get node dispatch state. + @param vm vlib_main_t pointer, varies by thread + @param node_index index of the node + @return state for node, see vlib_node_state_t +*/ +always_inline vlib_node_state_t +vlib_node_get_state (vlib_main_t * vm, u32 node_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *n; + n = vec_elt (nm->nodes, node_index); + return n->state; +} + always_inline void vlib_node_set_interrupt_pending (vlib_main_t * vm, u32 node_index) { diff --git a/src/vnet/devices/af_packet/af_packet.c b/src/vnet/devices/af_packet/af_packet.c index 20285107..7464d4e6 100644 --- a/src/vnet/devices/af_packet/af_packet.c +++ b/src/vnet/devices/af_packet/af_packet.c @@ -270,9 +270,12 @@ af_packet_create_if (vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set, sw = vnet_get_hw_sw_interface (vnm, apif->hw_if_index); apif->sw_if_index = sw->sw_if_index; - vnet_set_device_input_node (apif->hw_if_index, af_packet_input_node.index); - vnet_device_input_assign_thread (apif->hw_if_index, 0, /* queue */ + vnet_set_device_input_node (vnm, apif->hw_if_index, + af_packet_input_node.index); + vnet_device_input_assign_thread (vnm, apif->hw_if_index, 0, /* queue */ ~0 /* any cpu */ ); + vnet_device_input_set_mode (vnm, apif->hw_if_index, 0, + VNET_DEVICE_INPUT_MODE_INTERRUPT); vnet_hw_interface_set_flags (vnm, apif->hw_if_index, VNET_HW_INTERFACE_FLAG_LINK_UP); diff --git a/src/vnet/devices/af_packet/node.c b/src/vnet/devices/af_packet/node.c index 76980102..d3af41b5 100644 --- a/src/vnet/devices/af_packet/node.c +++ b/src/vnet/devices/af_packet/node.c @@ -251,7 +251,7 @@ af_packet_input_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vnet_device_input_runtime_t *rt = (void *) node->runtime_data; vnet_device_and_queue_t *dq; - vec_foreach (dq, rt->devices_and_queues) + foreach_device_and_queue (dq, rt->devices_and_queues) { af_packet_if_t *apif; apif = vec_elt_at_index (apm->interfaces, dq->dev_instance); diff --git a/src/vnet/devices/devices.c b/src/vnet/devices/devices.c index 5e5e812c..c8a95087 100644 --- a/src/vnet/devices/devices.c +++ b/src/vnet/devices/devices.c @@ -102,11 +102,26 @@ vnet_device_queue_sort (void *a1, void *a2) return 0; } +static void +vnet_device_queue_update (vnet_main_t * vnm, vnet_device_input_runtime_t * rt) +{ + vnet_device_and_queue_t *dq; + vnet_hw_interface_t *hw; + + vec_sort_with_function (rt->devices_and_queues, vnet_device_queue_sort); + + vec_foreach (dq, rt->devices_and_queues) + { + hw = vnet_get_hw_interface (vnm, dq->hw_if_index); + vec_validate (hw->dq_runtime_index_by_queue, dq->queue_id); + hw->dq_runtime_index_by_queue[dq->queue_id] = dq - rt->devices_and_queues; + } +} + void -vnet_device_input_assign_thread (u32 hw_if_index, +vnet_device_input_assign_thread (vnet_main_t * vnm, u32 hw_if_index, u16 queue_id, uword thread_index) { - vnet_main_t *vnm = vnet_get_main (); vnet_device_main_t *vdm = &vnet_device_main; vlib_main_t *vm; vnet_device_input_runtime_t *rt; @@ -135,16 +150,17 @@ vnet_device_input_assign_thread (u32 hw_if_index, dq->dev_instance = hw->dev_instance; dq->queue_id = queue_id; - vec_sort_with_function (rt->devices_and_queues, vnet_device_queue_sort); + vnet_device_queue_update (vnm, rt); vec_validate (hw->input_node_thread_index_by_queue, queue_id); hw->input_node_thread_index_by_queue[queue_id] = thread_index; + vlib_node_set_state (vm, hw->input_node_index, rt->enabled_node_state); } -static int -vnet_device_input_unassign_thread (u32 hw_if_index, u16 queue_id, - uword thread_index) +int +vnet_device_input_unassign_thread (vnet_main_t * vnm, u32 hw_if_index, + u16 queue_id, uword thread_index) { - vnet_main_t *vnm = vnet_get_main (); + vlib_main_t *vm; vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); vnet_device_input_runtime_t *rt; vnet_device_and_queue_t *dq; @@ -161,9 +177,9 @@ vnet_device_input_unassign_thread (u32 hw_if_index, u16 queue_id, if (old_thread_index == thread_index) return 0; - rt = - vlib_node_get_runtime_data (vlib_mains[old_thread_index], - hw->input_node_index); + vm = vlib_mains[old_thread_index]; + + rt = vlib_node_get_runtime_data (vm, hw->input_node_index); vec_foreach (dq, rt->devices_and_queues) if (dq->hw_if_index == hw_if_index && dq->queue_id == queue_id) @@ -175,11 +191,89 @@ vnet_device_input_unassign_thread (u32 hw_if_index, u16 queue_id, return VNET_API_ERROR_INVALID_INTERFACE; deleted: - vec_sort_with_function (rt->devices_and_queues, vnet_device_queue_sort); + + vnet_device_queue_update (vnm, rt); + + if (vec_len (rt->devices_and_queues) == 0) + vlib_node_set_state (vm, hw->input_node_index, VLIB_NODE_STATE_DISABLED); + + return 0; +} + + +int +vnet_device_input_set_mode (vnet_main_t * vnm, u32 hw_if_index, u16 queue_id, + vnet_device_input_mode_t mode) +{ + vlib_main_t *vm; + uword thread_index; + vnet_device_and_queue_t *dq; + vlib_node_state_t enabled_node_state; + ASSERT (mode < VNET_DEVICE_INPUT_N_MODES); + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + vnet_device_input_runtime_t *rt; + int is_polling = 0; + + if (hw->input_node_thread_index_by_queue == 0) + return VNET_API_ERROR_INVALID_INTERFACE; + + thread_index = hw->input_node_thread_index_by_queue[queue_id]; + vm = vlib_mains[thread_index]; + + rt = vlib_node_get_runtime_data (vm, hw->input_node_index); + + vec_foreach (dq, rt->devices_and_queues) + { + if (dq->hw_if_index == hw_if_index && dq->queue_id == queue_id) + dq->mode = mode; + if (dq->mode == VNET_DEVICE_INPUT_MODE_POLLING) + is_polling = 1; + } + + if (is_polling) + enabled_node_state = VLIB_NODE_STATE_POLLING; + else + enabled_node_state = VLIB_NODE_STATE_INTERRUPT; + + if (rt->enabled_node_state != enabled_node_state) + { + rt->enabled_node_state = enabled_node_state; + if (vlib_node_get_state (vm, hw->input_node_index) != + VLIB_NODE_STATE_DISABLED) + vlib_node_set_state (vm, hw->input_node_index, enabled_node_state); + } return 0; } +int +vnet_device_input_get_mode (vnet_main_t * vnm, u32 hw_if_index, u16 queue_id, + vnet_device_input_mode_t * mode) +{ + vlib_main_t *vm; + uword thread_index; + vnet_device_and_queue_t *dq; + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + vnet_device_input_runtime_t *rt; + + if (hw->input_node_thread_index_by_queue == 0) + return VNET_API_ERROR_INVALID_INTERFACE; + + thread_index = hw->input_node_thread_index_by_queue[queue_id]; + vm = vlib_mains[thread_index]; + + rt = vlib_node_get_runtime_data (vm, hw->input_node_index); + + vec_foreach (dq, rt->devices_and_queues) + if (dq->hw_if_index == hw_if_index && dq->queue_id == queue_id) + { + *mode = dq->mode; + return 0; + } + + return VNET_API_ERROR_INVALID_INTERFACE; +} + static clib_error_t * show_device_placement_fn (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) @@ -203,9 +297,11 @@ show_device_placement_fn (vlib_main_t * vm, unformat_input_t * input, vec_foreach (dq, rt->devices_and_queues) { - s = format (s, " %U queue %u\n", + s = format (s, " %U queue %u (%s)\n", format_vnet_sw_if_index_name, vnm, dq->hw_if_index, - dq->queue_id); + dq->queue_id, + dq->mode == VNET_DEVICE_INPUT_MODE_POLLING ? + "polling" : "interrupt"); } })); if (vec_len (s) > 0) @@ -238,6 +334,7 @@ set_device_placement (vlib_main_t * vm, unformat_input_t * input, unformat_input_t _line_input, *line_input = &_line_input; vnet_main_t *vnm = vnet_get_main (); vnet_device_main_t *vdm = &vnet_device_main; + vnet_device_input_mode_t mode; u32 hw_if_index = (u32) ~ 0; u32 queue_id = (u32) 0; u32 thread_index = (u32) ~ 0; @@ -275,13 +372,19 @@ set_device_placement (vlib_main_t * vm, unformat_input_t * input, return clib_error_return (0, "please specify valid worker thread or main"); - rv = - vnet_device_input_unassign_thread (hw_if_index, queue_id, thread_index); + rv = vnet_device_input_get_mode (vnm, hw_if_index, queue_id, &mode); + + if (rv) + return clib_error_return (0, "not found"); + + rv = vnet_device_input_unassign_thread (vnm, hw_if_index, queue_id, + thread_index); if (rv) return clib_error_return (0, "not found"); - vnet_device_input_assign_thread (hw_if_index, queue_id, thread_index); + vnet_device_input_assign_thread (vnm, hw_if_index, queue_id, thread_index); + vnet_device_input_set_mode (vnm, hw_if_index, queue_id, mode); return 0; } diff --git a/src/vnet/devices/devices.h b/src/vnet/devices/devices.h index 966f8302..baf03b7c 100644 --- a/src/vnet/devices/devices.h +++ b/src/vnet/devices/devices.h @@ -55,16 +55,26 @@ typedef struct uword next_worker_thread_index; } vnet_device_main_t; +typedef enum +{ + VNET_DEVICE_INPUT_MODE_POLLING = 0, + VNET_DEVICE_INPUT_MODE_INTERRUPT, + VNET_DEVICE_INPUT_N_MODES, +} vnet_device_input_mode_t; + typedef struct { u32 hw_if_index; u32 dev_instance; u16 queue_id; + vnet_device_input_mode_t mode; + uword interrupt_pending; } vnet_device_and_queue_t; typedef struct { vnet_device_and_queue_t *devices_and_queues; + vlib_node_state_t enabled_node_state; } vnet_device_input_runtime_t; extern vnet_device_main_t vnet_device_main; @@ -72,15 +82,22 @@ extern vlib_node_registration_t device_input_node; extern const u32 device_input_next_node_advance[]; static inline void -vnet_set_device_input_node (u32 hw_if_index, u32 node_index) +vnet_set_device_input_node (vnet_main_t * vnm, u32 hw_if_index, + u32 node_index) { - vnet_main_t *vnm = vnet_get_main (); vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); hw->input_node_index = node_index; } -void vnet_device_input_assign_thread (u32 hw_if_index, u16 queue_id, - uword thread_index); +void vnet_device_input_assign_thread (vnet_main_t * vnm, u32 hw_if_index, + u16 queue_id, uword thread_index); +int vnet_device_input_unassign_thread (vnet_main_t * vnm, u32 hw_if_index, + u16 queue_id, uword thread_index); +int vnet_device_input_set_mode (vnet_main_t * vnm, u32 hw_if_index, + u16 queue_id, vnet_device_input_mode_t mode); +int vnet_device_input_get_mode (vnet_main_t * vnm, u32 hw_if_index, + u16 queue_id, + vnet_device_input_mode_t * mode); static inline u64 vnet_get_aggregate_rx_packets (void) @@ -111,18 +128,41 @@ vnet_get_device_and_queue (vlib_main_t * vm, vlib_node_runtime_t * node) return rt->devices_and_queues; } +static_always_inline uword +vnet_get_device_input_thread_index (vnet_main_t * vnm, u32 hw_if_index, + u16 queue_id) +{ + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); + ASSERT (queue_id < vec_len (hw->input_node_thread_index_by_queue)); + return hw->input_node_thread_index_by_queue[queue_id]; +} + static_always_inline void vnet_device_input_set_interrupt_pending (vnet_main_t * vnm, u32 hw_if_index, u16 queue_id) { - vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index); - - ASSERT (queue_id < vec_len (hw->input_node_thread_index_by_queue)); - u32 thread_index = hw->input_node_thread_index_by_queue[queue_id]; - vlib_node_set_interrupt_pending (vlib_mains[thread_index], - hw->input_node_index); + vlib_main_t *vm; + vnet_hw_interface_t *hw; + vnet_device_input_runtime_t *rt; + vnet_device_and_queue_t *dq; + uword idx; + + hw = vnet_get_hw_interface (vnm, hw_if_index); + idx = vnet_get_device_input_thread_index (vnm, hw_if_index, queue_id); + vm = vlib_mains[idx]; + rt = vlib_node_get_runtime_data (vm, hw->input_node_index); + idx = hw->dq_runtime_index_by_queue[queue_id]; + dq = vec_elt_at_index (rt->devices_and_queues, idx); + dq->interrupt_pending = 1; + + vlib_node_set_interrupt_pending (vm, hw->input_node_index); } +#define foreach_device_and_queue(var,vec) \ + for (var = (vec); var < vec_end (vec); var++) \ + if (clib_smp_swap (&((var)->interrupt_pending), 0) || \ + var->mode == VNET_DEVICE_INPUT_MODE_POLLING) + #endif /* included_vnet_vnet_device_h */ /* diff --git a/src/vnet/interface.c b/src/vnet/interface.c index 45417b2f..24f216f6 100644 --- a/src/vnet/interface.c +++ b/src/vnet/interface.c @@ -919,6 +919,8 @@ vnet_delete_hw_interface (vnet_main_t * vnm, u32 hw_if_index) hash_unset_mem (im->hw_interface_by_name, hw->name); vec_free (hw->name); + vec_free (hw->input_node_thread_index_by_queue); + vec_free (hw->dq_runtime_index_by_queue); pool_put (im->hw_interfaces, hw); } diff --git a/src/vnet/interface.h b/src/vnet/interface.h index 08f08b10..9c223040 100644 --- a/src/vnet/interface.h +++ b/src/vnet/interface.h @@ -470,6 +470,9 @@ typedef struct vnet_hw_interface_t /* input node cpu index by queue */ u32 *input_node_thread_index_by_queue; + /* device input device_and_queue runtime index */ + uword *dq_runtime_index_by_queue; + } vnet_hw_interface_t; extern vnet_device_class_t vnet_local_interface_device_class; -- cgit 1.2.3-korg From ecba903bcfc5046b0d8aee18433ab9fc09c6a68f Mon Sep 17 00:00:00 2001 From: Pierre Pfister Date: Tue, 4 Apr 2017 15:55:21 +0100 Subject: Fix typo in minimal epoll polling time epoll was supposed to not sleep when timeout is less than 1ms, but a typo made it not sleep any time the requested timeout is lower than 1000 seconds (in practice, never...). This patch replaces "1e3" with "1e-3", which represents 1ms. Change-Id: I731851b27a6bf6ab8e41586e017e94b962b09bf3 Signed-off-by: Pierre Pfister (cherry picked from commit ec06222ae189fe8d84b63410130fff04bf446573) --- src/vlib/unix/input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/vlib') diff --git a/src/vlib/unix/input.c b/src/vlib/unix/input.c index 7b4183a4..73783d13 100644 --- a/src/vlib/unix/input.c +++ b/src/vlib/unix/input.c @@ -130,7 +130,7 @@ linux_epoll_input (vlib_main_t * vm, * vm->clib_time.seconds_per_clock) /* subtract off some slop time */ - 50e-6; - if (timeout < 1e3) + if (timeout < 1e-3) { /* We have event happenning in less than 1 ms so don't allow epoll to wait */ -- cgit 1.2.3-korg From 9c2b9f413cafe25840ad418dea571e0e1311ce60 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Fri, 21 Apr 2017 13:56:40 +0200 Subject: vlib: add cli command to dump all CLI commands To be used for bash completion Change-Id: I8f4702f24c2b7e223945e00a1b3560dec6ef39fd Signed-off-by: Damjan Marion --- src/vlib/cli.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'src/vlib') diff --git a/src/vlib/cli.c b/src/vlib/cli.c index 3cc95076..5a0f7ec3 100644 --- a/src/vlib/cli.c +++ b/src/vlib/cli.c @@ -1163,6 +1163,55 @@ done: } #endif +static int +cli_path_compare (void *a1, void *a2) +{ + u8 **s1 = a1; + u8 **s2 = a2; + + if ((vec_len (*s1) < vec_len (*s2)) && + memcmp ((char *) *s1, (char *) *s2, vec_len (*s1)) == 0) + return -1; + + + if ((vec_len (*s1) > vec_len (*s2)) && + memcmp ((char *) *s1, (char *) *s2, vec_len (*s2)) == 0) + return 1; + + return vec_cmp (*s1, *s2); +} + +static clib_error_t * +show_cli_cmd_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vlib_cli_main_t *cm = &vm->cli_main; + vlib_cli_command_t *cli; + u8 **paths = 0, **s; + + /* *INDENT-OFF* */ + vec_foreach (cli, cm->commands) + if (vec_len (cli->path) > 0) + vec_add1 (paths, (u8 *) cli->path); + + vec_sort_with_function (paths, cli_path_compare); + + vec_foreach (s, paths) + vlib_cli_output (vm, "%v", *s); + /* *INDENT-ON* */ + + vec_free (paths); + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_cli_command, static) = { + .path = "show cli", + .short_help = "Show cli commands", + .function = show_cli_cmd_fn, +}; +/* *INDENT-ON* */ + static clib_error_t * vlib_cli_init (vlib_main_t * vm) { -- cgit 1.2.3-korg From 36c1308b35baac817c2c0b21335b1eed02689f0a Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 24 Apr 2017 20:58:29 +0200 Subject: Fix structure alignment with 32-bit pointers Change-Id: I740de6c0f12dab452b4349e3bf89ff976a6268c0 Signed-off-by: Damjan Marion --- src/vlib/node.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'src/vlib') diff --git a/src/vlib/node.h b/src/vlib/node.h index 1e2f4c38..906d795f 100644 --- a/src/vlib/node.h +++ b/src/vlib/node.h @@ -418,6 +418,10 @@ typedef struct vlib_node_runtime_t vlib_error_t *errors; /**< Vector of errors for this node. */ +#if __SIZEOF_POINTER__ == 4 + u8 pad[8]; +#endif + u32 clocks_since_last_overflow; /**< Number of clock cycles. */ u32 max_clock; /**< Maximum clock cycle for an -- cgit 1.2.3-korg From c9fc77c510e042cf9379b427b63dd3f17a23a584 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Tue, 25 Apr 2017 21:07:52 +0200 Subject: Exit if plugin load attempt fail This happens mainly if plugin contains unresolved symbol. Such situation typically leads on crash a bit later so it is better to exit immediatelly and display meaningful error message. Change-Id: I4abd9a9089a4863400bf609e8d3fd7cebab92913 Signed-off-by: Damjan Marion --- src/vlib/unix/plugin.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/unix/plugin.c b/src/vlib/unix/plugin.c index 9b341cc8..c2741aaa 100644 --- a/src/vlib/unix/plugin.c +++ b/src/vlib/unix/plugin.c @@ -137,15 +137,11 @@ load_one_plugin (plugin_main_t * pm, plugin_info_t * pi, int from_early_init) handle = dlopen ((char *) pi->filename, RTLD_LAZY); - /* - * Note: this can happen if the plugin has an undefined symbol reference, - * so print a warning. Otherwise, the poor slob won't know what happened. - * Ask me how I know that... - */ if (handle == 0) { clib_warning ("%s", dlerror ()); - return -1; + clib_warning ("Failed to load plugin '%s'", pi->name); + os_exit (1); } pi->handle = handle; -- cgit 1.2.3-korg From 816f437d943688f67d61fb6b9708eff59432b2ee Mon Sep 17 00:00:00 2001 From: Filip Tehlar Date: Wed, 26 Apr 2017 16:09:06 +0200 Subject: Fix vnet unit tests Change-Id: Ibe55e4399c6b78d83268d7c49ed498cab7bfdb43 Signed-off-by: Filip Tehlar --- build-data/packages/vpp.mk | 4 +++ src/tests/vnet/lisp-cp/test_cp_serdes.c | 51 ++++++++++++++++++++++++++++++-- src/tests/vnet/lisp-cp/test_lisp_types.c | 45 +++++++++++++++++++++------- src/tests/vnet/lisp-gpe/test.c | 18 ----------- src/vlib/buffer.h | 12 ++++++++ src/vnet.am | 13 -------- src/vnet/lisp-cp/control.c | 14 ++++----- src/vnet/lisp-cp/control.h | 9 ++++++ src/vnet/lisp-cp/lisp_msg_serdes.c | 8 +++++ src/vnet/lisp-cp/lisp_types.c | 31 ++++++++++--------- src/vpp.am | 1 - src/vpp/api/test_client.c | 11 ------- src/vppinfra/test_tw_timer.c | 8 ++--- 13 files changed, 142 insertions(+), 83 deletions(-) delete mode 100644 src/tests/vnet/lisp-gpe/test.c (limited to 'src/vlib') diff --git a/build-data/packages/vpp.mk b/build-data/packages/vpp.mk index 64eb0d89..1acc59b2 100644 --- a/build-data/packages/vpp.mk +++ b/build-data/packages/vpp.mk @@ -30,3 +30,7 @@ ifeq ($($(PLATFORM)_uses_dpdk_mlx5_pmd),yes) vpp_configure_args += --with-dpdk-mlx5-pmd endif endif + +ifeq ($($(PLATFORM)_enable_tests),yes) +vpp_configure_args += --enable-tests +endif diff --git a/src/tests/vnet/lisp-cp/test_cp_serdes.c b/src/tests/vnet/lisp-cp/test_cp_serdes.c index 0766bee1..8e8c8455 100644 --- a/src/tests/vnet/lisp-cp/test_cp_serdes.c +++ b/src/tests/vnet/lisp-cp/test_cp_serdes.c @@ -21,9 +21,6 @@ #include #include -/* FIXME */ -#include - #define _assert(e) \ error = CLIB_ERROR_ASSERT (e); \ if (error) \ @@ -489,6 +486,53 @@ done: return error; } +static vlib_buffer_t * +create_buffer (u8 * data, u32 data_len) +{ + vlib_buffer_t *b; + + u8 *buf_data = clib_mem_alloc(500); + memset (buf_data, 0, 500); + b = (vlib_buffer_t *)buf_data; + + u8 * p = vlib_buffer_put_uninit (b, data_len); + clib_memcpy (p, data, data_len); + + return b; +} + +static clib_error_t * +test_lisp_parse_map_reply () +{ + clib_error_t * error = 0; + u8 map_reply_data[] = + { + 0x00, 0x00, 0x00, 0x01, /* type; rsvd; mapping count */ + 0x00, 0x00, 0x00, 0x00, + }; + vlib_buffer_t *b = create_buffer (map_reply_data, sizeof (map_reply_data)); + map_records_arg_t *mrecs = parse_map_reply (b); + _assert (0 == mrecs); + clib_mem_free (b); + + u8 map_reply_data2[] = + { + 0x00, 0x00, 0x00, 0x01, /* type; rsvd */ + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, /* nonce */ + + /* 1. record - incomplete */ + 0x01, 0x02, 0x03, 0x04, /* record TTL */ + 0x01, /* locator count */ + }; + b = create_buffer (map_reply_data2, sizeof (map_reply_data2)); + mrecs = parse_map_reply (b); + _assert (0 == mrecs); +done: + clib_mem_free (b); + return error; +} + static clib_error_t * test_lisp_parse_lcaf () { @@ -610,6 +654,7 @@ done: _(lisp_msg_push_ecm) \ _(lisp_msg_parse) \ _(lisp_msg_parse_mapping_record) \ + _(lisp_parse_map_reply) \ _(lisp_parse_lcaf) \ _(lisp_map_register) diff --git a/src/tests/vnet/lisp-cp/test_lisp_types.c b/src/tests/vnet/lisp-cp/test_lisp_types.c index fa34a3c6..21575015 100644 --- a/src/tests/vnet/lisp-cp/test_lisp_types.c +++ b/src/tests/vnet/lisp-cp/test_lisp_types.c @@ -18,9 +18,6 @@ #include #include -/* FIXME */ -#include - #define _assert(e) \ error = CLIB_ERROR_ASSERT (e); \ if (error) \ @@ -265,7 +262,6 @@ done: } #endif -#if 0 /* uncomment this once VNI is supported */ static clib_error_t * test_write_mac_in_lcaf (void) { clib_error_t * error = 0; @@ -276,13 +272,12 @@ static clib_error_t * test_write_mac_in_lcaf (void) gid_address_t g = { .mac = {0x1, 0x2, 0x3, 0x4, 0x5, 0x6}, - .vni = 0x30, + .vni = 0x01020304, .vni_mask = 0x10, .type = GID_ADDR_MAC, }; u16 len = gid_address_put (b, &g); - _assert (8 == len); u8 expected[] = { @@ -290,20 +285,20 @@ static clib_error_t * test_write_mac_in_lcaf (void) 0x00, /* reserved1 */ 0x00, /* flags */ 0x02, /* LCAF type = Instance ID */ - 0x20, /* IID/VNI mask len */ - 0x00, 0x0a, /* length */ + 0x10, /* IID/IID mask len */ + 0x00, 0x0c, /* length */ 0x01, 0x02, 0x03, 0x04, /* Instance ID / VNI */ - 0x00, 0x06, /* AFI = MAC */ + 0x40, 0x05, /* AFI = MAC */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06 /* MAC */ - } + }; + _assert (sizeof (expected) == len); _assert (0 == memcmp (expected, b, len)); done: clib_mem_free (b); return error; } -#endif static clib_error_t * test_mac_address_write (void) { @@ -417,6 +412,32 @@ done: return error; } +static clib_error_t * +test_src_dst_deser_bad_afi (void) +{ + clib_error_t * error = 0; + + u8 expected_data[] = + { + 0x40, 0x03, 0x00, 0x00, /* AFI = LCAF, reserved1, flags */ + 0x0c, 0x00, 0x00, 0x14, /* LCAF type = source/dest key, rsvd, length */ + 0x00, 0x00, 0x00, 0x00, /* reserved; source-ML, Dest-ML */ + + 0xde, 0xad, /* AFI = bad value */ + 0x11, 0x22, 0x33, 0x44, + 0x55, 0x66, /* source */ + + 0x40, 0x05, /* AFI = MAC */ + 0x10, 0x21, 0x32, 0x43, + 0x54, 0x65, /* destination */ + }; + + gid_address_t p; + _assert (~0 == gid_address_parse (expected_data, &p)); +done: + return error; +} + static clib_error_t * test_src_dst_serdes (void) { @@ -537,6 +558,8 @@ done: _(mac_address_write) \ _(gid_address_write) \ _(src_dst_serdes) \ + _(write_mac_in_lcaf) \ + _(src_dst_deser_bad_afi) \ _(src_dst_with_vni_serdes) int run_tests (void) diff --git a/src/tests/vnet/lisp-gpe/test.c b/src/tests/vnet/lisp-gpe/test.c deleted file mode 100644 index dde633ae..00000000 --- a/src/tests/vnet/lisp-gpe/test.c +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Copyright (c) 2016 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -int main(int argc, char **argv) { - return 0; -} diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 69c8c7cc..18e2437d 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -205,6 +205,18 @@ vlib_buffer_advance (vlib_buffer_t * b, word l) b->current_length -= l; } +/** \brief Check if there is enough space in buffer to advance + + @param b - (vlib_buffer_t *) pointer to the buffer + @param l - (word) size to check + @return - 0 if there is less space than 'l' in buffer +*/ +always_inline u8 +vlib_buffer_has_space (vlib_buffer_t * b, word l) +{ + return b->current_length >= l; +} + /** \brief Reset current header & length to state they were in when packet was received. diff --git a/src/vnet.am b/src/vnet.am index 25b84616..9f3aedba 100644 --- a/src/vnet.am +++ b/src/vnet.am @@ -662,19 +662,6 @@ nobase_include_HEADERS += \ API_FILES += vnet/lisp-gpe/lisp_gpe.api -if ENABLE_TESTS -TESTS += test_test - -test_test_SOURCES = tests/vnet/lisp-gpe/test.c - -test_test_CPPFLAGS = $(AM_CPPFLAGS) -DCLIB_DEBUG - -test_test_LDADD = $(LIBOBJS) - -noinst_PROGRAMS += $(TESTS) -check_PROGRAMS += $(TESTS) -endif - ######################################## # DHCP client ######################################## diff --git a/src/vnet/lisp-cp/control.c b/src/vnet/lisp-cp/control.c index 6408b297..c0093301 100644 --- a/src/vnet/lisp-cp/control.c +++ b/src/vnet/lisp-cp/control.c @@ -44,13 +44,6 @@ typedef struct u8 smr_invoked; } map_request_args_t; -typedef struct -{ - u64 nonce; - u8 is_rloc_probe; - mapping_t *mappings; -} map_records_arg_t; - u8 vnet_lisp_get_map_request_mode (void) { @@ -3485,7 +3478,7 @@ done: vec_free (itr_rlocs); } -static map_records_arg_t * +map_records_arg_t * parse_map_reply (vlib_buffer_t * b) { locator_t probed; @@ -3501,6 +3494,11 @@ parse_map_reply (vlib_buffer_t * b) mrep_hdr = vlib_buffer_get_current (b); a->nonce = MREP_NONCE (mrep_hdr); a->is_rloc_probe = MREP_RLOC_PROBE (mrep_hdr); + if (!vlib_buffer_has_space (b, sizeof (*mrep_hdr))) + { + clib_mem_free (a); + return 0; + } vlib_buffer_pull (b, sizeof (*mrep_hdr)); for (i = 0; i < MREP_REC_COUNT (mrep_hdr); i++) diff --git a/src/vnet/lisp-cp/control.h b/src/vnet/lisp-cp/control.h index eae8a184..cb98eb09 100644 --- a/src/vnet/lisp-cp/control.h +++ b/src/vnet/lisp-cp/control.h @@ -273,6 +273,13 @@ typedef struct u8 key_id; } vnet_lisp_add_del_mapping_args_t; +typedef struct +{ + u64 nonce; + u8 is_rloc_probe; + mapping_t *mappings; +} map_records_arg_t; + int vnet_lisp_map_cache_add_del (vnet_lisp_add_del_mapping_args_t * a, u32 * map_index); @@ -332,6 +339,8 @@ int vnet_lisp_map_register_enable_disable (u8 is_enable); u8 vnet_lisp_map_register_state_get (void); u8 vnet_lisp_rloc_probe_state_get (void); +map_records_arg_t *parse_map_reply (vlib_buffer_t * b); + always_inline mapping_t * lisp_get_petr_mapping (lisp_cp_main_t * lcm) { diff --git a/src/vnet/lisp-cp/lisp_msg_serdes.c b/src/vnet/lisp-cp/lisp_msg_serdes.c index eee1885c..6c0a7219 100644 --- a/src/vnet/lisp-cp/lisp_msg_serdes.c +++ b/src/vnet/lisp-cp/lisp_msg_serdes.c @@ -312,6 +312,8 @@ lisp_msg_parse_loc (vlib_buffer_t * b, locator_t * loc) if (len == ~0) return ~0; + if (!vlib_buffer_has_space (b, sizeof (len))) + return ~0; vlib_buffer_pull (b, len); return len; @@ -326,6 +328,9 @@ lisp_msg_parse_mapping_record (vlib_buffer_t * b, gid_address_t * eid, int i = 0, len = 0, llen = 0; h = vlib_buffer_get_current (b); + if (!vlib_buffer_has_space (b, sizeof (mapping_record_hdr_t))) + return ~0; + vlib_buffer_pull (b, sizeof (mapping_record_hdr_t)); memset (eid, 0, sizeof (*eid)); @@ -333,6 +338,9 @@ lisp_msg_parse_mapping_record (vlib_buffer_t * b, gid_address_t * eid, if (len == ~0) return len; + if (!vlib_buffer_has_space (b, sizeof (len))) + return ~0; + vlib_buffer_pull (b, len); if (GID_ADDR_IP_PREFIX == gid_address_type (eid)) gid_address_ippref_len (eid) = MAP_REC_EID_PLEN (h); diff --git a/src/vnet/lisp-cp/lisp_types.c b/src/vnet/lisp-cp/lisp_types.c index ad3a4bdf..31a80081 100644 --- a/src/vnet/lisp-cp/lisp_types.c +++ b/src/vnet/lisp-cp/lisp_types.c @@ -657,12 +657,19 @@ fid_addr_parse (u8 * p, fid_address_t * a) return ip_address_parse (p, afi, ip_addr); case FID_ADDR_NSH: - ASSERT (0); break; } return ~0; } +#define INC(dst, exp) \ +do { \ + u16 _sum = (exp); \ + if ((u16)~0 == _sum) \ + return ~0; \ + dst += _sum; \ +} while (0); + u16 sd_parse (u8 * p, void *a) { @@ -677,8 +684,8 @@ sd_parse (u8 * p, void *a) sd_hdr = (lcaf_src_dst_hdr_t *) (p + size); size += sizeof (sd_hdr[0]); - size += fid_addr_parse (p + size, src); - size += fid_addr_parse (p + size, dst); + INC (size, fid_addr_parse (p + size, src)); + INC (size, fid_addr_parse (p + size, dst)); if (fid_addr_type (src) == FID_ADDR_IP_PREF) { @@ -704,7 +711,7 @@ try_parse_src_dst_lcaf (u8 * p, gid_address_t * a) if (LCAF_SOURCE_DEST != lcaf_type (&lcaf)) return ~0; - size += sd_parse (p + size, a); + INC (size, sd_parse (p + size, a)); return size; } @@ -724,13 +731,10 @@ vni_parse (u8 * p, void *a) u16 afi = clib_net_to_host_u16 (*((u16 *) (p + size))); if (LISP_AFI_LCAF == afi) { - u16 len = try_parse_src_dst_lcaf (p + size, g); - if ((u16) ~ 0 == len) - return ~0; - size += len; + INC (size, try_parse_src_dst_lcaf (p + size, g)); } else - size += gid_address_parse (p + size, g); + INC (size, gid_address_parse (p + size, g)); return size; } @@ -757,7 +761,7 @@ lcaf_parse (void *offset, gid_address_t * addr) clib_warning ("Unsupported LCAF type: %u", type); return ~0; } - size += (*lcaf_parse_fcts[type]) (offset + size, lcaf); + INC (size, (*lcaf_parse_fcts[type]) (offset + size, lcaf)); return sizeof (u16) + size; } @@ -1419,10 +1423,9 @@ u32 gid_address_parse (u8 * offset, gid_address_t * a) { lisp_afi_e afi; - int len = 0; + u16 len = 0; - if (!a) - return 0; + ASSERT (a); /* NOTE: since gid_address_parse may be called by vni_parse, we can't 0 * the gid address here */ @@ -1458,7 +1461,7 @@ gid_address_parse (u8 * offset, gid_address_t * a) clib_warning ("LISP AFI %d not supported!", afi); return ~0; } - return len; + return (len == (u16) ~ 0) ? ~0 : len; } void diff --git a/src/vpp.am b/src/vpp.am index 8cdc60d9..d8b3e4ec 100644 --- a/src/vpp.am +++ b/src/vpp.am @@ -100,7 +100,6 @@ bin_test_ha_SOURCES = \ bin_test_ha_LDADD = \ libvlibmemoryclient.la \ - libvlibapi.la \ libsvm.la \ libvppinfra.la \ -lpthread -lm -lrt diff --git a/src/vpp/api/test_client.c b/src/vpp/api/test_client.c index 551bdab9..231b3c18 100644 --- a/src/vpp/api/test_client.c +++ b/src/vpp/api/test_client.c @@ -541,13 +541,6 @@ static void vl_api_create_loopback_instance_reply_t_handler ntohl (mp->retval), ntohl (mp->sw_if_index)); } -static void -vl_api_sr_tunnel_add_del_reply_t_handler (vl_api_sr_tunnel_add_del_reply_t * - mp) -{ - fformat (stdout, "sr tunnel add/del reply %d\n", ntohl (mp->retval)); -} - static void vl_api_l2_patch_add_del_reply_t_handler (vl_api_l2_patch_add_del_reply_t * mp) { @@ -949,7 +942,6 @@ add_ip4_neighbor (test_main_t * tm, int add_del) mp->_vl_msg_id = ntohs (VL_API_IP_NEIGHBOR_ADD_DEL); mp->client_index = tm->my_client_index; mp->context = 0xdeadbeef; - mp->vrf_id = ntohl (11); mp->sw_if_index = ntohl (6); mp->is_add = add_del; @@ -972,7 +964,6 @@ add_ip6_neighbor (test_main_t * tm, int add_del) mp->_vl_msg_id = ntohs (VL_API_IP_NEIGHBOR_ADD_DEL); mp->client_index = tm->my_client_index; mp->context = 0xdeadbeef; - mp->vrf_id = ntohl (11); mp->sw_if_index = ntohl (6); mp->is_add = add_del; mp->is_ipv6 = 1; @@ -1055,9 +1046,7 @@ dhcp_set_proxy (test_main_t * tm, int ipv6) mp->_vl_msg_id = ntohs (VL_API_DHCP_PROXY_CONFIG); mp->client_index = tm->my_client_index; mp->context = 0xdeadbeef; - mp->vrf_id = ntohl (0); mp->is_ipv6 = ipv6; - mp->insert_circuit_id = 1; mp->is_add = 1; mp->dhcp_server[0] = 0x20; mp->dhcp_server[1] = 0x01; diff --git a/src/vppinfra/test_tw_timer.c b/src/vppinfra/test_tw_timer.c index a0287b89..26499509 100644 --- a/src/vppinfra/test_tw_timer.c +++ b/src/vppinfra/test_tw_timer.c @@ -138,7 +138,7 @@ test2_single (tw_timer_test_main_t * tm) tw_timer_wheel_init_2t_1w_2048sl (&tm->single_wheel, expired_timer_single_callback, - 1.0 /* timer interval */ ); + 1.0 /* timer interval */ , ~0); /* Prime offset */ initial_wheel_offset = 757; @@ -266,7 +266,7 @@ test2_double (tw_timer_test_main_t * tm) tw_timer_wheel_init_16t_2w_512sl (&tm->double_wheel, expired_timer_double_callback, - 1.0 /* timer interval */ ); + 1.0 /* timer interval */ , ~0); /* Prime offset */ initial_wheel_offset = 757; @@ -387,7 +387,7 @@ test1_single (tw_timer_test_main_t * tm) tw_timer_wheel_init_2t_1w_2048sl (&tm->single_wheel, expired_timer_single_callback, - 1.0 /* timer interval */ ); + 1.0 /* timer interval */ , ~0); /* * Prime offset, to make sure that the wheel starts in a @@ -454,7 +454,7 @@ test1_double (tw_timer_test_main_t * tm) tw_timer_wheel_init_16t_2w_512sl (&tm->double_wheel, expired_timer_double_callback, - 1.0 /* timer interval */ ); + 1.0 /* timer interval */ , ~0); /* * Prime offset, to make sure that the wheel starts in a -- cgit 1.2.3-korg From 3c785e09a039a20517dd9aab9c32d121a19f9d38 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 8 May 2017 18:37:54 +0200 Subject: vlib: do not unbind devices already bound to vfio-pci Change-Id: I739bed5b9d9504d18ee88206e29ebc4ba1b47d28 Signed-off-by: Damjan Marion --- src/vlib/pci/linux_pci.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'src/vlib') diff --git a/src/vlib/pci/linux_pci.c b/src/vlib/pci/linux_pci.c index f9ee47ac..d43361aa 100644 --- a/src/vlib/pci/linux_pci.c +++ b/src/vlib/pci/linux_pci.c @@ -109,6 +109,11 @@ vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name) goto done; vec_reset_length (s); + s = format (s, "%v/iommu_group%c", dev_dir_name, 0); + if (access ((char *) s, F_OK) == 0) + goto done; + vec_reset_length (s); + /* walk trough all linux interfaces and if interface belonging to this device is founf check if interface is admin up */ dir = opendir ("/sys/class/net"); -- cgit 1.2.3-korg From 7bee80c823ca77de3aca803fdede77e4c7385a52 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 26 Apr 2017 15:32:12 +0200 Subject: Fix remaining 32-bit compile issues Change-Id: I9664214652229b663c3e3ba7406b4ede96bfb123 Signed-off-by: Damjan Marion --- Makefile | 8 ++++---- build-data/platforms/vpp.mk | 5 +++++ src/plugins/dpdk/buffer.c | 6 +++--- src/plugins/ixge/ixge.c | 5 +++-- src/svm/svm.c | 11 ++++++----- src/svm/svmtool.c | 4 ++-- src/tools/vppapigen/gram.y | 6 +++--- src/tools/vppapigen/node.c | 4 ++-- src/uri/uri_tcp_test.c | 21 ++++++++++++--------- src/uri/uri_udp_test.c | 15 +++++++++------ src/vat/api_format.c | 10 +++++----- src/vlib/threads.c | 2 +- src/vlibmemory/memory_client.c | 2 +- src/vlibmemory/memory_vlib.c | 3 ++- src/vnet/devices/virtio/vhost-user.c | 2 +- src/vnet/session/application_interface.c | 2 +- src/vnet/session/session_api.c | 14 +++++++------- src/vnet/tcp/builtin_client.c | 19 +++++++++++-------- src/vnet/tcp/builtin_server.c | 2 +- src/vppinfra/mheap.c | 2 +- 20 files changed, 80 insertions(+), 63 deletions(-) (limited to 'src/vlib') diff --git a/Makefile b/Makefile index 8240e789..b344f377 100644 --- a/Makefile +++ b/Makefile @@ -263,9 +263,9 @@ define test TEST_DIR=$(WS_ROOT)/test \ VPP_TEST_BUILD_DIR=$(BR)/build-$(2)-native \ VPP_TEST_BIN=$(BR)/install-$(2)-native/vpp/bin/vpp \ - VPP_TEST_PLUGIN_PATH=$(BR)/install-$(2)-native/vpp/lib64/vpp_plugins \ + VPP_TEST_PLUGIN_PATH=$(wildcard $(BR)/install-$(2)-native/vpp/lib*/vpp_plugins) \ VPP_TEST_INSTALL_PATH=$(BR)/install-$(2)-native/ \ - LD_LIBRARY_PATH=$(BR)/install-$(2)-native/vpp/lib64/ \ + LD_LIBRARY_PATH=$(subst $(subst ,, ),:,$(wildcard $(BR)/install-$(2)-native/vpp/lib*/)) \ EXTENDED_TESTS=$(EXTENDED_TESTS) \ PYTHON=$(PYTHON) \ $(3) @@ -325,12 +325,12 @@ define run @echo "WARNING: STARTUP_CONF not defined or file doesn't exist." @echo " Running with minimal startup config: $(MINIMAL_STARTUP_CONF)\n" @cd $(STARTUP_DIR) && \ - sudo $(2) $(1)/vpp/bin/vpp $(MINIMAL_STARTUP_CONF) plugin_path $(1)/vpp/lib64/vpp_plugins + sudo $(2) $(1)/vpp/bin/vpp $(MINIMAL_STARTUP_CONF) plugin_path $(wildcard $(1)/vpp/lib*/vpp_plugins) endef else define run @cd $(STARTUP_DIR) && \ - sudo $(2) $(1)/vpp/bin/vpp $(shell cat $(STARTUP_CONF) | sed -e 's/#.*//') plugin_path $(1)/vpp/lib64/vpp_plugins + sudo $(2) $(1)/vpp/bin/vpp $(shell cat $(STARTUP_CONF) | sed -e 's/#.*//') plugin_path $(wildcard $(1)/vpp/lib*/vpp_plugins) endef endif diff --git a/build-data/platforms/vpp.mk b/build-data/platforms/vpp.mk index 5aafdd76..4577fa2e 100644 --- a/build-data/platforms/vpp.mk +++ b/build-data/platforms/vpp.mk @@ -46,6 +46,11 @@ vpp_root_packages = vpp gmod # vpp_dpdk_lib_dir = /usr/lib # vpp_dpdk_shared_lib = yes +# load balancer plugin is not portable on 32 bit platform +ifeq ($(MACHINE),i686) +vpp_configure_args_vpp = --disable-lb-plugin +endif + vpp_debug_TAG_CFLAGS = -g -O0 -DCLIB_DEBUG -DFORTIFY_SOURCE=2 -march=$(MARCH) \ -fstack-protector-all -fPIC -Werror vpp_debug_TAG_LDFLAGS = -g -O0 -DCLIB_DEBUG -DFORTIFY_SOURCE=2 -march=$(MARCH) \ diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c index c80b3fa8..2d4762ab 100644 --- a/src/plugins/dpdk/buffer.c +++ b/src/plugins/dpdk/buffer.c @@ -455,8 +455,8 @@ vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, uword save_vpm_start, save_vpm_end, save_vpm_size; struct rte_mempool_memhdr *memhdr; - this_pool_start = ~0ULL; - this_pool_end = 0LL; + this_pool_start = ~0; + this_pool_end = 0; STAILQ_FOREACH (memhdr, &rmp->mem_list, next) { @@ -465,7 +465,7 @@ vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, if (((uword) memhdr->addr) < this_pool_start) this_pool_start = (uword) (memhdr->addr); } - ASSERT (this_pool_start < ~0ULL && this_pool_end > 0); + ASSERT (this_pool_start < ~0 && this_pool_end > 0); this_pool_size = this_pool_end - this_pool_start; if (CLIB_DEBUG > 1) diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c index 08f5b692..0d287250 100644 --- a/src/plugins/ixge/ixge.c +++ b/src/plugins/ixge/ixge.c @@ -20,7 +20,7 @@ * Please use supported DPDK driver instead. */ -#if __x86_64__ +#if __x86_64__ || __i386__ #include #ifndef CLIB_HAVE_VEC128 @@ -2929,7 +2929,6 @@ ixge_set_next_node (ixge_rx_next_t next, char *name) break; } } -#endif /* *INDENT-OFF* */ VLIB_PLUGIN_REGISTER () = { @@ -2937,8 +2936,10 @@ VLIB_PLUGIN_REGISTER () = { .default_disabled = 1, .description = "Intel 82599 Family Native Driver (experimental)", }; +#endif /* *INDENT-ON* */ + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/svm/svm.c b/src/svm/svm.c index 97add5a7..c96135cf 100644 --- a/src/svm/svm.c +++ b/src/svm/svm.c @@ -491,7 +491,7 @@ svm_map_region (svm_map_region_args_t * a) return (0); } - rp = mmap ((void *) a->baseva, a->size, + rp = mmap (uword_to_pointer (a->baseva, void *), a->size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, svm_fd, 0); if (rp == (svm_region_t *) MAP_FAILED) @@ -533,9 +533,10 @@ svm_map_region (svm_map_region_args_t * a) rp->virtual_size = a->size; rp->region_heap = - mheap_alloc_with_flags ((void *) (a->baseva + MMAP_PAGESIZE), - (a->pvt_heap_size != 0) ? - a->pvt_heap_size : SVM_PVT_MHEAP_SIZE, + mheap_alloc_with_flags (uword_to_pointer + (a->baseva + MMAP_PAGESIZE, void *), + (a->pvt_heap_size != + 0) ? a->pvt_heap_size : SVM_PVT_MHEAP_SIZE, MHEAP_FLAG_DISABLE_VM); oldheap = svm_push_pvt_heap (rp); @@ -661,7 +662,7 @@ svm_map_region (svm_map_region_args_t * a) a->size = rp->virtual_size; munmap (rp, MMAP_PAGESIZE); - rp = (void *) mmap ((void *) a->baseva, a->size, + rp = (void *) mmap (uword_to_pointer (a->baseva, void *), a->size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, svm_fd, 0); if ((uword) rp == (uword) MAP_FAILED) diff --git a/src/svm/svmtool.c b/src/svm/svmtool.c index b3195514..01ae4221 100644 --- a/src/svm/svmtool.c +++ b/src/svm/svmtool.c @@ -172,7 +172,7 @@ svm_map_region_nolock (svm_map_region_args_t * a) a->size = rp->virtual_size; munmap (rp, MMAP_PAGESIZE); - rp = (void *) mmap ((void *) a->baseva, a->size, + rp = (void *) mmap (uword_to_pointer (a->baseva, void *), a->size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, svm_fd, 0); if ((uword) rp == (uword) MAP_FAILED) @@ -401,7 +401,7 @@ repair (char *chroot_path, int crash_root_region) a->size = root_rp->virtual_size; munmap (root_rp, MMAP_PAGESIZE); - root_rp = (void *) mmap ((void *) a->baseva, a->size, + root_rp = (void *) mmap (uword_to_pointer (a->baseva, void *), a->size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, svm_fd, 0); if ((uword) root_rp == (uword) MAP_FAILED) diff --git a/src/tools/vppapigen/gram.y b/src/tools/vppapigen/gram.y index 9cea6023..52bb65c5 100644 --- a/src/tools/vppapigen/gram.y +++ b/src/tools/vppapigen/gram.y @@ -53,9 +53,9 @@ stmt: flist defn {$$ = set_flags($1, $2);} | defn {$$ = $1;} ; -flist: flist flag {$$ = (YYSTYPE)(unsigned long long) - ((unsigned long long) $1 - | (unsigned long long) $2);} +flist: flist flag {$$ = (YYSTYPE)(unsigned long) + ((unsigned long) $1 + | (unsigned long) $2);} | flag {$$ = $1;} ; diff --git a/src/tools/vppapigen/node.c b/src/tools/vppapigen/node.c index 9f234037..15868ee5 100644 --- a/src/tools/vppapigen/node.c +++ b/src/tools/vppapigen/node.c @@ -397,7 +397,7 @@ void node_define_generate (node_t *this, enum passid which, FILE *fp) fprintf(fp, ",\n"); } indent_me(fp); - fprintf (fp, "{\"crc\" : \"0x%08x\"}\n", (u32)(u64)CDATA3); + fprintf (fp, "{\"crc\" : \"0x%08x\"}\n", (u32)(uword)CDATA3); indent -= 4; indent_me(fp); fprintf(fp, "]"); @@ -1219,7 +1219,7 @@ void generate_msg_name_crc_list (YYSTYPE a1, FILE *fp) if (!(np->flags & NODE_FLAG_TYPEONLY)) { fprintf (fp, "\\\n_(VL_API_%s, %s, %08x) ", uppercase (np->data[0]), (i8 *) np->data[0], - (u32)(u64)np->data[3]); + (u32)(uword)np->data[3]); } } np = np->peer; diff --git a/src/uri/uri_tcp_test.c b/src/uri/uri_tcp_test.c index b15fd6ce..22f246e5 100755 --- a/src/uri/uri_tcp_test.c +++ b/src/uri/uri_tcp_test.c @@ -262,7 +262,8 @@ vl_api_application_attach_reply_t_handler (vl_api_application_attach_reply_t * } utm->our_event_queue = - (unix_shared_memory_queue_t *) mp->app_event_queue_address; + uword_to_pointer (mp->app_event_queue_address, + unix_shared_memory_queue_t *); utm->state = STATE_ATTACHED; } @@ -524,8 +525,9 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) return; } - utm->vpp_event_queue = (unix_shared_memory_queue_t *) - mp->vpp_event_queue_address; + utm->vpp_event_queue = + uword_to_pointer (mp->vpp_event_queue_address, + unix_shared_memory_queue_t *); /* * Setup session @@ -534,9 +536,9 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) pool_get (utm->sessions, session); session_index = session - utm->sessions; - rx_fifo = (svm_fifo_t *) mp->server_rx_fifo; + rx_fifo = uword_to_pointer (mp->server_rx_fifo, svm_fifo_t *); rx_fifo->client_session_index = session_index; - tx_fifo = (svm_fifo_t *) mp->server_tx_fifo; + tx_fifo = uword_to_pointer (mp->server_tx_fifo, svm_fifo_t *); tx_fifo->client_session_index = session_index; session->server_rx_fifo = rx_fifo; @@ -858,16 +860,17 @@ vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) ip_str = format (0, "%U", format_ip46_address, &mp->ip, mp->is_ip4); clib_warning ("Accepted session from: %s:%d", ip_str, clib_net_to_host_u16 (mp->port)); - utm->vpp_event_queue = (unix_shared_memory_queue_t *) - mp->vpp_event_queue_address; + utm->vpp_event_queue = + uword_to_pointer (mp->vpp_event_queue_address, + unix_shared_memory_queue_t *); /* Allocate local session and set it up */ pool_get (utm->sessions, session); session_index = session - utm->sessions; - rx_fifo = (svm_fifo_t *) mp->server_rx_fifo; + rx_fifo = uword_to_pointer (mp->server_rx_fifo, svm_fifo_t *); rx_fifo->client_session_index = session_index; - tx_fifo = (svm_fifo_t *) mp->server_tx_fifo; + tx_fifo = uword_to_pointer (mp->server_tx_fifo, svm_fifo_t *); tx_fifo->client_session_index = session_index; session->server_rx_fifo = rx_fifo; diff --git a/src/uri/uri_udp_test.c b/src/uri/uri_udp_test.c index 266215c8..8fb12ed2 100644 --- a/src/uri/uri_udp_test.c +++ b/src/uri/uri_udp_test.c @@ -232,7 +232,8 @@ vl_api_application_attach_reply_t_handler (vl_api_application_attach_reply_t * } utm->our_event_queue = - (unix_shared_memory_queue_t *) mp->app_event_queue_address; + uword_to_pointer (mp->app_event_queue_address, + unix_shared_memory_queue_t *); } static void @@ -581,7 +582,8 @@ send_reply: vec_free (a->segment_name); - client_q = (unix_shared_memory_queue_t *) mp->client_queue_address; + client_q = + uword_to_pointer (mp->client_queue_address, unix_shared_memory_queue_t *); vl_msg_api_send_shmem (client_q, (u8 *) & rmp); } @@ -608,14 +610,15 @@ vl_api_accept_session_t_handler (vl_api_accept_session_t * mp) if (start_time == 0.0) start_time = clib_time_now (&utm->clib_time); - utm->vpp_event_queue = (unix_shared_memory_queue_t *) - mp->vpp_event_queue_address; + utm->vpp_event_queue = + uword_to_pointer (mp->vpp_event_queue_address, + unix_shared_memory_queue_t *); pool_get (utm->sessions, session); - rx_fifo = (svm_fifo_t *) mp->server_rx_fifo; + rx_fifo = uword_to_pointer (mp->server_rx_fifo, svm_fifo_t *); rx_fifo->client_session_index = session - utm->sessions; - tx_fifo = (svm_fifo_t *) mp->server_tx_fifo; + tx_fifo = uword_to_pointer (mp->server_tx_fifo, svm_fifo_t *); tx_fifo->client_session_index = session - utm->sessions; session->server_rx_fifo = rx_fifo; diff --git a/src/vat/api_format.c b/src/vat/api_format.c index 28b227b4..495b660e 100644 --- a/src/vat/api_format.c +++ b/src/vat/api_format.c @@ -1037,7 +1037,7 @@ vl_api_cli_reply_t_handler (vl_api_cli_reply_t * mp) i32 retval = ntohl (mp->retval); vam->retval = retval; - vam->shmem_result = (u8 *) mp->reply_in_shmem; + vam->shmem_result = uword_to_pointer (mp->reply_in_shmem, u8 *); vam->result_ready = 1; } @@ -1058,7 +1058,7 @@ vl_api_cli_reply_t_handler_json (vl_api_cli_reply_t * mp) pthread_mutex_lock (&am->vlib_rp->mutex); oldheap = svm_push_data_heap (am->vlib_rp); - reply = (u8 *) (mp->reply_in_shmem); + reply = uword_to_pointer (mp->reply_in_shmem, u8 *); vec_free (reply); svm_pop_heap (oldheap); @@ -2405,7 +2405,7 @@ static void vl_api_get_node_graph_reply_t_handler if (retval != 0) return; - reply = (u8 *) (mp->reply_in_shmem); + reply = uword_to_pointer (mp->reply_in_shmem, u8 *); pvt_copy = vec_dup (reply); /* Toss the shared-memory original... */ @@ -2456,7 +2456,7 @@ static void vl_api_get_node_graph_reply_t_handler_json vat_json_object_add_int (&node, "retval", ntohl (mp->retval)); vat_json_object_add_uint (&node, "reply_in_shmem", mp->reply_in_shmem); - reply = (u8 *) (mp->reply_in_shmem); + reply = uword_to_pointer (mp->reply_in_shmem, u8 *); /* Toss the shared-memory original... */ pthread_mutex_lock (&am->vlib_rp->mutex); @@ -4959,7 +4959,7 @@ exec (vat_main_t * vam) svm_pop_heap (oldheap); pthread_mutex_unlock (&am->vlib_rp->mutex); - mp->cmd_in_shmem = (u64) cmd; + mp->cmd_in_shmem = pointer_to_uword (cmd); S (mp); timeout = vat_time_now (vam) + 10.0; diff --git a/src/vlib/threads.c b/src/vlib/threads.c index 4a111f8d..9ccfd3a2 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -1125,7 +1125,7 @@ cpu_config (vlib_main_t * vm, unformat_input_t * input) VLIB_EARLY_CONFIG_FUNCTION (cpu_config, "cpu"); -#if !defined (__x86_64__) && !defined (__aarch64__) && !defined (__powerpc64__) && !defined(__arm__) +#if !defined (__x86_64__) && !defined (__i386__) && !defined (__aarch64__) && !defined (__powerpc64__) && !defined(__arm__) void __sync_fetch_and_add_8 (void) { diff --git a/src/vlibmemory/memory_client.c b/src/vlibmemory/memory_client.c index d48a4fa1..a162d6bb 100644 --- a/src/vlibmemory/memory_client.c +++ b/src/vlibmemory/memory_client.c @@ -137,7 +137,7 @@ vl_api_memclnt_create_reply_t_handler (vl_api_memclnt_create_reply_t * mp) am->msg_index_by_name_and_crc = hash_create_string (0, sizeof (uword)); /* Recreate the vnet-side API message handler table */ - tblv = (u8 *) mp->message_table; + tblv = uword_to_pointer (mp->message_table, u8 *); serialize_open_vector (sm, tblv); unserialize_integer (sm, &nmsgs, sizeof (u32)); diff --git a/src/vlibmemory/memory_vlib.c b/src/vlibmemory/memory_vlib.c index 29a5c2c2..acba8b3f 100644 --- a/src/vlibmemory/memory_vlib.c +++ b/src/vlibmemory/memory_vlib.c @@ -216,7 +216,8 @@ vl_api_memclnt_create_t_handler (vl_api_memclnt_create_t * mp) am->shmem_hdr->application_restarts); rp->context = mp->context; rp->response = ntohl (rv); - rp->message_table = (u64) am->serialized_message_table_in_shmem; + rp->message_table = + pointer_to_uword (am->serialized_message_table_in_shmem); vl_msg_api_send_shmem (q, (u8 *) & rp); } diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c index acc7bf82..6ccc0d87 100644 --- a/src/vnet/devices/virtio/vhost-user.c +++ b/src/vnet/devices/virtio/vhost-user.c @@ -719,7 +719,7 @@ vhost_user_log_dirty_pages_2 (vhost_user_intf_t * vui, } if (is_host_address) { - addr = (u64) map_user_mem (vui, (uword) addr); + addr = pointer_to_uword (map_user_mem (vui, (uword) addr)); } if (PREDICT_FALSE ((addr + len - 1) / VHOST_LOG_PAGE / 8 >= vui->log_size)) { diff --git a/src/vnet/session/application_interface.c b/src/vnet/session/application_interface.c index ad44baa1..f74b0cfe 100644 --- a/src/vnet/session/application_interface.c +++ b/src/vnet/session/application_interface.c @@ -247,7 +247,7 @@ vnet_application_attach (vnet_app_attach_args_t * a) a->session_cb_vft))) return rv; - a->app_event_queue_address = (u64) app->event_queue; + a->app_event_queue_address = pointer_to_uword (app->event_queue); sm = segment_manager_get (app->first_segment_manager); segment_manager_get_segment_info (sm->segment_indices[0], &seg_name, &a->segment_size); diff --git a/src/vnet/session/session_api.c b/src/vnet/session/session_api.c index 5a02a08e..8266922c 100755 --- a/src/vnet/session/session_api.c +++ b/src/vnet/session/session_api.c @@ -102,9 +102,9 @@ send_session_accept_callback (stream_session_t * s) tc = tp_vft->get_connection (s->connection_index, s->thread_index); mp->listener_handle = listen_session_get_handle (listener); mp->handle = stream_session_handle (s); - mp->server_rx_fifo = (u64) s->server_rx_fifo; - mp->server_tx_fifo = (u64) s->server_tx_fifo; - mp->vpp_event_queue_address = (u64) vpp_queue; + mp->server_rx_fifo = pointer_to_uword (s->server_rx_fifo); + mp->server_tx_fifo = pointer_to_uword (s->server_tx_fifo); + mp->vpp_event_queue_address = pointer_to_uword (vpp_queue); mp->port = tc->rmt_port; mp->is_ip4 = tc->is_ip4; clib_memcpy (&mp->ip, &tc->rmt_ip, sizeof (tc->rmt_ip)); @@ -172,10 +172,10 @@ send_session_connected_callback (u32 app_index, u32 api_context, if (!is_fail) { vpp_queue = session_manager_get_vpp_event_queue (s->thread_index); - mp->server_rx_fifo = (u64) s->server_rx_fifo; - mp->server_tx_fifo = (u64) s->server_tx_fifo; + mp->server_rx_fifo = pointer_to_uword (s->server_rx_fifo); + mp->server_tx_fifo = pointer_to_uword (s->server_tx_fifo); mp->handle = stream_session_handle (s); - mp->vpp_event_queue_address = (u64) vpp_queue; + mp->vpp_event_queue_address = pointer_to_uword (vpp_queue); mp->retval = 0; } else @@ -225,7 +225,7 @@ redirect_connect_callback (u32 server_api_client_index, void *mp_arg) } /* Tell the server the client's API queue address, so it can reply */ - mp->client_queue_address = (u64) client_q; + mp->client_queue_address = pointer_to_uword (client_q); app = application_lookup (mp->client_index); if (!app) { diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 32d69a96..6f890874 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -274,11 +274,12 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) return; } - tm->our_event_queue = (unix_shared_memory_queue_t *) - mp->vpp_event_queue_address; - - tm->vpp_event_queue = (unix_shared_memory_queue_t *) - mp->vpp_event_queue_address; + tm->our_event_queue = + uword_to_pointer (mp->vpp_event_queue_address, + unix_shared_memory_queue_t *); + tm->vpp_event_queue = + uword_to_pointer (mp->vpp_event_queue_address, + unix_shared_memory_queue_t *); /* * Setup session @@ -288,9 +289,11 @@ vl_api_connect_uri_reply_t_handler (vl_api_connect_uri_reply_t * mp) session_index = session - tm->sessions; session->bytes_to_receive = session->bytes_to_send = tm->bytes_to_send; - session->server_rx_fifo = (svm_fifo_t *) mp->server_rx_fifo; + session->server_rx_fifo = + uword_to_pointer (mp->server_rx_fifo, svm_fifo_t *); session->server_rx_fifo->client_session_index = session_index; - session->server_tx_fifo = (svm_fifo_t *) mp->server_tx_fifo; + session->server_tx_fifo = + uword_to_pointer (mp->server_tx_fifo, svm_fifo_t *); session->server_tx_fifo->client_session_index = session_index; session->vpp_session_handle = mp->handle; @@ -321,7 +324,7 @@ create_api_loopback (tclient_main_t * tm) memset (mp, 0, sizeof (*mp)); mp->_vl_msg_id = VL_API_MEMCLNT_CREATE; mp->context = 0xFEEDFACE; - mp->input_queue = (u64) tm->vl_input_queue; + mp->input_queue = pointer_to_uword (tm->vl_input_queue); strncpy ((char *) mp->name, "tcp_tester", sizeof (mp->name) - 1); vl_api_memclnt_create_t_handler (mp); diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index 34682699..621ce02a 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -244,7 +244,7 @@ create_api_loopback (vlib_main_t * vm) memset (mp, 0, sizeof (*mp)); mp->_vl_msg_id = VL_API_MEMCLNT_CREATE; mp->context = 0xFEEDFACE; - mp->input_queue = (u64) bsm->vl_input_queue; + mp->input_queue = pointer_to_uword (bsm->vl_input_queue); strncpy ((char *) mp->name, "tcp_test_server", sizeof (mp->name) - 1); vl_api_memclnt_create_t_handler (mp); diff --git a/src/vppinfra/mheap.c b/src/vppinfra/mheap.c index b8828f9e..192732db 100644 --- a/src/vppinfra/mheap.c +++ b/src/vppinfra/mheap.c @@ -304,7 +304,7 @@ mheap_small_object_cache_mask (mheap_small_object_cache_t * c, uword bin) uword mask; /* $$$$ ELIOT FIXME: add Altivec version of this routine */ -#if !defined (CLIB_HAVE_VEC128) || defined (__ALTIVEC__) +#if !defined (CLIB_HAVE_VEC128) || defined (__ALTIVEC__) || defined (__i386__) mask = 0; #else u8x16 b = u8x16_splat (bin); -- cgit 1.2.3-korg From f55f9b851f59264d737d92c6277a87588c565d24 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 10 May 2017 21:06:28 +0200 Subject: completelly deprecate os_get_cpu_number, replace new occurences Change-Id: I82c663bc0866c6c68ba354104b0bb059387f4b9d Signed-off-by: Damjan Marion --- src/plugins/flowperpkt/l2_node.c | 20 ++++++++++---------- src/plugins/flowperpkt/node.c | 20 ++++++++++---------- src/plugins/snat/in2out.c | 2 +- src/plugins/snat/out2in.c | 2 +- src/vlib/main.h | 2 +- src/vlib/threads.c | 12 ++---------- src/vlib/threads.h | 3 +-- src/vlib/unix/main.c | 2 +- src/vlibmemory/memory_vlib.c | 2 +- src/vnet/dpo/interface_dpo.c | 8 ++++---- src/vnet/lisp-gpe/lisp_gpe_adjacency.c | 2 +- src/vppinfra/bihash_template.c | 16 ++++++++-------- src/vppinfra/lock.h | 6 +++--- src/vppinfra/mem.h | 6 +++--- src/vppinfra/mhash.c | 2 +- src/vppinfra/mhash.h | 2 +- src/vppinfra/mheap.c | 4 ++-- src/vppinfra/os.h | 20 ++++++++++++++++++-- src/vppinfra/smp.c | 2 +- src/vppinfra/unix-misc.c | 19 +++++++------------ 20 files changed, 77 insertions(+), 75 deletions(-) (limited to 'src/vlib') diff --git a/src/plugins/flowperpkt/l2_node.c b/src/plugins/flowperpkt/l2_node.c index fdaf81d1..db80e990 100644 --- a/src/plugins/flowperpkt/l2_node.c +++ b/src/plugins/flowperpkt/l2_node.c @@ -102,7 +102,7 @@ add_to_flow_record_l2 (vlib_main_t * vm, u8 * src_mac, u8 * dst_mac, u16 ethertype, u64 timestamp, u16 length, int do_flush) { - u32 my_cpu_number = vm->thread_index; + u32 my_thread_index = vm->thread_index; flow_report_main_t *frm = &flow_report_main; ip4_header_t *ip; udp_header_t *udp; @@ -116,7 +116,7 @@ add_to_flow_record_l2 (vlib_main_t * vm, vlib_buffer_free_list_t *fl; /* Find or allocate a buffer */ - b0 = fm->l2_buffers_per_worker[my_cpu_number]; + b0 = fm->l2_buffers_per_worker[my_thread_index]; /* Need to allocate a buffer? */ if (PREDICT_FALSE (b0 == 0)) @@ -130,7 +130,7 @@ add_to_flow_record_l2 (vlib_main_t * vm, return; /* Initialize the buffer */ - b0 = fm->l2_buffers_per_worker[my_cpu_number] = + b0 = fm->l2_buffers_per_worker[my_thread_index] = vlib_get_buffer (vm, bi0); fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); @@ -142,16 +142,16 @@ add_to_flow_record_l2 (vlib_main_t * vm, { /* use the current buffer */ bi0 = vlib_get_buffer_index (vm, b0); - offset = fm->l2_next_record_offset_per_worker[my_cpu_number]; + offset = fm->l2_next_record_offset_per_worker[my_thread_index]; } /* Find or allocate a frame */ - f = fm->l2_frames_per_worker[my_cpu_number]; + f = fm->l2_frames_per_worker[my_thread_index]; if (PREDICT_FALSE (f == 0)) { u32 *to_next; f = vlib_get_frame_to_node (vm, ip4_lookup_node.index); - fm->l2_frames_per_worker[my_cpu_number] = f; + fm->l2_frames_per_worker[my_thread_index] = f; /* Enqueue the buffer */ to_next = vlib_frame_vector_args (f); @@ -299,13 +299,13 @@ add_to_flow_record_l2 (vlib_main_t * vm, } vlib_put_frame_to_node (vm, ip4_lookup_node.index, - fm->l2_frames_per_worker[my_cpu_number]); - fm->l2_frames_per_worker[my_cpu_number] = 0; - fm->l2_buffers_per_worker[my_cpu_number] = 0; + fm->l2_frames_per_worker[my_thread_index]); + fm->l2_frames_per_worker[my_thread_index] = 0; + fm->l2_buffers_per_worker[my_thread_index] = 0; offset = 0; } - fm->l2_next_record_offset_per_worker[my_cpu_number] = offset; + fm->l2_next_record_offset_per_worker[my_thread_index] = offset; } void diff --git a/src/plugins/flowperpkt/node.c b/src/plugins/flowperpkt/node.c index 0277682d..9bac4166 100644 --- a/src/plugins/flowperpkt/node.c +++ b/src/plugins/flowperpkt/node.c @@ -101,7 +101,7 @@ add_to_flow_record_ipv4 (vlib_main_t * vm, u32 src_address, u32 dst_address, u8 tos, u64 timestamp, u16 length, int do_flush) { - u32 my_cpu_number = vm->thread_index; + u32 my_thread_index = vm->thread_index; flow_report_main_t *frm = &flow_report_main; ip4_header_t *ip; udp_header_t *udp; @@ -115,7 +115,7 @@ add_to_flow_record_ipv4 (vlib_main_t * vm, vlib_buffer_free_list_t *fl; /* Find or allocate a buffer */ - b0 = fm->ipv4_buffers_per_worker[my_cpu_number]; + b0 = fm->ipv4_buffers_per_worker[my_thread_index]; /* Need to allocate a buffer? */ if (PREDICT_FALSE (b0 == 0)) @@ -129,7 +129,7 @@ add_to_flow_record_ipv4 (vlib_main_t * vm, return; /* Initialize the buffer */ - b0 = fm->ipv4_buffers_per_worker[my_cpu_number] = + b0 = fm->ipv4_buffers_per_worker[my_thread_index] = vlib_get_buffer (vm, bi0); fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); @@ -141,16 +141,16 @@ add_to_flow_record_ipv4 (vlib_main_t * vm, { /* use the current buffer */ bi0 = vlib_get_buffer_index (vm, b0); - offset = fm->ipv4_next_record_offset_per_worker[my_cpu_number]; + offset = fm->ipv4_next_record_offset_per_worker[my_thread_index]; } /* Find or allocate a frame */ - f = fm->ipv4_frames_per_worker[my_cpu_number]; + f = fm->ipv4_frames_per_worker[my_thread_index]; if (PREDICT_FALSE (f == 0)) { u32 *to_next; f = vlib_get_frame_to_node (vm, ip4_lookup_node.index); - fm->ipv4_frames_per_worker[my_cpu_number] = f; + fm->ipv4_frames_per_worker[my_thread_index] = f; /* Enqueue the buffer */ to_next = vlib_frame_vector_args (f); @@ -300,13 +300,13 @@ add_to_flow_record_ipv4 (vlib_main_t * vm, } vlib_put_frame_to_node (vm, ip4_lookup_node.index, - fm->ipv4_frames_per_worker[my_cpu_number]); - fm->ipv4_frames_per_worker[my_cpu_number] = 0; - fm->ipv4_buffers_per_worker[my_cpu_number] = 0; + fm->ipv4_frames_per_worker[my_thread_index]); + fm->ipv4_frames_per_worker[my_thread_index] = 0; + fm->ipv4_buffers_per_worker[my_thread_index] = 0; offset = 0; } - fm->ipv4_next_record_offset_per_worker[my_cpu_number] = offset; + fm->ipv4_next_record_offset_per_worker[my_thread_index] = offset; } void diff --git a/src/plugins/snat/in2out.c b/src/plugins/snat/in2out.c index f7d29c69..bc86a7a4 100644 --- a/src/plugins/snat/in2out.c +++ b/src/plugins/snat/in2out.c @@ -1514,7 +1514,7 @@ snat_det_in2out_node_fn (vlib_main_t * vm, u32 pkts_processed = 0; snat_main_t * sm = &snat_main; u32 now = (u32) vlib_time_now (vm); - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; diff --git a/src/plugins/snat/out2in.c b/src/plugins/snat/out2in.c index 3d7b106a..824406ab 100644 --- a/src/plugins/snat/out2in.c +++ b/src/plugins/snat/out2in.c @@ -1168,7 +1168,7 @@ snat_det_out2in_node_fn (vlib_main_t * vm, snat_out2in_next_t next_index; u32 pkts_processed = 0; snat_main_t * sm = &snat_main; - u32 thread_index = os_get_cpu_number (); + u32 thread_index = vlib_get_thread_index (); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; diff --git a/src/vlib/main.h b/src/vlib/main.h index 329bf073..0e8026d1 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -320,7 +320,7 @@ always_inline void vlib_set_queue_signal_callback /* Main routine. */ int vlib_main (vlib_main_t * vm, unformat_input_t * input); -/* Thread stacks, for os_get_cpu_number */ +/* Thread stacks, for os_get_thread_index */ extern u8 **vlib_thread_stacks; /* Number of thread stacks that the application needs */ diff --git a/src/vlib/threads.c b/src/vlib/threads.c index 9ccfd3a2..b7bc9e26 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -35,16 +35,8 @@ vl (void *p) vlib_worker_thread_t *vlib_worker_threads; vlib_thread_main_t vlib_thread_main; -__thread uword vlib_thread_index = 0; - -uword -os_get_cpu_number (void) -{ - return vlib_thread_index; -} - uword -os_get_ncpus (void) +os_get_nthreads (void) { u32 len; @@ -467,7 +459,7 @@ vlib_worker_thread_bootstrap_fn (void *arg) w->lwp = syscall (SYS_gettid); w->thread_id = pthread_self (); - vlib_thread_index = w - vlib_worker_threads; + __os_thread_index = w - vlib_worker_threads; rv = (void *) clib_calljmp ((uword (*)(uword)) w->thread_function, diff --git a/src/vlib/threads.h b/src/vlib/threads.h index 101d3d4a..17d35a24 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -181,11 +181,10 @@ u32 vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts); void vlib_worker_thread_barrier_sync (vlib_main_t * vm); void vlib_worker_thread_barrier_release (vlib_main_t * vm); -extern __thread uword vlib_thread_index; static_always_inline uword vlib_get_thread_index (void) { - return vlib_thread_index; + return __os_thread_index; } always_inline void diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index db5ddd64..103576db 100644 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -565,7 +565,7 @@ vlib_unix_main (int argc, char *argv[]) vlib_thread_stack_init (0); - vlib_thread_index = 0; + __os_thread_index = 0; i = clib_calljmp (thread0, (uword) vm, (void *) (vlib_thread_stacks[0] + diff --git a/src/vlibmemory/memory_vlib.c b/src/vlibmemory/memory_vlib.c index acba8b3f..e5d88732 100644 --- a/src/vlibmemory/memory_vlib.c +++ b/src/vlibmemory/memory_vlib.c @@ -1333,7 +1333,7 @@ vl_api_rpc_call_main_thread (void *fp, u8 * data, u32 data_length) unix_shared_memory_queue_t *q; /* Main thread: call the function directly */ - if (os_get_cpu_number () == 0) + if (vlib_get_thread_index () == 0) { vlib_main_t *vm = vlib_get_main (); void (*call_fp) (void *); diff --git a/src/vnet/dpo/interface_dpo.c b/src/vnet/dpo/interface_dpo.c index 50ca756f..8d700c23 100644 --- a/src/vnet/dpo/interface_dpo.c +++ b/src/vnet/dpo/interface_dpo.c @@ -231,7 +231,7 @@ interface_dpo_inline (vlib_main_t * vm, vlib_frame_t * from_frame) { u32 n_left_from, next_index, * from, * to_next; - u32 cpu_index = os_get_cpu_number(); + u32 thread_index = vlib_get_thread_index (); vnet_interface_main_t *im; im = &vnet_get_main ()->interface_main; @@ -274,13 +274,13 @@ interface_dpo_inline (vlib_main_t * vm, vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, ido0->ido_sw_if_index, 1, vlib_buffer_length_in_chain (vm, b0)); vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, ido1->ido_sw_if_index, 1, vlib_buffer_length_in_chain (vm, b1)); @@ -331,7 +331,7 @@ interface_dpo_inline (vlib_main_t * vm, /* Bump the interface's RX coutners */ vlib_increment_combined_counter (im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX, - cpu_index, + thread_index, ido0->ido_sw_if_index, 1, vlib_buffer_length_in_chain (vm, b0)); diff --git a/src/vnet/lisp-gpe/lisp_gpe_adjacency.c b/src/vnet/lisp-gpe/lisp_gpe_adjacency.c index d5f3a28a..7db1c9bb 100644 --- a/src/vnet/lisp-gpe/lisp_gpe_adjacency.c +++ b/src/vnet/lisp-gpe/lisp_gpe_adjacency.c @@ -302,7 +302,7 @@ lisp_gpe_increment_stats_counters (lisp_cp_main_t * lcm, ip_adjacency_t * adj, /* compute payload length starting after GPE */ u32 bytes = b->current_length - (lisp_data - b->data - b->current_data); - vlib_increment_combined_counter (&lgm->counters, os_get_cpu_number (), + vlib_increment_combined_counter (&lgm->counters, vlib_get_thread_index (), p[0], 1, bytes); } diff --git a/src/vppinfra/bihash_template.c b/src/vppinfra/bihash_template.c index d8b97b5f..51fadeb8 100644 --- a/src/vppinfra/bihash_template.c +++ b/src/vppinfra/bihash_template.c @@ -96,12 +96,12 @@ BV (make_working_copy) (BVT (clib_bihash) * h, clib_bihash_bucket_t * b) clib_bihash_bucket_t working_bucket __attribute__ ((aligned (8))); void *oldheap; BVT (clib_bihash_value) * working_copy; - u32 cpu_number = os_get_cpu_number (); + u32 thread_index = os_get_thread_index (); - if (cpu_number >= vec_len (h->working_copies)) + if (thread_index >= vec_len (h->working_copies)) { oldheap = clib_mem_set_heap (h->mheap); - vec_validate (h->working_copies, cpu_number); + vec_validate (h->working_copies, thread_index); clib_mem_set_heap (oldheap); } @@ -110,7 +110,7 @@ BV (make_working_copy) (BVT (clib_bihash) * h, clib_bihash_bucket_t * b) * updates from multiple threads will not result in sporadic, spurious * lookup failures. */ - working_copy = h->working_copies[cpu_number]; + working_copy = h->working_copies[thread_index]; h->saved_bucket.as_u64 = b->as_u64; oldheap = clib_mem_set_heap (h->mheap); @@ -119,7 +119,7 @@ BV (make_working_copy) (BVT (clib_bihash) * h, clib_bihash_bucket_t * b) { vec_validate_aligned (working_copy, (1 << b->log2_pages) - 1, sizeof (u64)); - h->working_copies[cpu_number] = working_copy; + h->working_copies[thread_index] = working_copy; } _vec_len (working_copy) = 1 << b->log2_pages; @@ -132,7 +132,7 @@ BV (make_working_copy) (BVT (clib_bihash) * h, clib_bihash_bucket_t * b) working_bucket.offset = BV (clib_bihash_get_offset) (h, working_copy); CLIB_MEMORY_BARRIER (); b->as_u64 = working_bucket.as_u64; - h->working_copies[cpu_number] = working_copy; + h->working_copies[thread_index] = working_copy; } static @@ -233,7 +233,7 @@ int BV (clib_bihash_add_del) int i, limit; u64 hash, new_hash; u32 new_log2_pages; - u32 cpu_number = os_get_cpu_number (); + u32 thread_index = os_get_thread_index (); int mark_bucket_linear; int resplit_once; @@ -323,7 +323,7 @@ int BV (clib_bihash_add_del) new_log2_pages = h->saved_bucket.log2_pages + 1; mark_bucket_linear = 0; - working_copy = h->working_copies[cpu_number]; + working_copy = h->working_copies[thread_index]; resplit_once = 0; new_v = BV (split_and_rehash) (h, working_copy, new_log2_pages); diff --git a/src/vppinfra/lock.h b/src/vppinfra/lock.h index c60ff414..0cd2b4fe 100644 --- a/src/vppinfra/lock.h +++ b/src/vppinfra/lock.h @@ -24,7 +24,7 @@ typedef struct u32 lock; #if CLIB_DEBUG > 0 pid_t pid; - uword cpu_index; + uword thread_index; void *frame_address; #endif } *clib_spinlock_t; @@ -57,7 +57,7 @@ clib_spinlock_lock (clib_spinlock_t * p) #if CLIB_DEBUG > 0 (*p)->frame_address = __builtin_frame_address (0); (*p)->pid = getpid (); - (*p)->cpu_index = os_get_cpu_number (); + (*p)->thread_index = os_get_thread_index (); #endif } @@ -75,7 +75,7 @@ clib_spinlock_unlock (clib_spinlock_t * p) #if CLIB_DEBUG > 0 (*p)->frame_address = 0; (*p)->pid = 0; - (*p)->cpu_index = 0; + (*p)->thread_index = 0; #endif } diff --git a/src/vppinfra/mem.h b/src/vppinfra/mem.h index 1260eab2..63c5ac16 100644 --- a/src/vppinfra/mem.h +++ b/src/vppinfra/mem.h @@ -54,14 +54,14 @@ extern void *clib_per_cpu_mheaps[CLIB_MAX_MHEAPS]; always_inline void * clib_mem_get_per_cpu_heap (void) { - int cpu = os_get_cpu_number (); + int cpu = os_get_thread_index (); return clib_per_cpu_mheaps[cpu]; } always_inline void * clib_mem_set_per_cpu_heap (u8 * new_heap) { - int cpu = os_get_cpu_number (); + int cpu = os_get_thread_index (); void *old = clib_per_cpu_mheaps[cpu]; clib_per_cpu_mheaps[cpu] = new_heap; return old; @@ -83,7 +83,7 @@ clib_mem_alloc_aligned_at_offset (uword size, uword align, uword align_offset, align_offset = align; } - cpu = os_get_cpu_number (); + cpu = os_get_thread_index (); heap = clib_per_cpu_mheaps[cpu]; heap = mheap_get_aligned (heap, size, align, align_offset, &offset); clib_per_cpu_mheaps[cpu] = heap; diff --git a/src/vppinfra/mhash.c b/src/vppinfra/mhash.c index c917e164..00b67c49 100644 --- a/src/vppinfra/mhash.c +++ b/src/vppinfra/mhash.c @@ -226,7 +226,7 @@ static uword mhash_set_tmp_key (mhash_t * h, const void *key) { u8 *key_tmp; - int my_cpu = os_get_cpu_number (); + int my_cpu = os_get_thread_index (); vec_validate (h->key_tmps, my_cpu); key_tmp = h->key_tmps[my_cpu]; diff --git a/src/vppinfra/mhash.h b/src/vppinfra/mhash.h index 102adf4e..7eb19183 100644 --- a/src/vppinfra/mhash.h +++ b/src/vppinfra/mhash.h @@ -93,7 +93,7 @@ mhash_key_to_mem (mhash_t * h, uword key) { u8 *key_tmp; - int my_cpu = os_get_cpu_number (); + int my_cpu = os_get_thread_index (); vec_validate (h->key_tmps, my_cpu); key_tmp = h->key_tmps[my_cpu]; return key_tmp; diff --git a/src/vppinfra/mheap.c b/src/vppinfra/mheap.c index 192732db..d4010ceb 100644 --- a/src/vppinfra/mheap.c +++ b/src/vppinfra/mheap.c @@ -56,7 +56,7 @@ mheap_maybe_lock (void *v) mheap_t *h = mheap_header (v); if (v && (h->flags & MHEAP_FLAG_THREAD_SAFE)) { - u32 my_cpu = os_get_cpu_number (); + u32 my_cpu = os_get_thread_index (); if (h->owner_cpu == my_cpu) { h->recursion_count++; @@ -77,7 +77,7 @@ mheap_maybe_unlock (void *v) mheap_t *h = mheap_header (v); if (v && h->flags & MHEAP_FLAG_THREAD_SAFE) { - ASSERT (os_get_cpu_number () == h->owner_cpu); + ASSERT (os_get_thread_index () == h->owner_cpu); if (--h->recursion_count == 0) { h->owner_cpu = ~0; diff --git a/src/vppinfra/os.h b/src/vppinfra/os.h index a5c74f8c..33300716 100644 --- a/src/vppinfra/os.h +++ b/src/vppinfra/os.h @@ -56,8 +56,24 @@ void os_out_of_memory (void); /* Estimate, measure or divine CPU timestamp clock frequency. */ f64 os_cpu_clock_frequency (void); -uword os_get_cpu_number (void); -uword os_get_ncpus (void); +extern __thread uword __os_thread_index; + +static_always_inline uword +os_get_thread_index (void) +{ + return __os_thread_index; +} + +static_always_inline uword +os_get_cpu_number (void) __attribute__ ((deprecated)); + +static_always_inline uword +os_get_cpu_number (void) +{ + return __os_thread_index; +} + +uword os_get_nthreads (void); #include diff --git a/src/vppinfra/smp.c b/src/vppinfra/smp.c index 8ac19960..f603283e 100644 --- a/src/vppinfra/smp.c +++ b/src/vppinfra/smp.c @@ -53,7 +53,7 @@ allocate_per_cpu_mheap (uword cpu) void *heap; uword vm_size, stack_size, mheap_flags; - ASSERT (os_get_cpu_number () == cpu); + ASSERT (os_get_thread_index () == cpu); vm_size = (uword) 1 << m->log2_n_per_cpu_vm_bytes; stack_size = (uword) 1 << m->log2_n_per_cpu_stack_bytes; diff --git a/src/vppinfra/unix-misc.c b/src/vppinfra/unix-misc.c index 2928369d..361015b4 100644 --- a/src/vppinfra/unix-misc.c +++ b/src/vppinfra/unix-misc.c @@ -45,6 +45,8 @@ #include #include /* for sprintf */ +__thread uword __os_thread_index = 0; + clib_error_t * unix_file_n_bytes (char *file, uword * result) { @@ -188,14 +190,14 @@ void os_puts (u8 * string, uword string_length, uword is_error) void os_puts (u8 * string, uword string_length, uword is_error) { - int cpu = os_get_cpu_number (); - int ncpus = os_get_ncpus (); + int cpu = os_get_thread_index (); + int nthreads = os_get_nthreads (); char buf[64]; int fd = is_error ? 2 : 1; struct iovec iovs[2]; int n_iovs = 0; - if (ncpus > 1) + if (nthreads > 1) { snprintf (buf, sizeof (buf), "%d: ", cpu); @@ -219,16 +221,9 @@ os_out_of_memory (void) os_panic (); } -uword os_get_cpu_number (void) __attribute__ ((weak)); -uword -os_get_cpu_number (void) -{ - return 0; -} - -uword os_get_ncpus (void) __attribute__ ((weak)); +uword os_get_nthreads (void) __attribute__ ((weak)); uword -os_get_ncpus (void) +os_get_nthreads (void) { return 1; } -- cgit 1.2.3-korg From 814813103bb4acb9ced39c22972bd5e97df13d33 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Tue, 16 May 2017 09:08:14 -0400 Subject: VPP-845: add configurable elog post-mortem dump Off by default. Enable via cmdline "... vlib { elog-post-mortem-dump } ..." Change-Id: I2056b9de9b37475f2bfeeb5404da838f1b42645a Signed-off-by: Dave Barach --- src/vlib/main.c | 20 ++++++++++++++++++++ src/vlib/main.h | 4 ++++ src/vlib/unix/main.c | 8 ++++---- src/vpp/vnet/main.c | 9 +++++++++ 4 files changed, 37 insertions(+), 4 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/main.c b/src/vlib/main.c index 422d3e26..8af1e7a9 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -707,6 +707,24 @@ elog_save_buffer (vlib_main_t * vm, return error; } +void +elog_post_mortem_dump (void) +{ + vlib_main_t *vm = &vlib_global_main; + elog_main_t *em = &vm->elog_main; + u8 *filename; + clib_error_t *error; + + if (!vm->elog_post_mortem_dump) + return; + + filename = format (0, "/tmp/elog_post_mortem.%d%c", getpid (), 0); + error = elog_write_file (em, (char *) filename, 1 /* flush ring */ ); + if (error) + clib_error_report (error); + vec_free (filename); +} + /* *INDENT-OFF* */ VLIB_CLI_COMMAND (elog_save_cli, static) = { .path = "event-logger save", @@ -1642,6 +1660,8 @@ vlib_main_configure (vlib_main_t * vm, unformat_input_t * input) else if (unformat (input, "elog-events %d", &vm->elog_main.event_ring_size)) ; + else if (unformat (input, "elog-post-mortem-dump")) + vm->elog_post_mortem_dump = 1; else return unformat_parse_error (input); } diff --git a/src/vlib/main.h b/src/vlib/main.h index 0e8026d1..bfa7ddbe 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -177,6 +177,10 @@ typedef struct vlib_main_t /* debugging */ volatile int parked_at_barrier; + + /* Attempt to do a post-mortem elog dump */ + int elog_post_mortem_dump; + } vlib_main_t; /* Global main structure. */ diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index 103576db..e31ea815 100644 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -409,12 +409,12 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) format_unformat_error, input); } + error = setup_signal_handlers (um); + if (error) + return error; + if (!(um->flags & UNIX_FLAG_INTERACTIVE)) { - error = setup_signal_handlers (um); - if (error) - return error; - openlog (vm->name, LOG_CONS | LOG_PERROR | LOG_PID, LOG_DAEMON); clib_error_register_handler (unix_error_handler, um); diff --git a/src/vpp/vnet/main.c b/src/vpp/vnet/main.c index d6a12325..ade32aa1 100644 --- a/src/vpp/vnet/main.c +++ b/src/vpp/vnet/main.c @@ -253,11 +253,13 @@ plugin_path_config (vlib_main_t * vm, unformat_input_t * input) VLIB_CONFIG_FUNCTION (plugin_path_config, "plugin_path"); void vl_msg_api_post_mortem_dump (void); +void elog_post_mortem_dump (void); void os_panic (void) { vl_msg_api_post_mortem_dump (); + elog_post_mortem_dump (); abort (); } @@ -280,6 +282,7 @@ os_exit (int code) recursion_block = 1; vl_msg_api_post_mortem_dump (); + elog_post_mortem_dump (); vhost_user_unmap_all (); abort (); } @@ -320,6 +323,12 @@ test_crash_command_fn (vlib_main_t * vm, { u64 *p = (u64 *) 0xdefec8ed; + ELOG_TYPE_DECLARE (e) = + { + .format = "deliberate crash: touching %x",.format_args = "i4",}; + + elog (&vm->elog_main, &e, 0xdefec8ed); + *p = 0xdeadbeef; /* Not so much... */ -- cgit 1.2.3-korg From f3b53643e87e7521c57cccc157385d2fa4bd0d80 Mon Sep 17 00:00:00 2001 From: Steven Date: Mon, 1 May 2017 14:03:02 -0700 Subject: vhost: migrate to use device infra for worker thread assignment, rx-mode. and add adaptive mode support to receive queue - Migrate vhost to use device infra which does the interface/queue to worker thread assignment. - Retire vhost thread CLI and corresponding code which assigns interface/queue to worker thread. set interface placement should be used instead to customize the interface/queue to worker thread assignment. - Retire vhost interrupt/polling option when creating vhost-user interface. Instead, set interface rx-mode should be used. - Add code in vnet_device_input_unassign_thread to change the node state to interrupt if the last polling interface has left the worker thread for the device of the corresponding interface/queue. - Add adaptive mode support. The node state is set to interrupt initially. When the scheduler detects a burst of traffic, it switches the input node to polling. Then we inform the device that we don't need interrupt notification. When the traffic subsides, the scheduler switches the input node back to interrupt. Then we immediately tell the driver that we want interrupt notification again. - Remove some duplicate code in vlib/main.c Change-Id: Id19bb1b9e50e6521c6464f470f5825c26924d3a8 Signed-off-by: Steven --- src/vat/api_format.c | 67 +-- src/vlib/main.c | 11 +- src/vnet/devices/devices.c | 20 + src/vnet/devices/virtio/vhost-user.c | 748 +++++++++++++------------------ src/vnet/devices/virtio/vhost-user.h | 45 +- src/vnet/devices/virtio/vhost_user_api.c | 7 +- 6 files changed, 342 insertions(+), 556 deletions(-) (limited to 'src/vlib') diff --git a/src/vat/api_format.c b/src/vat/api_format.c index aac59bb8..f3e6f64c 100644 --- a/src/vat/api_format.c +++ b/src/vat/api_format.c @@ -403,46 +403,6 @@ api_unformat_sw_if_index (unformat_input_t * input, va_list * args) } #endif /* VPP_API_TEST_BUILTIN */ -#define VHOST_USER_POLLING_MODE 0 -#define VHOST_USER_INTERRUPT_MODE 1 -#define VHOST_USER_ADAPTIVE_MODE 2 - -static u8 * -api_format_vhost_user_operation_mode (u8 * s, va_list * va) -{ - int operation_mode = va_arg (*va, int); - - switch (operation_mode) - { - case VHOST_USER_POLLING_MODE: - s = format (s, "%-9s", "polling"); - break; - case VHOST_USER_INTERRUPT_MODE: - s = format (s, "%-9s", "interrupt"); - break; - default: - s = format (s, "%-9s", "invalid"); - } - return s; -} - -static uword -api_unformat_vhost_user_operation_mode (unformat_input_t * input, - va_list * args) -{ - u8 *operation_mode = va_arg (*args, u8 *); - uword rc = 1; - - if (unformat (input, "interrupt")) - *operation_mode = VHOST_USER_INTERRUPT_MODE; - else if (unformat (input, "polling")) - *operation_mode = VHOST_USER_POLLING_MODE; - else - rc = 0; - - return rc; -} - static uword unformat_policer_rate_type (unformat_input_t * input, va_list * args) { @@ -11481,7 +11441,6 @@ api_create_vhost_user_if (vat_main_t * vam) u8 use_custom_mac = 0; u8 *tag = 0; int ret; - u8 operation_mode = VHOST_USER_POLLING_MODE; /* Shut up coverity */ memset (hwaddr, 0, sizeof (hwaddr)); @@ -11500,10 +11459,6 @@ api_create_vhost_user_if (vat_main_t * vam) is_server = 1; else if (unformat (i, "tag %s", &tag)) ; - else if (unformat (i, "mode %U", - api_unformat_vhost_user_operation_mode, - &operation_mode)) - ; else break; } @@ -11523,7 +11478,6 @@ api_create_vhost_user_if (vat_main_t * vam) M (CREATE_VHOST_USER_IF, mp); - mp->operation_mode = operation_mode; mp->is_server = is_server; clib_memcpy (mp->sock_filename, file_name, vec_len (file_name)); vec_free (file_name); @@ -11555,7 +11509,6 @@ api_modify_vhost_user_if (vat_main_t * vam) u8 sw_if_index_set = 0; u32 sw_if_index = (u32) ~ 0; int ret; - u8 operation_mode = VHOST_USER_POLLING_MODE; while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) { @@ -11571,10 +11524,6 @@ api_modify_vhost_user_if (vat_main_t * vam) ; else if (unformat (i, "server")) is_server = 1; - else if (unformat (i, "mode %U", - api_unformat_vhost_user_operation_mode, - &operation_mode)) - ; else break; } @@ -11600,7 +11549,6 @@ api_modify_vhost_user_if (vat_main_t * vam) M (MODIFY_VHOST_USER_IF, mp); - mp->operation_mode = operation_mode; mp->sw_if_index = ntohl (sw_if_index); mp->is_server = is_server; clib_memcpy (mp->sock_filename, file_name, vec_len (file_name)); @@ -11656,12 +11604,11 @@ static void vl_api_sw_interface_vhost_user_details_t_handler { vat_main_t *vam = &vat_main; - print (vam->ofp, "%-25s %3" PRIu32 " %6" PRIu32 " %8x %6d %7d %U %s", + print (vam->ofp, "%-25s %3" PRIu32 " %6" PRIu32 " %8x %6d %7d %s", (char *) mp->interface_name, ntohl (mp->sw_if_index), ntohl (mp->virtio_net_hdr_sz), clib_net_to_host_u64 (mp->features), mp->is_server, - ntohl (mp->num_regions), api_format_vhost_user_operation_mode, - mp->operation_mode, (char *) mp->sock_filename); + ntohl (mp->num_regions), (char *) mp->sock_filename); print (vam->ofp, " Status: '%s'", strerror (ntohl (mp->sock_errno))); } @@ -11690,7 +11637,6 @@ static void vl_api_sw_interface_vhost_user_details_t_handler_json vat_json_object_add_string_copy (node, "sock_filename", mp->sock_filename); vat_json_object_add_uint (node, "num_regions", ntohl (mp->num_regions)); vat_json_object_add_uint (node, "sock_errno", ntohl (mp->sock_errno)); - vat_json_object_add_uint (node, "mode", mp->operation_mode); } static int @@ -11700,8 +11646,7 @@ api_sw_interface_vhost_user_dump (vat_main_t * vam) vl_api_control_ping_t *mp_ping; int ret; print (vam->ofp, - "Interface name idx hdr_sz features server regions mode" - " filename"); + "Interface name idx hdr_sz features server regions filename"); /* Get list of vhost-user interfaces */ M (SW_INTERFACE_VHOST_USER_DUMP, mp); @@ -19001,12 +18946,10 @@ _(l2_interface_vlan_tag_rewrite, \ "[translate-2-[1|2]] [push_dot1q 0] tag1 tag2 ") \ _(create_vhost_user_if, \ "socket [server] [renumber ] " \ - "[mac ] " \ - "[mode ]") \ + "[mac ]") \ _(modify_vhost_user_if, \ " | sw_if_index socket \n" \ - "[server] [renumber ] " \ - "[mode ]") \ + "[server] [renumber ]") \ _(delete_vhost_user_if, " | sw_if_index ") \ _(sw_interface_vhost_user_dump, "") \ _(show_version, "") \ diff --git a/src/vlib/main.c b/src/vlib/main.c index 8af1e7a9..0e6d66cd 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -1473,20 +1473,11 @@ vlib_main_or_worker_loop (vlib_main_t * vm, int is_main) if (!nm->interrupt_threshold_vector_length) nm->interrupt_threshold_vector_length = 5; - if (is_main) - { - if (!nm->polling_threshold_vector_length) - nm->polling_threshold_vector_length = 10; - if (!nm->interrupt_threshold_vector_length) - nm->interrupt_threshold_vector_length = 5; - - nm->current_process_index = ~0; - } - /* Start all processes. */ if (is_main) { uword i; + nm->current_process_index = ~0; for (i = 0; i < vec_len (nm->processes); i++) cpu_time_now = dispatch_process (vm, nm->processes[i], /* frame */ 0, cpu_time_now); diff --git a/src/vnet/devices/devices.c b/src/vnet/devices/devices.c index e71be602..58c72077 100644 --- a/src/vnet/devices/devices.c +++ b/src/vnet/devices/devices.c @@ -150,6 +150,7 @@ vnet_hw_interface_assign_rx_thread (vnet_main_t * vnm, u32 hw_if_index, dq->dev_instance = hw->dev_instance; dq->queue_id = queue_id; dq->mode = VNET_HW_INTERFACE_RX_MODE_POLLING; + rt->enabled_node_state = VLIB_NODE_STATE_POLLING; vnet_device_queue_update (vnm, rt); vec_validate (hw->input_node_thread_index_by_queue, queue_id); @@ -168,6 +169,7 @@ vnet_hw_interface_unassign_rx_thread (vnet_main_t * vnm, u32 hw_if_index, vnet_device_input_runtime_t *rt; vnet_device_and_queue_t *dq; uword old_thread_index; + vnet_hw_interface_rx_mode mode; if (hw->input_node_thread_index_by_queue == 0) return VNET_API_ERROR_INVALID_INTERFACE; @@ -184,6 +186,7 @@ vnet_hw_interface_unassign_rx_thread (vnet_main_t * vnm, u32 hw_if_index, vec_foreach (dq, rt->devices_and_queues) if (dq->hw_if_index == hw_if_index && dq->queue_id == queue_id) { + mode = dq->mode; vec_del1 (rt->devices_and_queues, dq - rt->devices_and_queues); goto deleted; } @@ -197,6 +200,23 @@ deleted: if (vec_len (rt->devices_and_queues) == 0) vlib_node_set_state (vm, hw->input_node_index, VLIB_NODE_STATE_DISABLED); + else if (mode == VNET_HW_INTERFACE_RX_MODE_POLLING) + { + /* + * if the deleted interface is polling, we may need to set the node state + * to interrupt if there is no more polling interface for this device's + * corresponding thread. This is because mixed interfaces + * (polling and interrupt), assigned to the same thread, set the + * thread to polling prior to the deletion. + */ + vec_foreach (dq, rt->devices_and_queues) + { + if (dq->mode == VNET_HW_INTERFACE_RX_MODE_POLLING) + return 0; + } + rt->enabled_node_state = VLIB_NODE_STATE_INTERRUPT; + vlib_node_set_state (vm, hw->input_node_index, rt->enabled_node_state); + } return 0; } diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c index 451ae434..23188934 100644 --- a/src/vnet/devices/virtio/vhost-user.c +++ b/src/vnet/devices/virtio/vhost-user.c @@ -362,140 +362,71 @@ vhost_user_tx_thread_placement (vhost_user_intf_t * vui) } } +/** + * @brief Unassign existing interface/queue to thread mappings and re-assign + * new interface/queue to thread mappings + */ static void vhost_user_rx_thread_placement () { vhost_user_main_t *vum = &vhost_user_main; vhost_user_intf_t *vui; - vhost_cpu_t *vhc; - u32 *workers = 0; - u32 thread_index; - vlib_main_t *vm; - - //Let's list all workers cpu indexes - u32 i; - for (i = vum->input_cpu_first_index; - i < vum->input_cpu_first_index + vum->input_cpu_count; i++) - { - vlib_node_set_state (vlib_mains[i], vhost_user_input_node.index, - VLIB_NODE_STATE_DISABLED); - vec_add1 (workers, i); - } - - vec_foreach (vhc, vum->cpus) - { - vec_reset_length (vhc->rx_queues); - } + vhost_user_vring_t *txvq; + vnet_main_t *vnm = vnet_get_main (); + u32 qid; + int rv; + u16 *queue; - i = 0; - vhost_iface_and_queue_t iaq; + // Scrap all existing mappings for all interfaces/queues /* *INDENT-OFF* */ pool_foreach (vui, vum->vhost_user_interfaces, { - u32 *vui_workers = vec_len (vui->workers) ? vui->workers : workers; - u32 qid; - for (qid = 0; qid < VHOST_VRING_MAX_N / 2; qid++) + vec_foreach (queue, vui->rx_queues) { - vhost_user_vring_t *txvq = - &vui->vrings[VHOST_VRING_IDX_TX (qid)]; - if (!txvq->started) - continue; - - i %= vec_len (vui_workers); - thread_index = vui_workers[i]; - i++; - vhc = &vum->cpus[thread_index]; - txvq->interrupt_thread_index = thread_index; - - iaq.qid = qid; - iaq.vhost_iface_index = vui - vum->vhost_user_interfaces; - vec_add1 (vhc->rx_queues, iaq); + rv = vnet_hw_interface_unassign_rx_thread (vnm, vui->hw_if_index, + *queue); + if (rv) + clib_warning ("Warning: unable to unassign interface %d, " + "queue %d: rc=%d", vui->hw_if_index, *queue, rv); } + vec_reset_length (vui->rx_queues); }); /* *INDENT-ON* */ - vec_foreach (vhc, vum->cpus) - { - vhost_iface_and_queue_t *vhiq; - u8 mode = VHOST_USER_INTERRUPT_MODE; - - vec_foreach (vhiq, vhc->rx_queues) - { - vui = &vum->vhost_user_interfaces[vhiq->vhost_iface_index]; - if (vui->operation_mode == VHOST_USER_POLLING_MODE) + // Create the rx_queues for all interfaces + /* *INDENT-OFF* */ + pool_foreach (vui, vum->vhost_user_interfaces, { + for (qid = 0; qid < VHOST_VRING_MAX_N / 2; qid++) { - /* At least one interface is polling, cpu is set to polling */ - mode = VHOST_USER_POLLING_MODE; - break; + txvq = &vui->vrings[VHOST_VRING_IDX_TX (qid)]; + if (txvq->started) + { + if (txvq->mode == VNET_HW_INTERFACE_RX_MODE_UNKNOWN) + /* Set polling as the default */ + txvq->mode = VNET_HW_INTERFACE_RX_MODE_POLLING; + vec_add1 (vui->rx_queues, qid); + } } - } - vhc->operation_mode = mode; - } - - for (thread_index = vum->input_cpu_first_index; - thread_index < vum->input_cpu_first_index + vum->input_cpu_count; - thread_index++) - { - vlib_node_state_t state = VLIB_NODE_STATE_POLLING; + }); + /* *INDENT-ON* */ - vhc = &vum->cpus[thread_index]; - vm = vlib_mains ? vlib_mains[thread_index] : &vlib_global_main; - switch (vhc->operation_mode) + // Assign new mappings for all interfaces/queues + /* *INDENT-OFF* */ + pool_foreach (vui, vum->vhost_user_interfaces, { + vnet_hw_interface_set_input_node (vnm, vui->hw_if_index, + vhost_user_input_node.index); + vec_foreach (queue, vui->rx_queues) { - case VHOST_USER_INTERRUPT_MODE: - state = VLIB_NODE_STATE_INTERRUPT; - break; - case VHOST_USER_POLLING_MODE: - state = VLIB_NODE_STATE_POLLING; - break; - default: - clib_warning ("BUG: bad operation mode %d", vhc->operation_mode); - break; + vnet_hw_interface_assign_rx_thread (vnm, vui->hw_if_index, *queue, + ~0); + txvq = &vui->vrings[VHOST_VRING_IDX_TX (*queue)]; + rv = vnet_hw_interface_set_rx_mode (vnm, vui->hw_if_index, *queue, + txvq->mode); + if (rv) + clib_warning ("Warning: unable to set rx mode for interface %d, " + "queue %d: rc=%d", vui->hw_if_index, *queue, rv); } - vlib_node_set_state (vm, vhost_user_input_node.index, state); - } - - vec_free (workers); -} - -static int -vhost_user_thread_placement (u32 sw_if_index, u32 worker_thread_index, u8 del) -{ - vhost_user_main_t *vum = &vhost_user_main; - vhost_user_intf_t *vui; - vnet_hw_interface_t *hw; - - if (worker_thread_index < vum->input_cpu_first_index || - worker_thread_index >= - vum->input_cpu_first_index + vum->input_cpu_count) - return -1; - - if (!(hw = vnet_get_sup_hw_interface (vnet_get_main (), sw_if_index))) - return -2; - - vui = pool_elt_at_index (vum->vhost_user_interfaces, hw->dev_instance); - u32 found = ~0, *w; - vec_foreach (w, vui->workers) - { - if (*w == worker_thread_index) - { - found = w - vui->workers; - break; - } - } - - if (del) - { - if (found == ~0) - return -3; - vec_del1 (vui->workers, found); - } - else if (found == ~0) - { - vec_add1 (vui->workers, worker_thread_index); - } - - vhost_user_rx_thread_placement (); - return 0; + }); + /* *INDENT-ON* */ } /** @brief Returns whether at least one TX and one RX vring are enabled */ @@ -532,37 +463,17 @@ vhost_user_update_iface_state (vhost_user_intf_t * vui) static void vhost_user_set_interrupt_pending (vhost_user_intf_t * vui, u32 ifq) { - vhost_user_main_t *vum = &vhost_user_main; - vhost_cpu_t *vhc; - u32 thread_index; - vlib_main_t *vm; - u32 ifq2, qid; - vhost_user_vring_t *txvq; + u32 qid; + vnet_main_t *vnm = vnet_get_main (); qid = ifq & 0xff; - if ((qid % 2) == 0) - /* Only care about the odd number virtqueue which is TX */ + if ((qid & 1) == 0) + /* Only care about the odd number, or TX, virtqueue */ return; if (vhost_user_intf_ready (vui)) - { - txvq = &vui->vrings[qid]; - thread_index = txvq->interrupt_thread_index; - vhc = &vum->cpus[thread_index]; - if (vhc->operation_mode == VHOST_USER_INTERRUPT_MODE) - { - vm = vlib_mains ? vlib_mains[thread_index] : &vlib_global_main; - /* - * Convert virtqueue number in the lower byte to vring - * queue index for the input node process. Top bytes contain - * the interface, lower byte contains the queue index. - */ - ifq2 = ((ifq >> 8) << 8) | qid / 2; - vhc->pending_input_bitmap = - clib_bitmap_set (vhc->pending_input_bitmap, ifq2, 1); - vlib_node_set_interrupt_pending (vm, vhost_user_input_node.index); - } - } + // qid >> 1 is to convert virtqueue number to vring queue index + vnet_device_input_set_interrupt_pending (vnm, vui->hw_if_index, qid >> 1); } static clib_error_t * @@ -570,14 +481,10 @@ vhost_user_callfd_read_ready (unix_file_t * uf) { __attribute__ ((unused)) int n; u8 buff[8]; - vhost_user_intf_t *vui = - pool_elt_at_index (vhost_user_main.vhost_user_interfaces, - uf->private_data >> 8); n = read (uf->file_descriptor, ((char *) &buff), 8); DBG_SOCK ("if %d CALL queue %d", uf->private_data >> 8, uf->private_data & 0xff); - vhost_user_set_interrupt_pending (vui, uf->private_data); return 0; } @@ -1001,12 +908,8 @@ vhost_user_socket_read (unix_file_t * uf) vui->vrings[msg.state.index].last_avail_idx = vui->vrings[msg.state.index].used->idx; - if (vui->operation_mode == VHOST_USER_POLLING_MODE) - /* tell driver that we don't want interrupts */ - vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY; - else - /* tell driver that we want interrupts */ - vui->vrings[msg.state.index].used->flags = 0; + /* tell driver that we don't want interrupts */ + vui->vrings[msg.state.index].used->flags = VRING_USED_F_NO_NOTIFY; break; case VHOST_USER_SET_OWNER: @@ -1315,8 +1218,6 @@ vhost_user_init (vlib_main_t * vm) clib_error_t *error; vhost_user_main_t *vum = &vhost_user_main; vlib_thread_main_t *tm = vlib_get_thread_main (); - vlib_thread_registration_t *tr; - uword *p; error = vlib_call_init_function (vm, ip4_init); if (error) @@ -1335,18 +1236,6 @@ vhost_user_init (vlib_main_t * vm) cpu->rx_buffers_len = 0; } - /* find out which cpus will be used for input */ - vum->input_cpu_first_index = 0; - vum->input_cpu_count = 1; - p = hash_get_mem (tm->thread_registrations_by_name, "workers"); - tr = p ? (vlib_thread_registration_t *) p[0] : 0; - - if (tr && tr->count > 0) - { - vum->input_cpu_first_index = tr->first_index; - vum->input_cpu_count = tr->count; - } - vum->random = random_default_seed (); mhash_init_c_string (&vum->if_index_by_sock_name, sizeof (uword)); @@ -1447,9 +1336,16 @@ vhost_user_send_call (vlib_main_t * vm, vhost_user_vring_t * vq) vhost_user_main_t *vum = &vhost_user_main; u64 x = 1; int fd = UNIX_GET_FD (vq->callfd_idx); - int rv __attribute__ ((unused)); - /* TODO: pay attention to rv */ + int rv; + rv = write (fd, &x, sizeof (x)); + if (rv <= 0) + { + clib_unix_warning + ("Error: Could not write to unix socket for callfd %d", fd); + return; + } + vq->n_since_last_int = 0; vq->int_deadline = vlib_time_now (vm) + vum->coalesce_time; } @@ -1564,7 +1460,8 @@ static u32 vhost_user_if_input (vlib_main_t * vm, vhost_user_main_t * vum, vhost_user_intf_t * vui, - u16 qid, vlib_node_runtime_t * node) + u16 qid, vlib_node_runtime_t * node, + vnet_hw_interface_rx_mode mode) { vhost_user_vring_t *txvq = &vui->vrings[VHOST_VRING_IDX_TX (qid)]; u16 n_rx_packets = 0; @@ -1590,6 +1487,26 @@ vhost_user_if_input (vlib_main_t * vm, vhost_user_send_call (vm, rxvq); } + /* + * For adaptive mode, it is optimized to reduce interrupts. + * If the scheduler switches the input node to polling due + * to burst of traffic, we tell the driver no interrupt. + * When the traffic subsides, the scheduler switches the node back to + * interrupt mode. We must tell the driver we want interrupt. + */ + if (PREDICT_FALSE (mode == VNET_HW_INTERFACE_RX_MODE_ADAPTIVE)) + { + if ((node->flags & + VLIB_NODE_FLAG_SWITCH_FROM_POLLING_TO_INTERRUPT_MODE) || + !(node->flags & + VLIB_NODE_FLAG_SWITCH_FROM_INTERRUPT_TO_POLLING_MODE)) + /* Tell driver we want notification */ + txvq->used->flags = 0; + else + /* Tell driver we don't want notification */ + txvq->used->flags = VRING_USED_F_NO_NOTIFY; + } + if (PREDICT_FALSE (txvq->avail->flags & 0xFFFE)) return 0; @@ -1926,34 +1843,23 @@ vhost_user_input (vlib_main_t * vm, { vhost_user_main_t *vum = &vhost_user_main; uword n_rx_packets = 0; - u32 thread_index = vlib_get_thread_index (); - vhost_iface_and_queue_t *vhiq; vhost_user_intf_t *vui; - vhost_cpu_t *vhc; + vnet_device_input_runtime_t *rt = + (vnet_device_input_runtime_t *) node->runtime_data; + vnet_device_and_queue_t *dq; - vhc = &vum->cpus[thread_index]; - if (PREDICT_TRUE (vhc->operation_mode == VHOST_USER_POLLING_MODE)) - { - vec_foreach (vhiq, vum->cpus[thread_index].rx_queues) + vec_foreach (dq, rt->devices_and_queues) + { + if (clib_smp_swap (&dq->interrupt_pending, 0) || + (node->state == VLIB_NODE_STATE_POLLING)) { - vui = &vum->vhost_user_interfaces[vhiq->vhost_iface_index]; - n_rx_packets += vhost_user_if_input (vm, vum, vui, vhiq->qid, node); + vui = + pool_elt_at_index (vum->vhost_user_interfaces, dq->dev_instance); + n_rx_packets = vhost_user_if_input (vm, vum, vui, dq->queue_id, node, + dq->mode); } - } - else - { - int i; - - /* *INDENT-OFF* */ - clib_bitmap_foreach (i, vhc->pending_input_bitmap, ({ - int qid = i & 0xff; + } - clib_bitmap_set (vhc->pending_input_bitmap, i, 0); - vui = pool_elt_at_index (vum->vhost_user_interfaces, i >> 8); - n_rx_packets += vhost_user_if_input (vm, vum, vui, qid, node); - })); - /* *INDENT-ON* */ - } return n_rx_packets; } @@ -2371,6 +2277,161 @@ done3: return frame->n_vectors; } +static uword +vhost_user_send_interrupt_process (vlib_main_t * vm, + vlib_node_runtime_t * rt, vlib_frame_t * f) +{ + vhost_user_intf_t *vui; + f64 timeout = 3153600000.0 /* 100 years */ ; + uword event_type, *event_data = 0; + vhost_user_main_t *vum = &vhost_user_main; + u16 *queue; + f64 now, poll_time_remaining; + f64 next_timeout; + u8 stop_timer = 0; + + while (1) + { + poll_time_remaining = + vlib_process_wait_for_event_or_clock (vm, timeout); + event_type = vlib_process_get_events (vm, &event_data); + vec_reset_length (event_data); + + /* + * Use the remaining timeout if it is less than coalesce time to avoid + * resetting the existing timer in the middle of expiration + */ + timeout = poll_time_remaining; + if (vlib_process_suspend_time_is_zero (timeout) || + (timeout > vum->coalesce_time)) + timeout = vum->coalesce_time; + + now = vlib_time_now (vm); + switch (event_type) + { + case VHOST_USER_EVENT_STOP_TIMER: + stop_timer = 1; + break; + + case VHOST_USER_EVENT_START_TIMER: + stop_timer = 0; + if (!vlib_process_suspend_time_is_zero (poll_time_remaining)) + break; + /* fall through */ + + case ~0: + /* *INDENT-OFF* */ + pool_foreach (vui, vum->vhost_user_interfaces, { + next_timeout = timeout; + vec_foreach (queue, vui->rx_queues) + { + vhost_user_vring_t *rxvq = + &vui->vrings[VHOST_VRING_IDX_RX (*queue)]; + vhost_user_vring_t *txvq = + &vui->vrings[VHOST_VRING_IDX_TX (*queue)]; + + if (txvq->n_since_last_int) + { + if (now >= txvq->int_deadline) + vhost_user_send_call (vm, txvq); + else + next_timeout = txvq->int_deadline - now; + } + + if (rxvq->n_since_last_int) + { + if (now >= rxvq->int_deadline) + vhost_user_send_call (vm, rxvq); + else + next_timeout = rxvq->int_deadline - now; + } + + if ((next_timeout < timeout) && (next_timeout > 0.0)) + timeout = next_timeout; + } + }); + /* *INDENT-ON* */ + break; + + default: + clib_warning ("BUG: unhandled event type %d", event_type); + break; + } + /* No less than 1 millisecond */ + if (timeout < 1e-3) + timeout = 1e-3; + if (stop_timer) + timeout = 3153600000.0; + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (vhost_user_send_interrupt_node,static) = { + .function = vhost_user_send_interrupt_process, + .type = VLIB_NODE_TYPE_PROCESS, + .name = "vhost-user-send-interrupt-process", +}; +/* *INDENT-ON* */ + +static clib_error_t * +vhost_user_interface_rx_mode_change (vnet_main_t * vnm, u32 hw_if_index, + u32 qid, vnet_hw_interface_rx_mode mode) +{ + vlib_main_t *vm = vnm->vlib_main; + vnet_hw_interface_t *hif = vnet_get_hw_interface (vnm, hw_if_index); + vhost_user_main_t *vum = &vhost_user_main; + vhost_user_intf_t *vui = + pool_elt_at_index (vum->vhost_user_interfaces, hif->dev_instance); + vhost_user_vring_t *txvq = &vui->vrings[VHOST_VRING_IDX_TX (qid)]; + + if ((mode == VNET_HW_INTERFACE_RX_MODE_INTERRUPT) || + (mode == VNET_HW_INTERFACE_RX_MODE_ADAPTIVE)) + { + if (txvq->mode == VNET_HW_INTERFACE_RX_MODE_POLLING) + { + vum->ifq_count++; + // Start the timer if this is the first encounter on interrupt + // interface/queue + if ((vum->ifq_count == 1) && + (vum->coalesce_time > 0.0) && (vum->coalesce_frames > 0)) + vlib_process_signal_event (vm, + vhost_user_send_interrupt_node.index, + VHOST_USER_EVENT_START_TIMER, 0); + } + } + else if (mode == VNET_HW_INTERFACE_RX_MODE_POLLING) + { + if (((txvq->mode == VNET_HW_INTERFACE_RX_MODE_INTERRUPT) || + (txvq->mode == VNET_HW_INTERFACE_RX_MODE_ADAPTIVE)) && + vum->ifq_count) + { + vum->ifq_count--; + // Stop the timer if there is no more interrupt interface/queue + if ((vum->ifq_count == 0) && + (vum->coalesce_time > 0.0) && (vum->coalesce_frames > 0)) + vlib_process_signal_event (vm, + vhost_user_send_interrupt_node.index, + VHOST_USER_EVENT_STOP_TIMER, 0); + } + } + + txvq->mode = mode; + if (mode == VNET_HW_INTERFACE_RX_MODE_POLLING) + txvq->used->flags = VRING_USED_F_NO_NOTIFY; + else if ((mode == VNET_HW_INTERFACE_RX_MODE_ADAPTIVE) || + (mode == VNET_HW_INTERFACE_RX_MODE_INTERRUPT)) + txvq->used->flags = 0; + else + { + clib_warning ("BUG: unhandled mode %d changed for if %d queue %d", mode, + hw_if_index, qid); + return clib_error_return (0, "unsupported"); + } + + return 0; +} + static clib_error_t * vhost_user_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) @@ -2399,6 +2460,7 @@ VNET_DEVICE_CLASS (vhost_user_dev_class,static) = { .format_device_name = format_vhost_user_interface_name, .name_renumber = vhost_user_name_renumber, .admin_up_down_function = vhost_user_interface_admin_up_down, + .rx_mode_change_function = vhost_user_interface_rx_mode_change, .format_tx_trace = format_vhost_trace, }; @@ -2523,8 +2585,6 @@ vhost_user_term_if (vhost_user_intf_t * vui) int q; vhost_user_main_t *vum = &vhost_user_main; - // Delete configured thread pinning - vec_reset_length (vui->workers); // disconnect interface sockets vhost_user_if_disconnect (vui); vhost_user_update_iface_state (vui); @@ -2555,6 +2615,7 @@ vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_index) vhost_user_intf_t *vui; int rv = 0; vnet_hw_interface_t *hwif; + u16 *queue; if (!(hwif = vnet_get_sup_hw_interface (vnm, sw_if_index)) || hwif->dev_class_index != vhost_user_dev_class.index) @@ -2565,6 +2626,28 @@ vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_index) vui = pool_elt_at_index (vum->vhost_user_interfaces, hwif->dev_instance); + vec_foreach (queue, vui->rx_queues) + { + vhost_user_vring_t *txvq; + + txvq = &vui->vrings[VHOST_VRING_IDX_TX (*queue)]; + if ((vum->ifq_count > 0) && + ((txvq->mode == VNET_HW_INTERFACE_RX_MODE_INTERRUPT) || + (txvq->mode == VNET_HW_INTERFACE_RX_MODE_ADAPTIVE))) + { + vum->ifq_count--; + // Stop the timer if there is no more interrupt interface/queue + if ((vum->ifq_count == 0) && + (vum->coalesce_time > 0.0) && (vum->coalesce_frames > 0)) + { + vlib_process_signal_event (vm, + vhost_user_send_interrupt_node.index, + VHOST_USER_EVENT_STOP_TIMER, 0); + break; + } + } + } + // Disable and reset interface vhost_user_term_if (vui); @@ -2687,13 +2770,15 @@ vhost_user_vui_init (vnet_main_t * vnm, vhost_user_intf_t * vui, int server_sock_fd, const char *sock_filename, - u64 feature_mask, u32 * sw_if_index, u8 operation_mode) + u64 feature_mask, u32 * sw_if_index) { vnet_sw_interface_t *sw; - sw = vnet_get_hw_sw_interface (vnm, vui->hw_if_index); int q; vhost_user_main_t *vum = &vhost_user_main; + vnet_hw_interface_t *hw; + hw = vnet_get_hw_interface (vnm, vui->hw_if_index); + sw = vnet_get_hw_sw_interface (vnm, vui->hw_if_index); if (server_sock_fd != -1) { unix_file_t template = { 0 }; @@ -2715,7 +2800,6 @@ vhost_user_vui_init (vnet_main_t * vnm, vui->feature_mask = feature_mask; vui->unix_file_index = ~0; vui->log_base_addr = 0; - vui->operation_mode = operation_mode; vui->if_index = vui - vum->vhost_user_interfaces; mhash_set_mem (&vum->if_index_by_sock_name, vui->sock_filename, &vui->if_index, 0); @@ -2723,6 +2807,7 @@ vhost_user_vui_init (vnet_main_t * vnm, for (q = 0; q < VHOST_VRING_MAX_N; q++) vhost_user_vring_init (vui, q); + hw->flags |= VNET_HW_INTERFACE_FLAG_SUPPORTS_INT_MODE; vnet_hw_interface_set_flags (vnm, vui->hw_if_index, 0); if (sw_if_index) @@ -2740,106 +2825,13 @@ vhost_user_vui_init (vnet_main_t * vnm, vhost_user_tx_thread_placement (vui); } -static uword -vhost_user_send_interrupt_process (vlib_main_t * vm, - vlib_node_runtime_t * rt, vlib_frame_t * f) -{ - vhost_user_intf_t *vui; - f64 timeout = 3153600000.0 /* 100 years */ ; - uword event_type, *event_data = 0; - vhost_user_main_t *vum = &vhost_user_main; - vhost_iface_and_queue_t *vhiq; - vhost_cpu_t *vhc; - f64 now, poll_time_remaining; - - while (1) - { - poll_time_remaining = - vlib_process_wait_for_event_or_clock (vm, timeout); - event_type = vlib_process_get_events (vm, &event_data); - vec_reset_length (event_data); - - /* - * Use the remaining timeout if it is less than coalesce time to avoid - * resetting the existing timer in the middle of expiration - */ - timeout = poll_time_remaining; - if (vlib_process_suspend_time_is_zero (timeout) || - (timeout > vum->coalesce_time)) - timeout = vum->coalesce_time; - - now = vlib_time_now (vm); - switch (event_type) - { - case VHOST_USER_EVENT_START_TIMER: - if (!vlib_process_suspend_time_is_zero (poll_time_remaining)) - break; - /* fall through */ - - case ~0: - vec_foreach (vhc, vum->cpus) - { - u32 thread_index = vhc - vum->cpus; - f64 next_timeout; - - next_timeout = timeout; - vec_foreach (vhiq, vum->cpus[thread_index].rx_queues) - { - vui = &vum->vhost_user_interfaces[vhiq->vhost_iface_index]; - vhost_user_vring_t *rxvq = - &vui->vrings[VHOST_VRING_IDX_RX (vhiq->qid)]; - vhost_user_vring_t *txvq = - &vui->vrings[VHOST_VRING_IDX_TX (vhiq->qid)]; - - if (txvq->n_since_last_int) - { - if (now >= txvq->int_deadline) - vhost_user_send_call (vm, txvq); - else - next_timeout = txvq->int_deadline - now; - } - - if (rxvq->n_since_last_int) - { - if (now >= rxvq->int_deadline) - vhost_user_send_call (vm, rxvq); - else - next_timeout = rxvq->int_deadline - now; - } - - if ((next_timeout < timeout) && (next_timeout > 0.0)) - timeout = next_timeout; - } - } - break; - - default: - clib_warning ("BUG: unhandled event type %d", event_type); - break; - } - /* No less than 1 millisecond */ - if (timeout < 1e-3) - timeout = 1e-3; - } - return 0; -} - -/* *INDENT-OFF* */ -VLIB_REGISTER_NODE (vhost_user_send_interrupt_node,static) = { - .function = vhost_user_send_interrupt_process, - .type = VLIB_NODE_TYPE_PROCESS, - .name = "vhost-user-send-interrupt-process", -}; -/* *INDENT-ON* */ - int vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, const char *sock_filename, u8 is_server, u32 * sw_if_index, u64 feature_mask, - u8 renumber, u32 custom_dev_instance, u8 * hwaddr, - u8 operation_mode) + u8 renumber, u32 custom_dev_instance, u8 * hwaddr) { vhost_user_intf_t *vui = NULL; u32 sw_if_idx = ~0; @@ -2848,10 +2840,6 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, vhost_user_main_t *vum = &vhost_user_main; uword *if_index; - if ((operation_mode != VHOST_USER_POLLING_MODE) && - (operation_mode != VHOST_USER_INTERRUPT_MODE)) - return VNET_API_ERROR_UNIMPLEMENTED; - if (sock_filename == NULL || !(strlen (sock_filename) > 0)) { return VNET_API_ERROR_INVALID_ARGUMENT; @@ -2881,7 +2869,7 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, vhost_user_create_ethernet (vnm, vm, vui, hwaddr); vhost_user_vui_init (vnm, vui, server_sock_fd, sock_filename, - feature_mask, &sw_if_idx, operation_mode); + feature_mask, &sw_if_idx); if (renumber) vnet_interface_name_renumber (sw_if_idx, custom_dev_instance); @@ -2892,14 +2880,6 @@ vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, // Process node must connect vlib_process_signal_event (vm, vhost_user_process_node.index, 0, 0); - if ((operation_mode == VHOST_USER_INTERRUPT_MODE) && - !vum->interrupt_mode && (vum->coalesce_time > 0.0) && - (vum->coalesce_frames > 0)) - { - vum->interrupt_mode = 1; - vlib_process_signal_event (vm, vhost_user_send_interrupt_node.index, - VHOST_USER_EVENT_START_TIMER, 0); - } return rv; } @@ -2908,8 +2888,7 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, const char *sock_filename, u8 is_server, u32 sw_if_index, - u64 feature_mask, u8 renumber, u32 custom_dev_instance, - u8 operation_mode) + u64 feature_mask, u8 renumber, u32 custom_dev_instance) { vhost_user_main_t *vum = &vhost_user_main; vhost_user_intf_t *vui = NULL; @@ -2919,9 +2898,6 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, vnet_hw_interface_t *hwif; uword *if_index; - if ((operation_mode != VHOST_USER_POLLING_MODE) && - (operation_mode != VHOST_USER_INTERRUPT_MODE)) - return VNET_API_ERROR_UNIMPLEMENTED; if (!(hwif = vnet_get_sup_hw_interface (vnm, sw_if_index)) || hwif->dev_class_index != vhost_user_dev_class.index) return VNET_API_ERROR_INVALID_SW_IF_INDEX; @@ -2947,8 +2923,7 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, vhost_user_term_if (vui); vhost_user_vui_init (vnm, vui, server_sock_fd, - sock_filename, feature_mask, &sw_if_idx, - operation_mode); + sock_filename, feature_mask, &sw_if_idx); if (renumber) vnet_interface_name_renumber (sw_if_idx, custom_dev_instance); @@ -2956,33 +2931,9 @@ vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, // Process node must connect vlib_process_signal_event (vm, vhost_user_process_node.index, 0, 0); - if ((operation_mode == VHOST_USER_INTERRUPT_MODE) && - !vum->interrupt_mode && (vum->coalesce_time > 0.0) && - (vum->coalesce_frames > 0)) - { - vum->interrupt_mode = 1; - vlib_process_signal_event (vm, vhost_user_send_interrupt_node.index, - VHOST_USER_EVENT_START_TIMER, 0); - } return rv; } -static uword -unformat_vhost_user_operation_mode (unformat_input_t * input, va_list * args) -{ - u8 *operation_mode = va_arg (*args, u8 *); - uword rc = 1; - - if (unformat (input, "interrupt")) - *operation_mode = VHOST_USER_INTERRUPT_MODE; - else if (unformat (input, "polling")) - *operation_mode = VHOST_USER_POLLING_MODE; - else - rc = 0; - - return rc; -} - clib_error_t * vhost_user_connect_command_fn (vlib_main_t * vm, unformat_input_t * input, @@ -2998,7 +2949,6 @@ vhost_user_connect_command_fn (vlib_main_t * vm, u8 hwaddr[6]; u8 *hw = NULL; clib_error_t *error = NULL; - u8 operation_mode = VHOST_USER_POLLING_MODE; /* Get a line of input. */ if (!unformat_user (input, unformat_line_input, line_input)) @@ -3020,9 +2970,6 @@ vhost_user_connect_command_fn (vlib_main_t * vm, { renumber = 1; } - else if (unformat (line_input, "mode %U", - unformat_vhost_user_operation_mode, &operation_mode)) - ; else { error = clib_error_return (0, "unknown input `%U'", @@ -3036,8 +2983,7 @@ vhost_user_connect_command_fn (vlib_main_t * vm, int rv; if ((rv = vhost_user_create_if (vnm, vm, (char *) sock_filename, is_server, &sw_if_index, feature_mask, - renumber, custom_dev_instance, hw, - operation_mode))) + renumber, custom_dev_instance, hw))) { error = clib_error_return (0, "vhost_user_create_if returned %d", rv); goto done; @@ -3127,7 +3073,6 @@ vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm, vui = pool_elt_at_index (vum->vhost_user_interfaces, hi->dev_instance); vec_add2 (r_vuids, vuid, 1); - vuid->operation_mode = vui->operation_mode; vuid->sw_if_index = vui->sw_if_index; vuid->virtio_net_hdr_sz = vui->virtio_net_hdr_sz; vuid->features = vui->features; @@ -3152,25 +3097,6 @@ vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm, return rv; } -static u8 * -format_vhost_user_operation_mode (u8 * s, va_list * va) -{ - int operation_mode = va_arg (*va, int); - - switch (operation_mode) - { - case VHOST_USER_POLLING_MODE: - s = format (s, "%s", "polling"); - break; - case VHOST_USER_INTERRUPT_MODE: - s = format (s, "%s", "interrupt"); - break; - default: - s = format (s, "%s", "invalid"); - } - return s; -} - clib_error_t * show_vhost_user_command_fn (vlib_main_t * vm, unformat_input_t * input, @@ -3182,10 +3108,8 @@ show_vhost_user_command_fn (vlib_main_t * vm, vhost_user_intf_t *vui; u32 hw_if_index, *hw_if_indices = 0; vnet_hw_interface_t *hi; - vhost_cpu_t *vhc; - vhost_iface_and_queue_t *vhiq; + u16 *queue; u32 ci; - int i, j, q; int show_descr = 0; struct feat_struct @@ -3238,6 +3162,8 @@ show_vhost_user_command_fn (vlib_main_t * vm, vlib_cli_output (vm, "Virtio vhost-user interfaces"); vlib_cli_output (vm, "Global:\n coalesce frames %d time %e", vum->coalesce_frames, vum->coalesce_time); + vlib_cli_output (vm, " number of rx virtqueues in interrupt mode: %d", + vum->ifq_count); for (i = 0; i < vec_len (hw_if_indices); i++) { @@ -3279,23 +3205,21 @@ show_vhost_user_command_fn (vlib_main_t * vm, (vui->unix_server_index != ~0) ? "server" : "client", strerror (vui->sock_errno)); - vlib_cli_output (vm, " configured mode: %U\n", - format_vhost_user_operation_mode, vui->operation_mode); vlib_cli_output (vm, " rx placement: "); - vec_foreach (vhc, vum->cpus) + + vec_foreach (queue, vui->rx_queues) { - vec_foreach (vhiq, vhc->rx_queues) - { - if (vhiq->vhost_iface_index == vui - vum->vhost_user_interfaces) - { - vlib_cli_output (vm, " thread %d on vring %d\n", - vhc - vum->cpus, - VHOST_VRING_IDX_TX (vhiq->qid)); - vlib_cli_output (vm, " mode: %U\n", - format_vhost_user_operation_mode, - vhc->operation_mode); - } - } + vnet_main_t *vnm = vnet_get_main (); + uword thread_index; + vnet_hw_interface_rx_mode mode; + + thread_index = vnet_get_device_input_thread_index (vnm, + vui->hw_if_index, + *queue); + vnet_hw_interface_get_rx_mode (vnm, vui->hw_if_index, *queue, &mode); + vlib_cli_output (vm, " thread %d on vring %d, %U\n", + thread_index, VHOST_VRING_IDX_TX (*queue), + format_vnet_hw_interface_rx_mode, mode); } vlib_cli_output (vm, " tx placement: %s\n", @@ -3444,8 +3368,7 @@ done: VLIB_CLI_COMMAND (vhost_user_connect_command, static) = { .path = "create vhost-user", .short_help = "create vhost-user socket [server] " - "[feature-mask ] [hwaddr ] [renumber ] " - "[mode {interrupt | polling}]", + "[feature-mask ] [hwaddr ] [renumber ] ", .function = vhost_user_connect_command_fn, }; /* *INDENT-ON* */ @@ -3648,69 +3571,6 @@ vhost_user_unmap_all (void) } } -static clib_error_t * -vhost_thread_command_fn (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - unformat_input_t _line_input, *line_input = &_line_input; - u32 worker_thread_index; - u32 sw_if_index; - u8 del = 0; - int rv; - clib_error_t *error = NULL; - - /* Get a line of input. */ - if (!unformat_user (input, unformat_line_input, line_input)) - return 0; - - if (!unformat - (line_input, "%U %d", unformat_vnet_sw_interface, vnet_get_main (), - &sw_if_index, &worker_thread_index)) - { - error = clib_error_return (0, "unknown input `%U'", - format_unformat_error, line_input); - goto done; - } - - if (unformat (line_input, "del")) - del = 1; - - if ((rv = - vhost_user_thread_placement (sw_if_index, worker_thread_index, del))) - { - error = clib_error_return (0, "vhost_user_thread_placement returned %d", - rv); - goto done; - } - -done: - unformat_free (line_input); - - return error; -} - - -/*? - * This command is used to move the RX processing for the given - * interfaces to the provided thread. If the 'del' option is used, - * the forced thread assignment is removed and the thread assigment is - * reassigned automatically. Use 'show vhost-user ' - * to see the thread assignment. - * - * @cliexpar - * Example of how to move the RX processing for a given interface to a given thread: - * @cliexcmd{vhost thread VirtualEthernet0/0/0 1} - * Example of how to remove the forced thread assignment for a given interface: - * @cliexcmd{vhost thread VirtualEthernet0/0/0 1 del} -?*/ -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (vhost_user_thread_command, static) = { - .path = "vhost thread", - .short_help = "vhost thread [del]", - .function = vhost_thread_command_fn, -}; -/* *INDENT-ON* */ - /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vnet/devices/virtio/vhost-user.h b/src/vnet/devices/virtio/vhost-user.h index 56b65477..ceced342 100644 --- a/src/vnet/devices/virtio/vhost-user.h +++ b/src/vnet/devices/virtio/vhost-user.h @@ -66,13 +66,11 @@ typedef enum int vhost_user_create_if (vnet_main_t * vnm, vlib_main_t * vm, const char *sock_filename, u8 is_server, u32 * sw_if_index, u64 feature_mask, - u8 renumber, u32 custom_dev_instance, u8 * hwaddr, - u8 operation_mode); + u8 renumber, u32 custom_dev_instance, u8 * hwaddr); int vhost_user_modify_if (vnet_main_t * vnm, vlib_main_t * vm, const char *sock_filename, u8 is_server, u32 sw_if_index, u64 feature_mask, - u8 renumber, u32 custom_dev_instance, - u8 operation_mode); + u8 renumber, u32 custom_dev_instance); int vhost_user_delete_if (vnet_main_t * vnm, vlib_main_t * vm, u32 sw_if_index); @@ -210,14 +208,13 @@ typedef struct u32 callfd_idx; u32 kickfd_idx; u64 log_guest_addr; - u32 interrupt_thread_index; -} vhost_user_vring_t; -#define VHOST_USER_POLLING_MODE 0 -#define VHOST_USER_INTERRUPT_MODE 1 -#define VHOST_USER_ADAPTIVE_MODE 2 + /* The rx queue policy (interrupt/adaptive/polling) for this queue */ + u32 mode; +} vhost_user_vring_t; #define VHOST_USER_EVENT_START_TIMER 1 +#define VHOST_USER_EVENT_STOP_TIMER 2 typedef struct { @@ -258,18 +255,10 @@ typedef struct u8 use_tx_spinlock; u16 *per_cpu_tx_qid; - /* Vector of workers for this interface */ - u32 *workers; - - u8 operation_mode; + /* Vector of active rx queues for this interface */ + u16 *rx_queues; } vhost_user_intf_t; -typedef struct -{ - u16 vhost_iface_index; - u16 qid; -} vhost_iface_and_queue_t; - typedef struct { uword dst; @@ -292,7 +281,6 @@ typedef struct typedef struct { - vhost_iface_and_queue_t *rx_queues; u32 rx_buffers_len; u32 rx_buffers[VHOST_USER_RX_BUFFERS_N]; @@ -302,12 +290,6 @@ typedef struct /* This is here so it doesn't end-up * using stack or registers. */ vhost_trace_t *current_trace; - - /* bitmap of pending rx interfaces */ - uword *pending_input_bitmap; - - /* The operation mode computed per cpu based on interface setting */ - u8 operation_mode; } vhost_cpu_t; typedef struct @@ -320,20 +302,14 @@ typedef struct f64 coalesce_time; int dont_dump_vhost_user_memory; - /** first cpu index */ - u32 input_cpu_first_index; - - /** total cpu count */ - u32 input_cpu_count; - /** Per-CPU data for vhost-user */ vhost_cpu_t *cpus; /** Pseudo random iterator */ u32 random; - /* Node is in interrupt mode */ - u8 interrupt_mode; + /* The number of rx interface/queue pairs in interrupt mode */ + u32 ifq_count; } vhost_user_main_t; typedef struct @@ -346,7 +322,6 @@ typedef struct u8 sock_filename[256]; u32 num_regions; int sock_errno; - u8 operation_mode; } vhost_user_intf_details_t; int vhost_user_dump_ifs (vnet_main_t * vnm, vlib_main_t * vm, diff --git a/src/vnet/devices/virtio/vhost_user_api.c b/src/vnet/devices/virtio/vhost_user_api.c index ac7afa61..8dbd032b 100644 --- a/src/vnet/devices/virtio/vhost_user_api.c +++ b/src/vnet/devices/virtio/vhost_user_api.c @@ -81,8 +81,7 @@ vl_api_create_vhost_user_if_t_handler (vl_api_create_vhost_user_if_t * mp) rv = vhost_user_create_if (vnm, vm, (char *) mp->sock_filename, mp->is_server, &sw_if_index, (u64) ~ 0, mp->renumber, ntohl (mp->custom_dev_instance), - (mp->use_custom_mac) ? mp->mac_address : NULL, - mp->operation_mode); + (mp->use_custom_mac) ? mp->mac_address : NULL); /* Remember an interface tag for the new interface */ if (rv == 0) @@ -117,8 +116,7 @@ vl_api_modify_vhost_user_if_t_handler (vl_api_modify_vhost_user_if_t * mp) rv = vhost_user_modify_if (vnm, vm, (char *) mp->sock_filename, mp->is_server, sw_if_index, (u64) ~ 0, - mp->renumber, ntohl (mp->custom_dev_instance), - mp->operation_mode); + mp->renumber, ntohl (mp->custom_dev_instance)); REPLY_MACRO (VL_API_MODIFY_VHOST_USER_IF_REPLY); } @@ -164,7 +162,6 @@ send_sw_interface_vhost_user_details (vpe_api_main_t * am, mp->virtio_net_hdr_sz = ntohl (vui->virtio_net_hdr_sz); mp->features = clib_net_to_host_u64 (vui->features); mp->is_server = vui->is_server; - mp->operation_mode = vui->operation_mode; mp->num_regions = ntohl (vui->num_regions); mp->sock_errno = ntohl (vui->sock_errno); mp->context = context; -- cgit 1.2.3-korg From 3060e07b71fd4dd6203769d73c4f8f21709f9b9c Mon Sep 17 00:00:00 2001 From: Yoann Desmouceaux Date: Thu, 18 May 2017 11:00:48 +0200 Subject: Add TAB-based auto-completion to the CLI Hitting tab: - in the middle of a uniquely defined subcommand will expand the subcommand - in the middle of a non-uniquely defined (or empty) subcommand will display all possible subcommands, and possibly expand to the lowest common prefix Change-Id: Ib858eefdb0353cd2c3aad472799d15cd537455a0 Signed-off-by: Yoann Desmouceaux --- src/vlib/cli.c | 102 +++++++++++++++++++++++++++++++++++ src/vlib/cli.h | 4 ++ src/vlib/unix/cli.c | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 259 insertions(+) (limited to 'src/vlib') diff --git a/src/vlib/cli.c b/src/vlib/cli.c index 5a0f7ec3..9e14bee2 100644 --- a/src/vlib/cli.c +++ b/src/vlib/cli.c @@ -40,6 +40,7 @@ #include #include #include +#include /* Root of all show commands. */ /* *INDENT-OFF* */ @@ -235,6 +236,107 @@ unformat_vlib_cli_sub_command (unformat_input_t * i, va_list * args) return is_unique; } +static int +vlib_cli_cmp_strings (void *a1, void *a2) +{ + u8 *c1 = *(u8 **) a1; + u8 *c2 = *(u8 **) a2; + + return vec_cmp (c1, c2); +} + +u8 ** +vlib_cli_get_possible_completions (u8 * str) +{ + vlib_cli_command_t *c; + vlib_cli_sub_command_t *sc; + vlib_main_t *vm = vlib_get_main (); + vlib_cli_main_t *vcm = &vm->cli_main; + uword *match_bitmap = 0; + uword index, is_unique, help_next_level; + u8 **result = 0; + unformat_input_t input; + unformat_init_vector (&input, vec_dup (str)); + c = vec_elt_at_index (vcm->commands, 0); + + /* remove trailing whitespace, except for one of them */ + while (vec_len (input.buffer) >= 2 && + isspace (input.buffer[vec_len (input.buffer) - 1]) && + isspace (input.buffer[vec_len (input.buffer) - 2])) + { + vec_del1 (input.buffer, vec_len (input.buffer) - 1); + } + + /* if input is empty, directly return list of root commands */ + if (vec_len (input.buffer) == 0 || + (vec_len (input.buffer) == 1 && isspace (input.buffer[0]))) + { + vec_foreach (sc, c->sub_commands) + { + vec_add1 (result, (u8 *) sc->name); + } + goto done; + } + + /* add a trailing '?' so that vlib_cli_sub_command_match can find + * all commands starting with the input string */ + vec_add1 (input.buffer, '?'); + + while (1) + { + match_bitmap = vlib_cli_sub_command_match (c, &input); + /* no match: return no result */ + if (match_bitmap == 0) + { + goto done; + } + is_unique = clib_bitmap_count_set_bits (match_bitmap) == 1; + /* unique match: try to step one subcommand level further */ + if (is_unique) + { + /* stop if no more input */ + if (input.index >= vec_len (input.buffer) - 1) + { + break; + } + + index = clib_bitmap_first_set (match_bitmap); + c = get_sub_command (vcm, c, index); + clib_bitmap_free (match_bitmap); + continue; + } + /* multiple matches: stop here, return all matches */ + break; + } + + /* remove trailing '?' */ + vec_del1 (input.buffer, vec_len (input.buffer) - 1); + + /* if we have a space at the end of input, and a unique match, + * autocomplete the next level of subcommands */ + help_next_level = (vec_len (str) == 0) || isspace (str[vec_len (str) - 1]); + /* *INDENT-OFF* */ + clib_bitmap_foreach(index, match_bitmap, { + if (help_next_level && is_unique) { + c = get_sub_command (vcm, c, index); + vec_foreach (sc, c->sub_commands) { + vec_add1 (result, (u8*) sc->name); + } + goto done; /* break doesn't work in this macro-loop */ + } + sc = &c->sub_commands[index]; + vec_add1(result, (u8*) sc->name); + }); + /* *INDENT-ON* */ + +done: + clib_bitmap_free (match_bitmap); + unformat_free (&input); + + vec_sort_with_function (result, vlib_cli_cmp_strings); + return result; +} + static u8 * format_vlib_cli_command_help (u8 * s, va_list * args) { diff --git a/src/vlib/cli.h b/src/vlib/cli.h index 009c7e82..e713808f 100644 --- a/src/vlib/cli.h +++ b/src/vlib/cli.h @@ -181,6 +181,10 @@ clib_error_t *vlib_cli_register_parse_rule (struct vlib_main_t *vm, uword unformat_vlib_cli_sub_input (unformat_input_t * i, va_list * args); +/* Return an vector of strings consisting of possible auto-completions + * for a given input string */ +u8 **vlib_cli_get_possible_completions (u8 * input_str); + #endif /* included_vlib_cli_h */ /* diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c index 88e2453c..cf524213 100644 --- a/src/vlib/unix/cli.c +++ b/src/vlib/unix/cli.c @@ -1230,6 +1230,8 @@ unix_cli_line_process_one (unix_cli_main_t * cm, u8 input, unix_cli_parse_action_t action) { u8 *prev; + u8 *save = 0; + u8 **possible_commands; int j, delta; switch (action) @@ -1553,6 +1555,157 @@ unix_cli_line_process_one (unix_cli_main_t * cm, break; case UNIX_CLI_PARSE_ACTION_TAB: + if (cf->cursor < vec_len (cf->current_command)) + { + /* if we are in the middle of a line, complete only if + * the cursor points to whitespace */ + if (isspace (cf->current_command[cf->cursor])) + { + /* save and clear any input that is after the cursor */ + vec_resize (save, vec_len (cf->current_command) - cf->cursor); + clib_memcpy (save, cf->current_command + cf->cursor, + vec_len (cf->current_command) - cf->cursor); + _vec_len (cf->current_command) = cf->cursor; + } + else + { + unix_vlib_cli_output_raw (cf, uf, (u8 *) "\a", 1); + break; + } + } + possible_commands = + vlib_cli_get_possible_completions (cf->current_command); + if (vec_len (possible_commands) == 1) + { + u32 j = cf->cursor; + u8 *completed = possible_commands[0]; + + /* find the last word of current_command */ + while (j >= 1 && !isspace (cf->current_command[j - 1])) + { + j--; + unix_vlib_cli_output_raw (cf, uf, (u8 *) "\b", 1); + } + _vec_len (cf->current_command) = j; + + /* replace it with the newly expanded command */ + vec_append (cf->current_command, completed); + + /* echo to the terminal */ + unix_vlib_cli_output_raw (cf, uf, completed, vec_len (completed)); + + /* add one trailing space if needed */ + if (vec_len (save) == 0) + { + vec_add1 (cf->current_command, ' '); + unix_vlib_cli_output_raw (cf, uf, (u8 *) " ", 1); + } + + cf->cursor = vec_len (cf->current_command); + + } + else if (vec_len (possible_commands) >= 2) + { + u8 **possible_command; + uword max_command_len = 0, min_command_len = ~0; + u32 i, j; + + vec_foreach (possible_command, possible_commands) + { + if (vec_len (*possible_command) > max_command_len) + { + max_command_len = vec_len (*possible_command); + } + if (vec_len (*possible_command) < min_command_len) + { + min_command_len = vec_len (*possible_command); + } + } + + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1); + + i = 0; + vec_foreach (possible_command, possible_commands) + { + if (i + max_command_len >= cf->width) + { + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1); + i = 0; + } + unix_vlib_cli_output_raw (cf, uf, *possible_command, + vec_len (*possible_command)); + for (j = vec_len (*possible_command); j < max_command_len + 2; + j++) + { + unix_vlib_cli_output_raw (cf, uf, (u8 *) " ", 1); + } + i += max_command_len + 2; + } + + unix_vlib_cli_output_cooked (cf, uf, (u8 *) "\n", 1); + + /* rewrite prompt */ + unix_cli_cli_prompt (cf, uf); + unix_vlib_cli_output_raw (cf, uf, cf->current_command, + vec_len (cf->current_command)); + + /* count length of last word */ + j = cf->cursor; + i = 0; + while (j >= 1 && !isspace (cf->current_command[j - 1])) + { + j--; + i++; + } + + /* determine smallest common command */ + for (; i < min_command_len; i++) + { + u8 common = '\0'; + int stop = 0; + vec_foreach (possible_command, possible_commands) + { + if (common == '\0') + { + common = (*possible_command)[i]; + } + else if (common != (*possible_command)[i]) + { + stop = 1; + break; + } + } + if (!stop) + { + vec_add1 (cf->current_command, common); + cf->cursor++; + unix_vlib_cli_output_raw (cf, uf, (u8 *) & common, 1); + } + else + { + break; + } + } + } + else + { + unix_vlib_cli_output_raw (cf, uf, (u8 *) "\a", 1); + } + + if (vec_len (save) > 0) + { + /* restore remaining input if tab was hit in the middle of a line */ + unix_vlib_cli_output_raw (cf, uf, save, vec_len (save)); + for (j = 0; j < vec_len (save); j++) + { + unix_vlib_cli_output_raw (cf, uf, (u8 *) "\b", 1); + } + vec_append (cf->current_command, save); + vec_free (save); + } + vec_free (possible_commands); + + break; case UNIX_CLI_PARSE_ACTION_YANK: /* TODO */ break; -- cgit 1.2.3-korg From 4227eefcd4ef4aa195bec4527d18f027fcc3815c Mon Sep 17 00:00:00 2001 From: Yoann Desmouceaux Date: Wed, 24 May 2017 15:51:48 +0200 Subject: Fix tab-completion coverity issue Change-Id: I051d015e7eee621dbef273b2c57449ea4c44b768 Signed-off-by: Yoann Desmouceaux --- src/vlib/cli.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'src/vlib') diff --git a/src/vlib/cli.c b/src/vlib/cli.c index 9e14bee2..48cf0426 100644 --- a/src/vlib/cli.c +++ b/src/vlib/cli.c @@ -333,7 +333,8 @@ done: clib_bitmap_free (match_bitmap); unformat_free (&input); - vec_sort_with_function (result, vlib_cli_cmp_strings); + if (result) + vec_sort_with_function (result, vlib_cli_cmp_strings); return result; } -- cgit 1.2.3-korg From a7f7457593a00644c10a7fa00db1cd1c32a1bcb2 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Tue, 23 May 2017 18:32:21 +0200 Subject: vlib: use driver_override in sysfs for binding pci devs to vfio/uio drivers Change-Id: I262e455792fd95d286ee3ebc0049e2352ae5899f Signed-off-by: Damjan Marion --- src/vlib/pci/linux_pci.c | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/pci/linux_pci.c b/src/vlib/pci/linux_pci.c index d43361aa..623737d5 100644 --- a/src/vlib/pci/linux_pci.c +++ b/src/vlib/pci/linux_pci.c @@ -95,24 +95,22 @@ clib_error_t * vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name) { clib_error_t *error = 0; - u8 *s = 0; + u8 *s = 0, *driver_name = 0; DIR *dir = 0; struct dirent *e; - int fd; + int fd, clear_driver_override = 0; u8 *dev_dir_name = format (0, "/sys/bus/pci/devices/%U", format_vlib_pci_addr, &d->bus_address); - /* if uio sub-directory exists, we are fine, device is - already bound to UIO driver */ - s = format (s, "%v/uio%c", dev_dir_name, 0); - if (access ((char *) s, F_OK) == 0) - goto done; + s = format (s, "%v/driver%c", dev_dir_name, 0); + driver_name = vlib_sysfs_link_to_name ((char *) s); vec_reset_length (s); - s = format (s, "%v/iommu_group%c", dev_dir_name, 0); - if (access ((char *) s, F_OK) == 0) + if (driver_name && + ((strcmp ("vfio-pci", (char *) driver_name) == 0) || + (strcmp ("uio_pci_generic", (char *) driver_name) == 0) || + (strcmp ("igb_uio", (char *) driver_name) == 0))) goto done; - vec_reset_length (s); /* walk trough all linux interfaces and if interface belonging to this device is founf check if interface is admin up */ @@ -187,17 +185,37 @@ vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name) vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); vec_reset_length (s); - s = format (s, "/sys/bus/pci/drivers/%s/new_id%c", uio_driver_name, 0); - vlib_sysfs_write ((char *) s, "0x%04x 0x%04x", d->vendor_id, d->device_id); + s = format (s, "%v/driver_override%c", dev_dir_name, 0); + if (access ((char *) s, F_OK) == 0) + { + vlib_sysfs_write ((char *) s, "%s", uio_driver_name); + clear_driver_override = 1; + } + else + { + vec_reset_length (s); + s = format (s, "/sys/bus/pci/drivers/%s/new_id%c", uio_driver_name, 0); + vlib_sysfs_write ((char *) s, "0x%04x 0x%04x", d->vendor_id, + d->device_id); + } vec_reset_length (s); s = format (s, "/sys/bus/pci/drivers/%s/bind%c", uio_driver_name, 0); vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); + vec_reset_length (s); + + if (clear_driver_override) + { + s = format (s, "%v/driver_override%c", dev_dir_name, 0); + vlib_sysfs_write ((char *) s, "%c", 0); + vec_reset_length (s); + } done: closedir (dir); vec_free (s); vec_free (dev_dir_name); + vec_free (driver_name); return error; } -- cgit 1.2.3-korg From a9a54c15e6dd987b87d15e80ce4345760ac247f7 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 5 Jun 2017 21:54:46 +0200 Subject: vlib: add unix_file_del_by_index function Change-Id: I9eca5f9d1c1ae62d5ba5fb36f2f97434dbaf334e Signed-off-by: Damjan Marion --- src/vlib/unix/unix.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'src/vlib') diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h index ea0d417b..de607c0f 100644 --- a/src/vlib/unix/unix.h +++ b/src/vlib/unix/unix.h @@ -151,6 +151,14 @@ unix_file_del (unix_main_t * um, unix_file_t * f) pool_put (um->file_pool, f); } +always_inline void +unix_file_del_by_index (unix_main_t * um, uword index) +{ + unix_file_t *uf; + uf = pool_elt_at_index (um->file_pool, index); + unix_file_del (um, uf); +} + always_inline uword unix_file_set_data_available_to_write (u32 unix_file_index, uword is_available) -- cgit 1.2.3-korg From 6e8ab16cf38795e8957d0dbb8752c6934e661132 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 5 Jun 2017 21:56:12 +0200 Subject: vlib: add screen-256color CLI terminal type Change-Id: Ia78e69e5e8ed18020314aef321b94ac37037799b Signed-off-by: Damjan Marion --- src/vlib/unix/cli.c | 1 + 1 file changed, 1 insertion(+) (limited to 'src/vlib') diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c index cf524213..74dea161 100644 --- a/src/vlib/unix/cli.c +++ b/src/vlib/unix/cli.c @@ -1023,6 +1023,7 @@ unix_cli_terminal_type (u8 * term, uword len) _("xterm-color"); _("xterm-256color"); /* iTerm on Mac */ _("screen"); + _("screen-256color"); /* Screen and tmux */ _("ansi"); /* Microsoft Telnet */ #undef _ -- cgit 1.2.3-korg From a62699954a9f57c8407ba10d08636c79166f56ed Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Wed, 7 Jun 2017 08:18:49 -0400 Subject: VPP-873: fix vector expansion bug in dispatch_pending_node The main interior graph-node dispatch loop had a longstanding dangling vector element reference: for (i = 0; i < _vec_len (nm->pending_frames); i++) cpu_time_now = dispatch_pending_node (vm, nm->pending_frames + i, cpu_time_now); Passing a pointer to a vector element (nm->pending_frames + i) has considerable comedic potential if there's any chance that the vector could expand. dispatch_pending_node() calls dispatch_node(), and indirectly any interior graph node dispatch function. If that node happens to expand nm->pending_frames by filling in a new frame, nm->pending_frames can expand. After calling the node dispatch function, dispatch_node() does the following: nf = vec_elt_at_index (nm->next_frames, p->next_frame_index); If nm->pending_frames expands during dispatch function execution, p is a dangling reference to freed memory. By luck, the TCP stack managed to allocate a fresh frame which included "old-p," which caused p->next_frame_index to be filled with the new-frame poison pattern 0xfefefefe. This has been broken from day 1, summer 2007, first use of the third-generation vector processing library. Change-Id: Ideb6363bb060c4e8bf9b901882c318bd83853121 Signed-off-by: Dave Barach --- src/vlib/main.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/main.c b/src/vlib/main.c index 0e6d66cd..14f680e6 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -1112,14 +1112,18 @@ dispatch_node (vlib_main_t * vm, } static u64 -dispatch_pending_node (vlib_main_t * vm, - vlib_pending_frame_t * p, u64 last_time_stamp) +dispatch_pending_node (vlib_main_t * vm, uword pending_frame_index, + u64 last_time_stamp) { vlib_node_main_t *nm = &vm->node_main; vlib_frame_t *f; vlib_next_frame_t *nf, nf_dummy; vlib_node_runtime_t *n; u32 restore_frame_index; + vlib_pending_frame_t *p; + + /* See comment below about dangling references to nm->pending_frames */ + p = nm->pending_frames + pending_frame_index; n = vec_elt_at_index (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL], p->node_runtime_index); @@ -1169,18 +1173,29 @@ dispatch_pending_node (vlib_main_t * vm, /* Frame is ready to be used again, so restore it. */ if (restore_frame_index != ~0) { - /* we musn't restore a frame that is flagged to be freed. This shouldn't - happen since frames to be freed post dispatch are those used - when the to-node frame becomes full i.e. they form a sort of queue of - frames to a single node. If we get here then the to-node frame and the - pending frame *were* the same, and so we removed the to-node frame. - Therefore this frame is no longer part of the queue for that node - and hence it cannot be it's overspill. + /* + * We musn't restore a frame that is flagged to be freed. This + * shouldn't happen since frames to be freed post dispatch are + * those used when the to-node frame becomes full i.e. they form a + * sort of queue of frames to a single node. If we get here then + * the to-node frame and the pending frame *were* the same, and so + * we removed the to-node frame. Therefore this frame is no + * longer part of the queue for that node and hence it cannot be + * it's overspill. */ ASSERT (!(f->flags & VLIB_FRAME_FREE_AFTER_DISPATCH)); - /* p->next_frame_index can change during node dispatch if node - function decides to change graph hook up. */ + /* + * NB: dispatching node n can result in the creation and scheduling + * of new frames, and hence in the reallocation of nm->pending_frames. + * Recompute p, or no supper. This was broken for more than 10 years. + */ + p = nm->pending_frames + pending_frame_index; + + /* + * p->next_frame_index can change during node dispatch if node + * function decides to change graph hook up. + */ nf = vec_elt_at_index (nm->next_frames, p->next_frame_index); nf->flags |= VLIB_FRAME_IS_ALLOCATED; @@ -1607,8 +1622,7 @@ vlib_main_or_worker_loop (vlib_main_t * vm, int is_main) Process pending vector until there is nothing left. All pending vectors will be processed from input -> output. */ for (i = 0; i < _vec_len (nm->pending_frames); i++) - cpu_time_now = dispatch_pending_node (vm, nm->pending_frames + i, - cpu_time_now); + cpu_time_now = dispatch_pending_node (vm, i, cpu_time_now); /* Reset pending vector for next iteration. */ _vec_len (nm->pending_frames) = 0; -- cgit 1.2.3-korg From 93992a9048cb6e5dcd22de5091e72de778122627 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Wed, 24 May 2017 18:03:56 -0700 Subject: Implement sack based tcp loss recovery (RFC 6675) - refactor existing congestion control code (RFC 6582/5681). Handling of ack feedback now consists of: ack parsing, cc event detection, event handling, congestion control update - extend sack scoreboard to support sack based retransmissions - basic implementation of Eifel detection algorithm (RFC 3522) for detecting spurious retransmissions - actually initialize the per-thread frame freelist hash tables - increase worker stack size to 2mb - fix session queue node out-of-buffer handling - ensure that the local buffer cache vec_len matches reality - avoid 2x spurious event requeues when short of buffers - count out-of-buffer events - make the builtin server thread-safe - fix bihash template threading issue: need to paint -1 across uninitialized working_copy_length vector elements (via rebase from master) Change-Id: I646cb9f1add9a67d08f4a87badbcb117980ebfc4 Signed-off-by: Florin Coras Signed-off-by: Dave Barach --- src/svm/svm_fifo.c | 5 +- src/vlib/node.c | 1 + src/vlib/threads.c | 2 +- src/vlib/threads.h | 2 +- src/vnet/session/node.c | 53 ++-- src/vnet/session/session.c | 11 +- src/vnet/session/session.h | 6 +- src/vnet/session/session_cli.c | 26 +- src/vnet/tcp/builtin_client.c | 40 ++- src/vnet/tcp/builtin_server.c | 20 +- src/vnet/tcp/tcp.c | 57 ++-- src/vnet/tcp/tcp.h | 112 +++++-- src/vnet/tcp/tcp_debug.h | 16 +- src/vnet/tcp/tcp_input.c | 671 +++++++++++++++++++++++++++++------------ src/vnet/tcp/tcp_newreno.c | 20 +- src/vnet/tcp/tcp_output.c | 287 ++++++++++++------ src/vnet/tcp/tcp_test.c | 53 ++-- 17 files changed, 973 insertions(+), 409 deletions(-) (limited to 'src/vlib') diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index f13f6fea..5c8f244a 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -540,7 +540,7 @@ svm_fifo_peek (svm_fifo_t * f, u32 relative_offset, u32 max_bytes, /* read cursize, which can only increase while we're working */ cursize = svm_fifo_max_dequeue (f); - if (PREDICT_FALSE (cursize == 0)) + if (PREDICT_FALSE (cursize < relative_offset)) return -2; /* nothing in the fifo */ nitems = f->nitems; @@ -548,7 +548,8 @@ svm_fifo_peek (svm_fifo_t * f, u32 relative_offset, u32 max_bytes, real_head = real_head >= nitems ? real_head - nitems : real_head; /* Number of bytes we're going to copy */ - total_copy_bytes = (cursize < max_bytes) ? cursize : max_bytes; + total_copy_bytes = (cursize - relative_offset < max_bytes) ? + cursize - relative_offset : max_bytes; if (PREDICT_TRUE (copy_here != 0)) { diff --git a/src/vlib/node.c b/src/vlib/node.c index bbd3a42e..eecad274 100644 --- a/src/vlib/node.c +++ b/src/vlib/node.c @@ -502,6 +502,7 @@ vlib_node_main_init (vlib_main_t * vm) vlib_node_t *n; uword ni; + nm->frame_size_hash = hash_create (0, sizeof (uword)); nm->flags |= VLIB_NODE_MAIN_RUNTIME_STARTED; /* Generate sibling relationships */ diff --git a/src/vlib/threads.c b/src/vlib/threads.c index b7bc9e26..0c775e2d 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -670,7 +670,7 @@ start_workers (vlib_main_t * vm) /* zap the (per worker) frame freelists, etc */ nm_clone->frame_sizes = 0; - nm_clone->frame_size_hash = 0; + nm_clone->frame_size_hash = hash_create (0, sizeof (uword)); /* Packet trace buffers are guaranteed to be empty, nothing to do here */ diff --git a/src/vlib/threads.h b/src/vlib/threads.h index 17d35a24..572ce77f 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -62,7 +62,7 @@ typedef struct vlib_thread_registration_ #define VLIB_CPU_MASK (VLIB_MAX_CPUS - 1) /* 0x3f, max */ #define VLIB_OFFSET_MASK (~VLIB_CPU_MASK) -#define VLIB_LOG2_THREAD_STACK_SIZE (20) +#define VLIB_LOG2_THREAD_STACK_SIZE (21) #define VLIB_THREAD_STACK_SIZE (1<session_type]; @@ -167,9 +169,6 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, /* Check how much we can pull. If buffering, subtract the offset */ max_dequeue0 = svm_fifo_max_dequeue (s0->server_tx_fifo) - rx_offset; - /* Allow enqueuing of a new event */ - svm_fifo_unset_event (s0->server_tx_fifo); - /* Nothing to read return */ if (max_dequeue0 == 0) return 0; @@ -187,8 +186,8 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, max_len_to_snd0 = snd_space0; } - n_bytes_per_buf = vlib_buffer_free_list_buffer_size (vm, - VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + n_bytes_per_buf = vlib_buffer_free_list_buffer_size + (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); n_bytes_per_seg = MAX_HDRS_LEN + snd_mss0; n_bufs_per_seg = ceil ((double) n_bytes_per_seg / n_bytes_per_buf); n_bufs_per_evt = (ceil ((double) max_len_to_snd0 / n_bytes_per_seg)) @@ -205,24 +204,33 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE)) { vec_validate (smm->tx_buffers[thread_index], - n_bufs + VLIB_FRAME_SIZE - 1); - n_bufs += vlib_buffer_alloc (vm, - &smm->tx_buffers[thread_index][n_bufs], - VLIB_FRAME_SIZE); - - /* buffer shortage - * XXX 0.9 because when debugging we might not get a full frame */ - if (PREDICT_FALSE (n_bufs < 0.9 * VLIB_FRAME_SIZE)) + n_bufs + 2 * VLIB_FRAME_SIZE - 1); + + buffers_allocated = 0; + do { - if (svm_fifo_set_event (s0->server_tx_fifo)) - { - vec_add1 (smm->pending_event_vector[thread_index], *e0); - } - return -1; + buffers_allocated_this_call = + vlib_buffer_alloc + (vm, + &smm->tx_buffers[thread_index][n_bufs + buffers_allocated], + 2 * VLIB_FRAME_SIZE - buffers_allocated); + buffers_allocated += buffers_allocated_this_call; } + while (buffers_allocated_this_call > 0 + && ((buffers_allocated + n_bufs < VLIB_FRAME_SIZE))); + + n_bufs += buffers_allocated; _vec_len (smm->tx_buffers[thread_index]) = n_bufs; + + if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE)) + { + vec_add1 (smm->pending_event_vector[thread_index], *e0); + return -1; + } } + /* Allow enqueuing of a new event */ + svm_fifo_unset_event (s0->server_tx_fifo); vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); while (left_to_snd0 && n_left_to_next >= n_bufs_per_seg) @@ -232,7 +240,9 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, */ /* Get free buffer */ + ASSERT (n_bufs >= 1); bi0 = smm->tx_buffers[thread_index][--n_bufs]; + ASSERT (bi0); _vec_len (smm->tx_buffers[thread_index]) = n_bufs; b0 = vlib_get_buffer (vm, bi0); @@ -545,9 +555,10 @@ skip_dequeue: my_thread_index, &n_tx_packets); /* Out of buffers */ - if (rv < 0) + if (PREDICT_FALSE (rv < 0)) { - vec_add1 (smm->pending_event_vector[my_thread_index], *e0); + vlib_node_increment_counter (vm, node->node_index, + SESSION_QUEUE_ERROR_NO_BUFFER, 1); continue; } break; diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 02b0cced..534598d6 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -551,7 +551,7 @@ u8 stream_session_no_space (transport_connection_t * tc, u32 thread_index, u16 data_len) { - stream_session_t *s = stream_session_get (tc->c_index, thread_index); + stream_session_t *s = stream_session_get (tc->s_index, thread_index); if (PREDICT_FALSE (s->session_state != SESSION_STATE_READY)) return 1; @@ -563,6 +563,15 @@ stream_session_no_space (transport_connection_t * tc, u32 thread_index, } u32 +stream_session_tx_fifo_max_dequeue (transport_connection_t * tc) +{ + stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); + if (s->session_state != SESSION_STATE_READY) + return 0; + return svm_fifo_max_dequeue (s->server_tx_fifo); +} + +int stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer, u32 offset, u32 max_bytes) { diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index a8728649..d9c38bd1 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -352,16 +352,18 @@ stream_session_max_rx_enqueue (transport_connection_t * tc) } always_inline u32 -stream_session_fifo_size (transport_connection_t * tc) +stream_session_rx_fifo_size (transport_connection_t * tc) { stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); return s->server_rx_fifo->nitems; } +u32 stream_session_tx_fifo_max_dequeue (transport_connection_t * tc); + int stream_session_enqueue_data (transport_connection_t * tc, vlib_buffer_t * b, u32 offset, u8 queue_event, u8 is_in_order); -u32 +int stream_session_peek_bytes (transport_connection_t * tc, u8 * buffer, u32 offset, u32 max_bytes); u32 stream_session_dequeue_drop (transport_connection_t * tc, u32 max_bytes); diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index 509eedbb..6b8341aa 100755 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -15,6 +15,15 @@ #include #include +u8 * +format_stream_session_fifos (u8 * s, va_list * args) +{ + stream_session_t *ss = va_arg (*args, stream_session_t *); + s = format (s, " Rx fifo: %U", format_svm_fifo, ss->server_rx_fifo, 1); + s = format (s, " Tx fifo: %U", format_svm_fifo, ss->server_tx_fifo, 1); + return s; +} + /** * Format stream session as per the following format * @@ -44,6 +53,8 @@ format_stream_session (u8 * s, va_list * args) ss->thread_index, verbose); if (verbose == 1) s = format (s, "%v", str); + if (verbose > 1) + s = format (s, "%U", format_stream_session_fifos, ss); } else if (ss->session_state == SESSION_STATE_LISTENING) { @@ -57,8 +68,12 @@ format_stream_session (u8 * s, va_list * args) } else if (ss->session_state == SESSION_STATE_CLOSED) { - s = format (s, "[CL] %-40U%v", tp_vft->format_connection, - ss->connection_index, ss->thread_index, verbose, str); + s = format (s, "[CL] %-40U", tp_vft->format_connection, + ss->connection_index, ss->thread_index, verbose); + if (verbose == 1) + s = format (s, "%v", str); + if (verbose > 1) + s = format (s, "%U", format_stream_session_fifos, ss); } else { @@ -124,13 +139,6 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, ({ vec_reset_length (str); str = format (str, "%U", format_stream_session, s, verbose); - if (verbose > 1) - { - str = format (str, " Rx fifo: %U", format_svm_fifo, - s->server_rx_fifo, 1); - str = format (str, " Tx fifo: %U", format_svm_fifo, - s->server_tx_fifo, 1); - } vlib_cli_output (vm, "%v", str); })); /* *INDENT-ON* */ diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 768f0c3c..7238cda3 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -115,8 +115,17 @@ receive_test_chunk (tclient_main_t * tm, session_t * s) /* Allow enqueuing of new event */ // svm_fifo_unset_event (rx_fifo); - n_read = svm_fifo_dequeue_nowait (rx_fifo, vec_len (tm->rx_buf), - tm->rx_buf); + if (test_bytes) + { + n_read = svm_fifo_dequeue_nowait (rx_fifo, vec_len (tm->rx_buf), + tm->rx_buf); + } + else + { + n_read = svm_fifo_max_dequeue (rx_fifo); + svm_fifo_dequeue_drop (rx_fifo, n_read); + } + if (n_read > 0) { if (TCP_BUILTIN_CLIENT_DBG) @@ -165,6 +174,8 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, int i; int delete_session; u32 *connection_indices; + u32 tx_quota = 0; + u32 delta, prev_bytes_received_this_session; connection_indices = tm->connection_index_by_thread[my_thread_index]; @@ -177,14 +188,19 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, sp = pool_elt_at_index (tm->sessions, connection_indices[i]); - if (sp->bytes_to_send > 0) + if (tx_quota < 60 && sp->bytes_to_send > 0) { send_test_chunk (tm, sp); delete_session = 0; + tx_quota++; } if (sp->bytes_to_receive > 0) { + prev_bytes_received_this_session = sp->bytes_received; receive_test_chunk (tm, sp); + delta = sp->bytes_received - prev_bytes_received_this_session; + if (delta > 0) + tx_quota--; delete_session = 0; } if (PREDICT_FALSE (delete_session == 1)) @@ -195,11 +211,19 @@ builtin_client_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, dmp->_vl_msg_id = ntohs (VL_API_DISCONNECT_SESSION); dmp->client_index = tm->my_client_index; dmp->handle = sp->vpp_session_handle; - vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp); - vec_delete (connection_indices, 1, i); - tm->connection_index_by_thread[my_thread_index] = - connection_indices; - __sync_fetch_and_add (&tm->ready_connections, -1); +// vl_msg_api_send_shmem (tm->vl_input_queue, (u8 *) & dmp); + if (!unix_shared_memory_queue_add (tm->vl_input_queue, (u8 *) & dmp, + 1)) + { + vec_delete (connection_indices, 1, i); + tm->connection_index_by_thread[my_thread_index] = + connection_indices; + __sync_fetch_and_add (&tm->ready_connections, -1); + } + else + { + vl_msg_api_free (dmp); + } /* Kick the debug CLI process */ if (tm->ready_connections == 0) diff --git a/src/vnet/tcp/builtin_server.c b/src/vnet/tcp/builtin_server.c index 4f0e211c..8bd2f360 100644 --- a/src/vnet/tcp/builtin_server.c +++ b/src/vnet/tcp/builtin_server.c @@ -39,7 +39,8 @@ typedef struct { - u8 *rx_buf; + /* Per-thread RX buffer */ + u8 **rx_buf; unix_shared_memory_queue_t **vpp_queue; u64 byte_index; @@ -117,13 +118,15 @@ void test_bytes (builtin_server_main_t * bsm, int actual_transfer) { int i; + u32 my_thread_id = vlib_get_thread_index (); for (i = 0; i < actual_transfer; i++) { - if (bsm->rx_buf[i] != ((bsm->byte_index + i) & 0xff)) + if (bsm->rx_buf[my_thread_id][i] != ((bsm->byte_index + i) & 0xff)) { clib_warning ("at %lld expected %d got %d", bsm->byte_index + i, - (bsm->byte_index + i) & 0xff, bsm->rx_buf[i]); + (bsm->byte_index + i) & 0xff, + bsm->rx_buf[my_thread_id][i]); } } bsm->byte_index += actual_transfer; @@ -138,6 +141,7 @@ builtin_server_rx_callback (stream_session_t * s) builtin_server_main_t *bsm = &builtin_server_main; session_fifo_event_t evt; static int serial_number = 0; + u32 my_thread_id = vlib_get_thread_index (); tx_fifo = s->server_tx_fifo; rx_fifo = s->server_rx_fifo; @@ -171,11 +175,12 @@ builtin_server_rx_callback (stream_session_t * s) return 0; } - vec_validate (bsm->rx_buf, max_transfer - 1); - _vec_len (bsm->rx_buf) = max_transfer; + vec_validate (bsm->rx_buf, my_thread_id); + vec_validate (bsm->rx_buf[my_thread_id], max_transfer - 1); + _vec_len (bsm->rx_buf[my_thread_id]) = max_transfer; actual_transfer = svm_fifo_dequeue_nowait (rx_fifo, max_transfer, - bsm->rx_buf); + bsm->rx_buf[my_thread_id]); ASSERT (actual_transfer == max_transfer); // test_bytes (bsm, actual_transfer); @@ -184,7 +189,8 @@ builtin_server_rx_callback (stream_session_t * s) * Echo back */ - n_written = svm_fifo_enqueue_nowait (tx_fifo, actual_transfer, bsm->rx_buf); + n_written = svm_fifo_enqueue_nowait (tx_fifo, actual_transfer, + bsm->rx_buf[my_thread_id]); if (n_written != max_transfer) clib_warning ("short trout!"); diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 9b7b2f65..e0b67a8e 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -195,8 +195,8 @@ tcp_connection_close (tcp_connection_t * tc) TCP_EVT_DBG (TCP_EVT_CLOSE, tc); /* Send FIN if needed */ - if (tc->state == TCP_STATE_ESTABLISHED || tc->state == TCP_STATE_SYN_RCVD - || tc->state == TCP_STATE_CLOSE_WAIT) + if (tc->state == TCP_STATE_ESTABLISHED + || tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_CLOSE_WAIT) tcp_send_fin (tc); /* Switch state */ @@ -480,7 +480,7 @@ u8 * format_tcp_timers (u8 * s, va_list * args) { tcp_connection_t *tc = va_arg (*args, tcp_connection_t *); - int i, last = 0; + int i, last = -1; for (i = 0; i < TCP_N_TIMERS; i++) if (tc->timers[i] != TCP_TIMER_HANDLE_INVALID) @@ -493,7 +493,7 @@ format_tcp_timers (u8 * s, va_list * args) s = format (s, "%s,", tcp_conn_timers[i]); } - if (last > 0) + if (last >= 0) s = format (s, "%s]", tcp_conn_timers[i]); else s = format (s, "]"); @@ -526,19 +526,19 @@ format_tcp_vars (u8 * s, va_list * args) s = format (s, " snd_wnd %u rcv_wnd %u snd_wl1 %u snd_wl2 %u\n", tc->snd_wnd, tc->rcv_wnd, tc->snd_wl1 - tc->irs, tc->snd_wl2 - tc->iss); - s = format (s, " flight size %u send space %u rcv_wnd available %d\n", - tcp_flight_size (tc), tcp_snd_space (tc), - tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las)); + s = format (s, " flight size %u send space %u rcv_wnd_av %d\n", + tcp_flight_size (tc), tcp_available_snd_space (tc), + tcp_rcv_wnd_available (tc)); s = format (s, " cong %U ", format_tcp_congestion_status, tc); s = format (s, "cwnd %u ssthresh %u rtx_bytes %u bytes_acked %u\n", - tc->cwnd, tc->ssthresh, tc->rtx_bytes, tc->bytes_acked); - s = format (s, " prev_ssthresh %u snd_congestion %u\n", tc->prev_ssthresh, - tc->snd_congestion - tc->iss); + tc->cwnd, tc->ssthresh, tc->snd_rxt_bytes, tc->bytes_acked); + s = format (s, " prev_ssthresh %u snd_congestion %u dupack %u\n", + tc->prev_ssthresh, tc->snd_congestion - tc->iss, + tc->rcv_dupacks); s = format (s, " rto %u rto_boff %u srtt %u rttvar %u rtt_ts %u ", tc->rto, tc->rto_boff, tc->srtt, tc->rttvar, tc->rtt_ts); s = format (s, "rtt_seq %u\n", tc->rtt_seq); - if (scoreboard_first_hole (&tc->sack_sb)) - s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb); + s = format (s, " scoreboard: %U\n", format_tcp_scoreboard, &tc->sack_sb); if (vec_len (tc->snd_sacks)) s = format (s, " sacks tx: %U\n", format_tcp_sacks, tc); @@ -595,9 +595,10 @@ format_tcp_session (u8 * s, va_list * args) tc = tcp_connection_get (tci, thread_index); if (tc) - return format (s, "%U", format_tcp_connection, tc, verbose); + s = format (s, "%U", format_tcp_connection, tc, verbose); else - return format (s, "empty"); + s = format (s, "empty"); + return s; } u8 * @@ -643,13 +644,17 @@ format_tcp_scoreboard (u8 * s, va_list * args) { sack_scoreboard_t *sb = va_arg (*args, sack_scoreboard_t *); sack_scoreboard_hole_t *hole; - s = format (s, "head %u tail %u snd_una_adv %u\n", sb->head, sb->tail, - sb->snd_una_adv); - s = format (s, "sacked_bytes %u last_sacked_bytes %u", sb->sacked_bytes, - sb->last_sacked_bytes); - s = format (s, " max_byte_sacked %u\n", sb->max_byte_sacked); - s = format (s, "holes:\n"); + s = format (s, "sacked_bytes %u last_sacked_bytes %u lost_bytes %u\n", + sb->sacked_bytes, sb->last_sacked_bytes, sb->lost_bytes); + s = format (s, " last_bytes_delivered %u high_sacked %u snd_una_adv %u\n", + sb->last_bytes_delivered, sb->high_sacked, sb->snd_una_adv); + s = format (s, " cur_rxt_hole %u high_rxt %u rescue_rxt %u", + sb->cur_rxt_hole, sb->high_rxt, sb->rescue_rxt); + hole = scoreboard_first_hole (sb); + if (hole) + s = format (s, "\n head %u tail %u holes:\n", sb->head, sb->tail); + while (hole) { s = format (s, "%U", format_tcp_sack_hole, hole); @@ -736,7 +741,7 @@ tcp_snd_space (tcp_connection_t * tc) if (tcp_in_recovery (tc)) { tc->snd_nxt = tc->snd_una_max; - snd_space = tcp_available_wnd (tc) - tc->rtx_bytes + snd_space = tcp_available_wnd (tc) - tc->snd_rxt_bytes - (tc->snd_una_max - tc->snd_congestion); if (snd_space <= 0 || (tc->snd_una_max - tc->snd_una) >= tc->snd_wnd) return 0; @@ -744,8 +749,8 @@ tcp_snd_space (tcp_connection_t * tc) } /* If in fast recovery, send 1 SMSS if wnd allows */ - if (tcp_in_fastrecovery (tc) && tcp_available_snd_space (tc) - && tcp_fastrecovery_sent_1_smss (tc)) + if (tcp_in_fastrecovery (tc) + && tcp_available_snd_space (tc) && !tcp_fastrecovery_sent_1_smss (tc)) { tcp_fastrecovery_1_smss_on (tc); return tc->snd_mss; @@ -761,6 +766,12 @@ tcp_session_send_space (transport_connection_t * trans_conn) return tcp_snd_space (tc); } +i32 +tcp_rcv_wnd_available (tcp_connection_t * tc) +{ + return (i32) tc->rcv_wnd - (tc->rcv_nxt - tc->rcv_las); +} + u32 tcp_session_tx_fifo_offset (transport_connection_t * trans_conn) { diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index c3ebe22b..071f1ab1 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -34,6 +34,7 @@ #define TCP_MAX_RX_FIFO_SIZE 2 << 20 #define TCP_IW_N_SEGMENTS 10 #define TCP_ALWAYS_ACK 0 /**< If on, we always ack */ +#define TCP_USE_SACKS 1 /**< Disable only for testing */ /** TCP FSM state definitions as per RFC793. */ #define foreach_tcp_fsm_state \ @@ -94,7 +95,7 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; #define TCP_DELACK_TIME 1 /* 0.1s */ #define TCP_ESTABLISH_TIME 750 /* 75s */ #define TCP_2MSL_TIME 300 /* 30s */ -#define TCP_CLOSEWAIT_TIME 1 /* 0.1s */ +#define TCP_CLOSEWAIT_TIME 20 /* 0.1s */ #define TCP_CLEANUP_TIME 5 /* 0.5s Time to wait before cleanup */ #define TCP_TIMER_PERSIST_MIN 2 /* 0.2s */ @@ -157,6 +158,7 @@ typedef struct _sack_scoreboard_hole u32 prev; /**< Index for previous entry in linked list */ u32 start; /**< Start sequence number */ u32 end; /**< End sequence number */ + u8 is_lost; /**< Mark hole as lost */ } sack_scoreboard_hole_t; typedef struct _sack_scoreboard @@ -166,8 +168,13 @@ typedef struct _sack_scoreboard u32 tail; /**< Index of last entry */ u32 sacked_bytes; /**< Number of bytes sacked in sb */ u32 last_sacked_bytes; /**< Number of bytes last sacked */ + u32 last_bytes_delivered; /**< Number of sack bytes delivered */ u32 snd_una_adv; /**< Bytes to add to snd_una */ - u32 max_byte_sacked; /**< Highest byte acked */ + u32 high_sacked; /**< Highest byte sacked (fack) */ + u32 high_rxt; /**< Highest retransmitted sequence */ + u32 rescue_rxt; /**< Rescue sequence number */ + u32 lost_bytes; /**< Bytes lost as per RFC6675 */ + u32 cur_rxt_hole; /**< Retransmitting from this hole */ } sack_scoreboard_t; typedef enum _tcp_cc_algorithm_type @@ -211,7 +218,7 @@ typedef struct _tcp_connection u32 irs; /**< initial remote sequence */ /* Options */ - tcp_options_t opt; /**< TCP connection options parsed */ + tcp_options_t rcv_opts; /**< Rx options for connection */ tcp_options_t snd_opts; /**< Tx options for connection */ u8 snd_opts_len; /**< Tx options len */ u8 rcv_wscale; /**< Window scale to advertise to peer */ @@ -229,8 +236,10 @@ typedef struct _tcp_connection u32 cwnd; /**< Congestion window */ u32 ssthresh; /**< Slow-start threshold */ u32 prev_ssthresh; /**< ssthresh before congestion */ + u32 prev_cwnd; /**< ssthresh before congestion */ u32 bytes_acked; /**< Bytes acknowledged by current segment */ - u32 rtx_bytes; /**< Retransmitted bytes */ + u32 snd_rxt_bytes; /**< Retransmitted bytes */ + u32 snd_rxt_ts; /**< Timestamp when first packet is retransmitted */ u32 tsecr_last_ack; /**< Timestamp echoed to us in last healthy ACK */ u32 snd_congestion; /**< snd_una_max when congestion is detected */ tcp_cc_algorithm_t *cc_algo; /**< Congestion control algorithm */ @@ -411,6 +420,7 @@ void tcp_send_syn (tcp_connection_t * tc); void tcp_send_fin (tcp_connection_t * tc); void tcp_init_mss (tcp_connection_t * tc); void tcp_update_snd_mss (tcp_connection_t * tc); +void tcp_update_rto (tcp_connection_t * tc); always_inline u32 tcp_end_seq (tcp_header_t * th, u32 len) @@ -428,17 +438,39 @@ tcp_end_seq (tcp_header_t * th, u32 len) #define timestamp_lt(_t1, _t2) ((i32)((_t1)-(_t2)) < 0) #define timestamp_leq(_t1, _t2) ((i32)((_t1)-(_t2)) <= 0) +/** + * Our estimate of the number of bytes that have left the network + */ +always_inline u32 +tcp_bytes_out (const tcp_connection_t * tc) +{ + if (tcp_opts_sack_permitted (&tc->rcv_opts)) + return tc->sack_sb.sacked_bytes + tc->sack_sb.lost_bytes; + else + return tc->rcv_dupacks * tc->snd_mss; +} + +/** + * Our estimate of the number of bytes in flight (pipe size) + */ always_inline u32 tcp_flight_size (const tcp_connection_t * tc) { int flight_size; - flight_size = (int) ((tc->snd_una_max - tc->snd_una) + tc->rtx_bytes) - - (tc->rcv_dupacks * tc->snd_mss) /* - tc->sack_sb.sacked_bytes */ ; + flight_size = (int) (tc->snd_una_max - tc->snd_una) - tcp_bytes_out (tc) + + tc->snd_rxt_bytes; - /* Happens if we don't clear sacked bytes */ if (flight_size < 0) - return 0; + { + if (0) + clib_warning + ("Negative: %u %u %u dupacks %u sacked bytes %u flags %d", + tc->snd_una_max - tc->snd_una, tcp_bytes_out (tc), + tc->snd_rxt_bytes, tc->rcv_dupacks, tc->sack_sb.sacked_bytes, + tc->rcv_opts.flags); + return 0; + } return flight_size; } @@ -481,14 +513,17 @@ tcp_available_snd_space (const tcp_connection_t * tc) return available_wnd - flight_size; } -u32 tcp_rcv_wnd_available (tcp_connection_t * tc); +i32 tcp_rcv_wnd_available (tcp_connection_t * tc); u32 tcp_snd_space (tcp_connection_t * tc); void tcp_update_rcv_wnd (tcp_connection_t * tc); void tcp_retransmit_first_unacked (tcp_connection_t * tc); +void tcp_fast_retransmit_no_sack (tcp_connection_t * tc); +void tcp_fast_retransmit_sack (tcp_connection_t * tc); void tcp_fast_retransmit (tcp_connection_t * tc); -void tcp_cc_congestion (tcp_connection_t * tc); -void tcp_cc_recover (tcp_connection_t * tc); +void tcp_cc_init_congestion (tcp_connection_t * tc); +int tcp_cc_recover (tcp_connection_t * tc); +void tcp_cc_fastrecovery_exit (tcp_connection_t * tc); /* Made public for unit testing only */ void tcp_update_sack_list (tcp_connection_t * tc, u32 start, u32 end); @@ -563,16 +598,16 @@ tcp_retransmit_timer_set (tcp_connection_t * tc) } always_inline void -tcp_retransmit_timer_update (tcp_connection_t * tc) +tcp_retransmit_timer_reset (tcp_connection_t * tc) { - tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, - clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); + tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT); } always_inline void -tcp_retransmit_timer_reset (tcp_connection_t * tc) +tcp_retransmit_timer_force_update (tcp_connection_t * tc) { - tcp_timer_reset (tc, TCP_TIMER_RETRANSMIT); + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); } always_inline void @@ -598,15 +633,43 @@ tcp_persist_timer_reset (tcp_connection_t * tc) tcp_timer_reset (tc, TCP_TIMER_PERSIST); } +always_inline void +tcp_retransmit_timer_update (tcp_connection_t * tc) +{ + if (tc->snd_una == tc->snd_una_max) + { + tcp_retransmit_timer_reset (tc); + if (tc->snd_wnd < tc->snd_mss) + tcp_persist_timer_set (tc); + } + else + tcp_timer_update (tc, TCP_TIMER_RETRANSMIT, + clib_max (tc->rto * TCP_TO_TIMER_TICK, 1)); +} + always_inline u8 tcp_timer_is_active (tcp_connection_t * tc, tcp_timers_e timer) { return tc->timers[timer] != TCP_TIMER_HANDLE_INVALID; } +#define tcp_validate_txf_size(_tc, _a) \ + ASSERT(_tc->state != TCP_STATE_ESTABLISHED \ + || stream_session_tx_fifo_max_dequeue (&_tc->connection) >= _a) + void scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole); +void scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb); +sack_scoreboard_hole_t *scoreboard_insert_hole (sack_scoreboard_t * sb, + u32 prev_index, u32 start, + u32 end); +sack_scoreboard_hole_t *scoreboard_next_rxt_hole (sack_scoreboard_t * sb, + sack_scoreboard_hole_t * + start, u8 have_sent_1_smss, + u8 * can_rescue, + u8 * snd_limited); +void scoreboard_init_high_rxt (sack_scoreboard_t * sb); always_inline sack_scoreboard_hole_t * scoreboard_get_hole (sack_scoreboard_t * sb, u32 index) @@ -624,6 +687,14 @@ scoreboard_next_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) return 0; } +always_inline sack_scoreboard_hole_t * +scoreboard_prev_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) +{ + if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX) + return pool_elt_at_index (sb->holes, hole->prev); + return 0; +} + always_inline sack_scoreboard_hole_t * scoreboard_first_hole (sack_scoreboard_t * sb) { @@ -643,15 +714,19 @@ scoreboard_last_hole (sack_scoreboard_t * sb) always_inline void scoreboard_clear (sack_scoreboard_t * sb) { - sack_scoreboard_hole_t *hole = scoreboard_first_hole (sb); + sack_scoreboard_hole_t *hole; while ((hole = scoreboard_first_hole (sb))) { scoreboard_remove_hole (sb, hole); } sb->sacked_bytes = 0; sb->last_sacked_bytes = 0; + sb->last_bytes_delivered = 0; sb->snd_una_adv = 0; - sb->max_byte_sacked = 0; + sb->high_sacked = 0; + sb->high_rxt = 0; + sb->lost_bytes = 0; + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; } always_inline u32 @@ -671,6 +746,7 @@ scoreboard_init (sack_scoreboard_t * sb) { sb->head = TCP_INVALID_SACK_HOLE_INDEX; sb->tail = TCP_INVALID_SACK_HOLE_INDEX; + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; } void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack); diff --git a/src/vnet/tcp/tcp_debug.h b/src/vnet/tcp/tcp_debug.h index b4497a3b..3a16cf63 100755 --- a/src/vnet/tcp/tcp_debug.h +++ b/src/vnet/tcp/tcp_debug.h @@ -393,7 +393,7 @@ typedef enum _tcp_dbg_evt DECLARE_ETD(_tc, _e, 4); \ ed->data[0] = _seq - _tc->irs; \ ed->data[1] = _end - _tc->irs; \ - ed->data[2] = _tc->opt.tsval; \ + ed->data[2] = _tc->rcv_opts.tsval; \ ed->data[3] = _tc->tsval_recent; \ } @@ -427,27 +427,27 @@ typedef enum _tcp_dbg_evt { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "rtx: snd_nxt %u offset %u snd %u rtx %u", \ + .format = "rxt: snd_nxt %u offset %u snd %u rxt %u", \ .format_args = "i4i4i4i4", \ }; \ DECLARE_ETD(_tc, _e, 4); \ ed->data[0] = _tc->snd_nxt - _tc->iss; \ ed->data[1] = offset; \ ed->data[2] = n_bytes; \ - ed->data[3] = _tc->rtx_bytes; \ + ed->data[3] = _tc->snd_rxt_bytes; \ } #define TCP_EVT_CC_EVT_HANDLER(_tc, _sub_evt, ...) \ { \ ELOG_TYPE_DECLARE (_e) = \ { \ - .format = "cc: %s wnd %u snd_cong %u rtx_bytes %u", \ + .format = "cc: %s wnd %u snd_cong %u rxt_bytes %u", \ .format_args = "t4i4i4i4", \ .n_enum_strings = 5, \ .enum_strings = { \ - "fast-rtx", \ - "rtx-timeout", \ - "first-rtx", \ + "fast-rxt", \ + "rxt-timeout", \ + "first-rxt", \ "recovered", \ "congestion", \ }, \ @@ -456,7 +456,7 @@ typedef enum _tcp_dbg_evt ed->data[0] = _sub_evt; \ ed->data[1] = tcp_available_snd_space (_tc); \ ed->data[2] = _tc->snd_congestion - _tc->iss; \ - ed->data[3] = _tc->rtx_bytes; \ + ed->data[3] = _tc->snd_rxt_bytes; \ } #define TCP_EVT_CC_PACK_HANDLER(_tc, ...) \ diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 35bc9094..ff2229b3 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -231,8 +231,8 @@ tcp_options_parse (tcp_header_t * th, tcp_options_t * to) always_inline int tcp_segment_check_paws (tcp_connection_t * tc) { - return tcp_opts_tstamp (&tc->opt) && tc->tsval_recent - && timestamp_lt (tc->opt.tsval, tc->tsval_recent); + return tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent + && timestamp_lt (tc->rcv_opts.tsval, tc->tsval_recent); } /** @@ -248,10 +248,10 @@ tcp_update_timestamp (tcp_connection_t * tc, u32 seq, u32 seq_end) * then the TSval from the segment is copied to TS.Recent; * otherwise, the TSval is ignored. */ - if (tcp_opts_tstamp (&tc->opt) && tc->tsval_recent + if (tcp_opts_tstamp (&tc->rcv_opts) && tc->tsval_recent && seq_leq (seq, tc->rcv_las) && seq_leq (tc->rcv_las, seq_end)) { - tc->tsval_recent = tc->opt.tsval; + tc->tsval_recent = tc->rcv_opts.tsval; tc->tsval_recent_age = tcp_time_now (); } } @@ -272,14 +272,21 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, if (PREDICT_FALSE (!tcp_ack (th0) && !tcp_rst (th0) && !tcp_syn (th0))) return -1; - if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->opt))) + if (PREDICT_FALSE (tcp_options_parse (th0, &tc0->rcv_opts))) { return -1; } if (tcp_segment_check_paws (tc0)) { - clib_warning ("paws failed"); + if (CLIB_DEBUG > 2) + { + clib_warning ("paws failed\n%U", format_tcp_connection, tc0, 2); + clib_warning ("seq %u seq_end %u ack %u", + vnet_buffer (b0)->tcp.seq_number - tc0->irs, + vnet_buffer (b0)->tcp.seq_end - tc0->irs, + vnet_buffer (b0)->tcp.ack_number - tc0->iss); + } TCP_EVT_DBG (TCP_EVT_PAWS_FAIL, tc0, vnet_buffer (b0)->tcp.seq_number, vnet_buffer (b0)->tcp.seq_end); @@ -348,7 +355,6 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, /* If segment in window, save timestamp */ tcp_update_timestamp (tc0, vnet_buffer (b0)->tcp.seq_number, vnet_buffer (b0)->tcp.seq_end); - return 0; } @@ -391,6 +397,12 @@ tcp_estimate_rtt (tcp_connection_t * tc, u32 mrtt) } } +void +tcp_update_rto (tcp_connection_t * tc) +{ + tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); +} + /** Update RTT estimate and RTO timer * * Measure RTT: We have two sources of RTT measurements: TSOPT and ACK @@ -405,7 +417,7 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) u32 mrtt = 0; u8 rtx_acked; - /* Determine if only rtx bytes are acked. TODO fast retransmit */ + /* Determine if only rtx bytes are acked. TODO XXX fast retransmit */ rtx_acked = tc->rto_boff && (tc->bytes_acked <= tc->snd_mss); /* Karn's rule, part 1. Don't use retransmitted segments to estimate @@ -418,9 +430,10 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) * snd_una, i.e., the left side of the send window: * seq_lt (tc->snd_una, ack). Note: last condition could be dropped, we don't * try to update rtt for dupacks */ - else if (tcp_opts_tstamp (&tc->opt) && tc->opt.tsecr && tc->bytes_acked) + else if (tcp_opts_tstamp (&tc->rcv_opts) && tc->rcv_opts.tsecr + && tc->bytes_acked) { - mrtt = tcp_time_now () - tc->opt.tsecr; + mrtt = tcp_time_now () - tc->rcv_opts.tsecr; } /* Allow measuring of a new RTT */ @@ -436,7 +449,7 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) return 0; tcp_estimate_rtt (tc, mrtt); - tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); + tcp_update_rto (tc); return 0; } @@ -447,25 +460,46 @@ tcp_update_rtt (tcp_connection_t * tc, u32 ack) static void tcp_dequeue_acked (tcp_connection_t * tc, u32 ack) { - /* Dequeue the newly ACKed bytes */ - stream_session_dequeue_drop (&tc->connection, tc->bytes_acked); + /* Dequeue the newly ACKed add SACKed bytes */ + stream_session_dequeue_drop (&tc->connection, + tc->bytes_acked + tc->sack_sb.snd_una_adv); + + tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); /* Update rtt and rto */ tcp_update_rtt (tc, ack); + + /* If everything has been acked, stop retransmit timer + * otherwise update. */ + tcp_retransmit_timer_update (tc); } /** - * Check if dupack as per RFC5681 Sec. 2 - * - * This works only if called before updating snd_wnd. - * */ -always_inline u8 -tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 new_snd_wnd) + * Check if duplicate ack as per RFC5681 Sec. 2 + */ +static u8 +tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd, + u32 prev_snd_una) { - return ((vnet_buffer (b)->tcp.ack_number == tc->snd_una) + return ((vnet_buffer (b)->tcp.ack_number == prev_snd_una) && seq_gt (tc->snd_una_max, tc->snd_una) && (vnet_buffer (b)->tcp.seq_end == vnet_buffer (b)->tcp.seq_number) - && (new_snd_wnd == tc->snd_wnd)); + && (prev_snd_wnd == tc->snd_wnd)); +} + +/** + * Checks if ack is a congestion control event. + */ +static u8 +tcp_ack_is_cc_event (tcp_connection_t * tc, vlib_buffer_t * b, + u32 prev_snd_wnd, u32 prev_snd_una, u8 * is_dack) +{ + /* Check if ack is duplicate. Per RFC 6675, ACKs that SACK new data are + * defined to be 'duplicate' */ + *is_dack = tc->sack_sb.last_sacked_bytes + || tcp_ack_is_dupack (tc, b, prev_snd_wnd, prev_snd_una); + + return (*is_dack || tcp_in_cong_recovery (tc)); } void @@ -478,6 +512,10 @@ scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) next = pool_elt_at_index (sb->holes, hole->next); next->prev = hole->prev; } + else + { + sb->tail = hole->prev; + } if (hole->prev != TCP_INVALID_SACK_HOLE_INDEX) { @@ -489,6 +527,9 @@ scoreboard_remove_hole (sack_scoreboard_t * sb, sack_scoreboard_hole_t * hole) sb->head = hole->next; } + if (scoreboard_hole_index (sb, hole) == sb->cur_rxt_hole) + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + pool_put (sb->holes, hole); } @@ -527,26 +568,131 @@ scoreboard_insert_hole (sack_scoreboard_t * sb, u32 prev_index, return hole; } +void +scoreboard_update_lost (tcp_connection_t * tc, sack_scoreboard_t * sb) +{ + sack_scoreboard_hole_t *hole, *prev; + u32 bytes = 0, blks = 0; + + sb->lost_bytes = 0; + hole = scoreboard_last_hole (sb); + if (!hole) + return; + + if (seq_gt (sb->high_sacked, hole->end)) + { + bytes = sb->high_sacked - hole->end; + blks = 1; + } + + while ((prev = scoreboard_prev_hole (sb, hole)) + && (bytes < (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss + && blks < TCP_DUPACK_THRESHOLD)) + { + bytes += hole->start - prev->end; + blks++; + hole = prev; + } + + hole = prev; + while (hole) + { + sb->lost_bytes += scoreboard_hole_bytes (hole); + hole->is_lost = 1; + hole = scoreboard_prev_hole (sb, hole); + } +} + +/** + * Figure out the next hole to retransmit + * + * Follows logic proposed in RFC6675 Sec. 4, NextSeg() + */ +sack_scoreboard_hole_t * +scoreboard_next_rxt_hole (sack_scoreboard_t * sb, + sack_scoreboard_hole_t * start, + u8 have_sent_1_smss, + u8 * can_rescue, u8 * snd_limited) +{ + sack_scoreboard_hole_t *hole = 0; + + hole = start ? start : scoreboard_first_hole (sb); + while (hole && seq_leq (hole->end, sb->high_rxt) && hole->is_lost) + hole = scoreboard_next_hole (sb, hole); + + /* Nothing, return */ + if (!hole) + { + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + return 0; + } + + /* Rule (1): if higher than rxt, less than high_sacked and lost */ + if (hole->is_lost && seq_lt (hole->start, sb->high_sacked)) + { + sb->cur_rxt_hole = scoreboard_hole_index (sb, hole); + } + else + { + /* Rule (2): output takes care of transmitting new data */ + if (!have_sent_1_smss) + { + hole = 0; + sb->cur_rxt_hole = TCP_INVALID_SACK_HOLE_INDEX; + } + /* Rule (3): if hole not lost */ + else if (seq_lt (hole->start, sb->high_sacked)) + { + *snd_limited = 1; + sb->cur_rxt_hole = scoreboard_hole_index (sb, hole); + } + /* Rule (4): if hole beyond high_sacked */ + else + { + ASSERT (seq_geq (hole->start, sb->high_sacked)); + *snd_limited = 1; + *can_rescue = 1; + /* HighRxt MUST NOT be updated */ + return 0; + } + } + + if (hole && seq_lt (sb->high_rxt, hole->start)) + sb->high_rxt = hole->start; + + return hole; +} + +void +scoreboard_init_high_rxt (sack_scoreboard_t * sb) +{ + sack_scoreboard_hole_t *hole; + hole = scoreboard_first_hole (sb); + sb->high_rxt = hole->start; + sb->cur_rxt_hole = sb->head; +} + void tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { sack_scoreboard_t *sb = &tc->sack_sb; sack_block_t *blk, tmp; sack_scoreboard_hole_t *hole, *next_hole, *last_hole, *new_hole; - u32 blk_index = 0, old_sacked_bytes, delivered_bytes, hole_index; + u32 blk_index = 0, old_sacked_bytes, hole_index; int i, j; sb->last_sacked_bytes = 0; sb->snd_una_adv = 0; old_sacked_bytes = sb->sacked_bytes; - delivered_bytes = 0; + sb->last_bytes_delivered = 0; - if (!tcp_opts_sack (&tc->opt) && sb->head == TCP_INVALID_SACK_HOLE_INDEX) + if (!tcp_opts_sack (&tc->rcv_opts) + && sb->head == TCP_INVALID_SACK_HOLE_INDEX) return; /* Remove invalid blocks */ - blk = tc->opt.sacks; - while (blk < vec_end (tc->opt.sacks)) + blk = tc->rcv_opts.sacks; + while (blk < vec_end (tc->rcv_opts.sacks)) { if (seq_lt (blk->start, blk->end) && seq_gt (blk->start, tc->snd_una) @@ -555,7 +701,7 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) blk++; continue; } - vec_del1 (tc->opt.sacks, blk - tc->opt.sacks); + vec_del1 (tc->rcv_opts.sacks, blk - tc->rcv_opts.sacks); } /* Add block for cumulative ack */ @@ -563,20 +709,20 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { tmp.start = tc->snd_una; tmp.end = ack; - vec_add1 (tc->opt.sacks, tmp); + vec_add1 (tc->rcv_opts.sacks, tmp); } - if (vec_len (tc->opt.sacks) == 0) + if (vec_len (tc->rcv_opts.sacks) == 0) return; /* Make sure blocks are ordered */ - for (i = 0; i < vec_len (tc->opt.sacks); i++) - for (j = i + 1; j < vec_len (tc->opt.sacks); j++) - if (seq_lt (tc->opt.sacks[j].start, tc->opt.sacks[i].start)) + for (i = 0; i < vec_len (tc->rcv_opts.sacks); i++) + for (j = i + 1; j < vec_len (tc->rcv_opts.sacks); j++) + if (seq_lt (tc->rcv_opts.sacks[j].start, tc->rcv_opts.sacks[i].start)) { - tmp = tc->opt.sacks[i]; - tc->opt.sacks[i] = tc->opt.sacks[j]; - tc->opt.sacks[j] = tmp; + tmp = tc->rcv_opts.sacks[i]; + tc->rcv_opts.sacks[i] = tc->rcv_opts.sacks[j]; + tc->rcv_opts.sacks[j] = tmp; } if (sb->head == TCP_INVALID_SACK_HOLE_INDEX) @@ -585,25 +731,25 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) last_hole = scoreboard_insert_hole (sb, TCP_INVALID_SACK_HOLE_INDEX, tc->snd_una, tc->snd_una_max); sb->tail = scoreboard_hole_index (sb, last_hole); - tmp = tc->opt.sacks[vec_len (tc->opt.sacks) - 1]; - sb->max_byte_sacked = tmp.end; + tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1]; + sb->high_sacked = tmp.end; } else { /* If we have holes but snd_una_max is beyond the last hole, update * last hole end */ - tmp = tc->opt.sacks[vec_len (tc->opt.sacks) - 1]; + tmp = tc->rcv_opts.sacks[vec_len (tc->rcv_opts.sacks) - 1]; last_hole = scoreboard_last_hole (sb); - if (seq_gt (tc->snd_una_max, sb->max_byte_sacked) + if (seq_gt (tc->snd_una_max, sb->high_sacked) && seq_gt (tc->snd_una_max, last_hole->end)) last_hole->end = tc->snd_una_max; } /* Walk the holes with the SACK blocks */ hole = pool_elt_at_index (sb->holes, sb->head); - while (hole && blk_index < vec_len (tc->opt.sacks)) + while (hole && blk_index < vec_len (tc->rcv_opts.sacks)) { - blk = &tc->opt.sacks[blk_index]; + blk = &tc->rcv_opts.sacks[blk_index]; if (seq_leq (blk->start, hole->start)) { @@ -617,9 +763,9 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) { /* Bytes lost because snd_wnd left edge advances */ if (next_hole && seq_leq (next_hole->start, ack)) - delivered_bytes += next_hole->start - hole->end; + sb->last_bytes_delivered += next_hole->start - hole->end; else - delivered_bytes += ack - hole->end; + sb->last_bytes_delivered += ack - hole->end; } else { @@ -633,8 +779,8 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) last_hole = scoreboard_last_hole (sb); /* keep track of max byte sacked for when the last hole * is acked */ - if (seq_gt (hole->end, sb->max_byte_sacked)) - sb->max_byte_sacked = hole->end; + if (seq_gt (hole->end, sb->high_sacked)) + sb->high_sacked = hole->end; } /* snd_una needs to be advanced */ @@ -645,12 +791,12 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) sb->snd_una_adv = next_hole->start - ack; /* all these can be delivered */ - delivered_bytes += sb->snd_una_adv; + sb->last_bytes_delivered += sb->snd_una_adv; } else if (!next_hole) { - sb->snd_una_adv = sb->max_byte_sacked - ack; - delivered_bytes += sb->snd_una_adv; + sb->snd_una_adv = sb->high_sacked - ack; + sb->last_bytes_delivered += sb->snd_una_adv; } } @@ -691,28 +837,33 @@ tcp_rcv_sacks (tcp_connection_t * tc, u32 ack) } blk_index++; - hole = scoreboard_next_hole (sb, hole); } - else + else if (seq_leq (blk->start, hole->end)) { sb->sacked_bytes += hole->end - blk->start; hole->end = blk->start; - hole = scoreboard_next_hole (sb, hole); } + + hole = scoreboard_next_hole (sb, hole); } } sb->last_sacked_bytes = sb->sacked_bytes - old_sacked_bytes; - sb->sacked_bytes -= delivered_bytes; + sb->sacked_bytes -= sb->last_bytes_delivered; + scoreboard_update_lost (tc, sb); } -/** Update snd_wnd +/** + * Try to update snd_wnd based on feedback received from peer. * - * If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set - * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */ + * If successful, and new window is 'effectively' 0, activate persist + * timer. + */ static void tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) { + /* If (SND.WL1 < SEG.SEQ or (SND.WL1 = SEG.SEQ and SND.WL2 =< SEG.ACK)), set + * SND.WND <- SEG.WND, set SND.WL1 <- SEG.SEQ, and set SND.WL2 <- SEG.ACK */ if (seq_lt (tc->snd_wl1, seq) || (tc->snd_wl1 == seq && seq_leq (tc->snd_wl2, ack))) { @@ -721,138 +872,269 @@ tcp_update_snd_wnd (tcp_connection_t * tc, u32 seq, u32 ack, u32 snd_wnd) tc->snd_wl2 = ack; TCP_EVT_DBG (TCP_EVT_SND_WND, tc); - /* Set probe timer if we just got 0 wnd */ if (tc->snd_wnd < tc->snd_mss) { - if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST)) + /* Set persist timer if not set and we just got 0 wnd */ + if (!tcp_timer_is_active (tc, TCP_TIMER_PERSIST) + && !tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)) tcp_persist_timer_set (tc); } else - tcp_persist_timer_reset (tc); + { + tcp_persist_timer_reset (tc); + if (!tcp_in_recovery (tc) && tc->rto_boff > 0) + { + tc->rto_boff = 0; + tcp_update_rto (tc); + } + } } } void -tcp_cc_congestion (tcp_connection_t * tc) +tcp_cc_init_congestion (tcp_connection_t * tc) { - tc->snd_congestion = tc->snd_nxt; + tcp_fastrecovery_on (tc); + tc->snd_congestion = tc->snd_una_max; tc->cc_algo->congestion (tc); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 4); } -void -tcp_cc_recover (tcp_connection_t * tc) +static void +tcp_cc_recovery_exit (tcp_connection_t * tc) { - /* TODO: check if time to recover was small. It might be that RTO popped - * too soon. - */ + /* Deflate rto */ + tcp_update_rto (tc); + tc->rto_boff = 0; + tc->snd_rxt_ts = 0; + tcp_recovery_off (tc); +} +void +tcp_cc_fastrecovery_exit (tcp_connection_t * tc) +{ tc->cc_algo->recovered (tc); + tc->snd_rxt_bytes = 0; + tc->rcv_dupacks = 0; + tcp_fastrecovery_off (tc); + tcp_fastrecovery_1_smss_off (tc); +} - tc->rtx_bytes = 0; +static void +tcp_cc_congestion_undo (tcp_connection_t * tc) +{ + tc->cwnd = tc->prev_cwnd; + tc->ssthresh = tc->prev_ssthresh; + tc->snd_nxt = tc->snd_una_max; tc->rcv_dupacks = 0; - tc->snd_nxt = tc->snd_una; + if (tcp_in_recovery (tc)) + tcp_cc_recovery_exit (tc); + ASSERT (tc->rto_boff == 0); + /* TODO extend for fastrecovery */ +} - tc->cc_algo->rcv_ack (tc); - tc->tsecr_last_ack = tc->opt.tsecr; +static u8 +tcp_cc_is_spurious_retransmit (tcp_connection_t * tc) +{ + return (tc->snd_rxt_ts + && tcp_opts_tstamp (&tc->rcv_opts) + && timestamp_lt (tc->rcv_opts.tsecr, tc->snd_rxt_ts)); +} - tcp_cong_recovery_off (tc); +int +tcp_cc_recover (tcp_connection_t * tc) +{ + ASSERT (tcp_in_cong_recovery (tc)); + if (tcp_cc_is_spurious_retransmit (tc)) + { + tcp_cc_congestion_undo (tc); + return 1; + } + + if (tcp_in_recovery (tc)) + tcp_cc_recovery_exit (tc); + else if (tcp_in_fastrecovery (tc)) + tcp_cc_fastrecovery_exit (tc); + + ASSERT (tc->rto_boff == 0); + ASSERT (!tcp_in_cong_recovery (tc)); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 3); + return 0; } static void -tcp_cc_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b) +tcp_cc_update (tcp_connection_t * tc, vlib_buffer_t * b) +{ + ASSERT (!tcp_in_cong_recovery (tc)); + + /* Congestion avoidance */ + tc->cc_algo->rcv_ack (tc); + tc->tsecr_last_ack = tc->rcv_opts.tsecr; + + /* If a cumulative ack, make sure dupacks is 0 */ + tc->rcv_dupacks = 0; + + /* When dupacks hits the threshold we only enter fast retransmit if + * cumulative ack covers more than snd_congestion. Should snd_una + * wrap this test may fail under otherwise valid circumstances. + * Therefore, proactively update snd_congestion when wrap detected. */ + if (PREDICT_FALSE + (seq_leq (tc->snd_congestion, tc->snd_una - tc->bytes_acked) + && seq_gt (tc->snd_congestion, tc->snd_una))) + tc->snd_congestion = tc->snd_una - 1; +} + +static u8 +tcp_should_fastrecover_sack (tcp_connection_t * tc) { - u8 partial_ack; - u32 bytes_advanced; + return (TCP_DUPACK_THRESHOLD - 1) * tc->snd_mss < tc->sack_sb.sacked_bytes; +} - if (tcp_in_fastrecovery (tc)) +static u8 +tcp_should_fastrecover (tcp_connection_t * tc) +{ + return (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD + || tcp_should_fastrecover_sack (tc)); +} + +static void +tcp_cc_handle_event (tcp_connection_t * tc, u32 is_dack) +{ + /* + * Duplicate ACK. Check if we should enter fast recovery, or if already in + * it account for the bytes that left the network. + */ + if (is_dack) { - partial_ack = seq_lt (tc->snd_una, tc->snd_congestion); - if (!partial_ack) + ASSERT (tc->snd_una != tc->snd_una_max + || tc->sack_sb.last_sacked_bytes); + tc->rcv_dupacks++; + + if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD && !tc->bytes_acked) { - /* Clear retransmitted bytes. */ - tcp_cc_recover (tc); + ASSERT (tcp_in_fastrecovery (tc)); + /* Pure duplicate ack. If some data got acked, it's handled lower */ + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + return; } - else + else if (tcp_should_fastrecover (tc)) { - TCP_EVT_DBG (TCP_EVT_CC_PACK, tc); + /* Things are already bad */ + if (tcp_in_cong_recovery (tc)) + { + tc->rcv_dupacks = 0; + goto partial_ack_test; + } - /* Clear retransmitted bytes. XXX should we clear all? */ - tc->rtx_bytes = 0; + /* If of of the two conditions lower hold, reset dupacks + * 1) Cumulative ack does not cover more than congestion threshold + * 2) RFC6582 heuristic to avoid multiple fast retransmits + */ + if (seq_leq (tc->snd_una, tc->snd_congestion) + || tc->rcv_opts.tsecr != tc->tsecr_last_ack) + { + tc->rcv_dupacks = 0; + return; + } + + tcp_cc_init_congestion (tc); + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); - tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); + /* The first segment MUST be retransmitted */ + tcp_retransmit_first_unacked (tc); - /* In case snd_nxt is still in the past and output tries to - * shove some new bytes */ - tc->snd_nxt = tc->snd_una_max; + /* Post retransmit update cwnd to ssthresh and account for the + * three segments that have left the network and should've been + * buffered at the receiver XXX */ + tc->cwnd = tc->ssthresh + tc->rcv_dupacks * tc->snd_mss; - /* XXX need proper RFC6675 support */ - if (tc->sack_sb.last_sacked_bytes && !tcp_in_recovery (tc)) + /* If cwnd allows, send more data */ + if (tcp_opts_sack_permitted (&tc->rcv_opts) + && scoreboard_first_hole (&tc->sack_sb)) { - tcp_fast_retransmit (tc); + scoreboard_init_high_rxt (&tc->sack_sb); + tcp_fast_retransmit_sack (tc); } else { - /* Retransmit first unacked segment */ - tcp_retransmit_first_unacked (tc); + tcp_fast_retransmit_no_sack (tc); } + + return; } - } - else - { - tc->cc_algo->rcv_ack (tc); - tc->tsecr_last_ack = tc->opt.tsecr; - tc->rcv_dupacks = 0; - if (tcp_in_recovery (tc)) + else if (!tc->bytes_acked + || (tc->bytes_acked && !tcp_in_cong_recovery (tc))) { - bytes_advanced = tc->bytes_acked + tc->sack_sb.snd_una_adv; - tc->rtx_bytes -= clib_min (bytes_advanced, tc->rtx_bytes); - tc->rto = clib_min (tc->srtt + (tc->rttvar << 2), TCP_RTO_MAX); - if (seq_geq (tc->snd_una, tc->snd_congestion)) - { - tc->rtx_bytes = 0; - tcp_recovery_off (tc); - } + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + return; } + else + goto partial_ack; } -} -static void -tcp_cc_rcv_dupack (tcp_connection_t * tc, u32 ack) -{ -// ASSERT (seq_geq(tc->snd_una, ack)); +partial_ack_test: + + if (!tc->bytes_acked) + return; + +partial_ack: + /* + * Legitimate ACK. 1) See if we can exit recovery + */ + /* XXX limit this only to first partial ack? */ + tcp_retransmit_timer_update (tc); - tc->rcv_dupacks++; - if (tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) + if (seq_geq (tc->snd_una, tc->snd_congestion)) { - /* RFC6582 NewReno heuristic to avoid multiple fast retransmits */ - if (tc->opt.tsecr != tc->tsecr_last_ack) - { - tc->rcv_dupacks = 0; - return; - } + /* If spurious return, we've already updated everything */ + if (tcp_cc_recover (tc)) + return; + + tc->snd_nxt = tc->snd_una_max; - tcp_fastrecovery_on (tc); + /* Treat as congestion avoidance ack */ + tc->cc_algo->rcv_ack (tc); + tc->tsecr_last_ack = tc->rcv_opts.tsecr; + return; + } + + /* + * Legitimate ACK. 2) If PARTIAL ACK try to retransmit + */ + TCP_EVT_DBG (TCP_EVT_CC_PACK, tc); + + /* RFC6675: If the incoming ACK is a cumulative acknowledgment, + * reset dupacks to 0 */ + tc->rcv_dupacks = 0; - /* Handle congestion and dupack */ - tcp_cc_congestion (tc); - tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + tcp_retransmit_first_unacked (tc); - tcp_fast_retransmit (tc); + /* Post RTO timeout don't try anything fancy */ + if (tcp_in_recovery (tc)) + return; - /* Post retransmit update cwnd to ssthresh and account for the - * three segments that have left the network and should've been - * buffered at the receiver */ - tc->cwnd = tc->ssthresh + TCP_DUPACK_THRESHOLD * tc->snd_mss; + /* Remove retransmitted bytes that have been delivered */ + if (tc->sack_sb.last_bytes_delivered + && seq_gt (tc->sack_sb.high_rxt, tc->snd_una)) + { + /* If we have sacks and we haven't gotten an ack beyond high_rxt, + * remove sacked bytes delivered */ + tc->snd_rxt_bytes -= tc->sack_sb.last_bytes_delivered; } - else if (tc->rcv_dupacks > TCP_DUPACK_THRESHOLD) + else { - ASSERT (tcp_in_fastrecovery (tc)); - - tc->cc_algo->rcv_cong_ack (tc, TCP_CC_DUPACK); + /* Either all retransmitted holes have been acked, or we're + * "in the blind" and retransmitting segment by segment */ + tc->snd_rxt_bytes = 0; } + + tc->cc_algo->rcv_cong_ack (tc, TCP_CC_PARTIALACK); + + /* + * Since this was a partial ack, try to retransmit some more data + */ + tcp_fast_retransmit (tc); } void @@ -862,14 +1144,18 @@ tcp_cc_init (tcp_connection_t * tc) tc->cc_algo->init (tc); } +/** + * Process incoming ACK + */ static int tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, tcp_header_t * th, u32 * next, u32 * error) { - u32 new_snd_wnd; + u32 prev_snd_wnd, prev_snd_una; + u8 is_dack; /* If the ACK acks something not yet sent (SEG.ACK > SND.NXT) */ - if (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt)) + if (PREDICT_FALSE (seq_gt (vnet_buffer (b)->tcp.ack_number, tc->snd_nxt))) { /* If we have outstanding data and this is within the window, accept it, * probably retransmit has timed out. Otherwise ACK segment and then @@ -892,7 +1178,7 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, } /* If old ACK, probably it's an old dupack */ - if (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una)) + if (PREDICT_FALSE (seq_lt (vnet_buffer (b)->tcp.ack_number, tc->snd_una))) { *error = TCP_ERROR_ACK_OLD; TCP_EVT_DBG (TCP_EVT_ACK_RCV_ERR, tc, 1, @@ -900,54 +1186,50 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, if (tcp_in_fastrecovery (tc) && tc->rcv_dupacks == TCP_DUPACK_THRESHOLD) { TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc); - tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number); + tcp_cc_handle_event (tc, 1); } /* Don't drop yet */ return 0; } - if (tcp_opts_sack_permitted (&tc->opt)) - tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number); - - new_snd_wnd = clib_net_to_host_u16 (th->window) << tc->snd_wscale; - - if (tcp_ack_is_dupack (tc, b, new_snd_wnd)) - { - TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1); - tcp_cc_rcv_dupack (tc, vnet_buffer (b)->tcp.ack_number); - *error = TCP_ERROR_ACK_DUP; - return -1; - } - /* - * Valid ACK + * Looks okay, process feedback */ - tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una; - tc->snd_una = vnet_buffer (b)->tcp.ack_number + tc->sack_sb.snd_una_adv; + TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); + + if (tcp_opts_sack_permitted (&tc->rcv_opts)) + tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number); - /* Dequeue ACKed data and update RTT */ - tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number); + prev_snd_wnd = tc->snd_wnd; + prev_snd_una = tc->snd_una; tcp_update_snd_wnd (tc, vnet_buffer (b)->tcp.seq_number, - vnet_buffer (b)->tcp.ack_number, new_snd_wnd); + vnet_buffer (b)->tcp.ack_number, + clib_net_to_host_u16 (th->window) << tc->snd_wscale); + tc->bytes_acked = vnet_buffer (b)->tcp.ack_number - tc->snd_una; + tc->snd_una = vnet_buffer (b)->tcp.ack_number + tc->sack_sb.snd_una_adv; + tcp_validate_txf_size (tc, tc->bytes_acked); - /* If some of our sent bytes have been acked, update cc and retransmit - * timer. */ if (tc->bytes_acked) - { - TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); + tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number); - /* Updates congestion control (slow start/congestion avoidance) */ - tcp_cc_rcv_ack (tc, b); + /* + * Check if we have congestion event + */ - /* If everything has been acked, stop retransmit timer - * otherwise update. */ - if (tc->snd_una == tc->snd_una_max) - tcp_retransmit_timer_reset (tc); - else - tcp_retransmit_timer_update (tc); + if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack)) + { + tcp_cc_handle_event (tc, is_dack); + *error = TCP_ERROR_ACK_DUP; + TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1); + return vnet_buffer (b)->tcp.data_len ? 0 : -1; } + /* + * Update congestion control (slow start/congestion avoidance) + */ + tcp_cc_update (tc, b); + return 0; } @@ -1059,7 +1341,7 @@ tcp_session_enqueue_data (tcp_connection_t * tc, vlib_buffer_t * b, } /* Update SACK list if need be */ - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { /* Remove SACK blocks that have been delivered */ tcp_update_sack_list (tc, tc->rcv_nxt, tc->rcv_nxt); @@ -1097,7 +1379,7 @@ tcp_session_enqueue_ooo (tcp_connection_t * tc, vlib_buffer_t * b, TCP_EVT_DBG (TCP_EVT_INPUT, tc, 1, data_len, data_len); /* Update SACK list if in use */ - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { ooo_segment_t *newest; u32 start, end; @@ -1294,7 +1576,6 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, u32 n_left_to_next; vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); - while (n_left_from > 0 && n_left_to_next > 0) { u32 bi0; @@ -1321,7 +1602,6 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } th0 = tcp_buffer_hdr (b0); - is_fin = (th0->flags & TCP_FLAG_FIN) != 0; /* SYNs, FINs and data consume sequence numbers */ @@ -1387,7 +1667,6 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, errors = session_manager_flush_enqueue_events (my_thread_index); tcp_established_inc_counter (vm, is_ip4, TCP_ERROR_EVENT_FIFO_FULL, errors); - return from_frame->n_vectors; } @@ -1582,17 +1861,17 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, new_tc0->irs = seq0; /* Parse options */ - if (tcp_options_parse (tcp0, &new_tc0->opt)) + if (tcp_options_parse (tcp0, &new_tc0->rcv_opts)) goto drop; - if (tcp_opts_tstamp (&new_tc0->opt)) + if (tcp_opts_tstamp (&new_tc0->rcv_opts)) { - new_tc0->tsval_recent = new_tc0->opt.tsval; + new_tc0->tsval_recent = new_tc0->rcv_opts.tsval; new_tc0->tsval_recent_age = tcp_time_now (); } - if (tcp_opts_wscale (&new_tc0->opt)) - new_tc0->snd_wscale = new_tc0->opt.wscale; + if (tcp_opts_wscale (&new_tc0->rcv_opts)) + new_tc0->snd_wscale = new_tc0->rcv_opts.wscale; /* No scaling */ new_tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window); @@ -1845,7 +2124,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Initialize session variables */ tc0->snd_una = vnet_buffer (b0)->tcp.ack_number; tc0->snd_wnd = clib_net_to_host_u16 (tcp0->window) - << tc0->opt.wscale; + << tc0->rcv_opts.wscale; tc0->snd_wl1 = vnet_buffer (b0)->tcp.seq_number; tc0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; @@ -1903,13 +2182,21 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, break; case TCP_STATE_LAST_ACK: - /* The only thing that can arrive in this state is an + /* The only thing that [should] arrive in this state is an * acknowledgment of our FIN. If our FIN is now acknowledged, * delete the TCB, enter the CLOSED state, and return. */ if (!tcp_rcv_ack_is_acceptable (tc0, b0)) goto drop; + /* Apparently our FIN was lost */ + if (tcp_fin (tcp0)) + { + /* Don't "make" fin since that increments snd_nxt */ + tcp_send_fin (tc0); + goto drop; + } + tc0->state = TCP_STATE_CLOSED; /* Don't delete the connection/session yet. Instead, wait a @@ -1929,8 +2216,15 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * retransmission of the remote FIN. Acknowledge it, and restart * the 2 MSL timeout. */ - /* TODO */ + if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) + goto drop; + + tcp_make_ack (tc0, b0); + tcp_timer_reset (tc0, TCP_TIMER_WAITCLOSE); + tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + goto drop; + break; default: ASSERT (0); @@ -2194,7 +2488,7 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, goto drop; } - if (tcp_options_parse (th0, &child0->opt)) + if (tcp_options_parse (th0, &child0->rcv_opts)) { goto drop; } @@ -2205,14 +2499,14 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* RFC1323: TSval timestamps sent on {SYN} and {SYN,ACK} * segments are used to initialize PAWS. */ - if (tcp_opts_tstamp (&child0->opt)) + if (tcp_opts_tstamp (&child0->rcv_opts)) { - child0->tsval_recent = child0->opt.tsval; + child0->tsval_recent = child0->rcv_opts.tsval; child0->tsval_recent_age = tcp_time_now (); } - if (tcp_opts_wscale (&child0->opt)) - child0->snd_wscale = child0->opt.wscale; + if (tcp_opts_wscale (&child0->rcv_opts)) + child0->snd_wscale = child0->rcv_opts.wscale; /* No scaling */ child0->snd_wnd = clib_net_to_host_u16 (th0->window); @@ -2477,7 +2771,6 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_add_trace (vm, node, b0, sizeof (*t0)); tcp_set_rx_trace_data (t0, tc0, tcp0, b0, is_ip4); } - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, n_left_to_next, bi0, next0); } @@ -2600,7 +2893,13 @@ do { \ _(FIN_WAIT_2, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(LAST_ACK, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(LAST_ACK, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); _(LAST_ACK, TCP_FLAG_RST, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(TIME_WAIT, TCP_FLAG_FIN, TCP_INPUT_NEXT_RCV_PROCESS, TCP_ERROR_NONE); + _(TIME_WAIT, TCP_FLAG_FIN | TCP_FLAG_ACK, TCP_INPUT_NEXT_RCV_PROCESS, + TCP_ERROR_NONE); _(CLOSED, TCP_FLAG_ACK, TCP_INPUT_NEXT_RESET, TCP_ERROR_CONNECTION_CLOSED); _(CLOSED, TCP_FLAG_RST, TCP_INPUT_NEXT_DROP, TCP_ERROR_CONNECTION_CLOSED); #undef _ diff --git a/src/vnet/tcp/tcp_newreno.c b/src/vnet/tcp/tcp_newreno.c index 3525f4e5..c66250e4 100644 --- a/src/vnet/tcp/tcp_newreno.c +++ b/src/vnet/tcp/tcp_newreno.c @@ -51,9 +51,23 @@ newreno_rcv_cong_ack (tcp_connection_t * tc, tcp_cc_ack_t ack_type) } else if (ack_type == TCP_CC_PARTIALACK) { - tc->cwnd -= tc->bytes_acked; - if (tc->bytes_acked > tc->snd_mss) - tc->bytes_acked += tc->snd_mss; + /* RFC 6582 Sec. 3.2 */ + if (!tcp_opts_sack_permitted (&tc->rcv_opts)) + { + /* Deflate the congestion window by the amount of new data + * acknowledged by the Cumulative Acknowledgment field. + * If the partial ACK acknowledges at least one SMSS of new data, + * then add back SMSS bytes to the congestion window. This + * artificially inflates the congestion window in order to reflect + * the additional segment that has left the network. This "partial + * window deflation" attempts to ensure that, when fast recovery + * eventually ends, approximately ssthresh amount of data will be + * outstanding in the network.*/ + tc->cwnd = (tc->cwnd > tc->bytes_acked) ? + tc->cwnd - tc->bytes_acked : 0; + if (tc->bytes_acked > tc->snd_mss) + tc->cwnd += tc->snd_mss; + } } } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 49fd6bef..47c94e6d 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -136,10 +136,10 @@ tcp_update_rcv_wnd (tcp_connection_t * tc) * Figure out how much space we have available */ available_space = stream_session_max_rx_enqueue (&tc->connection); - max_fifo = stream_session_fifo_size (&tc->connection); + max_fifo = stream_session_rx_fifo_size (&tc->connection); - ASSERT (tc->opt.mss < max_fifo); - if (available_space < tc->opt.mss && available_space < max_fifo >> 3) + ASSERT (tc->rcv_opts.mss < max_fifo); + if (available_space < tc->rcv_opts.mss && available_space < max_fifo >> 3) available_space = 0; /* @@ -276,8 +276,11 @@ tcp_make_syn_options (tcp_options_t * opts, u8 wnd_scale) opts->tsecr = 0; len += TCP_OPTION_LEN_TIMESTAMP; - opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; - len += TCP_OPTION_LEN_SACK_PERMITTED; + if (TCP_USE_SACKS) + { + opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; + len += TCP_OPTION_LEN_SACK_PERMITTED; + } /* Align to needed boundary */ len += (TCP_OPTS_ALIGN - len % TCP_OPTS_ALIGN) % TCP_OPTS_ALIGN; @@ -293,14 +296,14 @@ tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) opts->mss = tc->mss; len += TCP_OPTION_LEN_MSS; - if (tcp_opts_wscale (&tc->opt)) + if (tcp_opts_wscale (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_WSCALE; opts->wscale = tc->rcv_wscale; len += TCP_OPTION_LEN_WINDOW_SCALE; } - if (tcp_opts_tstamp (&tc->opt)) + if (tcp_opts_tstamp (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_TSTAMP; opts->tsval = tcp_time_now (); @@ -308,7 +311,7 @@ tcp_make_synack_options (tcp_connection_t * tc, tcp_options_t * opts) len += TCP_OPTION_LEN_TIMESTAMP; } - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_SACK_PERMITTED; len += TCP_OPTION_LEN_SACK_PERMITTED; @@ -326,14 +329,14 @@ tcp_make_established_options (tcp_connection_t * tc, tcp_options_t * opts) opts->flags = 0; - if (tcp_opts_tstamp (&tc->opt)) + if (tcp_opts_tstamp (&tc->rcv_opts)) { opts->flags |= TCP_OPTS_FLAG_TSTAMP; opts->tsval = tcp_time_now (); opts->tsecr = tc->tsval_recent; len += TCP_OPTION_LEN_TIMESTAMP; } - if (tcp_opts_sack_permitted (&tc->opt)) + if (tcp_opts_sack_permitted (&tc->rcv_opts)) { if (vec_len (tc->snd_sacks)) { @@ -395,7 +398,7 @@ tcp_update_snd_mss (tcp_connection_t * tc) tcp_make_options (tc, &tc->snd_opts, TCP_STATE_ESTABLISHED); /* XXX check if MTU has been updated */ - tc->snd_mss = clib_min (tc->mss, tc->opt.mss) - tc->snd_opts_len; + tc->snd_mss = clib_min (tc->mss, tc->rcv_opts.mss) - tc->snd_opts_len; ASSERT (tc->snd_mss > 0); } @@ -406,21 +409,21 @@ tcp_init_mss (tcp_connection_t * tc) tcp_update_rcv_mss (tc); /* TODO cache mss and consider PMTU discovery */ - tc->snd_mss = clib_min (tc->opt.mss, tc->mss); + tc->snd_mss = clib_min (tc->rcv_opts.mss, tc->mss); if (tc->snd_mss < 45) { clib_warning ("snd mss is 0"); /* Assume that at least the min default mss works */ tc->snd_mss = default_min_mss; - tc->opt.mss = default_min_mss; + tc->rcv_opts.mss = default_min_mss; } /* We should have enough space for 40 bytes of options */ ASSERT (tc->snd_mss > 45); /* If we use timestamp option, account for it */ - if (tcp_opts_tstamp (&tc->opt)) + if (tcp_opts_tstamp (&tc->rcv_opts)) tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP; } @@ -879,6 +882,7 @@ tcp_send_fin (tcp_connection_t * tc) tcp_make_fin (tc, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); tc->flags |= TCP_CONN_FINSNT; + tcp_retransmit_timer_force_update (tc); TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); } @@ -919,10 +923,7 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, if (compute_opts) tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); - /* Write pre-computed options */ tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t); - - /* Get rcv window to advertise */ advertise_wnd = tcp_window_to_advertise (tc, next_state); flags = tcp_make_state_flags (next_state); @@ -930,26 +931,25 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, tc->rcv_nxt, tcp_hdr_opts_len, flags, advertise_wnd); - opts_write_len = tcp_options_write ((u8 *) (th + 1), &tc->snd_opts); ASSERT (opts_write_len == tc->snd_opts_len); - - /* Tag the buffer with the connection index */ vnet_buffer (b)->tcp.connection_index = tc->c_c_index; + /* + * Update connection variables + */ + tc->snd_nxt += data_len; tc->rcv_las = tc->rcv_nxt; /* TODO this is updated in output as well ... */ - if (tc->snd_nxt > tc->snd_una_max) - tc->snd_una_max = tc->snd_nxt; - - if (tc->rtt_ts == 0) + if (seq_gt (tc->snd_nxt, tc->snd_una_max)) { - tc->rtt_ts = tcp_time_now (); - tc->rtt_seq = tc->snd_nxt; + tc->snd_una_max = tc->snd_nxt; + tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); } + TCP_EVT_DBG (TCP_EVT_PKTIZE, tc); } @@ -987,13 +987,14 @@ tcp_timer_delack_handler (u32 index) * * @return the number of bytes in the segment or 0 if there's nothing to * retransmit - * */ + */ u32 tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, u32 offset, u32 max_bytes) { vlib_main_t *vm = vlib_get_main (); - u32 n_bytes = 0; + int n_bytes = 0; + u32 start; tcp_reuse_buffer (vm, b); @@ -1001,15 +1002,16 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, ASSERT (max_bytes != 0); max_bytes = clib_min (tc->snd_mss, max_bytes); + start = tc->snd_una + offset; /* Start is beyond snd_congestion */ - if (seq_geq (tc->snd_una + offset, tc->snd_congestion)) + if (seq_geq (start, tc->snd_congestion)) goto done; /* Don't overshoot snd_congestion */ - if (seq_gt (tc->snd_nxt + max_bytes, tc->snd_congestion)) + if (seq_gt (start + max_bytes, tc->snd_congestion)) { - max_bytes = tc->snd_congestion - tc->snd_nxt; + max_bytes = tc->snd_congestion - start; if (max_bytes == 0) goto done; } @@ -1021,15 +1023,12 @@ tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, n_bytes = stream_session_peek_bytes (&tc->connection, vlib_buffer_get_current (b), offset, max_bytes); - ASSERT (n_bytes != 0); + ASSERT (n_bytes > 0); b->current_length = n_bytes; tcp_push_hdr_i (tc, b, tc->state, 0); - /* Don't count multiple retransmits of the same segment */ - if (tc->rto_boff > 1) - goto done; - - tc->rtx_bytes += n_bytes; + if (tcp_in_fastrecovery (tc)) + tc->snd_rxt_bytes += n_bytes; done: TCP_EVT_DBG (TCP_EVT_CC_RTX, tc, offset, n_bytes); @@ -1042,18 +1041,15 @@ done: static void tcp_rtx_timeout_cc (tcp_connection_t * tc) { + tc->prev_ssthresh = tc->ssthresh; + tc->prev_cwnd = tc->cwnd; + /* Cleanly recover cc (also clears up fast retransmit) */ if (tcp_in_fastrecovery (tc)) - { - tcp_cc_recover (tc); - } - else - { - tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); - } + tcp_cc_fastrecovery_exit (tc); /* Start again from the beginning */ - + tc->ssthresh = clib_max (tcp_flight_size (tc) / 2, 2 * tc->snd_mss); tc->cwnd = tcp_loss_wnd (tc); tc->snd_congestion = tc->snd_una_max; tcp_recovery_on (tc); @@ -1081,18 +1077,31 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Make sure timer handle is set to invalid */ tc->timers[TCP_TIMER_RETRANSMIT] = TCP_TIMER_HANDLE_INVALID; + if (!tcp_in_recovery (tc) && tc->rto_boff > 0 + && tc->state >= TCP_STATE_ESTABLISHED) + { + tc->rto_boff = 0; + tcp_update_rto (tc); + } + /* Increment RTO backoff (also equal to number of retries) */ tc->rto_boff += 1; /* Go back to first un-acked byte */ tc->snd_nxt = tc->snd_una; - /* Get buffer */ tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); if (tc->state >= TCP_STATE_ESTABLISHED) { + /* Lost FIN, retransmit and return */ + if (tc->flags & TCP_CONN_FINSNT) + { + tcp_send_fin (tc); + return; + } + /* First retransmit timeout */ if (tc->rto_boff == 1) tcp_rtx_timeout_cc (tc); @@ -1102,24 +1111,30 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); - /* Send one segment. No fancy recovery for now! */ + /* Send one segment */ n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); + /* TODO be less aggressive about this */ scoreboard_clear (&tc->sack_sb); if (n_bytes == 0) { clib_warning ("could not retransmit anything"); + clib_warning ("%U", format_tcp_connection, tc, 2); + /* Try again eventually */ tcp_retransmit_timer_set (tc); + ASSERT (0 || (tc->rto_boff > 1 + && tc->snd_una == tc->snd_congestion)); return; } + + /* For first retransmit, record timestamp (Eifel detection RFC3522) */ + if (tc->rto_boff == 1) + tc->snd_rxt_ts = tcp_time_now (); } - else + /* Retransmit for SYN/SYNACK */ + else if (tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_SYN_SENT) { - /* Retransmit for SYN/SYNACK */ - ASSERT (tc->state == TCP_STATE_SYN_RCVD - || tc->state == TCP_STATE_SYN_SENT); - /* Try without increasing RTO a number of times. If this fails, * start growing RTO exponentially */ if (tc->rto_boff > TCP_RTO_SYN_RETRIES) @@ -1132,6 +1147,12 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Account for the SYN */ tc->snd_nxt += 1; } + else + { + ASSERT (tc->state == TCP_STATE_CLOSED); + clib_warning ("connection closed ..."); + return; + } if (!is_syn) { @@ -1180,7 +1201,8 @@ tcp_timer_persist_handler (u32 index) u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, n_bytes; + u32 bi, old_snd_nxt; + int n_bytes = 0; tc = tcp_connection_get_if_valid (index, thread_index); @@ -1202,13 +1224,15 @@ tcp_timer_persist_handler (u32 index) /* Try to force the first unsent segment */ tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); + + tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); n_bytes = stream_session_peek_bytes (&tc->connection, vlib_buffer_get_current (b), tc->snd_una_max - tc->snd_una, tc->snd_mss); /* Nothing to send */ - if (n_bytes == 0) + if (n_bytes <= 0) { clib_warning ("persist found nothing to send"); tcp_return_buffer (tm); @@ -1216,7 +1240,13 @@ tcp_timer_persist_handler (u32 index) } b->current_length = n_bytes; + ASSERT (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1 + || tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)); + + /* Allow updating of snd_una_max but don't update snd_nxt */ + old_snd_nxt = tc->snd_nxt; tcp_push_hdr_i (tc, b, tc->state, 0); + tc->snd_nxt = old_snd_nxt; tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); /* Re-enable persist timer */ @@ -1232,8 +1262,9 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc) tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); vlib_buffer_t *b; - u32 bi, n_bytes; + u32 bi, n_bytes, old_snd_nxt; + old_snd_nxt = tc->snd_nxt; tc->snd_nxt = tc->snd_una; /* Get buffer */ @@ -1244,75 +1275,117 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc) n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); if (n_bytes == 0) - goto done; + { + tcp_return_buffer (tm); + goto done; + } tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); done: - tc->snd_nxt = tc->snd_una_max; + tc->snd_nxt = old_snd_nxt; } -sack_scoreboard_hole_t * -scoreboard_first_rtx_hole (sack_scoreboard_t * sb) +/** + * Do fast retransmit with SACKs + */ +void +tcp_fast_retransmit_sack (tcp_connection_t * tc) { - sack_scoreboard_hole_t *hole = 0; - -// hole = scoreboard_first_hole (&tc->sack_sb); -// if (hole) -// { -// -// offset = hole->start - tc->snd_una; -// hole_size = hole->end - hole->start; -// -// ASSERT(hole_size); -// -// if (hole_size < max_bytes) -// max_bytes = hole_size; -// } - return hole; + tcp_main_t *tm = vnet_get_tcp_main (); + vlib_main_t *vm = vlib_get_main (); + u32 n_written = 0, offset = 0, max_bytes; + vlib_buffer_t *b; + sack_scoreboard_hole_t *hole; + sack_scoreboard_t *sb; + u32 bi, old_snd_nxt; + int snd_space; + u8 snd_limited = 0, can_rescue = 0; + + ASSERT (tcp_in_fastrecovery (tc)); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); + + old_snd_nxt = tc->snd_nxt; + sb = &tc->sack_sb; + snd_space = tcp_available_snd_space (tc); + + hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); + while (hole && snd_space > 0) + { + tcp_get_free_buffer_index (tm, &bi); + b = vlib_get_buffer (vm, bi); + + hole = scoreboard_next_rxt_hole (sb, hole, + tcp_fastrecovery_sent_1_smss (tc), + &can_rescue, &snd_limited); + if (!hole) + { + if (!can_rescue || !(seq_lt (sb->rescue_rxt, tc->snd_una) + || seq_gt (sb->rescue_rxt, + tc->snd_congestion))) + break; + + /* If rescue rxt undefined or less than snd_una then one segment of + * up to SMSS octets that MUST include the highest outstanding + * unSACKed sequence number SHOULD be returned, and RescueRxt set to + * RecoveryPoint. HighRxt MUST NOT be updated. + */ + max_bytes = clib_min (tc->snd_mss, snd_space); + offset = tc->snd_congestion - tc->snd_una - max_bytes; + sb->rescue_rxt = tc->snd_congestion; + tc->snd_nxt = tc->snd_una + offset; + tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + break; + } + + max_bytes = snd_limited ? tc->snd_mss : hole->end - sb->high_rxt; + offset = sb->high_rxt - tc->snd_una; + tc->snd_nxt = tc->snd_una + offset; + n_written = tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + + /* Nothing left to retransmit */ + if (n_written == 0) + { + tcp_return_buffer (tm); + break; + } + + sb->high_rxt += n_written; + tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + snd_space -= n_written; + } + + /* If window allows, send 1 SMSS of new data */ + tc->snd_nxt = old_snd_nxt; } /** - * Do fast retransmit. + * Fast retransmit without SACK info */ void -tcp_fast_retransmit (tcp_connection_t * tc) +tcp_fast_retransmit_no_sack (tcp_connection_t * tc) { tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - u32 bi; + u32 n_written = 0, offset = 0, bi, old_snd_nxt; int snd_space; - u32 n_written = 0, offset = 0; vlib_buffer_t *b; - u8 use_sacks = 0; ASSERT (tcp_in_fastrecovery (tc)); + TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); /* Start resending from first un-acked segment */ + old_snd_nxt = tc->snd_nxt; tc->snd_nxt = tc->snd_una; - snd_space = tcp_available_snd_space (tc); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 0); - - /* If we have SACKs use them */ - if (tcp_opts_sack_permitted (&tc->opt) - && scoreboard_first_hole (&tc->sack_sb)) - use_sacks = 0; while (snd_space > 0) { tcp_get_free_buffer_index (tm, &bi); b = vlib_get_buffer (vm, bi); - if (use_sacks) - { - scoreboard_first_rtx_hole (&tc->sack_sb); - } - else - { - offset += n_written; - } - + offset += n_written; n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space); /* Nothing left to retransmit */ @@ -1326,9 +1399,21 @@ tcp_fast_retransmit (tcp_connection_t * tc) snd_space -= n_written; } - /* If window allows, send 1 SMSS of new data */ - if (seq_lt (tc->snd_nxt, tc->snd_congestion)) - tc->snd_nxt = tc->snd_congestion; + /* Restore snd_nxt. If window allows, send 1 SMSS of new data */ + tc->snd_nxt = old_snd_nxt; +} + +/** + * Do fast retransmit + */ +void +tcp_fast_retransmit (tcp_connection_t * tc) +{ + if (tcp_opts_sack_permitted (&tc->rcv_opts) + && scoreboard_first_hole (&tc->sack_sb)) + tcp_fast_retransmit_sack (tc); + else + tcp_fast_retransmit_no_sack (tc); } always_inline u32 @@ -1544,6 +1629,12 @@ tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b) tc = (tcp_connection_t *) tconn; tcp_push_hdr_i (tc, b, TCP_STATE_ESTABLISHED, 0); + + if (tc->rtt_ts == 0) + { + tc->rtt_ts = tcp_time_now (); + tc->rtt_seq = tc->snd_nxt; + } return 0; } diff --git a/src/vnet/tcp/tcp_test.c b/src/vnet/tcp/tcp_test.c index 2af38484..3f8afa40 100644 --- a/src/vnet/tcp/tcp_test.c +++ b/src/vnet/tcp/tcp_test.c @@ -54,7 +54,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) tc->snd_una = 0; tc->snd_una_max = 1000; tc->snd_nxt = 1000; - tc->opt.flags |= TCP_OPTS_FLAG_SACK; + tc->rcv_opts.flags |= TCP_OPTS_FLAG_SACK; scoreboard_init (&tc->sack_sb); for (i = 0; i < 1000 / 100; i++) @@ -70,9 +70,9 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) for (i = 0; i < 1000 / 200; i++) { - vec_add1 (tc->opt.sacks, sacks[i * 2]); + vec_add1 (tc->rcv_opts.sacks, sacks[i * 2]); } - tc->opt.n_sack_blocks = vec_len (tc->opt.sacks); + tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); tcp_rcv_sacks (tc, 0); if (verbose) @@ -93,18 +93,17 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); TCP_TEST ((sb->last_sacked_bytes == 400), "last sacked bytes %d", sb->last_sacked_bytes); - TCP_TEST ((sb->max_byte_sacked == 900), - "max byte sacked %u", sb->max_byte_sacked); + TCP_TEST ((sb->high_sacked == 900), "max byte sacked %u", sb->high_sacked); /* * Inject odd blocks */ - vec_reset_length (tc->opt.sacks); + vec_reset_length (tc->rcv_opts.sacks); for (i = 0; i < 1000 / 200; i++) { - vec_add1 (tc->opt.sacks, sacks[i * 2 + 1]); + vec_add1 (tc->rcv_opts.sacks, sacks[i * 2 + 1]); } - tc->opt.n_sack_blocks = vec_len (tc->opt.sacks); + tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); tcp_rcv_sacks (tc, 0); if (verbose) @@ -118,8 +117,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) "first hole start %u end %u", hole->start, hole->end); TCP_TEST ((sb->sacked_bytes == 900), "sacked bytes %d", sb->sacked_bytes); TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv %u", sb->snd_una_adv); - TCP_TEST ((sb->max_byte_sacked == 1000), - "max sacked byte %u", sb->max_byte_sacked); + TCP_TEST ((sb->high_sacked == 1000), "max sacked byte %u", sb->high_sacked); TCP_TEST ((sb->last_sacked_bytes == 500), "last sacked bytes %d", sb->last_sacked_bytes); @@ -135,8 +133,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) "scoreboard has %d elements", pool_elts (sb->holes)); TCP_TEST ((sb->snd_una_adv == 900), "snd_una_adv after ack %u", sb->snd_una_adv); - TCP_TEST ((sb->max_byte_sacked == 1000), - "max sacked byte %u", sb->max_byte_sacked); + TCP_TEST ((sb->high_sacked == 1000), "max sacked byte %u", sb->high_sacked); TCP_TEST ((sb->sacked_bytes == 0), "sacked bytes %d", sb->sacked_bytes); TCP_TEST ((sb->last_sacked_bytes == 0), "last sacked bytes %d", sb->last_sacked_bytes); @@ -145,11 +142,11 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) * Add new block */ - vec_reset_length (tc->opt.sacks); + vec_reset_length (tc->rcv_opts.sacks); block.start = 1200; block.end = 1300; - vec_add1 (tc->opt.sacks, block); + vec_add1 (tc->rcv_opts.sacks, block); if (verbose) vlib_cli_output (vm, "add [1200, 1300]:\n%U", format_tcp_scoreboard, sb); @@ -171,8 +168,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) "first hole start %u end %u", hole->start, hole->end); TCP_TEST ((sb->snd_una_adv == 0), "snd_una_adv after ack %u", sb->snd_una_adv); - TCP_TEST ((sb->max_byte_sacked == 1300), - "max sacked byte %u", sb->max_byte_sacked); + TCP_TEST ((sb->high_sacked == 1300), "max sacked byte %u", sb->high_sacked); hole = scoreboard_last_hole (sb); TCP_TEST ((hole->start == 1300 && hole->end == 1500), "last hole start %u end %u", hole->start, hole->end); @@ -182,7 +178,7 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) * Ack first hole */ - vec_reset_length (tc->opt.sacks); + vec_reset_length (tc->rcv_opts.sacks); tcp_rcv_sacks (tc, 1200); if (verbose) @@ -196,8 +192,16 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) "scoreboard has %d elements", pool_elts (sb->holes)); /* - * Remove all + * Add some more blocks and then remove all */ + vec_reset_length (tc->rcv_opts.sacks); + for (i = 0; i < 5; i++) + { + block.start = i * 100 + 1200; + block.end = (i + 1) * 100 + 1200; + vec_add1 (tc->rcv_opts.sacks, block); + } + tcp_rcv_sacks (tc, 1900); scoreboard_clear (sb); if (verbose) @@ -205,6 +209,9 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) TCP_TEST ((pool_elts (sb->holes) == 0), "number of holes %d", pool_elts (sb->holes)); + TCP_TEST ((sb->head == TCP_INVALID_SACK_HOLE_INDEX), "head %u", sb->head); + TCP_TEST ((sb->tail == TCP_INVALID_SACK_HOLE_INDEX), "tail %u", sb->tail); + /* * Re-inject odd blocks and ack them all */ @@ -214,9 +221,9 @@ tcp_test_sack_rx (vlib_main_t * vm, unformat_input_t * input) tc->snd_nxt = 1000; for (i = 0; i < 5; i++) { - vec_add1 (tc->opt.sacks, sacks[i * 2 + 1]); + vec_add1 (tc->rcv_opts.sacks, sacks[i * 2 + 1]); } - tc->opt.n_sack_blocks = vec_len (tc->opt.sacks); + tc->rcv_opts.n_sack_blocks = vec_len (tc->rcv_opts.sacks); tcp_rcv_sacks (tc, 0); if (verbose) vlib_cli_output (vm, "sb added odd blocks and ack [0, 950]:\n%U", @@ -740,6 +747,10 @@ tcp_test_fifo1 (vlib_main_t * vm, unformat_input_t * input) TCP_TEST (0, "[%d] peeked %u expected %u", j, data_buf[j], data[j]); } + /* Try to peek beyond the data */ + rv = svm_fifo_peek (f, svm_fifo_max_dequeue (f), vec_len (data), data_buf); + TCP_TEST ((rv == 0), "peeked %u expected 0", rv); + vec_free (data_buf); svm_fifo_free (f); vec_free (test_data); @@ -1239,7 +1250,7 @@ tcp_test_session (vlib_main_t * vm, unformat_input_t * input) tc0->c_thread_index = 0; tc0->c_lcl_ip4.as_u32 = local.as_u32; tc0->c_rmt_ip4.as_u32 = remote.as_u32; - tc0->opt.mss = 1450; + tc0->rcv_opts.mss = 1450; tcp_connection_init_vars (tc0); TCP_EVT_DBG (TCP_EVT_OPEN, tc0); -- cgit 1.2.3-korg From a54230d4e79e088b13f581e301846fc3e259548e Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 21 Jun 2017 11:57:07 +0200 Subject: Add knob to specify effective group id (gid) for VPP process Change-Id: Icf9bd4abda058fb380f1a25d5fe3917ffb38b1c4 Signed-off-by: Damjan Marion --- src/vlib/unix/main.c | 7 +++++++ src/vppinfra/format.h | 3 +++ src/vppinfra/unix-formats.c | 26 ++++++++++++++++++++++++++ 3 files changed, 36 insertions(+) (limited to 'src/vlib') diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index e31ea815..ad1a7c3c 100644 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -48,6 +48,7 @@ #include #include #include +#include /** Default CLI pager limit is not configured in startup.conf */ #define UNIX_CLI_DEFAULT_PAGER_LIMIT 100000 @@ -313,6 +314,7 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) { unix_main_t *um = &unix_main; clib_error_t *error = 0; + gid_t gid; /* Defaults */ um->cli_pager_buffer_limit = UNIX_CLI_DEFAULT_PAGER_LIMIT; @@ -404,6 +406,11 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) vec_free (lv); } } + else if (unformat (input, "gid %U", unformat_unix_gid, &gid)) + { + if (setegid (gid) == -1) + return clib_error_return_unix (0, "setegid"); + } else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); diff --git a/src/vppinfra/format.h b/src/vppinfra/format.h index bec1b6b4..5b7023a3 100644 --- a/src/vppinfra/format.h +++ b/src/vppinfra/format.h @@ -310,6 +310,9 @@ void unformat_init_unix_file (unformat_input_t * input, int file_descriptor); /* Take input from Unix environment variable; returns 1 if variable exists zero otherwise. */ uword unformat_init_unix_env (unformat_input_t * input, char *var); + +/* Unformat unix group id (gid) specified as integer or string */ +unformat_function_t unformat_unix_gid; #endif /* CLIB_UNIX */ /* Test code. */ diff --git a/src/vppinfra/unix-formats.c b/src/vppinfra/unix-formats.c index a4c81ca2..91986516 100644 --- a/src/vppinfra/unix-formats.c +++ b/src/vppinfra/unix-formats.c @@ -49,6 +49,7 @@ #include #include +#include #include #include @@ -915,4 +916,29 @@ u8 * format_ucontext_pc (u8 * s, va_list * args) return format (s, "%p", regs[reg_no]); } +uword +unformat_unix_gid (unformat_input_t * input, va_list * args) +{ + gid_t *gid = va_arg (*args, gid_t *); + struct group *grp = 0; + int r; + u8 *s; + + if (unformat (input, "%d", &r)) + { + grp = getgrgid (r); + } + else if (unformat (input, "%s", &s)) + { + grp = getgrnam ((char *) s); + vec_free (s); + } + if (grp) + { + *gid = grp->gr_gid; + return 1; + } + return 0; +} + #endif /* __KERNEL__ */ -- cgit 1.2.3-korg From f6616388113efa59e5278c75e5223612d209d4a0 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 21 Jun 2017 12:01:37 +0200 Subject: Add option to create clib_socket with group write permissions Also allow group write as default for CLI socket connections. Change-Id: I6af1f277f70581358cd9241bf0f5cb0752fe250f Signed-off-by: Damjan Marion --- src/vlib/unix/cli.c | 5 ++++- src/vppinfra/socket.c | 9 +++++++++ src/vppinfra/socket.h | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'src/vlib') diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c index 74dea161..953d133c 100644 --- a/src/vlib/unix/cli.c +++ b/src/vlib/unix/cli.c @@ -57,6 +57,8 @@ #include #include #include +#include +#include /** ANSI escape code. */ #define ESC "\x1b" @@ -2640,7 +2642,8 @@ unix_cli_config (vlib_main_t * vm, unformat_input_t * input) /* CLI listen. */ unix_file_t template = { 0 }; - s->flags = SOCKET_IS_SERVER; /* listen, don't connect */ + s->flags = SOCKET_IS_SERVER | /* listen, don't connect */ + SOCKET_ALLOW_GROUP_WRITE; /* PF_LOCAL socket only */ error = clib_socket_init (s); if (error) diff --git a/src/vppinfra/socket.c b/src/vppinfra/socket.c index 99b353fc..4c23c235 100644 --- a/src/vppinfra/socket.c +++ b/src/vppinfra/socket.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -348,6 +349,14 @@ clib_socket_init (clib_socket_t * s) error = clib_error_return_unix (0, "listen"); goto done; } + if (addr.sa.sa_family == PF_LOCAL + && s->flags & SOCKET_ALLOW_GROUP_WRITE) + { + struct stat st = { 0 }; + stat (((struct sockaddr_un *) &addr)->sun_path, &st); + st.st_mode |= S_IWGRP; + chmod (((struct sockaddr_un *) &addr)->sun_path, st.st_mode); + } } else { diff --git a/src/vppinfra/socket.h b/src/vppinfra/socket.h index 08e22e7e..75037208 100644 --- a/src/vppinfra/socket.h +++ b/src/vppinfra/socket.h @@ -58,6 +58,7 @@ typedef struct _socket_t #define SOCKET_IS_SERVER (1 << 0) #define SOCKET_IS_CLIENT (0 << 0) #define SOCKET_NON_BLOCKING_CONNECT (1 << 1) +#define SOCKET_ALLOW_GROUP_WRITE (1 << 2) /* Read returned end-of-file. */ #define SOCKET_RX_END_OF_FILE (1 << 2) -- cgit 1.2.3-korg From 5c20a0131a6a2516c14d5ccfc6db90fd13ec8a33 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Tue, 13 Jun 2017 08:48:31 -0400 Subject: switch vlib process model to tw_timer_template timer impl Change-Id: I36bb47faea55a6fea7af7ee58d87d8f6dd28f93d Signed-off-by: Dave Barach --- src/vlib/main.c | 62 +++++++++++++++++---------- src/vlib/node.h | 15 ++++--- src/vlib/node_funcs.h | 28 +++++++------ src/vlib/unix/input.c | 61 +++++++++++---------------- src/vnet/lisp-cp/control.h | 1 + src/vppinfra/tw_timer_16t_1w_2048sl.h | 4 ++ src/vppinfra/tw_timer_16t_2w_512sl.h | 4 ++ src/vppinfra/tw_timer_1t_3w_1024sl_ov.h | 4 ++ src/vppinfra/tw_timer_2t_1w_2048sl.h | 4 ++ src/vppinfra/tw_timer_4t_3w_256sl.h | 4 ++ src/vppinfra/tw_timer_4t_3w_4sl_ov.h | 4 ++ src/vppinfra/tw_timer_template.c | 74 +++++++++++++++++++++++++++++++++ src/vppinfra/tw_timer_template.h | 9 ++++ 13 files changed, 197 insertions(+), 77 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/main.c b/src/vlib/main.c index 14f680e6..19d70232 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -41,6 +41,7 @@ #include #include #include +#include #include @@ -1341,9 +1342,16 @@ dispatch_process (vlib_main_t * vm, p->suspended_process_frame_index = pf - nm->suspended_process_frames; if (p->flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK) - timing_wheel_insert (&nm->timing_wheel, p->resume_cpu_time, - vlib_timing_wheel_data_set_suspended_process - (node->runtime_index)); + { + TWT (tw_timer_wheel) * tw = + (TWT (tw_timer_wheel) *) nm->timing_wheel; + p->stop_timer_handle = + TW (tw_timer_start) (tw, + vlib_timing_wheel_data_set_suspended_process + (node->runtime_index) /* [sic] pool idex */ , + 0 /* timer_id */ , + p->resume_clock_interval); + } } else p->flags &= ~VLIB_PROCESS_IS_RUNNING; @@ -1416,9 +1424,14 @@ dispatch_suspended_process (vlib_main_t * vm, n_vectors = 0; p->n_suspends += 1; if (p->flags & VLIB_PROCESS_IS_SUSPENDED_WAITING_FOR_CLOCK) - timing_wheel_insert (&nm->timing_wheel, p->resume_cpu_time, - vlib_timing_wheel_data_set_suspended_process - (node->runtime_index)); + { + p->stop_timer_handle = + TW (tw_timer_start) ((TWT (tw_timer_wheel) *) nm->timing_wheel, + vlib_timing_wheel_data_set_suspended_process + (node->runtime_index) /* [sic] pool idex */ , + 0 /* timer_id */ , + p->resume_clock_interval); + } } else { @@ -1465,17 +1478,6 @@ vlib_main_or_worker_loop (vlib_main_t * vm, int is_main) else cpu_time_now = clib_cpu_time_now (); - /* Arrange for first level of timing wheel to cover times we care - most about. */ - if (is_main) - { - nm->timing_wheel.min_sched_time = 10e-6; - nm->timing_wheel.max_sched_time = 10e-3; - timing_wheel_init (&nm->timing_wheel, - cpu_time_now, vm->clib_time.clocks_per_second); - vec_alloc (nm->data_from_advancing_timing_wheel, 32); - } - /* Pre-allocate interupt runtime indices and lock. */ vec_alloc (nm->pending_interrupt_node_runtime_indices, 32); vec_alloc (last_node_runtime_indices, 32); @@ -1561,12 +1563,15 @@ vlib_main_or_worker_loop (vlib_main_t * vm, int is_main) if (is_main) { /* Check if process nodes have expired from timing wheel. */ - nm->data_from_advancing_timing_wheel - = timing_wheel_advance (&nm->timing_wheel, cpu_time_now, - nm->data_from_advancing_timing_wheel, - &nm->cpu_time_next_process_ready); + ASSERT (nm->data_from_advancing_timing_wheel != 0); + + nm->data_from_advancing_timing_wheel = + TW (tw_timer_expire_timers_vec) + ((TWT (tw_timer_wheel) *) nm->timing_wheel, vlib_time_now (vm), + nm->data_from_advancing_timing_wheel); ASSERT (nm->data_from_advancing_timing_wheel != 0); + if (PREDICT_FALSE (_vec_len (nm->data_from_advancing_timing_wheel) > 0)) { @@ -1612,8 +1617,6 @@ vlib_main_or_worker_loop (vlib_main_t * vm, int is_main) dispatch_suspended_process (vm, di, cpu_time_now); } } - - /* Reset vector. */ _vec_len (nm->data_from_advancing_timing_wheel) = 0; } } @@ -1692,6 +1695,7 @@ int vlib_main (vlib_main_t * volatile vm, unformat_input_t * input) { clib_error_t *volatile error; + vlib_node_main_t *nm = &vm->node_main; vm->queue_signal_callback = dummy_queue_signal_callback; @@ -1746,6 +1750,18 @@ vlib_main (vlib_main_t * volatile vm, unformat_input_t * input) VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES, "default"); + nm->timing_wheel = clib_mem_alloc_aligned (sizeof (TWT (tw_timer_wheel)), + CLIB_CACHE_LINE_BYTES); + + vec_validate (nm->data_from_advancing_timing_wheel, 10); + _vec_len (nm->data_from_advancing_timing_wheel) = 0; + + /* Create the process timing wheel */ + TW (tw_timer_wheel_init) ((TWT (tw_timer_wheel) *) nm->timing_wheel, + 0 /* no callback */ , + 10e-6 /* timer period 10us */ , + ~0 /* max expirations per call */ ); + switch (clib_setjmp (&vm->main_loop_exit, VLIB_MAIN_LOOP_EXIT_NONE)) { case VLIB_MAIN_LOOP_EXIT_NONE: diff --git a/src/vlib/node.h b/src/vlib/node.h index 906d795f..77914272 100644 --- a/src/vlib/node.h +++ b/src/vlib/node.h @@ -43,7 +43,6 @@ #include #include #include -#include #include /* for vlib_trace_filter_t */ /* Forward declaration. */ @@ -542,8 +541,14 @@ typedef struct /* Pool of currently valid event types. */ vlib_process_event_type_t *event_type_pool; - /* When suspending saves cpu cycle counter when process is to be resumed. */ - u64 resume_cpu_time; + /* + * When suspending saves clock time (10us ticks) when process + * is to be resumed. + */ + u64 resume_clock_interval; + + /* Handle from timer code, to cancel an unexpired timer */ + u32 stop_timer_handle; /* Default output function and its argument for any CLI outputs within the process. */ @@ -664,7 +669,7 @@ typedef struct vlib_pending_frame_t *pending_frames; /* Timing wheel for scheduling time-based node dispatch. */ - timing_wheel_t timing_wheel; + void *timing_wheel; vlib_signal_timed_event_data_t *signal_timed_event_data_pool; @@ -672,7 +677,7 @@ typedef struct u32 *data_from_advancing_timing_wheel; /* CPU time of next process to be ready on timing wheel. */ - u64 cpu_time_next_process_ready; + f64 time_next_process_ready; /* Vector of process nodes. One for each node of type VLIB_NODE_TYPE_PROCESS. */ diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index 4d7cc192..d6588a74 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -46,6 +46,7 @@ #define included_vlib_node_funcs_h #include +#include /** \brief Get vlib node by index. @warning This function will ASSERT if @c i is out of range. @@ -428,14 +429,14 @@ vlib_current_process (vlib_main_t * vm) return vlib_get_current_process (vm)->node_runtime.node_index; } -/** Returns TRUE if a process suspend time is less than 1us +/** Returns TRUE if a process suspend time is less than 10us @param dt - remaining poll time in seconds - @returns 1 if dt < 1e-6, 0 otherwise + @returns 1 if dt < 10e-6, 0 otherwise */ always_inline uword vlib_process_suspend_time_is_zero (f64 dt) { - return dt < 1e-6; + return dt < 10e-6; } /** Suspend a vlib cooperative multi-tasking thread for a period of time @@ -450,7 +451,6 @@ vlib_process_suspend (vlib_main_t * vm, f64 dt) uword r; vlib_node_main_t *nm = &vm->node_main; vlib_process_t *p = vec_elt (nm->processes, nm->current_process_index); - u64 dt_cpu = dt * vm->clib_time.clocks_per_second; if (vlib_process_suspend_time_is_zero (dt)) return VLIB_PROCESS_RESUME_LONGJMP_RESUME; @@ -459,7 +459,8 @@ vlib_process_suspend (vlib_main_t * vm, f64 dt) r = clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND); if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND) { - p->resume_cpu_time = clib_cpu_time_now () + dt_cpu; + /* expiration time in 10us ticks */ + p->resume_clock_interval = dt * 1e5; clib_longjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_SUSPEND); } @@ -718,8 +719,7 @@ vlib_process_wait_for_event_or_clock (vlib_main_t * vm, f64 dt) r = clib_setjmp (&p->resume_longjmp, VLIB_PROCESS_RESUME_LONGJMP_SUSPEND); if (r == VLIB_PROCESS_RESUME_LONGJMP_SUSPEND) { - p->resume_cpu_time = (clib_cpu_time_now () - + (dt * vm->clib_time.clocks_per_second)); + p->resume_clock_interval = dt * 1e5; clib_longjmp (&p->return_longjmp, VLIB_PROCESS_RETURN_LONGJMP_SUSPEND); } @@ -834,7 +834,8 @@ vlib_process_signal_event_helper (vlib_node_main_t * nm, p->flags = p_flags | VLIB_PROCESS_RESUME_PENDING; vec_add1 (nm->data_from_advancing_timing_wheel, x); if (delete_from_wheel) - timing_wheel_delete (&nm->timing_wheel, x); + TW (tw_timer_stop) ((TWT (tw_timer_wheel) *) nm->timing_wheel, + p->stop_timer_handle); } return data_to_be_written_by_caller; @@ -895,7 +896,6 @@ vlib_process_signal_event_at_time (vlib_main_t * vm, else { vlib_signal_timed_event_data_t *te; - u64 dt_cpu = dt * vm->clib_time.clocks_per_second; pool_get_aligned (nm->signal_timed_event_data_pool, te, sizeof (te[0])); @@ -911,10 +911,12 @@ vlib_process_signal_event_at_time (vlib_main_t * vm, te->process_node_index = n->runtime_index; te->event_type_index = t; - timing_wheel_insert (&nm->timing_wheel, clib_cpu_time_now () + dt_cpu, - vlib_timing_wheel_data_set_timed_event (te - - nm-> - signal_timed_event_data_pool)); + p->stop_timer_handle = + TW (tw_timer_start) ((TWT (tw_timer_wheel) *) nm->timing_wheel, + vlib_timing_wheel_data_set_timed_event + (te - nm->signal_timed_event_data_pool), + 0 /* timer_id */ , + (vlib_time_now (vm) + dt) * 1e5); /* Inline data big enough to hold event? */ if (te->n_data_bytes < sizeof (te->inline_event_data)) diff --git a/src/vlib/unix/input.c b/src/vlib/unix/input.c index 73783d13..515dae94 100644 --- a/src/vlib/unix/input.c +++ b/src/vlib/unix/input.c @@ -40,6 +40,7 @@ #include #include #include +#include /* FIXME autoconf */ #define HAVE_LINUX_EPOLL @@ -113,56 +114,44 @@ linux_epoll_input (vlib_main_t * vm, { vlib_node_main_t *nm = &vm->node_main; - u64 t = nm->cpu_time_next_process_ready; + u32 ticks_until_expiration; f64 timeout; - int timeout_ms, max_timeout_ms = 10; + int timeout_ms = 0, max_timeout_ms = 10; f64 vector_rate = vlib_last_vectors_per_main_loop (vm); - if (t == ~0ULL) + /* If we're not working very hard, decide how long to sleep */ + if (vector_rate < 2 && vm->api_queue_nonempty == 0 + && nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] == 0) { - timeout = 10e-3; - timeout_ms = max_timeout_ms; - } - else - { - timeout = - (((i64) t - (i64) clib_cpu_time_now ()) - * vm->clib_time.seconds_per_clock) - /* subtract off some slop time */ - 50e-6; + ticks_until_expiration = TW (tw_timer_first_expires_in_ticks) + ((TWT (tw_timer_wheel) *) nm->timing_wheel); - if (timeout < 1e-3) + /* Nothing on the fast wheel, sleep 10ms */ + if (ticks_until_expiration == TW_SLOTS_PER_RING) { - /* We have event happenning in less than 1 ms so - don't allow epoll to wait */ - timeout_ms = 0; + timeout = 10e-3; + timeout_ms = max_timeout_ms; } else { - timeout_ms = timeout * 1e3; - - /* Must be between 1 and 10 ms. */ - timeout_ms = clib_max (1, timeout_ms); - timeout_ms = clib_min (max_timeout_ms, timeout_ms); + timeout = (f64) ticks_until_expiration *1e-5; + if (timeout < 1e-3) + timeout_ms = 0; + else + { + timeout_ms = timeout * 1e3; + /* Must be between 1 and 10 ms. */ + timeout_ms = clib_max (1, timeout_ms); + timeout_ms = clib_min (max_timeout_ms, timeout_ms); + } } + node->input_main_loops_per_call = 0; } - - /* If we still have input nodes polling (e.g. vnet packet generator) - don't sleep. */ - if (nm->input_node_counts_by_state[VLIB_NODE_STATE_POLLING] > 0) - timeout_ms = 0; - - /* - * When busy: don't wait & only epoll for input - * every 1024 times through main loop. - */ - if (vector_rate > 1 || vm->api_queue_nonempty) + else /* busy */ { - timeout_ms = 0; + /* Don't come back for a respectable number of dispatch cycles */ node->input_main_loops_per_call = 1024; } - else - /* We're not busy; go to sleep for a while. */ - node->input_main_loops_per_call = 0; /* Allow any signal to wakeup our sleep. */ { diff --git a/src/vnet/lisp-cp/control.h b/src/vnet/lisp-cp/control.h index 577035c4..0e63b3c7 100644 --- a/src/vnet/lisp-cp/control.h +++ b/src/vnet/lisp-cp/control.h @@ -19,6 +19,7 @@ #include #include #include +#include #define NUMBER_OF_RETRIES 1 #define PENDING_MREQ_EXPIRATION_TIME 3.0 /* seconds */ diff --git a/src/vppinfra/tw_timer_16t_1w_2048sl.h b/src/vppinfra/tw_timer_16t_1w_2048sl.h index 6edef17b..66cf7d37 100644 --- a/src/vppinfra/tw_timer_16t_1w_2048sl.h +++ b/src/vppinfra/tw_timer_16t_1w_2048sl.h @@ -25,6 +25,8 @@ #undef LOG2_TW_TIMERS_PER_OBJECT #undef TW_SUFFIX #undef TW_OVERFLOW_VECTOR +#undef TW_FAST_WHEEL_BITMAP +#undef TW_TIMER_ALLOW_DUPLICATE_STOP #define TW_TIMER_WHEELS 1 #define TW_SLOTS_PER_RING 2048 @@ -33,6 +35,8 @@ #define TW_TIMERS_PER_OBJECT 16 #define LOG2_TW_TIMERS_PER_OBJECT 4 #define TW_SUFFIX _16t_1w_2048sl +#define TW_FAST_WHEEL_BITMAP 0 +#define TW_TIMER_ALLOW_DUPLICATE_STOP 0 #include diff --git a/src/vppinfra/tw_timer_16t_2w_512sl.h b/src/vppinfra/tw_timer_16t_2w_512sl.h index 2497b31c..00587b8e 100644 --- a/src/vppinfra/tw_timer_16t_2w_512sl.h +++ b/src/vppinfra/tw_timer_16t_2w_512sl.h @@ -25,6 +25,8 @@ #undef LOG2_TW_TIMERS_PER_OBJECT #undef TW_SUFFIX #undef TW_OVERFLOW_VECTOR +#undef TW_FAST_WHEEL_BITMAP +#undef TW_TIMER_ALLOW_DUPLICATE_STOP #define TW_TIMER_WHEELS 2 #define TW_SLOTS_PER_RING 512 @@ -33,6 +35,8 @@ #define TW_TIMERS_PER_OBJECT 16 #define LOG2_TW_TIMERS_PER_OBJECT 4 #define TW_SUFFIX _16t_2w_512sl +#define TW_FAST_WHEEL_BITMAP 0 +#define TW_TIMER_ALLOW_DUPLICATE_STOP 0 #include diff --git a/src/vppinfra/tw_timer_1t_3w_1024sl_ov.h b/src/vppinfra/tw_timer_1t_3w_1024sl_ov.h index 7327f87b..e5e4cc19 100644 --- a/src/vppinfra/tw_timer_1t_3w_1024sl_ov.h +++ b/src/vppinfra/tw_timer_1t_3w_1024sl_ov.h @@ -25,6 +25,8 @@ #undef LOG2_TW_TIMERS_PER_OBJECT #undef TW_SUFFIX #undef TW_OVERFLOW_VECTOR +#undef TW_FAST_WHEEL_BITMAP +#undef TW_TIMER_ALLOW_DUPLICATE_STOP #define TW_TIMER_WHEELS 3 #define TW_SLOTS_PER_RING 1024 @@ -34,6 +36,8 @@ #define LOG2_TW_TIMERS_PER_OBJECT 0 #define TW_SUFFIX _1t_3w_1024sl_ov #define TW_OVERFLOW_VECTOR 1 +#define TW_FAST_WHEEL_BITMAP 1 +#define TW_TIMER_ALLOW_DUPLICATE_STOP 1 #include diff --git a/src/vppinfra/tw_timer_2t_1w_2048sl.h b/src/vppinfra/tw_timer_2t_1w_2048sl.h index 33b74405..98b548b3 100644 --- a/src/vppinfra/tw_timer_2t_1w_2048sl.h +++ b/src/vppinfra/tw_timer_2t_1w_2048sl.h @@ -25,6 +25,8 @@ #undef LOG2_TW_TIMERS_PER_OBJECT #undef TW_SUFFIX #undef TW_OVERFLOW_VECTOR +#undef TW_FAST_WHEEL_BITMAP +#undef TW_TIMER_ALLOW_DUPLICATE_STOP #define TW_TIMER_WHEELS 1 #define TW_SLOTS_PER_RING 2048 @@ -33,6 +35,8 @@ #define TW_TIMERS_PER_OBJECT 2 #define LOG2_TW_TIMERS_PER_OBJECT 1 #define TW_SUFFIX _2t_1w_2048sl +#define TW_FAST_WHEEL_BITMAP 0 +#define TW_TIMER_ALLOW_DUPLICATE_STOP 0 #include diff --git a/src/vppinfra/tw_timer_4t_3w_256sl.h b/src/vppinfra/tw_timer_4t_3w_256sl.h index 89adb7a2..07203de8 100644 --- a/src/vppinfra/tw_timer_4t_3w_256sl.h +++ b/src/vppinfra/tw_timer_4t_3w_256sl.h @@ -25,6 +25,8 @@ #undef LOG2_TW_TIMERS_PER_OBJECT #undef TW_SUFFIX #undef TW_OVERFLOW_VECTOR +#undef TW_FAST_WHEEL_BITMAP +#undef TW_TIMER_ALLOW_DUPLICATE_STOP #define TW_TIMER_WHEELS 3 #define TW_SLOTS_PER_RING 256 @@ -33,6 +35,8 @@ #define TW_TIMERS_PER_OBJECT 4 #define LOG2_TW_TIMERS_PER_OBJECT 2 #define TW_SUFFIX _4t_3w_256sl +#define TW_FAST_WHEEL_BITMAP 0 +#define TW_TIMER_ALLOW_DUPLICATE_STOP 0 #include diff --git a/src/vppinfra/tw_timer_4t_3w_4sl_ov.h b/src/vppinfra/tw_timer_4t_3w_4sl_ov.h index 0f76164d..20a01d05 100644 --- a/src/vppinfra/tw_timer_4t_3w_4sl_ov.h +++ b/src/vppinfra/tw_timer_4t_3w_4sl_ov.h @@ -25,6 +25,8 @@ #undef LOG2_TW_TIMERS_PER_OBJECT #undef TW_SUFFIX #undef TW_OVERFLOW_VECTOR +#undef TW_FAST_WHEEL_BITMAP +#undef TW_TIMER_ALLOW_DUPLICATE_STOP #define TW_TIMER_WHEELS 3 #define TW_SLOTS_PER_RING 4 @@ -34,6 +36,8 @@ #define LOG2_TW_TIMERS_PER_OBJECT 2 #define TW_SUFFIX _4t_3w_4sl_ov #define TW_OVERFLOW_VECTOR 1 +#define TW_FAST_WHEEL_BITMAP 0 +#define TW_TIMER_ALLOW_DUPLICATE_STOP 0 #include diff --git a/src/vppinfra/tw_timer_template.c b/src/vppinfra/tw_timer_template.c index 9253488c..c0a9685a 100644 --- a/src/vppinfra/tw_timer_template.c +++ b/src/vppinfra/tw_timer_template.c @@ -204,6 +204,11 @@ TW (tw_timer_start) (TWT (tw_timer_wheel) * tw, u32 pool_index, u32 timer_id, ts = &tw->w[TW_TIMER_RING_FAST][fast_ring_offset]; timer_addhead (tw->timers, ts->head_index, t - tw->timers); + +#if TW_FAST_WHEEL_BITMAP + tw->fast_slot_bitmap = clib_bitmap_set (tw->fast_slot_bitmap, + fast_ring_offset, 1); +#endif return t - tw->timers; } @@ -251,6 +256,16 @@ void TW (tw_timer_stop) (TWT (tw_timer_wheel) * tw, u32 handle) { TWT (tw_timer) * t; +#if TW_TIMER_ALLOW_DUPLICATE_STOP + /* + * A vlib process may have its timer expire, and receive + * an event before the expiration is processed. + * That results in a duplicate tw_timer_stop. + */ + if (pool_is_free_index (tw->timers, handle)) + return; +#endif + t = pool_elt_at_index (tw->timers, handle); /* in case of idiotic handle (e.g. passing a listhead index) */ @@ -481,6 +496,11 @@ static inline { ts = &tw->w[TW_TIMER_RING_FAST][t->fast_ring_offset]; timer_addhead (tw->timers, ts->head_index, t - tw->timers); +#if TW_FAST_WHEEL_BITMAP + tw->fast_slot_bitmap = + clib_bitmap_set (tw->fast_slot_bitmap, + t->fast_ring_offset, 1); +#endif } } } @@ -523,6 +543,11 @@ static inline { ts = &tw->w[TW_TIMER_RING_FAST][t->fast_ring_offset]; timer_addhead (tw->timers, ts->head_index, t - tw->timers); +#if TW_FAST_WHEEL_BITMAP + tw->fast_slot_bitmap = + clib_bitmap_set (tw->fast_slot_bitmap, + t->fast_ring_offset, 1); +#endif } else /* typical case */ { @@ -569,6 +594,11 @@ static inline /* Add to fast ring */ ts = &tw->w[TW_TIMER_RING_FAST][t->fast_ring_offset]; timer_addhead (tw->timers, ts->head_index, t - tw->timers); +#if TW_FAST_WHEEL_BITMAP + tw->fast_slot_bitmap = + clib_bitmap_set (tw->fast_slot_bitmap, + t->fast_ring_offset, 1); +#endif } } } @@ -604,6 +634,12 @@ static inline } tw->expired_timer_handles = callback_vector; } + +#if TW_FAST_WHEEL_BITMAP + tw->fast_slot_bitmap = clib_bitmap_set (tw->fast_slot_bitmap, + fast_wheel_index, 0); +#endif + tw->current_tick++; fast_wheel_index++; tw->current_index[TW_TIMER_RING_FAST] = fast_wheel_index; @@ -642,6 +678,44 @@ u32 *TW (tw_timer_expire_timers_vec) (TWT (tw_timer_wheel) * tw, f64 now, return TW (tw_timer_expire_timers_internal) (tw, now, vec); } +#if TW_FAST_WHEEL_BITMAP +/** Returns an approximation to the first timer expiration in + * timer-ticks from "now". To avoid wasting an unjustifiable + * amount of time on the problem, we maintain an approximate fast-wheel slot + * occupancy bitmap. We don't worry about clearing fast wheel bits + * when timers are removed from fast wheel slots. + */ + +u32 TW (tw_timer_first_expires_in_ticks) (TWT (tw_timer_wheel) * tw) +{ + u32 first_expiring_index, fast_ring_index; + i32 delta; + + if (clib_bitmap_is_zero (tw->fast_slot_bitmap)) + return TW_SLOTS_PER_RING; + + fast_ring_index = tw->current_index[TW_TIMER_RING_FAST]; + if (fast_ring_index == TW_SLOTS_PER_RING) + fast_ring_index = 0; + + first_expiring_index = clib_bitmap_next_set (tw->fast_slot_bitmap, + fast_ring_index); + if (first_expiring_index == ~0 && fast_ring_index != 0) + first_expiring_index = clib_bitmap_first_set (tw->fast_slot_bitmap); + + ASSERT (first_expiring_index != ~0); + + delta = (i32) first_expiring_index - (i32) fast_ring_index; + if (delta < 0) + delta += TW_SLOTS_PER_RING; + + ASSERT (delta >= 0); + + return (u32) delta; +} + +#endif + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vppinfra/tw_timer_template.h b/src/vppinfra/tw_timer_template.h index 76755609..0404e3f4 100644 --- a/src/vppinfra/tw_timer_template.h +++ b/src/vppinfra/tw_timer_template.h @@ -19,6 +19,7 @@ #include #include +#include #ifndef _twt #define _twt(a,b) a##b##_t @@ -202,6 +203,11 @@ typedef struct tw_timer_wheel_slot_t overflow; #endif +#if TW_FAST_WHEEL_BITMAP > 0 + /** Fast wheel slot occupancy bitmap */ + uword *fast_slot_bitmap; +#endif + /** expired timer callback, receives a vector of handles */ void (*expired_timer_callback) (u32 * expired_timer_handles); @@ -226,6 +232,9 @@ void TW (tw_timer_wheel_free) (TWT (tw_timer_wheel) * tw); u32 *TW (tw_timer_expire_timers) (TWT (tw_timer_wheel) * tw, f64 now); u32 *TW (tw_timer_expire_timers_vec) (TWT (tw_timer_wheel) * tw, f64 now, u32 * vec); +#if TW_FAST_WHEEL_BITMAP +u32 TW (tw_timer_first_expires_in_ticks) (TWT (tw_timer_wheel) * tw); +#endif /* * fd.io coding-style-patch-verification: ON -- cgit 1.2.3-korg From bb620d74b247f419eb485886c55148099b0213bb Mon Sep 17 00:00:00 2001 From: Neale Ranns Date: Thu, 29 Jun 2017 00:19:08 -0700 Subject: VPP debug image with worker threads hit assert on adding IP route with traffic (VPP-892) When stacking DPOs the VLIB graph is also updated to add the edge between the nodes, if this edge does not yet exist. This addition should be done with the workers stopped. Change-Id: I327e4d7d26f0b23eb280f17e4619ff2093ff7940 Signed-off-by: Neale Ranns (cherry picked from commit c02bd03ddf5eec9e9c79811360685f13e4ba8ee1) --- src/vlib/node.c | 20 ++++++++++++++++++++ src/vlib/node_funcs.h | 3 +++ src/vnet/dpo/dpo.c | 17 ++++++++++++++++- 3 files changed, 39 insertions(+), 1 deletion(-) (limited to 'src/vlib') diff --git a/src/vlib/node.c b/src/vlib/node.c index eecad274..2cda0f06 100644 --- a/src/vlib/node.c +++ b/src/vlib/node.c @@ -151,6 +151,26 @@ vlib_node_runtime_update (vlib_main_t * vm, u32 node_index, u32 next_index) vlib_worker_thread_barrier_release (vm); } +uword +vlib_node_get_next (vlib_main_t * vm, uword node_index, uword next_node_index) +{ + vlib_node_main_t *nm = &vm->node_main; + vlib_node_t *node; + uword *p; + + node = vec_elt (nm->nodes, node_index); + + /* Runtime has to be initialized. */ + ASSERT (nm->flags & VLIB_NODE_MAIN_RUNTIME_STARTED); + + if ((p = hash_get (node->next_slot_by_node, next_node_index))) + { + return p[0]; + } + + return (~0); +} + /* Add next node to given node in given slot. */ uword vlib_node_add_next_with_slot (vlib_main_t * vm, diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index d6588a74..c0389b2f 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -1070,6 +1070,9 @@ vlib_node_vectors_per_main_loop_as_integer (vlib_main_t * vm, u32 node_index) void vlib_frame_free (vlib_main_t * vm, vlib_node_runtime_t * r, vlib_frame_t * f); +/* Return the edge index if present, ~0 otherwise */ +uword vlib_node_get_next (vlib_main_t * vm, uword node, uword next_node); + /* Add next node to given node in given slot. */ uword vlib_node_add_next_with_slot (vlib_main_t * vm, diff --git a/src/vnet/dpo/dpo.c b/src/vnet/dpo/dpo.c index 28aa0c23..389f995b 100644 --- a/src/vnet/dpo/dpo.c +++ b/src/vnet/dpo/dpo.c @@ -327,6 +327,8 @@ dpo_get_next_node (dpo_type_t child_type, vm = vlib_get_main(); + vlib_worker_thread_barrier_sync(vm); + ASSERT(NULL != dpo_nodes[child_type]); ASSERT(NULL != dpo_nodes[child_type][child_proto]); ASSERT(NULL != dpo_nodes[parent_type]); @@ -368,6 +370,8 @@ dpo_get_next_node (dpo_type_t child_type, } cc++; } + + vlib_worker_thread_barrier_release(vm); } return (dpo_edges[child_type][child_proto][parent_type][parent_proto]); @@ -445,10 +449,21 @@ dpo_stack_from_node (u32 child_node_index, parent_node = vlib_get_node_by_name(vm, (u8*) dpo_nodes[parent_type][parent_proto][0]); - edge = vlib_node_add_next(vm, + edge = vlib_node_get_next(vm, child_node_index, parent_node->index); + if (~0 == edge) + { + vlib_worker_thread_barrier_sync(vm); + + edge = vlib_node_add_next(vm, + child_node_index, + parent_node->index); + + vlib_worker_thread_barrier_release(vm); + } + dpo_stack_i(edge, dpo, parent); } -- cgit 1.2.3-korg From 7447d07719bb4c9c58501d78aa4a453d6a044863 Mon Sep 17 00:00:00 2001 From: Chris Luke Date: Wed, 5 Jul 2017 12:57:10 -0400 Subject: Buffer name inconsistently used a cstring/vec (VPP-901) Spotted in the output of CLI command "show buffers", the name field sometimes had trailing garbage, the hall sign of a string not being terminated. In this case it was being inconsistently used as a cstring or a vec. - CLI printf needs %v to print the vec srring - vlib_buffer_create_free_list_helper tried to use clib_mem_is_heap_object() to detect a vec object, wheras it should use clib_mem_is_vec() Change-Id: Ib8b242a0c5a18924b8af7e8e1432784eebcf572c Signed-off-by: Chris Luke --- src/vlib/buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index be3b41ef..e50064ae 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -383,7 +383,7 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, f->index = f - bm->buffer_free_list_pool; f->n_data_bytes = vlib_buffer_round_size (n_data_bytes); f->min_n_buffers_each_physmem_alloc = VLIB_FRAME_SIZE; - f->name = clib_mem_is_heap_object (name) ? name : format (0, "%s", name); + f->name = clib_mem_is_vec (name) ? name : format (0, "%s", name); /* Setup free buffer template. */ f->buffer_init_template.free_list_index = f->index; @@ -774,7 +774,7 @@ vlib_packet_template_init (vlib_main_t * vm, { vlib_buffer_main_t *bm = vm->buffer_main; va_list va; - __attribute__ ((unused)) u8 *name; + u8 *name; vlib_buffer_free_list_t *fl; va_start (va, fmt); @@ -962,7 +962,7 @@ format_vlib_buffer_free_list (u8 * s, va_list * va) bytes_alloc = size * f->n_alloc; bytes_free = size * n_free; - s = format (s, "%7d%30s%12d%12d%=12U%=12U%=12d%=12d", threadnum, + s = format (s, "%7d%30v%12d%12d%=12U%=12U%=12d%=12d", threadnum, f->name, f->index, f->n_data_bytes, format_memory_size, bytes_alloc, format_memory_size, bytes_free, f->n_alloc, n_free); -- cgit 1.2.3-korg From 475674ee5aa4e130a0ac0caf08ef9d579b8604b7 Mon Sep 17 00:00:00 2001 From: Chris Luke Date: Wed, 5 Jul 2017 18:02:53 -0400 Subject: unix: mkdir VPP_RUN_DIR before opening a socket in it Change https://gerrit.fd.io/r/#/c/7230/ added a Unix domain CLI socket in the default startup.conf; however unless you had previously run VPP with the DPDK plugin enabled the directory that it is created in. /run/vpp, would not exist and startup would fail. This directory is typically hosted in a tmpfs ramdisk and is thus ephemeral. This patch adds a function that attempts to mkdir VPP_RUN_DIR and uses it in both the DPDK plugin and the CLI code if the CLI socket is to be created in that directory. Change-Id: Ibbf925819099dce2b5eb0fa238b9edca1036d6fd Signed-off-by: Chris Luke --- src/plugins/dpdk/device/init.c | 14 +++++--------- src/vlib/unix/cli.c | 11 +++++++++++ src/vlib/unix/unix.h | 6 ++++++ src/vlib/unix/util.c | 13 +++++++++++++ 4 files changed, 35 insertions(+), 9 deletions(-) (limited to 'src/vlib') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index d9ab0756..04344f74 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -37,8 +37,7 @@ dpdk_main_t dpdk_main; #define LINK_STATE_ELOGS 0 -#define DEFAULT_HUGE_DIR "/run/vpp/hugepages" -#define VPP_RUN_DIR "/run/vpp" +#define DEFAULT_HUGE_DIR (VPP_RUN_DIR "/hugepages") /* Port configuration, mildly modified Intel app values */ @@ -1047,13 +1046,10 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) vec_free (mem_by_socket); - rv = mkdir (VPP_RUN_DIR, 0755); - if (rv && errno != EEXIST) - { - error = clib_error_return (0, "mkdir '%s' failed errno %d", - VPP_RUN_DIR, errno); - goto done; - } + /* Make sure VPP_RUN_DIR exists */ + error = unix_make_vpp_run_dir (); + if (error) + goto done; rv = mkdir (DEFAULT_HUGE_DIR, 0755); if (rv && errno != EEXIST) diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c index 953d133c..1befa25d 100644 --- a/src/vlib/unix/cli.c +++ b/src/vlib/unix/cli.c @@ -2642,6 +2642,17 @@ unix_cli_config (vlib_main_t * vm, unformat_input_t * input) /* CLI listen. */ unix_file_t template = { 0 }; + /* If our listen address looks like a path and it starts with + * VPP_RUN_DIR, go make sure VPP_RUN_DIR exists before trying to open + * a socket in it. + */ + if (strncmp (s->config, VPP_RUN_DIR "/", strlen (VPP_RUN_DIR) + 1) == 0) + { + error = unix_make_vpp_run_dir (); + if (error) + return error; + } + s->flags = SOCKET_IS_SERVER | /* listen, don't connect */ SOCKET_ALLOW_GROUP_WRITE; /* PF_LOCAL socket only */ error = clib_socket_init (s); diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h index de607c0f..ffa92bba 100644 --- a/src/vlib/unix/unix.h +++ b/src/vlib/unix/unix.h @@ -43,6 +43,10 @@ #include #include + +/** VPP runtime ephemeral directory. Typically stored in a tmpfs. */ +#define VPP_RUN_DIR "/run/vpp" + struct unix_file; typedef clib_error_t *(unix_file_function_t) (struct unix_file * f); @@ -229,6 +233,8 @@ clib_error_t *foreach_directory_file (char *dir_name, u8 * file_name), void *arg, int scan_dirs); +clib_error_t *unix_make_vpp_run_dir (void); + #endif /* included_unix_unix_h */ /* diff --git a/src/vlib/unix/util.c b/src/vlib/unix/util.c index edc3e591..51b4a4ed 100644 --- a/src/vlib/unix/util.c +++ b/src/vlib/unix/util.c @@ -222,6 +222,19 @@ done: return r; } +clib_error_t * +unix_make_vpp_run_dir (void) +{ + int rv; + + rv = mkdir (VPP_RUN_DIR, 0755); + if (rv && errno != EEXIST) + return clib_error_return (0, "mkdir '%s' failed errno %d", + VPP_RUN_DIR, errno); + + return 0; +} + /* * fd.io coding-style-patch-verification: ON * -- cgit 1.2.3-korg From 191a1a407ebd299907d2890cbb0369a6d7c19a45 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 10 Jul 2017 15:38:21 +0200 Subject: vlib: fix issues with PCI handling code - PCI devices not properly discovered - vlib_pci_bus_master_enable () not working Change-Id: I7433ab1b19b890b8900635b43037b9a2017a1921 Signed-off-by: Damjan Marion --- src/vlib/pci/linux_pci.c | 12 ++++++------ src/vlib/pci/pci.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/pci/linux_pci.c b/src/vlib/pci/linux_pci.c index 623737d5..2d3c0a88 100644 --- a/src/vlib/pci/linux_pci.c +++ b/src/vlib/pci/linux_pci.c @@ -562,8 +562,6 @@ scan_device (void *arg, u8 * dev_dir_name, u8 * ignored) hash_set (pm->pci_dev_index_by_pci_addr, dev->bus_address.as_u32, dev - pm->pci_devs); - error = init_device (vm, dev, &pdev); - vec_reset_length (f); f = format (f, "%v/vpd%c", dev_dir_name, 0); fd = open ((char *) f, O_RDONLY); @@ -601,10 +599,6 @@ scan_device (void *arg, u8 * dev_dir_name, u8 * ignored) close (fd); } - vec_reset_length (f); - f = format (f, "%v/driver%c", dev_dir_name, 0); - dev->driver_name = vlib_sysfs_link_to_name ((char *) f); - dev->numa_node = -1; vec_reset_length (f); f = format (f, "%v/numa_node%c", dev_dir_name, 0); @@ -625,6 +619,12 @@ scan_device (void *arg, u8 * dev_dir_name, u8 * ignored) vlib_sysfs_read ((char *) f, "0x%x", &tmp); dev->device_id = tmp; + error = init_device (vm, dev, &pdev); + + vec_reset_length (f); + f = format (f, "%v/driver%c", dev_dir_name, 0); + dev->driver_name = vlib_sysfs_link_to_name ((char *) f); + done: vec_free (f); return error; diff --git a/src/vlib/pci/pci.h b/src/vlib/pci/pci.h index 811a6ff2..21410809 100644 --- a/src/vlib/pci/pci.h +++ b/src/vlib/pci/pci.h @@ -215,7 +215,7 @@ vlib_pci_bus_master_enable (vlib_pci_device_t * dev) if (err) return err; - if (!(command & PCI_COMMAND_BUS_MASTER)) + if (command & PCI_COMMAND_BUS_MASTER) return 0; command |= PCI_COMMAND_BUS_MASTER; -- cgit 1.2.3-korg From 04a7f05e91e919f51eaecaee476435484076655b Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 10 Jul 2017 15:06:17 +0200 Subject: vlib: store buffer memory information in the buffer_main Currently, buffer index is calculated as a offset to the physmem region shifted by log2_cacheline size. When DPDK is used we "hack" physmem data with information taken from dpdk mempool. This makes physmem code not usable with DPDK. This change makes buffer memory start and size independent of physmem basically allowing physmem to be used when DPDK plugin is loaded. Change-Id: Ieb399d398f147583b9baab467152a352d58c9c31 Signed-off-by: Damjan Marion --- src/plugins/dpdk/buffer.c | 69 +++-------------------------------------------- src/vlib/buffer.c | 57 ++++++++++++++++++++++----------------- src/vlib/buffer.h | 32 +++++++++++++++++----- src/vlib/buffer_funcs.h | 13 ++++++--- src/vlib/main.c | 20 ++++++++++++-- src/vlib/unix/physmem.c | 9 ------- src/vnet/pg/input.c | 6 ++--- src/vnet/pg/stream.c | 2 +- src/vnet/replication.c | 2 +- 9 files changed, 96 insertions(+), 114 deletions(-) (limited to 'src/vlib') diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c index fd1d8415..aa73eb6c 100644 --- a/src/plugins/dpdk/buffer.c +++ b/src/plugins/dpdk/buffer.c @@ -431,7 +431,6 @@ vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, unsigned socket_id) { dpdk_main_t *dm = &dpdk_main; - vlib_physmem_main_t *vpm = &vm->physmem_main; struct rte_mempool *rmp; int i; @@ -453,63 +452,10 @@ vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, if (rmp) { { - uword this_pool_end; - uword this_pool_start; - uword this_pool_size; - uword save_vpm_start, save_vpm_end, save_vpm_size; struct rte_mempool_memhdr *memhdr; - this_pool_start = ~0; - this_pool_end = 0; - STAILQ_FOREACH (memhdr, &rmp->mem_list, next) - { - if (((uword) (memhdr->addr + memhdr->len)) > this_pool_end) - this_pool_end = (uword) (memhdr->addr + memhdr->len); - if (((uword) memhdr->addr) < this_pool_start) - this_pool_start = (uword) (memhdr->addr); - } - ASSERT (this_pool_start < ~0 && this_pool_end > 0); - this_pool_size = this_pool_end - this_pool_start; - - if (CLIB_DEBUG > 1) - { - clib_warning ("%s: pool start %llx pool end %llx pool size %lld", - pool_name, this_pool_start, this_pool_end, - this_pool_size); - clib_warning - ("before: virtual.start %llx virtual.end %llx virtual.size %lld", - vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); - } - - save_vpm_start = vpm->virtual.start; - save_vpm_end = vpm->virtual.end; - save_vpm_size = vpm->virtual.size; - - if ((this_pool_start < vpm->virtual.start) || vpm->virtual.start == 0) - vpm->virtual.start = this_pool_start; - if (this_pool_end > vpm->virtual.end) - vpm->virtual.end = this_pool_end; - - vpm->virtual.size = vpm->virtual.end - vpm->virtual.start; - - if (CLIB_DEBUG > 1) - { - clib_warning - ("after: virtual.start %llx virtual.end %llx virtual.size %lld", - vpm->virtual.start, vpm->virtual.end, vpm->virtual.size); - } - - /* check if fits into buffer index range */ - if ((u64) vpm->virtual.size > - ((u64) 1 << (32 + CLIB_LOG2_CACHE_LINE_BYTES))) - { - clib_warning ("physmem: virtual size out of range!"); - vpm->virtual.start = save_vpm_start; - vpm->virtual.end = save_vpm_end; - vpm->virtual.size = save_vpm_size; - rmp = 0; - } + vlib_buffer_add_mem_range (vm, (uword) memhdr->addr, memhdr->len); } if (rmp) { @@ -564,7 +510,8 @@ buffer_state_validation_init (vlib_main_t * vm) VLIB_INIT_FUNCTION (buffer_state_validation_init); #endif -static vlib_buffer_callbacks_t callbacks = { +/* *INDENT-OFF* */ +VLIB_BUFFER_REGISTER_CALLBACKS (dpdk, static) = { .vlib_buffer_alloc_cb = &dpdk_buffer_alloc, .vlib_buffer_alloc_from_free_list_cb = &dpdk_buffer_alloc_from_free_list, .vlib_buffer_free_cb = &dpdk_buffer_free, @@ -572,15 +519,7 @@ static vlib_buffer_callbacks_t callbacks = { .vlib_packet_template_init_cb = &dpdk_packet_template_init, .vlib_buffer_delete_free_list_cb = &dpdk_buffer_delete_free_list, }; - -static clib_error_t * -dpdk_buffer_init (vlib_main_t * vm) -{ - vlib_buffer_cb_register (vm, &callbacks); - return 0; -} - -VLIB_INIT_FUNCTION (dpdk_buffer_init); +/* *INDENT-ON* */ /** @endcond */ /* diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index e50064ae..b2a095cf 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -46,6 +46,8 @@ #include #include +vlib_buffer_callbacks_t *vlib_buffer_callbacks = 0; + uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, vlib_buffer_t * b_first) @@ -584,11 +586,6 @@ alloc_from_free_list (vlib_main_t * vm, dst = alloc_buffers; - /* wait with buffer memory allocation as long as possible - in case external buffer manager takes over */ - if (PREDICT_FALSE (vm->os_physmem_alloc_aligned == 0)) - unix_physmem_init (vm, 0 /* fail_if_physical_memory_not_present */ ); - n_filled = fill_free_list (vm, free_list, n_alloc_buffers); if (n_filled == 0) return 0; @@ -944,6 +941,36 @@ vlib_buffer_chain_append_data_with_alloc (vlib_main_t * vm, return copied; } +void +vlib_buffer_add_mem_range (vlib_main_t * vm, uword start, uword size) +{ + vlib_buffer_main_t *bm = vm->buffer_main; + + if (bm->buffer_mem_size == 0) + { + bm->buffer_mem_start = start; + bm->buffer_mem_size = size; + } + else if (start < bm->buffer_mem_start) + { + bm->buffer_mem_size += bm->buffer_mem_start - start; + bm->buffer_mem_start = start; + if (size > bm->buffer_mem_size) + bm->buffer_mem_size = size; + } + else if (start > bm->buffer_mem_start) + { + uword new_size = start - bm->buffer_mem_start + size; + if (new_size > bm->buffer_mem_size) + bm->buffer_mem_size = new_size; + } + + if ((u64) bm->buffer_mem_size > + ((u64) 1 << (32 + CLIB_LOG2_CACHE_LINE_BYTES))) + { + clib_panic ("buffer memory size out of range!"); + } +} static u8 * format_vlib_buffer_free_list (u8 * s, va_list * va) @@ -1011,6 +1038,7 @@ void vlib_buffer_cb_init (struct vlib_main_t *vm) { vlib_buffer_main_t *bm = vm->buffer_main; + bm->cb.vlib_buffer_alloc_cb = &vlib_buffer_alloc_internal; bm->cb.vlib_buffer_alloc_from_free_list_cb = &vlib_buffer_alloc_from_free_list_internal; @@ -1018,25 +1046,6 @@ vlib_buffer_cb_init (struct vlib_main_t *vm) bm->cb.vlib_buffer_free_no_next_cb = &vlib_buffer_free_no_next_internal; bm->cb.vlib_buffer_delete_free_list_cb = &vlib_buffer_delete_free_list_internal; - bm->extern_buffer_mgmt = 0; -} - -int -vlib_buffer_cb_register (struct vlib_main_t *vm, vlib_buffer_callbacks_t * cb) -{ - vlib_buffer_main_t *bm = vm->buffer_main; - if (bm->extern_buffer_mgmt) - return -1; - -#define _(x) bm->cb.x = cb->x - _(vlib_buffer_alloc_cb); - _(vlib_buffer_alloc_from_free_list_cb); - _(vlib_buffer_free_cb); - _(vlib_buffer_free_no_next_cb); - _(vlib_buffer_delete_free_list_cb); -#undef _ - bm->extern_buffer_mgmt = 1; - return 0; } /** @endcond */ diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 18e2437d..b20538b7 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -388,12 +388,20 @@ typedef struct u32 free_list_index); } vlib_buffer_callbacks_t; +extern vlib_buffer_callbacks_t *vlib_buffer_callbacks; + typedef struct { + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + /* Virtual memory address and size of buffer memory, used for calculating + buffer index */ + uword buffer_mem_start; + uword buffer_mem_size; + /* Buffer free callback, for subversive activities */ - u32 (*buffer_free_callback) (struct vlib_main_t * vm, - u32 * buffers, - u32 n_buffers, u32 follow_buffer_next); + u32 (*buffer_free_callback) (struct vlib_main_t * vm, + u32 * buffers, + u32 n_buffers, u32 follow_buffer_next); /* Pool of buffer free lists. Multiple free lists exist for packet generator which uses separate free lists for each packet stream --- so as to avoid @@ -417,12 +425,12 @@ typedef struct /* Callbacks */ vlib_buffer_callbacks_t cb; - int extern_buffer_mgmt; + int callbacks_registered; } vlib_buffer_main_t; +void vlib_buffer_add_mem_range (struct vlib_main_t *vm, uword start, + uword size); void vlib_buffer_cb_init (struct vlib_main_t *vm); -int vlib_buffer_cb_register (struct vlib_main_t *vm, - vlib_buffer_callbacks_t * cb); typedef struct { @@ -498,6 +506,18 @@ serialize_vlib_buffer_n_bytes (serialize_main_t * m) #endif /* included_vlib_buffer_h */ +#define VLIB_BUFFER_REGISTER_CALLBACKS(x,...) \ + __VA_ARGS__ vlib_buffer_callbacks_t __##x##_buffer_callbacks; \ +static void __vlib_add_buffer_callbacks_t_##x (void) \ + __attribute__((__constructor__)) ; \ +static void __vlib_add_buffer_callbacks_t_##x (void) \ +{ \ + if (vlib_buffer_callbacks) \ + clib_panic ("vlib buffer callbacks already registered"); \ + vlib_buffer_callbacks = &__##x##_buffer_callbacks; \ +} \ +__VA_ARGS__ vlib_buffer_callbacks_t __##x##_buffer_callbacks + /* * fd.io coding-style-patch-verification: ON * diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 328660a3..79e3e69c 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -56,8 +56,11 @@ always_inline vlib_buffer_t * vlib_get_buffer (vlib_main_t * vm, u32 buffer_index) { - return vlib_physmem_at_offset (&vm->physmem_main, ((uword) buffer_index) - << CLIB_LOG2_CACHE_LINE_BYTES); + vlib_buffer_main_t *bm = vm->buffer_main; + uword offset = ((uword) buffer_index) << CLIB_LOG2_CACHE_LINE_BYTES; + ASSERT (offset < bm->buffer_mem_size); + + return uword_to_pointer (bm->buffer_mem_start + offset, void *); } /** \brief Translate buffer pointer into buffer index @@ -66,10 +69,14 @@ vlib_get_buffer (vlib_main_t * vm, u32 buffer_index) @param p - (void *) buffer pointer @return - (u32) buffer index */ + always_inline u32 vlib_get_buffer_index (vlib_main_t * vm, void *p) { - uword offset = vlib_physmem_offset_of (&vm->physmem_main, p); + vlib_buffer_main_t *bm = vm->buffer_main; + uword offset = pointer_to_uword (p) - bm->buffer_mem_start; + ASSERT (pointer_to_uword (p) >= bm->buffer_mem_start); + ASSERT (offset < bm->buffer_mem_size); ASSERT ((offset % (1 << CLIB_LOG2_CACHE_LINE_BYTES)) == 0); return offset >> CLIB_LOG2_CACHE_LINE_BYTES; } diff --git a/src/vlib/main.c b/src/vlib/main.c index 19d70232..73548fbe 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -43,6 +43,7 @@ #include #include +#include #include CJ_GLOBAL_LOG_PROTOTYPE; @@ -466,7 +467,7 @@ vlib_put_next_frame (vlib_main_t * vm, vlib_frame_t *f; u32 n_vectors_in_frame; - if (vm->buffer_main->extern_buffer_mgmt == 0 && CLIB_DEBUG > 0) + if (vm->buffer_main->callbacks_registered == 0 && CLIB_DEBUG > 0) vlib_put_next_frame_validate (vm, r, next_index, n_vectors_left); nf = vlib_node_runtime_get_next_frame (vm, r, next_index); @@ -1712,7 +1713,22 @@ vlib_main (vlib_main_t * volatile vm, unformat_input_t * input) vm->name = "VLIB"; vec_validate (vm->buffer_main, 0); - vlib_buffer_cb_init (vm); + if (vlib_buffer_callbacks) + { + /* external plugin has registered own buffer callbacks + so we just copy them */ + vlib_buffer_main_t *bm = vm->buffer_main; + clib_memcpy (&bm->cb, vlib_buffer_callbacks, + sizeof (vlib_buffer_callbacks_t)); + bm->callbacks_registered = 1; + } + else + { + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_buffer_cb_init (vm); + unix_physmem_init (vm, 0 /* fail_if_physical_memory_not_present */ ); + vlib_buffer_add_mem_range (vm, vpm->virtual.start, vpm->virtual.size); + } if ((error = vlib_thread_init (vm))) { diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c index 8d10ad2e..933cc63b 100644 --- a/src/vlib/unix/physmem.c +++ b/src/vlib/unix/physmem.c @@ -45,14 +45,10 @@ static void * unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes, uword alignment) { - vlib_main_t *vm = vlib_get_main (); physmem_main_t *pm = &physmem_main; uword lo_offset, hi_offset; uword *to_free = 0; - if (vm->buffer_main->extern_buffer_mgmt) - clib_warning ("unsafe alloc!"); - /* IO memory is always at least cache aligned. */ alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES); @@ -270,11 +266,6 @@ show_physmem (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { physmem_main_t *pm = &physmem_main; - if (vm->buffer_main->extern_buffer_mgmt) - { - vlib_cli_output (vm, "Not supported with external buffer management."); - return 0; - } if (pm->heap) vlib_cli_output (vm, "%U", format_mheap, pm->heap, /* verbose */ 1); diff --git a/src/vnet/pg/input.c b/src/vnet/pg/input.c index 597ae060..c3738a6a 100644 --- a/src/vnet/pg/input.c +++ b/src/vnet/pg/input.c @@ -1214,7 +1214,7 @@ pg_stream_fill_helper (pg_main_t * pg, * Historically, the pg maintained its own free lists and * device drivers tx paths would return pkts. */ - if (vm->buffer_main->extern_buffer_mgmt == 0 && + if (vm->buffer_main->callbacks_registered == 0 && !(s->flags & PG_STREAM_FLAGS_DISABLE_BUFFER_RECYCLE)) f->buffer_init_function = pg_buffer_init; f->buffer_init_function_opaque = @@ -1238,7 +1238,7 @@ pg_stream_fill_helper (pg_main_t * pg, n_alloc = n_allocated; /* Reinitialize buffers */ - if (vm->buffer_main->extern_buffer_mgmt == 0 || CLIB_DEBUG > 0 + if (vm->buffer_main->callbacks_registered == 0 || CLIB_DEBUG > 0 || (s->flags & PG_STREAM_FLAGS_DISABLE_BUFFER_RECYCLE)) init_buffers_inline (vm, s, @@ -1246,7 +1246,7 @@ pg_stream_fill_helper (pg_main_t * pg, n_alloc, (bi - s->buffer_indices) * s->buffer_bytes /* data offset */ , s->buffer_bytes, /* set_data */ - vm->buffer_main->extern_buffer_mgmt != 0 + vm->buffer_main->callbacks_registered != 0 || (s->flags & PG_STREAM_FLAGS_DISABLE_BUFFER_RECYCLE) != 0); if (next_buffers) diff --git a/src/vnet/pg/stream.c b/src/vnet/pg/stream.c index 05d820a3..a540b32b 100644 --- a/src/vnet/pg/stream.c +++ b/src/vnet/pg/stream.c @@ -438,7 +438,7 @@ pg_stream_add (pg_main_t * pg, pg_stream_t * s_init) pg_buffer_index_t *bi; int n; - if (vm->buffer_main->extern_buffer_mgmt) + if (vm->buffer_main->callbacks_registered) s->buffer_bytes = VLIB_BUFFER_DATA_SIZE; if (!s->buffer_bytes) diff --git a/src/vnet/replication.c b/src/vnet/replication.c index 233a8c2f..1c6f28d2 100644 --- a/src/vnet/replication.c +++ b/src/vnet/replication.c @@ -214,7 +214,7 @@ replication_recycle_callback (vlib_main_t * vm, vlib_buffer_free_list_t * fl) b0->flags |= VLIB_BUFFER_IS_RECYCLED; #if (CLIB_DEBUG > 0) - if (vm->buffer_main->extern_buffer_mgmt == 0) + if (vm->buffer_main->callbacks_registered == 0) vlib_buffer_set_known_state (vm, bi0, VLIB_BUFFER_KNOWN_ALLOCATED); #endif -- cgit 1.2.3-korg From a2522f6fd57eb93f57dfcc27c59862d4cc32879a Mon Sep 17 00:00:00 2001 From: Chris Luke Date: Fri, 7 Jul 2017 14:57:07 -0400 Subject: dpdk: fix dpdk_buffer_pool_create name - vnet_buffer_pool_create should probably be named dpdk_buffer_pool_create since that is what it does. - Its prototype should also be in a DPDK plugin header, not in vlib/buffer_funcs.h, since the implementation is in the plugin and nobody else should be calling it. Change-Id: I7ba259afa4b888bc94f3ad257305e286b41e7370 Signed-off-by: Chris Luke --- src/plugins/dpdk/buffer.c | 2 +- src/plugins/dpdk/device/dpdk.h | 3 +++ src/plugins/dpdk/device/init.c | 4 ++-- src/vlib/buffer_funcs.h | 3 --- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'src/vlib') diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c index aa73eb6c..b0f247e1 100644 --- a/src/plugins/dpdk/buffer.c +++ b/src/plugins/dpdk/buffer.c @@ -427,7 +427,7 @@ dpdk_packet_template_init (vlib_main_t * vm, } clib_error_t * -vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, +dpdk_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, unsigned socket_id) { dpdk_main_t *dm = &dpdk_main; diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index a2686978..55f63b37 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -453,6 +453,9 @@ uword admin_up_down_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f); +clib_error_t *dpdk_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, + unsigned socket_id); + #endif /* __included_dpdk_h__ */ /* diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 17e77618..2ec1664b 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -1210,13 +1210,13 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) rte_dump_physmem_layout (stdout); /* main thread 1st */ - error = vlib_buffer_pool_create (vm, conf->num_mbufs, rte_socket_id ()); + error = dpdk_buffer_pool_create (vm, conf->num_mbufs, rte_socket_id ()); if (error) return error; for (i = 0; i < RTE_MAX_LCORE; i++) { - error = vlib_buffer_pool_create (vm, conf->num_mbufs, + error = dpdk_buffer_pool_create (vm, conf->num_mbufs, rte_lcore_to_socket_id (i)); if (error) return error; diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 79e3e69c..97442e12 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -237,9 +237,6 @@ vlib_buffer_set_known_state (vlib_main_t * vm, u8 *vlib_validate_buffer (vlib_main_t * vm, u32 buffer_index, uword follow_chain); -clib_error_t *vlib_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, - unsigned socket_id); - /** \brief Allocate buffers into supplied array @param vm - (vlib_main_t *) vlib main data structure pointer -- cgit 1.2.3-korg From 19e9d954bd9eb4f04d48640d6540198e84ef65d7 Mon Sep 17 00:00:00 2001 From: "Igor Mikhailov (imichail)" Date: Mon, 3 Jul 2017 17:01:50 -0700 Subject: VPP-895 multi-thread: fix vpp crash on show runtime In multi-threaded model (e.g. 1 main and 1 worker threads), after an ethernet interface is deleted (e.g. vhost-user interface), 'show runtime' command produces garbled output and sometimes leads to vpp crash. The reason is because vlib_node_rename() frees and reallocates node's 'n->name' vector, however the change is not propagated into copies of the node on worker threads. Change-Id: Ibf22422913b7f2df22f70f3b2fe8dafd34c1dd06 Signed-off-by: Igor Mikhailov (imichail) (cherry picked from commit 02989064e4c26a4940a5292ba6c47023e6dd3131) --- src/vlib/node.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'src/vlib') diff --git a/src/vlib/node.c b/src/vlib/node.c index 2cda0f06..e6739dc7 100644 --- a/src/vlib/node.c +++ b/src/vlib/node.c @@ -72,6 +72,32 @@ node_set_elog_name (vlib_main_t * vm, uword node_index) n->name_elog_string = elog_string (&vm->elog_main, "%v%c", n->name, 0); } +static void +vlib_worker_thread_node_rename (u32 node_index) +{ + int i; + vlib_main_t *vm; + vlib_node_t *n; + + if (vec_len (vlib_mains) == 1) + return; + + vm = vlib_mains[0]; + n = vlib_get_node (vm, node_index); + + ASSERT (vlib_get_thread_index () == 0); + ASSERT (*vlib_worker_threads->wait_at_barrier == 1); + + for (i = 1; i < vec_len (vlib_mains); i++) + { + vlib_main_t *vm_worker = vlib_mains[i]; + vlib_node_t *n_worker = vlib_get_node (vm_worker, node_index); + + n_worker->name = n->name; + n_worker->name_elog_string = n->name_elog_string; + } +} + void vlib_node_rename (vlib_main_t * vm, u32 node_index, char *fmt, ...) { @@ -87,6 +113,9 @@ vlib_node_rename (vlib_main_t * vm, u32 node_index, char *fmt, ...) hash_set (nm->node_by_name, n->name, n->index); node_set_elog_name (vm, node_index); + + /* Propagate the change to all worker threads */ + vlib_worker_thread_node_rename (node_index); } static void -- cgit 1.2.3-korg From 072401e8096c648b91f958bd911f64ce24fecff9 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 13 Jul 2017 18:53:27 +0200 Subject: Introduce l{2,3,4}_hdr_offset fields in the buffer metadata To save space in the first cacheline following is changed: - total_length_not_including_first_buffer moved to the 2nd cacheline. This field is used only when VLIB_BUFFER_TOTAL_LENGTH_VALID and VLIB_BUFFER_NEXT_PRESENT are both set. - free_list_index is now stored in 4bits inside flags, which allows up to 16 free lists. In case we need more we can store index in the 2nd cachelin Change-Id: Ic8521350819391af470d31d3fa1013e67ecb7681 Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/node.c | 8 ++++++- src/vlib/buffer.c | 16 ++++++++----- src/vlib/buffer.h | 40 +++++++++++++++++--------------- src/vlib/buffer_funcs.h | 50 +++++++++++++++++++++++++++++----------- src/vnet/bfd/bfd_udp.c | 4 ++-- src/vnet/buffer.h | 14 +++-------- src/vnet/dhcp/dhcp4_proxy_node.c | 2 +- src/vnet/dhcp/dhcp6_proxy_node.c | 2 +- src/vnet/ethernet/ethernet.h | 3 +-- src/vnet/ethernet/node.c | 23 ++++++++---------- src/vnet/ip/ip4_forward.c | 6 ++--- src/vnet/ip/ip6_forward.c | 6 ++--- src/vnet/ip/ip6_neighbor.c | 19 +++++++-------- src/vnet/l2/l2_bvi.h | 2 +- src/vnet/lisp-cp/control.c | 2 +- src/vnet/replication.c | 6 ++--- 16 files changed, 111 insertions(+), 92 deletions(-) (limited to 'src/vlib') diff --git a/src/plugins/dpdk/device/node.c b/src/plugins/dpdk/device/node.c index 69acc529..74fb8da1 100644 --- a/src/plugins/dpdk/device/node.c +++ b/src/plugins/dpdk/device/node.c @@ -208,7 +208,13 @@ dpdk_process_subseq_segs (vlib_main_t * vm, vlib_buffer_t * b, mb_seg = mb->next; b_chain = b; - while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs)) + if (mb->nb_segs < 2) + return; + + b->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + b->total_length_not_including_first_buffer = 0; + + while (nb_seg < mb->nb_segs) { ASSERT (mb_seg != 0); diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index b2a095cf..53b60c16 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -72,8 +72,8 @@ format_vlib_buffer (u8 * s, va_list * args) uword indent = format_get_indent (s); s = format (s, "current data %d, length %d, free-list %d, clone-count %u", - b->current_data, b->current_length, b->free_list_index, - b->n_add_refs); + b->current_data, b->current_length, + vlib_buffer_get_free_list_index (b), b->n_add_refs); if (b->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID) s = format (s, ", totlen-nifb %d", @@ -163,10 +163,14 @@ vlib_validate_buffer_helper (vlib_main_t * vm, vlib_buffer_main_t *bm = vm->buffer_main; vlib_buffer_free_list_t *fl; - if (pool_is_free_index (bm->buffer_free_list_pool, b->free_list_index)) - return format (0, "unknown free list 0x%x", b->free_list_index); + if (pool_is_free_index + (bm->buffer_free_list_pool, vlib_buffer_get_free_list_index (b))) + return format (0, "unknown free list 0x%x", + vlib_buffer_get_free_list_index (b)); - fl = pool_elt_at_index (bm->buffer_free_list_pool, b->free_list_index); + fl = + pool_elt_at_index (bm->buffer_free_list_pool, + vlib_buffer_get_free_list_index (b)); if ((signed) b->current_data < (signed) -VLIB_BUFFER_PRE_DATA_SIZE) return format (0, "current data %d before pre-data", b->current_data); @@ -388,7 +392,7 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, f->name = clib_mem_is_vec (name) ? name : format (0, "%s", name); /* Setup free buffer template. */ - f->buffer_init_template.free_list_index = f->index; + vlib_buffer_set_free_list_index (&f->buffer_init_template, f->index); f->buffer_init_template.n_add_refs = 0; if (is_public) diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index b20538b7..c810db4e 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -72,6 +72,7 @@ typedef struct the end of this buffer. */ u32 flags; /**< buffer flags: +
VLIB_BUFFER_FREE_LIST_INDEX_MASK: bits used to store free list index,
VLIB_BUFFER_IS_TRACED: trace this buffer.
VLIB_BUFFER_NEXT_PRESENT: this is a multi-chunk buffer.
VLIB_BUFFER_TOTAL_LENGTH_VALID: as it says @@ -82,28 +83,26 @@ typedef struct set to avoid adding it to a flow report
VLIB_BUFFER_FLAG_USER(n): user-defined bit N */ -#define VLIB_BUFFER_IS_TRACED (1 << 0) -#define VLIB_BUFFER_LOG2_NEXT_PRESENT (1) + +/* any change to the following line requres update of + * vlib_buffer_get_free_list_index(...) and + * vlib_buffer_set_free_list_index(...) functions */ +#define VLIB_BUFFER_FREE_LIST_INDEX_MASK ((1 << 4) - 1) + +#define VLIB_BUFFER_IS_TRACED (1 << 4) +#define VLIB_BUFFER_LOG2_NEXT_PRESENT (5) #define VLIB_BUFFER_NEXT_PRESENT (1 << VLIB_BUFFER_LOG2_NEXT_PRESENT) -#define VLIB_BUFFER_IS_RECYCLED (1 << 2) -#define VLIB_BUFFER_TOTAL_LENGTH_VALID (1 << 3) -#define VLIB_BUFFER_REPL_FAIL (1 << 4) -#define VLIB_BUFFER_RECYCLE (1 << 5) -#define VLIB_BUFFER_FLOW_REPORT (1 << 6) -#define VLIB_BUFFER_EXT_HDR_VALID (1 << 7) +#define VLIB_BUFFER_IS_RECYCLED (1 << 6) +#define VLIB_BUFFER_TOTAL_LENGTH_VALID (1 << 7) +#define VLIB_BUFFER_REPL_FAIL (1 << 8) +#define VLIB_BUFFER_RECYCLE (1 << 9) +#define VLIB_BUFFER_FLOW_REPORT (1 << 10) +#define VLIB_BUFFER_EXT_HDR_VALID (1 << 11) /* User defined buffer flags. */ #define LOG2_VLIB_BUFFER_FLAG_USER(n) (32 - (n)) #define VLIB_BUFFER_FLAG_USER(n) (1 << LOG2_VLIB_BUFFER_FLAG_USER(n)) - u32 free_list_index; /**< Buffer free list that this buffer was - allocated from and will be freed to. - */ - - u32 total_length_not_including_first_buffer; - /**< Only valid for first buffer in chain. Current length plus - total length given here give total number of bytes in buffer chain. - */ STRUCT_MARK (template_end); u32 next_buffer; /**< Next buffer for this linked-list of buffers. @@ -128,7 +127,7 @@ typedef struct Before allocating any of it, discussion required! */ - u32 opaque[8]; /**< Opaque data used by sub-graphs for their own purposes. + u32 opaque[10]; /**< Opaque data used by sub-graphs for their own purposes. See .../vnet/vnet/buffer.h */ CLIB_CACHE_LINE_ALIGN_MARK (cacheline1); @@ -137,7 +136,12 @@ typedef struct if VLIB_PACKET_IS_TRACED flag is set. */ u32 recycle_count; /**< Used by L2 path recycle code */ - u32 opaque2[14]; /**< More opaque data, currently unused */ + + u32 total_length_not_including_first_buffer; + /**< Only valid for first buffer in chain. Current length plus + total length given here give total number of bytes in buffer chain. + */ + u32 opaque2[13]; /**< More opaque data, currently unused */ /***** end of second cache line */ CLIB_CACHE_LINE_ALIGN_MARK (cacheline2); diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 97442e12..1aaac0b2 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -106,12 +106,15 @@ uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, always_inline uword vlib_buffer_length_in_chain (vlib_main_t * vm, vlib_buffer_t * b) { - uword l = b->current_length + b->total_length_not_including_first_buffer; - if (PREDICT_FALSE ((b->flags & (VLIB_BUFFER_NEXT_PRESENT - | VLIB_BUFFER_TOTAL_LENGTH_VALID)) - == VLIB_BUFFER_NEXT_PRESENT)) - return vlib_buffer_length_in_chain_slow_path (vm, b); - return l; + uword len = b->current_length; + + if (PREDICT_TRUE ((b->flags & VLIB_BUFFER_NEXT_PRESENT) == 0)) + return len; + + if (PREDICT_TRUE (b->flags & VLIB_BUFFER_TOTAL_LENGTH_VALID)) + return len + b->total_length_not_including_first_buffer; + + return vlib_buffer_length_in_chain_slow_path (vm, b); } /** \brief Get length in bytes of the buffer index buffer chain @@ -261,6 +264,24 @@ vlib_buffer_round_size (u32 size) return round_pow2 (size, sizeof (vlib_buffer_t)); } +always_inline u32 +vlib_buffer_get_free_list_index (vlib_buffer_t * b) +{ + return b->flags & VLIB_BUFFER_FREE_LIST_INDEX_MASK; +} + +always_inline void +vlib_buffer_set_free_list_index (vlib_buffer_t * b, u32 index) +{ + /* if there is an need for more free lists we should consider + storig data in the 2nd cacheline */ + ASSERT (VLIB_BUFFER_FREE_LIST_INDEX_MASK & 1); + ASSERT (index <= VLIB_BUFFER_FREE_LIST_INDEX_MASK); + + b->flags &= ~VLIB_BUFFER_FREE_LIST_INDEX_MASK; + b->flags |= index & VLIB_BUFFER_FREE_LIST_INDEX_MASK; +} + /** \brief Allocate buffers from specific freelist into supplied array @param vm - (vlib_main_t *) vlib main data structure pointer @@ -381,7 +402,7 @@ vlib_buffer_get_buffer_free_list (vlib_main_t * vm, vlib_buffer_t * b, vlib_buffer_main_t *bm = vm->buffer_main; u32 i; - *index = i = b->free_list_index; + *index = i = vlib_buffer_get_free_list_index (b); return pool_elt_at_index (bm->buffer_free_list_pool, i); } @@ -569,7 +590,8 @@ vlib_buffer_clone (vlib_main_t * vm, u32 src_buffer, u32 * buffers, } n_buffers = vlib_buffer_alloc_from_free_list (vm, buffers, n_buffers, - s->free_list_index); + vlib_buffer_get_free_list_index + (s)); if (PREDICT_FALSE (n_buffers == 0)) { buffers[0] = src_buffer; @@ -581,7 +603,8 @@ vlib_buffer_clone (vlib_main_t * vm, u32 src_buffer, u32 * buffers, vlib_buffer_t *d = vlib_get_buffer (vm, buffers[i]); d->current_data = s->current_data; d->current_length = head_end_offset; - d->free_list_index = s->free_list_index; + vlib_buffer_set_free_list_index (d, + vlib_buffer_get_free_list_index (s)); d->total_length_not_including_first_buffer = s->total_length_not_including_first_buffer + s->current_length - head_end_offset; @@ -615,7 +638,8 @@ vlib_buffer_attach_clone (vlib_main_t * vm, vlib_buffer_t * head, vlib_buffer_t * tail) { ASSERT ((head->flags & VLIB_BUFFER_NEXT_PRESENT) == 0); - ASSERT (head->free_list_index == tail->free_list_index); + ASSERT (vlib_buffer_get_free_list_index (head) == + vlib_buffer_get_free_list_index (tail)); head->flags |= VLIB_BUFFER_NEXT_PRESENT; head->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID; @@ -791,7 +815,7 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * dst, CLIB_CACHE_LINE_BYTES * 2); /* Make sure buffer template is sane. */ - ASSERT (fl->index == fl->buffer_init_template.free_list_index); + ASSERT (fl->index == vlib_buffer_get_free_list_index (src)); clib_memcpy (STRUCT_MARK_PTR (dst, template_start), STRUCT_MARK_PTR (src, template_start), @@ -806,7 +830,6 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * dst, _(current_data); _(current_length); _(flags); - _(free_list_index); #undef _ ASSERT (dst->total_length_not_including_first_buffer == 0); ASSERT (dst->n_add_refs == 0); @@ -832,7 +855,7 @@ vlib_buffer_init_two_for_free_list (vlib_buffer_t * dst0, vlib_buffer_t *src = &fl->buffer_init_template; /* Make sure buffer template is sane. */ - ASSERT (fl->index == fl->buffer_init_template.free_list_index); + ASSERT (fl->index == vlib_buffer_get_free_list_index (src)); clib_memcpy (STRUCT_MARK_PTR (dst0, template_start), STRUCT_MARK_PTR (src, template_start), @@ -853,7 +876,6 @@ vlib_buffer_init_two_for_free_list (vlib_buffer_t * dst0, _(current_data); _(current_length); _(flags); - _(free_list_index); #undef _ ASSERT (dst0->total_length_not_including_first_buffer == 0); diff --git a/src/vnet/bfd/bfd_udp.c b/src/vnet/bfd/bfd_udp.c index 346c5495..06b843c6 100644 --- a/src/vnet/bfd/bfd_udp.c +++ b/src/vnet/bfd/bfd_udp.c @@ -843,7 +843,7 @@ bfd_udp4_find_headers (vlib_buffer_t * b, ip4_header_t ** ip4, udp_header_t ** udp) { /* sanity check first */ - const i32 start = vnet_buffer (b)->ip.start_of_ip_header; + const i32 start = vnet_buffer (b)->l3_hdr_offset; if (start < 0 && start < sizeof (b->pre_data)) { BFD_ERR ("Start of ip header is before pre_data, ignoring"); @@ -1000,7 +1000,7 @@ bfd_udp6_find_headers (vlib_buffer_t * b, ip6_header_t ** ip6, udp_header_t ** udp) { /* sanity check first */ - const i32 start = vnet_buffer (b)->ip.start_of_ip_header; + const i32 start = vnet_buffer (b)->l3_hdr_offset; if (start < 0 && start < sizeof (b->pre_data)) { BFD_ERR ("Start of ip header is before pre_data, ignoring"); diff --git a/src/vnet/buffer.h b/src/vnet/buffer.h index 9aba34da..8647db00 100644 --- a/src/vnet/buffer.h +++ b/src/vnet/buffer.h @@ -71,7 +71,6 @@ #define VNET_BUFFER_SPAN_CLONE (1 << LOG2_VNET_BUFFER_SPAN_CLONE) #define foreach_buffer_opaque_union_subtype \ -_(ethernet) \ _(ip) \ _(swt) \ _(l2) \ @@ -100,16 +99,12 @@ _(tcp) typedef struct { u32 sw_if_index[VLIB_N_RX_TX]; + i16 l2_hdr_offset; + i16 l3_hdr_offset; + i16 l4_hdr_offset; union { - /* Ethernet. */ - struct - { - /* Saved value of current header by ethernet-input. */ - i32 start_of_ethernet_header; - } ethernet; - /* IP4/6 buffer opaque. */ struct { @@ -143,9 +138,6 @@ typedef struct u8 code; u32 data; } icmp; - - /* IP header offset from vlib_buffer.data - saved by ip*_local nodes */ - i32 start_of_ip_header; }; } ip; diff --git a/src/vnet/dhcp/dhcp4_proxy_node.c b/src/vnet/dhcp/dhcp4_proxy_node.c index 26e1e65c..1b59cdea 100644 --- a/src/vnet/dhcp/dhcp4_proxy_node.c +++ b/src/vnet/dhcp/dhcp4_proxy_node.c @@ -231,7 +231,7 @@ dhcp_proxy_to_server_input (vlib_main_t * vm, o = (dhcp_option_t *) (((uword) o) + (o->length + 2)); } - fl = vlib_buffer_get_free_list (vm, b0->free_list_index); + fl = vlib_buffer_get_free_list (vm, vlib_buffer_get_free_list_index (b0)); // start write at (option*)o, some packets have padding if (((u8 *)o - (u8 *)b0->data + VPP_DHCP_OPTION82_SIZE) > fl->n_data_bytes) { diff --git a/src/vnet/dhcp/dhcp6_proxy_node.c b/src/vnet/dhcp/dhcp6_proxy_node.c index 885313a5..e109cc4c 100644 --- a/src/vnet/dhcp/dhcp6_proxy_node.c +++ b/src/vnet/dhcp/dhcp6_proxy_node.c @@ -306,7 +306,7 @@ dhcpv6_proxy_to_server_input (vlib_main_t * vm, copy_ip6_address(&r1->link_addr, ia0); link_address_set: - fl = vlib_buffer_get_free_list (vm, b0->free_list_index); + fl = vlib_buffer_get_free_list (vm, vlib_buffer_get_free_list_index (b0)); if ((b0->current_length+sizeof(*id1)+sizeof(*vss1)+sizeof(*cmac)) > fl->n_data_bytes) diff --git a/src/vnet/ethernet/ethernet.h b/src/vnet/ethernet/ethernet.h index dcc656a7..2fc5b804 100644 --- a/src/vnet/ethernet/ethernet.h +++ b/src/vnet/ethernet/ethernet.h @@ -344,8 +344,7 @@ ethernet_setup_node (vlib_main_t * vm, u32 node_index) always_inline ethernet_header_t * ethernet_buffer_get_header (vlib_buffer_t * b) { - return (void *) - (b->data + vnet_buffer (b)->ethernet.start_of_ethernet_header); + return (void *) (b->data + vnet_buffer (b)->l2_hdr_offset); } /** Returns the number of VLAN headers in the current Ethernet frame in the diff --git a/src/vnet/ethernet/node.c b/src/vnet/ethernet/node.c index d9fdff48..421d501a 100755 --- a/src/vnet/ethernet/node.c +++ b/src/vnet/ethernet/node.c @@ -101,7 +101,7 @@ parse_header (ethernet_input_variant_t variant, e0 = (void *) (b0->data + b0->current_data); - vnet_buffer (b0)->ethernet.start_of_ethernet_header = b0->current_data; + vnet_buffer (b0)->l2_hdr_offset = b0->current_data; vlib_buffer_advance (b0, sizeof (e0[0])); @@ -205,9 +205,7 @@ identify_subint (vnet_hw_interface_t * hi, if (!(*is_l2)) { ethernet_header_t *e0; - e0 = - (void *) (b0->data + - vnet_buffer (b0)->ethernet.start_of_ethernet_header); + e0 = (void *) (b0->data + vnet_buffer (b0)->l2_hdr_offset); if (!(ethernet_address_cast (e0->dst_address))) { @@ -238,7 +236,7 @@ determine_next_node (ethernet_main_t * em, { *next0 = em->l2_next; // record the L2 len and reset the buffer so the L2 header is preserved - u32 eth_start = vnet_buffer (b0)->ethernet.start_of_ethernet_header; + u32 eth_start = vnet_buffer (b0)->l2_hdr_offset; vnet_buffer (b0)->l2.l2_len = b0->current_data - eth_start; ASSERT (vnet_buffer (b0)->l2.l2_len == ethernet_buffer_header_size (b0)); @@ -424,10 +422,8 @@ ethernet_input_inline (vlib_main_t * vm, cached_is_l2 = is_l20 = subint0->flags & SUBINT_CONFIG_L2; } - vnet_buffer (b0)->ethernet.start_of_ethernet_header = - b0->current_data; - vnet_buffer (b1)->ethernet.start_of_ethernet_header = - b1->current_data; + vnet_buffer (b0)->l2_hdr_offset = b0->current_data; + vnet_buffer (b1)->l2_hdr_offset = b1->current_data; if (PREDICT_TRUE (is_l20 != 0)) { @@ -519,9 +515,9 @@ ethernet_input_inline (vlib_main_t * vm, { len0 = vlib_buffer_length_in_chain (vm, b0) + b0->current_data - - vnet_buffer (b0)->ethernet.start_of_ethernet_header; + - vnet_buffer (b0)->l2_hdr_offset; len1 = vlib_buffer_length_in_chain (vm, b1) + b1->current_data - - vnet_buffer (b1)->ethernet.start_of_ethernet_header; + - vnet_buffer (b1)->l2_hdr_offset; stats_n_packets += 2; stats_n_bytes += len0 + len1; @@ -646,8 +642,7 @@ ethernet_input_inline (vlib_main_t * vm, cached_is_l2 = is_l20 = subint0->flags & SUBINT_CONFIG_L2; } - vnet_buffer (b0)->ethernet.start_of_ethernet_header = - b0->current_data; + vnet_buffer (b0)->l2_hdr_offset = b0->current_data; if (PREDICT_TRUE (is_l20 != 0)) { @@ -710,7 +705,7 @@ ethernet_input_inline (vlib_main_t * vm, { len0 = vlib_buffer_length_in_chain (vm, b0) + b0->current_data - - vnet_buffer (b0)->ethernet.start_of_ethernet_header; + - vnet_buffer (b0)->l2_hdr_offset; stats_n_packets += 1; stats_n_bytes += len0; diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index 8263e01c..b8dfa847 100755 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -1585,8 +1585,8 @@ ip4_local_inline (vlib_main_t * vm, ip0 = vlib_buffer_get_current (p0); ip1 = vlib_buffer_get_current (p1); - vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data; - vnet_buffer (p1)->ip.start_of_ip_header = p1->current_data; + vnet_buffer (p0)->l3_hdr_offset = p0->current_data; + vnet_buffer (p1)->l3_hdr_offset = p1->current_data; sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX]; sw_if_index1 = vnet_buffer (p1)->sw_if_index[VLIB_RX]; @@ -1788,7 +1788,7 @@ ip4_local_inline (vlib_main_t * vm, ip0 = vlib_buffer_get_current (p0); - vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data; + vnet_buffer (p0)->l3_hdr_offset = p0->current_data; sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX]; diff --git a/src/vnet/ip/ip6_forward.c b/src/vnet/ip/ip6_forward.c index 4b574b9a..2b8c2bd2 100644 --- a/src/vnet/ip/ip6_forward.c +++ b/src/vnet/ip/ip6_forward.c @@ -1362,8 +1362,8 @@ ip6_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip0 = vlib_buffer_get_current (p0); ip1 = vlib_buffer_get_current (p1); - vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data; - vnet_buffer (p1)->ip.start_of_ip_header = p1->current_data; + vnet_buffer (p0)->l3_hdr_offset = p0->current_data; + vnet_buffer (p1)->l3_hdr_offset = p1->current_data; type0 = lm->builtin_protocol_by_ip_protocol[ip0->protocol]; type1 = lm->builtin_protocol_by_ip_protocol[ip1->protocol]; @@ -1493,7 +1493,7 @@ ip6_local (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) ip0 = vlib_buffer_get_current (p0); - vnet_buffer (p0)->ip.start_of_ip_header = p0->current_data; + vnet_buffer (p0)->l3_hdr_offset = p0->current_data; type0 = lm->builtin_protocol_by_ip_protocol[ip0->protocol]; next0 = lm->local_next_by_ip_protocol[ip0->protocol]; diff --git a/src/vnet/ip/ip6_neighbor.c b/src/vnet/ip/ip6_neighbor.c index b8f6f9b1..68a8cbbc 100644 --- a/src/vnet/ip/ip6_neighbor.c +++ b/src/vnet/ip/ip6_neighbor.c @@ -1479,9 +1479,8 @@ icmp6_router_solicitation (vlib_main_t * vm, sizeof (icmp6_router_advertisement_header_t); vlib_buffer_add_data (vm, - p0->free_list_index, - bi0, - (void *) &rh, + vlib_buffer_get_free_list_index + (p0), bi0, (void *) &rh, sizeof (icmp6_router_advertisement_header_t)); @@ -1499,9 +1498,8 @@ icmp6_router_solicitation (vlib_main_t * vm, eth_if0->address, 6); vlib_buffer_add_data (vm, - p0->free_list_index, - bi0, - (void *) &h, + vlib_buffer_get_free_list_index + (p0), bi0, (void *) &h, sizeof (icmp6_neighbor_discovery_ethernet_link_layer_address_option_t)); @@ -1525,9 +1523,8 @@ icmp6_router_solicitation (vlib_main_t * vm, sizeof (icmp6_neighbor_discovery_mtu_option_t); vlib_buffer_add_data (vm, - p0->free_list_index, - bi0, - (void *) &h, + vlib_buffer_get_free_list_index + (p0), bi0, (void *) &h, sizeof (icmp6_neighbor_discovery_mtu_option_t)); } @@ -1579,7 +1576,7 @@ icmp6_router_solicitation (vlib_main_t * vm, payload_length += sizeof( icmp6_neighbor_discovery_prefix_information_option_t); vlib_buffer_add_data (vm, - p0->free_list_index, + vlib_buffer_get_free_list_index (p0), bi0, (void *)&h, sizeof(icmp6_neighbor_discovery_prefix_information_option_t)); @@ -2326,7 +2323,7 @@ ip6_neighbor_send_mldpv2_report (u32 sw_if_index) num_addr_records++; vlib_buffer_add_data - (vm, b0->free_list_index, bo0, + (vm, vlib_buffer_get_free_list_index (b0), bo0, (void *)&rr, sizeof(icmp6_multicast_address_record_t)); payload_length += sizeof( icmp6_multicast_address_record_t); diff --git a/src/vnet/l2/l2_bvi.h b/src/vnet/l2/l2_bvi.h index e21a1616..662ec402 100644 --- a/src/vnet/l2/l2_bvi.h +++ b/src/vnet/l2/l2_bvi.h @@ -57,7 +57,7 @@ l2_to_bvi (vlib_main_t * vlib_main, } /* Save L2 header position which may be changed due to packet replication */ - vnet_buffer (b0)->ethernet.start_of_ethernet_header = b0->current_data; + vnet_buffer (b0)->l2_hdr_offset = b0->current_data; /* Strip L2 header */ l2_len = vnet_buffer (b0)->l2.l2_len; diff --git a/src/vnet/lisp-cp/control.c b/src/vnet/lisp-cp/control.c index 22b5c82c..d8a1372d 100644 --- a/src/vnet/lisp-cp/control.c +++ b/src/vnet/lisp-cp/control.c @@ -3706,7 +3706,7 @@ send_map_reply (lisp_cp_main_t * lcm, u32 mi, ip_address_t * dst, static void find_ip_header (vlib_buffer_t * b, u8 ** ip_hdr) { - const i32 start = vnet_buffer (b)->ip.start_of_ip_header; + const i32 start = vnet_buffer (b)->l3_hdr_offset; if (start < 0 && start < -sizeof (b->pre_data)) { *ip_hdr = 0; diff --git a/src/vnet/replication.c b/src/vnet/replication.c index 1c6f28d2..0fdca0bf 100644 --- a/src/vnet/replication.c +++ b/src/vnet/replication.c @@ -43,12 +43,12 @@ replication_prep (vlib_main_t * vm, ctx_id = ctx - rm->contexts[thread_index]; /* Save state from vlib buffer */ - ctx->saved_free_list_index = b0->free_list_index; + ctx->saved_free_list_index = vlib_buffer_get_free_list_index (b0); ctx->current_data = b0->current_data; /* Set up vlib buffer hooks */ b0->recycle_count = ctx_id; - b0->free_list_index = rm->recycle_list_index; + vlib_buffer_set_free_list_index (b0, rm->recycle_list_index); b0->flags |= VLIB_BUFFER_RECYCLE; /* Save feature state */ @@ -129,7 +129,7 @@ replication_recycle (vlib_main_t * vm, vlib_buffer_t * b0, u32 is_last) * This is the last replication in the list. * Restore original buffer free functionality. */ - b0->free_list_index = ctx->saved_free_list_index; + vlib_buffer_set_free_list_index (b0, ctx->saved_free_list_index); b0->flags &= ~VLIB_BUFFER_RECYCLE; /* Free context back to its pool */ -- cgit 1.2.3-korg From d756b35032cdf7fdaaf0d6611388a54d32d72e92 Mon Sep 17 00:00:00 2001 From: Dave Wallace Date: Mon, 3 Jul 2017 13:11:38 -0400 Subject: Fix unlinking of /dev/shm files. - api-segment prefix not used when unlinking shm files - unlink root region on exit if no clients referenced - stale reference to freed segment name - don't add fake client to /db unless CLIB_DEBUG > 2 - turn off the gmond plugin - clean up unused vars in vpp/api Change-Id: I66451fcfd6ee64a12466c2d6c209050e3cdb74b7 Signed-off-by: Dave Wallace Signed-off-by: Dave Barach --- Makefile | 9 +++++++-- build-data/platforms/vpp.mk | 2 +- src/svm/ssvm.c | 11 ++++++++++- src/svm/svm.c | 38 +++++++++++++++++++++++++++++++------- src/svm/svm_fifo_segment.c | 2 +- src/svm/svmdb.c | 15 ++++++++++----- src/vlib/buffer.h | 18 +++++++++--------- src/vpp.am | 6 +++--- src/vpp/api/api.c | 1 - src/vpp/api/api_main.c | 2 -- src/vpp/api/custom_dump.c | 8 +------- src/vpp/stats/stats.c | 1 - 12 files changed, 73 insertions(+), 40 deletions(-) (limited to 'src/vlib') diff --git a/Makefile b/Makefile index 0d21f335..46c51dd8 100644 --- a/Makefile +++ b/Makefile @@ -55,8 +55,10 @@ else ifeq ($(filter rhel centos fedora opensuse,$(OS_ID)),$(OS_ID)) PKG=rpm endif +# +libganglia1-dev if building the gmond plugin + DEB_DEPENDS = curl build-essential autoconf automake bison libssl-dev ccache -DEB_DEPENDS += debhelper dkms git libtool libganglia1-dev libapr1-dev dh-systemd +DEB_DEPENDS += debhelper dkms git libtool libapr1-dev dh-systemd DEB_DEPENDS += libconfuse-dev git-review exuberant-ctags cscope pkg-config DEB_DEPENDS += lcov chrpath autoconf nasm indent DEB_DEPENDS += python-all python-dev python-virtualenv python-pip libffi6 @@ -79,9 +81,12 @@ else RPM_DEPENDS += python-virtualenv RPM_DEPENDS_GROUPS = 'Development Tools' endif + +# +ganglia-devel if building the ganglia plugin + RPM_DEPENDS += chrpath libffi-devel rpm-build RPM_DEPENDS += https://kojipkgs.fedoraproject.org//packages/nasm/2.12.02/2.fc26/x86_64/nasm-2.12.02-2.fc26.x86_64.rpm -EPEL_DEPENDS = libconfuse-devel ganglia-devel epel-rpm-macros +EPEL_DEPENDS = libconfuse-devel epel-rpm-macros ifeq ($(filter rhel centos,$(OS_ID)),$(OS_ID)) EPEL_DEPENDS += lcov else diff --git a/build-data/platforms/vpp.mk b/build-data/platforms/vpp.mk index 4577fa2e..acbe0e7f 100644 --- a/build-data/platforms/vpp.mk +++ b/build-data/platforms/vpp.mk @@ -36,7 +36,7 @@ vpp_uses_dpdk = yes # Uncoment to enable building unit tests # vpp_enable_tests = yes -vpp_root_packages = vpp gmod +vpp_root_packages = vpp # DPDK configuration parameters # vpp_uses_dpdk_cryptodev_sw = yes diff --git a/src/svm/ssvm.c b/src/svm/ssvm.c index 6cda1f27..23e3cf44 100644 --- a/src/svm/ssvm.c +++ b/src/svm/ssvm.c @@ -29,6 +29,9 @@ ssvm_master_init (ssvm_private_t * ssvm, u32 master_index) if (ssvm->ssvm_size == 0) return SSVM_API_ERROR_NO_SIZE; + if (CLIB_DEBUG > 1) + clib_warning ("[%d] creating segment '%s'", getpid (), ssvm->name); + ssvm_filename = format (0, "/dev/shm/%s%c", ssvm->name, 0); unlink ((char *) ssvm_filename); @@ -176,12 +179,18 @@ ssvm_delete (ssvm_private_t * ssvm) fn = format (0, "/dev/shm/%s%c", ssvm->name, 0); + if (CLIB_DEBUG > 1) + clib_warning ("[%d] unlinking ssvm (%s) backing file '%s'", getpid (), + ssvm->name, fn); + /* Throw away the backing file */ if (unlink ((char *) fn) < 0) clib_unix_warning ("unlink segment '%s'", ssvm->name); - munmap ((void *) ssvm->requested_va, ssvm->ssvm_size); vec_free (fn); + vec_free (ssvm->name); + + munmap ((void *) ssvm->requested_va, ssvm->ssvm_size); } diff --git a/src/svm/svm.c b/src/svm/svm.c index c96135cf..600fa744 100644 --- a/src/svm/svm.c +++ b/src/svm/svm.c @@ -458,14 +458,15 @@ svm_map_region (svm_map_region_args_t * a) struct stat stat; struct timespec ts, tsrem; - if (CLIB_DEBUG > 1) - clib_warning ("[%d] map region %s", getpid (), a->name); - ASSERT ((a->size & ~(MMAP_PAGESIZE - 1)) == a->size); ASSERT (a->name); shm_name = shm_name_from_svm_map_region_args (a); + if (CLIB_DEBUG > 1) + clib_warning ("[%d] map region %s: shm_open (%s)", + getpid (), a->name, shm_name); + svm_fd = shm_open ((char *) shm_name, O_RDWR | O_CREAT | O_EXCL, 0777); if (svm_fd >= 0) @@ -947,6 +948,29 @@ svm_region_find_or_create (svm_map_region_args_t * a) return (rp); } +void +svm_region_unlink (svm_region_t * rp) +{ + svm_map_region_args_t _a, *a = &_a; + svm_main_region_t *mp; + u8 *shm_name; + + ASSERT (root_rp); + ASSERT (rp); + ASSERT (vec_c_string_is_terminated (rp->region_name)); + + mp = root_rp->data_base; + ASSERT (mp); + + a->root_path = (char *) mp->root_path; + a->name = rp->region_name; + shm_name = shm_name_from_svm_map_region_args (a); + if (CLIB_DEBUG > 1) + clib_warning ("[%d] shm_unlink (%s)", getpid (), shm_name); + shm_unlink ((const char *) shm_name); + vec_free (shm_name); +} + /* * svm_region_unmap * @@ -1056,7 +1080,7 @@ found: vec_free (name); region_unlock (rp); - shm_unlink (rp->region_name); + svm_region_unlink (rp); munmap ((void *) virtual_base, virtual_size); region_unlock (root_rp); svm_pop_heap (oldheap); @@ -1071,9 +1095,6 @@ found: /* * svm_region_exit - * There is no clean way to unlink the - * root region when all clients go away, - * so remove the pid entry and call it a day. */ void svm_region_exit () @@ -1116,6 +1137,9 @@ svm_region_exit () found: + if (vec_len (root_rp->client_pids) == 0) + svm_region_unlink (root_rp); + region_unlock (root_rp); svm_pop_heap (oldheap); diff --git a/src/svm/svm_fifo_segment.c b/src/svm/svm_fifo_segment.c index 69d4ecb9..c80374a7 100644 --- a/src/svm/svm_fifo_segment.c +++ b/src/svm/svm_fifo_segment.c @@ -105,7 +105,7 @@ svm_fifo_segment_create (svm_fifo_segment_create_args_t * a) s->ssvm.ssvm_size = a->segment_size; s->ssvm.i_am_master = 1; s->ssvm.my_pid = getpid (); - s->ssvm.name = (u8 *) a->segment_name; + s->ssvm.name = format (0, "%s", a->segment_name); s->ssvm.requested_va = sm->next_baseva; rv = ssvm_master_init (&s->ssvm, s - sm->segments); diff --git a/src/svm/svmdb.c b/src/svm/svmdb.c index 03dfe7c3..043b0924 100644 --- a/src/svm/svmdb.c +++ b/src/svm/svmdb.c @@ -106,11 +106,16 @@ svmdb_map (svmdb_map_args_t * dba) } /* Nope, it's our problem... */ - /* Add a bogus client (pid=0) so the svm won't be deallocated */ - oldheap = svm_push_pvt_heap (db_rp); - vec_add1 (client->db_rp->client_pids, 0); - svm_pop_heap (oldheap); - + if (CLIB_DEBUG > 2) + { + /* Add a bogus client (pid=0) so the svm won't be deallocated */ + clib_warning + ("[%d] adding fake client (pid=0) so '%s' won't be unlinked", + getpid (), db_rp->region_name); + oldheap = svm_push_pvt_heap (db_rp); + vec_add1 (client->db_rp->client_pids, 0); + svm_pop_heap (oldheap); + } oldheap = svm_push_data_heap (db_rp); vec_validate (hp, 0); diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index c810db4e..77528e77 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -87,17 +87,17 @@ typedef struct /* any change to the following line requres update of * vlib_buffer_get_free_list_index(...) and * vlib_buffer_set_free_list_index(...) functions */ -#define VLIB_BUFFER_FREE_LIST_INDEX_MASK ((1 << 4) - 1) +#define VLIB_BUFFER_FREE_LIST_INDEX_MASK ((1 << 5) - 1) -#define VLIB_BUFFER_IS_TRACED (1 << 4) -#define VLIB_BUFFER_LOG2_NEXT_PRESENT (5) +#define VLIB_BUFFER_IS_TRACED (1 << 5) +#define VLIB_BUFFER_LOG2_NEXT_PRESENT (6) #define VLIB_BUFFER_NEXT_PRESENT (1 << VLIB_BUFFER_LOG2_NEXT_PRESENT) -#define VLIB_BUFFER_IS_RECYCLED (1 << 6) -#define VLIB_BUFFER_TOTAL_LENGTH_VALID (1 << 7) -#define VLIB_BUFFER_REPL_FAIL (1 << 8) -#define VLIB_BUFFER_RECYCLE (1 << 9) -#define VLIB_BUFFER_FLOW_REPORT (1 << 10) -#define VLIB_BUFFER_EXT_HDR_VALID (1 << 11) +#define VLIB_BUFFER_IS_RECYCLED (1 << 7) +#define VLIB_BUFFER_TOTAL_LENGTH_VALID (1 << 8) +#define VLIB_BUFFER_REPL_FAIL (1 << 9) +#define VLIB_BUFFER_RECYCLE (1 << 10) +#define VLIB_BUFFER_FLOW_REPORT (1 << 11) +#define VLIB_BUFFER_EXT_HDR_VALID (1 << 12) /* User defined buffer flags. */ #define LOG2_VLIB_BUFFER_FLAG_USER(n) (32 - (n)) diff --git a/src/vpp.am b/src/vpp.am index 614bd26a..10a4e311 100644 --- a/src/vpp.am +++ b/src/vpp.am @@ -33,11 +33,11 @@ if WITH_APICLI vpp/api/plugin.h endif -# comment out to disable stats upload to gmond +# uncomment to enable stats upload to gmond +# bin_vpp_SOURCES += \ +# vpp/api/gmon.c bin_vpp_CFLAGS = @APICLI@ -bin_vpp_SOURCES += \ - vpp/api/gmon.c nobase_include_HEADERS += \ vpp/api/vpe_all_api_h.h \ diff --git a/src/vpp/api/api.c b/src/vpp/api/api.c index de9a2477..4e892431 100644 --- a/src/vpp/api/api.c +++ b/src/vpp/api/api.c @@ -686,7 +686,6 @@ static void BAD_SW_IF_INDEX_LABEL; -out: REPLY_MACRO (VL_API_PROXY_ARP_INTFC_ENABLE_DISABLE_REPLY); } diff --git a/src/vpp/api/api_main.c b/src/vpp/api/api_main.c index ac09cd15..c355a5fd 100644 --- a/src/vpp/api/api_main.c +++ b/src/vpp/api/api_main.c @@ -232,8 +232,6 @@ unformat_sw_if_index (unformat_input_t * input, va_list * args) u32 *result = va_arg (*args, u32 *); vnet_main_t *vnm = vnet_get_main (); u32 sw_if_index = ~0; - u8 *if_name; - uword *p; if (unformat (input, "%U", unformat_vnet_sw_interface, vnm, &sw_if_index)) { diff --git a/src/vpp/api/custom_dump.c b/src/vpp/api/custom_dump.c index 3ac8874e..7f3a58d9 100644 --- a/src/vpp/api/custom_dump.c +++ b/src/vpp/api/custom_dump.c @@ -1174,7 +1174,6 @@ static void *vl_api_sr_policy_mod_t_print (vl_api_sr_policy_mod_t * mp, void *handle) { u8 *s; - u32 weight; ip6_address_t *segments = 0, *seg; ip6_address_t *this_address = (ip6_address_t *) mp->segments; @@ -1216,8 +1215,6 @@ static void *vl_api_sr_policy_del_t_print u8 *s; s = format (0, "SCRIPT: sr_policy_del "); - u8 bsid_addr[16]; - u32 sr_policy_index; s = format (s, "To be delivered. Good luck."); FINISH; } @@ -2432,7 +2429,7 @@ static void *vl_api_lisp_add_del_remote_mapping_t_print (vl_api_lisp_add_del_remote_mapping_t * mp, void *handle) { u8 *s; - u32 i, rloc_num = 0; + u32 rloc_num = 0; s = format (0, "SCRIPT: lisp_add_del_remote_mapping "); @@ -2574,7 +2571,6 @@ static void *vl_api_lisp_add_del_locator_set_t_print (vl_api_lisp_add_del_locator_set_t * mp, void *handle) { u8 *s; - u32 loc_num = 0, i; s = format (0, "SCRIPT: lisp_add_del_locator_set "); @@ -2583,8 +2579,6 @@ static void *vl_api_lisp_add_del_locator_set_t_print s = format (s, "locator-set %s ", mp->locator_set_name); - loc_num = clib_net_to_host_u32 (mp->locator_num); - FINISH; } diff --git a/src/vpp/stats/stats.c b/src/vpp/stats/stats.c index 38821da7..422b7b3b 100644 --- a/src/vpp/stats/stats.c +++ b/src/vpp/stats/stats.c @@ -577,7 +577,6 @@ do_ip4_fibs (stats_main_t * sm) ip4_route_t *r; fib_table_t *fib; ip4_fib_t *v4_fib; - ip_lookup_main_t *lm = &im4->lookup_main; static uword *results; vl_api_vnet_ip4_fib_counters_t *mp = 0; u32 items_this_message; -- cgit 1.2.3-korg From 30af5da7522b38e9d4a4e9c906fd5f1c05088212 Mon Sep 17 00:00:00 2001 From: JingLiuZTE Date: Mon, 24 Jul 2017 10:53:31 +0800 Subject: VPP-905: Wrong define used in function start_workers. Change-Id: I6a5faebb63e9360cebfcfb1bc3f3c0eb6b15e937 Signed-off-by: JingLiuZTE --- src/vlib/threads.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'src/vlib') diff --git a/src/vlib/threads.c b/src/vlib/threads.c index 0c775e2d..0661d89a 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -642,7 +642,8 @@ start_workers (vlib_main_t * vm) } nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] = vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]); - vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) + vec_foreach (rt, + nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]) { vlib_node_t *n = vlib_get_node (vm, rt->node_index); rt->thread_index = vm_clone->thread_index; -- cgit 1.2.3-korg From 6b0f5892833254438adaaf786b7195c82e3fdd30 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 27 Jul 2017 04:01:24 -0400 Subject: Thread safe internal buffer manager Change-Id: I45845b952aa42a854e1c2c396b85f905de987020 Signed-off-by: Damjan Marion --- src/vlib/buffer.c | 7 +------ src/vlib/buffer.h | 2 ++ src/vlib/buffer_funcs.h | 15 +++++++++++++-- src/vlib/unix/physmem.c | 3 ++- 4 files changed, 18 insertions(+), 9 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index 53b60c16..a5c955c7 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -305,12 +305,6 @@ vlib_buffer_validate_alloc_free (vlib_main_t * vm, if (CLIB_DEBUG == 0) return; - ASSERT (vlib_get_thread_index () == 0); - - /* smp disaster check */ - if (vec_len (vlib_mains) > 1) - ASSERT (vm == vlib_mains[0]); - is_free = expected_state == VLIB_BUFFER_KNOWN_ALLOCATED; b = buffers; for (i = 0; i < n_buffers; i++) @@ -1050,6 +1044,7 @@ vlib_buffer_cb_init (struct vlib_main_t *vm) bm->cb.vlib_buffer_free_no_next_cb = &vlib_buffer_free_no_next_internal; bm->cb.vlib_buffer_delete_free_list_cb = &vlib_buffer_delete_free_list_internal; + clib_spinlock_init (&bm->buffer_known_hash_lockp); } /** @endcond */ diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 77528e77..9047ca9a 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -44,6 +44,7 @@ #include #include #include +#include #include /* for vlib_error_t */ #include /* for __PRE_DATA_SIZE */ @@ -423,6 +424,7 @@ typedef struct If buffer index is not in hash table then this buffer has never been allocated. */ uword *buffer_known_hash; + clib_spinlock_t buffer_known_hash_lockp; /* List of free-lists needing Blue Light Special announcements */ vlib_buffer_free_list_t **announce_list; diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 1aaac0b2..72008dad 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -219,9 +219,10 @@ always_inline vlib_buffer_known_state_t vlib_buffer_is_known (vlib_main_t * vm, u32 buffer_index) { vlib_buffer_main_t *bm = vm->buffer_main; - ASSERT (vlib_get_thread_index () == 0); + clib_spinlock_lock (&bm->buffer_known_hash_lockp); uword *p = hash_get (bm->buffer_known_hash, buffer_index); + clib_spinlock_unlock (&bm->buffer_known_hash_lockp); return p ? p[0] : VLIB_BUFFER_UNKNOWN; } @@ -231,8 +232,9 @@ vlib_buffer_set_known_state (vlib_main_t * vm, vlib_buffer_known_state_t state) { vlib_buffer_main_t *bm = vm->buffer_main; - ASSERT (vlib_get_thread_index () == 0); + clib_spinlock_lock (&bm->buffer_known_hash_lockp); hash_set (bm->buffer_known_hash, buffer_index, state); + clib_spinlock_unlock (&bm->buffer_known_hash_lockp); } /* Validates sanity of a single buffer. @@ -841,10 +843,19 @@ vlib_buffer_add_to_free_list (vlib_main_t * vm, u32 buffer_index, u8 do_init) { vlib_buffer_t *b; + u32 i; b = vlib_get_buffer (vm, buffer_index); if (PREDICT_TRUE (do_init)) vlib_buffer_init_for_free_list (b, f); vec_add1_aligned (f->buffers, buffer_index, CLIB_CACHE_LINE_BYTES); + + if (vec_len (f->buffers) > 3 * VLIB_FRAME_SIZE) + { + /* keep last stored buffers, as they are more likely hot in the cache */ + for (i = 0; i < VLIB_FRAME_SIZE; i++) + vm->os_physmem_free (vlib_get_buffer (vm, i)); + vec_delete (f->buffers, VLIB_FRAME_SIZE, 0); + } } always_inline void diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c index 933cc63b..27a5bacf 100644 --- a/src/vlib/unix/physmem.c +++ b/src/vlib/unix/physmem.c @@ -155,7 +155,8 @@ htlb_init (vlib_main_t * vm) pm->heap = mheap_alloc_with_flags (pm->mem, pm->mem_size, /* Don't want mheap mmap/munmap with IO memory. */ - MHEAP_FLAG_DISABLE_VM); + MHEAP_FLAG_DISABLE_VM | + MHEAP_FLAG_THREAD_SAFE); cur = pointer_to_uword (pm->mem); i = 0; -- cgit 1.2.3-korg From 66b11318a1e5f24880e3ec77c95d70647732a4a8 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Mon, 31 Jul 2017 17:18:03 -0700 Subject: Fix tcp tx buffer allocation - Make tcp output buffer allocation macro an inline function - Use per ip version per thread tx frames for retransmits and timer events - Fix / parameterize tcp data structure preallocation - Add a couple of gdb-callable show commands - Fix local endpoint cleanup Change-Id: I67b47b7570aa14cb4634b6fd93c57cd2eacbfa29 Signed-off-by: Florin Coras Signed-off-by: Dave Barach --- src/plugins/dpdk/device/cli.c | 1 + src/vlib/error.c | 2 +- src/vlib/node_funcs.h | 16 ++-- src/vnet/ip/ip4.h | 2 - src/vnet/session/session.c | 82 +++++++++++++++-- src/vnet/session/session.h | 10 ++ src/vnet/session/session_cli.c | 2 +- src/vnet/session/session_lookup.c | 40 ++++++-- src/vnet/tcp/builtin_client.c | 39 ++++++-- src/vnet/tcp/tcp.c | 52 +++++++---- src/vnet/tcp/tcp.h | 12 ++- src/vnet/tcp/tcp_input.c | 2 + src/vnet/tcp/tcp_output.c | 188 +++++++++++++++++++++++++------------- src/vnet/unix/gdb_funcs.c | 45 ++++++++- src/vppinfra/pool.h | 2 +- 15 files changed, 375 insertions(+), 120 deletions(-) (limited to 'src/vlib') diff --git a/src/plugins/dpdk/device/cli.c b/src/plugins/dpdk/device/cli.c index aeeb772d..fe1c41c2 100644 --- a/src/plugins/dpdk/device/cli.c +++ b/src/plugins/dpdk/device/cli.c @@ -357,6 +357,7 @@ show_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input, "name=\"%s\" available = %7d allocated = %7d total = %7d\n", rmp->name, (u32) count, (u32) free_count, (u32) (count + free_count)); + rte_mempool_dump (stderr, rmp); } else { diff --git a/src/vlib/error.c b/src/vlib/error.c index e4ed4ee3..dec90bbe 100644 --- a/src/vlib/error.c +++ b/src/vlib/error.c @@ -280,7 +280,7 @@ show_errors (vlib_main_t * vm, } /* *INDENT-OFF* */ -VLIB_CLI_COMMAND (cli_show_errors, static) = { +VLIB_CLI_COMMAND (vlib_cli_show_errors) = { .path = "show errors", .short_help = "Show error counts", .function = show_errors, diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index c0389b2f..c4c06454 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -410,19 +410,21 @@ vlib_frame_t *vlib_get_frame_to_node (vlib_main_t * vm, u32 to_node_index); void vlib_put_frame_to_node (vlib_main_t * vm, u32 to_node_index, vlib_frame_t * f); -always_inline vlib_process_t * -vlib_get_current_process (vlib_main_t * vm) -{ - vlib_node_main_t *nm = &vm->node_main; - return vec_elt (nm->processes, nm->current_process_index); -} - always_inline uword vlib_in_process_context (vlib_main_t * vm) { return vm->node_main.current_process_index != ~0; } +always_inline vlib_process_t * +vlib_get_current_process (vlib_main_t * vm) +{ + vlib_node_main_t *nm = &vm->node_main; + if (vlib_in_process_context (vm)) + return vec_elt (nm->processes, nm->current_process_index); + return 0; +} + always_inline uword vlib_current_process (vlib_main_t * vm) { diff --git a/src/vnet/ip/ip4.h b/src/vnet/ip/ip4.h index 8f9a8e27..74faa059 100644 --- a/src/vnet/ip/ip4.h +++ b/src/vnet/ip/ip4.h @@ -354,8 +354,6 @@ vlib_buffer_push_ip4 (vlib_main_t * vm, vlib_buffer_t * b, ih->checksum = 0; b->flags |= VNET_BUFFER_F_OFFLOAD_IP_CKSUM | VNET_BUFFER_F_IS_IP4; vnet_buffer (b)->l3_hdr_offset = (u8 *) ih - b->data; - vnet_buffer (b)->l4_hdr_offset = vnet_buffer (b)->l3_hdr_offset + - sizeof (*ih); } else ih->checksum = ip4_header_checksum (ih); diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 004c7193..4ba15291 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -759,6 +759,7 @@ session_manager_main_enable (vlib_main_t * vm) session_manager_main_t *smm = &session_manager_main; vlib_thread_main_t *vtm = vlib_get_thread_main (); u32 num_threads; + u32 preallocated_sessions_per_worker; int i; num_threads = 1 /* main thread */ + vtm->n_threads; @@ -795,15 +796,35 @@ session_manager_main_enable (vlib_main_t * vm) for (i = 0; i < vec_len (smm->vpp_event_queues); i++) session_vpp_event_queue_allocate (smm, i); - /* $$$$ preallocate hack config parameter */ - for (i = 0; i < smm->preallocated_sessions; i++) + /* Preallocate sessions */ + if (num_threads == 1) { - stream_session_t *ss __attribute__ ((unused)); - pool_get_aligned (smm->sessions[0], ss, CLIB_CACHE_LINE_BYTES); + for (i = 0; i < smm->preallocated_sessions; i++) + { + stream_session_t *ss __attribute__ ((unused)); + pool_get_aligned (smm->sessions[0], ss, CLIB_CACHE_LINE_BYTES); + } + + for (i = 0; i < smm->preallocated_sessions; i++) + pool_put_index (smm->sessions[0], i); } + else + { + int j; + preallocated_sessions_per_worker = smm->preallocated_sessions / + (num_threads - 1); - for (i = 0; i < smm->preallocated_sessions; i++) - pool_put_index (smm->sessions[0], i); + for (j = 1; j < num_threads; j++) + { + for (i = 0; i < preallocated_sessions_per_worker; i++) + { + stream_session_t *ss __attribute__ ((unused)); + pool_get_aligned (smm->sessions[j], ss, CLIB_CACHE_LINE_BYTES); + } + for (i = 0; i < preallocated_sessions_per_worker; i++) + pool_put_index (smm->sessions[j], i); + } + } session_lookup_init (); @@ -863,6 +884,7 @@ session_config_fn (vlib_main_t * vm, unformat_input_t * input) { session_manager_main_t *smm = &session_manager_main; u32 nitems; + uword tmp; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { @@ -873,9 +895,53 @@ session_config_fn (vlib_main_t * vm, unformat_input_t * input) else clib_warning ("event queue length %d too small, ignored", nitems); } - if (unformat (input, "preallocated-sessions %d", - &smm->preallocated_sessions)) + else if (unformat (input, "preallocated-sessions %d", + &smm->preallocated_sessions)) + ; + else if (unformat (input, "v4-session-table-buckets %d", + &smm->configured_v4_session_table_buckets)) ; + else if (unformat (input, "v4-halfopen-table-buckets %d", + &smm->configured_v4_halfopen_table_buckets)) + ; + else if (unformat (input, "v6-session-table-buckets %d", + &smm->configured_v6_session_table_buckets)) + ; + else if (unformat (input, "v6-halfopen-table-buckets %d", + &smm->configured_v6_halfopen_table_buckets)) + ; + else if (unformat (input, "v4-session-table-memory %U", + unformat_memory_size, &tmp)) + { + if (tmp >= 0x100000000) + return clib_error_return (0, "memory size %llx (%lld) too large", + tmp, tmp); + smm->configured_v4_session_table_memory = tmp; + } + else if (unformat (input, "v4-halfopen-table-memory %U", + unformat_memory_size, &tmp)) + { + if (tmp >= 0x100000000) + return clib_error_return (0, "memory size %llx (%lld) too large", + tmp, tmp); + smm->configured_v4_halfopen_table_memory = tmp; + } + else if (unformat (input, "v6-session-table-memory %U", + unformat_memory_size, &tmp)) + { + if (tmp >= 0x100000000) + return clib_error_return (0, "memory size %llx (%lld) too large", + tmp, tmp); + smm->configured_v6_session_table_memory = tmp; + } + else if (unformat (input, "v6-halfopen-table-memory %U", + unformat_memory_size, &tmp)) + { + if (tmp >= 0x100000000) + return clib_error_return (0, "memory size %llx (%lld) too large", + tmp, tmp); + smm->configured_v6_halfopen_table_memory = tmp; + } else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); diff --git a/src/vnet/session/session.h b/src/vnet/session/session.h index 180b9f8a..538433da 100644 --- a/src/vnet/session/session.h +++ b/src/vnet/session/session.h @@ -133,6 +133,16 @@ struct _session_manager_main /** vpp fifo event queue configured length */ u32 configured_event_queue_length; + /** session table size parameters */ + u32 configured_v4_session_table_buckets; + u32 configured_v4_session_table_memory; + u32 configured_v4_halfopen_table_buckets; + u32 configured_v4_halfopen_table_memory; + u32 configured_v6_session_table_buckets; + u32 configured_v6_session_table_memory; + u32 configured_v6_halfopen_table_buckets; + u32 configured_v6_halfopen_table_memory; + /** Unique segment name counter */ u32 unique_segment_name_counter; diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index de564ea7..9f3d217c 100755 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -312,7 +312,7 @@ show_session_command_fn (vlib_main_t * vm, unformat_input_t * input, } /* *INDENT-OFF* */ -VLIB_CLI_COMMAND (show_session_command, static) = +VLIB_CLI_COMMAND (vlib_cli_show_session_command) = { .path = "show session", .short_help = "show session [verbose]", diff --git a/src/vnet/session/session_lookup.c b/src/vnet/session/session_lookup.c index 1ce22f80..41f9dbf0 100644 --- a/src/vnet/session/session_lookup.c +++ b/src/vnet/session/session_lookup.c @@ -569,23 +569,45 @@ stream_session_lookup_transport6 (ip6_address_t * lcl, ip6_address_t * rmt, return 0; } +#define foreach_hash_table_parameter \ + _(v4,session,buckets,20000) \ + _(v4,session,memory,(64<<20)) \ + _(v6,session,buckets,20000) \ + _(v6,session,memory,(64<<20)) \ + _(v4,halfopen,buckets,20000) \ + _(v4,halfopen,memory,(64<<20)) \ + _(v6,halfopen,buckets,20000) \ + _(v6,halfopen,memory,(64<<20)) + void session_lookup_init (void) { session_lookup_t *sl = &session_lookup; + +#define _(af,table,parm,value) \ + u32 configured_##af##_##table##_table_##parm = value; + foreach_hash_table_parameter; +#undef _ + +#define _(af,table,parm,value) \ + if (session_manager_main.configured_##af##_##table##_table_##parm) \ + configured_##af##_##table##_table_##parm = \ + session_manager_main.configured_##af##_##table##_table_##parm; + foreach_hash_table_parameter; +#undef _ + clib_bihash_init_16_8 (&sl->v4_session_hash, "v4 session table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); + configured_v4_session_table_buckets, + configured_v4_session_table_memory); clib_bihash_init_48_8 (&sl->v6_session_hash, "v6 session table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); - + configured_v6_session_table_buckets, + configured_v6_session_table_memory); clib_bihash_init_16_8 (&sl->v4_half_open_hash, "v4 half-open table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); + configured_v4_halfopen_table_buckets, + configured_v4_halfopen_table_memory); clib_bihash_init_48_8 (&sl->v6_half_open_hash, "v6 half-open table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); + configured_v6_halfopen_table_buckets, + configured_v6_halfopen_table_memory); } /* diff --git a/src/vnet/tcp/builtin_client.c b/src/vnet/tcp/builtin_client.c index 27e20f8e..48daffb4 100644 --- a/src/vnet/tcp/builtin_client.c +++ b/src/vnet/tcp/builtin_client.c @@ -597,8 +597,9 @@ clients_connect (vlib_main_t * vm, u8 * uri, u32 n_clients) a->mp = 0; vnet_connect_uri (a); - /* Crude pacing for call setups, 100k/sec */ - vlib_process_suspend (vm, 10e-6); + /* Crude pacing for call setups */ + if ((i % 4) == 0) + vlib_process_suspend (vm, 10e-6); } } @@ -612,8 +613,10 @@ test_tcp_clients_command_fn (vlib_main_t * vm, uword *event_data = 0, event_type; u8 *default_connect_uri = (u8 *) "tcp://6.0.1.1/1234", *uri; u64 tmp, total_bytes; - f64 cli_timeout = 20.0, delta; + f64 test_timeout = 20.0, syn_timeout = 20.0, delta; + f64 time_before_connects; u32 n_clients = 1; + int preallocate_sessions = 0; char *transfer_type; int i; @@ -640,7 +643,9 @@ test_tcp_clients_command_fn (vlib_main_t * vm, ; else if (unformat (input, "uri %s", &tm->connect_uri)) ; - else if (unformat (input, "cli-timeout %f", &cli_timeout)) + else if (unformat (input, "test-timeout %f", &test_timeout)) + ; + else if (unformat (input, "syn-timeout %f", &syn_timeout)) ; else if (unformat (input, "no-return")) tm->no_return = 1; @@ -657,6 +662,8 @@ test_tcp_clients_command_fn (vlib_main_t * vm, tm->private_segment_size = tmp; else if (unformat (input, "preallocate-fifos")) tm->prealloc_fifos = 1; + else if (unformat (input, "preallocate-sessions")) + preallocate_sessions = 1; else if (unformat (input, "client-batch %d", &tm->connections_per_batch)) ; @@ -674,6 +681,7 @@ test_tcp_clients_command_fn (vlib_main_t * vm, return clib_error_return (0, "failed init"); } + tm->ready_connections = 0; tm->expected_connections = n_clients; tm->rx_total = 0; @@ -705,11 +713,21 @@ test_tcp_clients_command_fn (vlib_main_t * vm, vlib_node_set_state (vlib_mains[i], builtin_client_node.index, VLIB_NODE_STATE_POLLING); + if (preallocate_sessions) + { + session_t *sp __attribute__ ((unused)); + for (i = 0; i < n_clients; i++) + pool_get (tm->sessions, sp); + for (i = 0; i < n_clients; i++) + pool_put_index (tm->sessions, i); + } + /* Fire off connect requests */ + time_before_connects = vlib_time_now (vm); clients_connect (vm, uri, n_clients); /* Park until the sessions come up, or ten seconds elapse... */ - vlib_process_wait_for_event_or_clock (vm, 10 /* timeout, seconds */ ); + vlib_process_wait_for_event_or_clock (vm, syn_timeout); event_type = vlib_process_get_events (vm, &event_data); switch (event_type) { @@ -719,6 +737,15 @@ test_tcp_clients_command_fn (vlib_main_t * vm, goto cleanup; case 1: + delta = vlib_time_now (vm) - time_before_connects; + + if (delta != 0.0) + { + vlib_cli_output + (vm, "%d three-way handshakes in %.2f seconds, %.2f/sec", + n_clients, delta, ((f64) n_clients) / delta); + } + tm->test_start_time = vlib_time_now (tm->vlib_main); vlib_cli_output (vm, "Test started at %.6f", tm->test_start_time); break; @@ -729,7 +756,7 @@ test_tcp_clients_command_fn (vlib_main_t * vm, } /* Now wait for the sessions to finish... */ - vlib_process_wait_for_event_or_clock (vm, cli_timeout); + vlib_process_wait_for_event_or_clock (vm, test_timeout); event_type = vlib_process_get_events (vm, &event_data); switch (event_type) { diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 59b20747..8e2eb9f4 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -173,7 +173,7 @@ tcp_connection_cleanup (tcp_connection_t * tc) /* Cleanup local endpoint if this was an active connect */ tepi = transport_endpoint_lookup (&tm->local_endpoints_table, &tc->c_lcl_ip, - tc->c_lcl_port); + clib_net_to_host_u16 (tc->c_lcl_port)); if (tepi != TRANSPORT_ENDPOINT_INVALID_INDEX) { tep = pool_elt_at_index (tm->local_endpoints, tepi); @@ -367,25 +367,24 @@ tcp_allocate_local_port (ip46_address_t * ip) { tcp_main_t *tm = vnet_get_tcp_main (); transport_endpoint_t *tep; - u32 time_now, tei; + u32 tei; u16 min = 1024, max = 65535; /* XXX configurable ? */ - int tries; + int tries, limit; - tries = max - min; - time_now = tcp_time_now (); + limit = max - min; /* Only support active opens from thread 0 */ ASSERT (vlib_get_thread_index () == 0); /* Search for first free slot */ - for (; tries >= 0; tries--) + for (tries = 0; tries < limit; tries++) { u16 port = 0; /* Find a port in the specified range */ while (1) { - port = random_u32 (&time_now) & PORT_MASK; + port = random_u32 (&tm->port_allocator_seed) & PORT_MASK; if (PREDICT_TRUE (port >= min && port < max)) break; } @@ -1189,8 +1188,9 @@ tcp_main_enable (vlib_main_t * vm) vlib_thread_main_t *vtm = vlib_get_thread_main (); clib_error_t *error = 0; u32 num_threads; - int thread, i; + int i, thread; tcp_connection_t *tc __attribute__ ((unused)); + u32 preallocated_connections_per_thread; if ((error = vlib_call_init_function (vm, ip_main_init))) return error; @@ -1224,14 +1224,26 @@ tcp_main_enable (vlib_main_t * vm) vec_validate (tm->connections, num_threads - 1); /* - * Preallocate connections + * Preallocate connections. Assume that thread 0 won't + * use preallocated threads when running multi-core */ - for (thread = 0; thread < num_threads; thread++) + if (num_threads == 1) { - for (i = 0; i < tm->preallocated_connections; i++) + thread = 0; + preallocated_connections_per_thread = tm->preallocated_connections; + } + else + { + thread = 1; + preallocated_connections_per_thread = + tm->preallocated_connections / (num_threads - 1); + } + for (; thread < num_threads; thread++) + { + for (i = 0; i < preallocated_connections_per_thread; i++) pool_get (tm->connections[thread], tc); - for (i = 0; i < tm->preallocated_connections; i++) + for (i = 0; i < preallocated_connections_per_thread; i++) pool_put_index (tm->connections[thread], i); } @@ -1257,13 +1269,21 @@ tcp_main_enable (vlib_main_t * vm) / TCP_TSTAMP_RESOLUTION; clib_bihash_init_24_8 (&tm->local_endpoints_table, "local endpoint table", - 200000 /* $$$$ config parameter nbuckets */ , - (64 << 20) /*$$$ config parameter table size */ ); + 1000000 /* $$$$ config parameter nbuckets */ , + (512 << 20) /*$$$ config parameter table size */ ); + + /* Initialize [port-allocator] random number seed */ + tm->port_allocator_seed = (u32) clib_cpu_time_now (); + if (num_threads > 1) { clib_spinlock_init (&tm->half_open_lock); clib_spinlock_init (&tm->local_endpoints_lock); } + + vec_validate (tm->tx_frames[0], num_threads - 1); + vec_validate (tm->tx_frames[1], num_threads - 1); + return error; } @@ -1289,16 +1309,12 @@ clib_error_t * tcp_init (vlib_main_t * vm) { tcp_main_t *tm = vnet_get_tcp_main (); - - tm->vnet_main = vnet_get_main (); tm->is_enabled = 0; - return 0; } VLIB_INIT_FUNCTION (tcp_init); - static clib_error_t * tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) { diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 4fa681f8..997df76f 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -369,6 +369,8 @@ typedef struct _tcp_main /** per-worker tx buffer free lists */ u32 **tx_buffers; + /** per-worker tx frames to 4/6 output nodes */ + vlib_frame_t **tx_frames[2]; /* Per worker-thread timer wheel for connections timers */ tw_timer_wheel_16t_2w_512sl_t *timer_wheels; @@ -400,11 +402,8 @@ typedef struct _tcp_main u32 last_v6_address_rotor; ip6_address_t *ip6_src_addresses; - /* convenience */ - vlib_main_t *vlib_main; - vnet_main_t *vnet_main; - ip4_main_t *ip4_main; - ip6_main_t *ip6_main; + /** Port allocator random number generator seed */ + u32 port_allocator_seed; } tcp_main_t; extern tcp_main_t tcp_main; @@ -493,6 +492,8 @@ void tcp_send_fin (tcp_connection_t * tc); void tcp_init_mss (tcp_connection_t * tc); void tcp_update_snd_mss (tcp_connection_t * tc); void tcp_update_rto (tcp_connection_t * tc); +void tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4); +void tcp_flush_frames_to_output (u8 thread_index); always_inline u32 tcp_end_seq (tcp_header_t * th, u32 len) @@ -614,6 +615,7 @@ tcp_update_time (f64 now, u32 thread_index) { tw_timer_expire_timers_16t_2w_512sl (&tcp_main.timer_wheels[thread_index], now); + tcp_flush_frames_to_output (thread_index); } u32 tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b); diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 6c59d70f..29f4f08d 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1751,6 +1751,8 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, errors = session_manager_flush_enqueue_events (my_thread_index); tcp_established_inc_counter (vm, is_ip4, TCP_ERROR_EVENT_FIFO_FULL, errors); + tcp_flush_frame_to_output (vm, my_thread_index, is_ip4); + return from_frame->n_vectors; } diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index ad13493a..f8fbb8a9 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -436,34 +436,41 @@ tcp_init_mss (tcp_connection_t * tc) tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP; } -#define tcp_get_free_buffer_index(tm, bidx) \ -do { \ - u32 *my_tx_buffers, n_free_buffers; \ - u32 thread_index = vlib_get_thread_index(); \ - my_tx_buffers = tm->tx_buffers[thread_index]; \ - if (PREDICT_FALSE(vec_len (my_tx_buffers) == 0)) \ - { \ - n_free_buffers = 32; /* TODO config or macro */ \ - vec_validate (my_tx_buffers, n_free_buffers - 1); \ - _vec_len(my_tx_buffers) = vlib_buffer_alloc_from_free_list ( \ - vlib_get_main(), my_tx_buffers, n_free_buffers, \ - VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); \ - tm->tx_buffers[thread_index] = my_tx_buffers; \ - } \ - /* buffer shortage */ \ - if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) \ - return; \ - *bidx = my_tx_buffers[_vec_len (my_tx_buffers)-1]; \ - _vec_len (my_tx_buffers) -= 1; \ -} while (0) - -#define tcp_return_buffer(tm) \ -do { \ - u32 *my_tx_buffers; \ - u32 thread_index = vlib_get_thread_index(); \ - my_tx_buffers = tm->tx_buffers[thread_index]; \ - _vec_len (my_tx_buffers) +=1; \ -} while (0) +always_inline int +tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx) +{ + u32 *my_tx_buffers, n_free_buffers; + u32 thread_index = vlib_get_thread_index (); + my_tx_buffers = tm->tx_buffers[thread_index]; + if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) + { + n_free_buffers = VLIB_FRAME_SIZE; + vec_validate (my_tx_buffers, n_free_buffers - 1); + _vec_len (my_tx_buffers) = + vlib_buffer_alloc_from_free_list (vlib_get_main (), my_tx_buffers, + n_free_buffers, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + /* buffer shortage, report failure */ + if (vec_len (my_tx_buffers) == 0) + { + clib_warning ("out of buffers"); + return -1; + } + tm->tx_buffers[thread_index] = my_tx_buffers; + } + *bidx = my_tx_buffers[_vec_len (my_tx_buffers) - 1]; + _vec_len (my_tx_buffers) -= 1; + return 0; +} + +always_inline void +tcp_return_buffer (tcp_main_t * tm) +{ + u32 *my_tx_buffers; + u32 thread_index = vlib_get_thread_index (); + my_tx_buffers = tm->tx_buffers[thread_index]; + _vec_len (my_tx_buffers) += 1; +} always_inline void tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) @@ -706,7 +713,9 @@ tcp_send_reset (tcp_connection_t * tc, vlib_buffer_t * pkt, u8 is_ip4) ip4_header_t *ih4, *pkt_ih4; ip6_header_t *ih6, *pkt_ih6; - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); /* Leave enough space for headers */ @@ -811,7 +820,9 @@ tcp_send_syn (tcp_connection_t * tc) u16 initial_wnd; tcp_options_t snd_opts; - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); /* Leave enough space for headers */ @@ -854,8 +865,11 @@ tcp_send_syn (tcp_connection_t * tc) } always_inline void -tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) +tcp_enqueue_to_output_i (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4, u8 flush) { + tcp_main_t *tm = vnet_get_tcp_main (); + u32 thread_index = vlib_get_thread_index (); u32 *to_next, next_index; vlib_frame_t *f; @@ -872,12 +886,62 @@ tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) b->pre_data[1] = next_index; } - /* Enqueue the packet */ - f = vlib_get_frame_to_node (vm, next_index); + /* Get frame to v4/6 output node */ + f = tm->tx_frames[!is_ip4][thread_index]; + if (!f) + { + f = vlib_get_frame_to_node (vm, next_index); + ASSERT (f); + tm->tx_frames[!is_ip4][thread_index] = f; + } to_next = vlib_frame_vector_args (f); - to_next[0] = bi; - f->n_vectors = 1; - vlib_put_frame_to_node (vm, next_index, f); + to_next[f->n_vectors] = bi; + f->n_vectors += 1; + if (flush || f->n_vectors == VLIB_FRAME_SIZE) + { + vlib_put_frame_to_node (vm, next_index, f); + tm->tx_frames[!is_ip4][thread_index] = 0; + } +} + +always_inline void +tcp_enqueue_to_output (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, u8 is_ip4) +{ + tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 0); +} + +always_inline void +tcp_enqueue_to_output_now (vlib_main_t * vm, vlib_buffer_t * b, u32 bi, + u8 is_ip4) +{ + tcp_enqueue_to_output_i (vm, b, bi, is_ip4, 1); +} + +/** + * Flush tx frame populated by retransmits and timer pops + */ +void +tcp_flush_frame_to_output (vlib_main_t * vm, u8 thread_index, u8 is_ip4) +{ + if (tcp_main.tx_frames[!is_ip4][thread_index]) + { + u32 next_index; + next_index = is_ip4 ? tcp4_output_node.index : tcp6_output_node.index; + vlib_put_frame_to_node (vm, next_index, + tcp_main.tx_frames[!is_ip4][thread_index]); + tcp_main.tx_frames[!is_ip4][thread_index] = 0; + } +} + +/** + * Flush both v4 and v6 tx frames for thread index + */ +void +tcp_flush_frames_to_output (u8 thread_index) +{ + vlib_main_t *vm = vlib_get_main (); + tcp_flush_frame_to_output (vm, thread_index, 1); + tcp_flush_frame_to_output (vm, thread_index, 0); } /** @@ -891,14 +955,15 @@ tcp_send_fin (tcp_connection_t * tc) tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; b = vlib_get_buffer (vm, bi); /* Leave enough space for headers */ vlib_buffer_make_headroom (b, MAX_HDRS_LEN); tcp_make_fin (tc, b); - tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); + tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4); tc->flags |= TCP_CONN_FINSNT; tcp_retransmit_timer_force_update (tc); TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); @@ -981,7 +1046,8 @@ tcp_send_ack (tcp_connection_t * tc) u32 bi; /* Get buffer */ - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; b = vlib_get_buffer (vm, bi); /* Fill in the ACK */ @@ -1108,7 +1174,9 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Go back to first un-acked byte */ tc->snd_nxt = tc->snd_una; - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); if (tc->state >= TCP_STATE_ESTABLISHED) @@ -1116,6 +1184,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Lost FIN, retransmit and return */ if (tc->flags & TCP_CONN_FINSNT) { + tcp_return_buffer (tm); tcp_send_fin (tc); return; } @@ -1143,6 +1212,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tcp_retransmit_timer_set (tc); ASSERT (0 || (tc->rto_boff > 1 && tc->snd_una == tc->snd_congestion)); + tcp_return_buffer (tm); return; } @@ -1164,6 +1234,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) clib_warning ("could not remove half-open connection"); ASSERT (0); } + tcp_return_buffer (tm); return; } @@ -1185,6 +1256,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { ASSERT (tc->state == TCP_STATE_CLOSED); clib_warning ("connection closed ..."); + tcp_return_buffer (tm); return; } @@ -1254,7 +1326,9 @@ tcp_timer_persist_handler (u32 index) tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); /* Try to force the first unsent segment */ - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); @@ -1300,7 +1374,9 @@ tcp_retransmit_first_unacked (tcp_connection_t * tc) tc->snd_nxt = tc->snd_una; /* Get buffer */ - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2); @@ -1344,9 +1420,10 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); while (hole && snd_space > 0) { - tcp_get_free_buffer_index (tm, &bi); - b = vlib_get_buffer (vm, bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); hole = scoreboard_next_rxt_hole (sb, hole, tcp_fastrecovery_sent_1_smss (tc), &can_rescue, &snd_limited); @@ -1414,9 +1491,9 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc) while (snd_space > 0) { - tcp_get_free_buffer_index (tm, &bi); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; b = vlib_get_buffer (vm, bi); - offset += n_written; n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space); @@ -1506,32 +1583,21 @@ tcp46_output_inline (vlib_main_t * vm, if (is_ip4) { - ip4_header_t *ih0; - ih0 = vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4, - &tc0->c_rmt_ip4, IP_PROTOCOL_TCP, - 1); - b0->flags |= - VNET_BUFFER_F_IS_IP4 | VNET_BUFFER_F_OFFLOAD_IP_CKSUM | - VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; - vnet_buffer (b0)->l3_hdr_offset = (u8 *) ih0 - b0->data; + vlib_buffer_push_ip4 (vm, b0, &tc0->c_lcl_ip4, &tc0->c_rmt_ip4, + IP_PROTOCOL_TCP, 1); + b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data; th0->checksum = 0; } else { ip6_header_t *ih0; - int bogus = ~0; - ih0 = vlib_buffer_push_ip6 (vm, b0, &tc0->c_lcl_ip6, &tc0->c_rmt_ip6, IP_PROTOCOL_TCP); - - b0->flags |= VNET_BUFFER_F_IS_IP6 | - VNET_BUFFER_F_OFFLOAD_IP_CKSUM | - VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; + b0->flags |= VNET_BUFFER_F_OFFLOAD_TCP_CKSUM; vnet_buffer (b0)->l3_hdr_offset = (u8 *) ih0 - b0->data; vnet_buffer (b0)->l4_hdr_offset = (u8 *) th0 - b0->data; th0->checksum = 0; - ASSERT (!bogus); } /* Filter out DUPACKs if there are no OOO segments left */ diff --git a/src/vnet/unix/gdb_funcs.c b/src/vnet/unix/gdb_funcs.c index cca2e420..32e22d92 100644 --- a/src/vnet/unix/gdb_funcs.c +++ b/src/vnet/unix/gdb_funcs.c @@ -21,7 +21,7 @@ #include #include - +#include /** * @brief GDB callable function: vl - Return vector length of vector @@ -135,6 +135,47 @@ void vlib_runtime_index_to_node_name (u32 index) fformat(stderr, "node runtime index %d name %s\n", index, nm->nodes[index]->name); } +void gdb_show_errors (int verbose) +{ + extern vlib_cli_command_t vlib_cli_show_errors; + unformat_input_t input; + vlib_main_t * vm = vlib_get_main(); + + if (verbose == 0) + unformat_init_string (&input, "verbose 0", 9); + else if (verbose == 1) + unformat_init_string (&input, "verbose 1", 9); + else + { + fformat(stderr, "verbose not 0 or 1\n"); + return; + } + + vlib_cli_show_errors.function (vm, &input, 0 /* cmd */); + unformat_free (&input); +} + +void gdb_show_session (int verbose) +{ + extern vlib_cli_command_t vlib_cli_show_session_command; + unformat_input_t input; + vlib_main_t * vm = vlib_get_main(); + + if (verbose == 0) + unformat_init_string (&input, "verbose 0", 9); + else if (verbose == 1) + unformat_init_string (&input, "verbose 1", 9); + else if (verbose == 2) + unformat_init_string (&input, "verbose 2", 9); + else + { + fformat(stderr, "verbose not 0 - 2\n"); + return; + } + + vlib_cli_show_session_command.function (vm, &input, 0 /* cmd */); + unformat_free (&input); +} /** * @brief GDB callable function: show_gdb_command_fn - show gdb @@ -151,6 +192,8 @@ show_gdb_command_fn (vlib_main_t * vm, vlib_cli_output (vm, "vl(p) returns vec_len(p)"); vlib_cli_output (vm, "pe(p) returns pool_elts(p)"); vlib_cli_output (vm, "pifi(p, i) returns pool_is_free_index(p, i)"); + vlib_cli_output (vm, "gdb_show_errors(0|1) dumps error counters"); + vlib_cli_output (vm, "gdb_show_session dumps session counters"); vlib_cli_output (vm, "debug_hex_bytes (ptr, n_bytes) dumps n_bytes in hex"); vlib_cli_output (vm, "vlib_dump_frame_ownership() does what it says"); vlib_cli_output (vm, "vlib_runtime_index_to_node_name (index) prints NN"); diff --git a/src/vppinfra/pool.h b/src/vppinfra/pool.h index 57838e1c..56536b77 100644 --- a/src/vppinfra/pool.h +++ b/src/vppinfra/pool.h @@ -200,7 +200,7 @@ do { \ #define pool_get(P,E) pool_get_aligned(P,E,0) /** See if pool_get will expand the pool or not */ -#define pool_get_aligned_will_expand (P,YESNO,A) \ +#define pool_get_aligned_will_expand(P,YESNO,A) \ do { \ pool_header_t * _pool_var (p) = pool_header (P); \ uword _pool_var (l); \ -- cgit 1.2.3-korg From b2215d6b0d8ef7d425d2b9eea524a1c055a9f3b3 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Tue, 1 Aug 2017 16:56:58 -0700 Subject: Fix tcp multi buffer segments retransmission - Fix tcp/udp sw checksum computation - Fix allocation of multi buffer tcp segments for retransmits - Send FIN only if/when tx fifo is empty Change-Id: I2e43a14b87a72c9e547b4339b9a51811cf5732c4 Signed-off-by: Florin Coras --- src/vlib/buffer_funcs.h | 7 +- src/vnet/ip/ip4_forward.c | 12 +- src/vnet/session/session.c | 6 +- src/vnet/session/session_node.c | 39 ++++-- src/vnet/tcp/tcp.c | 39 ++++-- src/vnet/tcp/tcp.h | 18 ++- src/vnet/tcp/tcp_input.c | 48 +++++-- src/vnet/tcp/tcp_output.c | 290 +++++++++++++++++++++++++--------------- 8 files changed, 304 insertions(+), 155 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 72008dad..6a662416 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -833,7 +833,12 @@ vlib_buffer_init_for_free_list (vlib_buffer_t * dst, _(current_length); _(flags); #undef _ - ASSERT (dst->total_length_not_including_first_buffer == 0); + /* ASSERT (dst->total_length_not_including_first_buffer == 0); */ + /* total_length_not_including_first_buffer is not in the template anymore + * so it may actually not zeroed for some buffers. One option is to + * uncomment the line lower (comes at a cost), the other, is to just not + * care */ + /* dst->total_length_not_including_first_buffer = 0; */ ASSERT (dst->n_add_refs == 0); } diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index 7a8d7a0c..496df3c7 100755 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -1454,7 +1454,7 @@ ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0, { ip_csum_t sum0; u32 ip_header_length, payload_length_host_byte_order; - u32 n_this_buffer, n_bytes_left; + u32 n_this_buffer, n_bytes_left, n_ip_bytes_this_buffer; u16 sum16; void *data_this_buffer; @@ -1481,10 +1481,12 @@ ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0, n_bytes_left = n_this_buffer = payload_length_host_byte_order; data_this_buffer = (void *) ip0 + ip_header_length; - if (n_this_buffer + ip_header_length > p0->current_length) - n_this_buffer = - p0->current_length > - ip_header_length ? p0->current_length - ip_header_length : 0; + n_ip_bytes_this_buffer = p0->current_length - (((u8 *) ip0 - p0->data) - p0->current_data); + if (n_this_buffer + ip_header_length > n_ip_bytes_this_buffer) + { + n_this_buffer = n_ip_bytes_this_buffer > ip_header_length ? + n_ip_bytes_this_buffer - ip_header_length : 0; + } while (1) { sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer); diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 533a6c22..3a3e4dfe 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -98,9 +98,9 @@ session_enqueue_chain_tail (stream_session_t * s, vlib_buffer_t * b, u32 offset, u8 is_in_order) { vlib_buffer_t *chain_b; - u32 chain_bi = b->next_buffer; + u32 chain_bi = b->next_buffer, len; vlib_main_t *vm = vlib_get_main (); - u8 *data, len; + u8 *data; u16 written = 0; int rv = 0; @@ -226,7 +226,7 @@ u32 stream_session_tx_fifo_max_dequeue (transport_connection_t * tc) { stream_session_t *s = stream_session_get (tc->s_index, tc->thread_index); - if (s->session_state != SESSION_STATE_READY) + if (!s->server_tx_fifo) return 0; return svm_fifo_max_dequeue (s->server_tx_fifo); } diff --git a/src/vnet/session/session_node.c b/src/vnet/session/session_node.c index 8d703b0b..9c5b17d9 100644 --- a/src/vnet/session/session_node.c +++ b/src/vnet/session/session_node.c @@ -75,20 +75,25 @@ always_inline void session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm, u8 thread_index, svm_fifo_t * fifo, vlib_buffer_t * b0, u32 bi0, u8 n_bufs_per_seg, - u32 * left_to_snd0, u16 * n_bufs, u32 * rx_offset, - u16 deq_per_buf, u8 peek_data) + u32 left_from_seg, u32 * left_to_snd0, + u16 * n_bufs, u32 * rx_offset, u16 deq_per_buf, + u8 peek_data) { vlib_buffer_t *chain_b0, *prev_b0; - u32 chain_bi0; + u32 chain_bi0, to_deq; u16 len_to_deq0, n_bytes_read; u8 *data0, j; + b0->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + b0->total_length_not_including_first_buffer = 0; + chain_bi0 = bi0; chain_b0 = b0; + to_deq = left_from_seg; for (j = 1; j < n_bufs_per_seg; j++) { prev_b0 = chain_b0; - len_to_deq0 = clib_min (*left_to_snd0, deq_per_buf); + len_to_deq0 = clib_min (to_deq, deq_per_buf); *n_bufs -= 1; chain_bi0 = smm->tx_buffers[thread_index][*n_bufs]; @@ -117,10 +122,12 @@ session_tx_fifo_chain_tail (session_manager_main_t * smm, vlib_main_t * vm, /* update current buffer */ chain_b0->next_buffer = 0; - *left_to_snd0 -= n_bytes_read; - if (*left_to_snd0 == 0) + to_deq -= n_bytes_read; + if (to_deq == 0) break; } + ASSERT (to_deq == 0); + *left_to_snd0 -= left_from_seg; } always_inline int @@ -223,7 +230,6 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, && ((buffers_allocated + n_bufs < VLIB_FRAME_SIZE))); n_bufs += buffers_allocated; - _vec_len (smm->tx_buffers[thread_index]) = n_bufs; if (PREDICT_FALSE (n_bufs < VLIB_FRAME_SIZE)) @@ -289,11 +295,15 @@ session_tx_fifo_read_and_snd_i (vlib_main_t * vm, vlib_node_runtime_t * node, * Fill in the remaining buffers in the chain, if any */ if (PREDICT_FALSE (n_bufs_per_seg > 1)) - session_tx_fifo_chain_tail (smm, vm, thread_index, - s0->server_tx_fifo, b0, bi0, - n_bufs_per_seg, &left_to_snd0, - &n_bufs, &rx_offset, deq_per_buf, - peek_data); + { + u32 left_for_seg; + left_for_seg = clib_min (snd_mss0 - n_bytes_read, left_to_snd0); + session_tx_fifo_chain_tail (smm, vm, thread_index, + s0->server_tx_fifo, b0, bi0, + n_bufs_per_seg, left_for_seg, + &left_to_snd0, &n_bufs, &rx_offset, + deq_per_buf, peek_data); + } /* Ask transport to push header after current_length and * total_length_not_including_first_buffer are updated */ @@ -607,8 +617,9 @@ skip_dequeue: clib_warning ("It's dead, Jim!"); continue; } - - if (PREDICT_FALSE (s0->session_state == SESSION_STATE_CLOSED)) + /* Can retransmit for closed sessions but can't do anything if + * session is not ready or closed */ + if (PREDICT_FALSE (s0->session_state < SESSION_STATE_READY)) continue; /* Spray packets in per session type frames, since they go to * different nodes */ diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 8e2eb9f4..4652618b 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -288,18 +288,31 @@ tcp_connection_close (tcp_connection_t * tc) { TCP_EVT_DBG (TCP_EVT_CLOSE, tc); - /* Send FIN if needed */ - if (tc->state == TCP_STATE_ESTABLISHED - || tc->state == TCP_STATE_SYN_RCVD || tc->state == TCP_STATE_CLOSE_WAIT) - tcp_send_fin (tc); - - /* Switch state */ - if (tc->state == TCP_STATE_ESTABLISHED || tc->state == TCP_STATE_SYN_RCVD) - tc->state = TCP_STATE_FIN_WAIT_1; - else if (tc->state == TCP_STATE_SYN_SENT) - tc->state = TCP_STATE_CLOSED; - else if (tc->state == TCP_STATE_CLOSE_WAIT) - tc->state = TCP_STATE_LAST_ACK; + /* Send/Program FIN if needed and switch state */ + switch (tc->state) + { + case TCP_STATE_SYN_SENT: + tc->state = TCP_STATE_CLOSED; + break; + case TCP_STATE_SYN_RCVD: + tcp_send_fin (tc); + tc->state = TCP_STATE_FIN_WAIT_1; + break; + case TCP_STATE_ESTABLISHED: + if (!stream_session_tx_fifo_max_dequeue (&tc->connection)) + tcp_send_fin (tc); + else + tc->flags |= TCP_CONN_FINPNDG; + tc->state = TCP_STATE_FIN_WAIT_1; + break; + case TCP_STATE_CLOSE_WAIT: + tcp_send_fin (tc); + tc->state = TCP_STATE_LAST_ACK; + break; + default: + clib_warning ("shouldn't be here"); + } + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc); /* If in CLOSED and WAITCLOSE timer is not set, delete connection now */ @@ -1284,6 +1297,8 @@ tcp_main_enable (vlib_main_t * vm) vec_validate (tm->tx_frames[0], num_threads - 1); vec_validate (tm->tx_frames[1], num_threads - 1); + tm->bytes_per_buffer = vlib_buffer_free_list_buffer_size + (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); return error; } diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 997df76f..a17262fa 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -116,7 +116,8 @@ extern timer_expiration_handler tcp_timer_retransmit_syn_handler; _(RECOVERY, "Recovery on") \ _(FAST_RECOVERY, "Fast Recovery on") \ _(FR_1_SMSS, "Sent 1 SMSS") \ - _(HALF_OPEN_DONE, "Half-open completed") + _(HALF_OPEN_DONE, "Half-open completed") \ + _(FINPNDG, "FIN pending") typedef enum _tcp_connection_flag_bits { @@ -404,6 +405,9 @@ typedef struct _tcp_main /** Port allocator random number generator seed */ u32 port_allocator_seed; + + /** vlib buffer size */ + u32 bytes_per_buffer; } tcp_main_t; extern tcp_main_t tcp_main; @@ -587,6 +591,14 @@ tcp_available_snd_space (const tcp_connection_t * tc) return available_wnd - flight_size; } +always_inline u8 +tcp_is_lost_fin (tcp_connection_t * tc) +{ + if ((tc->flags & TCP_CONN_FINSNT) && tc->snd_una_max - tc->snd_una == 1) + return 1; + return 0; +} + i32 tcp_rcv_wnd_available (tcp_connection_t * tc); u32 tcp_snd_space (tcp_connection_t * tc); void tcp_update_rcv_wnd (tcp_connection_t * tc); @@ -621,8 +633,8 @@ tcp_update_time (f64 now, u32 thread_index) u32 tcp_push_header (transport_connection_t * tconn, vlib_buffer_t * b); u32 -tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, - u32 offset, u32 max_bytes); +tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, + u32 max_bytes, vlib_buffer_t ** b); void tcp_connection_timers_init (tcp_connection_t * tc); void tcp_connection_timers_reset (tcp_connection_t * tc); diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 29f4f08d..a3b48d83 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -492,14 +492,6 @@ tcp_ack_is_dupack (tcp_connection_t * tc, vlib_buffer_t * b, u32 prev_snd_wnd, && (prev_snd_wnd == tc->snd_wnd)); } -static u8 -tcp_is_lost_fin (tcp_connection_t * tc) -{ - if ((tc->flags & TCP_CONN_FINSNT) && tc->snd_una_max - tc->snd_una == 1) - return 1; - return 0; -} - /** * Checks if ack is a congestion control event. */ @@ -1162,7 +1154,8 @@ partial_ack: /* Remove retransmitted bytes that have been delivered */ ASSERT (tc->bytes_acked + tc->sack_sb.snd_una_adv - >= tc->sack_sb.last_bytes_delivered); + >= tc->sack_sb.last_bytes_delivered + || (tc->flags & TCP_CONN_FINSNT)); if (seq_lt (tc->snd_una, tc->sack_sb.high_rxt)) { @@ -1273,6 +1266,8 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, if (tcp_ack_is_cc_event (tc, b, prev_snd_wnd, prev_snd_una, &is_dack)) { tcp_cc_handle_event (tc, is_dack); + if (!tcp_in_cong_recovery (tc)) + return 0; *error = TCP_ERROR_ACK_DUP; TCP_EVT_DBG (TCP_EVT_DUPACK_RCVD, tc, 1); return vnet_buffer (b)->tcp.data_len ? 0 : -1; @@ -1497,6 +1492,29 @@ tcp_can_delack (tcp_connection_t * tc) return 1; } +static int +tcp_buffer_discard_bytes (vlib_buffer_t * b, u32 n_bytes_to_drop) +{ + u32 discard; + vlib_main_t *vm = vlib_get_main (); + + /* Handle multi segment packets */ + if (n_bytes_to_drop > b->current_length) + { + if (!(b->flags & VLIB_BUFFER_NEXT_PRESENT)) + return -1; + do + { + discard = clib_min (n_bytes_to_drop, b->current_length); + vlib_buffer_advance (b, discard); + b = vlib_get_buffer (vm, b->next_buffer); + n_bytes_to_drop -= discard; + } + while (n_bytes_to_drop); + } + return 0; +} + static int tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, u32 * next0) @@ -1530,7 +1548,8 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number; n_data_bytes -= n_bytes_to_drop; vnet_buffer (b)->tcp.seq_number = tc->rcv_nxt; - vlib_buffer_advance (b, n_bytes_to_drop); + if (tcp_buffer_discard_bytes (b, n_bytes_to_drop)) + goto done; goto in_order; } @@ -2252,8 +2271,15 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (tcp_rcv_ack (tc0, b0, tcp0, &next0, &error0)) goto drop; + /* Still have to send the FIN */ + if (tc0->flags & TCP_CONN_FINPNDG) + { + /* TX fifo finally drained */ + if (!stream_session_tx_fifo_max_dequeue (&tc0->connection)) + tcp_send_fin (tc0); + } /* If FIN is ACKed */ - if (tc0->snd_una == tc0->snd_una_max) + else if (tc0->snd_una == tc0->snd_una_max) { ASSERT (tcp_fin (tcp0)); tc0->rcv_nxt += 1; diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index f8fbb8a9..4c1add21 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -15,6 +15,7 @@ #include #include +#include vlib_node_registration_t tcp4_output_node; vlib_node_registration_t tcp6_output_node; @@ -84,7 +85,7 @@ void tcp_update_rcv_mss (tcp_connection_t * tc) { /* TODO find our iface MTU */ - tc->mss = dummy_mtu; + tc->mss = dummy_mtu - sizeof (tcp_header_t); } /** @@ -436,28 +437,35 @@ tcp_init_mss (tcp_connection_t * tc) tc->snd_mss -= TCP_OPTION_LEN_TIMESTAMP; } +always_inline int +tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u32 n_free_buffers) +{ + vec_validate (tm->tx_buffers[thread_index], n_free_buffers - 1); + _vec_len (tm->tx_buffers[thread_index]) = + vlib_buffer_alloc_from_free_list (vlib_get_main (), + tm->tx_buffers[thread_index], + n_free_buffers, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + /* buffer shortage, report failure */ + if (vec_len (tm->tx_buffers[thread_index]) == 0) + { + clib_warning ("out of buffers"); + return -1; + } + return 0; +} + always_inline int tcp_get_free_buffer_index (tcp_main_t * tm, u32 * bidx) { - u32 *my_tx_buffers, n_free_buffers; + u32 *my_tx_buffers; u32 thread_index = vlib_get_thread_index (); - my_tx_buffers = tm->tx_buffers[thread_index]; - if (PREDICT_FALSE (vec_len (my_tx_buffers) == 0)) + if (PREDICT_FALSE (vec_len (tm->tx_buffers[thread_index]) == 0)) { - n_free_buffers = VLIB_FRAME_SIZE; - vec_validate (my_tx_buffers, n_free_buffers - 1); - _vec_len (my_tx_buffers) = - vlib_buffer_alloc_from_free_list (vlib_get_main (), my_tx_buffers, - n_free_buffers, - VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); - /* buffer shortage, report failure */ - if (vec_len (my_tx_buffers) == 0) - { - clib_warning ("out of buffers"); - return -1; - } - tm->tx_buffers[thread_index] = my_tx_buffers; + if (tcp_alloc_tx_buffers (tm, thread_index, VLIB_FRAME_SIZE)) + return -1; } + my_tx_buffers = tm->tx_buffers[thread_index]; *bidx = my_tx_buffers[_vec_len (my_tx_buffers) - 1]; _vec_len (my_tx_buffers) -= 1; return 0; @@ -476,6 +484,7 @@ always_inline void tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) { vlib_buffer_t *it = b; + u32 save_free_list = b->flags & VLIB_BUFFER_FREE_LIST_INDEX_MASK; do { it->current_data = 0; @@ -485,6 +494,10 @@ tcp_reuse_buffer (vlib_main_t * vm, vlib_buffer_t * b) while ((it->flags & VLIB_BUFFER_NEXT_PRESENT) && (it = vlib_get_buffer (vm, it->next_buffer))); + if (b->flags & VLIB_BUFFER_NEXT_PRESENT) + vlib_buffer_free_one (vm, b->next_buffer); + b->flags = save_free_list; + /* Leave enough space for headers */ vlib_buffer_make_headroom (b, MAX_HDRS_LEN); vnet_buffer (b)->tcp.flags = 0; @@ -959,18 +972,16 @@ tcp_send_fin (tcp_connection_t * tc) return; b = vlib_get_buffer (vm, bi); - /* Leave enough space for headers */ - vlib_buffer_make_headroom (b, MAX_HDRS_LEN); - tcp_make_fin (tc, b); tcp_enqueue_to_output_now (vm, b, bi, tc->c_is_ip4); tc->flags |= TCP_CONN_FINSNT; + tc->flags &= ~TCP_CONN_FINPNDG; tcp_retransmit_timer_force_update (tc); TCP_EVT_DBG (TCP_EVT_FIN_SENT, tc); } always_inline u8 -tcp_make_state_flags (tcp_state_t next_state) +tcp_make_state_flags (tcp_connection_t * tc, tcp_state_t next_state) { switch (next_state) { @@ -982,7 +993,10 @@ tcp_make_state_flags (tcp_state_t next_state) return TCP_FLAG_SYN; case TCP_STATE_LAST_ACK: case TCP_STATE_FIN_WAIT_1: - return TCP_FLAG_FIN; + if (tc->snd_nxt + 1 < tc->snd_una_max) + return TCP_FLAG_ACK; + else + return TCP_FLAG_FIN; default: clib_warning ("Shouldn't be here!"); } @@ -1008,7 +1022,7 @@ tcp_push_hdr_i (tcp_connection_t * tc, vlib_buffer_t * b, tcp_hdr_opts_len = tc->snd_opts_len + sizeof (tcp_header_t); advertise_wnd = tcp_window_to_advertise (tc, next_state); - flags = tcp_make_state_flags (next_state); + flags = tcp_make_state_flags (tc, next_state); /* Push header and options */ th = vlib_buffer_push_tcp (b, tc->c_lcl_port, tc->c_rmt_port, tc->snd_nxt, @@ -1055,7 +1069,11 @@ tcp_send_ack (tcp_connection_t * tc) tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); } -/* Send delayed ACK when timer expires */ +/** + * Delayed ack timer handler + * + * Sends delayed ACK when timer expires + */ void tcp_timer_delack_handler (u32 index) { @@ -1067,49 +1085,138 @@ tcp_timer_delack_handler (u32 index) tcp_send_ack (tc); } -/** Build a retransmit segment +/** + * Build a retransmit segment * * @return the number of bytes in the segment or 0 if there's nothing to * retransmit */ u32 -tcp_prepare_retransmit_segment (tcp_connection_t * tc, vlib_buffer_t * b, - u32 offset, u32 max_bytes) +tcp_prepare_retransmit_segment (tcp_connection_t * tc, u32 offset, + u32 max_deq_bytes, vlib_buffer_t ** b) { + tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); int n_bytes = 0; - u32 start; - - tcp_reuse_buffer (vm, b); + u32 start, bi, available_bytes; ASSERT (tc->state >= TCP_STATE_ESTABLISHED); - ASSERT (max_bytes != 0); + ASSERT (max_deq_bytes != 0); - max_bytes = clib_min (tc->snd_mss, max_bytes); + /* + * Make sure we can retransmit something + */ + max_deq_bytes = clib_min (tc->snd_mss, max_deq_bytes); + available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection); + if (!available_bytes) + return 0; + max_deq_bytes = clib_min (available_bytes, max_deq_bytes); start = tc->snd_una + offset; /* Start is beyond snd_congestion */ if (seq_geq (start, tc->snd_congestion)) - goto done; + { + goto done; + } /* Don't overshoot snd_congestion */ - if (seq_gt (start + max_bytes, tc->snd_congestion)) + if (seq_gt (start + max_deq_bytes, tc->snd_congestion)) { - max_bytes = tc->snd_congestion - start; - if (max_bytes == 0) - goto done; + max_deq_bytes = tc->snd_congestion - start; + if (max_deq_bytes == 0) + { + goto done; + } } + /* + * Prepare options + */ tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); - ASSERT (max_bytes <= tc->snd_mss); + /* + * Allocate and fill in buffer(s) + */ + + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return 0; + *b = vlib_get_buffer (vm, bi); + + /* Easy case, buffer size greater than mss */ + if (PREDICT_TRUE (max_deq_bytes <= tm->bytes_per_buffer)) + { + n_bytes = stream_session_peek_bytes (&tc->connection, + vlib_buffer_get_current (*b), + offset, max_deq_bytes); + ASSERT (n_bytes == max_deq_bytes); + b[0]->current_length = n_bytes; + tcp_push_hdr_i (tc, *b, tc->state, 0); + } + /* Split mss into multiple buffers */ + else + { + u32 chain_bi = ~0, n_bufs_per_seg; + u32 thread_index = vlib_get_thread_index (); + u16 n_peeked, len_to_deq, available_bufs; + vlib_buffer_t *chain_b, *prev_b; + u8 *data0; + int i; + + n_bufs_per_seg = ceil ((double) max_deq_bytes / tm->bytes_per_buffer); + ASSERT (available_bytes >= max_deq_bytes); + + /* Make sure we have enough buffers */ + available_bufs = vec_len (tm->tx_buffers[thread_index]); + if (n_bufs_per_seg > available_bufs) + { + if (tcp_alloc_tx_buffers (tm, thread_index, + VLIB_FRAME_SIZE - available_bufs)) + { + tcp_return_buffer (tm); + return 0; + } + } + + n_bytes = stream_session_peek_bytes (&tc->connection, + vlib_buffer_get_current (*b), + offset, tm->bytes_per_buffer); + b[0]->current_length = n_bytes; + b[0]->flags |= VLIB_BUFFER_TOTAL_LENGTH_VALID; + b[0]->total_length_not_including_first_buffer = 0; + + tcp_push_hdr_i (tc, *b, tc->state, 0); + max_deq_bytes -= n_bytes; + + chain_b = *b; + for (i = 1; i < n_bufs_per_seg; i++) + { + prev_b = chain_b; + len_to_deq = clib_min (max_deq_bytes, tm->bytes_per_buffer); + tcp_get_free_buffer_index (tm, &chain_bi); + ASSERT (chain_bi != (u32) ~ 0); + chain_b = vlib_get_buffer (vm, chain_bi); + chain_b->current_data = 0; + data0 = vlib_buffer_get_current (chain_b); + n_peeked = stream_session_peek_bytes (&tc->connection, data0, + n_bytes, len_to_deq); + n_bytes += n_peeked; + ASSERT (n_peeked == len_to_deq); + chain_b->current_length = n_peeked; + b[0]->total_length_not_including_first_buffer += + chain_b->current_length; + + /* update previous buffer */ + prev_b->next_buffer = chain_bi; + prev_b->flags |= VLIB_BUFFER_NEXT_PRESENT; + + /* update current buffer */ + chain_b->next_buffer = 0; + + max_deq_bytes -= n_peeked; + } + } - n_bytes = stream_session_peek_bytes (&tc->connection, - vlib_buffer_get_current (b), offset, - max_bytes); ASSERT (n_bytes > 0); - b->current_length = n_bytes; - tcp_push_hdr_i (tc, b, tc->state, 0); if (tcp_in_fastrecovery (tc)) tc->snd_rxt_bytes += n_bytes; @@ -1147,7 +1254,7 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) vlib_main_t *vm = vlib_get_main (); u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; - vlib_buffer_t *b; + vlib_buffer_t *b = 0; u32 bi, n_bytes; if (is_syn) @@ -1174,17 +1281,11 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) /* Go back to first un-acked byte */ tc->snd_nxt = tc->snd_una; - if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) - return; - - b = vlib_get_buffer (vm, bi); - if (tc->state >= TCP_STATE_ESTABLISHED) { /* Lost FIN, retransmit and return */ - if (tc->flags & TCP_CONN_FINSNT) + if (tcp_is_lost_fin (tc)) { - tcp_return_buffer (tm); tcp_send_fin (tc); return; } @@ -1199,7 +1300,9 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 1); /* Send one segment */ - n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); + n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b); + ASSERT (n_bytes); + bi = vlib_get_buffer_index (vm, b); /* TODO be less aggressive about this */ scoreboard_clear (&tc->sack_sb); @@ -1212,7 +1315,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) tcp_retransmit_timer_set (tc); ASSERT (0 || (tc->rto_boff > 1 && tc->snd_una == tc->snd_congestion)); - tcp_return_buffer (tm); return; } @@ -1234,7 +1336,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) clib_warning ("could not remove half-open connection"); ASSERT (0); } - tcp_return_buffer (tm); return; } @@ -1243,6 +1344,9 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) if (tc->rto_boff > TCP_RTO_SYN_RETRIES) tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); + if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) + return; + b = vlib_get_buffer (vm, bi); vlib_buffer_make_headroom (b, MAX_HDRS_LEN); tcp_push_hdr_i (tc, b, tc->state, 1); @@ -1256,7 +1360,6 @@ tcp_timer_retransmit_handler_i (u32 index, u8 is_syn) { ASSERT (tc->state == TCP_STATE_CLOSED); clib_warning ("connection closed ..."); - tcp_return_buffer (tm); return; } @@ -1305,7 +1408,7 @@ tcp_timer_persist_handler (u32 index) u32 thread_index = vlib_get_thread_index (); tcp_connection_t *tc; vlib_buffer_t *b; - u32 bi, old_snd_nxt; + u32 bi, old_snd_nxt, snd_bytes = 0, available_bytes = 0; int n_bytes = 0; tc = tcp_connection_get_if_valid (index, thread_index); @@ -1317,34 +1420,31 @@ tcp_timer_persist_handler (u32 index) tc->timers[TCP_TIMER_PERSIST] = TCP_TIMER_HANDLE_INVALID; /* Problem already solved or worse */ + available_bytes = stream_session_tx_fifo_max_dequeue (&tc->connection); if (tc->state == TCP_STATE_CLOSED || tc->state > TCP_STATE_ESTABLISHED - || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc)) + || tc->snd_wnd > tc->snd_mss || tcp_in_recovery (tc) + || !available_bytes) return; /* Increment RTO backoff */ tc->rto_boff += 1; tc->rto = clib_min (tc->rto << 1, TCP_RTO_MAX); - /* Try to force the first unsent segment */ + /* + * Try to force the first unsent segment (or buffer) + */ if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) return; - b = vlib_get_buffer (vm, bi); tcp_validate_txf_size (tc, tc->snd_una_max - tc->snd_una); tc->snd_opts_len = tcp_make_options (tc, &tc->snd_opts, tc->state); + snd_bytes = clib_min (tc->snd_mss, tm->bytes_per_buffer); n_bytes = stream_session_peek_bytes (&tc->connection, vlib_buffer_get_current (b), tc->snd_una_max - tc->snd_una, - tc->snd_mss); - /* Nothing to send */ - if (n_bytes <= 0) - { - // clib_warning ("persist found nothing to send"); - tcp_return_buffer (tm); - return; - } - + snd_bytes); + ASSERT (n_bytes != 0); b->current_length = n_bytes; ASSERT (tc->snd_nxt == tc->snd_una_max || tc->rto_boff > 1 || tcp_timer_is_active (tc, TCP_TIMER_RETRANSMIT)); @@ -1365,32 +1465,20 @@ tcp_timer_persist_handler (u32 index) void tcp_retransmit_first_unacked (tcp_connection_t * tc) { - tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); vlib_buffer_t *b; - u32 bi, n_bytes, old_snd_nxt; + u32 bi, old_snd_nxt, n_bytes; old_snd_nxt = tc->snd_nxt; tc->snd_nxt = tc->snd_una; - /* Get buffer */ - if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) - return; - - b = vlib_get_buffer (vm, bi); - TCP_EVT_DBG (TCP_EVT_CC_EVT, tc, 2); - - n_bytes = tcp_prepare_retransmit_segment (tc, b, 0, tc->snd_mss); - if (n_bytes == 0) - { - tcp_return_buffer (tm); - goto done; - } - + n_bytes = tcp_prepare_retransmit_segment (tc, 0, tc->snd_mss, &b); + if (!n_bytes) + return; + bi = vlib_get_buffer_index (vm, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); -done: tc->snd_nxt = old_snd_nxt; } @@ -1400,10 +1488,9 @@ done: void tcp_fast_retransmit_sack (tcp_connection_t * tc) { - tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); u32 n_written = 0, offset = 0, max_bytes; - vlib_buffer_t *b; + vlib_buffer_t *b = 0; sack_scoreboard_hole_t *hole; sack_scoreboard_t *sb; u32 bi, old_snd_nxt; @@ -1420,10 +1507,6 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) hole = scoreboard_get_hole (sb, sb->cur_rxt_hole); while (hole && snd_space > 0) { - if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) - return; - - b = vlib_get_buffer (vm, bi); hole = scoreboard_next_rxt_hole (sb, hole, tcp_fastrecovery_sent_1_smss (tc), &can_rescue, &snd_limited); @@ -1443,7 +1526,10 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) offset = tc->snd_congestion - tc->snd_una - max_bytes; sb->rescue_rxt = tc->snd_congestion; tc->snd_nxt = tc->snd_una + offset; - tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes, + &b); + ASSERT (n_written); + bi = vlib_get_buffer_index (vm, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); break; } @@ -1451,15 +1537,13 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) max_bytes = snd_limited ? tc->snd_mss : hole->end - sb->high_rxt; offset = sb->high_rxt - tc->snd_una; tc->snd_nxt = tc->snd_una + offset; - n_written = tcp_prepare_retransmit_segment (tc, b, offset, max_bytes); + n_written = tcp_prepare_retransmit_segment (tc, offset, max_bytes, &b); /* Nothing left to retransmit */ if (n_written == 0) - { - tcp_return_buffer (tm); - break; - } + break; + bi = vlib_get_buffer_index (vm, b); sb->high_rxt += n_written; tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); snd_space -= n_written; @@ -1475,7 +1559,6 @@ tcp_fast_retransmit_sack (tcp_connection_t * tc) void tcp_fast_retransmit_no_sack (tcp_connection_t * tc) { - tcp_main_t *tm = vnet_get_tcp_main (); vlib_main_t *vm = vlib_get_main (); u32 n_written = 0, offset = 0, bi, old_snd_nxt; int snd_space; @@ -1491,19 +1574,14 @@ tcp_fast_retransmit_no_sack (tcp_connection_t * tc) while (snd_space > 0) { - if (PREDICT_FALSE (tcp_get_free_buffer_index (tm, &bi))) - return; - b = vlib_get_buffer (vm, bi); offset += n_written; - n_written = tcp_prepare_retransmit_segment (tc, b, offset, snd_space); + n_written = tcp_prepare_retransmit_segment (tc, offset, snd_space, &b); /* Nothing left to retransmit */ if (n_written == 0) - { - tcp_return_buffer (tm); - break; - } + break; + bi = vlib_get_buffer_index (vm, b); tcp_enqueue_to_output (vm, b, bi, tc->c_is_ip4); snd_space -= n_written; } -- cgit 1.2.3-korg From 57d963f88b2c99e698e2b29f72e190f47f41b1ad Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 20 Jul 2017 19:17:06 +0200 Subject: Make VPP runtime directory configurable New startup config command: unix { runtime-dir /run/vpp } Also, adds recursive mkdir funtion for use in deifferent places like cli-config socket path and dpdk hugepage directory path. Change-Id: I1446ceab9c220c25804e73a743a3ebb383450124 Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/init.c | 29 +++++++++++++---------------- src/plugins/memif/memif.c | 14 +++++++++----- src/plugins/memif/private.h | 1 - src/vlib/unix/cli.c | 20 ++++++++++++-------- src/vlib/unix/main.c | 13 ++++++++++++- src/vlib/unix/unix.h | 14 ++++++++++---- src/vlib/unix/util.c | 36 +++++++++++++++++++++++++++++------- src/vpp/vnet/main.c | 5 +++++ 8 files changed, 90 insertions(+), 42 deletions(-) (limited to 'src/vlib') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index c6c9ee34..6f7e168b 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -37,8 +37,6 @@ dpdk_main_t dpdk_main; #define LINK_STATE_ELOGS 0 -#define DEFAULT_HUGE_DIR (VPP_RUN_DIR "/hugepages") - /* Port configuration, mildly modified Intel app values */ static struct rte_eth_conf port_conf_template = { @@ -835,6 +833,10 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) u8 huge_dir = 0; u8 file_prefix = 0; u8 *socket_mem = 0; + u8 *huge_dir_path = 0; + + huge_dir_path = + format (0, "%s/hugepages%c", vlib_unix_get_runtime_dir (), 0); conf->device_config_index_by_pci_addr = hash_create (0, sizeof (uword)); log_level = RTE_LOG_NOTICE; @@ -980,7 +982,7 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) u8 less_than_1g = 1; int rv; - umount (DEFAULT_HUGE_DIR); + umount ((char *) huge_dir_path); /* Process "socket-mem" parameter value */ if (vec_len (socket_mem)) @@ -1057,27 +1059,20 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) vec_free (mem_by_socket); - /* Make sure VPP_RUN_DIR exists */ - error = unix_make_vpp_run_dir (); + error = vlib_unix_recursive_mkdir ((char *) huge_dir_path); if (error) - goto done; - - rv = mkdir (DEFAULT_HUGE_DIR, 0755); - if (rv && errno != EEXIST) { - error = clib_error_return (0, "mkdir '%s' failed errno %d", - DEFAULT_HUGE_DIR, errno); goto done; } if (use_1g && !(less_than_1g && use_2m)) { - rv = - mount ("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, "pagesize=1G"); + rv = mount ("none", (char *) huge_dir_path, "hugetlbfs", 0, + "pagesize=1G"); } else if (use_2m) { - rv = mount ("none", DEFAULT_HUGE_DIR, "hugetlbfs", 0, NULL); + rv = mount ("none", (char *) huge_dir_path, "hugetlbfs", 0, NULL); } else { @@ -1092,7 +1087,7 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) tmp = format (0, "--huge-dir%c", 0); vec_add1 (conf->eal_init_args, tmp); - tmp = format (0, "%s%c", DEFAULT_HUGE_DIR, 0); + tmp = format (0, "%s%c", huge_dir_path, 0); vec_add1 (conf->eal_init_args, tmp); if (!file_prefix) { @@ -1209,7 +1204,9 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) (char **) conf->eal_init_args); /* lazy umount hugepages */ - umount2 (DEFAULT_HUGE_DIR, MNT_DETACH); + umount2 ((char *) huge_dir_path, MNT_DETACH); + rmdir ((char *) huge_dir_path); + vec_free (huge_dir_path); if (ret < 0) return clib_error_return (0, "rte_eal_init returned %d", ret); diff --git a/src/plugins/memif/memif.c b/src/plugins/memif/memif.c index ba123149..af81faf2 100644 --- a/src/plugins/memif/memif.c +++ b/src/plugins/memif/memif.c @@ -556,15 +556,19 @@ memif_create_if (vlib_main_t * vm, memif_create_if_args_t * args) if (args->socket_filename == 0 || args->socket_filename[0] != '/') { - rv = mkdir (MEMIF_DEFAULT_SOCKET_DIR, 0755); - if (rv && errno != EEXIST) - return VNET_API_ERROR_SYSCALL_ERROR_1; + clib_error_t *error; + error = vlib_unix_recursive_mkdir (vlib_unix_get_runtime_dir ()); + if (error) + { + clib_error_free (error); + return VNET_API_ERROR_SYSCALL_ERROR_1; + } if (args->socket_filename == 0) - socket_filename = format (0, "%s/%s%c", MEMIF_DEFAULT_SOCKET_DIR, + socket_filename = format (0, "%s/%s%c", vlib_unix_get_runtime_dir (), MEMIF_DEFAULT_SOCKET_FILENAME, 0); else - socket_filename = format (0, "%s/%s%c", MEMIF_DEFAULT_SOCKET_DIR, + socket_filename = format (0, "%s/%s%c", vlib_unix_get_runtime_dir (), args->socket_filename, 0); } diff --git a/src/plugins/memif/private.h b/src/plugins/memif/private.h index 0f82f1e9..985ac5ec 100644 --- a/src/plugins/memif/private.h +++ b/src/plugins/memif/private.h @@ -17,7 +17,6 @@ #include -#define MEMIF_DEFAULT_SOCKET_DIR "/run/vpp" #define MEMIF_DEFAULT_SOCKET_FILENAME "memif.sock" #define MEMIF_DEFAULT_RING_SIZE 1024 #define MEMIF_DEFAULT_RX_QUEUES 1 diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c index 1befa25d..068a4e16 100644 --- a/src/vlib/unix/cli.c +++ b/src/vlib/unix/cli.c @@ -2642,15 +2642,19 @@ unix_cli_config (vlib_main_t * vm, unformat_input_t * input) /* CLI listen. */ unix_file_t template = { 0 }; - /* If our listen address looks like a path and it starts with - * VPP_RUN_DIR, go make sure VPP_RUN_DIR exists before trying to open - * a socket in it. - */ - if (strncmp (s->config, VPP_RUN_DIR "/", strlen (VPP_RUN_DIR) + 1) == 0) + /* mkdir of file socketu, only under /run */ + if (strncmp (s->config, "/run", 4) == 0) { - error = unix_make_vpp_run_dir (); - if (error) - return error; + u8 *tmp = format (0, "%s", s->config); + int i = vec_len (tmp); + while (i && tmp[--i] != '/') + ; + + tmp[i] = 0; + + if (i) + vlib_unix_recursive_mkdir ((char *) tmp); + vec_free (tmp); } s->flags = SOCKET_IS_SERVER | /* listen, don't connect */ diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index ad1a7c3c..cb34a89f 100644 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -56,6 +56,8 @@ /** Default CLI history depth if not configured in startup.conf */ #define UNIX_CLI_DEFAULT_HISTORY 50 +char *vlib_default_runtime_dir __attribute__ ((weak)); +char *vlib_default_runtime_dir = "/run/vlib"; unix_main_t unix_main; @@ -332,6 +334,8 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) else if (unformat (input, "cli-listen %s", &um->cli_listen_socket.config)) ; + else if (unformat (input, "runtime-dir %s", &um->runtime_dir)) + ; else if (unformat (input, "cli-line-mode")) um->cli_line_mode = 1; else if (unformat (input, "cli-no-banner")) @@ -432,6 +436,9 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) } um->unix_config_complete = 1; + if (um->runtime_dir == 0) + um->runtime_dir = format (0, "%s%c", vlib_default_runtime_dir, 0); + return 0; } @@ -463,6 +470,10 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) * Ask the Linux kernel to dump all memory-mapped address regions, instead * of just text+data+bss. * + * @cfgcmd{runtime-dir} + * Define directory where VPP is going to store all runtime files. + * Default is /run/vpp. + * * @cfgcmd{cli-listen, <address:port>} * Bind the CLI to listen at the address and port given. @clocalhost * on TCP port @c 5002, given as cli-listen localhost:5002, @@ -489,7 +500,7 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) * Limit pager buffer to @c nn lines of output. * A value of @c 0 disables the pager. Default value: @c 100000 ?*/ -VLIB_CONFIG_FUNCTION (unix_config, "unix"); +VLIB_EARLY_CONFIG_FUNCTION (unix_config, "unix"); static clib_error_t * unix_exit (vlib_main_t * vm) diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h index ffa92bba..ee1312e3 100644 --- a/src/vlib/unix/unix.h +++ b/src/vlib/unix/unix.h @@ -44,9 +44,6 @@ #include -/** VPP runtime ephemeral directory. Typically stored in a tmpfs. */ -#define VPP_RUN_DIR "/run/vpp" - struct unix_file; typedef clib_error_t *(unix_file_function_t) (struct unix_file * f); @@ -106,6 +103,9 @@ typedef struct /* startup-config filename */ u8 *startup_config_filename; + /* runtime directory path */ + u8 *runtime_dir; + /* unix config complete */ volatile int unix_config_complete; @@ -214,6 +214,12 @@ vlib_unix_get_main (void) return &unix_main; } +static inline char * +vlib_unix_get_runtime_dir (void) +{ + return (char *) unix_main.runtime_dir; +} + /* thread stack array; vec_len = max number of threads */ extern u8 **vlib_thread_stacks; @@ -233,7 +239,7 @@ clib_error_t *foreach_directory_file (char *dir_name, u8 * file_name), void *arg, int scan_dirs); -clib_error_t *unix_make_vpp_run_dir (void); +clib_error_t *vlib_unix_recursive_mkdir (char *path); #endif /* included_unix_unix_h */ diff --git a/src/vlib/unix/util.c b/src/vlib/unix/util.c index 51b4a4ed..93aeb99c 100644 --- a/src/vlib/unix/util.c +++ b/src/vlib/unix/util.c @@ -223,16 +223,38 @@ done: } clib_error_t * -unix_make_vpp_run_dir (void) +vlib_unix_recursive_mkdir (char *path) { - int rv; + clib_error_t *error = 0; + char *c = 0; + int i = 0; - rv = mkdir (VPP_RUN_DIR, 0755); - if (rv && errno != EEXIST) - return clib_error_return (0, "mkdir '%s' failed errno %d", - VPP_RUN_DIR, errno); + while (path[i] != 0) + { + if (c && path[i] == '/') + { + vec_add1 (c, 0); + if ((mkdir (c, 0755)) && (errno != EEXIST)) + { + error = clib_error_return_unix (0, "mkdir '%s'", c); + goto done; + } + _vec_len (c)--; + } + vec_add1 (c, path[i]); + i++; + } - return 0; + if ((mkdir (path, 0755)) && (errno != EEXIST)) + { + error = clib_error_return_unix (0, "mkdir '%s'", path); + goto done; + } + +done: + vec_free (c); + + return error; } /* diff --git a/src/vpp/vnet/main.c b/src/vpp/vnet/main.c index ade32aa1..9fe65fe2 100644 --- a/src/vpp/vnet/main.c +++ b/src/vpp/vnet/main.c @@ -41,6 +41,11 @@ vpe_main_init (vlib_main_t * vm) vat_plugin_hash_create (); } +/* + * Default path for runtime data + */ +char *vlib_default_runtime_dir = "/run/vpp"; + /* * Load plugins from /usr/lib/vpp_plugins by default */ -- cgit 1.2.3-korg From d84ba85c0071a28fe888c912c3dc37f471b0caeb Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Tue, 22 Aug 2017 17:56:46 -0400 Subject: TCP horizontal scaling - Remove frame handoff support machinery. We haven't used it in a long time. - Configuration support for the local endpoints bihash table - Drop lookup failure packets in tcp46_syn_sent Change-Id: Icd51e6785f74661c741e76fac23d21c4cc998d17 Signed-off-by: Dave Barach --- src/vlib/main.c | 9 +-------- src/vlib/node.h | 5 +---- src/vlib/node_funcs.h | 15 ++++++--------- src/vnet/tcp/tcp.c | 23 +++++++++++++++++++++-- src/vnet/tcp/tcp.h | 4 ++++ src/vnet/tcp/tcp_input.c | 6 +++++- 6 files changed, 38 insertions(+), 24 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/main.c b/src/vlib/main.c index 73548fbe..5d99e899 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -138,19 +138,12 @@ vlib_frame_alloc_to_node (vlib_main_t * vm, u32 to_node_index, else { f = clib_mem_alloc_aligned_no_fail (n, VLIB_FRAME_ALIGN); - f->thread_index = vm->thread_index; fi = vlib_frame_index_no_check (vm, f); } /* Poison frame when debugging. */ if (CLIB_DEBUG > 0) - { - u32 save_thread_index = f->thread_index; - - memset (f, 0xfe, n); - - f->thread_index = save_thread_index; - } + memset (f, 0xfe, n); /* Insert magic number. */ { diff --git a/src/vlib/node.h b/src/vlib/node.h index 77914272..2acd61ce 100644 --- a/src/vlib/node.h +++ b/src/vlib/node.h @@ -326,7 +326,7 @@ typedef struct vlib_node_t /* Max number of vector elements to process at once per node. */ #define VLIB_FRAME_SIZE 256 -#define VLIB_FRAME_ALIGN VLIB_MAX_CPUS +#define VLIB_FRAME_ALIGN CLIB_CACHE_LINE_BYTES /* Calling frame (think stack frame) for a node. */ typedef struct vlib_frame_t @@ -343,9 +343,6 @@ typedef struct vlib_frame_t /* Number of vector elements currently in frame. */ u16 n_vectors; - /* Owner thread / heap id */ - u16 thread_index; - /* Scalar and vector arguments to next node. */ u8 arguments[0]; } vlib_frame_t; diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index c4c06454..0059b9be 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -216,24 +216,21 @@ always_inline vlib_frame_t * vlib_get_frame_no_check (vlib_main_t * vm, uword frame_index) { vlib_frame_t *f; - u32 thread_index = frame_index & VLIB_CPU_MASK; - u32 offset = frame_index & VLIB_OFFSET_MASK; - vm = vlib_mains[thread_index]; - f = vm->heap_base + offset; + f = vm->heap_base + (frame_index * VLIB_FRAME_ALIGN); return f; } always_inline u32 vlib_frame_index_no_check (vlib_main_t * vm, vlib_frame_t * f) { - u32 i; + uword i; - ASSERT (((uword) f & VLIB_CPU_MASK) == 0); - - vm = vlib_mains[f->thread_index]; + ASSERT (((uword) f & (VLIB_FRAME_ALIGN - 1)) == 0); i = ((u8 *) f - (u8 *) vm->heap_base); - return i | f->thread_index; + ASSERT ((i / VLIB_FRAME_ALIGN) <= 0xFFFFFFFFULL); + + return i / VLIB_FRAME_ALIGN; } always_inline vlib_frame_t * diff --git a/src/vnet/tcp/tcp.c b/src/vnet/tcp/tcp.c index 197fff96..6b2b4759 100644 --- a/src/vnet/tcp/tcp.c +++ b/src/vnet/tcp/tcp.c @@ -1321,9 +1321,14 @@ tcp_main_enable (vlib_main_t * vm) tm->tstamp_ticks_per_clock = vm->clib_time.seconds_per_clock / TCP_TSTAMP_RESOLUTION; + if (tm->local_endpoints_table_buckets == 0) + tm->local_endpoints_table_buckets = 250000; + if (tm->local_endpoints_table_memory == 0) + tm->local_endpoints_table_memory = 512 << 20; + clib_bihash_init_24_8 (&tm->local_endpoints_table, "local endpoint table", - 1000000 /* $$$$ config parameter nbuckets */ , - (512 << 20) /*$$$ config parameter table size */ ); + tm->local_endpoints_table_buckets, + tm->local_endpoints_table_memory); /* Initialize [port-allocator] random number seed */ tm->port_allocator_seed = (u32) clib_cpu_time_now (); @@ -1377,6 +1382,7 @@ static clib_error_t * tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) { tcp_main_t *tm = vnet_get_tcp_main (); + u64 tmp; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { @@ -1387,6 +1393,19 @@ tcp_config_fn (vlib_main_t * vm, unformat_input_t * input) else if (unformat (input, "preallocated-half-open-connections %d", &tm->preallocated_half_open_connections)) ; + else if (unformat (input, "local-endpoints-table-memory %U", + unformat_memory_size, &tmp)) + { + if (tmp >= 0x100000000) + return clib_error_return (0, "memory size %llx (%lld) too large", + tmp, tmp); + tm->local_endpoints_table_memory = tmp; + } + else if (unformat (input, "local-endpoints-table-buckets %d", + &tm->local_endpoints_table_buckets)) + ; + + else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); diff --git a/src/vnet/tcp/tcp.h b/src/vnet/tcp/tcp.h index 9e4660b8..11d61f5d 100644 --- a/src/vnet/tcp/tcp.h +++ b/src/vnet/tcp/tcp.h @@ -399,6 +399,10 @@ typedef struct _tcp_main u32 preallocated_connections; u32 preallocated_half_open_connections; + /** Transport table (preallocation) size parameters */ + u32 local_endpoints_table_memory; + u32 local_endpoints_table_buckets; + /** Vectors of src addresses. Optional unless one needs > 63K active-opens */ ip4_address_t *ip4_src_addresses; u32 last_v4_address_rotor; diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index 95f9ade1..66e2b88f 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -1882,7 +1882,11 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tc0 = tcp_half_open_connection_get (vnet_buffer (b0)-> tcp.connection_index); - ASSERT (tc0); + if (PREDICT_FALSE (tc0 == 0)) + { + error0 = TCP_ERROR_INVALID_CONNECTION; + goto drop; + } ack0 = vnet_buffer (b0)->tcp.ack_number; seq0 = vnet_buffer (b0)->tcp.seq_number; -- cgit 1.2.3-korg From b6a8ed7fa0709bbf8e091826803f50e6330689cf Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Tue, 29 Aug 2017 00:15:35 +0200 Subject: Thread safe internal buffer manager, take two First attempt to make internal buffer manager thread safe was not succesfull, so trying again. This time with more testing. Change-Id: I01b8385a9c26d233934a3339255ea4bd31c865ac Signed-off-by: Damjan Marion --- src/vlib/buffer.c | 19 +++++++++++++++++++ src/vlib/buffer.h | 6 ++++++ src/vlib/buffer_funcs.h | 11 +++++++---- 3 files changed, 32 insertions(+), 4 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index a5c955c7..908368c0 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -396,6 +396,8 @@ vlib_buffer_create_free_list_helper (vlib_main_t * vm, hash_set (bm->free_list_by_size, f->n_data_bytes, f->index); } + clib_spinlock_init (&f->global_buffers_lock); + for (i = 1; i < vec_len (vlib_mains); i++) { vlib_buffer_main_t *wbm = vlib_mains[i]->buffer_main; @@ -509,6 +511,7 @@ fill_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * fl, uword min_free_buffers) { vlib_buffer_t *buffers, *b; + vlib_buffer_free_list_t *mfl; int n, n_bytes, i; u32 *bi; u32 n_remaining, n_alloc, n_this_chunk; @@ -518,6 +521,22 @@ fill_free_list (vlib_main_t * vm, if (n <= 0) return min_free_buffers; + mfl = vlib_buffer_get_free_list (vlib_mains[0], fl->index); + if (vec_len (mfl->global_buffers) > 0) + { + int n_copy, n_left; + clib_spinlock_lock (&mfl->global_buffers_lock); + n_copy = clib_min (vec_len (mfl->global_buffers), n); + n_left = vec_len (mfl->global_buffers) - n_copy; + vec_add_aligned (fl->buffers, mfl->global_buffers + n_left, n_copy, + CLIB_CACHE_LINE_BYTES); + _vec_len (mfl->global_buffers) = n_left; + clib_spinlock_unlock (&mfl->global_buffers_lock); + n = min_free_buffers - vec_len (fl->buffers); + if (n <= 0) + return min_free_buffers; + } + /* Always allocate round number of buffers. */ n = round_pow2 (n, CLIB_CACHE_LINE_BYTES / sizeof (u32)); diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 9047ca9a..5504bf7c 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -350,6 +350,12 @@ typedef struct vlib_buffer_free_list_t /* Vector of free buffers. Each element is a byte offset into I/O heap. */ u32 *buffers; + /* global vector of free buffers, used only on main thread. + Bufers are returned to global buffers only in case when number of + buffers on free buffers list grows about threshold */ + u32 *global_buffers; + clib_spinlock_t global_buffers_lock; + /* Memory chunks allocated for this free list recorded here so they can be freed when free list is deleted. */ diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 6a662416..78bf9317 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -848,18 +848,21 @@ vlib_buffer_add_to_free_list (vlib_main_t * vm, u32 buffer_index, u8 do_init) { vlib_buffer_t *b; - u32 i; b = vlib_get_buffer (vm, buffer_index); if (PREDICT_TRUE (do_init)) vlib_buffer_init_for_free_list (b, f); vec_add1_aligned (f->buffers, buffer_index, CLIB_CACHE_LINE_BYTES); - if (vec_len (f->buffers) > 3 * VLIB_FRAME_SIZE) + if (vec_len (f->buffers) > 4 * VLIB_FRAME_SIZE) { + vlib_buffer_free_list_t *mf; + mf = vlib_buffer_get_free_list (vlib_mains[0], f->index); + clib_spinlock_lock (&mf->global_buffers_lock); /* keep last stored buffers, as they are more likely hot in the cache */ - for (i = 0; i < VLIB_FRAME_SIZE; i++) - vm->os_physmem_free (vlib_get_buffer (vm, i)); + vec_add_aligned (mf->global_buffers, f->buffers, VLIB_FRAME_SIZE, + CLIB_CACHE_LINE_BYTES); vec_delete (f->buffers, VLIB_FRAME_SIZE, 0); + clib_spinlock_unlock (&mf->global_buffers_lock); } } -- cgit 1.2.3-korg From c67787be51c4cbf4e5c1a28af621822aa8cae513 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Wed, 30 Aug 2017 17:12:25 +0200 Subject: Set runtime_path properly when running non-root non-root users should use /run/user/$PID/... Change-Id: I1ca136df7a339eff193ed9c9a396d6965b192d0e Signed-off-by: Damjan Marion --- src/vlib/unix/main.c | 13 +++++++++++-- src/vpp/vnet/main.c | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index cb34a89f..f52316e5 100644 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -57,7 +57,7 @@ #define UNIX_CLI_DEFAULT_HISTORY 50 char *vlib_default_runtime_dir __attribute__ ((weak)); -char *vlib_default_runtime_dir = "/run/vlib"; +char *vlib_default_runtime_dir = "vlib"; unix_main_t unix_main; @@ -437,7 +437,16 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) um->unix_config_complete = 1; if (um->runtime_dir == 0) - um->runtime_dir = format (0, "%s%c", vlib_default_runtime_dir, 0); + { + uid_t uid = geteuid (); + if (uid == 00) + um->runtime_dir = format (0, "/run/%s%c", + vlib_default_runtime_dir, 0); + else + um->runtime_dir = format (0, "/run/user/%u/%s%c", uid, + vlib_default_runtime_dir, 0); + } + return 0; } diff --git a/src/vpp/vnet/main.c b/src/vpp/vnet/main.c index 9fe65fe2..76371dbe 100644 --- a/src/vpp/vnet/main.c +++ b/src/vpp/vnet/main.c @@ -44,7 +44,7 @@ vpe_main_init (vlib_main_t * vm) /* * Default path for runtime data */ -char *vlib_default_runtime_dir = "/run/vpp"; +char *vlib_default_runtime_dir = "vpp"; /* * Load plugins from /usr/lib/vpp_plugins by default -- cgit 1.2.3-korg From 215961829c4ae5f738ffcd01a8d1afcab13bd0e2 Mon Sep 17 00:00:00 2001 From: Colin Tregenza Dancer Date: Mon, 4 Sep 2017 15:27:49 +0100 Subject: Refork worker thread data structures in parallel (VPP-970) Change the rebuilding of worker thread clone datastructures to run in parallel on the workers, instead of serially on main. Change-Id: Ib76bcfbef1e51f2399972090f4057be7aaa84e08 Signed-off-by: Colin Tregenza Dancer --- src/vlib/main.h | 6 + src/vlib/threads.c | 333 +++++++++++++++++++++++++++++++---------------------- src/vlib/threads.h | 11 ++ 3 files changed, 214 insertions(+), 136 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/main.h b/src/vlib/main.h index bfa7ddbe..b63c63fa 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -181,6 +181,12 @@ typedef struct vlib_main_t /* Attempt to do a post-mortem elog dump */ int elog_post_mortem_dump; + /* + * Need to call vlib_worker_thread_node_runtime_update before + * releasing worker thread barrier. Only valid in vlib_global_main. + */ + int need_vlib_worker_thread_node_runtime_update; + } vlib_main_t; /* Global main structure. */ diff --git a/src/vlib/threads.c b/src/vlib/threads.c index 0661d89a..6cd325b3 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -547,10 +547,17 @@ start_workers (vlib_main_t * vm) vlib_worker_threads->workers_at_barrier = clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES); + vlib_worker_threads->node_reforks_required = + clib_mem_alloc_aligned (sizeof (u32), CLIB_CACHE_LINE_BYTES); + /* Ask for an initial barrier sync */ *vlib_worker_threads->workers_at_barrier = 0; *vlib_worker_threads->wait_at_barrier = 1; + /* Without update or refork */ + *vlib_worker_threads->node_reforks_required = 0; + vm->need_vlib_worker_thread_node_runtime_update = 0; + worker_thread_index = 1; for (i = 0; i < vec_len (tm->registrations); i++) @@ -568,6 +575,8 @@ start_workers (vlib_main_t * vm) for (k = 0; k < tr->count; k++) { + vlib_node_t *n; + vec_add2 (vlib_worker_threads, w, 1); if (tr->mheap_size) w->thread_mheap = @@ -628,10 +637,12 @@ start_workers (vlib_main_t * vm) /* fork nodes */ nm_clone->nodes = 0; + + /* Allocate all nodes in single block for speed */ + n = clib_mem_alloc_no_fail (vec_len (nm->nodes) * sizeof (*n)); + for (j = 0; j < vec_len (nm->nodes); j++) { - vlib_node_t *n; - n = clib_mem_alloc_no_fail (sizeof (*n)); clib_memcpy (n, nm->nodes[j], sizeof (*n)); /* none of the copied nodes have enqueue rights given out */ n->owner_node_index = VLIB_INVALID_NODE_INDEX; @@ -639,6 +650,7 @@ start_workers (vlib_main_t * vm) memset (&n->stats_last_clear, 0, sizeof (n->stats_last_clear)); vec_add1 (nm_clone->nodes, n); + n++; } nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] = vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]); @@ -778,17 +790,14 @@ start_workers (vlib_main_t * vm) VLIB_MAIN_LOOP_ENTER_FUNCTION (start_workers); -void -vlib_worker_thread_node_runtime_update (void) +static inline void +worker_thread_node_runtime_update_internal (void) { int i, j; - vlib_worker_thread_t *w; vlib_main_t *vm; vlib_node_main_t *nm, *nm_clone; - vlib_node_t **old_nodes_clone; vlib_main_t *vm_clone; - vlib_node_runtime_t *rt, *old_rt; - void *oldheap; + vlib_node_runtime_t *rt; never_inline void vlib_node_runtime_sync_stats (vlib_main_t * vm, vlib_node_runtime_t * r, @@ -797,13 +806,9 @@ vlib_worker_thread_node_runtime_update (void) ASSERT (vlib_get_thread_index () == 0); - if (vec_len (vlib_mains) == 1) - return; - vm = vlib_mains[0]; nm = &vm->node_main; - ASSERT (vlib_get_thread_index () == 0); ASSERT (*vlib_worker_threads->wait_at_barrier == 1); /* @@ -833,146 +838,170 @@ vlib_worker_thread_node_runtime_update (void) } } - for (i = 1; i < vec_len (vlib_mains); i++) - { - vlib_node_runtime_t *rt; - w = vlib_worker_threads + i; - oldheap = clib_mem_set_heap (w->thread_mheap); + /* Per-worker clone rebuilds are now done on each thread */ +} - vm_clone = vlib_mains[i]; - /* Re-clone error heap */ - u64 *old_counters = vm_clone->error_main.counters; - u64 *old_counters_all_clear = vm_clone->error_main.counters_last_clear; - clib_memcpy (&vm_clone->error_main, &vm->error_main, - sizeof (vm->error_main)); - j = vec_len (vm->error_main.counters) - 1; - vec_validate_aligned (old_counters, j, CLIB_CACHE_LINE_BYTES); - vec_validate_aligned (old_counters_all_clear, j, CLIB_CACHE_LINE_BYTES); - vm_clone->error_main.counters = old_counters; - vm_clone->error_main.counters_last_clear = old_counters_all_clear; +void +vlib_worker_thread_node_refork (void) +{ + vlib_main_t *vm, *vm_clone; + vlib_node_main_t *nm, *nm_clone; + vlib_node_t **old_nodes_clone; + vlib_node_runtime_t *rt, *old_rt; - nm_clone = &vm_clone->node_main; - vec_free (nm_clone->next_frames); - nm_clone->next_frames = vec_dup (nm->next_frames); + vlib_node_t *new_n_clone; - for (j = 0; j < vec_len (nm_clone->next_frames); j++) - { - vlib_next_frame_t *nf = &nm_clone->next_frames[j]; - u32 save_node_runtime_index; - u32 save_flags; - - save_node_runtime_index = nf->node_runtime_index; - save_flags = nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH; - vlib_next_frame_init (nf); - nf->node_runtime_index = save_node_runtime_index; - nf->flags = save_flags; - } + int j; - old_nodes_clone = nm_clone->nodes; - nm_clone->nodes = 0; + vm = vlib_mains[0]; + nm = &vm->node_main; + vm_clone = vlib_get_main (); + nm_clone = &vm_clone->node_main; + + /* Re-clone error heap */ + u64 *old_counters = vm_clone->error_main.counters; + u64 *old_counters_all_clear = vm_clone->error_main.counters_last_clear; + + clib_memcpy (&vm_clone->error_main, &vm->error_main, + sizeof (vm->error_main)); + j = vec_len (vm->error_main.counters) - 1; + vec_validate_aligned (old_counters, j, CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (old_counters_all_clear, j, CLIB_CACHE_LINE_BYTES); + vm_clone->error_main.counters = old_counters; + vm_clone->error_main.counters_last_clear = old_counters_all_clear; + + nm_clone = &vm_clone->node_main; + vec_free (nm_clone->next_frames); + nm_clone->next_frames = vec_dup (nm->next_frames); + + for (j = 0; j < vec_len (nm_clone->next_frames); j++) + { + vlib_next_frame_t *nf = &nm_clone->next_frames[j]; + u32 save_node_runtime_index; + u32 save_flags; + + save_node_runtime_index = nf->node_runtime_index; + save_flags = nf->flags & VLIB_FRAME_NO_FREE_AFTER_DISPATCH; + vlib_next_frame_init (nf); + nf->node_runtime_index = save_node_runtime_index; + nf->flags = save_flags; + } - /* re-fork nodes */ - for (j = 0; j < vec_len (nm->nodes); j++) - { - vlib_node_t *old_n_clone; - vlib_node_t *new_n, *new_n_clone; + old_nodes_clone = nm_clone->nodes; + nm_clone->nodes = 0; - new_n = nm->nodes[j]; - old_n_clone = old_nodes_clone[j]; + /* re-fork nodes */ - new_n_clone = clib_mem_alloc_no_fail (sizeof (*new_n_clone)); - clib_memcpy (new_n_clone, new_n, sizeof (*new_n)); - /* none of the copied nodes have enqueue rights given out */ - new_n_clone->owner_node_index = VLIB_INVALID_NODE_INDEX; + /* Allocate all nodes in single block for speed */ + new_n_clone = + clib_mem_alloc_no_fail (vec_len (nm->nodes) * sizeof (*new_n_clone)); + for (j = 0; j < vec_len (nm->nodes); j++) + { + vlib_node_t *old_n_clone; + vlib_node_t *new_n; - if (j >= vec_len (old_nodes_clone)) - { - /* new node, set to zero */ - memset (&new_n_clone->stats_total, 0, - sizeof (new_n_clone->stats_total)); - memset (&new_n_clone->stats_last_clear, 0, - sizeof (new_n_clone->stats_last_clear)); - } - else - { - /* Copy stats if the old data is valid */ - clib_memcpy (&new_n_clone->stats_total, - &old_n_clone->stats_total, - sizeof (new_n_clone->stats_total)); - clib_memcpy (&new_n_clone->stats_last_clear, - &old_n_clone->stats_last_clear, - sizeof (new_n_clone->stats_last_clear)); - - /* keep previous node state */ - new_n_clone->state = old_n_clone->state; - } - vec_add1 (nm_clone->nodes, new_n_clone); - } - /* Free the old node clone */ - for (j = 0; j < vec_len (old_nodes_clone); j++) - clib_mem_free (old_nodes_clone[j]); - vec_free (old_nodes_clone); - - - /* re-clone internal nodes */ - old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]; - nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] = - vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]); - - vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]) - { - vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->thread_index = vm_clone->thread_index; - /* copy runtime_data, will be overwritten later for existing rt */ - if (n->runtime_data && n->runtime_data_bytes > 0) - clib_memcpy (rt->runtime_data, n->runtime_data, - clib_min (VLIB_NODE_RUNTIME_DATA_SIZE, - n->runtime_data_bytes)); - } - - for (j = 0; j < vec_len (old_rt); j++) + new_n = nm->nodes[j]; + old_n_clone = old_nodes_clone[j]; + + clib_memcpy (new_n_clone, new_n, sizeof (*new_n)); + /* none of the copied nodes have enqueue rights given out */ + new_n_clone->owner_node_index = VLIB_INVALID_NODE_INDEX; + + if (j >= vec_len (old_nodes_clone)) { - rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index); - rt->state = old_rt[j].state; - clib_memcpy (rt->runtime_data, old_rt[j].runtime_data, - VLIB_NODE_RUNTIME_DATA_SIZE); + /* new node, set to zero */ + memset (&new_n_clone->stats_total, 0, + sizeof (new_n_clone->stats_total)); + memset (&new_n_clone->stats_last_clear, 0, + sizeof (new_n_clone->stats_last_clear)); } - - vec_free (old_rt); - - /* re-clone input nodes */ - old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]; - nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] = - vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]); - - vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) - { - vlib_node_t *n = vlib_get_node (vm, rt->node_index); - rt->thread_index = vm_clone->thread_index; - /* copy runtime_data, will be overwritten later for existing rt */ - if (n->runtime_data && n->runtime_data_bytes > 0) - clib_memcpy (rt->runtime_data, n->runtime_data, - clib_min (VLIB_NODE_RUNTIME_DATA_SIZE, - n->runtime_data_bytes)); - } - - for (j = 0; j < vec_len (old_rt); j++) + else { - rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index); - rt->state = old_rt[j].state; - clib_memcpy (rt->runtime_data, old_rt[j].runtime_data, - VLIB_NODE_RUNTIME_DATA_SIZE); + /* Copy stats if the old data is valid */ + clib_memcpy (&new_n_clone->stats_total, + &old_n_clone->stats_total, + sizeof (new_n_clone->stats_total)); + clib_memcpy (&new_n_clone->stats_last_clear, + &old_n_clone->stats_last_clear, + sizeof (new_n_clone->stats_last_clear)); + + /* keep previous node state */ + new_n_clone->state = old_n_clone->state; } + vec_add1 (nm_clone->nodes, new_n_clone); + new_n_clone++; + } + /* Free the old node clones */ + clib_mem_free (old_nodes_clone[0]); + + vec_free (old_nodes_clone); - vec_free (old_rt); - nm_clone->processes = vec_dup (nm->processes); + /* re-clone internal nodes */ + old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]; + nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL] = + vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]); - clib_mem_set_heap (oldheap); + vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INTERNAL]) + { + vlib_node_t *n = vlib_get_node (vm, rt->node_index); + rt->thread_index = vm_clone->thread_index; + /* copy runtime_data, will be overwritten later for existing rt */ + if (n->runtime_data && n->runtime_data_bytes > 0) + clib_memcpy (rt->runtime_data, n->runtime_data, + clib_min (VLIB_NODE_RUNTIME_DATA_SIZE, + n->runtime_data_bytes)); + } - // vnet_main_fork_fixup (i); + for (j = 0; j < vec_len (old_rt); j++) + { + rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index); + rt->state = old_rt[j].state; + clib_memcpy (rt->runtime_data, old_rt[j].runtime_data, + VLIB_NODE_RUNTIME_DATA_SIZE); } + + vec_free (old_rt); + + /* re-clone input nodes */ + old_rt = nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]; + nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT] = + vec_dup (nm->nodes_by_type[VLIB_NODE_TYPE_INPUT]); + + vec_foreach (rt, nm_clone->nodes_by_type[VLIB_NODE_TYPE_INPUT]) + { + vlib_node_t *n = vlib_get_node (vm, rt->node_index); + rt->thread_index = vm_clone->thread_index; + /* copy runtime_data, will be overwritten later for existing rt */ + if (n->runtime_data && n->runtime_data_bytes > 0) + clib_memcpy (rt->runtime_data, n->runtime_data, + clib_min (VLIB_NODE_RUNTIME_DATA_SIZE, + n->runtime_data_bytes)); + } + + for (j = 0; j < vec_len (old_rt); j++) + { + rt = vlib_node_get_runtime (vm_clone, old_rt[j].node_index); + rt->state = old_rt[j].state; + clib_memcpy (rt->runtime_data, old_rt[j].runtime_data, + VLIB_NODE_RUNTIME_DATA_SIZE); + } + + vec_free (old_rt); + + nm_clone->processes = vec_dup (nm->processes); +} + + +void +vlib_worker_thread_node_runtime_update (void) +{ + /* + * Make a note that we need to do a node runtime update + * prior to releasing the barrier. + */ + vlib_global_main.need_vlib_worker_thread_node_runtime_update = 1; } u32 @@ -1172,6 +1201,8 @@ vlib_worker_thread_barrier_sync (vlib_main_t * vm) if (vec_len (vlib_mains) < 2) return; + ASSERT (vlib_get_thread_index () == 0); + count = vec_len (vlib_mains) - 1; /* Tolerate recursive calls */ @@ -1180,8 +1211,6 @@ vlib_worker_thread_barrier_sync (vlib_main_t * vm) vlib_worker_threads[0].barrier_sync_count++; - ASSERT (vlib_get_thread_index () == 0); - deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT; *vlib_worker_threads->wait_at_barrier = 1; @@ -1199,13 +1228,29 @@ void vlib_worker_thread_barrier_release (vlib_main_t * vm) { f64 deadline; + int refork_needed = 0; if (vec_len (vlib_mains) < 2) return; + ASSERT (vlib_get_thread_index () == 0); + if (--vlib_worker_threads[0].recursion_level > 0) return; + /* Update (all) node runtimes before releasing the barrier, if needed */ + if (vm->need_vlib_worker_thread_node_runtime_update) + { + /* Do stats elements on main thread */ + worker_thread_node_runtime_update_internal (); + vm->need_vlib_worker_thread_node_runtime_update = 0; + + /* Do per thread rebuilds in parallel */ + refork_needed = 1; + clib_smp_atomic_add (vlib_worker_threads->node_reforks_required, + (vec_len (vlib_mains) - 1)); + } + deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT; *vlib_worker_threads->wait_at_barrier = 0; @@ -1218,6 +1263,22 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm) os_panic (); } } + + /* Wait for reforks before continuing */ + if (refork_needed) + { + deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT; + + while (*vlib_worker_threads->node_reforks_required > 0) + { + if (vlib_time_now (vm) > deadline) + { + fformat (stderr, "%s: worker thread refork deadlock\n", + __FUNCTION__); + os_panic (); + } + } + } } /* diff --git a/src/vlib/threads.h b/src/vlib/threads.h index 572ce77f..c3f1cade 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -102,6 +102,7 @@ typedef struct vlib_thread_registration_t *registration; u8 *name; u64 barrier_sync_count; + volatile u32 *node_reforks_required; long lwp; int lcore_id; @@ -180,6 +181,7 @@ u32 vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts); void vlib_worker_thread_barrier_sync (vlib_main_t * vm); void vlib_worker_thread_barrier_release (vlib_main_t * vm); +void vlib_worker_thread_node_refork (void); static_always_inline uword vlib_get_thread_index (void) @@ -369,6 +371,15 @@ vlib_worker_thread_barrier_check (void) if (CLIB_DEBUG > 0) vm->parked_at_barrier = 0; clib_smp_atomic_add (vlib_worker_threads->workers_at_barrier, -1); + + if (PREDICT_FALSE (*vlib_worker_threads->node_reforks_required)) + { + vlib_worker_thread_node_refork (); + clib_smp_atomic_add (vlib_worker_threads->node_reforks_required, + -1); + while (*vlib_worker_threads->node_reforks_required) + ; + } } } -- cgit 1.2.3-korg From 9a244bb5e4a21835cac51ba7f35095b9c24547e6 Mon Sep 17 00:00:00 2001 From: Pierre Pfister Date: Thu, 27 Jul 2017 22:57:34 -0400 Subject: Add pidfile cmdline option Change-Id: Ibaa61b624eb6683b1be6901a7b29f5f73aad27b2 Signed-off-by: Pierre Pfister --- src/vlib/unix/main.c | 51 ++++++++++++++++++++++++++++++++++++++++++--------- src/vlib/unix/unix.h | 7 +++++++ src/vlib/unix/util.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 9 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index f52316e5..c90e1331 100644 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -317,6 +317,7 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) unix_main_t *um = &unix_main; clib_error_t *error = 0; gid_t gid; + int pidfd = -1; /* Defaults */ um->cli_pager_buffer_limit = UNIX_CLI_DEFAULT_PAGER_LIMIT; @@ -415,11 +416,38 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) if (setegid (gid) == -1) return clib_error_return_unix (0, "setegid"); } + else if (unformat (input, "pidfile %s", &um->pidfile)) + ; else return clib_error_return (0, "unknown input `%U'", format_unformat_error, input); } + if (um->runtime_dir == 0) + { + uid_t uid = geteuid (); + if (uid == 00) + um->runtime_dir = format (0, "/run/%s%c", + vlib_default_runtime_dir, 0); + else + um->runtime_dir = format (0, "/run/user/%u/%s%c", uid, + vlib_default_runtime_dir, 0); + } + + if (um->pidfile) + { + if ((error = vlib_unix_validate_runtime_file (um, + (char *) um->pidfile, + &um->pidfile))) + return error; + + if (((pidfd = open ((char *) um->pidfile, + O_CREAT | O_WRONLY | O_TRUNC, 0644)) < 0)) + { + return clib_error_return_unix (0, "open"); + } + } + error = setup_signal_handlers (um); if (error) return error; @@ -434,19 +462,21 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) 0) < 0) clib_error_return (0, "daemon () fails"); } - um->unix_config_complete = 1; - if (um->runtime_dir == 0) + if (pidfd >= 0) { - uid_t uid = geteuid (); - if (uid == 00) - um->runtime_dir = format (0, "/run/%s%c", - vlib_default_runtime_dir, 0); - else - um->runtime_dir = format (0, "/run/user/%u/%s%c", uid, - vlib_default_runtime_dir, 0); + u8 *lv = format (0, "%d", getpid ()); + if (write (pidfd, (char *) lv, vec_len (lv)) != vec_len (lv)) + { + vec_free (lv); + close (pidfd); + return clib_error_return_unix (0, "write"); + } + vec_free (lv); + close (pidfd); } + um->unix_config_complete = 1; return 0; } @@ -475,6 +505,9 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) * Very useful in situations where folks don't remember or can't be bothered * to include CLI commands in bug reports. * + * @cfgcmd{pidfile, <filename>} + * Writes the pid of the main thread in @c filename. + * * @cfgcmd{full-coredump} * Ask the Linux kernel to dump all memory-mapped address regions, instead * of just text+data+bss. diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h index ee1312e3..97f58944 100644 --- a/src/vlib/unix/unix.h +++ b/src/vlib/unix/unix.h @@ -106,6 +106,9 @@ typedef struct /* runtime directory path */ u8 *runtime_dir; + /* pidfile filename */ + u8 *pidfile; + /* unix config complete */ volatile int unix_config_complete; @@ -241,6 +244,10 @@ clib_error_t *foreach_directory_file (char *dir_name, clib_error_t *vlib_unix_recursive_mkdir (char *path); +clib_error_t *vlib_unix_validate_runtime_file (unix_main_t * um, + const char *path, + u8 ** full_path); + #endif /* included_unix_unix_h */ /* diff --git a/src/vlib/unix/util.c b/src/vlib/unix/util.c index 93aeb99c..312cc9b5 100644 --- a/src/vlib/unix/util.c +++ b/src/vlib/unix/util.c @@ -257,6 +257,55 @@ done: return error; } +clib_error_t * +vlib_unix_validate_runtime_file (unix_main_t * um, + const char *path, u8 ** full_path) +{ + u8 *fp = 0; + char *last_slash = 0; + + if (path[0] == '\0') + { + return clib_error_return (0, "path is an empty string"); + } + else if (strncmp (path, "../", 3) == 0 || strstr (path, "/../")) + { + return clib_error_return (0, "'..' not allowed in runtime path"); + } + else if (path[0] == '/') + { + /* Absolute path. Has to start with runtime directory */ + if (strncmp ((char *) um->runtime_dir, path, + strlen ((char *) um->runtime_dir))) + { + return clib_error_return (0, + "file %s is not in runtime directory %s", + path, um->runtime_dir); + } + fp = format (0, "%s%c", path, '\0'); + } + else + { + /* Relative path, just append to runtime */ + fp = format (0, "%s/%s%c", um->runtime_dir, path, '\0'); + } + + /* We don't want to create a directory out of the last file */ + if ((last_slash = strrchr ((char *) fp, '/')) != NULL) + *last_slash = '\0'; + + clib_error_t *error = vlib_unix_recursive_mkdir ((char *) fp); + + if (last_slash != NULL) + *last_slash = '/'; + + if (error) + vec_free (fp); + + *full_path = fp; + return error; +} + /* * fd.io coding-style-patch-verification: ON * -- cgit 1.2.3-korg From 49d66f1f42cbc310e4fa0dc526b9fdb91d0ca220 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 20 Jul 2017 18:10:35 +0200 Subject: vlib physmem rework This patch adds supprot support for multiple numa-aware physmem regions. Change-Id: I5c69a6f4da33c8ee21bdb8604d52fd2886f2327e Signed-off-by: Damjan Marion --- src/examples/vlib/main_stub.c | 3 +- src/plugins/dpdk/api/dpdk_api.c | 1 - src/plugins/dpdk/device/init.c | 17 +- src/plugins/dpdk/hqos/hqos.c | 1 - src/plugins/ixge/ixge.c | 32 ++- src/plugins/ixge/ixge.h | 2 + src/vlib.am | 4 +- src/vlib/buffer.c | 72 ++++- src/vlib/buffer.h | 3 +- src/vlib/buffer_funcs.h | 39 +-- src/vlib/main.c | 20 +- src/vlib/main.h | 16 +- src/vlib/physmem.h | 69 ++--- src/vlib/physmem_funcs.h | 161 +++++++++++ src/vlib/unix/physmem.c | 572 +++++++++++++++++++--------------------- src/vlib/unix/physmem.h | 65 ----- src/vlib/unix/unix.h | 24 +- src/vlib/unix/util.c | 113 +++++++- src/vlib/vlib.h | 3 +- 19 files changed, 700 insertions(+), 517 deletions(-) create mode 100644 src/vlib/physmem_funcs.h delete mode 100644 src/vlib/unix/physmem.h (limited to 'src/vlib') diff --git a/src/examples/vlib/main_stub.c b/src/examples/vlib/main_stub.c index 4d74bd77..3b19c53f 100644 --- a/src/examples/vlib/main_stub.c +++ b/src/examples/vlib/main_stub.c @@ -27,8 +27,7 @@ main_stub_init (vlib_main_t * vm) { clib_error_t *error; - if ((error = - unix_physmem_init (vm, /* fail_if_physical_memory_not_present */ 0))) + if ((error = unix_physmem_init (vm))) return error; if ((error = vlib_call_init_function (vm, unix_cli_init))) diff --git a/src/plugins/dpdk/api/dpdk_api.c b/src/plugins/dpdk/api/dpdk_api.c index 08afdd70..97c4bc75 100755 --- a/src/plugins/dpdk/api/dpdk_api.c +++ b/src/plugins/dpdk/api/dpdk_api.c @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index a795ba0e..e23542f7 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -17,10 +17,10 @@ #include #include #include +#include #include #include -#include #include #include @@ -1026,21 +1026,28 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) clib_bitmap_foreach (c, tm->cpu_socket_bitmap, ( { int pages_avail, page_size, mem; + clib_error_t *e = 0; vec_validate(mem_by_socket, c); mem = mem_by_socket[c]; page_size = 1024; - pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024); + e = vlib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail); - if (pages_avail < 0 || page_size * pages_avail < mem) + if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem) use_1g = 0; + if (e) + clib_error_free (e); + page_size = 2; - pages_avail = vlib_sysfs_get_free_hugepages(c, page_size * 1024); + e = vlib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail); - if (pages_avail < 0 || page_size * pages_avail < mem) + if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem) use_2m = 0; + + if (e) + clib_error_free (e); })); /* *INDENT-ON* */ diff --git a/src/plugins/dpdk/hqos/hqos.c b/src/plugins/dpdk/hqos/hqos.c index 813eb91c..c9b85652 100644 --- a/src/plugins/dpdk/hqos/hqos.c +++ b/src/plugins/dpdk/hqos/hqos.c @@ -29,7 +29,6 @@ #include #include -#include #include #include #include /* enumerate all vlib messages */ diff --git a/src/plugins/ixge/ixge.c b/src/plugins/ixge/ixge.c index e0150f41..222c148c 100644 --- a/src/plugins/ixge/ixge.c +++ b/src/plugins/ixge/ixge.c @@ -2493,10 +2493,11 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) round_pow2 (xm->n_descriptors[rt], xm->n_descriptors_per_cache_line); dq->head_index = dq->tail_index = 0; - dq->descriptors = vlib_physmem_alloc_aligned (vm, &error, - dq->n_descriptors * - sizeof (dq->descriptors[0]), - 128 /* per chip spec */ ); + dq->descriptors = + vlib_physmem_alloc_aligned (vm, xm->physmem_region, &error, + dq->n_descriptors * + sizeof (dq->descriptors[0]), + 128 /* per chip spec */ ); if (error) return error; @@ -2518,7 +2519,8 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) vlib_buffer_t *b = vlib_get_buffer (vm, dq->descriptor_buffer_indices[i]); dq->descriptors[i].rx_to_hw.tail_address = - vlib_physmem_virtual_to_physical (vm, b->data); + vlib_physmem_virtual_to_physical (vm, xm->physmem_region, + b->data); } } else @@ -2526,7 +2528,8 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) u32 i; dq->tx.head_index_write_back = - vlib_physmem_alloc (vm, &error, CLIB_CACHE_LINE_BYTES); + vlib_physmem_alloc (vm, vm->buffer_main->physmem_region, &error, + CLIB_CACHE_LINE_BYTES); for (i = 0; i < dq->n_descriptors; i++) dq->descriptors[i].tx = xm->tx_descriptor_template; @@ -2538,7 +2541,9 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) ixge_dma_regs_t *dr = get_dma_regs (xd, rt, queue_index); u64 a; - a = vlib_physmem_virtual_to_physical (vm, dq->descriptors); + a = + vlib_physmem_virtual_to_physical (vm, vm->buffer_main->physmem_region, + dq->descriptors); dr->descriptor_address[0] = a & 0xFFFFFFFF; dr->descriptor_address[1] = a >> (u64) 32; dr->n_descriptor_bytes = dq->n_descriptors * sizeof (dq->descriptors[0]); @@ -2564,7 +2569,9 @@ ixge_dma_init (ixge_device_t * xd, vlib_rx_or_tx_t rt, u32 queue_index) dq->tx.head_index_write_back[0] = dq->head_index; a = - vlib_physmem_virtual_to_physical (vm, dq->tx.head_index_write_back); + vlib_physmem_virtual_to_physical (vm, + vm->buffer_main->physmem_region, + dq->tx.head_index_write_back); dr->tx.head_index_write_back_address[0] = /* enable bit */ 1 | a; dr->tx.head_index_write_back_address[1] = (u64) a >> (u64) 32; } @@ -2850,9 +2857,12 @@ ixge_pci_init (vlib_main_t * vm, vlib_pci_device_t * dev) void *r; ixge_device_t *xd; - /* Device found: make sure we have dma memory. */ - if (unix_physmem_is_fake (vm)) - return clib_error_return (0, "no physical memory available"); + /* Allocate physmem region for DMA buffers */ + error = vlib_physmem_region_alloc (vm, "ixge decriptors", 2 << 20, 0, + VLIB_PHYSMEM_F_INIT_MHEAP, + &xm->physmem_region); + if (error) + return error; error = vlib_pci_map_resource (dev, 0, &r); if (error) diff --git a/src/plugins/ixge/ixge.h b/src/plugins/ixge/ixge.h index 779603b3..42c1bfa5 100644 --- a/src/plugins/ixge/ixge.h +++ b/src/plugins/ixge/ixge.h @@ -1266,6 +1266,8 @@ typedef struct u32 *rx_buffers_to_add; f64 time_last_stats_update; + + vlib_physmem_region_index_t physmem_region; } ixge_main_t; ixge_main_t ixge_main; diff --git a/src/vlib.am b/src/vlib.am index 111dcfa3..cab90e2d 100644 --- a/src/vlib.am +++ b/src/vlib.am @@ -13,7 +13,7 @@ lib_LTLIBRARIES += libvlib.la -libvlib_la_LIBADD = libvppinfra.la -ldl -lpthread +libvlib_la_LIBADD = libvppinfra.la -ldl -lpthread -lnuma libvlib_la_DEPENDENCIES = libvppinfra.la BUILT_SOURCES += vlib/config.h @@ -65,6 +65,7 @@ nobase_include_HEADERS += \ vlib/physmem.h \ vlib/pci/pci.h \ vlib/pci/pci_config.h \ + vlib/physmem_funcs.h \ vlib/threads.h \ vlib/trace_funcs.h \ vlib/trace.h \ @@ -84,7 +85,6 @@ libvlib_la_SOURCES += \ nobase_include_HEADERS += \ vlib/unix/cj.h \ vlib/unix/mc_socket.h \ - vlib/unix/physmem.h \ vlib/unix/plugin.h \ vlib/unix/unix.h diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index 908368c0..a5ec0e0a 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -47,6 +47,7 @@ #include vlib_buffer_callbacks_t *vlib_buffer_callbacks = 0; +static u32 vlib_buffer_physmem_sz = 32 << 20; uword vlib_buffer_length_in_chain_slow_path (vlib_main_t * vm, @@ -461,7 +462,8 @@ del_free_list (vlib_main_t * vm, vlib_buffer_free_list_t * f) u32 i; for (i = 0; i < vec_len (f->buffer_memory_allocated); i++) - vm->os_physmem_free (f->buffer_memory_allocated[i]); + vm->os_physmem_free (vm, vm->buffer_main->physmem_region, + f->buffer_memory_allocated[i]); vec_free (f->name); vec_free (f->buffer_memory_allocated); vec_free (f->buffers); @@ -552,9 +554,9 @@ fill_free_list (vlib_main_t * vm, n_bytes = n_this_chunk * (sizeof (b[0]) + fl->n_data_bytes); /* drb: removed power-of-2 ASSERT */ - buffers = vm->os_physmem_alloc_aligned (&vm->physmem_main, - n_bytes, - sizeof (vlib_buffer_t)); + buffers = + vm->os_physmem_alloc_aligned (vm, vm->buffer_main->physmem_region, + n_bytes, sizeof (vlib_buffer_t)); if (!buffers) return n_alloc; @@ -1051,10 +1053,25 @@ VLIB_CLI_COMMAND (show_buffers_command, static) = { }; /* *INDENT-ON* */ -void -vlib_buffer_cb_init (struct vlib_main_t *vm) +clib_error_t * +vlib_buffer_main_init (struct vlib_main_t * vm) { - vlib_buffer_main_t *bm = vm->buffer_main; + vlib_buffer_main_t *bm; + clib_error_t *error; + + vec_validate (vm->buffer_main, 0); + bm = vm->buffer_main; + + if (vlib_buffer_callbacks) + { + /* external plugin has registered own buffer callbacks + so we just copy them and quit */ + vlib_buffer_main_t *bm = vm->buffer_main; + clib_memcpy (&bm->cb, vlib_buffer_callbacks, + sizeof (vlib_buffer_callbacks_t)); + bm->callbacks_registered = 1; + return 0; + } bm->cb.vlib_buffer_alloc_cb = &vlib_buffer_alloc_internal; bm->cb.vlib_buffer_alloc_from_free_list_cb = @@ -1064,8 +1081,49 @@ vlib_buffer_cb_init (struct vlib_main_t *vm) bm->cb.vlib_buffer_delete_free_list_cb = &vlib_buffer_delete_free_list_internal; clib_spinlock_init (&bm->buffer_known_hash_lockp); + + /* allocate default region */ + error = vlib_physmem_region_alloc (vm, "buffers", + vlib_buffer_physmem_sz, 0, + VLIB_PHYSMEM_F_INIT_MHEAP | + VLIB_PHYSMEM_F_HAVE_BUFFERS, + &bm->physmem_region); + + if (error == 0) + return 0; + + clib_error_free (error); + + /* we my be running unpriviledged, so try to allocate fake physmem */ + error = vlib_physmem_region_alloc (vm, "buffers (fake)", + vlib_buffer_physmem_sz, 0, + VLIB_PHYSMEM_F_FAKE | + VLIB_PHYSMEM_F_INIT_MHEAP | + VLIB_PHYSMEM_F_HAVE_BUFFERS, + &bm->physmem_region); + return error; } +static clib_error_t * +vlib_buffers_configure (vlib_main_t * vm, unformat_input_t * input) +{ + u32 size_in_mb; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "memory-size-in-mb %d", &size_in_mb)) + vlib_buffer_physmem_sz = size_in_mb << 20; + else + return unformat_parse_error (input); + } + + unformat_free (input); + return 0; +} + +VLIB_EARLY_CONFIG_FUNCTION (vlib_buffers_configure, "buffers"); + + /** @endcond */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/vlib/buffer.h b/src/vlib/buffer.h index 5504bf7c..e47dbc6d 100644 --- a/src/vlib/buffer.h +++ b/src/vlib/buffer.h @@ -408,6 +408,7 @@ typedef struct buffer index */ uword buffer_mem_start; uword buffer_mem_size; + vlib_physmem_region_index_t physmem_region; /* Buffer free callback, for subversive activities */ u32 (*buffer_free_callback) (struct vlib_main_t * vm, @@ -442,7 +443,7 @@ typedef struct void vlib_buffer_add_mem_range (struct vlib_main_t *vm, uword start, uword size); -void vlib_buffer_cb_init (struct vlib_main_t *vm); +clib_error_t *vlib_buffer_main_init (struct vlib_main_t *vm); typedef struct { diff --git a/src/vlib/buffer_funcs.h b/src/vlib/buffer_funcs.h index 78bf9317..d51de6be 100644 --- a/src/vlib/buffer_funcs.h +++ b/src/vlib/buffer_funcs.h @@ -162,7 +162,7 @@ vlib_buffer_contents (vlib_main_t * vm, u32 buffer_index, u8 * contents) always_inline u64 vlib_get_buffer_data_physical_address (vlib_main_t * vm, u32 buffer_index) { - return vlib_physmem_offset_to_physical (&vm->physmem_main, + return vlib_physmem_offset_to_physical (vm, vm->buffer_main->physmem_region, (((uword) buffer_index) << CLIB_LOG2_CACHE_LINE_BYTES) + STRUCT_OFFSET_OF (vlib_buffer_t, @@ -455,43 +455,6 @@ vlib_copy_buffers (u32 * dst, u32 * src, u32 n) } } -always_inline void * -vlib_physmem_alloc_aligned (vlib_main_t * vm, clib_error_t ** error, - uword n_bytes, uword alignment) -{ - void *r = - vm->os_physmem_alloc_aligned (&vm->physmem_main, n_bytes, alignment); - if (!r) - *error = - clib_error_return (0, "failed to allocate %wd bytes of I/O memory", - n_bytes); - else - *error = 0; - return r; -} - -/* By default allocate I/O memory with cache line alignment. */ -always_inline void * -vlib_physmem_alloc (vlib_main_t * vm, clib_error_t ** error, uword n_bytes) -{ - return vlib_physmem_alloc_aligned (vm, error, n_bytes, - CLIB_CACHE_LINE_BYTES); -} - -always_inline void -vlib_physmem_free (vlib_main_t * vm, void *mem) -{ - return vm->os_physmem_free (mem); -} - -always_inline u64 -vlib_physmem_virtual_to_physical (vlib_main_t * vm, void *mem) -{ - vlib_physmem_main_t *pm = &vm->physmem_main; - uword o = pointer_to_uword (mem) - pm->virtual.start; - return vlib_physmem_offset_to_physical (pm, o); -} - /* Append given data to end of buffer, possibly allocating new buffers. */ u32 vlib_buffer_add_data (vlib_main_t * vm, u32 free_list_index, diff --git a/src/vlib/main.c b/src/vlib/main.c index 5d99e899..7875f62a 100644 --- a/src/vlib/main.c +++ b/src/vlib/main.c @@ -1705,22 +1705,16 @@ vlib_main (vlib_main_t * volatile vm, unformat_input_t * input) if (!vm->name) vm->name = "VLIB"; - vec_validate (vm->buffer_main, 0); - if (vlib_buffer_callbacks) + if ((error = unix_physmem_init (vm))) { - /* external plugin has registered own buffer callbacks - so we just copy them */ - vlib_buffer_main_t *bm = vm->buffer_main; - clib_memcpy (&bm->cb, vlib_buffer_callbacks, - sizeof (vlib_buffer_callbacks_t)); - bm->callbacks_registered = 1; + clib_error_report (error); + goto done; } - else + + if ((error = vlib_buffer_main_init (vm))) { - vlib_physmem_main_t *vpm = &vm->physmem_main; - vlib_buffer_cb_init (vm); - unix_physmem_init (vm, 0 /* fail_if_physical_memory_not_present */ ); - vlib_buffer_add_mem_range (vm, vpm->virtual.start, vpm->virtual.size); + clib_error_report (error); + goto done; } if ((error = vlib_thread_init (vm))) diff --git a/src/vlib/main.h b/src/vlib/main.h index b63c63fa..4c0cde3f 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -107,9 +107,21 @@ typedef struct vlib_main_t /* Allocate/free buffer memory for DMA transfers, descriptor rings, etc. buffer memory is guaranteed to be cache-aligned. */ - void *(*os_physmem_alloc_aligned) (vlib_physmem_main_t * pm, + + clib_error_t *(*os_physmem_region_alloc) (struct vlib_main_t * vm, + char *name, u32 size, + u8 numa_node, u32 flags, + vlib_physmem_region_index_t * + idx); + + void (*os_physmem_region_free) (struct vlib_main_t * vm, + vlib_physmem_region_index_t idx); + + void *(*os_physmem_alloc_aligned) (struct vlib_main_t * vm, + vlib_physmem_region_index_t idx, uword n_bytes, uword alignment); - void (*os_physmem_free) (void *x); + void (*os_physmem_free) (struct vlib_main_t * vm, + vlib_physmem_region_index_t idx, void *x); /* Node graph main structure. */ vlib_node_main_t node_main; diff --git a/src/vlib/physmem.h b/src/vlib/physmem.h index 9e7d52a6..a7fed124 100644 --- a/src/vlib/physmem.h +++ b/src/vlib/physmem.h @@ -40,62 +40,35 @@ #ifndef included_vlib_physmem_h #define included_vlib_physmem_h -typedef struct -{ - uword start, end, size; -} vlib_physmem_region_t; +typedef u8 vlib_physmem_region_index_t; typedef struct { - vlib_physmem_region_t virtual; - - uword log2_n_bytes_per_page; - - /* 1 << log2_n_bytes_per_page - 1. */ - uword page_mask; - + vlib_physmem_region_index_t index; + void *mem; + uword size; + int fd; + u8 log2_page_size; + u16 n_pages; + u32 page_mask; + + void *heap; + u32 flags; +#define VLIB_PHYSMEM_F_INIT_MHEAP (1<<0) +#define VLIB_PHYSMEM_F_HAVE_BUFFERS (1<<1) +#define VLIB_PHYSMEM_F_FAKE (1<<2) + + u8 numa_node; u64 *page_table; + u8 *name; +} vlib_physmem_region_t; - /* is fake physmem */ - u8 is_fake; -} vlib_physmem_main_t; - -always_inline u64 -vlib_physmem_offset_to_physical (vlib_physmem_main_t * pm, uword o) -{ - uword page_index = o >> pm->log2_n_bytes_per_page; - ASSERT (o < pm->virtual.size); - ASSERT (pm->page_table[page_index] != 0); - return (vec_elt (pm->page_table, page_index) + (o & pm->page_mask)); -} - -always_inline int -vlib_physmem_is_virtual (vlib_physmem_main_t * pm, uword p) -{ - return p >= pm->virtual.start && p < pm->virtual.end; -} - -always_inline uword -vlib_physmem_offset_of (vlib_physmem_main_t * pm, void *p) -{ - uword a = pointer_to_uword (p); - uword o; - - ASSERT (vlib_physmem_is_virtual (pm, a)); - o = a - pm->virtual.start; - - /* Offset must fit in 32 bits. */ - ASSERT ((uword) o == a - pm->virtual.start); - return o; -} -always_inline void * -vlib_physmem_at_offset (vlib_physmem_main_t * pm, uword offset) +typedef struct { - ASSERT (offset < pm->virtual.size); - return uword_to_pointer (pm->virtual.start + offset, void *); -} + vlib_physmem_region_t *regions; +} vlib_physmem_main_t; #endif /* included_vlib_physmem_h */ diff --git a/src/vlib/physmem_funcs.h b/src/vlib/physmem_funcs.h new file mode 100644 index 00000000..dbb8d9de --- /dev/null +++ b/src/vlib/physmem_funcs.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * physmem.h: virtual <-> physical memory mapping for VLIB buffers + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vlib_physmem_funcs_h +#define included_vlib_physmem_funcs_h + +always_inline vlib_physmem_region_t * +vlib_physmem_get_region (vlib_main_t * vm, u8 index) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + return pool_elt_at_index (vpm->regions, index); +} + +always_inline u64 +vlib_physmem_offset_to_physical (vlib_main_t * vm, + vlib_physmem_region_index_t idx, uword o) +{ + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + uword page_index = o >> pr->log2_page_size; + ASSERT (o < pr->size); + ASSERT (pr->page_table[page_index] != 0); + return (vec_elt (pr->page_table, page_index) + (o & pr->page_mask)); +} + +always_inline int +vlib_physmem_is_virtual (vlib_main_t * vm, vlib_physmem_region_index_t idx, + uword p) +{ + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + return p >= pointer_to_uword (pr->mem) + && p < (pointer_to_uword (pr->mem) + pr->size); +} + +always_inline uword +vlib_physmem_offset_of (vlib_main_t * vm, vlib_physmem_region_index_t idx, + void *p) +{ + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + uword a = pointer_to_uword (p); + uword o; + + ASSERT (vlib_physmem_is_virtual (vm, idx, a)); + o = a - pointer_to_uword (pr->mem); + + /* Offset must fit in 32 bits. */ + ASSERT ((uword) o == a - pointer_to_uword (pr->mem)); + + return o; +} + +always_inline void * +vlib_physmem_at_offset (vlib_main_t * vm, vlib_physmem_region_index_t idx, + uword offset) +{ + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + ASSERT (offset < pr->size); + return uword_to_pointer (pointer_to_uword (pr->mem) + offset, void *); +} + +always_inline void * +vlib_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx, + clib_error_t ** error, + uword n_bytes, uword alignment) +{ + void *r = vm->os_physmem_alloc_aligned (vm, idx, n_bytes, alignment); + if (!r) + *error = + clib_error_return (0, "failed to allocate %wd bytes of I/O memory", + n_bytes); + else + *error = 0; + return r; +} + +/* By default allocate I/O memory with cache line alignment. */ +always_inline void * +vlib_physmem_alloc (vlib_main_t * vm, vlib_physmem_region_index_t idx, + clib_error_t ** error, uword n_bytes) +{ + return vlib_physmem_alloc_aligned (vm, idx, error, n_bytes, + CLIB_CACHE_LINE_BYTES); +} + +always_inline void +vlib_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, + void *mem) +{ + return vm->os_physmem_free (vm, idx, mem); +} + +always_inline u64 +vlib_physmem_virtual_to_physical (vlib_main_t * vm, + vlib_physmem_region_index_t idx, void *mem) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr = pool_elt_at_index (vpm->regions, idx); + uword o = mem - pr->mem; + return vlib_physmem_offset_to_physical (vm, idx, o); +} + + +always_inline clib_error_t * +vlib_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, + u8 numa_node, u32 flags, + vlib_physmem_region_index_t * idx) +{ + return vm->os_physmem_region_alloc (vm, name, size, numa_node, flags, idx); +} + +always_inline void +vlib_physmem_region_free (struct vlib_main_t *vm, + vlib_physmem_region_index_t idx) +{ + vm->os_physmem_region_free (vm, idx); +} + +#endif /* included_vlib_physmem_funcs_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c index 27a5bacf..d5d5d6c8 100644 --- a/src/vlib/unix/physmem.c +++ b/src/vlib/unix/physmem.c @@ -37,24 +37,66 @@ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifndef __NR_memfd_create +#if defined __x86_64__ +#define __NR_memfd_create 319 +#elif defined __arm__ +#define __NR_memfd_create 385 +#elif defined __aarch64__ +#define __NR_memfd_create 279 +#else +#error "__NR_memfd_create unknown for this architecture" +#endif +#endif + +static inline int +memfd_create (const char *name, unsigned int flags) +{ + return syscall (__NR_memfd_create, name, flags); +} + +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif +#define MFD_ALLOW_SEALING 0x0002U +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) -static physmem_main_t physmem_main; +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ static void * -unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes, - uword alignment) +unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx, + uword n_bytes, uword alignment) { - physmem_main_t *pm = &physmem_main; + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); uword lo_offset, hi_offset; uword *to_free = 0; + if (pr->heap == 0) + return 0; + /* IO memory is always at least cache aligned. */ alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES); while (1) { - mheap_get_aligned (pm->heap, n_bytes, + mheap_get_aligned (pr->heap, n_bytes, /* align */ alignment, /* align offset */ 0, &lo_offset); @@ -63,11 +105,14 @@ unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes, if (lo_offset == ~0) break; + if (pr->flags & VLIB_PHYSMEM_F_FAKE) + break; + /* Make sure allocation does not span DMA physical chunk boundary. */ hi_offset = lo_offset + n_bytes - 1; - if ((lo_offset >> vpm->log2_n_bytes_per_page) == - (hi_offset >> vpm->log2_n_bytes_per_page)) + if ((lo_offset >> pr->log2_page_size) == + (hi_offset >> pr->log2_page_size)) break; /* Allocation would span chunk boundary, queue it to be freed as soon as @@ -79,380 +124,311 @@ unix_physmem_alloc_aligned (vlib_physmem_main_t * vpm, uword n_bytes, { uword i; for (i = 0; i < vec_len (to_free); i++) - mheap_put (pm->heap, to_free[i]); + mheap_put (pr->heap, to_free[i]); vec_free (to_free); } - return lo_offset != ~0 ? pm->heap + lo_offset : 0; + return lo_offset != ~0 ? pr->heap + lo_offset : 0; } static void -unix_physmem_free (void *x) +unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x) { - physmem_main_t *pm = &physmem_main; - + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); /* Return object to region's heap. */ - mheap_put (pm->heap, x - pm->heap); + mheap_put (pr->heap, x - pr->heap); } -static void -htlb_shutdown (void) +static u64 +get_page_paddr (int fd, uword addr) { - physmem_main_t *pm = &physmem_main; - - if (!pm->shmid) - return; - shmctl (pm->shmid, IPC_RMID, 0); - pm->shmid = 0; -} + int pagesize = sysconf (_SC_PAGESIZE); + u64 seek, pagemap = 0; -/* try to use huge TLB pgs if possible */ -static int -htlb_init (vlib_main_t * vm) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - physmem_main_t *pm = &physmem_main; - u64 hugepagesize, pagesize; - u64 pfn, seek_loc; - u64 cur, physaddr, ptbits; - int fd, i; - - pm->shmid = shmget (11 /* key, my amp goes to 11 */ , pm->mem_size, - IPC_CREAT | SHM_HUGETLB | SHM_R | SHM_W); - if (pm->shmid < 0) + seek = ((u64) addr / pagesize) * sizeof (u64); + if (lseek (fd, seek, SEEK_SET) != seek) { - clib_unix_warning ("shmget"); + clib_unix_warning ("lseek to 0x%llx", seek); return 0; } - - pm->mem = shmat (pm->shmid, NULL, 0 /* flags */ ); - if (pm->mem == 0) + if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap))) { - shmctl (pm->shmid, IPC_RMID, 0); + clib_unix_warning ("read ptbits"); return 0; } + if ((pagemap & (1ULL << 63)) == 0) + return 0; - memset (pm->mem, 0, pm->mem_size); + pagemap &= pow2_mask (55); - /* $$$ get page size info from /proc/meminfo */ - hugepagesize = 2 << 20; - pagesize = 4 << 10; - vpm->log2_n_bytes_per_page = min_log2 (hugepagesize); - vec_resize (vpm->page_table, pm->mem_size / hugepagesize); + return pagemap * pagesize; +} - vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page); - vpm->virtual.start = pointer_to_uword (pm->mem); - vpm->virtual.size = pm->mem_size; - vpm->virtual.end = vpm->virtual.start + vpm->virtual.size; +static clib_error_t * +unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, + u8 numa_node, u32 flags, + vlib_physmem_region_index_t * idx) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr; + clib_error_t *error = 0; + int pagemap_fd = -1; + u8 *mount_dir = 0; + u8 *filename = 0; + struct stat st; + int old_mpol; + int mmap_flags; + struct bitmask *old_mask = numa_allocate_nodemask (); - fd = open ("/proc/self/pagemap", O_RDONLY); + if (geteuid () != 0 && (flags & VLIB_PHYSMEM_F_FAKE) == 0) + return clib_error_return (0, "not allowed"); - if (fd < 0) + pool_get (vpm->regions, pr); + + if ((pr - vpm->regions) >= 256) { - (void) shmdt (pm->mem); - return 0; + error = clib_error_return (0, "maximum number of regions reached"); + goto error; } - pm->heap = mheap_alloc_with_flags (pm->mem, pm->mem_size, - /* Don't want mheap mmap/munmap with IO memory. */ - MHEAP_FLAG_DISABLE_VM | - MHEAP_FLAG_THREAD_SAFE); + pr->index = pr - vpm->regions; + pr->fd = -1; + pr->flags = flags; - cur = pointer_to_uword (pm->mem); - i = 0; + if (get_mempolicy (&old_mpol, old_mask->maskp, old_mask->size + 1, NULL, 0) + == -1) + { + error = clib_error_return_unix (0, "get_mempolicy"); + goto error; + } - while (cur < pointer_to_uword (pm->mem) + pm->mem_size) + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) { - pfn = (u64) cur / pagesize; - seek_loc = pfn * sizeof (u64); - if (lseek (fd, seek_loc, SEEK_SET) != seek_loc) - { - clib_unix_warning ("lseek to 0x%llx", seek_loc); - shmctl (pm->shmid, IPC_RMID, 0); - close (fd); - return 0; - } - if (read (fd, &ptbits, sizeof (ptbits)) != (sizeof (ptbits))) + if ((pagemap_fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1) { - clib_unix_warning ("read ptbits"); - shmctl (pm->shmid, IPC_RMID, 0); - close (fd); - return 0; + error = clib_error_return_unix (0, "open '/proc/self/pagemap'"); + goto error; } - /* bits 0-54 are the physical page number */ - physaddr = (ptbits & 0x7fffffffffffffULL) * pagesize; - if (CLIB_DEBUG > 1) - fformat (stderr, "pm: virtual 0x%llx physical 0x%llx\n", - cur, physaddr); - vpm->page_table[i++] = physaddr; + mount_dir = format (0, "%s/physmem_region%d%c", + vlib_unix_get_runtime_dir (), pr->index, 0); + filename = format (0, "%s/mem%c", mount_dir, 0); - cur += hugepagesize; - } - close (fd); - atexit (htlb_shutdown); - return 1; -} - -int vlib_app_physmem_init (vlib_main_t * vm, - physmem_main_t * pm, int) __attribute__ ((weak)); -int -vlib_app_physmem_init (vlib_main_t * vm, physmem_main_t * pm, int x) -{ - return 0; -} - -clib_error_t * -unix_physmem_init (vlib_main_t * vm, int physical_memory_required) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - physmem_main_t *pm = &physmem_main; - clib_error_t *error = 0; - - /* Avoid multiple calls. */ - if (vm->os_physmem_alloc_aligned) - return error; + unlink ((char *) mount_dir); - vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; - vm->os_physmem_free = unix_physmem_free; - pm->mem = MAP_FAILED; + error = vlib_unix_recursive_mkdir ((char *) mount_dir); + if (error) + goto error; - if (pm->mem_size == 0) - pm->mem_size = 16 << 20; + if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL)) + { + error = clib_error_return_unix (0, "mount hugetlb directory '%s'", + mount_dir); + goto error; + } - /* OK, Mr. App, you tell us */ - if (vlib_app_physmem_init (vm, pm, physical_memory_required)) - return 0; + if ((pr->fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1) + { + error = clib_error_return_unix (0, "open"); + goto error; + } - if (!pm->no_hugepages && htlb_init (vm)) + mmap_flags = MAP_SHARED | MAP_HUGETLB | MAP_LOCKED; + } + else { - fformat (stderr, "%s: use huge pages\n", __FUNCTION__); - return 0; + if ((pr->fd = memfd_create (name, MFD_ALLOW_SEALING)) == -1) + return clib_error_return_unix (0, "memfd_create"); + + if ((fcntl (pr->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1) + { + error = + clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)"); + goto error; + } + mmap_flags = MAP_SHARED; } - pm->mem = - mmap (0, pm->mem_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (pm->mem == MAP_FAILED) + if (fstat (pr->fd, &st)) { - error = clib_error_return_unix (0, "mmap"); - goto done; + error = clib_error_return_unix (0, "fstat"); + goto error; } - pm->heap = mheap_alloc (pm->mem, pm->mem_size); - - /* Identity map with a single page. */ - vpm->log2_n_bytes_per_page = min_log2 (pm->mem_size); - vec_add1 (vpm->page_table, pointer_to_uword (pm->mem)); - - vpm->page_mask = pow2_mask (vpm->log2_n_bytes_per_page); - vpm->virtual.start = pointer_to_uword (pm->mem); - vpm->virtual.size = pm->mem_size; - vpm->virtual.end = vpm->virtual.start + vpm->virtual.size; - vpm->is_fake = 1; + pr->log2_page_size = min_log2 (st.st_blksize); + pr->n_pages = ((size - 1) >> pr->log2_page_size) + 1; + size = pr->n_pages * (1 << pr->log2_page_size); - fformat (stderr, "%s: use fake dma pages\n", __FUNCTION__); + if ((ftruncate (pr->fd, size)) == -1) + { + error = clib_error_return_unix (0, "ftruncate length: %d", size); + goto error; + } -done: - if (error) + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) { - if (pm->mem != MAP_FAILED) - munmap (pm->mem, pm->mem_size); + error = vlib_sysfs_prealloc_hugepages (numa_node, + 1 << (pr->log2_page_size - 10), + pr->n_pages); + if (error) + goto error; } - return error; -} -static clib_error_t * -show_physmem (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - physmem_main_t *pm = &physmem_main; + numa_set_preferred (numa_node); - if (pm->heap) - vlib_cli_output (vm, "%U", format_mheap, pm->heap, /* verbose */ 1); - else - vlib_cli_output (vm, "No physmem allocated."); - return 0; -} + pr->mem = mmap (0, size, (PROT_READ | PROT_WRITE), mmap_flags, pr->fd, 0); -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (show_physmem_command, static) = { - .path = "show physmem", - .short_help = "Show physical memory allocation", - .function = show_physmem, -}; -/* *INDENT-ON* */ + if (pr->mem == MAP_FAILED) + { + pr->mem = 0; + error = clib_error_return_unix (0, "mmap"); + goto error; + } -static clib_error_t * -show_affinity (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - cpu_set_t set; - cpu_set_t *setp = &set; - int i, rv; - u8 *s = 0; - int first_set_bit_in_run = -1; - int last_set_bit_in_run = -1; - int output_done = 0; - - rv = sched_getaffinity (0 /* pid, 0 = this proc */ , - sizeof (*setp), setp); - if (rv < 0) + if (set_mempolicy (old_mpol, old_mask->maskp, old_mask->size + 1) == -1) { - vlib_cli_output (vm, "Couldn't get affinity mask: %s\n", - strerror (errno)); - return 0; + error = clib_error_return_unix (0, "set_mempolicy"); + goto error; } - for (i = 0; i < 64; i++) + pr->size = pr->n_pages << pr->log2_page_size; + pr->page_mask = (1 << pr->log2_page_size) - 1; + pr->numa_node = numa_node; + pr->name = format (0, "%s", name); + + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) { - if (CPU_ISSET (i, setp)) - { - if (first_set_bit_in_run == -1) - { - first_set_bit_in_run = i; - last_set_bit_in_run = i; - if (output_done) - s = format (s, ","); - s = format (s, "%d-", i); - output_done = 1; - } - else - { - if (i == (last_set_bit_in_run + 1)) - last_set_bit_in_run = i; - } - } - else + int i; + for (i = 0; i < pr->n_pages; i++) { - if (first_set_bit_in_run != -1) + void *ptr = pr->mem + (i << pr->log2_page_size); + int node; + move_pages (0, 1, &ptr, 0, &node, 0); + if (numa_node != node) { - if (first_set_bit_in_run == (i - 1)) - { - _vec_len (s) -= 2 + ((first_set_bit_in_run / 10)); - } - s = format (s, "%d", last_set_bit_in_run); - first_set_bit_in_run = -1; - last_set_bit_in_run = -1; + clib_warning + ("physmem page for region \'%s\' allocated on the wrong" + " numa node (requested %u actual %u)", pr->name, + pr->numa_node, node, i); + break; } } } - if (first_set_bit_in_run != -1) - s = format (s, "%d", first_set_bit_in_run); - - vlib_cli_output (vm, "Process runs on: %v", s); - return 0; -} - -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (show_affinity_command, static) = { - .path = "show affinity", - .short_help = "Show process cpu affinity", - .function = show_affinity, -}; -/* *INDENT-ON* */ + if (flags & VLIB_PHYSMEM_F_INIT_MHEAP) + { + pr->heap = mheap_alloc_with_flags (pr->mem, pr->size, + /* Don't want mheap mmap/munmap with IO memory. */ + MHEAP_FLAG_DISABLE_VM | + MHEAP_FLAG_THREAD_SAFE); + fformat (stdout, "%U", format_mheap, pr->heap, /* verbose */ 1); + } -static clib_error_t * -set_affinity (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - cpu_set_t set; - cpu_set_t *setp = &set; - int i, rv; - int another_round; - u32 first, last; + if (flags & VLIB_PHYSMEM_F_HAVE_BUFFERS) + { + vlib_buffer_add_mem_range (vm, pointer_to_uword (pr->mem), pr->size); + } - memset (setp, 0, sizeof (*setp)); + *idx = pr->index; - do + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) { - another_round = 0; - if (unformat (input, "%d-%d,", &first, &last)) + int i; + for (i = 0; i < pr->n_pages; i++) { - if (first > 64 || last > 64) - { - barf1: - vlib_cli_output (vm, "range %d-%d invalid", first, last); - return 0; - } - - for (i = first; i <= last; i++) - CPU_SET (i, setp); - another_round = 1; + uword vaddr = + pointer_to_uword (pr->mem) + (((u64) i) << pr->log2_page_size); + u64 page_paddr = get_page_paddr (pagemap_fd, vaddr); + vec_add1 (pr->page_table, page_paddr); } - else if (unformat (input, "%d-%d", &first, &last)) - { - if (first > 64 || last > 64) - goto barf1; + } - for (i = first; i <= last; i++) - CPU_SET (i, setp); - } - else if (unformat (input, "%d,", &first)) - { - if (first > 64) - { - barf2: - vlib_cli_output (vm, "cpu %d invalid", first); - return 0; - } - CPU_SET (first, setp); - another_round = 1; - } - else if (unformat (input, "%d", &first)) - { - if (first > 64) - goto barf2; + goto done; - CPU_SET (first, setp); - } - } - while (another_round); +error: + if (pr->fd > -1) + close (pr->fd); - rv = sched_setaffinity (0 /* pid, 0 = this proc */ , - sizeof (*setp), setp); + if (pr->mem) + munmap (pr->mem, size); - if (rv < 0) + memset (pr, 0, sizeof (*pr)); + pool_put (vpm->regions, pr); + +done: + if (mount_dir) { - vlib_cli_output (vm, "Couldn't get affinity mask: %s\n", - strerror (errno)); - return 0; + umount2 ((char *) mount_dir, MNT_DETACH); + rmdir ((char *) mount_dir); + vec_free (mount_dir); } - return show_affinity (vm, input, cmd); + numa_free_cpumask (old_mask); + vec_free (filename); + if (pagemap_fd > -1) + close (pagemap_fd); + return error; } -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (set_affinity_command, static) = { - .path = "set affinity", - .short_help = "Set process cpu affinity", - .function = set_affinity, -}; -/* *INDENT-ON* */ +static void +unix_physmem_region_free (vlib_main_t * vm, vlib_physmem_region_index_t idx) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + + if (pr->fd > 0) + close (pr->fd); + munmap (pr->mem, pr->size); + vec_free (pr->name); + pool_put (vpm->regions, pr); +} + +clib_error_t * +unix_physmem_init (vlib_main_t * vm) +{ + clib_error_t *error = 0; + + /* Avoid multiple calls. */ + if (vm->os_physmem_alloc_aligned) + return error; + + vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; + vm->os_physmem_free = unix_physmem_free; + vm->os_physmem_region_alloc = unix_physmem_region_alloc; + vm->os_physmem_region_free = unix_physmem_region_free; + + return error; +} static clib_error_t * -vlib_physmem_configure (vlib_main_t * vm, unformat_input_t * input) +show_physmem (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) { - physmem_main_t *pm = &physmem_main; - u32 size_in_mb; + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr; - while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + /* *INDENT-OFF* */ + pool_foreach (pr, vpm->regions, ( { - if (unformat (input, "no-huge") || unformat (input, "no-huge-pages")) - pm->no_hugepages = 1; - - else if (unformat (input, "size-in-mb %d", &size_in_mb) || - unformat (input, "size %d", &size_in_mb)) - pm->mem_size = size_in_mb << 20; + vlib_cli_output (vm, "index %u name '%s' page-size %uKB num-pages %d " + "numa-node %u fd %d\n", + pr->index, pr->name, (1 << (pr->log2_page_size -10)), + pr->n_pages, pr->numa_node, pr->fd); + if (pr->heap) + vlib_cli_output (vm, " %U", format_mheap, pr->heap, /* verbose */ 1); else - return unformat_parse_error (input); - } - - unformat_free (input); + vlib_cli_output (vm, " no heap\n"); + })); + /* *INDENT-ON* */ return 0; } -VLIB_EARLY_CONFIG_FUNCTION (vlib_physmem_configure, "physmem"); +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_physmem_command, static) = { + .path = "show physmem", + .short_help = "Show physical memory allocation", + .function = show_physmem, +}; +/* *INDENT-ON* */ /* * fd.io coding-style-patch-verification: ON diff --git a/src/vlib/unix/physmem.h b/src/vlib/unix/physmem.h deleted file mode 100644 index 5519a7d6..00000000 --- a/src/vlib/unix/physmem.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __included_physmem_h__ -#define __included_physmem_h__ - -/* Manage I/O physical memory. */ -#define _GNU_SOURCE -#include -#include -#include -#include -#include - -#include -#include - -#include /* for open */ -#include /* for flock */ -#include -#include -#include -#include -#include -#include - -typedef struct -{ - /* Virtual memory via mmaped. */ - void *mem; - - /* Size in bytes. */ - uword mem_size; - - /* Heap allocated out of virtual memory. */ - void *heap; - - /* huge TLB segment id */ - int shmid; - - /* should we try to use htlb ? */ - int no_hugepages; - -} physmem_main_t; - -#endif /* __included_physmem_h__ */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h index 97f58944..b5a33427 100644 --- a/src/vlib/unix/unix.h +++ b/src/vlib/unix/unix.h @@ -195,18 +195,7 @@ unix_save_error (unix_main_t * um, clib_error_t * error) /* Main function for Unix VLIB. */ int vlib_unix_main (int argc, char *argv[]); -/* Call to allocate/initialize physical DMA memory subsystem. - This is not an init function so that users can explicitly enable/disable - physmem when its not needed. */ -clib_error_t *unix_physmem_init (vlib_main_t * vm, - int fail_if_physical_memory_not_present); - -static inline int -unix_physmem_is_fake (vlib_main_t * vm) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - return vpm->is_fake; -} +clib_error_t *unix_physmem_init (vlib_main_t * vm); /* Set prompt for CLI. */ void vlib_unix_cli_set_prompt (char *prompt); @@ -234,7 +223,16 @@ clib_error_t *vlib_sysfs_read (char *file_name, char *fmt, ...); u8 *vlib_sysfs_link_to_name (char *link); -int vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size); +clib_error_t *vlib_sysfs_set_nr_hugepages (unsigned int numa_node, + int page_size, int nr); +clib_error_t *vlib_sysfs_get_nr_hugepages (unsigned int numa_node, + int page_size, int *v); +clib_error_t *vlib_sysfs_get_free_hugepages (unsigned int numa_node, + int page_size, int *v); +clib_error_t *vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, + int page_size, int *v); +clib_error_t *vlib_sysfs_prealloc_hugepages (unsigned int numa_node, + int page_size, int nr); clib_error_t *foreach_directory_file (char *dir_name, clib_error_t * (*f) (void *arg, diff --git a/src/vlib/unix/util.c b/src/vlib/unix/util.c index 312cc9b5..0e252aca 100644 --- a/src/vlib/unix/util.c +++ b/src/vlib/unix/util.c @@ -189,37 +189,132 @@ vlib_sysfs_link_to_name (char *link) return s; } -int -vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size) +clib_error_t * +vlib_sysfs_set_nr_hugepages (unsigned int numa_node, int page_size, int nr) { + clib_error_t *error = 0; struct stat sb; u8 *p = 0; - int r = -1; p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); if (stat ((char *) p, &sb) == 0) { if (S_ISDIR (sb.st_mode) == 0) - goto done; + { + error = clib_error_return (0, "'%s' is not directory", p); + goto done; + } } else if (numa_node == 0) { vec_reset_length (p); p = format (p, "/sys/kernel/mm%c", 0); if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) - goto done; + { + error = clib_error_return (0, "'%s' does not exist or it is not " + "directory", p); + goto done; + } } else - goto done; + { + error = clib_error_return (0, "'%s' does not exist", p); + goto done; + } _vec_len (p) -= 1; - p = format (p, "/hugepages/hugepages-%ukB/free_hugepages%c", page_size, 0); - vlib_sysfs_read ((char *) p, "%d", &r); + p = format (p, "/hugepages/hugepages-%ukB/nr_hugepages%c", page_size, 0); + vlib_sysfs_write ((char *) p, "%d", nr); done: vec_free (p); - return r; + return error; +} + + +static clib_error_t * +vlib_sysfs_get_xxx_hugepages (char *type, unsigned int numa_node, + int page_size, int *val) +{ + clib_error_t *error = 0; + struct stat sb; + u8 *p = 0; + + p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); + + if (stat ((char *) p, &sb) == 0) + { + if (S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' is not directory", p); + goto done; + } + } + else if (numa_node == 0) + { + vec_reset_length (p); + p = format (p, "/sys/kernel/mm%c", 0); + if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' does not exist or it is not " + "directory", p); + goto done; + } + } + else + { + error = clib_error_return (0, "'%s' does not exist", p); + goto done; + } + + _vec_len (p) -= 1; + p = format (p, "/hugepages/hugepages-%ukB/%s_hugepages%c", page_size, + type, 0); + error = vlib_sysfs_read ((char *) p, "%d", val); + +done: + vec_free (p); + return error; +} + +clib_error_t * +vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size, int *v) +{ + return vlib_sysfs_get_xxx_hugepages ("free", numa_node, page_size, v); +} + +clib_error_t * +vlib_sysfs_get_nr_hugepages (unsigned int numa_node, int page_size, int *v) +{ + return vlib_sysfs_get_xxx_hugepages ("nr", numa_node, page_size, v); +} + +clib_error_t * +vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, int page_size, + int *v) +{ + return vlib_sysfs_get_xxx_hugepages ("surplus", numa_node, page_size, v); +} + +clib_error_t * +vlib_sysfs_prealloc_hugepages (unsigned int numa_node, int page_size, int nr) +{ + clib_error_t *error = 0; + int n, needed; + error = vlib_sysfs_get_free_hugepages (numa_node, page_size, &n); + if (error) + return error; + needed = nr - n; + if (needed <= 0) + return 0; + + error = vlib_sysfs_get_nr_hugepages (numa_node, page_size, &n); + if (error) + return error; + clib_warning ("pre-allocating %u additional %uK hugepages on numa node %u", + needed, page_size, numa_node); + return vlib_sysfs_set_nr_hugepages (numa_node, page_size, n + needed); } clib_error_t * diff --git a/src/vlib/vlib.h b/src/vlib/vlib.h index b146a49b..eed5c5bc 100644 --- a/src/vlib/vlib.h +++ b/src/vlib/vlib.h @@ -50,6 +50,7 @@ struct vlib_main_t; /* All includes in alphabetical order. */ +#include #include #include #include @@ -57,7 +58,6 @@ struct vlib_main_t; #include #include #include -#include #include /* Main include depends on other vlib/ includes so we put it last. */ @@ -65,6 +65,7 @@ struct vlib_main_t; /* Inline/extern function declarations. */ #include +#include #include #include #include -- cgit 1.2.3-korg From ab7b8d93cf1098970bc17fb4937376bb1ff33a21 Mon Sep 17 00:00:00 2001 From: Chris Luke Date: Thu, 7 Sep 2017 07:40:13 -0400 Subject: Fixes for issues reported by Coverity (VPP-972) Change-Id: I25238debb7081b4467aec4620dfdef33fbef3295 Signed-off-by: Chris Luke --- src/svm/svm_fifo.c | 2 +- src/uri/sock_test_client.c | 2 ++ src/uri/sock_test_server.c | 12 +++++++++--- src/uri/vppcom.c | 17 ++++++++--------- src/vlib/unix/main.c | 8 ++++---- src/vnet/ipsec/ikev2.c | 11 ++++++++++- src/vppinfra/socket.c | 16 ++++++++++++++-- 7 files changed, 48 insertions(+), 20 deletions(-) (limited to 'src/vlib') diff --git a/src/svm/svm_fifo.c b/src/svm/svm_fifo.c index 8fe82f56..42eb1ee8 100644 --- a/src/svm/svm_fifo.c +++ b/src/svm/svm_fifo.c @@ -443,7 +443,7 @@ ooo_segment_try_collect (svm_fifo_t * f, u32 n_bytes_enqueued) } } - ASSERT (bytes >= 0 && bytes <= f->nitems); + ASSERT (bytes <= f->nitems); return bytes; } diff --git a/src/uri/sock_test_client.c b/src/uri/sock_test_client.c index 4319f01b..ab8e5a0e 100644 --- a/src/uri/sock_test_client.c +++ b/src/uri/sock_test_client.c @@ -895,6 +895,8 @@ main (int argc, char **argv) case 'w': fprintf (stderr, "ERROR: Option -%c requires an argument.\n", optopt); + break; + default: if (isprint (optopt)) fprintf (stderr, "ERROR: Unknown option `-%c'.\n", optopt); diff --git a/src/uri/sock_test_server.c b/src/uri/sock_test_server.c index f703b177..29adea25 100644 --- a/src/uri/sock_test_server.c +++ b/src/uri/sock_test_server.c @@ -514,9 +514,15 @@ main (int argc, char **argv) continue; } - else if (strlen ((char *) conn->buf)) - printf ("\nSERVER (fd %d): RX (%d bytes) - '%s'\n", - conn->fd, rx_bytes, conn->buf); + else if (((char *) conn->buf)[0] != 0) + { + // If it looks vaguely like a string, make sure it's terminated + ((char *) conn->buf)[rx_bytes < + conn->buf_size ? rx_bytes : + conn->buf_size - 1] = 0; + printf ("\nSERVER (fd %d): RX (%d bytes) - '%s'\n", + conn->fd, rx_bytes, conn->buf); + } } else // rx_bytes < 0 { diff --git a/src/uri/vppcom.c b/src/uri/vppcom.c index aec1295f..aa307f1d 100644 --- a/src/uri/vppcom.c +++ b/src/uri/vppcom.c @@ -1369,23 +1369,18 @@ vppcom_cfg_heapsize (char *conf_fname) argc++; char **tmp = realloc (argv, argc * sizeof (char *)); if (tmp == NULL) - { - fclose (fp); - goto defaulted; - } + goto defaulted; argv = tmp; arg = strndup (p, 1024); if (arg == NULL) - { - fclose (fp); - goto defaulted; - } + goto defaulted; argv[argc - 1] = arg; p = strtok (NULL, " \t\n"); } } fclose (fp); + fp = NULL; char **tmp = realloc (argv, (argc + 1) * sizeof (char *)); if (tmp == NULL) @@ -1438,6 +1433,10 @@ vppcom_cfg_heapsize (char *conf_fname) } defaulted: + if (fp != NULL) + fclose (fp); + if (argv != NULL) + free (argv); if (!clib_mem_init (0, vcl_cfg->heapsize)) clib_warning ("[%d] vppcom heap allocation failure!", vcm->my_pid); else if (VPPCOM_DEBUG > 0) @@ -1687,7 +1686,7 @@ input_done: unformat_free (input); file_done: - if (fd > 0) + if (fd >= 0) close (fd); } diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index c90e1331..3a92b2e3 100644 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -434,6 +434,10 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) vlib_default_runtime_dir, 0); } + error = setup_signal_handlers (um); + if (error) + return error; + if (um->pidfile) { if ((error = vlib_unix_validate_runtime_file (um, @@ -448,10 +452,6 @@ unix_config (vlib_main_t * vm, unformat_input_t * input) } } - error = setup_signal_handlers (um); - if (error) - return error; - if (!(um->flags & UNIX_FLAG_INTERACTIVE)) { openlog (vm->name, LOG_CONS | LOG_PERROR | LOG_PID, LOG_DAEMON); diff --git a/src/vnet/ipsec/ikev2.c b/src/vnet/ipsec/ikev2.c index 296654ec..a3dc7b87 100644 --- a/src/vnet/ipsec/ikev2.c +++ b/src/vnet/ipsec/ikev2.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1595,8 +1596,16 @@ ikev2_create_tunnel_interface (vnet_main_t * vnm, ikev2_sa_t * sa, + sa->profile->lifetime; if (sa->profile->lifetime_jitter) { + // This is not much better than rand(3), which Coverity warns + // is unsuitable for security applications; random_u32 is + // however fast. If this perturbance to the expiration time + // needs to use a better RNG then we may need to use something + // like /dev/urandom which has significant overhead. + u32 rnd = (u32) (vlib_time_now (vnm->vlib_main) * 1e6); + rnd = random_u32 (&rnd); + child->time_to_expiration += - 1 + (rand () % sa->profile->lifetime_jitter); + 1 + (rnd % sa->profile->lifetime_jitter); } } diff --git a/src/vppinfra/socket.c b/src/vppinfra/socket.c index 7ade440c..37dcbbfd 100644 --- a/src/vppinfra/socket.c +++ b/src/vppinfra/socket.c @@ -359,9 +359,21 @@ clib_socket_init (clib_socket_t * s) && s->flags & SOCKET_ALLOW_GROUP_WRITE) { struct stat st = { 0 }; - stat (((struct sockaddr_un *) &addr)->sun_path, &st); + if (stat (((struct sockaddr_un *) &addr)->sun_path, &st) < 0) + { + error = clib_error_return_unix (0, "stat (fd %d, '%s')", + s->fd, s->config); + goto done; + } st.st_mode |= S_IWGRP; - chmod (((struct sockaddr_un *) &addr)->sun_path, st.st_mode); + if (chmod (((struct sockaddr_un *) &addr)->sun_path, st.st_mode) < + 0) + { + error = + clib_error_return_unix (0, "chmod (fd %d, '%s', mode %o)", + s->fd, s->config, st.st_mode); + goto done; + } } } else -- cgit 1.2.3-korg From 3b64d6334b4e8d0759cff043a55042f88d1ccb0e Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Fri, 8 Sep 2017 12:26:12 +0200 Subject: vlib: move linux-specific code to vlib/linux Change-Id: Id79d2c2be7a98e15416a537c890a8f2dd6d4464d Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/init.c | 1 + src/plugins/memif/memif.c | 1 + src/plugins/memif/private.h | 30 -- src/vlib.am | 7 +- src/vlib/linux/pci.c | 666 +++++++++++++++++++++++++++++++++ src/vlib/linux/physmem.c | 411 ++++++++++++++++++++ src/vlib/linux/syscall.h | 58 +++ src/vlib/linux/sysfs.c | 250 +++++++++++++ src/vlib/linux/sysfs.h | 44 +++ src/vlib/pci/linux_pci.c | 665 -------------------------------- src/vlib/threads_cli.c | 1 + src/vlib/unix/physmem.c | 439 ---------------------- src/vlib/unix/unix.h | 17 - src/vlib/unix/util.c | 219 ----------- src/vnet/devices/af_packet/af_packet.c | 1 + 15 files changed, 1438 insertions(+), 1372 deletions(-) create mode 100644 src/vlib/linux/pci.c create mode 100644 src/vlib/linux/physmem.c create mode 100644 src/vlib/linux/syscall.h create mode 100644 src/vlib/linux/sysfs.c create mode 100644 src/vlib/linux/sysfs.h delete mode 100644 src/vlib/pci/linux_pci.c delete mode 100644 src/vlib/unix/physmem.c (limited to 'src/vlib') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index e23542f7..4ef3b676 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/src/plugins/memif/memif.c b/src/plugins/memif/memif.c index 7e2d947f..4c387b92 100644 --- a/src/plugins/memif/memif.c +++ b/src/plugins/memif/memif.c @@ -33,6 +33,7 @@ #include #include +#include #include #include #include diff --git a/src/plugins/memif/private.h b/src/plugins/memif/private.h index 985ac5ec..b5f2f8ff 100644 --- a/src/plugins/memif/private.h +++ b/src/plugins/memif/private.h @@ -228,24 +228,6 @@ int memif_create_if (vlib_main_t * vm, memif_create_if_args_t * args); int memif_delete_if (vlib_main_t * vm, memif_if_t * mif); clib_error_t *memif_plugin_api_hookup (vlib_main_t * vm); -#ifndef __NR_memfd_create -#if defined __x86_64__ -#define __NR_memfd_create 319 -#elif defined __arm__ -#define __NR_memfd_create 385 -#elif defined __aarch64__ -#define __NR_memfd_create 279 -#else -#error "__NR_memfd_create unknown for this architecture" -#endif -#endif - -static inline int -memfd_create (const char *name, unsigned int flags) -{ - return syscall (__NR_memfd_create, name, flags); -} - static_always_inline void * memif_get_buffer (memif_if_t * mif, memif_ring_t * ring, u16 slot) { @@ -253,18 +235,6 @@ memif_get_buffer (memif_if_t * mif, memif_ring_t * ring, u16 slot) return mif->regions[region].shm + ring->desc[slot].offset; } -#ifndef F_LINUX_SPECIFIC_BASE -#define F_LINUX_SPECIFIC_BASE 1024 -#endif -#define MFD_ALLOW_SEALING 0x0002U -#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) -#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) - -#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ -#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ -#define F_SEAL_GROW 0x0004 /* prevent file from growing */ -#define F_SEAL_WRITE 0x0008 /* prevent writes */ - /* memif.c */ clib_error_t *memif_init_regions_and_queues (memif_if_t * mif); clib_error_t *memif_connect (memif_if_t * mif); diff --git a/src/vlib.am b/src/vlib.am index cab90e2d..41d68690 100644 --- a/src/vlib.am +++ b/src/vlib.am @@ -32,13 +32,15 @@ libvlib_la_SOURCES = \ vlib/format.c \ vlib/i2c.c \ vlib/init.c \ + vlib/linux/pci.c \ + vlib/linux/physmem.c \ + vlib/linux/sysfs.c \ vlib/main.c \ vlib/mc.c \ vlib/node.c \ vlib/node_cli.c \ vlib/node_format.c \ vlib/pci/pci.c \ - vlib/pci/linux_pci.c \ vlib/threads.c \ vlib/threads_cli.c \ vlib/trace.c @@ -58,6 +60,8 @@ nobase_include_HEADERS += \ vlib/global_funcs.h \ vlib/i2c.h \ vlib/init.h \ + vlib/linux/sysfs.h \ + vlib/linux/syscall.h \ vlib/main.h \ vlib/mc.h \ vlib/node_funcs.h \ @@ -79,7 +83,6 @@ libvlib_la_SOURCES += \ vlib/unix/mc_socket.c \ vlib/unix/plugin.c \ vlib/unix/plugin.h \ - vlib/unix/physmem.c \ vlib/unix/util.c nobase_include_HEADERS += \ diff --git a/src/vlib/linux/pci.c b/src/vlib/linux/pci.c new file mode 100644 index 00000000..cd2affdc --- /dev/null +++ b/src/vlib/linux/pci.c @@ -0,0 +1,666 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * pci.c: Linux user space PCI bus management. + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct +{ + /* /sys/bus/pci/devices/... directory name for this device. */ + u8 *dev_dir_name; + + /* Resource file descriptors. */ + int *resource_fds; + + /* File descriptor for config space read/write. */ + int config_fd; + + /* File descriptor for /dev/uio%d */ + int uio_fd; + + /* Minor device for uio device. */ + u32 uio_minor; + + /* Index given by unix_file_add. */ + u32 unix_file_index; + +} linux_pci_device_t; + +/* Pool of PCI devices. */ +typedef struct +{ + vlib_main_t *vlib_main; + linux_pci_device_t *linux_pci_devices; +} linux_pci_main_t; + +extern linux_pci_main_t linux_pci_main; + +/* Call to allocate/initialize the pci subsystem. + This is not an init function so that users can explicitly enable + pci only when it's needed. */ +clib_error_t *pci_bus_init (vlib_main_t * vm); + +clib_error_t *vlib_pci_bind_to_uio (vlib_pci_device_t * d, + char *uio_driver_name); + +linux_pci_main_t linux_pci_main; + +clib_error_t * +vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name) +{ + clib_error_t *error = 0; + u8 *s = 0, *driver_name = 0; + DIR *dir = 0; + struct dirent *e; + int fd, clear_driver_override = 0; + u8 *dev_dir_name = format (0, "/sys/bus/pci/devices/%U", + format_vlib_pci_addr, &d->bus_address); + + s = format (s, "%v/driver%c", dev_dir_name, 0); + driver_name = vlib_sysfs_link_to_name ((char *) s); + vec_reset_length (s); + + if (driver_name && + ((strcmp ("vfio-pci", (char *) driver_name) == 0) || + (strcmp ("uio_pci_generic", (char *) driver_name) == 0) || + (strcmp ("igb_uio", (char *) driver_name) == 0))) + goto done; + + /* walk trough all linux interfaces and if interface belonging to + this device is founf check if interface is admin up */ + dir = opendir ("/sys/class/net"); + s = format (s, "%U%c", format_vlib_pci_addr, &d->bus_address, 0); + + if (!dir) + { + error = clib_error_return (0, "Skipping PCI device %U: failed to " + "read /sys/class/net", + format_vlib_pci_addr, &d->bus_address); + goto done; + } + + fd = socket (PF_INET, SOCK_DGRAM, 0); + if (fd < 0) + { + error = clib_error_return_unix (0, "socket"); + goto done; + } + + while ((e = readdir (dir))) + { + struct ifreq ifr; + struct ethtool_drvinfo drvinfo; + + if (e->d_name[0] == '.') /* skip . and .. */ + continue; + + memset (&ifr, 0, sizeof ifr); + memset (&drvinfo, 0, sizeof drvinfo); + ifr.ifr_data = (char *) &drvinfo; + strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1); + drvinfo.cmd = ETHTOOL_GDRVINFO; + if (ioctl (fd, SIOCETHTOOL, &ifr) < 0) + { + /* Some interfaces (eg "lo") don't support this ioctl */ + if ((errno != ENOTSUP) && (errno != ENODEV)) + clib_unix_warning ("ioctl fetch intf %s bus info error", + e->d_name); + continue; + } + + if (strcmp ((char *) s, drvinfo.bus_info)) + continue; + + memset (&ifr, 0, sizeof (ifr)); + strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1); + if (ioctl (fd, SIOCGIFFLAGS, &ifr) < 0) + { + error = clib_error_return_unix (0, "ioctl fetch intf %s flags", + e->d_name); + close (fd); + goto done; + } + + if (ifr.ifr_flags & IFF_UP) + { + error = clib_error_return (0, "Skipping PCI device %U as host " + "interface %s is up", + format_vlib_pci_addr, &d->bus_address, + e->d_name); + close (fd); + goto done; + } + } + + close (fd); + vec_reset_length (s); + + s = format (s, "%v/driver/unbind%c", dev_dir_name, 0); + vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); + vec_reset_length (s); + + s = format (s, "%v/driver_override%c", dev_dir_name, 0); + if (access ((char *) s, F_OK) == 0) + { + vlib_sysfs_write ((char *) s, "%s", uio_driver_name); + clear_driver_override = 1; + } + else + { + vec_reset_length (s); + s = format (s, "/sys/bus/pci/drivers/%s/new_id%c", uio_driver_name, 0); + vlib_sysfs_write ((char *) s, "0x%04x 0x%04x", d->vendor_id, + d->device_id); + } + vec_reset_length (s); + + s = format (s, "/sys/bus/pci/drivers/%s/bind%c", uio_driver_name, 0); + vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); + vec_reset_length (s); + + if (clear_driver_override) + { + s = format (s, "%v/driver_override%c", dev_dir_name, 0); + vlib_sysfs_write ((char *) s, "%c", 0); + vec_reset_length (s); + } + +done: + closedir (dir); + vec_free (s); + vec_free (dev_dir_name); + vec_free (driver_name); + return error; +} + + +static clib_error_t * +scan_uio_dir (void *arg, u8 * path_name, u8 * file_name) +{ + linux_pci_device_t *l = arg; + unformat_input_t input; + + unformat_init_string (&input, (char *) file_name, vec_len (file_name)); + + if (!unformat (&input, "uio%d", &l->uio_minor)) + abort (); + + unformat_free (&input); + return 0; +} + +static clib_error_t * +linux_pci_uio_read_ready (unix_file_t * uf) +{ + vlib_pci_main_t *pm = &pci_main; + vlib_pci_device_t *d; + int __attribute__ ((unused)) rv; + + u32 icount; + rv = read (uf->file_descriptor, &icount, 4); + + d = pool_elt_at_index (pm->pci_devs, uf->private_data); + + if (d->interrupt_handler) + d->interrupt_handler (d); + + vlib_pci_intr_enable (d); + + return /* no error */ 0; +} + +static clib_error_t * +linux_pci_uio_error_ready (unix_file_t * uf) +{ + u32 error_index = (u32) uf->private_data; + + return clib_error_return (0, "pci device %d: error", error_index); +} + +static void +add_device (vlib_pci_device_t * dev, linux_pci_device_t * pdev) +{ + vlib_pci_main_t *pm = &pci_main; + linux_pci_main_t *lpm = &linux_pci_main; + linux_pci_device_t *l; + + pool_get (lpm->linux_pci_devices, l); + l[0] = pdev[0]; + + l->dev_dir_name = vec_dup (l->dev_dir_name); + + dev->os_handle = l - lpm->linux_pci_devices; + + { + u8 *uio_dir = format (0, "%s/uio", l->dev_dir_name); + foreach_directory_file ((char *) uio_dir, scan_uio_dir, l, /* scan_dirs */ + 1); + vec_free (uio_dir); + } + + { + char *uio_name = (char *) format (0, "/dev/uio%d%c", l->uio_minor, 0); + l->uio_fd = open (uio_name, O_RDWR); + if (l->uio_fd < 0) + clib_unix_error ("open `%s'", uio_name); + vec_free (uio_name); + } + + { + unix_file_t template = { 0 }; + unix_main_t *um = &unix_main; + + template.read_function = linux_pci_uio_read_ready; + template.file_descriptor = l->uio_fd; + template.error_function = linux_pci_uio_error_ready; + template.private_data = dev - pm->pci_devs; + + l->unix_file_index = unix_file_add (um, &template); + } +} + +static void +linux_pci_device_free (linux_pci_device_t * l) +{ + int i; + for (i = 0; i < vec_len (l->resource_fds); i++) + if (l->resource_fds[i] > 0) + close (l->resource_fds[i]); + if (l->config_fd > 0) + close (l->config_fd); + if (l->uio_fd > 0) + close (l->uio_fd); + vec_free (l->resource_fds); + vec_free (l->dev_dir_name); +} + +/* Configuration space read/write. */ +clib_error_t * +vlib_pci_read_write_config (vlib_pci_device_t * dev, + vlib_read_or_write_t read_or_write, + uword address, void *data, u32 n_bytes) +{ + linux_pci_main_t *lpm = &linux_pci_main; + linux_pci_device_t *p; + int n; + + p = pool_elt_at_index (lpm->linux_pci_devices, dev->os_handle); + + if (read_or_write == VLIB_READ) + n = pread (p->config_fd, data, n_bytes, address); + else + n = pwrite (p->config_fd, data, n_bytes, address); + + if (n != n_bytes) + return clib_error_return_unix (0, "%s", + read_or_write == VLIB_READ + ? "read" : "write"); + + return 0; +} + +static clib_error_t * +os_map_pci_resource_internal (uword os_handle, + u32 resource, u8 * addr, void **result) +{ + linux_pci_main_t *pm = &linux_pci_main; + linux_pci_device_t *p; + struct stat stat_buf; + u8 *file_name; + int fd; + clib_error_t *error; + int flags = MAP_SHARED; + + error = 0; + p = pool_elt_at_index (pm->linux_pci_devices, os_handle); + + file_name = format (0, "%v/resource%d%c", p->dev_dir_name, resource, 0); + fd = open ((char *) file_name, O_RDWR); + if (fd < 0) + { + error = clib_error_return_unix (0, "open `%s'", file_name); + goto done; + } + + if (fstat (fd, &stat_buf) < 0) + { + error = clib_error_return_unix (0, "fstat `%s'", file_name); + goto done; + } + + vec_validate (p->resource_fds, resource); + p->resource_fds[resource] = fd; + if (addr != 0) + flags |= MAP_FIXED; + + *result = mmap (addr, + /* size */ stat_buf.st_size, + PROT_READ | PROT_WRITE, flags, + /* file */ fd, + /* offset */ 0); + if (*result == (void *) -1) + { + error = clib_error_return_unix (0, "mmap `%s'", file_name); + goto done; + } + +done: + if (error) + { + if (fd >= 0) + close (fd); + } + vec_free (file_name); + return error; +} + +clib_error_t * +vlib_pci_map_resource (vlib_pci_device_t * dev, u32 resource, void **result) +{ + return (os_map_pci_resource_internal + (dev->os_handle, resource, 0 /* addr */ , + result)); +} + +clib_error_t * +vlib_pci_map_resource_fixed (vlib_pci_device_t * dev, + u32 resource, u8 * addr, void **result) +{ + return (os_map_pci_resource_internal + (dev->os_handle, resource, addr, result)); +} + +void +vlib_pci_free_device (vlib_pci_device_t * dev) +{ + linux_pci_main_t *pm = &linux_pci_main; + linux_pci_device_t *l; + + l = pool_elt_at_index (pm->linux_pci_devices, dev->os_handle); + linux_pci_device_free (l); + pool_put (pm->linux_pci_devices, l); +} + +pci_device_registration_t * __attribute__ ((unused)) +pci_device_next_registered (pci_device_registration_t * r) +{ + uword i; + + /* Null vendor id marks end of initialized list. */ + for (i = 0; r->supported_devices[i].vendor_id != 0; i++) + ; + + return clib_elf_section_data_next (r, i * sizeof (r->supported_devices[0])); +} + +static clib_error_t * +init_device_from_registered (vlib_main_t * vm, + vlib_pci_device_t * dev, + linux_pci_device_t * pdev) +{ + vlib_pci_main_t *pm = &pci_main; + pci_device_registration_t *r; + pci_device_id_t *i; + clib_error_t *error; + + r = pm->pci_device_registrations; + + while (r) + { + for (i = r->supported_devices; i->vendor_id != 0; i++) + if (i->vendor_id == dev->vendor_id && i->device_id == dev->device_id) + { + error = vlib_pci_bind_to_uio (dev, "uio_pci_generic"); + if (error) + { + clib_error_report (error); + continue; + } + + add_device (dev, pdev); + dev->interrupt_handler = r->interrupt_handler; + return r->init_function (vm, dev); + } + r = r->next_registration; + } + /* No driver, close the PCI config-space FD */ + close (pdev->config_fd); + return 0; +} + +static clib_error_t * +init_device (vlib_main_t * vm, + vlib_pci_device_t * dev, linux_pci_device_t * pdev) +{ + return init_device_from_registered (vm, dev, pdev); +} + +static clib_error_t * +scan_device (void *arg, u8 * dev_dir_name, u8 * ignored) +{ + vlib_main_t *vm = arg; + vlib_pci_main_t *pm = &pci_main; + int fd; + u8 *f; + clib_error_t *error = 0; + vlib_pci_device_t *dev; + linux_pci_device_t pdev = { 0 }; + u32 tmp; + + f = format (0, "%v/config%c", dev_dir_name, 0); + fd = open ((char *) f, O_RDWR); + + /* Try read-only access if write fails. */ + if (fd < 0) + fd = open ((char *) f, O_RDONLY); + + if (fd < 0) + { + error = clib_error_return_unix (0, "open `%s'", f); + goto done; + } + + pool_get (pm->pci_devs, dev); + + /* You can only read more that 64 bytes of config space as root; so we try to + read the full space but fall back to just the first 64 bytes. */ + if (read (fd, &dev->config_data, sizeof (dev->config_data)) != + sizeof (dev->config_data) + && read (fd, &dev->config0, + sizeof (dev->config0)) != sizeof (dev->config0)) + { + pool_put (pm->pci_devs, dev); + error = clib_error_return_unix (0, "read `%s'", f); + close (fd); + goto done; + } + + { + static pci_config_header_t all_ones; + if (all_ones.vendor_id == 0) + memset (&all_ones, ~0, sizeof (all_ones)); + + if (!memcmp (&dev->config0.header, &all_ones, sizeof (all_ones))) + { + pool_put (pm->pci_devs, dev); + error = clib_error_return (0, "invalid PCI config for `%s'", f); + close (fd); + goto done; + } + } + + if (dev->config0.header.header_type == 0) + pci_config_type0_little_to_host (&dev->config0); + else + pci_config_type1_little_to_host (&dev->config1); + + /* Parse bus, dev, function from directory name. */ + { + unformat_input_t input; + + unformat_init_string (&input, (char *) dev_dir_name, + vec_len (dev_dir_name)); + + if (!unformat (&input, "/sys/bus/pci/devices/%U", + unformat_vlib_pci_addr, &dev->bus_address)) + abort (); + + unformat_free (&input); + + } + + + pdev.config_fd = fd; + pdev.dev_dir_name = dev_dir_name; + + hash_set (pm->pci_dev_index_by_pci_addr, dev->bus_address.as_u32, + dev - pm->pci_devs); + + vec_reset_length (f); + f = format (f, "%v/vpd%c", dev_dir_name, 0); + fd = open ((char *) f, O_RDONLY); + if (fd >= 0) + { + while (1) + { + u8 tag[3]; + u8 *data = 0; + int len; + + if (read (fd, &tag, 3) != 3) + break; + + if (tag[0] != 0x82 && tag[0] != 0x90 && tag[0] != 0x91) + break; + + len = (tag[2] << 8) | tag[1]; + vec_validate (data, len); + + if (read (fd, data, len) != len) + { + vec_free (data); + break; + } + if (tag[0] == 0x82) + dev->product_name = data; + else if (tag[0] == 0x90) + dev->vpd_r = data; + else if (tag[0] == 0x91) + dev->vpd_w = data; + + data = 0; + } + close (fd); + } + + dev->numa_node = -1; + vec_reset_length (f); + f = format (f, "%v/numa_node%c", dev_dir_name, 0); + vlib_sysfs_read ((char *) f, "%u", &dev->numa_node); + + vec_reset_length (f); + f = format (f, "%v/class%c", dev_dir_name, 0); + vlib_sysfs_read ((char *) f, "0x%x", &tmp); + dev->device_class = tmp >> 8; + + vec_reset_length (f); + f = format (f, "%v/vendor%c", dev_dir_name, 0); + vlib_sysfs_read ((char *) f, "0x%x", &tmp); + dev->vendor_id = tmp; + + vec_reset_length (f); + f = format (f, "%v/device%c", dev_dir_name, 0); + vlib_sysfs_read ((char *) f, "0x%x", &tmp); + dev->device_id = tmp; + + error = init_device (vm, dev, &pdev); + + vec_reset_length (f); + f = format (f, "%v/driver%c", dev_dir_name, 0); + dev->driver_name = vlib_sysfs_link_to_name ((char *) f); + +done: + vec_free (f); + return error; +} + +clib_error_t * +linux_pci_init (vlib_main_t * vm) +{ + vlib_pci_main_t *pm = &pci_main; + clib_error_t *error; + + pm->vlib_main = vm; + + if ((error = vlib_call_init_function (vm, unix_input_init))) + return error; + + ASSERT (sizeof (vlib_pci_addr_t) == sizeof (u32)); + pm->pci_dev_index_by_pci_addr = hash_create (0, sizeof (uword)); + + error = foreach_directory_file ("/sys/bus/pci/devices", scan_device, vm, + /* scan_dirs */ 0); + + /* Complain and continue. might not be root, etc. */ + if (error) + clib_error_report (error); + + return error; +} + +VLIB_INIT_FUNCTION (linux_pci_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/linux/physmem.c b/src/vlib/linux/physmem.c new file mode 100644 index 00000000..6731295c --- /dev/null +++ b/src/vlib/linux/physmem.c @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * physmem.c: Unix physical memory + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static void * +unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx, + uword n_bytes, uword alignment) +{ + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + uword lo_offset, hi_offset; + uword *to_free = 0; + + if (pr->heap == 0) + return 0; + + /* IO memory is always at least cache aligned. */ + alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES); + + while (1) + { + mheap_get_aligned (pr->heap, n_bytes, + /* align */ alignment, + /* align offset */ 0, + &lo_offset); + + /* Allocation failed? */ + if (lo_offset == ~0) + break; + + if (pr->flags & VLIB_PHYSMEM_F_FAKE) + break; + + /* Make sure allocation does not span DMA physical chunk boundary. */ + hi_offset = lo_offset + n_bytes - 1; + + if ((lo_offset >> pr->log2_page_size) == + (hi_offset >> pr->log2_page_size)) + break; + + /* Allocation would span chunk boundary, queue it to be freed as soon as + we find suitable chunk. */ + vec_add1 (to_free, lo_offset); + } + + if (to_free != 0) + { + uword i; + for (i = 0; i < vec_len (to_free); i++) + mheap_put (pr->heap, to_free[i]); + vec_free (to_free); + } + + return lo_offset != ~0 ? pr->heap + lo_offset : 0; +} + +static void +unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x) +{ + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + /* Return object to region's heap. */ + mheap_put (pr->heap, x - pr->heap); +} + +static u64 +get_page_paddr (int fd, uword addr) +{ + int pagesize = sysconf (_SC_PAGESIZE); + u64 seek, pagemap = 0; + + seek = ((u64) addr / pagesize) * sizeof (u64); + if (lseek (fd, seek, SEEK_SET) != seek) + { + clib_unix_warning ("lseek to 0x%llx", seek); + return 0; + } + if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap))) + { + clib_unix_warning ("read ptbits"); + return 0; + } + if ((pagemap & (1ULL << 63)) == 0) + return 0; + + pagemap &= pow2_mask (55); + + return pagemap * pagesize; +} + +static clib_error_t * +unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, + u8 numa_node, u32 flags, + vlib_physmem_region_index_t * idx) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr; + clib_error_t *error = 0; + int pagemap_fd = -1; + u8 *mount_dir = 0; + u8 *filename = 0; + struct stat st; + int old_mpol; + int mmap_flags; + struct bitmask *old_mask = numa_allocate_nodemask (); + + if (geteuid () != 0 && (flags & VLIB_PHYSMEM_F_FAKE) == 0) + return clib_error_return (0, "not allowed"); + + pool_get (vpm->regions, pr); + + if ((pr - vpm->regions) >= 256) + { + error = clib_error_return (0, "maximum number of regions reached"); + goto error; + } + + pr->index = pr - vpm->regions; + pr->fd = -1; + pr->flags = flags; + + if (get_mempolicy (&old_mpol, old_mask->maskp, old_mask->size + 1, NULL, 0) + == -1) + { + error = clib_error_return_unix (0, "get_mempolicy"); + goto error; + } + + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) + { + if ((pagemap_fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1) + { + error = clib_error_return_unix (0, "open '/proc/self/pagemap'"); + goto error; + } + + mount_dir = format (0, "%s/physmem_region%d%c", + vlib_unix_get_runtime_dir (), pr->index, 0); + filename = format (0, "%s/mem%c", mount_dir, 0); + + unlink ((char *) mount_dir); + + error = vlib_unix_recursive_mkdir ((char *) mount_dir); + if (error) + goto error; + + if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL)) + { + error = clib_error_return_unix (0, "mount hugetlb directory '%s'", + mount_dir); + goto error; + } + + if ((pr->fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1) + { + error = clib_error_return_unix (0, "open"); + goto error; + } + + mmap_flags = MAP_SHARED | MAP_HUGETLB | MAP_LOCKED; + } + else + { + if ((pr->fd = memfd_create (name, MFD_ALLOW_SEALING)) == -1) + return clib_error_return_unix (0, "memfd_create"); + + if ((fcntl (pr->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1) + { + error = + clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)"); + goto error; + } + mmap_flags = MAP_SHARED; + } + + if (fstat (pr->fd, &st)) + { + error = clib_error_return_unix (0, "fstat"); + goto error; + } + + pr->log2_page_size = min_log2 (st.st_blksize); + pr->n_pages = ((size - 1) >> pr->log2_page_size) + 1; + size = pr->n_pages * (1 << pr->log2_page_size); + + if ((ftruncate (pr->fd, size)) == -1) + { + error = clib_error_return_unix (0, "ftruncate length: %d", size); + goto error; + } + + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) + { + error = vlib_sysfs_prealloc_hugepages (numa_node, + 1 << (pr->log2_page_size - 10), + pr->n_pages); + if (error) + goto error; + } + + numa_set_preferred (numa_node); + + pr->mem = mmap (0, size, (PROT_READ | PROT_WRITE), mmap_flags, pr->fd, 0); + + if (pr->mem == MAP_FAILED) + { + pr->mem = 0; + error = clib_error_return_unix (0, "mmap"); + goto error; + } + + if (set_mempolicy (old_mpol, old_mask->maskp, old_mask->size + 1) == -1) + { + error = clib_error_return_unix (0, "set_mempolicy"); + goto error; + } + + pr->size = pr->n_pages << pr->log2_page_size; + pr->page_mask = (1 << pr->log2_page_size) - 1; + pr->numa_node = numa_node; + pr->name = format (0, "%s", name); + + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) + { + int i; + for (i = 0; i < pr->n_pages; i++) + { + void *ptr = pr->mem + (i << pr->log2_page_size); + int node; + move_pages (0, 1, &ptr, 0, &node, 0); + if (numa_node != node) + { + clib_warning + ("physmem page for region \'%s\' allocated on the wrong" + " numa node (requested %u actual %u)", pr->name, + pr->numa_node, node, i); + break; + } + } + } + + if (flags & VLIB_PHYSMEM_F_INIT_MHEAP) + { + pr->heap = mheap_alloc_with_flags (pr->mem, pr->size, + /* Don't want mheap mmap/munmap with IO memory. */ + MHEAP_FLAG_DISABLE_VM | + MHEAP_FLAG_THREAD_SAFE); + fformat (stdout, "%U", format_mheap, pr->heap, /* verbose */ 1); + } + + if (flags & VLIB_PHYSMEM_F_HAVE_BUFFERS) + { + vlib_buffer_add_mem_range (vm, pointer_to_uword (pr->mem), pr->size); + } + + *idx = pr->index; + + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) + { + int i; + for (i = 0; i < pr->n_pages; i++) + { + uword vaddr = + pointer_to_uword (pr->mem) + (((u64) i) << pr->log2_page_size); + u64 page_paddr = get_page_paddr (pagemap_fd, vaddr); + vec_add1 (pr->page_table, page_paddr); + } + } + + goto done; + +error: + if (pr->fd > -1) + close (pr->fd); + + if (pr->mem) + munmap (pr->mem, size); + + memset (pr, 0, sizeof (*pr)); + pool_put (vpm->regions, pr); + +done: + if (mount_dir) + { + umount2 ((char *) mount_dir, MNT_DETACH); + rmdir ((char *) mount_dir); + vec_free (mount_dir); + } + numa_free_cpumask (old_mask); + vec_free (filename); + if (pagemap_fd > -1) + close (pagemap_fd); + return error; +} + +static void +unix_physmem_region_free (vlib_main_t * vm, vlib_physmem_region_index_t idx) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); + + if (pr->fd > 0) + close (pr->fd); + munmap (pr->mem, pr->size); + vec_free (pr->name); + pool_put (vpm->regions, pr); +} + +clib_error_t * +unix_physmem_init (vlib_main_t * vm) +{ + clib_error_t *error = 0; + + /* Avoid multiple calls. */ + if (vm->os_physmem_alloc_aligned) + return error; + + vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; + vm->os_physmem_free = unix_physmem_free; + vm->os_physmem_region_alloc = unix_physmem_region_alloc; + vm->os_physmem_region_free = unix_physmem_region_free; + + return error; +} + +static clib_error_t * +show_physmem (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_physmem_main_t *vpm = &vm->physmem_main; + vlib_physmem_region_t *pr; + + /* *INDENT-OFF* */ + pool_foreach (pr, vpm->regions, ( + { + vlib_cli_output (vm, "index %u name '%s' page-size %uKB num-pages %d " + "numa-node %u fd %d\n", + pr->index, pr->name, (1 << (pr->log2_page_size -10)), + pr->n_pages, pr->numa_node, pr->fd); + if (pr->heap) + vlib_cli_output (vm, " %U", format_mheap, pr->heap, /* verbose */ 1); + else + vlib_cli_output (vm, " no heap\n"); + })); + /* *INDENT-ON* */ + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_physmem_command, static) = { + .path = "show physmem", + .short_help = "Show physical memory allocation", + .function = show_physmem, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/linux/syscall.h b/src/vlib/linux/syscall.h new file mode 100644 index 00000000..9e37997e --- /dev/null +++ b/src/vlib/linux/syscall.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_linux_syscall_h +#define included_linux_syscall_h + +#ifndef __NR_memfd_create +#if defined __x86_64__ +#define __NR_memfd_create 319 +#elif defined __arm__ +#define __NR_memfd_create 385 +#elif defined __aarch64__ +#define __NR_memfd_create 279 +#else +#error "__NR_memfd_create unknown for this architecture" +#endif +#endif + +static inline int +memfd_create (const char *name, unsigned int flags) +{ + return syscall (__NR_memfd_create, name, flags); +} + +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif +#define MFD_ALLOW_SEALING 0x0002U +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ + + +#endif /* included_linux_syscall_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/linux/sysfs.c b/src/vlib/linux/sysfs.c new file mode 100644 index 00000000..f92f9ef5 --- /dev/null +++ b/src/vlib/linux/sysfs.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include + +clib_error_t * +vlib_sysfs_write (char *file_name, char *fmt, ...) +{ + u8 *s; + int fd; + clib_error_t *error = 0; + + fd = open (file_name, O_WRONLY); + if (fd < 0) + return clib_error_return_unix (0, "open `%s'", file_name); + + va_list va; + va_start (va, fmt); + s = va_format (0, fmt, &va); + va_end (va); + + if (write (fd, s, vec_len (s)) < 0) + error = clib_error_return_unix (0, "write `%s'", file_name); + + vec_free (s); + close (fd); + return error; +} + +clib_error_t * +vlib_sysfs_read (char *file_name, char *fmt, ...) +{ + unformat_input_t input; + u8 *s = 0; + int fd; + ssize_t sz; + uword result; + + fd = open (file_name, O_RDONLY); + if (fd < 0) + return clib_error_return_unix (0, "open `%s'", file_name); + + vec_validate (s, 4095); + + sz = read (fd, s, vec_len (s)); + if (sz < 0) + { + close (fd); + vec_free (s); + return clib_error_return_unix (0, "read `%s'", file_name); + } + + _vec_len (s) = sz; + unformat_init_vector (&input, s); + + va_list va; + va_start (va, fmt); + result = va_unformat (&input, fmt, &va); + va_end (va); + + vec_free (s); + close (fd); + + if (result == 0) + return clib_error_return (0, "unformat error"); + + return 0; +} + +u8 * +vlib_sysfs_link_to_name (char *link) +{ + char *p, buffer[64]; + unformat_input_t in; + u8 *s = 0; + int r; + + r = readlink (link, buffer, sizeof (buffer) - 1); + + if (r < 0) + return 0; + + buffer[r] = 0; + p = strrchr (buffer, '/'); + + if (!p) + return 0; + + unformat_init_string (&in, p + 1, strlen (p + 1)); + if (unformat (&in, "%s", &s) != 1) + clib_unix_warning ("no string?"); + unformat_free (&in); + + return s; +} + +clib_error_t * +vlib_sysfs_set_nr_hugepages (unsigned int numa_node, int page_size, int nr) +{ + clib_error_t *error = 0; + struct stat sb; + u8 *p = 0; + + p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); + + if (stat ((char *) p, &sb) == 0) + { + if (S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' is not directory", p); + goto done; + } + } + else if (numa_node == 0) + { + vec_reset_length (p); + p = format (p, "/sys/kernel/mm%c", 0); + if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' does not exist or it is not " + "directory", p); + goto done; + } + } + else + { + error = clib_error_return (0, "'%s' does not exist", p); + goto done; + } + + _vec_len (p) -= 1; + p = format (p, "/hugepages/hugepages-%ukB/nr_hugepages%c", page_size, 0); + vlib_sysfs_write ((char *) p, "%d", nr); + +done: + vec_free (p); + return error; +} + + +static clib_error_t * +vlib_sysfs_get_xxx_hugepages (char *type, unsigned int numa_node, + int page_size, int *val) +{ + clib_error_t *error = 0; + struct stat sb; + u8 *p = 0; + + p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); + + if (stat ((char *) p, &sb) == 0) + { + if (S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' is not directory", p); + goto done; + } + } + else if (numa_node == 0) + { + vec_reset_length (p); + p = format (p, "/sys/kernel/mm%c", 0); + if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' does not exist or it is not " + "directory", p); + goto done; + } + } + else + { + error = clib_error_return (0, "'%s' does not exist", p); + goto done; + } + + _vec_len (p) -= 1; + p = format (p, "/hugepages/hugepages-%ukB/%s_hugepages%c", page_size, + type, 0); + error = vlib_sysfs_read ((char *) p, "%d", val); + +done: + vec_free (p); + return error; +} + +clib_error_t * +vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size, int *v) +{ + return vlib_sysfs_get_xxx_hugepages ("free", numa_node, page_size, v); +} + +clib_error_t * +vlib_sysfs_get_nr_hugepages (unsigned int numa_node, int page_size, int *v) +{ + return vlib_sysfs_get_xxx_hugepages ("nr", numa_node, page_size, v); +} + +clib_error_t * +vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, int page_size, + int *v) +{ + return vlib_sysfs_get_xxx_hugepages ("surplus", numa_node, page_size, v); +} + +clib_error_t * +vlib_sysfs_prealloc_hugepages (unsigned int numa_node, int page_size, int nr) +{ + clib_error_t *error = 0; + int n, needed; + error = vlib_sysfs_get_free_hugepages (numa_node, page_size, &n); + if (error) + return error; + needed = nr - n; + if (needed <= 0) + return 0; + + error = vlib_sysfs_get_nr_hugepages (numa_node, page_size, &n); + if (error) + return error; + clib_warning ("pre-allocating %u additional %uK hugepages on numa node %u", + needed, page_size, numa_node); + return vlib_sysfs_set_nr_hugepages (numa_node, page_size, n + needed); +} + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/linux/sysfs.h b/src/vlib/linux/sysfs.h new file mode 100644 index 00000000..14b71317 --- /dev/null +++ b/src/vlib/linux/sysfs.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_linux_sysfs_h +#define included_linux_sysfs_h + +clib_error_t *vlib_sysfs_write (char *file_name, char *fmt, ...); + +clib_error_t *vlib_sysfs_read (char *file_name, char *fmt, ...); + +u8 *vlib_sysfs_link_to_name (char *link); + +clib_error_t *vlib_sysfs_set_nr_hugepages (unsigned int numa_node, + int page_size, int nr); +clib_error_t *vlib_sysfs_get_nr_hugepages (unsigned int numa_node, + int page_size, int *v); +clib_error_t *vlib_sysfs_get_free_hugepages (unsigned int numa_node, + int page_size, int *v); +clib_error_t *vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, + int page_size, int *v); +clib_error_t *vlib_sysfs_prealloc_hugepages (unsigned int numa_node, + int page_size, int nr); + +#endif /* included_linux_sysfs_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vlib/pci/linux_pci.c b/src/vlib/pci/linux_pci.c deleted file mode 100644 index 2d3c0a88..00000000 --- a/src/vlib/pci/linux_pci.c +++ /dev/null @@ -1,665 +0,0 @@ -/* - * Copyright (c) 2016 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * pci.c: Linux user space PCI bus management. - * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -typedef struct -{ - /* /sys/bus/pci/devices/... directory name for this device. */ - u8 *dev_dir_name; - - /* Resource file descriptors. */ - int *resource_fds; - - /* File descriptor for config space read/write. */ - int config_fd; - - /* File descriptor for /dev/uio%d */ - int uio_fd; - - /* Minor device for uio device. */ - u32 uio_minor; - - /* Index given by unix_file_add. */ - u32 unix_file_index; - -} linux_pci_device_t; - -/* Pool of PCI devices. */ -typedef struct -{ - vlib_main_t *vlib_main; - linux_pci_device_t *linux_pci_devices; -} linux_pci_main_t; - -extern linux_pci_main_t linux_pci_main; - -/* Call to allocate/initialize the pci subsystem. - This is not an init function so that users can explicitly enable - pci only when it's needed. */ -clib_error_t *pci_bus_init (vlib_main_t * vm); - -clib_error_t *vlib_pci_bind_to_uio (vlib_pci_device_t * d, - char *uio_driver_name); - -linux_pci_main_t linux_pci_main; - -clib_error_t * -vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name) -{ - clib_error_t *error = 0; - u8 *s = 0, *driver_name = 0; - DIR *dir = 0; - struct dirent *e; - int fd, clear_driver_override = 0; - u8 *dev_dir_name = format (0, "/sys/bus/pci/devices/%U", - format_vlib_pci_addr, &d->bus_address); - - s = format (s, "%v/driver%c", dev_dir_name, 0); - driver_name = vlib_sysfs_link_to_name ((char *) s); - vec_reset_length (s); - - if (driver_name && - ((strcmp ("vfio-pci", (char *) driver_name) == 0) || - (strcmp ("uio_pci_generic", (char *) driver_name) == 0) || - (strcmp ("igb_uio", (char *) driver_name) == 0))) - goto done; - - /* walk trough all linux interfaces and if interface belonging to - this device is founf check if interface is admin up */ - dir = opendir ("/sys/class/net"); - s = format (s, "%U%c", format_vlib_pci_addr, &d->bus_address, 0); - - if (!dir) - { - error = clib_error_return (0, "Skipping PCI device %U: failed to " - "read /sys/class/net", - format_vlib_pci_addr, &d->bus_address); - goto done; - } - - fd = socket (PF_INET, SOCK_DGRAM, 0); - if (fd < 0) - { - error = clib_error_return_unix (0, "socket"); - goto done; - } - - while ((e = readdir (dir))) - { - struct ifreq ifr; - struct ethtool_drvinfo drvinfo; - - if (e->d_name[0] == '.') /* skip . and .. */ - continue; - - memset (&ifr, 0, sizeof ifr); - memset (&drvinfo, 0, sizeof drvinfo); - ifr.ifr_data = (char *) &drvinfo; - strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1); - drvinfo.cmd = ETHTOOL_GDRVINFO; - if (ioctl (fd, SIOCETHTOOL, &ifr) < 0) - { - /* Some interfaces (eg "lo") don't support this ioctl */ - if ((errno != ENOTSUP) && (errno != ENODEV)) - clib_unix_warning ("ioctl fetch intf %s bus info error", - e->d_name); - continue; - } - - if (strcmp ((char *) s, drvinfo.bus_info)) - continue; - - memset (&ifr, 0, sizeof (ifr)); - strncpy (ifr.ifr_name, e->d_name, IFNAMSIZ - 1); - if (ioctl (fd, SIOCGIFFLAGS, &ifr) < 0) - { - error = clib_error_return_unix (0, "ioctl fetch intf %s flags", - e->d_name); - close (fd); - goto done; - } - - if (ifr.ifr_flags & IFF_UP) - { - error = clib_error_return (0, "Skipping PCI device %U as host " - "interface %s is up", - format_vlib_pci_addr, &d->bus_address, - e->d_name); - close (fd); - goto done; - } - } - - close (fd); - vec_reset_length (s); - - s = format (s, "%v/driver/unbind%c", dev_dir_name, 0); - vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); - vec_reset_length (s); - - s = format (s, "%v/driver_override%c", dev_dir_name, 0); - if (access ((char *) s, F_OK) == 0) - { - vlib_sysfs_write ((char *) s, "%s", uio_driver_name); - clear_driver_override = 1; - } - else - { - vec_reset_length (s); - s = format (s, "/sys/bus/pci/drivers/%s/new_id%c", uio_driver_name, 0); - vlib_sysfs_write ((char *) s, "0x%04x 0x%04x", d->vendor_id, - d->device_id); - } - vec_reset_length (s); - - s = format (s, "/sys/bus/pci/drivers/%s/bind%c", uio_driver_name, 0); - vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); - vec_reset_length (s); - - if (clear_driver_override) - { - s = format (s, "%v/driver_override%c", dev_dir_name, 0); - vlib_sysfs_write ((char *) s, "%c", 0); - vec_reset_length (s); - } - -done: - closedir (dir); - vec_free (s); - vec_free (dev_dir_name); - vec_free (driver_name); - return error; -} - - -static clib_error_t * -scan_uio_dir (void *arg, u8 * path_name, u8 * file_name) -{ - linux_pci_device_t *l = arg; - unformat_input_t input; - - unformat_init_string (&input, (char *) file_name, vec_len (file_name)); - - if (!unformat (&input, "uio%d", &l->uio_minor)) - abort (); - - unformat_free (&input); - return 0; -} - -static clib_error_t * -linux_pci_uio_read_ready (unix_file_t * uf) -{ - vlib_pci_main_t *pm = &pci_main; - vlib_pci_device_t *d; - int __attribute__ ((unused)) rv; - - u32 icount; - rv = read (uf->file_descriptor, &icount, 4); - - d = pool_elt_at_index (pm->pci_devs, uf->private_data); - - if (d->interrupt_handler) - d->interrupt_handler (d); - - vlib_pci_intr_enable (d); - - return /* no error */ 0; -} - -static clib_error_t * -linux_pci_uio_error_ready (unix_file_t * uf) -{ - u32 error_index = (u32) uf->private_data; - - return clib_error_return (0, "pci device %d: error", error_index); -} - -static void -add_device (vlib_pci_device_t * dev, linux_pci_device_t * pdev) -{ - vlib_pci_main_t *pm = &pci_main; - linux_pci_main_t *lpm = &linux_pci_main; - linux_pci_device_t *l; - - pool_get (lpm->linux_pci_devices, l); - l[0] = pdev[0]; - - l->dev_dir_name = vec_dup (l->dev_dir_name); - - dev->os_handle = l - lpm->linux_pci_devices; - - { - u8 *uio_dir = format (0, "%s/uio", l->dev_dir_name); - foreach_directory_file ((char *) uio_dir, scan_uio_dir, l, /* scan_dirs */ - 1); - vec_free (uio_dir); - } - - { - char *uio_name = (char *) format (0, "/dev/uio%d%c", l->uio_minor, 0); - l->uio_fd = open (uio_name, O_RDWR); - if (l->uio_fd < 0) - clib_unix_error ("open `%s'", uio_name); - vec_free (uio_name); - } - - { - unix_file_t template = { 0 }; - unix_main_t *um = &unix_main; - - template.read_function = linux_pci_uio_read_ready; - template.file_descriptor = l->uio_fd; - template.error_function = linux_pci_uio_error_ready; - template.private_data = dev - pm->pci_devs; - - l->unix_file_index = unix_file_add (um, &template); - } -} - -static void -linux_pci_device_free (linux_pci_device_t * l) -{ - int i; - for (i = 0; i < vec_len (l->resource_fds); i++) - if (l->resource_fds[i] > 0) - close (l->resource_fds[i]); - if (l->config_fd > 0) - close (l->config_fd); - if (l->uio_fd > 0) - close (l->uio_fd); - vec_free (l->resource_fds); - vec_free (l->dev_dir_name); -} - -/* Configuration space read/write. */ -clib_error_t * -vlib_pci_read_write_config (vlib_pci_device_t * dev, - vlib_read_or_write_t read_or_write, - uword address, void *data, u32 n_bytes) -{ - linux_pci_main_t *lpm = &linux_pci_main; - linux_pci_device_t *p; - int n; - - p = pool_elt_at_index (lpm->linux_pci_devices, dev->os_handle); - - if (read_or_write == VLIB_READ) - n = pread (p->config_fd, data, n_bytes, address); - else - n = pwrite (p->config_fd, data, n_bytes, address); - - if (n != n_bytes) - return clib_error_return_unix (0, "%s", - read_or_write == VLIB_READ - ? "read" : "write"); - - return 0; -} - -static clib_error_t * -os_map_pci_resource_internal (uword os_handle, - u32 resource, u8 * addr, void **result) -{ - linux_pci_main_t *pm = &linux_pci_main; - linux_pci_device_t *p; - struct stat stat_buf; - u8 *file_name; - int fd; - clib_error_t *error; - int flags = MAP_SHARED; - - error = 0; - p = pool_elt_at_index (pm->linux_pci_devices, os_handle); - - file_name = format (0, "%v/resource%d%c", p->dev_dir_name, resource, 0); - fd = open ((char *) file_name, O_RDWR); - if (fd < 0) - { - error = clib_error_return_unix (0, "open `%s'", file_name); - goto done; - } - - if (fstat (fd, &stat_buf) < 0) - { - error = clib_error_return_unix (0, "fstat `%s'", file_name); - goto done; - } - - vec_validate (p->resource_fds, resource); - p->resource_fds[resource] = fd; - if (addr != 0) - flags |= MAP_FIXED; - - *result = mmap (addr, - /* size */ stat_buf.st_size, - PROT_READ | PROT_WRITE, flags, - /* file */ fd, - /* offset */ 0); - if (*result == (void *) -1) - { - error = clib_error_return_unix (0, "mmap `%s'", file_name); - goto done; - } - -done: - if (error) - { - if (fd >= 0) - close (fd); - } - vec_free (file_name); - return error; -} - -clib_error_t * -vlib_pci_map_resource (vlib_pci_device_t * dev, u32 resource, void **result) -{ - return (os_map_pci_resource_internal - (dev->os_handle, resource, 0 /* addr */ , - result)); -} - -clib_error_t * -vlib_pci_map_resource_fixed (vlib_pci_device_t * dev, - u32 resource, u8 * addr, void **result) -{ - return (os_map_pci_resource_internal - (dev->os_handle, resource, addr, result)); -} - -void -vlib_pci_free_device (vlib_pci_device_t * dev) -{ - linux_pci_main_t *pm = &linux_pci_main; - linux_pci_device_t *l; - - l = pool_elt_at_index (pm->linux_pci_devices, dev->os_handle); - linux_pci_device_free (l); - pool_put (pm->linux_pci_devices, l); -} - -pci_device_registration_t * __attribute__ ((unused)) -pci_device_next_registered (pci_device_registration_t * r) -{ - uword i; - - /* Null vendor id marks end of initialized list. */ - for (i = 0; r->supported_devices[i].vendor_id != 0; i++) - ; - - return clib_elf_section_data_next (r, i * sizeof (r->supported_devices[0])); -} - -static clib_error_t * -init_device_from_registered (vlib_main_t * vm, - vlib_pci_device_t * dev, - linux_pci_device_t * pdev) -{ - vlib_pci_main_t *pm = &pci_main; - pci_device_registration_t *r; - pci_device_id_t *i; - clib_error_t *error; - - r = pm->pci_device_registrations; - - while (r) - { - for (i = r->supported_devices; i->vendor_id != 0; i++) - if (i->vendor_id == dev->vendor_id && i->device_id == dev->device_id) - { - error = vlib_pci_bind_to_uio (dev, "uio_pci_generic"); - if (error) - { - clib_error_report (error); - continue; - } - - add_device (dev, pdev); - dev->interrupt_handler = r->interrupt_handler; - return r->init_function (vm, dev); - } - r = r->next_registration; - } - /* No driver, close the PCI config-space FD */ - close (pdev->config_fd); - return 0; -} - -static clib_error_t * -init_device (vlib_main_t * vm, - vlib_pci_device_t * dev, linux_pci_device_t * pdev) -{ - return init_device_from_registered (vm, dev, pdev); -} - -static clib_error_t * -scan_device (void *arg, u8 * dev_dir_name, u8 * ignored) -{ - vlib_main_t *vm = arg; - vlib_pci_main_t *pm = &pci_main; - int fd; - u8 *f; - clib_error_t *error = 0; - vlib_pci_device_t *dev; - linux_pci_device_t pdev = { 0 }; - u32 tmp; - - f = format (0, "%v/config%c", dev_dir_name, 0); - fd = open ((char *) f, O_RDWR); - - /* Try read-only access if write fails. */ - if (fd < 0) - fd = open ((char *) f, O_RDONLY); - - if (fd < 0) - { - error = clib_error_return_unix (0, "open `%s'", f); - goto done; - } - - pool_get (pm->pci_devs, dev); - - /* You can only read more that 64 bytes of config space as root; so we try to - read the full space but fall back to just the first 64 bytes. */ - if (read (fd, &dev->config_data, sizeof (dev->config_data)) != - sizeof (dev->config_data) - && read (fd, &dev->config0, - sizeof (dev->config0)) != sizeof (dev->config0)) - { - pool_put (pm->pci_devs, dev); - error = clib_error_return_unix (0, "read `%s'", f); - close (fd); - goto done; - } - - { - static pci_config_header_t all_ones; - if (all_ones.vendor_id == 0) - memset (&all_ones, ~0, sizeof (all_ones)); - - if (!memcmp (&dev->config0.header, &all_ones, sizeof (all_ones))) - { - pool_put (pm->pci_devs, dev); - error = clib_error_return (0, "invalid PCI config for `%s'", f); - close (fd); - goto done; - } - } - - if (dev->config0.header.header_type == 0) - pci_config_type0_little_to_host (&dev->config0); - else - pci_config_type1_little_to_host (&dev->config1); - - /* Parse bus, dev, function from directory name. */ - { - unformat_input_t input; - - unformat_init_string (&input, (char *) dev_dir_name, - vec_len (dev_dir_name)); - - if (!unformat (&input, "/sys/bus/pci/devices/%U", - unformat_vlib_pci_addr, &dev->bus_address)) - abort (); - - unformat_free (&input); - - } - - - pdev.config_fd = fd; - pdev.dev_dir_name = dev_dir_name; - - hash_set (pm->pci_dev_index_by_pci_addr, dev->bus_address.as_u32, - dev - pm->pci_devs); - - vec_reset_length (f); - f = format (f, "%v/vpd%c", dev_dir_name, 0); - fd = open ((char *) f, O_RDONLY); - if (fd >= 0) - { - while (1) - { - u8 tag[3]; - u8 *data = 0; - int len; - - if (read (fd, &tag, 3) != 3) - break; - - if (tag[0] != 0x82 && tag[0] != 0x90 && tag[0] != 0x91) - break; - - len = (tag[2] << 8) | tag[1]; - vec_validate (data, len); - - if (read (fd, data, len) != len) - { - vec_free (data); - break; - } - if (tag[0] == 0x82) - dev->product_name = data; - else if (tag[0] == 0x90) - dev->vpd_r = data; - else if (tag[0] == 0x91) - dev->vpd_w = data; - - data = 0; - } - close (fd); - } - - dev->numa_node = -1; - vec_reset_length (f); - f = format (f, "%v/numa_node%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "%u", &dev->numa_node); - - vec_reset_length (f); - f = format (f, "%v/class%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "0x%x", &tmp); - dev->device_class = tmp >> 8; - - vec_reset_length (f); - f = format (f, "%v/vendor%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "0x%x", &tmp); - dev->vendor_id = tmp; - - vec_reset_length (f); - f = format (f, "%v/device%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "0x%x", &tmp); - dev->device_id = tmp; - - error = init_device (vm, dev, &pdev); - - vec_reset_length (f); - f = format (f, "%v/driver%c", dev_dir_name, 0); - dev->driver_name = vlib_sysfs_link_to_name ((char *) f); - -done: - vec_free (f); - return error; -} - -clib_error_t * -linux_pci_init (vlib_main_t * vm) -{ - vlib_pci_main_t *pm = &pci_main; - clib_error_t *error; - - pm->vlib_main = vm; - - if ((error = vlib_call_init_function (vm, unix_input_init))) - return error; - - ASSERT (sizeof (vlib_pci_addr_t) == sizeof (u32)); - pm->pci_dev_index_by_pci_addr = hash_create (0, sizeof (uword)); - - error = foreach_directory_file ("/sys/bus/pci/devices", scan_device, vm, - /* scan_dirs */ 0); - - /* Complain and continue. might not be root, etc. */ - if (error) - clib_error_report (error); - - return error; -} - -VLIB_INIT_FUNCTION (linux_pci_init); - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vlib/threads_cli.c b/src/vlib/threads_cli.c index 36f8109e..f8d5d8f9 100644 --- a/src/vlib/threads_cli.c +++ b/src/vlib/threads_cli.c @@ -18,6 +18,7 @@ #include #include +#include #include static u8 * diff --git a/src/vlib/unix/physmem.c b/src/vlib/unix/physmem.c deleted file mode 100644 index d5d5d6c8..00000000 --- a/src/vlib/unix/physmem.c +++ /dev/null @@ -1,439 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - * physmem.c: Unix physical memory - * - * Copyright (c) 2008 Eliot Dresselhaus - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#ifndef __NR_memfd_create -#if defined __x86_64__ -#define __NR_memfd_create 319 -#elif defined __arm__ -#define __NR_memfd_create 385 -#elif defined __aarch64__ -#define __NR_memfd_create 279 -#else -#error "__NR_memfd_create unknown for this architecture" -#endif -#endif - -static inline int -memfd_create (const char *name, unsigned int flags) -{ - return syscall (__NR_memfd_create, name, flags); -} - -#ifndef F_LINUX_SPECIFIC_BASE -#define F_LINUX_SPECIFIC_BASE 1024 -#endif -#define MFD_ALLOW_SEALING 0x0002U -#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) -#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) - -#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ -#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ -#define F_SEAL_GROW 0x0004 /* prevent file from growing */ -#define F_SEAL_WRITE 0x0008 /* prevent writes */ - -static void * -unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx, - uword n_bytes, uword alignment) -{ - vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); - uword lo_offset, hi_offset; - uword *to_free = 0; - - if (pr->heap == 0) - return 0; - - /* IO memory is always at least cache aligned. */ - alignment = clib_max (alignment, CLIB_CACHE_LINE_BYTES); - - while (1) - { - mheap_get_aligned (pr->heap, n_bytes, - /* align */ alignment, - /* align offset */ 0, - &lo_offset); - - /* Allocation failed? */ - if (lo_offset == ~0) - break; - - if (pr->flags & VLIB_PHYSMEM_F_FAKE) - break; - - /* Make sure allocation does not span DMA physical chunk boundary. */ - hi_offset = lo_offset + n_bytes - 1; - - if ((lo_offset >> pr->log2_page_size) == - (hi_offset >> pr->log2_page_size)) - break; - - /* Allocation would span chunk boundary, queue it to be freed as soon as - we find suitable chunk. */ - vec_add1 (to_free, lo_offset); - } - - if (to_free != 0) - { - uword i; - for (i = 0; i < vec_len (to_free); i++) - mheap_put (pr->heap, to_free[i]); - vec_free (to_free); - } - - return lo_offset != ~0 ? pr->heap + lo_offset : 0; -} - -static void -unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x) -{ - vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); - /* Return object to region's heap. */ - mheap_put (pr->heap, x - pr->heap); -} - -static u64 -get_page_paddr (int fd, uword addr) -{ - int pagesize = sysconf (_SC_PAGESIZE); - u64 seek, pagemap = 0; - - seek = ((u64) addr / pagesize) * sizeof (u64); - if (lseek (fd, seek, SEEK_SET) != seek) - { - clib_unix_warning ("lseek to 0x%llx", seek); - return 0; - } - if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap))) - { - clib_unix_warning ("read ptbits"); - return 0; - } - if ((pagemap & (1ULL << 63)) == 0) - return 0; - - pagemap &= pow2_mask (55); - - return pagemap * pagesize; -} - -static clib_error_t * -unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, - u8 numa_node, u32 flags, - vlib_physmem_region_index_t * idx) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - vlib_physmem_region_t *pr; - clib_error_t *error = 0; - int pagemap_fd = -1; - u8 *mount_dir = 0; - u8 *filename = 0; - struct stat st; - int old_mpol; - int mmap_flags; - struct bitmask *old_mask = numa_allocate_nodemask (); - - if (geteuid () != 0 && (flags & VLIB_PHYSMEM_F_FAKE) == 0) - return clib_error_return (0, "not allowed"); - - pool_get (vpm->regions, pr); - - if ((pr - vpm->regions) >= 256) - { - error = clib_error_return (0, "maximum number of regions reached"); - goto error; - } - - pr->index = pr - vpm->regions; - pr->fd = -1; - pr->flags = flags; - - if (get_mempolicy (&old_mpol, old_mask->maskp, old_mask->size + 1, NULL, 0) - == -1) - { - error = clib_error_return_unix (0, "get_mempolicy"); - goto error; - } - - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - if ((pagemap_fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1) - { - error = clib_error_return_unix (0, "open '/proc/self/pagemap'"); - goto error; - } - - mount_dir = format (0, "%s/physmem_region%d%c", - vlib_unix_get_runtime_dir (), pr->index, 0); - filename = format (0, "%s/mem%c", mount_dir, 0); - - unlink ((char *) mount_dir); - - error = vlib_unix_recursive_mkdir ((char *) mount_dir); - if (error) - goto error; - - if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL)) - { - error = clib_error_return_unix (0, "mount hugetlb directory '%s'", - mount_dir); - goto error; - } - - if ((pr->fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1) - { - error = clib_error_return_unix (0, "open"); - goto error; - } - - mmap_flags = MAP_SHARED | MAP_HUGETLB | MAP_LOCKED; - } - else - { - if ((pr->fd = memfd_create (name, MFD_ALLOW_SEALING)) == -1) - return clib_error_return_unix (0, "memfd_create"); - - if ((fcntl (pr->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1) - { - error = - clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)"); - goto error; - } - mmap_flags = MAP_SHARED; - } - - if (fstat (pr->fd, &st)) - { - error = clib_error_return_unix (0, "fstat"); - goto error; - } - - pr->log2_page_size = min_log2 (st.st_blksize); - pr->n_pages = ((size - 1) >> pr->log2_page_size) + 1; - size = pr->n_pages * (1 << pr->log2_page_size); - - if ((ftruncate (pr->fd, size)) == -1) - { - error = clib_error_return_unix (0, "ftruncate length: %d", size); - goto error; - } - - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - error = vlib_sysfs_prealloc_hugepages (numa_node, - 1 << (pr->log2_page_size - 10), - pr->n_pages); - if (error) - goto error; - } - - numa_set_preferred (numa_node); - - pr->mem = mmap (0, size, (PROT_READ | PROT_WRITE), mmap_flags, pr->fd, 0); - - if (pr->mem == MAP_FAILED) - { - pr->mem = 0; - error = clib_error_return_unix (0, "mmap"); - goto error; - } - - if (set_mempolicy (old_mpol, old_mask->maskp, old_mask->size + 1) == -1) - { - error = clib_error_return_unix (0, "set_mempolicy"); - goto error; - } - - pr->size = pr->n_pages << pr->log2_page_size; - pr->page_mask = (1 << pr->log2_page_size) - 1; - pr->numa_node = numa_node; - pr->name = format (0, "%s", name); - - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - int i; - for (i = 0; i < pr->n_pages; i++) - { - void *ptr = pr->mem + (i << pr->log2_page_size); - int node; - move_pages (0, 1, &ptr, 0, &node, 0); - if (numa_node != node) - { - clib_warning - ("physmem page for region \'%s\' allocated on the wrong" - " numa node (requested %u actual %u)", pr->name, - pr->numa_node, node, i); - break; - } - } - } - - if (flags & VLIB_PHYSMEM_F_INIT_MHEAP) - { - pr->heap = mheap_alloc_with_flags (pr->mem, pr->size, - /* Don't want mheap mmap/munmap with IO memory. */ - MHEAP_FLAG_DISABLE_VM | - MHEAP_FLAG_THREAD_SAFE); - fformat (stdout, "%U", format_mheap, pr->heap, /* verbose */ 1); - } - - if (flags & VLIB_PHYSMEM_F_HAVE_BUFFERS) - { - vlib_buffer_add_mem_range (vm, pointer_to_uword (pr->mem), pr->size); - } - - *idx = pr->index; - - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - int i; - for (i = 0; i < pr->n_pages; i++) - { - uword vaddr = - pointer_to_uword (pr->mem) + (((u64) i) << pr->log2_page_size); - u64 page_paddr = get_page_paddr (pagemap_fd, vaddr); - vec_add1 (pr->page_table, page_paddr); - } - } - - goto done; - -error: - if (pr->fd > -1) - close (pr->fd); - - if (pr->mem) - munmap (pr->mem, size); - - memset (pr, 0, sizeof (*pr)); - pool_put (vpm->regions, pr); - -done: - if (mount_dir) - { - umount2 ((char *) mount_dir, MNT_DETACH); - rmdir ((char *) mount_dir); - vec_free (mount_dir); - } - numa_free_cpumask (old_mask); - vec_free (filename); - if (pagemap_fd > -1) - close (pagemap_fd); - return error; -} - -static void -unix_physmem_region_free (vlib_main_t * vm, vlib_physmem_region_index_t idx) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - vlib_physmem_region_t *pr = vlib_physmem_get_region (vm, idx); - - if (pr->fd > 0) - close (pr->fd); - munmap (pr->mem, pr->size); - vec_free (pr->name); - pool_put (vpm->regions, pr); -} - -clib_error_t * -unix_physmem_init (vlib_main_t * vm) -{ - clib_error_t *error = 0; - - /* Avoid multiple calls. */ - if (vm->os_physmem_alloc_aligned) - return error; - - vm->os_physmem_alloc_aligned = unix_physmem_alloc_aligned; - vm->os_physmem_free = unix_physmem_free; - vm->os_physmem_region_alloc = unix_physmem_region_alloc; - vm->os_physmem_region_free = unix_physmem_region_free; - - return error; -} - -static clib_error_t * -show_physmem (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - vlib_physmem_main_t *vpm = &vm->physmem_main; - vlib_physmem_region_t *pr; - - /* *INDENT-OFF* */ - pool_foreach (pr, vpm->regions, ( - { - vlib_cli_output (vm, "index %u name '%s' page-size %uKB num-pages %d " - "numa-node %u fd %d\n", - pr->index, pr->name, (1 << (pr->log2_page_size -10)), - pr->n_pages, pr->numa_node, pr->fd); - if (pr->heap) - vlib_cli_output (vm, " %U", format_mheap, pr->heap, /* verbose */ 1); - else - vlib_cli_output (vm, " no heap\n"); - })); - /* *INDENT-ON* */ - return 0; -} - -/* *INDENT-OFF* */ -VLIB_CLI_COMMAND (show_physmem_command, static) = { - .path = "show physmem", - .short_help = "Show physical memory allocation", - .function = show_physmem, -}; -/* *INDENT-ON* */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h index b5a33427..1b0d8b9d 100644 --- a/src/vlib/unix/unix.h +++ b/src/vlib/unix/unix.h @@ -217,23 +217,6 @@ extern u8 **vlib_thread_stacks; /* utils */ -clib_error_t *vlib_sysfs_write (char *file_name, char *fmt, ...); - -clib_error_t *vlib_sysfs_read (char *file_name, char *fmt, ...); - -u8 *vlib_sysfs_link_to_name (char *link); - -clib_error_t *vlib_sysfs_set_nr_hugepages (unsigned int numa_node, - int page_size, int nr); -clib_error_t *vlib_sysfs_get_nr_hugepages (unsigned int numa_node, - int page_size, int *v); -clib_error_t *vlib_sysfs_get_free_hugepages (unsigned int numa_node, - int page_size, int *v); -clib_error_t *vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, - int page_size, int *v); -clib_error_t *vlib_sysfs_prealloc_hugepages (unsigned int numa_node, - int page_size, int nr); - clib_error_t *foreach_directory_file (char *dir_name, clib_error_t * (*f) (void *arg, u8 * path_name, diff --git a/src/vlib/unix/util.c b/src/vlib/unix/util.c index 0e252aca..5472751e 100644 --- a/src/vlib/unix/util.c +++ b/src/vlib/unix/util.c @@ -98,225 +98,6 @@ foreach_directory_file (char *dir_name, return error; } -clib_error_t * -vlib_sysfs_write (char *file_name, char *fmt, ...) -{ - u8 *s; - int fd; - clib_error_t *error = 0; - - fd = open (file_name, O_WRONLY); - if (fd < 0) - return clib_error_return_unix (0, "open `%s'", file_name); - - va_list va; - va_start (va, fmt); - s = va_format (0, fmt, &va); - va_end (va); - - if (write (fd, s, vec_len (s)) < 0) - error = clib_error_return_unix (0, "write `%s'", file_name); - - vec_free (s); - close (fd); - return error; -} - -clib_error_t * -vlib_sysfs_read (char *file_name, char *fmt, ...) -{ - unformat_input_t input; - u8 *s = 0; - int fd; - ssize_t sz; - uword result; - - fd = open (file_name, O_RDONLY); - if (fd < 0) - return clib_error_return_unix (0, "open `%s'", file_name); - - vec_validate (s, 4095); - - sz = read (fd, s, vec_len (s)); - if (sz < 0) - { - close (fd); - vec_free (s); - return clib_error_return_unix (0, "read `%s'", file_name); - } - - _vec_len (s) = sz; - unformat_init_vector (&input, s); - - va_list va; - va_start (va, fmt); - result = va_unformat (&input, fmt, &va); - va_end (va); - - vec_free (s); - close (fd); - - if (result == 0) - return clib_error_return (0, "unformat error"); - - return 0; -} - -u8 * -vlib_sysfs_link_to_name (char *link) -{ - char *p, buffer[64]; - unformat_input_t in; - u8 *s = 0; - int r; - - r = readlink (link, buffer, sizeof (buffer) - 1); - - if (r < 0) - return 0; - - buffer[r] = 0; - p = strrchr (buffer, '/'); - - if (!p) - return 0; - - unformat_init_string (&in, p + 1, strlen (p + 1)); - if (unformat (&in, "%s", &s) != 1) - clib_unix_warning ("no string?"); - unformat_free (&in); - - return s; -} - -clib_error_t * -vlib_sysfs_set_nr_hugepages (unsigned int numa_node, int page_size, int nr) -{ - clib_error_t *error = 0; - struct stat sb; - u8 *p = 0; - - p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); - - if (stat ((char *) p, &sb) == 0) - { - if (S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' is not directory", p); - goto done; - } - } - else if (numa_node == 0) - { - vec_reset_length (p); - p = format (p, "/sys/kernel/mm%c", 0); - if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' does not exist or it is not " - "directory", p); - goto done; - } - } - else - { - error = clib_error_return (0, "'%s' does not exist", p); - goto done; - } - - _vec_len (p) -= 1; - p = format (p, "/hugepages/hugepages-%ukB/nr_hugepages%c", page_size, 0); - vlib_sysfs_write ((char *) p, "%d", nr); - -done: - vec_free (p); - return error; -} - - -static clib_error_t * -vlib_sysfs_get_xxx_hugepages (char *type, unsigned int numa_node, - int page_size, int *val) -{ - clib_error_t *error = 0; - struct stat sb; - u8 *p = 0; - - p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); - - if (stat ((char *) p, &sb) == 0) - { - if (S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' is not directory", p); - goto done; - } - } - else if (numa_node == 0) - { - vec_reset_length (p); - p = format (p, "/sys/kernel/mm%c", 0); - if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' does not exist or it is not " - "directory", p); - goto done; - } - } - else - { - error = clib_error_return (0, "'%s' does not exist", p); - goto done; - } - - _vec_len (p) -= 1; - p = format (p, "/hugepages/hugepages-%ukB/%s_hugepages%c", page_size, - type, 0); - error = vlib_sysfs_read ((char *) p, "%d", val); - -done: - vec_free (p); - return error; -} - -clib_error_t * -vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size, int *v) -{ - return vlib_sysfs_get_xxx_hugepages ("free", numa_node, page_size, v); -} - -clib_error_t * -vlib_sysfs_get_nr_hugepages (unsigned int numa_node, int page_size, int *v) -{ - return vlib_sysfs_get_xxx_hugepages ("nr", numa_node, page_size, v); -} - -clib_error_t * -vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, int page_size, - int *v) -{ - return vlib_sysfs_get_xxx_hugepages ("surplus", numa_node, page_size, v); -} - -clib_error_t * -vlib_sysfs_prealloc_hugepages (unsigned int numa_node, int page_size, int nr) -{ - clib_error_t *error = 0; - int n, needed; - error = vlib_sysfs_get_free_hugepages (numa_node, page_size, &n); - if (error) - return error; - needed = nr - n; - if (needed <= 0) - return 0; - - error = vlib_sysfs_get_nr_hugepages (numa_node, page_size, &n); - if (error) - return error; - clib_warning ("pre-allocating %u additional %uK hugepages on numa node %u", - needed, page_size, numa_node); - return vlib_sysfs_set_nr_hugepages (numa_node, page_size, n + needed); -} - clib_error_t * vlib_unix_recursive_mkdir (char *path) { diff --git a/src/vnet/devices/af_packet/af_packet.c b/src/vnet/devices/af_packet/af_packet.c index ea52878d..e7e69214 100644 --- a/src/vnet/devices/af_packet/af_packet.c +++ b/src/vnet/devices/af_packet/af_packet.c @@ -26,6 +26,7 @@ #include #include +#include #include #include -- cgit 1.2.3-korg From 816122e303efb5012f6897bd53ff8fe28cd8fa1c Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Fri, 8 Sep 2017 19:21:00 +0200 Subject: physmem: make fake allocation non-fatal if we cannot pick numa node Change-Id: I563c043ed82e3ef199fc3d47931108f31cc01728 Signed-off-by: Damjan Marion --- src/vlib/linux/physmem.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/linux/physmem.c b/src/vlib/linux/physmem.c index 6731295c..fddff2ea 100644 --- a/src/vlib/linux/physmem.c +++ b/src/vlib/linux/physmem.c @@ -170,8 +170,13 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, if (get_mempolicy (&old_mpol, old_mask->maskp, old_mask->size + 1, NULL, 0) == -1) { - error = clib_error_return_unix (0, "get_mempolicy"); - goto error; + if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) + { + error = clib_error_return_unix (0, "get_mempolicy"); + goto error; + } + else + old_mpol = -1; } if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) @@ -246,7 +251,8 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, goto error; } - numa_set_preferred (numa_node); + if (old_mpol != -1) + numa_set_preferred (numa_node); pr->mem = mmap (0, size, (PROT_READ | PROT_WRITE), mmap_flags, pr->fd, 0); @@ -257,7 +263,8 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, goto error; } - if (set_mempolicy (old_mpol, old_mask->maskp, old_mask->size + 1) == -1) + if (old_mpol != -1 && + set_mempolicy (old_mpol, old_mask->maskp, old_mask->size + 1) == -1) { error = clib_error_return_unix (0, "set_mempolicy"); goto error; -- cgit 1.2.3-korg From 56dd5438b04b869065d8e901c315496bb6777455 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Fri, 8 Sep 2017 19:52:02 +0200 Subject: move unix_file_* code to vppinfra This will allow us to use this code in client libraries without vlib. Change-Id: I8557b752496841ba588aa36b6082cbe2cd1867fe Signed-off-by: Damjan Marion --- src/plugins/memif/memif.c | 36 ++++----- src/plugins/memif/private.h | 38 +++++----- src/plugins/memif/socket.c | 56 +++++++------- src/vlib/linux/pci.c | 13 ++-- src/vlib/unix/cli.c | 85 +++++++++++---------- src/vlib/unix/input.c | 13 ++-- src/vlib/unix/main.c | 1 + src/vlib/unix/mc_socket.c | 69 ++++++++--------- src/vlib/unix/mc_socket.h | 2 +- src/vlib/unix/unix.h | 76 +------------------ src/vlibapi/api_common.h | 2 +- src/vlibsocket/api.h | 12 +-- src/vlibsocket/sockclnt_vlib.c | 12 +-- src/vlibsocket/socksvr_vlib.c | 92 +++++++++++----------- src/vnet/devices/af_packet/af_packet.c | 12 +-- src/vnet/devices/af_packet/af_packet.h | 2 +- src/vnet/devices/netmap/netmap.c | 14 ++-- src/vnet/devices/netmap/netmap.h | 2 +- src/vnet/devices/virtio/vhost-user.c | 62 +++++++-------- src/vnet/devices/virtio/vhost-user.h | 2 +- src/vnet/ip/punt.c | 8 +- src/vnet/ip/punt.h | 2 +- src/vnet/unix/tapcli.c | 19 ++--- src/vnet/unix/tuntap.c | 10 +-- src/vppinfra.am | 1 + src/vppinfra/file.h | 134 +++++++++++++++++++++++++++++++++ 26 files changed, 423 insertions(+), 352 deletions(-) create mode 100644 src/vppinfra/file.h (limited to 'src/vlib') diff --git a/src/plugins/memif/memif.c b/src/plugins/memif/memif.c index 4c387b92..8fec409a 100644 --- a/src/plugins/memif/memif.c +++ b/src/plugins/memif/memif.c @@ -52,10 +52,10 @@ memif_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, u32 flags) static void memif_queue_intfd_close (memif_queue_t * mq) { - if (mq->int_unix_file_index != ~0) + if (mq->int_clib_file_index != ~0) { - memif_file_del_by_index (mq->int_unix_file_index); - mq->int_unix_file_index = ~0; + memif_file_del_by_index (mq->int_clib_file_index); + mq->int_clib_file_index = ~0; mq->int_fd = -1; } else if (mq->int_fd > -1) @@ -94,13 +94,13 @@ memif_disconnect (memif_if_t * mif, clib_error_t * err) vnet_hw_interface_set_flags (vnm, mif->hw_if_index, 0); /* close connection socket */ - if (mif->conn_unix_file_index != ~0) + if (mif->conn_clib_file_index != ~0) { memif_socket_file_t *msf = vec_elt_at_index (mm->socket_files, mif->socket_file_index); hash_unset (msf->dev_instance_by_fd, mif->conn_fd); - memif_file_del_by_index (mif->conn_unix_file_index); - mif->conn_unix_file_index = ~0; + memif_file_del_by_index (mif->conn_clib_file_index); + mif->conn_clib_file_index = ~0; } else if (mif->conn_fd > -1) close (mif->conn_fd); @@ -145,7 +145,7 @@ memif_disconnect (memif_if_t * mif, clib_error_t * err) } static clib_error_t * -memif_int_fd_read_ready (unix_file_t * uf) +memif_int_fd_read_ready (clib_file_t * uf) { memif_main_t *mm = &memif_main; vnet_main_t *vnm = vnet_get_main (); @@ -173,7 +173,7 @@ clib_error_t * memif_connect (memif_if_t * mif) { vnet_main_t *vnm = vnet_get_main (); - unix_file_t template = { 0 }; + clib_file_t template = { 0 }; memif_region_t *mr; int i; @@ -219,7 +219,7 @@ memif_connect (memif_if_t * mif) { template.file_descriptor = mq->int_fd; template.private_data = (mif->dev_instance << 16) | (i & 0xFFFF); - memif_file_add (&mq->int_unix_file_index, &template); + memif_file_add (&mq->int_clib_file_index, &template); } vnet_hw_interface_assign_rx_thread (vnm, mif->hw_if_index, i, ~0); rv = vnet_hw_interface_set_rx_mode (vnm, mif->hw_if_index, i, @@ -330,7 +330,7 @@ memif_init_regions_and_queues (memif_if_t * mif) memif_queue_t *mq = vec_elt_at_index (mif->tx_queues, i); if ((mq->int_fd = eventfd (0, EFD_NONBLOCK)) < 0) return clib_error_return_unix (0, "eventfd[tx queue %u]", i); - mq->int_unix_file_index = ~0; + mq->int_clib_file_index = ~0; mq->ring = memif_get_ring (mif, MEMIF_RING_S2M, i); mq->log2_ring_size = mif->cfg.log2_ring_size; mq->region = 0; @@ -346,7 +346,7 @@ memif_init_regions_and_queues (memif_if_t * mif) memif_queue_t *mq = vec_elt_at_index (mif->rx_queues, i); if ((mq->int_fd = eventfd (0, EFD_NONBLOCK)) < 0) return clib_error_return_unix (0, "eventfd[rx queue %u]", i); - mq->int_unix_file_index = ~0; + mq->int_clib_file_index = ~0; mq->ring = memif_get_ring (mif, MEMIF_RING_M2S, i); mq->log2_ring_size = mif->cfg.log2_ring_size; mq->region = 0; @@ -432,7 +432,7 @@ memif_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) (sockfd, (struct sockaddr *) &sun, sizeof (struct sockaddr_un)) == 0) { - unix_file_t t = { 0 }; + clib_file_t t = { 0 }; mif->conn_fd = sockfd; t.read_function = memif_slave_conn_fd_read_ready; @@ -440,7 +440,7 @@ memif_process (vlib_main_t * vm, vlib_node_runtime_t * rt, vlib_frame_t * f) t.error_function = memif_slave_conn_fd_error; t.file_descriptor = mif->conn_fd; t.private_data = mif->dev_instance; - memif_file_add (&mif->conn_unix_file_index, &t); + memif_file_add (&mif->conn_clib_file_index, &t); hash_set (msf->dev_instance_by_fd, mif->conn_fd, mif->dev_instance); mif->flags |= MEMIF_IF_FLAG_CONNECTING; @@ -507,7 +507,7 @@ memif_delete_if (vlib_main_t * vm, memif_if_t * mif) if (msf->is_listener) { uword *x; - memif_file_del_by_index (msf->unix_file_index); + memif_file_del_by_index (msf->clib_file_index); vec_foreach (x, msf->pending_file_indices) { memif_file_del_by_index (*x); @@ -639,7 +639,7 @@ memif_create_if (vlib_main_t * vm, memif_create_if_args_t * args) mif->socket_file_index = msf - mm->socket_files; mif->id = args->id; mif->sw_if_index = mif->hw_if_index = mif->per_interface_next_index = ~0; - mif->conn_unix_file_index = ~0; + mif->conn_clib_file_index = ~0; mif->conn_fd = -1; mif->mode = args->mode; if (args->secret) @@ -737,12 +737,12 @@ memif_create_if (vlib_main_t * vm, memif_create_if_args_t * args) goto error; } - msf->unix_file_index = ~0; - unix_file_t template = { 0 }; + msf->clib_file_index = ~0; + clib_file_t template = { 0 }; template.read_function = memif_conn_fd_accept_ready; template.file_descriptor = msf->fd; template.private_data = mif->socket_file_index; - memif_file_add (&msf->unix_file_index, &template); + memif_file_add (&msf->clib_file_index, &template); } msf->ref_cnt++; diff --git a/src/plugins/memif/private.h b/src/plugins/memif/private.h index b5f2f8ff..912ec59a 100644 --- a/src/plugins/memif/private.h +++ b/src/plugins/memif/private.h @@ -41,34 +41,34 @@ #if MEMIF_DEBUG == 1 #define memif_file_add(a, b) do { \ ASSERT (*a == ~0); \ - *a = unix_file_add (&unix_main, b); \ - clib_warning ("unix_file_add fd %d private_data %u idx %u", \ + *a = clib_file_add (&file_main, b); \ + clib_warning ("clib_file_add fd %d private_data %u idx %u", \ (b)->file_descriptor, (b)->private_data, *a); \ } while (0) #define memif_file_del(a) do { \ - clib_warning ("unix_file_del idx %u",a - unix_main.file_pool); \ - unix_file_del (&unix_main, a); \ + clib_warning ("clib_file_del idx %u",a - file_main.file_pool); \ + clib_file_del (&file_main, a); \ } while (0) #define memif_file_del_by_index(a) do { \ - clib_warning ("unix_file_del idx %u", a); \ - unix_file_del_by_index (&unix_main, a); \ + clib_warning ("clib_file_del idx %u", a); \ + clib_file_del_by_index (&file_main, a); \ } while (0) #else #define memif_file_add(a, b) do { \ ASSERT (*a == ~0); \ - *a = unix_file_add (&unix_main, b); \ + *a = clib_file_add (&file_main, b); \ } while (0) -#define memif_file_del(a) unix_file_del(&unix_main, a) -#define memif_file_del_by_index(a) unix_file_del_by_index(&unix_main, a) +#define memif_file_del(a) clib_file_del(&file_main, a) +#define memif_file_del_by_index(a) clib_file_del_by_index(&file_main, a) #endif typedef struct { u8 *filename; int fd; - uword unix_file_index; + uword clib_file_index; uword *pending_file_indices; int ref_cnt; int is_listener; @@ -106,7 +106,7 @@ typedef struct /* interrupts */ int int_fd; - uword int_unix_file_index; + uword int_clib_file_index; u64 int_count; } memif_queue_t; @@ -140,7 +140,7 @@ typedef struct /* socket connection */ uword socket_file_index; int conn_fd; - uword conn_unix_file_index; + uword conn_clib_file_index; memif_msg_fifo_elt_t *msg_queue; u8 *secret; @@ -241,13 +241,13 @@ clib_error_t *memif_connect (memif_if_t * mif); void memif_disconnect (memif_if_t * mif, clib_error_t * err); /* socket.c */ -clib_error_t *memif_conn_fd_accept_ready (unix_file_t * uf); -clib_error_t *memif_master_conn_fd_read_ready (unix_file_t * uf); -clib_error_t *memif_slave_conn_fd_read_ready (unix_file_t * uf); -clib_error_t *memif_master_conn_fd_write_ready (unix_file_t * uf); -clib_error_t *memif_slave_conn_fd_write_ready (unix_file_t * uf); -clib_error_t *memif_master_conn_fd_error (unix_file_t * uf); -clib_error_t *memif_slave_conn_fd_error (unix_file_t * uf); +clib_error_t *memif_conn_fd_accept_ready (clib_file_t * uf); +clib_error_t *memif_master_conn_fd_read_ready (clib_file_t * uf); +clib_error_t *memif_slave_conn_fd_read_ready (clib_file_t * uf); +clib_error_t *memif_master_conn_fd_write_ready (clib_file_t * uf); +clib_error_t *memif_slave_conn_fd_write_ready (clib_file_t * uf); +clib_error_t *memif_master_conn_fd_error (clib_file_t * uf); +clib_error_t *memif_slave_conn_fd_error (clib_file_t * uf); clib_error_t *memif_msg_send_disconnect (memif_if_t * mif, clib_error_t * err); u8 *format_memif_device_name (u8 * s, va_list * args); diff --git a/src/plugins/memif/socket.c b/src/plugins/memif/socket.c index 79ae07be..1abc0f11 100644 --- a/src/plugins/memif/socket.c +++ b/src/plugins/memif/socket.c @@ -246,7 +246,7 @@ memif_msg_receive_hello (memif_if_t * mif, memif_msg_t * msg) static clib_error_t * memif_msg_receive_init (memif_if_t ** mifp, memif_msg_t * msg, - unix_file_t * uf) + clib_file_t * uf) { memif_main_t *mm = &memif_main; memif_socket_file_t *msf = @@ -258,7 +258,7 @@ memif_msg_receive_init (memif_if_t ** mifp, memif_msg_t * msg, if (i->version != MEMIF_VERSION) { - memif_file_del_by_index (uf - unix_main.file_pool); + memif_file_del_by_index (uf - file_main.file_pool); return clib_error_return (0, "unsupported version"); } @@ -291,7 +291,7 @@ memif_msg_receive_init (memif_if_t ** mifp, memif_msg_t * msg, } mif->conn_fd = uf->file_descriptor; - mif->conn_unix_file_index = uf - unix_main.file_pool; + mif->conn_clib_file_index = uf - file_main.file_pool; hash_set (msf->dev_instance_by_fd, mif->conn_fd, mif->dev_instance); mif->remote_name = memif_str2vec (i->name, sizeof (i->name)); *mifp = mif; @@ -316,7 +316,7 @@ memif_msg_receive_init (memif_if_t ** mifp, memif_msg_t * msg, error: tmp.conn_fd = uf->file_descriptor; memif_msg_send_disconnect (&tmp, err); - memif_file_del_by_index (uf - unix_main.file_pool); + memif_file_del_by_index (uf - file_main.file_pool); return err; } @@ -377,7 +377,7 @@ memif_msg_receive_add_ring (memif_if_t * mif, memif_msg_t * msg, int fd) } mq->int_fd = fd; - mq->int_unix_file_index = ~0; + mq->int_clib_file_index = ~0; mq->log2_ring_size = ar->log2_ring_size; mq->region = ar->region; mq->offset = ar->offset; @@ -422,7 +422,7 @@ memif_msg_receive_disconnect (memif_if_t * mif, memif_msg_t * msg) } static clib_error_t * -memif_msg_receive (memif_if_t ** mifp, unix_file_t * uf) +memif_msg_receive (memif_if_t ** mifp, clib_file_t * uf) { char ctl[CMSG_SPACE (sizeof (int)) + CMSG_SPACE (sizeof (struct ucred))] = { 0 }; @@ -544,20 +544,21 @@ memif_msg_receive (memif_if_t ** mifp, unix_file_t * uf) return err; } - if (clib_fifo_elts (mif->msg_queue) && mif->conn_unix_file_index != ~0) - unix_file_set_data_available_to_write (mif->conn_unix_file_index, 1); + if (clib_fifo_elts (mif->msg_queue) && mif->conn_clib_file_index != ~0) + clib_file_set_data_available_to_write (&file_main, + mif->conn_clib_file_index, 1); return 0; } clib_error_t * -memif_master_conn_fd_read_ready (unix_file_t * uf) +memif_master_conn_fd_read_ready (clib_file_t * uf) { memif_main_t *mm = &memif_main; memif_socket_file_t *msf = pool_elt_at_index (mm->socket_files, uf->private_data); uword *p; memif_if_t *mif = 0; - uword conn_unix_file_index = ~0; + uword conn_clib_file_index = ~0; clib_error_t *err = 0; p = hash_get (msf->dev_instance_by_fd, uf->file_descriptor); @@ -570,13 +571,13 @@ memif_master_conn_fd_read_ready (unix_file_t * uf) /* This is new connection, remove index from pending vector */ int i; vec_foreach_index (i, msf->pending_file_indices) - if (msf->pending_file_indices[i] == uf - unix_main.file_pool) + if (msf->pending_file_indices[i] == uf - file_main.file_pool) { - conn_unix_file_index = msf->pending_file_indices[i]; + conn_clib_file_index = msf->pending_file_indices[i]; vec_del1 (msf->pending_file_indices, i); break; } - ASSERT (conn_unix_file_index != ~0); + ASSERT (conn_clib_file_index != ~0); } err = memif_msg_receive (&mif, uf); if (err) @@ -588,7 +589,7 @@ memif_master_conn_fd_read_ready (unix_file_t * uf) } clib_error_t * -memif_slave_conn_fd_read_ready (unix_file_t * uf) +memif_slave_conn_fd_read_ready (clib_file_t * uf) { memif_main_t *mm = &memif_main; clib_error_t *err; @@ -603,17 +604,18 @@ memif_slave_conn_fd_read_ready (unix_file_t * uf) } static clib_error_t * -memif_conn_fd_write_ready (unix_file_t * uf, memif_if_t * mif) +memif_conn_fd_write_ready (clib_file_t * uf, memif_if_t * mif) { memif_msg_fifo_elt_t *e; clib_fifo_sub2 (mif->msg_queue, e); - unix_file_set_data_available_to_write (mif->conn_unix_file_index, 0); + clib_file_set_data_available_to_write (&file_main, + mif->conn_clib_file_index, 0); memif_msg_send (mif->conn_fd, &e->msg, e->fd); return 0; } clib_error_t * -memif_master_conn_fd_write_ready (unix_file_t * uf) +memif_master_conn_fd_write_ready (clib_file_t * uf) { memif_main_t *mm = &memif_main; memif_socket_file_t *msf = @@ -630,7 +632,7 @@ memif_master_conn_fd_write_ready (unix_file_t * uf) } clib_error_t * -memif_slave_conn_fd_write_ready (unix_file_t * uf) +memif_slave_conn_fd_write_ready (clib_file_t * uf) { memif_main_t *mm = &memif_main; memif_if_t *mif = vec_elt_at_index (mm->interfaces, uf->private_data); @@ -638,7 +640,7 @@ memif_slave_conn_fd_write_ready (unix_file_t * uf) } clib_error_t * -memif_slave_conn_fd_error (unix_file_t * uf) +memif_slave_conn_fd_error (clib_file_t * uf) { memif_main_t *mm = &memif_main; memif_if_t *mif = vec_elt_at_index (mm->interfaces, uf->private_data); @@ -652,7 +654,7 @@ memif_slave_conn_fd_error (unix_file_t * uf) } clib_error_t * -memif_master_conn_fd_error (unix_file_t * uf) +memif_master_conn_fd_error (clib_file_t * uf) { memif_main_t *mm = &memif_main; memif_socket_file_t *msf = @@ -674,7 +676,7 @@ memif_master_conn_fd_error (unix_file_t * uf) { int i; vec_foreach_index (i, msf->pending_file_indices) - if (msf->pending_file_indices[i] == uf - unix_main.file_pool) + if (msf->pending_file_indices[i] == uf - file_main.file_pool) { vec_del1 (msf->pending_file_indices, i); memif_file_del (uf); @@ -689,7 +691,7 @@ memif_master_conn_fd_error (unix_file_t * uf) clib_error_t * -memif_conn_fd_accept_ready (unix_file_t * uf) +memif_conn_fd_accept_ready (clib_file_t * uf) { memif_main_t *mm = &memif_main; memif_socket_file_t *msf = @@ -697,8 +699,8 @@ memif_conn_fd_accept_ready (unix_file_t * uf) int addr_len; struct sockaddr_un client; int conn_fd; - unix_file_t template = { 0 }; - uword unix_file_index = ~0; + clib_file_t template = { 0 }; + uword clib_file_index = ~0; clib_error_t *err; @@ -715,16 +717,16 @@ memif_conn_fd_accept_ready (unix_file_t * uf) template.file_descriptor = conn_fd; template.private_data = uf->private_data; - memif_file_add (&unix_file_index, &template); + memif_file_add (&clib_file_index, &template); err = memif_msg_enq_hello (conn_fd); if (err) { clib_error_report (err); - memif_file_del_by_index (unix_file_index); + memif_file_del_by_index (clib_file_index); } else - vec_add1 (msf->pending_file_indices, unix_file_index); + vec_add1 (msf->pending_file_indices, clib_file_index); return 0; } diff --git a/src/vlib/linux/pci.c b/src/vlib/linux/pci.c index cd2affdc..4ce19190 100644 --- a/src/vlib/linux/pci.c +++ b/src/vlib/linux/pci.c @@ -68,8 +68,8 @@ typedef struct /* Minor device for uio device. */ u32 uio_minor; - /* Index given by unix_file_add. */ - u32 unix_file_index; + /* Index given by clib_file_add. */ + u32 clib_file_index; } linux_pci_device_t; @@ -237,7 +237,7 @@ scan_uio_dir (void *arg, u8 * path_name, u8 * file_name) } static clib_error_t * -linux_pci_uio_read_ready (unix_file_t * uf) +linux_pci_uio_read_ready (clib_file_t * uf) { vlib_pci_main_t *pm = &pci_main; vlib_pci_device_t *d; @@ -257,7 +257,7 @@ linux_pci_uio_read_ready (unix_file_t * uf) } static clib_error_t * -linux_pci_uio_error_ready (unix_file_t * uf) +linux_pci_uio_error_ready (clib_file_t * uf) { u32 error_index = (u32) uf->private_data; @@ -294,15 +294,14 @@ add_device (vlib_pci_device_t * dev, linux_pci_device_t * pdev) } { - unix_file_t template = { 0 }; - unix_main_t *um = &unix_main; + clib_file_t template = { 0 }; template.read_function = linux_pci_uio_read_ready; template.file_descriptor = l->uio_fd; template.error_function = linux_pci_uio_error_ready; template.private_data = dev - pm->pci_devs; - l->unix_file_index = unix_file_add (um, &template); + l->clib_file_index = clib_file_add (&file_main, &template); } } diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c index 068a4e16..39368823 100644 --- a/src/vlib/unix/cli.c +++ b/src/vlib/unix/cli.c @@ -141,7 +141,7 @@ typedef struct typedef struct { /** The file index held by unix.c */ - u32 unix_file_index; + u32 clib_file_index; /** Vector of output pending write to file descriptor. */ u8 *output_vector; @@ -502,11 +502,11 @@ unix_cli_match_action (unix_cli_parse_actions_t * a, * are available to be sent. */ static void -unix_cli_add_pending_output (unix_file_t * uf, +unix_cli_add_pending_output (clib_file_t * uf, unix_cli_file_t * cf, u8 * buffer, uword buffer_bytes) { - unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; vec_add (cf->output_vector, buffer, buffer_bytes); if (vec_len (cf->output_vector) > 0) @@ -514,7 +514,7 @@ unix_cli_add_pending_output (unix_file_t * uf, int skip_update = 0 != (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); uf->flags |= UNIX_FILE_DATA_AVAILABLE_TO_WRITE; if (!skip_update) - um->file_update (uf, UNIX_FILE_UPDATE_MODIFY); + fm->file_update (uf, UNIX_FILE_UPDATE_MODIFY); } } @@ -522,10 +522,10 @@ unix_cli_add_pending_output (unix_file_t * uf, * that no more bytes are available to be sent. */ static void -unix_cli_del_pending_output (unix_file_t * uf, +unix_cli_del_pending_output (clib_file_t * uf, unix_cli_file_t * cf, uword n_bytes) { - unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; vec_delete (cf->output_vector, n_bytes, 0); if (vec_len (cf->output_vector) <= 0) @@ -533,7 +533,7 @@ unix_cli_del_pending_output (unix_file_t * uf, int skip_update = 0 == (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); uf->flags &= ~UNIX_FILE_DATA_AVAILABLE_TO_WRITE; if (!skip_update) - um->file_update (uf, UNIX_FILE_UPDATE_MODIFY); + fm->file_update (uf, UNIX_FILE_UPDATE_MODIFY); } } @@ -580,7 +580,7 @@ unix_vlib_findchr (u8 chr, u8 * str, word len) */ static void unix_vlib_cli_output_raw (unix_cli_file_t * cf, - unix_file_t * uf, u8 * buffer, uword buffer_bytes) + clib_file_t * uf, u8 * buffer, uword buffer_bytes) { int n = 0; @@ -610,7 +610,7 @@ unix_vlib_cli_output_raw (unix_cli_file_t * cf, */ static void unix_vlib_cli_output_cooked (unix_cli_file_t * cf, - unix_file_t * uf, + clib_file_t * uf, u8 * buffer, uword buffer_bytes) { word end = 0, start = 0; @@ -646,7 +646,7 @@ unix_vlib_cli_output_cooked (unix_cli_file_t * cf, /** @brief Output the CLI prompt */ static void -unix_cli_cli_prompt (unix_cli_file_t * cf, unix_file_t * uf) +unix_cli_cli_prompt (unix_cli_file_t * cf, clib_file_t * uf) { unix_cli_main_t *cm = &unix_cli_main; @@ -655,7 +655,7 @@ unix_cli_cli_prompt (unix_cli_file_t * cf, unix_file_t * uf) /** @brief Output a pager prompt and show number of buffered lines */ static void -unix_cli_pager_prompt (unix_cli_file_t * cf, unix_file_t * uf) +unix_cli_pager_prompt (unix_cli_file_t * cf, clib_file_t * uf) { u8 *prompt; u32 h; @@ -678,7 +678,7 @@ unix_cli_pager_prompt (unix_cli_file_t * cf, unix_file_t * uf) /** @brief Output a pager "skipping" message */ static void -unix_cli_pager_message (unix_cli_file_t * cf, unix_file_t * uf, +unix_cli_pager_message (unix_cli_file_t * cf, clib_file_t * uf, char *message, char *postfix) { u8 *prompt; @@ -694,7 +694,7 @@ unix_cli_pager_message (unix_cli_file_t * cf, unix_file_t * uf, /** @brief Erase the printed pager prompt */ static void -unix_cli_pager_prompt_erase (unix_cli_file_t * cf, unix_file_t * uf) +unix_cli_pager_prompt_erase (unix_cli_file_t * cf, clib_file_t * uf) { if (cf->ansi_capable) { @@ -716,7 +716,7 @@ unix_cli_pager_prompt_erase (unix_cli_file_t * cf, unix_file_t * uf) /** @brief Uses an ANSI escape sequence to move the cursor */ static void -unix_cli_ansi_cursor (unix_cli_file_t * cf, unix_file_t * uf, u16 x, u16 y) +unix_cli_ansi_cursor (unix_cli_file_t * cf, clib_file_t * uf, u16 x, u16 y) { u8 *str; @@ -732,7 +732,7 @@ unix_cli_ansi_cursor (unix_cli_file_t * cf, unix_file_t * uf, u16 x, u16 y) * @param uf Unix file of the CLI session. */ static void -unix_cli_pager_redraw (unix_cli_file_t * cf, unix_file_t * uf) +unix_cli_pager_redraw (unix_cli_file_t * cf, clib_file_t * uf) { unix_cli_pager_index_t *pi = NULL; u8 *line = NULL; @@ -930,12 +930,13 @@ static void unix_vlib_cli_output (uword cli_file_index, u8 * buffer, uword buffer_bytes) { unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; unix_cli_main_t *cm = &unix_cli_main; unix_cli_file_t *cf; - unix_file_t *uf; + clib_file_t *uf; cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); - uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + uf = pool_elt_at_index (fm->file_pool, cf->clib_file_index); if (cf->no_pager || um->cli_pager_buffer_limit == 0 || cf->height == 0) { @@ -1037,7 +1038,8 @@ static void unix_cli_file_welcome (unix_cli_main_t * cm, unix_cli_file_t * cf) { unix_main_t *um = &unix_main; - unix_file_t *uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + clib_file_main_t *fm = &file_main; + clib_file_t *uf = pool_elt_at_index (fm->file_pool, cf->clib_file_index); unix_cli_banner_t *banner; int i, len; @@ -1104,7 +1106,7 @@ unix_cli_file_welcome_timer (any arg, f64 delay) static i32 unix_cli_process_telnet (unix_main_t * um, unix_cli_file_t * cf, - unix_file_t * uf, u8 * input_vector, uword len) + clib_file_t * uf, u8 * input_vector, uword len) { /* Input_vector starts at IAC byte. * See if we have a complete message; if not, return -1 so we wait for more. @@ -1229,7 +1231,7 @@ static int unix_cli_line_process_one (unix_cli_main_t * cm, unix_main_t * um, unix_cli_file_t * cf, - unix_file_t * uf, + clib_file_t * uf, u8 input, unix_cli_parse_action_t action) { u8 *prev; @@ -2059,10 +2061,10 @@ unix_cli_line_process_one (unix_cli_main_t * cm, /** @brief Process input bytes on a stream to provide line editing and * command history in the CLI. */ static int -unix_cli_line_edit (unix_cli_main_t * cm, - unix_main_t * um, unix_cli_file_t * cf) +unix_cli_line_edit (unix_cli_main_t * cm, unix_main_t * um, + clib_file_main_t * fm, unix_cli_file_t * cf) { - unix_file_t *uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + clib_file_t *uf = pool_elt_at_index (fm->file_pool, cf->clib_file_index); int i; for (i = 0; i < vec_len (cf->input_vector); i++) @@ -2139,7 +2141,8 @@ static void unix_cli_process_input (unix_cli_main_t * cm, uword cli_file_index) { unix_main_t *um = &unix_main; - unix_file_t *uf; + clib_file_main_t *fm = &file_main; + clib_file_t *uf; unix_cli_file_t *cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); unformat_input_t input; int vlib_parse_eval (u8 *); @@ -2157,7 +2160,7 @@ more: else { /* Line edit, echo, etc. */ - if (unix_cli_line_edit (cm, um, cf)) + if (unix_cli_line_edit (cm, um, fm, cf)) /* want more input */ return; } @@ -2196,7 +2199,7 @@ more: /* Re-fetch pointer since pool may have moved. */ cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); - uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + uf = pool_elt_at_index (fm->file_pool, cf->clib_file_index); done: /* reset vector; we'll re-use it later */ @@ -2240,12 +2243,13 @@ static void unix_cli_kill (unix_cli_main_t * cm, uword cli_file_index) { unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; unix_cli_file_t *cf; - unix_file_t *uf; + clib_file_t *uf; int i; cf = pool_elt_at_index (cm->cli_file_pool, cli_file_index); - uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + uf = pool_elt_at_index (fm->file_pool, cf->clib_file_index); /* Quit/EOF on stdin means quit program. */ if (uf->file_descriptor == UNIX_CLI_STDIN_FD) @@ -2259,7 +2263,7 @@ unix_cli_kill (unix_cli_main_t * cm, uword cli_file_index) vec_free (cf->command_history); - unix_file_del (um, uf); + clib_file_del (fm, uf); unix_cli_file_free (cf); pool_put (cm->cli_file_pool, cf); @@ -2311,7 +2315,7 @@ done: /** Called when a CLI session file descriptor can be written to without * blocking. */ static clib_error_t * -unix_cli_write_ready (unix_file_t * uf) +unix_cli_write_ready (clib_file_t * uf) { unix_cli_main_t *cm = &unix_cli_main; unix_cli_file_t *cf; @@ -2334,7 +2338,7 @@ unix_cli_write_ready (unix_file_t * uf) /** Called when a CLI session file descriptor has data to be read. */ static clib_error_t * -unix_cli_read_ready (unix_file_t * uf) +unix_cli_read_ready (clib_file_t * uf) { unix_main_t *um = &unix_main; unix_cli_main_t *cm = &unix_cli_main; @@ -2380,8 +2384,9 @@ static u32 unix_cli_file_add (unix_cli_main_t * cm, char *name, int fd) { unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; unix_cli_file_t *cf; - unix_file_t template = { 0 }; + clib_file_t template = { 0 }; vlib_main_t *vm = um->vlib_main; vlib_node_t *n; @@ -2424,7 +2429,7 @@ unix_cli_file_add (unix_cli_main_t * cm, char *name, int fd) template.private_data = cf - cm->cli_file_pool; cf->process_node_index = n->index; - cf->unix_file_index = unix_file_add (um, &template); + cf->clib_file_index = clib_file_add (fm, &template); cf->output_vector = 0; cf->input_vector = 0; @@ -2439,9 +2444,10 @@ unix_cli_file_add (unix_cli_main_t * cm, char *name, int fd) /** Telnet listening socket has a new connection. */ static clib_error_t * -unix_cli_listen_read_ready (unix_file_t * uf) +unix_cli_listen_read_ready (clib_file_t * uf) { unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; unix_cli_main_t *cm = &unix_cli_main; clib_socket_t *s = &um->cli_listen_socket; clib_socket_t client; @@ -2497,7 +2503,7 @@ unix_cli_listen_read_ready (unix_file_t * uf) /* Setup the pager */ cf->no_pager = um->cli_no_pager; - uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + uf = pool_elt_at_index (fm->file_pool, cf->clib_file_index); /* Send the telnet options */ unix_vlib_cli_output_raw (cf, uf, charmode_option, @@ -2517,11 +2523,11 @@ unix_cli_listen_read_ready (unix_file_t * uf) static void unix_cli_resize_interrupt (int signum) { - unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; unix_cli_main_t *cm = &unix_cli_main; unix_cli_file_t *cf = pool_elt_at_index (cm->cli_file_pool, cm->stdin_cli_file_index); - unix_file_t *uf = pool_elt_at_index (um->file_pool, cf->unix_file_index); + clib_file_t *uf = pool_elt_at_index (fm->file_pool, cf->clib_file_index); struct winsize ws; (void) signum; @@ -2548,6 +2554,7 @@ static clib_error_t * unix_cli_config (vlib_main_t * vm, unformat_input_t * input) { unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; unix_cli_main_t *cm = &unix_cli_main; int flags; clib_error_t *error = 0; @@ -2640,7 +2647,7 @@ unix_cli_config (vlib_main_t * vm, unformat_input_t * input) if (s->config && s->config[0] != 0) { /* CLI listen. */ - unix_file_t template = { 0 }; + clib_file_t template = { 0 }; /* mkdir of file socketu, only under /run */ if (strncmp (s->config, "/run", 4) == 0) @@ -2667,7 +2674,7 @@ unix_cli_config (vlib_main_t * vm, unformat_input_t * input) template.read_function = unix_cli_listen_read_ready; template.file_descriptor = s->fd; - unix_file_add (um, &template); + clib_file_add (fm, &template); } /* Set CLI prompt. */ diff --git a/src/vlib/unix/input.c b/src/vlib/unix/input.c index 515dae94..ecd31791 100644 --- a/src/vlib/unix/input.c +++ b/src/vlib/unix/input.c @@ -62,9 +62,9 @@ typedef struct static linux_epoll_main_t linux_epoll_main; static void -linux_epoll_file_update (unix_file_t * f, unix_file_update_type_t update_type) +linux_epoll_file_update (clib_file_t * f, unix_file_update_type_t update_type) { - unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; linux_epoll_main_t *em = &linux_epoll_main; struct epoll_event e; int op; @@ -76,7 +76,7 @@ linux_epoll_file_update (unix_file_t * f, unix_file_update_type_t update_type) e.events |= EPOLLOUT; if (f->flags & UNIX_FILE_EVENT_EDGE_TRIGGERED) e.events |= EPOLLET; - e.data.u32 = f - um->file_pool; + e.data.u32 = f - fm->file_pool; op = -1; @@ -108,6 +108,7 @@ linux_epoll_input (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; linux_epoll_main_t *em = &linux_epoll_main; struct epoll_event *e; int n_fds_ready; @@ -186,7 +187,7 @@ linux_epoll_input (vlib_main_t * vm, for (e = em->epoll_events; e < em->epoll_events + n_fds_ready; e++) { u32 i = e->data.u32; - unix_file_t *f = pool_elt_at_index (um->file_pool, i); + clib_file_t *f = pool_elt_at_index (fm->file_pool, i); clib_error_t *errors[4]; int n_errors = 0; @@ -236,7 +237,7 @@ clib_error_t * linux_epoll_input_init (vlib_main_t * vm) { linux_epoll_main_t *em = &linux_epoll_main; - unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; /* Allocate some events. */ vec_resize (em->epoll_events, VLIB_FRAME_SIZE); @@ -245,7 +246,7 @@ linux_epoll_input_init (vlib_main_t * vm) if (em->epoll_fd < 0) return clib_error_return_unix (0, "epoll_create"); - um->file_update = linux_epoll_file_update; + fm->file_update = linux_epoll_file_update; return 0; } diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index 3a92b2e3..ed0631ec 100644 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -60,6 +60,7 @@ char *vlib_default_runtime_dir __attribute__ ((weak)); char *vlib_default_runtime_dir = "vlib"; unix_main_t unix_main; +clib_file_main_t file_main; static clib_error_t * unix_main_init (vlib_main_t * vm) diff --git a/src/vlib/unix/mc_socket.c b/src/vlib/unix/mc_socket.c index 9c12ad3b..3f1cd99d 100644 --- a/src/vlib/unix/mc_socket.c +++ b/src/vlib/unix/mc_socket.c @@ -243,7 +243,7 @@ recvmsg_helper (mc_socket_main_t * msm, } static clib_error_t * -mastership_socket_read_ready (unix_file_t * uf) +mastership_socket_read_ready (clib_file_t * uf) { mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; mc_main_t *mcm = &msm->mc_main; @@ -263,7 +263,7 @@ mastership_socket_read_ready (unix_file_t * uf) } static clib_error_t * -to_relay_socket_read_ready (unix_file_t * uf) +to_relay_socket_read_ready (clib_file_t * uf) { mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; mc_main_t *mcm = &msm->mc_main; @@ -297,7 +297,7 @@ to_relay_socket_read_ready (unix_file_t * uf) } static clib_error_t * -from_relay_socket_read_ready (unix_file_t * uf) +from_relay_socket_read_ready (clib_file_t * uf) { mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; mc_main_t *mcm = &msm->mc_main; @@ -317,7 +317,7 @@ from_relay_socket_read_ready (unix_file_t * uf) } static clib_error_t * -join_socket_read_ready (unix_file_t * uf) +join_socket_read_ready (clib_file_t * uf) { mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; mc_main_t *mcm = &msm->mc_main; @@ -354,7 +354,7 @@ join_socket_read_ready (unix_file_t * uf) } static clib_error_t * -ack_socket_read_ready (unix_file_t * uf) +ack_socket_read_ready (clib_file_t * uf) { mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; mc_main_t *mcm = &msm->mc_main; @@ -371,10 +371,11 @@ ack_socket_read_ready (unix_file_t * uf) static void catchup_cleanup (mc_socket_main_t * msm, - mc_socket_catchup_t * c, unix_main_t * um, unix_file_t * uf) + mc_socket_catchup_t * c, clib_file_main_t * um, + clib_file_t * uf) { hash_unset (msm->catchup_index_by_file_descriptor, uf->file_descriptor); - unix_file_del (um, uf); + clib_file_del (um, uf); vec_free (c->input_vector); vec_free (c->output_vector); pool_put (msm->catchups, c); @@ -390,9 +391,9 @@ find_catchup_from_file_descriptor (mc_socket_main_t * msm, } static clib_error_t * -catchup_socket_read_ready (unix_file_t * uf, int is_server) +catchup_socket_read_ready (clib_file_t * uf, int is_server) { - unix_main_t *um = &unix_main; + clib_file_main_t *um = &file_main; mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; mc_main_t *mcm = &msm->mc_main; mc_socket_catchup_t *c = @@ -440,13 +441,13 @@ catchup_socket_read_ready (unix_file_t * uf, int is_server) } static clib_error_t * -catchup_server_read_ready (unix_file_t * uf) +catchup_server_read_ready (clib_file_t * uf) { return catchup_socket_read_ready (uf, /* is_server */ 1); } static clib_error_t * -catchup_client_read_ready (unix_file_t * uf) +catchup_client_read_ready (clib_file_t * uf) { if (MC_EVENT_LOGGING) { @@ -460,9 +461,9 @@ catchup_client_read_ready (unix_file_t * uf) } static clib_error_t * -catchup_socket_write_ready (unix_file_t * uf, int is_server) +catchup_socket_write_ready (clib_file_t * uf, int is_server) { - unix_main_t *um = &unix_main; + clib_file_main_t *um = &file_main; mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; mc_socket_catchup_t *c = find_catchup_from_file_descriptor (msm, uf->file_descriptor); @@ -522,7 +523,7 @@ catchup_socket_write_ready (unix_file_t * uf, int is_server) if (!is_server) { uf->flags &= ~UNIX_FILE_DATA_AVAILABLE_TO_WRITE; - unix_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY); + file_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY); /* Send EOF to other side. */ shutdown (uf->file_descriptor, SHUT_WR); return error; @@ -537,21 +538,21 @@ catchup_socket_write_ready (unix_file_t * uf, int is_server) } static clib_error_t * -catchup_server_write_ready (unix_file_t * uf) +catchup_server_write_ready (clib_file_t * uf) { return catchup_socket_write_ready (uf, /* is_server */ 1); } static clib_error_t * -catchup_client_write_ready (unix_file_t * uf) +catchup_client_write_ready (clib_file_t * uf) { return catchup_socket_write_ready (uf, /* is_server */ 0); } static clib_error_t * -catchup_socket_error_ready (unix_file_t * uf) +catchup_socket_error_ready (clib_file_t * uf) { - unix_main_t *um = &unix_main; + clib_file_main_t *um = &file_main; mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; mc_socket_catchup_t *c = find_catchup_from_file_descriptor (msm, uf->file_descriptor); @@ -560,13 +561,13 @@ catchup_socket_error_ready (unix_file_t * uf) } static clib_error_t * -catchup_listen_read_ready (unix_file_t * uf) +catchup_listen_read_ready (clib_file_t * uf) { mc_socket_main_t *msm = (mc_socket_main_t *) uf->private_data; struct sockaddr_in client_addr; int client_len; mc_socket_catchup_t *c; - unix_file_t template = { 0 }; + clib_file_t template = { 0 }; pool_get (msm->catchups, c); memset (c, 0, sizeof (c[0])); @@ -616,7 +617,7 @@ catchup_listen_read_ready (unix_file_t * uf) template.error_function = catchup_socket_error_ready; template.file_descriptor = c->socket; template.private_data = pointer_to_uword (msm); - c->unix_file_index = unix_file_add (&unix_main, &template); + c->clib_file_index = clib_file_add (&file_main, &template); hash_set (msm->catchup_index_by_file_descriptor, c->socket, c - msm->catchups); @@ -772,45 +773,45 @@ socket_setup (mc_socket_main_t * msm) /* epoll setup for multicast mastership socket */ { - unix_file_t template = { 0 }; + clib_file_t template = { 0 }; template.read_function = mastership_socket_read_ready; template.file_descriptor = msm->multicast_sockets[MC_TRANSPORT_MASTERSHIP].socket; template.private_data = (uword) msm; - unix_file_add (&unix_main, &template); + clib_file_add (&file_main, &template); /* epoll setup for multicast to_relay socket */ template.read_function = to_relay_socket_read_ready; template.file_descriptor = msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_TO_RELAY].socket; template.private_data = (uword) msm; - unix_file_add (&unix_main, &template); + clib_file_add (&file_main, &template); /* epoll setup for multicast from_relay socket */ template.read_function = from_relay_socket_read_ready; template.file_descriptor = msm->multicast_sockets[MC_TRANSPORT_USER_REQUEST_FROM_RELAY].socket; template.private_data = (uword) msm; - unix_file_add (&unix_main, &template); + clib_file_add (&file_main, &template); template.read_function = join_socket_read_ready; template.file_descriptor = msm->multicast_sockets[MC_TRANSPORT_JOIN].socket; template.private_data = (uword) msm; - unix_file_add (&unix_main, &template); + clib_file_add (&file_main, &template); /* epoll setup for ack rx socket */ template.read_function = ack_socket_read_ready; template.file_descriptor = msm->ack_socket; template.private_data = (uword) msm; - unix_file_add (&unix_main, &template); + clib_file_add (&file_main, &template); /* epoll setup for TCP catchup server */ template.read_function = catchup_listen_read_ready; template.file_descriptor = msm->catchup_server_socket; template.private_data = (uword) msm; - unix_file_add (&unix_main, &template); + clib_file_add (&file_main, &template); } return 0; @@ -820,8 +821,8 @@ static void * catchup_add_pending_output (mc_socket_catchup_t * c, uword n_bytes, u8 * set_output_vector) { - unix_file_t *uf = pool_elt_at_index (unix_main.file_pool, - c->unix_file_index); + clib_file_t *uf = pool_elt_at_index (file_main.file_pool, + c->clib_file_index); u8 *result = 0; if (set_output_vector) @@ -833,7 +834,7 @@ catchup_add_pending_output (mc_socket_catchup_t * c, uword n_bytes, int skip_update = 0 != (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); uf->flags |= UNIX_FILE_DATA_AVAILABLE_TO_WRITE; if (!skip_update) - unix_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY); + file_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY); } return result; } @@ -847,7 +848,7 @@ catchup_request_fun (void *transport_main, vlib_main_t *vm = mcm->vlib_main; mc_socket_catchup_t *c; struct sockaddr_in addr; - unix_main_t *um = &unix_main; + clib_file_main_t *um = &file_main; int one = 1; pool_get (msm->catchups, c); @@ -895,14 +896,14 @@ catchup_request_fun (void *transport_main, } { - unix_file_t template = { 0 }; + clib_file_t template = { 0 }; template.read_function = catchup_client_read_ready; template.write_function = catchup_client_write_ready; template.error_function = catchup_socket_error_ready; template.file_descriptor = c->socket; template.private_data = (uword) msm; - c->unix_file_index = unix_file_add (um, &template); + c->clib_file_index = clib_file_add (um, &template); hash_set (msm->catchup_index_by_file_descriptor, c->socket, c - msm->catchups); diff --git a/src/vlib/unix/mc_socket.h b/src/vlib/unix/mc_socket.h index 273c9ad4..3686c824 100644 --- a/src/vlib/unix/mc_socket.h +++ b/src/vlib/unix/mc_socket.h @@ -31,7 +31,7 @@ typedef struct typedef struct { int socket; - u32 unix_file_index; + u32 clib_file_index; u8 *input_vector; u8 *output_vector; diff --git a/src/vlib/unix/unix.h b/src/vlib/unix/unix.h index 1b0d8b9d..4c8566b7 100644 --- a/src/vlib/unix/unix.h +++ b/src/vlib/unix/unix.h @@ -40,42 +40,16 @@ #ifndef included_unix_unix_h #define included_unix_unix_h +#include #include #include - -struct unix_file; -typedef clib_error_t *(unix_file_function_t) (struct unix_file * f); - -typedef struct unix_file -{ - /* Unix file descriptor from open/socket. */ - u32 file_descriptor; - - u32 flags; -#define UNIX_FILE_DATA_AVAILABLE_TO_WRITE (1 << 0) -#define UNIX_FILE_EVENT_EDGE_TRIGGERED (1 << 1) - - /* Data available for function's use. */ - uword private_data; - - /* Functions to be called when read/write data becomes ready. */ - unix_file_function_t *read_function, *write_function, *error_function; -} unix_file_t; - typedef struct { f64 time; clib_error_t *error; } unix_error_history_t; -typedef enum -{ - UNIX_FILE_UPDATE_ADD, - UNIX_FILE_UPDATE_MODIFY, - UNIX_FILE_UPDATE_DELETE, -} unix_file_update_type_t; - typedef struct { /* Back pointer to main structure. */ @@ -86,15 +60,9 @@ typedef struct #define UNIX_FLAG_INTERACTIVE (1 << 0) #define UNIX_FLAG_NODAEMON (1 << 1) - /* Pool of files to poll for input/output. */ - unix_file_t *file_pool; - /* CLI listen socket. */ clib_socket_t cli_listen_socket; - void (*file_update) (unix_file_t * file, - unix_file_update_type_t update_type); - /* Circular buffer of last unix errors. */ unix_error_history_t error_history[128]; u32 error_history_index; @@ -138,47 +106,7 @@ typedef struct /* Global main structure. */ extern unix_main_t unix_main; - -always_inline uword -unix_file_add (unix_main_t * um, unix_file_t * template) -{ - unix_file_t *f; - pool_get (um->file_pool, f); - f[0] = template[0]; - um->file_update (f, UNIX_FILE_UPDATE_ADD); - return f - um->file_pool; -} - -always_inline void -unix_file_del (unix_main_t * um, unix_file_t * f) -{ - um->file_update (f, UNIX_FILE_UPDATE_DELETE); - close (f->file_descriptor); - f->file_descriptor = ~0; - pool_put (um->file_pool, f); -} - -always_inline void -unix_file_del_by_index (unix_main_t * um, uword index) -{ - unix_file_t *uf; - uf = pool_elt_at_index (um->file_pool, index); - unix_file_del (um, uf); -} - -always_inline uword -unix_file_set_data_available_to_write (u32 unix_file_index, - uword is_available) -{ - unix_file_t *uf = pool_elt_at_index (unix_main.file_pool, unix_file_index); - uword was_available = (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); - if ((was_available != 0) != (is_available != 0)) - { - uf->flags ^= UNIX_FILE_DATA_AVAILABLE_TO_WRITE; - unix_main.file_update (uf, UNIX_FILE_UPDATE_MODIFY); - } - return was_available != 0; -} +extern clib_file_main_t file_main; always_inline void unix_save_error (unix_main_t * um, clib_error_t * error) diff --git a/src/vlibapi/api_common.h b/src/vlibapi/api_common.h index b84d269e..651566ae 100644 --- a/src/vlibapi/api_common.h +++ b/src/vlibapi/api_common.h @@ -51,7 +51,7 @@ typedef struct vl_api_registration_ unix_shared_memory_queue_t *vl_input_queue; /* socket server and client */ - u32 unix_file_index; + u32 clib_file_index; i8 *unprocessed_input; u32 unprocessed_msg_length; u8 *output_vector; diff --git a/src/vlibsocket/api.h b/src/vlibsocket/api.h index 79c0d08a..d7b7055d 100644 --- a/src/vlibsocket/api.h +++ b/src/vlibsocket/api.h @@ -41,27 +41,27 @@ typedef struct * or to a shared-memory connection. */ vl_api_registration_t *current_rp; - unix_file_t *current_uf; + clib_file_t *current_uf; /* One input buffer, shared across all sockets */ i8 *input_buffer; } socket_main_t; extern socket_main_t socket_main; -void socksvr_add_pending_output (struct unix_file *uf, +void socksvr_add_pending_output (clib_file_t * uf, struct vl_api_registration_ *cf, u8 * buffer, uword buffer_bytes); #define SOCKSVR_DEFAULT_PORT 32741 /* whatever */ void vl_free_socket_registration_index (u32 pool_index); -void vl_socket_process_msg (struct unix_file *uf, +void vl_socket_process_msg (clib_file_t * uf, struct vl_api_registration_ *rp, i8 * input_v); -clib_error_t *vl_socket_read_ready (struct unix_file *uf); -void vl_socket_add_pending_output (struct unix_file *uf, +clib_error_t *vl_socket_read_ready (clib_file_t * uf); +void vl_socket_add_pending_output (clib_file_t * uf, struct vl_api_registration_ *rp, u8 * buffer, uword buffer_bytes); -clib_error_t *vl_socket_write_ready (struct unix_file *uf); +clib_error_t *vl_socket_write_ready (clib_file_t * uf); void vl_socket_api_send (vl_api_registration_t * rp, u8 * elem); void vl_socket_api_send_with_data (vl_api_registration_t * rp, u8 * elem, u8 * data_vector); diff --git a/src/vlibsocket/sockclnt_vlib.c b/src/vlibsocket/sockclnt_vlib.c index e16adfeb..760ad944 100644 --- a/src/vlibsocket/sockclnt_vlib.c +++ b/src/vlibsocket/sockclnt_vlib.c @@ -60,11 +60,11 @@ vl_api_sockclnt_create_reply_t_handler (vl_api_sockclnt_create_reply_t * mp) static void vl_api_sockclnt_delete_reply_t_handler (vl_api_sockclnt_delete_reply_t * mp) { - unix_main_t *um = &unix_main; - unix_file_t *uf = socket_main.current_uf; + clib_file_main_t *fm = &file_main; + clib_file_t *uf = socket_main.current_uf; vl_api_registration_t *rp = socket_main.current_rp; - unix_file_del (um, uf); + clib_file_del (fm, uf); vl_free_socket_registration_index (rp->vl_api_registration_pool_index); } @@ -72,8 +72,8 @@ u32 sockclnt_open_index (char *client_name, char *hostname, int port) { vl_api_registration_t *rp; - unix_main_t *um = &unix_main; - unix_file_t template = { 0 }; + clib_file_main_t *fm = &file_main; + clib_file_t template = { 0 }; int sockfd; int one = 1; int rv; @@ -129,7 +129,7 @@ sockclnt_open_index (char *client_name, char *hostname, int port) template.file_descriptor = sockfd; template.private_data = rp - socket_main.registration_pool; - rp->unix_file_index = unix_file_add (um, &template); + rp->clib_file_index = clib_file_add (fm, &template); rp->name = format (0, "%s:%d", hostname, port); mp = vl_msg_api_alloc (sizeof (*mp)); diff --git a/src/vlibsocket/socksvr_vlib.c b/src/vlibsocket/socksvr_vlib.c index dc8c63eb..31b33df5 100644 --- a/src/vlibsocket/socksvr_vlib.c +++ b/src/vlibsocket/socksvr_vlib.c @@ -53,8 +53,8 @@ dump_socket_clients (vlib_main_t * vm, api_main_t * am) { vl_api_registration_t *reg; socket_main_t *sm = &socket_main; - unix_main_t *um = &unix_main; - unix_file_t *f; + clib_file_main_t *fm = &file_main; + clib_file_t *f; /* * Must have at least one active client, not counting the @@ -69,7 +69,7 @@ dump_socket_clients (vlib_main_t * vm, api_main_t * am) pool_foreach (reg, sm->registration_pool, ({ if (reg->registration_type == REGISTRATION_TYPE_SOCKET_SERVER) { - f = pool_elt_at_index (um->file_pool, reg->unix_file_index); + f = pool_elt_at_index (fm->file_pool, reg->clib_file_index); vlib_cli_output (vm, "%16s %8d", reg->name, f->file_descriptor); } @@ -99,13 +99,13 @@ vl_socket_api_send (vl_api_registration_t * rp, u8 * elem) nbytes += msg_length; tmp = clib_host_to_net_u32 (nbytes); - vl_socket_add_pending_output (rp->unix_file_index - + unix_main.file_pool, + vl_socket_add_pending_output (rp->clib_file_index + + file_main.file_pool, rp->vl_api_registration_pool_index + socket_main.registration_pool, (u8 *) & tmp, sizeof (tmp)); - vl_socket_add_pending_output (rp->unix_file_index - + unix_main.file_pool, + vl_socket_add_pending_output (rp->clib_file_index + + file_main.file_pool, rp->vl_api_registration_pool_index + socket_main.registration_pool, elem, msg_length); @@ -139,18 +139,18 @@ vl_socket_api_send_with_data (vl_api_registration_t * rp, /* Length in network byte order */ tmp = clib_host_to_net_u32 (nbytes); - vl_socket_add_pending_output (rp->unix_file_index - + unix_main.file_pool, + vl_socket_add_pending_output (rp->clib_file_index + + file_main.file_pool, rp->vl_api_registration_pool_index + socket_main.registration_pool, (u8 *) & tmp, sizeof (tmp)); - vl_socket_add_pending_output (rp->unix_file_index - + unix_main.file_pool, + vl_socket_add_pending_output (rp->clib_file_index + + file_main.file_pool, rp->vl_api_registration_pool_index + socket_main.registration_pool, elem, msg_length); - vl_socket_add_pending_output (rp->unix_file_index - + unix_main.file_pool, + vl_socket_add_pending_output (rp->clib_file_index + + file_main.file_pool, rp->vl_api_registration_pool_index + socket_main.registration_pool, data_vector, vec_len (data_vector)); @@ -181,13 +181,13 @@ vl_socket_api_send_with_length_internal (vl_api_registration_t * rp, /* Length in network byte order */ tmp = clib_host_to_net_u32 (nbytes); - vl_socket_add_pending_output (rp->unix_file_index - + unix_main.file_pool, + vl_socket_add_pending_output (rp->clib_file_index + + file_main.file_pool, rp->vl_api_registration_pool_index + socket_main.registration_pool, (u8 *) & tmp, sizeof (tmp)); - vl_socket_add_pending_output (rp->unix_file_index - + unix_main.file_pool, + vl_socket_add_pending_output (rp->clib_file_index + + file_main.file_pool, rp->vl_api_registration_pool_index + socket_main.registration_pool, elem, msg_length); @@ -231,7 +231,7 @@ vl_free_socket_registration_index (u32 pool_index) } static inline void -socket_process_msg (unix_file_t * uf, vl_api_registration_t * rp, +socket_process_msg (clib_file_t * uf, vl_api_registration_t * rp, i8 * input_v) { u8 *the_msg = (u8 *) (input_v + sizeof (u32)); @@ -243,9 +243,9 @@ socket_process_msg (unix_file_t * uf, vl_api_registration_t * rp, } clib_error_t * -vl_socket_read_ready (unix_file_t * uf) +vl_socket_read_ready (clib_file_t * uf) { - unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; vl_api_registration_t *rp; int n; i8 *msg_buffer = 0; @@ -259,7 +259,7 @@ vl_socket_read_ready (unix_file_t * uf) if (n <= 0 && errno != EAGAIN) { - unix_file_del (um, uf); + clib_file_del (fm, uf); if (!pool_is_free (socket_main.registration_pool, rp)) { @@ -352,11 +352,11 @@ turf_it: } void -vl_socket_add_pending_output (unix_file_t * uf, +vl_socket_add_pending_output (clib_file_t * uf, vl_api_registration_t * rp, u8 * buffer, uword buffer_bytes) { - unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; vec_add (rp->output_vector, buffer, buffer_bytes); if (vec_len (rp->output_vector) > 0) @@ -364,15 +364,15 @@ vl_socket_add_pending_output (unix_file_t * uf, int skip_update = 0 != (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); uf->flags |= UNIX_FILE_DATA_AVAILABLE_TO_WRITE; if (!skip_update) - um->file_update (uf, UNIX_FILE_UPDATE_MODIFY); + fm->file_update (uf, UNIX_FILE_UPDATE_MODIFY); } } static void -socket_del_pending_output (unix_file_t * uf, +socket_del_pending_output (clib_file_t * uf, vl_api_registration_t * rp, uword n_bytes) { - unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; vec_delete (rp->output_vector, n_bytes, 0); if (vec_len (rp->output_vector) <= 0) @@ -380,14 +380,14 @@ socket_del_pending_output (unix_file_t * uf, int skip_update = 0 == (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); uf->flags &= ~UNIX_FILE_DATA_AVAILABLE_TO_WRITE; if (!skip_update) - um->file_update (uf, UNIX_FILE_UPDATE_MODIFY); + fm->file_update (uf, UNIX_FILE_UPDATE_MODIFY); } } clib_error_t * -vl_socket_write_ready (unix_file_t * uf) +vl_socket_write_ready (clib_file_t * uf) { - unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; vl_api_registration_t *rp; int n; @@ -402,7 +402,7 @@ vl_socket_write_ready (unix_file_t * uf) #if DEBUG > 2 clib_warning ("write error, close the file...\n"); #endif - unix_file_del (um, uf); + clib_file_del (fm, uf); vl_free_socket_registration_index (rp - socket_main.registration_pool); return 0; @@ -415,23 +415,23 @@ vl_socket_write_ready (unix_file_t * uf) } clib_error_t * -vl_socket_error_ready (unix_file_t * uf) +vl_socket_error_ready (clib_file_t * uf) { vl_api_registration_t *rp; - unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; rp = pool_elt_at_index (socket_main.registration_pool, uf->private_data); - unix_file_del (um, uf); + clib_file_del (fm, uf); vl_free_socket_registration_index (rp - socket_main.registration_pool); return 0; } void -socksvr_file_add (unix_main_t * um, int fd) +socksvr_file_add (clib_file_main_t * fm, int fd) { vl_api_registration_t *rp; - unix_file_t template = { 0 }; + clib_file_t template = { 0 }; pool_get (socket_main.registration_pool, rp); memset (rp, 0, sizeof (*rp)); @@ -444,13 +444,13 @@ socksvr_file_add (unix_main_t * um, int fd) rp->registration_type = REGISTRATION_TYPE_SOCKET_SERVER; rp->vl_api_registration_pool_index = rp - socket_main.registration_pool; - rp->unix_file_index = unix_file_add (um, &template); + rp->clib_file_index = clib_file_add (fm, &template); } static clib_error_t * -socksvr_accept_ready (unix_file_t * uf) +socksvr_accept_ready (clib_file_t * uf) { - unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; struct sockaddr_in client_addr; int client_fd; int client_len; @@ -468,12 +468,12 @@ socksvr_accept_ready (unix_file_t * uf) if (client_fd < 0) return clib_error_return_unix (0, "socksvr_accept_ready: accept"); - socksvr_file_add (um, client_fd); + socksvr_file_add (fm, client_fd); return 0; } static clib_error_t * -socksvr_bogus_write (unix_file_t * uf) +socksvr_bogus_write (clib_file_t * uf) { clib_warning ("why am I here?"); return 0; @@ -525,7 +525,7 @@ vl_api_sockclnt_delete_t_handler (vl_api_sockclnt_delete_t * mp) vl_msg_api_send (regp, (u8 *) rp); - unix_file_del (&unix_main, unix_main.file_pool + regp->unix_file_index); + clib_file_del (&file_main, file_main.file_pool + regp->clib_file_index); vl_free_socket_registration_index (mp->index); } @@ -542,8 +542,8 @@ _(SOCKCLNT_DELETE, sockclnt_delete) static clib_error_t * socksvr_api_init (vlib_main_t * vm) { - unix_main_t *um = &unix_main; - unix_file_t template = { 0 }; + clib_file_main_t *fm = &file_main; + clib_file_t template = { 0 }; int sockfd; int one = 1; int rv; @@ -625,14 +625,14 @@ socksvr_api_init (vlib_main_t * vm) template.file_descriptor = sockfd; template.private_data = rp - socket_main.registration_pool; - rp->unix_file_index = unix_file_add (um, &template); + rp->clib_file_index = clib_file_add (fm, &template); return 0; } static clib_error_t * socket_exit (vlib_main_t * vm) { - unix_main_t *um = &unix_main; + clib_file_main_t *fm = &file_main; vl_api_registration_t *rp; /* Defensive driving in case something wipes out early */ @@ -641,7 +641,7 @@ socket_exit (vlib_main_t * vm) u32 index; /* *INDENT-OFF* */ pool_foreach (rp, socket_main.registration_pool, ({ - unix_file_del (um, um->file_pool + rp->unix_file_index); + clib_file_del (fm, fm->file_pool + rp->clib_file_index); index = rp->vl_api_registration_pool_index; vl_free_socket_registration_index (index); })); diff --git a/src/vnet/devices/af_packet/af_packet.c b/src/vnet/devices/af_packet/af_packet.c index e7e69214..62bb228f 100644 --- a/src/vnet/devices/af_packet/af_packet.c +++ b/src/vnet/devices/af_packet/af_packet.c @@ -89,7 +89,7 @@ af_packet_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, } static clib_error_t * -af_packet_fd_read_ready (unix_file_t * uf) +af_packet_fd_read_ready (clib_file_t * uf) { af_packet_main_t *apm = &af_packet_main; vnet_main_t *vnm = vnet_get_main (); @@ -281,12 +281,12 @@ af_packet_create_if (vlib_main_t * vm, u8 * host_if_name, u8 * hw_addr_set, clib_spinlock_init (&apif->lockp); { - unix_file_t template = { 0 }; + clib_file_t template = { 0 }; template.read_function = af_packet_fd_read_ready; template.file_descriptor = fd; template.private_data = if_index; template.flags = UNIX_FILE_EVENT_EDGE_TRIGGERED; - apif->unix_file_index = unix_file_add (&unix_main, &template); + apif->clib_file_index = clib_file_add (&file_main, &template); } /*use configured or generate random MAC address */ @@ -371,10 +371,10 @@ af_packet_delete_if (vlib_main_t * vm, u8 * host_if_name) vnet_hw_interface_unassign_rx_thread (vnm, apif->hw_if_index, 0); /* clean up */ - if (apif->unix_file_index != ~0) + if (apif->clib_file_index != ~0) { - unix_file_del (&unix_main, unix_main.file_pool + apif->unix_file_index); - apif->unix_file_index = ~0; + clib_file_del (&file_main, file_main.file_pool + apif->clib_file_index); + apif->clib_file_index = ~0; } else close (apif->fd); diff --git a/src/vnet/devices/af_packet/af_packet.h b/src/vnet/devices/af_packet/af_packet.h index 194977f0..95c7e7cf 100644 --- a/src/vnet/devices/af_packet/af_packet.h +++ b/src/vnet/devices/af_packet/af_packet.h @@ -32,7 +32,7 @@ typedef struct u8 *tx_ring; u32 hw_if_index; u32 sw_if_index; - u32 unix_file_index; + u32 clib_file_index; u32 next_rx_frame; u32 next_tx_frame; diff --git a/src/vnet/devices/netmap/netmap.c b/src/vnet/devices/netmap/netmap.c index 09afc764..fc49ed62 100644 --- a/src/vnet/devices/netmap/netmap.c +++ b/src/vnet/devices/netmap/netmap.c @@ -36,7 +36,7 @@ netmap_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, } static clib_error_t * -netmap_fd_read_ready (unix_file_t * uf) +netmap_fd_read_ready (clib_file_t * uf) { vlib_main_t *vm = vlib_get_main (); netmap_main_t *nm = &netmap_main; @@ -54,10 +54,10 @@ netmap_fd_read_ready (unix_file_t * uf) static void close_netmap_if (netmap_main_t * nm, netmap_if_t * nif) { - if (nif->unix_file_index != ~0) + if (nif->clib_file_index != ~0) { - unix_file_del (&unix_main, unix_main.file_pool + nif->unix_file_index); - nif->unix_file_index = ~0; + clib_file_del (&file_main, file_main.file_pool + nif->clib_file_index); + nif->clib_file_index = ~0; } else if (nif->fd > -1) close (nif->fd); @@ -137,7 +137,7 @@ netmap_create_if (vlib_main_t * vm, u8 * if_name, u8 * hw_addr_set, pool_get (nm->interfaces, nif); nif->if_index = nif - nm->interfaces; nif->fd = fd; - nif->unix_file_index = ~0; + nif->clib_file_index = ~0; vec_validate (req, 0); nif->req = req; @@ -188,11 +188,11 @@ netmap_create_if (vlib_main_t * vm, u8 * if_name, u8 * hw_addr_set, clib_spinlock_init (&nif->lockp); { - unix_file_t template = { 0 }; + clib_file_t template = { 0 }; template.read_function = netmap_fd_read_ready; template.file_descriptor = nif->fd; template.private_data = nif->if_index; - nif->unix_file_index = unix_file_add (&unix_main, &template); + nif->clib_file_index = clib_file_add (&file_main, &template); } /*use configured or generate random MAC address */ diff --git a/src/vnet/devices/netmap/netmap.h b/src/vnet/devices/netmap/netmap.h index e04f045d..04731890 100644 --- a/src/vnet/devices/netmap/netmap.h +++ b/src/vnet/devices/netmap/netmap.h @@ -50,7 +50,7 @@ typedef struct uword if_index; u32 hw_if_index; u32 sw_if_index; - u32 unix_file_index; + u32 clib_file_index; u32 per_interface_next_index; u8 is_admin_up; diff --git a/src/vnet/devices/virtio/vhost-user.c b/src/vnet/devices/virtio/vhost-user.c index 5fe378cb..2af96ee7 100644 --- a/src/vnet/devices/virtio/vhost-user.c +++ b/src/vnet/devices/virtio/vhost-user.c @@ -89,7 +89,7 @@ #define UNIX_GET_FD(unixfd_idx) \ (unixfd_idx != ~0) ? \ - pool_elt_at_index (unix_main.file_pool, \ + pool_elt_at_index (file_main.file_pool, \ unixfd_idx)->file_descriptor : -1; #define foreach_virtio_trace_flags \ @@ -477,7 +477,7 @@ vhost_user_set_interrupt_pending (vhost_user_intf_t * vui, u32 ifq) } static clib_error_t * -vhost_user_callfd_read_ready (unix_file_t * uf) +vhost_user_callfd_read_ready (clib_file_t * uf) { __attribute__ ((unused)) int n; u8 buff[8]; @@ -488,7 +488,7 @@ vhost_user_callfd_read_ready (unix_file_t * uf) } static clib_error_t * -vhost_user_kickfd_read_ready (unix_file_t * uf) +vhost_user_kickfd_read_ready (clib_file_t * uf) { __attribute__ ((unused)) int n; u8 buff[8]; @@ -569,16 +569,16 @@ vhost_user_vring_close (vhost_user_intf_t * vui, u32 qid) vhost_user_vring_t *vring = &vui->vrings[qid]; if (vring->kickfd_idx != ~0) { - unix_file_t *uf = pool_elt_at_index (unix_main.file_pool, + clib_file_t *uf = pool_elt_at_index (file_main.file_pool, vring->kickfd_idx); - unix_file_del (&unix_main, uf); + clib_file_del (&file_main, uf); vring->kickfd_idx = ~0; } if (vring->callfd_idx != ~0) { - unix_file_t *uf = pool_elt_at_index (unix_main.file_pool, + clib_file_t *uf = pool_elt_at_index (file_main.file_pool, vring->callfd_idx); - unix_file_del (&unix_main, uf); + clib_file_del (&file_main, uf); vring->callfd_idx = ~0; } if (vring->errfd != -1) @@ -597,10 +597,10 @@ vhost_user_if_disconnect (vhost_user_intf_t * vui) vnet_hw_interface_set_flags (vnm, vui->hw_if_index, 0); - if (vui->unix_file_index != ~0) + if (vui->clib_file_index != ~0) { - unix_file_del (&unix_main, unix_main.file_pool + vui->unix_file_index); - vui->unix_file_index = ~0; + clib_file_del (&file_main, file_main.file_pool + vui->clib_file_index); + vui->clib_file_index = ~0; } vui->is_up = 0; @@ -654,7 +654,7 @@ vhost_user_log_dirty_pages (vhost_user_intf_t * vui, u64 addr, u64 len) } static clib_error_t * -vhost_user_socket_read (unix_file_t * uf) +vhost_user_socket_read (clib_file_t * uf) { int n, i; int fd, number_of_fds = 0; @@ -666,7 +666,7 @@ vhost_user_socket_read (unix_file_t * uf) vhost_user_intf_t *vui; struct cmsghdr *cmsg; u8 q; - unix_file_t template = { 0 }; + clib_file_t template = { 0 }; vnet_main_t *vnm = vnet_get_main (); vui = pool_elt_at_index (vum->vhost_user_interfaces, uf->private_data); @@ -927,9 +927,9 @@ vhost_user_socket_read (unix_file_t * uf) /* if there is old fd, delete and close it */ if (vui->vrings[q].callfd_idx != ~0) { - unix_file_t *uf = pool_elt_at_index (unix_main.file_pool, + clib_file_t *uf = pool_elt_at_index (file_main.file_pool, vui->vrings[q].callfd_idx); - unix_file_del (&unix_main, uf); + clib_file_del (&file_main, uf); vui->vrings[q].callfd_idx = ~0; } @@ -945,7 +945,7 @@ vhost_user_socket_read (unix_file_t * uf) template.file_descriptor = fds[0]; template.private_data = ((vui - vhost_user_main.vhost_user_interfaces) << 8) + q; - vui->vrings[q].callfd_idx = unix_file_add (&unix_main, &template); + vui->vrings[q].callfd_idx = clib_file_add (&file_main, &template); } else vui->vrings[q].callfd_idx = ~0; @@ -959,9 +959,9 @@ vhost_user_socket_read (unix_file_t * uf) if (vui->vrings[q].kickfd_idx != ~0) { - unix_file_t *uf = pool_elt_at_index (unix_main.file_pool, + clib_file_t *uf = pool_elt_at_index (file_main.file_pool, vui->vrings[q].kickfd_idx); - unix_file_del (&unix_main, uf); + clib_file_del (&file_main, uf); vui->vrings[q].kickfd_idx = ~0; } @@ -978,7 +978,7 @@ vhost_user_socket_read (unix_file_t * uf) template.private_data = (((uword) (vui - vhost_user_main.vhost_user_interfaces)) << 8) + q; - vui->vrings[q].kickfd_idx = unix_file_add (&unix_main, &template); + vui->vrings[q].kickfd_idx = clib_file_add (&file_main, &template); } else { @@ -1168,7 +1168,7 @@ close_socket: } static clib_error_t * -vhost_user_socket_error (unix_file_t * uf) +vhost_user_socket_error (clib_file_t * uf) { vlib_main_t *vm = vlib_get_main (); vhost_user_main_t *vum = &vhost_user_main; @@ -1184,11 +1184,11 @@ vhost_user_socket_error (unix_file_t * uf) } static clib_error_t * -vhost_user_socksvr_accept_ready (unix_file_t * uf) +vhost_user_socksvr_accept_ready (clib_file_t * uf) { int client_fd, client_len; struct sockaddr_un client; - unix_file_t template = { 0 }; + clib_file_t template = { 0 }; vhost_user_main_t *vum = &vhost_user_main; vhost_user_intf_t *vui; @@ -1207,7 +1207,7 @@ vhost_user_socksvr_accept_ready (unix_file_t * uf) template.error_function = vhost_user_socket_error; template.file_descriptor = client_fd; template.private_data = vui - vhost_user_main.vhost_user_interfaces; - vui->unix_file_index = unix_file_add (&unix_main, &template); + vui->clib_file_index = clib_file_add (&file_main, &template); return 0; } @@ -2475,7 +2475,7 @@ vhost_user_process (vlib_main_t * vm, vhost_user_intf_t *vui; struct sockaddr_un sun; int sockfd; - unix_file_t template = { 0 }; + clib_file_t template = { 0 }; f64 timeout = 3153600000.0 /* 100 years */ ; uword *event_data = 0; @@ -2496,7 +2496,7 @@ vhost_user_process (vlib_main_t * vm, pool_foreach (vui, vum->vhost_user_interfaces, { if (vui->unix_server_index == ~0) { //Nothing to do for server sockets - if (vui->unix_file_index == ~0) + if (vui->clib_file_index == ~0) { if ((sockfd < 0) && ((sockfd = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)) @@ -2534,7 +2534,7 @@ vhost_user_process (vlib_main_t * vm, template.file_descriptor = sockfd; template.private_data = vui - vhost_user_main.vhost_user_interfaces; - vui->unix_file_index = unix_file_add (&unix_main, &template); + vui->clib_file_index = clib_file_add (&file_main, &template); /* This sockfd is considered consumed */ sockfd = -1; @@ -2549,7 +2549,7 @@ vhost_user_process (vlib_main_t * vm, /* check if socket is alive */ int error = 0; socklen_t len = sizeof (error); - int fd = UNIX_GET_FD(vui->unix_file_index); + int fd = UNIX_GET_FD(vui->clib_file_index); int retval = getsockopt (fd, SOL_SOCKET, SO_ERROR, &error, &len); @@ -2596,9 +2596,9 @@ vhost_user_term_if (vhost_user_intf_t * vui) if (vui->unix_server_index != ~0) { //Close server socket - unix_file_t *uf = pool_elt_at_index (unix_main.file_pool, + clib_file_t *uf = pool_elt_at_index (file_main.file_pool, vui->unix_server_index); - unix_file_del (&unix_main, uf); + clib_file_del (&file_main, uf); vui->unix_server_index = ~0; unlink (vui->sock_filename); } @@ -2780,11 +2780,11 @@ vhost_user_vui_init (vnet_main_t * vnm, sw = vnet_get_hw_sw_interface (vnm, vui->hw_if_index); if (server_sock_fd != -1) { - unix_file_t template = { 0 }; + clib_file_t template = { 0 }; template.read_function = vhost_user_socksvr_accept_ready; template.file_descriptor = server_sock_fd; template.private_data = vui - vum->vhost_user_interfaces; //hw index - vui->unix_server_index = unix_file_add (&unix_main, &template); + vui->unix_server_index = clib_file_add (&file_main, &template); } else { @@ -2797,7 +2797,7 @@ vhost_user_vui_init (vnet_main_t * vnm, vui->sock_errno = 0; vui->is_up = 0; vui->feature_mask = feature_mask; - vui->unix_file_index = ~0; + vui->clib_file_index = ~0; vui->log_base_addr = 0; vui->if_index = vui - vum->vhost_user_interfaces; mhash_set_mem (&vum->if_index_by_sock_name, vui->sock_filename, diff --git a/src/vnet/devices/virtio/vhost-user.h b/src/vnet/devices/virtio/vhost-user.h index ae3b88e8..105b92b7 100644 --- a/src/vnet/devices/virtio/vhost-user.h +++ b/src/vnet/devices/virtio/vhost-user.h @@ -223,7 +223,7 @@ typedef struct u32 is_up; u32 admin_up; u32 unix_server_index; - u32 unix_file_index; + u32 clib_file_index; char sock_filename[256]; int sock_errno; uword if_index; diff --git a/src/vnet/ip/punt.c b/src/vnet/ip/punt.c index 67c54c3c..1ea32fa0 100644 --- a/src/vnet/ip/punt.c +++ b/src/vnet/ip/punt.c @@ -550,7 +550,7 @@ VLIB_REGISTER_NODE (punt_socket_rx_node, static) = format_punt_trace,}; static clib_error_t * -punt_socket_read_ready (unix_file_t * uf) +punt_socket_read_ready (clib_file_t * uf) { vlib_main_t *vm = vlib_get_main (); punt_main_t *pm = &punt_main; @@ -790,11 +790,11 @@ punt_config (vlib_main_t * vm, unformat_input_t * input) } /* Register socket */ - unix_main_t *um = &unix_main; - unix_file_t template = { 0 }; + clib_file_main_t *fm = &file_main; + clib_file_t template = { 0 }; template.read_function = punt_socket_read_ready; template.file_descriptor = pm->socket_fd; - pm->unix_file_index = unix_file_add (um, &template); + pm->clib_file_index = clib_file_add (fm, &template); pm->is_configured = true; diff --git a/src/vnet/ip/punt.h b/src/vnet/ip/punt.h index 0103249c..9defa881 100644 --- a/src/vnet/ip/punt.h +++ b/src/vnet/ip/punt.h @@ -72,7 +72,7 @@ typedef struct char sun_path[sizeof (struct sockaddr_un)]; punt_client_t *clients_by_dst_port4; punt_client_t *clients_by_dst_port6; - u32 unix_file_index; + u32 clib_file_index; bool is_configured; vlib_node_t *interface_output_node; u32 *ready_fds; diff --git a/src/vnet/unix/tapcli.c b/src/vnet/unix/tapcli.c index 0fc62f6c..13154b3b 100644 --- a/src/vnet/unix/tapcli.c +++ b/src/vnet/unix/tapcli.c @@ -56,7 +56,7 @@ static void tapcli_nopunt_frame (vlib_main_t * vm, */ typedef struct { u32 unix_fd; - u32 unix_file_index; + u32 clib_file_index; u32 provision_fd; /** For counters */ u32 sw_if_index; @@ -137,8 +137,6 @@ typedef struct { vlib_main_t * vlib_main; /** convenience - vnet_main_t */ vnet_main_t * vnet_main; - /** convenience - unix_main_t */ - unix_main_t * unix_main; } tapcli_main_t; static tapcli_main_t tapcli_main; @@ -453,12 +451,12 @@ VLIB_REGISTER_NODE (tapcli_rx_node, static) = { /** * @brief Gets called when file descriptor is ready from epoll. * - * @param *uf - unix_file_t + * @param *uf - clib_file_t * * @return error - clib_error_t * */ -static clib_error_t * tapcli_read_ready (unix_file_t * uf) +static clib_error_t * tapcli_read_ready (clib_file_t * uf) { vlib_main_t * vm = vlib_get_main(); tapcli_main_t * tm = &tapcli_main; @@ -999,10 +997,10 @@ int vnet_tap_connect (vlib_main_t * vm, vnet_tap_connect_args_t *ap) } { - unix_file_t template = {0}; + clib_file_t template = {0}; template.read_function = tapcli_read_ready; template.file_descriptor = dev_net_tun_fd; - ti->unix_file_index = unix_file_add (&unix_main, &template); + ti->clib_file_index = clib_file_add (&file_main, &template); ti->unix_fd = dev_net_tun_fd; ti->provision_fd = dev_tap_fd; clib_memcpy (&ti->ifr, &ifr, sizeof (ifr)); @@ -1079,9 +1077,9 @@ static int tapcli_tap_disconnect (tapcli_interface_t *ti) // bring interface down vnet_sw_interface_set_flags (vnm, sw_if_index, 0); - if (ti->unix_file_index != ~0) { - unix_file_del (&unix_main, unix_main.file_pool + ti->unix_file_index); - ti->unix_file_index = ~0; + if (ti->clib_file_index != ~0) { + clib_file_del (&file_main, file_main.file_pool + ti->clib_file_index); + ti->clib_file_index = ~0; } else close(ti->unix_fd); @@ -1455,7 +1453,6 @@ tapcli_init (vlib_main_t * vm) tm->vlib_main = vm; tm->vnet_main = vnet_get_main(); - tm->unix_main = &unix_main; tm->mtu_bytes = TAP_MTU_DEFAULT; tm->tapcli_interface_index_by_sw_if_index = hash_create (0, sizeof(uword)); tm->tapcli_interface_index_by_unix_fd = hash_create (0, sizeof (uword)); diff --git a/src/vnet/unix/tuntap.c b/src/vnet/unix/tuntap.c index 2c403679..9616feb2 100644 --- a/src/vnet/unix/tuntap.c +++ b/src/vnet/unix/tuntap.c @@ -104,7 +104,7 @@ typedef struct { mhash_t subif_mhash; /** Unix file index */ - u32 unix_file_index; + u32 clib_file_index; /** For the "normal" interface, if configured */ u32 hw_if_index, sw_if_index; @@ -388,11 +388,11 @@ VLIB_REGISTER_NODE (tuntap_rx_node,static) = { /** * @brief Gets called when file descriptor is ready from epoll. * - * @param *uf - unix_file_t + * @param *uf - clib_file_t * * @return error - clib_error_t */ -static clib_error_t * tuntap_read_ready (unix_file_t * uf) +static clib_error_t * tuntap_read_ready (clib_file_t * uf) { vlib_main_t * vm = vlib_get_main(); vlib_node_set_interrupt_pending (vm, tuntap_rx_node.index); @@ -645,10 +645,10 @@ tuntap_config (vlib_main_t * vm, unformat_input_t * input) } { - unix_file_t template = {0}; + clib_file_t template = {0}; template.read_function = tuntap_read_ready; template.file_descriptor = tm->dev_net_tun_fd; - tm->unix_file_index = unix_file_add (&unix_main, &template); + tm->clib_file_index = clib_file_add (&file_main, &template); } done: diff --git a/src/vppinfra.am b/src/vppinfra.am index 8f01114c..a5769a0d 100644 --- a/src/vppinfra.am +++ b/src/vppinfra.am @@ -183,6 +183,7 @@ nobase_include_HEADERS = \ vppinfra/error.h \ vppinfra/error_bootstrap.h \ vppinfra/fifo.h \ + vppinfra/file.h \ vppinfra/format.h \ vppinfra/graph.h \ vppinfra/hash.h \ diff --git a/src/vppinfra/file.h b/src/vppinfra/file.h new file mode 100644 index 00000000..69facea9 --- /dev/null +++ b/src/vppinfra/file.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * file.h: unix file handling + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_clib_file_h +#define included_clib_file_h + +#include +#include + + +struct clib_file; +typedef clib_error_t *(clib_file_function_t) (struct clib_file * f); + +typedef struct clib_file +{ + /* Unix file descriptor from open/socket. */ + u32 file_descriptor; + + u32 flags; +#define UNIX_FILE_DATA_AVAILABLE_TO_WRITE (1 << 0) +#define UNIX_FILE_EVENT_EDGE_TRIGGERED (1 << 1) + + /* Data available for function's use. */ + uword private_data; + + /* Functions to be called when read/write data becomes ready. */ + clib_file_function_t *read_function, *write_function, *error_function; +} clib_file_t; + +typedef enum +{ + UNIX_FILE_UPDATE_ADD, + UNIX_FILE_UPDATE_MODIFY, + UNIX_FILE_UPDATE_DELETE, +} unix_file_update_type_t; + +typedef struct +{ + /* Pool of files to poll for input/output. */ + clib_file_t *file_pool; + + void (*file_update) (clib_file_t * file, + unix_file_update_type_t update_type); + +} clib_file_main_t; + +always_inline uword +clib_file_add (clib_file_main_t * um, clib_file_t * template) +{ + clib_file_t *f; + pool_get (um->file_pool, f); + f[0] = template[0]; + um->file_update (f, UNIX_FILE_UPDATE_ADD); + return f - um->file_pool; +} + +always_inline void +clib_file_del (clib_file_main_t * um, clib_file_t * f) +{ + um->file_update (f, UNIX_FILE_UPDATE_DELETE); + close (f->file_descriptor); + f->file_descriptor = ~0; + pool_put (um->file_pool, f); +} + +always_inline void +clib_file_del_by_index (clib_file_main_t * um, uword index) +{ + clib_file_t *uf; + uf = pool_elt_at_index (um->file_pool, index); + clib_file_del (um, uf); +} + +always_inline uword +clib_file_set_data_available_to_write (clib_file_main_t * um, + u32 clib_file_index, + uword is_available) +{ + clib_file_t *uf = pool_elt_at_index (um->file_pool, clib_file_index); + uword was_available = (uf->flags & UNIX_FILE_DATA_AVAILABLE_TO_WRITE); + if ((was_available != 0) != (is_available != 0)) + { + uf->flags ^= UNIX_FILE_DATA_AVAILABLE_TO_WRITE; + um->file_update (uf, UNIX_FILE_UPDATE_MODIFY); + } + return was_available != 0; +} + + +#endif /* included_clib_file_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ -- cgit 1.2.3-korg From eb1ac1732f15f9a99edbeffeb94c525b9ff25c1d Mon Sep 17 00:00:00 2001 From: Colin Tregenza Dancer Date: Wed, 6 Sep 2017 20:23:24 +0100 Subject: Recombine diags and minimum barrier open time changes (VPP-968) Support logging to both syslog and elog Also include DaveB is_mp_safe fix, which had been lost Change-Id: If82f7969e2f43c63c3fed5b1a0c7434c90c1f380 Signed-off-by: Colin Tregenza Dancer --- src/vlib/main.h | 9 ++ src/vlib/threads.c | 313 +++++++++++++++++++++++++++++++++++++++++-- src/vlib/threads.h | 29 +++- src/vlibapi/api_common.h | 6 + src/vlibapi/api_shared.c | 10 +- src/vlibmemory/memory_vlib.c | 5 + src/vpp/vnet/main.c | 8 ++ 7 files changed, 367 insertions(+), 13 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/main.h b/src/vlib/main.h index 4c0cde3f..fb67334e 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -199,6 +199,15 @@ typedef struct vlib_main_t */ int need_vlib_worker_thread_node_runtime_update; + /* + * Barrier epoch - Set to current time, each time barrier_sync or + * barrier_release is called with zero recursion. + */ + f64 barrier_epoch; + + /* Earliest barrier can be closed again */ + f64 barrier_no_close_before; + } vlib_main_t; /* Global main structure. */ diff --git a/src/vlib/threads.c b/src/vlib/threads.c index 6cd325b3..2d9ce84a 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -35,6 +35,222 @@ vl (void *p) vlib_worker_thread_t *vlib_worker_threads; vlib_thread_main_t vlib_thread_main; +/* + * Barrier tracing can be enabled on a normal build to collect information + * on barrier use, including timings and call stacks. Deliberately not + * keyed off CLIB_DEBUG, because that can add significant overhead which + * imapacts observed timings. + */ + +#ifdef BARRIER_TRACING + /* + * Output of barrier tracing can be to syslog or elog as suits + */ +#ifdef BARRIER_TRACING_ELOG +static u32 +elog_id_for_msg_name (const char *msg_name) +{ + uword *p, r; + static uword *h; + u8 *name_copy; + + if (!h) + h = hash_create_string (0, sizeof (uword)); + + p = hash_get_mem (h, msg_name); + if (p) + return p[0]; + r = elog_string (&vlib_global_main.elog_main, "%s", msg_name); + + name_copy = format (0, "%s%c", msg_name, 0); + + hash_set_mem (h, name_copy, r); + + return r; +} + + /* + * elog Barrier trace functions, which are nulled out if BARRIER_TRACING isn't + * defined + */ + +static inline void +barrier_trace_sync (f64 t_entry, f64 t_open, f64 t_closed) +{ + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "barrier <%d#%s(O:%dus:%dus)(%dus)", + .format_args = "i4T4i4i4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 count, caller, t_entry, t_open, t_closed; + } *ed = 0; + + ed = ELOG_DATA (&vlib_global_main.elog_main, e); + ed->count = (int) vlib_worker_threads[0].barrier_sync_count; + ed->caller = elog_id_for_msg_name (vlib_worker_threads[0].barrier_caller); + ed->t_entry = (int) (1000000.0 * t_entry); + ed->t_open = (int) (1000000.0 * t_open); + ed->t_closed = (int) (1000000.0 * t_closed); +} + +static inline void +barrier_trace_sync_rec (f64 t_entry) +{ + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "barrier <%d(%dus)%s", + .format_args = "i4i4T4", + }; + /* *INDENT-ON* */ + struct + { + u32 depth, t_entry, caller; + } *ed = 0; + + ed = ELOG_DATA (&vlib_global_main.elog_main, e); + ed->depth = (int) vlib_worker_threads[0].recursion_level - 1; + ed->t_entry = (int) (1000000.0 * t_entry); + ed->caller = elog_id_for_msg_name (vlib_worker_threads[0].barrier_caller); +} + +static inline void +barrier_trace_release_rec (f64 t_entry) +{ + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "barrier (%dus)%d>", + .format_args = "i4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 t_entry, depth; + } *ed = 0; + + ed = ELOG_DATA (&vlib_global_main.elog_main, e); + ed->t_entry = (int) (1000000.0 * t_entry); + ed->depth = (int) vlib_worker_threads[0].recursion_level; +} + +static inline void +barrier_trace_release (f64 t_entry, f64 t_closed_total, f64 t_update_main) +{ + /* *INDENT-OFF* */ + ELOG_TYPE_DECLARE (e) = + { + .format = "barrier (%dus){%d}(C:%dus)#%d>", + .format_args = "i4i4i4i4", + }; + /* *INDENT-ON* */ + struct + { + u32 t_entry, t_update_main, t_closed_total, count; + } *ed = 0; + + ed = ELOG_DATA (&vlib_global_main.elog_main, e); + ed->t_entry = (int) (1000000.0 * t_entry); + ed->t_update_main = (int) (1000000.0 * t_update_main); + ed->t_closed_total = (int) (1000000.0 * t_closed_total); + ed->count = (int) vlib_worker_threads[0].barrier_sync_count; + + /* Reset context for next trace */ + vlib_worker_threads[0].barrier_context = NULL; +} +#else +char barrier_trace[65536]; +char *btp = barrier_trace; + + /* + * syslog Barrier trace functions, which are nulled out if BARRIER_TRACING + * isn't defined + */ + + +static inline void +barrier_trace_sync (f64 t_entry, f64 t_open, f64 t_closed) +{ + btp += sprintf (btp, "<%u#%s", + (unsigned int) vlib_worker_threads[0].barrier_sync_count, + vlib_worker_threads[0].barrier_caller); + + if (vlib_worker_threads[0].barrier_context) + { + btp += sprintf (btp, "[%s]", vlib_worker_threads[0].barrier_context); + + } + + btp += sprintf (btp, "(O:%dus:%dus)(%dus):", + (int) (1000000.0 * t_entry), + (int) (1000000.0 * t_open), (int) (1000000.0 * t_closed)); + +} + +static inline void +barrier_trace_sync_rec (f64 t_entry) +{ + btp += sprintf (btp, "<%u(%dus)%s:", + (int) vlib_worker_threads[0].recursion_level - 1, + (int) (1000000.0 * t_entry), + vlib_worker_threads[0].barrier_caller); +} + +static inline void +barrier_trace_release_rec (f64 t_entry) +{ + btp += sprintf (btp, ":(%dus)%u>", (int) (1000000.0 * t_entry), + (int) vlib_worker_threads[0].recursion_level); +} + +static inline void +barrier_trace_release (f64 t_entry, f64 t_closed_total, f64 t_update_main) +{ + + btp += sprintf (btp, ":(%dus)", (int) (1000000.0 * t_entry)); + if (t_update_main > 0) + { + btp += sprintf (btp, "{%dus}", (int) (1000000.0 * t_update_main)); + } + + btp += sprintf (btp, "(C:%dus)#%u>", + (int) (1000000.0 * t_closed_total), + (int) vlib_worker_threads[0].barrier_sync_count); + + /* Dump buffer to syslog, and reset for next trace */ + fformat (stderr, "BTRC %s\n", barrier_trace); + btp = barrier_trace; + vlib_worker_threads[0].barrier_context = NULL; +} +#endif +#else + + /* Null functions for default case where barrier tracing isn't used */ +static inline void +barrier_trace_sync (f64 t_entry, f64 t_open, f64 t_closed) +{ +} + +static inline void +barrier_trace_sync_rec (f64 t_entry) +{ +} + +static inline void +barrier_trace_release_rec (f64 t_entry) +{ +} + +static inline void +barrier_trace_release (f64 t_entry, f64 t_closed_total, f64 t_update_main) +{ +} +#endif + uword os_get_nthreads (void) { @@ -558,6 +774,10 @@ start_workers (vlib_main_t * vm) *vlib_worker_threads->node_reforks_required = 0; vm->need_vlib_worker_thread_node_runtime_update = 0; + /* init timing */ + vm->barrier_epoch = 0; + vm->barrier_no_close_before = 0; + worker_thread_index = 1; for (i = 0; i < vec_len (tm->registrations); i++) @@ -790,6 +1010,7 @@ start_workers (vlib_main_t * vm) VLIB_MAIN_LOOP_ENTER_FUNCTION (start_workers); + static inline void worker_thread_node_runtime_update_internal (void) { @@ -993,7 +1214,6 @@ vlib_worker_thread_node_refork (void) nm_clone->processes = vec_dup (nm->processes); } - void vlib_worker_thread_node_runtime_update (void) { @@ -1192,10 +1412,29 @@ vlib_worker_thread_fork_fixup (vlib_fork_fixup_t which) vlib_worker_thread_barrier_release (vm); } + /* + * Enforce minimum open time to minimize packet loss due to Rx overflow, + * based on a test based heuristic that barrier should be open for at least + * 3 time as long as it is closed (with an upper bound of 1ms because by that + * point it is probably too late to make a difference) + */ + +#ifndef BARRIER_MINIMUM_OPEN_LIMIT +#define BARRIER_MINIMUM_OPEN_LIMIT 0.001 +#endif + +#ifndef BARRIER_MINIMUM_OPEN_FACTOR +#define BARRIER_MINIMUM_OPEN_FACTOR 3 +#endif + void -vlib_worker_thread_barrier_sync (vlib_main_t * vm) +vlib_worker_thread_barrier_sync_int (vlib_main_t * vm) { f64 deadline; + f64 now; + f64 t_entry; + f64 t_open; + f64 t_closed; u32 count; if (vec_len (vlib_mains) < 2) @@ -1205,29 +1444,55 @@ vlib_worker_thread_barrier_sync (vlib_main_t * vm) count = vec_len (vlib_mains) - 1; + /* Record entry relative to last close */ + now = vlib_time_now (vm); + t_entry = now - vm->barrier_epoch; + /* Tolerate recursive calls */ if (++vlib_worker_threads[0].recursion_level > 1) - return; + { + barrier_trace_sync_rec (t_entry); + return; + } vlib_worker_threads[0].barrier_sync_count++; - deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT; + /* Enforce minimum barrier open time to minimize packet loss */ + ASSERT (vm->barrier_no_close_before <= (now + BARRIER_MINIMUM_OPEN_LIMIT)); + while ((now = vlib_time_now (vm)) < vm->barrier_no_close_before) + ; + + /* Record time of closure */ + t_open = now - vm->barrier_epoch; + vm->barrier_epoch = now; + + deadline = now + BARRIER_SYNC_TIMEOUT; *vlib_worker_threads->wait_at_barrier = 1; while (*vlib_worker_threads->workers_at_barrier != count) { - if (vlib_time_now (vm) > deadline) + if ((now = vlib_time_now (vm)) > deadline) { fformat (stderr, "%s: worker thread deadlock\n", __FUNCTION__); os_panic (); } } + + t_closed = now - vm->barrier_epoch; + + barrier_trace_sync (t_entry, t_open, t_closed); + } void vlib_worker_thread_barrier_release (vlib_main_t * vm) { f64 deadline; + f64 now; + f64 minimum_open; + f64 t_entry; + f64 t_closed_total; + f64 t_update_main = 0.0; int refork_needed = 0; if (vec_len (vlib_mains) < 2) @@ -1235,8 +1500,15 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm) ASSERT (vlib_get_thread_index () == 0); + + now = vlib_time_now (vm); + t_entry = now - vm->barrier_epoch; + if (--vlib_worker_threads[0].recursion_level > 0) - return; + { + barrier_trace_release_rec (t_entry); + return; + } /* Update (all) node runtimes before releasing the barrier, if needed */ if (vm->need_vlib_worker_thread_node_runtime_update) @@ -1249,15 +1521,17 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm) refork_needed = 1; clib_smp_atomic_add (vlib_worker_threads->node_reforks_required, (vec_len (vlib_mains) - 1)); + now = vlib_time_now (vm); + t_update_main = now - vm->barrier_epoch; } - deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT; + deadline = now + BARRIER_SYNC_TIMEOUT; *vlib_worker_threads->wait_at_barrier = 0; while (*vlib_worker_threads->workers_at_barrier > 0) { - if (vlib_time_now (vm) > deadline) + if ((now = vlib_time_now (vm)) > deadline) { fformat (stderr, "%s: worker thread deadlock\n", __FUNCTION__); os_panic (); @@ -1267,11 +1541,13 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm) /* Wait for reforks before continuing */ if (refork_needed) { - deadline = vlib_time_now (vm) + BARRIER_SYNC_TIMEOUT; + now = vlib_time_now (vm); + + deadline = now + BARRIER_SYNC_TIMEOUT; while (*vlib_worker_threads->node_reforks_required > 0) { - if (vlib_time_now (vm) > deadline) + if ((now = vlib_time_now (vm)) > deadline) { fformat (stderr, "%s: worker thread refork deadlock\n", __FUNCTION__); @@ -1279,6 +1555,23 @@ vlib_worker_thread_barrier_release (vlib_main_t * vm) } } } + + t_closed_total = now - vm->barrier_epoch; + + minimum_open = t_closed_total * BARRIER_MINIMUM_OPEN_FACTOR; + + if (minimum_open > BARRIER_MINIMUM_OPEN_LIMIT) + { + minimum_open = BARRIER_MINIMUM_OPEN_LIMIT; + } + + vm->barrier_no_close_before = now + minimum_open; + + /* Record barrier epoch (used to enforce minimum open time) */ + vm->barrier_epoch = now; + + barrier_trace_release (t_entry, t_closed_total, t_update_main); + } /* diff --git a/src/vlib/threads.h b/src/vlib/threads.h index c3f1cade..72340ee1 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -18,6 +18,22 @@ #include #include +/* + * To enable detailed tracing of barrier usage, including call stacks and + * timings, define BARRIER_TRACING here or in relevant TAGS. If also used + * with CLIB_DEBUG, timing will _not_ be representative of normal code + * execution. + * + */ + +// #define BARRIER_TRACING 1 + +/* + * Two options for barrier tracing output: syslog & elog. + */ + +// #define BARRIER_TRACING_ELOG 1 + extern vlib_main_t **vlib_mains; void vlib_set_thread_name (char *name); @@ -102,6 +118,10 @@ typedef struct vlib_thread_registration_t *registration; u8 *name; u64 barrier_sync_count; +#ifdef BARRIER_TRACING + const char *barrier_caller; + const char *barrier_context; +#endif volatile u32 *node_reforks_required; long lwp; @@ -179,7 +199,14 @@ u32 vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts); #define BARRIER_SYNC_TIMEOUT (1.0) #endif -void vlib_worker_thread_barrier_sync (vlib_main_t * vm); +#ifdef BARRIER_TRACING +#define vlib_worker_thread_barrier_sync(X) {vlib_worker_threads[0].barrier_caller=__FUNCTION__;vlib_worker_thread_barrier_sync_int(X);} +#else +#define vlib_worker_thread_barrier_sync(X) vlib_worker_thread_barrier_sync_int(X) +#endif + + +void vlib_worker_thread_barrier_sync_int (vlib_main_t * vm); void vlib_worker_thread_barrier_release (vlib_main_t * vm); void vlib_worker_thread_node_refork (void); diff --git a/src/vlibapi/api_common.h b/src/vlibapi/api_common.h index 651566ae..bbeccfc2 100644 --- a/src/vlibapi/api_common.h +++ b/src/vlibapi/api_common.h @@ -144,6 +144,12 @@ void vl_msg_api_queue_handler (unix_shared_memory_queue_t * q); void vl_msg_api_barrier_sync (void) __attribute__ ((weak)); void vl_msg_api_barrier_release (void) __attribute__ ((weak)); +#ifdef BARRIER_TRACING +void vl_msg_api_barrier_trace_context (const char *context) + __attribute__ ((weak)); +#else +#define vl_msg_api_barrier_trace_context(X) +#endif void vl_msg_api_free (void *); void vl_noop_handler (void *mp); void vl_msg_api_increment_missing_client_counter (void); diff --git a/src/vlibapi/api_shared.c b/src/vlibapi/api_shared.c index 5c1a9940..59dc2375 100644 --- a/src/vlibapi/api_shared.c +++ b/src/vlibapi/api_shared.c @@ -418,7 +418,10 @@ msg_handler_internal (api_main_t * am, if (do_it) { if (!am->is_mp_safe[id]) - vl_msg_api_barrier_sync (); + { + vl_msg_api_barrier_trace_context (am->msg_names[id]); + vl_msg_api_barrier_sync (); + } (*am->msg_handlers[id]) (the_msg); if (!am->is_mp_safe[id]) vl_msg_api_barrier_release (); @@ -498,7 +501,10 @@ vl_msg_api_handler_with_vm_node (api_main_t * am, vl_msg_api_trace (am, am->rx_trace, the_msg); if (!am->is_mp_safe[id]) - vl_msg_api_barrier_sync (); + { + vl_msg_api_barrier_trace_context (am->msg_names[id]); + vl_msg_api_barrier_sync (); + } (*handler) (the_msg, vm, node); if (!am->is_mp_safe[id]) vl_msg_api_barrier_release (); diff --git a/src/vlibmemory/memory_vlib.c b/src/vlibmemory/memory_vlib.c index 688ce604..55a90d64 100644 --- a/src/vlibmemory/memory_vlib.c +++ b/src/vlibmemory/memory_vlib.c @@ -1462,6 +1462,7 @@ _(TRACE_PLUGIN_MSG_IDS,trace_plugin_msg_ids) static clib_error_t * rpc_api_hookup (vlib_main_t * vm) { + api_main_t *am = &api_main; #define _(N,n) \ vl_msg_api_set_handlers(VL_API_##N, #n, \ vl_api_##n##_t_handler, \ @@ -1481,6 +1482,10 @@ rpc_api_hookup (vlib_main_t * vm) sizeof(vl_api_##n##_t), 1 /* do trace */); foreach_plugin_trace_msg; #undef _ + + /* No reason to halt the parade to create a trace record... */ + am->is_mp_safe[VL_API_TRACE_PLUGIN_MSG_IDS] = 1; + return 0; } diff --git a/src/vpp/vnet/main.c b/src/vpp/vnet/main.c index 76371dbe..b330f60f 100644 --- a/src/vpp/vnet/main.c +++ b/src/vpp/vnet/main.c @@ -294,6 +294,14 @@ os_exit (int code) exit (code); } +#ifdef BARRIER_TRACING +void +vl_msg_api_barrier_trace_context (const char *context) +{ + vlib_worker_threads[0].barrier_context = context; +} +#endif + void vl_msg_api_barrier_sync (void) { -- cgit 1.2.3-korg From d9226b25f145c64e5bc4a38c3fee7e9b2eaac2de Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Tue, 12 Sep 2017 15:34:17 +0200 Subject: physmem: remove debug leftovers Change-Id: I5a5dc0794d3398e749b64b07dfd1e2fc2230089b Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/cli.c | 1 - src/vlib/linux/physmem.c | 1 - 2 files changed, 2 deletions(-) (limited to 'src/vlib') diff --git a/src/plugins/dpdk/device/cli.c b/src/plugins/dpdk/device/cli.c index fe1c41c2..aeeb772d 100644 --- a/src/plugins/dpdk/device/cli.c +++ b/src/plugins/dpdk/device/cli.c @@ -357,7 +357,6 @@ show_dpdk_buffer (vlib_main_t * vm, unformat_input_t * input, "name=\"%s\" available = %7d allocated = %7d total = %7d\n", rmp->name, (u32) count, (u32) free_count, (u32) (count + free_count)); - rte_mempool_dump (stderr, rmp); } else { diff --git a/src/vlib/linux/physmem.c b/src/vlib/linux/physmem.c index fddff2ea..d8c5dc9b 100644 --- a/src/vlib/linux/physmem.c +++ b/src/vlib/linux/physmem.c @@ -300,7 +300,6 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, /* Don't want mheap mmap/munmap with IO memory. */ MHEAP_FLAG_DISABLE_VM | MHEAP_FLAG_THREAD_SAFE); - fformat (stdout, "%U", format_mheap, pr->heap, /* verbose */ 1); } if (flags & VLIB_PHYSMEM_F_HAVE_BUFFERS) -- cgit 1.2.3-korg From f49921f73f4e1f0b67823be445aafeb9ff2333a6 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Mon, 11 Sep 2017 16:52:11 +0200 Subject: clib_socket: add sendmsg / recvmsg with ancillary data support Change-Id: Ie18580e05ec12291e7026f21ad874e088a712c8e Signed-off-by: Damjan Marion --- src/vlib/unix/cli.c | 4 +- src/vpp/app/vppctl.c | 2 +- src/vppinfra/socket.c | 130 ++++++++++++++++++++++++++++++++++++++++----- src/vppinfra/socket.h | 49 +++++++++++++---- src/vppinfra/test_socket.c | 6 +-- 5 files changed, 164 insertions(+), 27 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c index 39368823..1567cc2a 100644 --- a/src/vlib/unix/cli.c +++ b/src/vlib/unix/cli.c @@ -2664,8 +2664,8 @@ unix_cli_config (vlib_main_t * vm, unformat_input_t * input) vec_free (tmp); } - s->flags = SOCKET_IS_SERVER | /* listen, don't connect */ - SOCKET_ALLOW_GROUP_WRITE; /* PF_LOCAL socket only */ + s->flags = CLIB_SOCKET_F_IS_SERVER | /* listen, don't connect */ + CLIB_SOCKET_F_ALLOW_GROUP_WRITE; /* PF_LOCAL socket only */ error = clib_socket_init (s); if (error) diff --git a/src/vpp/app/vppctl.c b/src/vpp/app/vppctl.c index a8f3eab0..980936f1 100644 --- a/src/vpp/app/vppctl.c +++ b/src/vpp/app/vppctl.c @@ -158,7 +158,7 @@ main (int argc, char *argv[]) while (argc--) cmd = format (cmd, "%s%c", (argv++)[0], argc ? ' ' : 0); - s->flags = SOCKET_IS_CLIENT; + s->flags = CLIB_SOCKET_F_IS_CLIENT; error = clib_socket_init (s); if (error) diff --git a/src/vppinfra/socket.c b/src/vppinfra/socket.c index 37dcbbfd..87a9333f 100644 --- a/src/vppinfra/socket.c +++ b/src/vppinfra/socket.c @@ -35,17 +35,18 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include +#include +#include /* strchr */ +#define __USE_GNU #include #include +#include #include #include #include #include #include -#include #include -#include /* strchr */ #include #include @@ -233,7 +234,7 @@ default_socket_read (clib_socket_t * sock, int n_bytes) u8 *buf; /* RX side of socket is down once end of file is reached. */ - if (sock->flags & SOCKET_RX_END_OF_FILE) + if (sock->flags & CLIB_SOCKET_F_RX_END_OF_FILE) return 0; fd = sock->fd; @@ -255,7 +256,7 @@ default_socket_read (clib_socket_t * sock, int n_bytes) /* Other side closed the socket. */ if (n_read == 0) - sock->flags |= SOCKET_RX_END_OF_FILE; + sock->flags |= CLIB_SOCKET_F_RX_END_OF_FILE; non_fatal: _vec_len (sock->rx_buffer) += n_read - n_bytes; @@ -271,6 +272,91 @@ default_socket_close (clib_socket_t * s) return 0; } +static clib_error_t * +default_socket_sendmsg (clib_socket_t * s, void *msg, int msglen, + int fds[], int num_fds) +{ + struct msghdr mh = { 0 }; + struct iovec iov[1]; + char ctl[CMSG_SPACE (sizeof (int)) * num_fds]; + int rv; + + iov[0].iov_base = msg; + iov[0].iov_len = msglen; + mh.msg_iov = iov; + mh.msg_iovlen = 1; + + if (num_fds > 0) + { + struct cmsghdr *cmsg; + memset (&ctl, 0, sizeof (ctl)); + mh.msg_control = ctl; + mh.msg_controllen = sizeof (ctl); + cmsg = CMSG_FIRSTHDR (&mh); + cmsg->cmsg_len = CMSG_LEN (sizeof (int) * num_fds); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy (CMSG_DATA (cmsg), fds, sizeof (int) * num_fds); + } + rv = sendmsg (s->fd, &mh, 0); + if (rv < 0) + return clib_error_return_unix (0, "sendmsg"); + return 0; +} + + +static clib_error_t * +default_socket_recvmsg (clib_socket_t * s, void *msg, int msglen, + int fds[], int num_fds) +{ + char ctl[CMSG_SPACE (sizeof (int) * num_fds) + + CMSG_SPACE (sizeof (struct ucred))]; + struct msghdr mh = { 0 }; + struct iovec iov[1]; + ssize_t size; + struct ucred *cr = 0; + struct cmsghdr *cmsg; + + iov[0].iov_base = msg; + iov[0].iov_len = msglen; + mh.msg_iov = iov; + mh.msg_iovlen = 1; + mh.msg_control = ctl; + mh.msg_controllen = sizeof (ctl); + + memset (ctl, 0, sizeof (ctl)); + + /* receive the incoming message */ + size = recvmsg (s->fd, &mh, 0); + if (size != msglen) + { + return (size == 0) ? clib_error_return (0, "disconnected") : + clib_error_return_unix (0, "recvmsg: malformed message (fd %d, '%s')", + s->fd, s->config); + } + + cmsg = CMSG_FIRSTHDR (&mh); + while (cmsg) + { + if (cmsg->cmsg_level == SOL_SOCKET) + { + if (cmsg->cmsg_type == SCM_CREDENTIALS) + { + cr = (struct ucred *) CMSG_DATA (cmsg); + s->uid = cr->uid; + s->gid = cr->gid; + s->pid = cr->pid; + } + else if (cmsg->cmsg_type == SCM_RIGHTS) + { + clib_memcpy (fds, CMSG_DATA (cmsg), num_fds * sizeof (int)); + } + } + cmsg = CMSG_NXTHDR (&mh, cmsg); + } + return 0; +} + static void socket_init_funcs (clib_socket_t * s) { @@ -280,6 +366,10 @@ socket_init_funcs (clib_socket_t * s) s->read_func = default_socket_read; if (!s->close_func) s->close_func = default_socket_close; + if (!s->sendmsg_func) + s->sendmsg_func = default_socket_sendmsg; + if (!s->recvmsg_func) + s->recvmsg_func = default_socket_recvmsg; } clib_error_t * @@ -291,18 +381,22 @@ clib_socket_init (clib_socket_t * s) struct sockaddr_un su; } addr; socklen_t addr_len = 0; + int socket_type; clib_error_t *error = 0; word port; error = socket_config (s->config, &addr.sa, &addr_len, - (s->flags & SOCKET_IS_SERVER + (s->flags & CLIB_SOCKET_F_IS_SERVER ? INADDR_LOOPBACK : INADDR_ANY)); if (error) goto done; socket_init_funcs (s); - s->fd = socket (addr.sa.sa_family, SOCK_STREAM, 0); + socket_type = s->flags & CLIB_SOCKET_F_SEQPACKET ? + SOCK_SEQPACKET : SOCK_STREAM; + + s->fd = socket (addr.sa.sa_family, socket_type, 0); if (s->fd < 0) { error = clib_error_return_unix (0, "socket (fd %d, '%s')", @@ -314,7 +408,7 @@ clib_socket_init (clib_socket_t * s) if (addr.sa.sa_family == PF_INET) port = ((struct sockaddr_in *) &addr)->sin_port; - if (s->flags & SOCKET_IS_SERVER) + if (s->flags & CLIB_SOCKET_F_IS_SERVER) { uword need_bind = 1; @@ -342,6 +436,18 @@ clib_socket_init (clib_socket_t * s) clib_unix_warning ("setsockopt SO_REUSEADDR fails"); } + if (addr.sa.sa_family == PF_LOCAL && s->flags & CLIB_SOCKET_F_PASSCRED) + { + int x = 1; + if (setsockopt (s->fd, SOL_SOCKET, SO_PASSCRED, &x, sizeof (x)) < 0) + { + error = clib_error_return_unix (0, "setsockopt (SO_PASSCRED, " + "fd %d, '%s')", s->fd, + s->config); + goto done; + } + } + if (need_bind && bind (s->fd, &addr.sa, addr_len) < 0) { error = clib_error_return_unix (0, "bind (fd %d, '%s')", @@ -356,7 +462,7 @@ clib_socket_init (clib_socket_t * s) goto done; } if (addr.sa.sa_family == PF_LOCAL - && s->flags & SOCKET_ALLOW_GROUP_WRITE) + && s->flags & CLIB_SOCKET_F_ALLOW_GROUP_WRITE) { struct stat st = { 0 }; if (stat (((struct sockaddr_un *) &addr)->sun_path, &st) < 0) @@ -378,7 +484,7 @@ clib_socket_init (clib_socket_t * s) } else { - if ((s->flags & SOCKET_NON_BLOCKING_CONNECT) + if ((s->flags & CLIB_SOCKET_F_NON_BLOCKING_CONNECT) && fcntl (s->fd, F_SETFL, O_NONBLOCK) < 0) { error = clib_error_return_unix (0, "fcntl NONBLOCK (fd %d, '%s')", @@ -387,7 +493,7 @@ clib_socket_init (clib_socket_t * s) } if (connect (s->fd, &addr.sa, addr_len) < 0 - && !((s->flags & SOCKET_NON_BLOCKING_CONNECT) && + && !((s->flags & CLIB_SOCKET_F_NON_BLOCKING_CONNECT) && errno == EINPROGRESS)) { error = clib_error_return_unix (0, "connect (fd %d, '%s')", @@ -434,7 +540,7 @@ clib_socket_accept (clib_socket_t * server, clib_socket_t * client) goto close_client; } - client->flags = SOCKET_IS_CLIENT; + client->flags = CLIB_SOCKET_F_IS_CLIENT; socket_init_funcs (client); return 0; diff --git a/src/vppinfra/socket.h b/src/vppinfra/socket.h index 75037208..4f9e9509 100644 --- a/src/vppinfra/socket.h +++ b/src/vppinfra/socket.h @@ -55,13 +55,14 @@ typedef struct _socket_t char *config; u32 flags; -#define SOCKET_IS_SERVER (1 << 0) -#define SOCKET_IS_CLIENT (0 << 0) -#define SOCKET_NON_BLOCKING_CONNECT (1 << 1) -#define SOCKET_ALLOW_GROUP_WRITE (1 << 2) +#define CLIB_SOCKET_F_IS_SERVER (1 << 0) +#define CLIB_SOCKET_F_IS_CLIENT (0 << 0) +#define CLIB_SOCKET_F_RX_END_OF_FILE (1 << 2) +#define CLIB_SOCKET_F_NON_BLOCKING_CONNECT (1 << 3) +#define CLIB_SOCKET_F_ALLOW_GROUP_WRITE (1 << 4) +#define CLIB_SOCKET_F_SEQPACKET (1 << 5) +#define CLIB_SOCKET_F_PASSCRED (1 << 6) - /* Read returned end-of-file. */ -#define SOCKET_RX_END_OF_FILE (1 << 2) /* Transmit buffer. Holds data waiting to be written. */ u8 *tx_buffer; @@ -72,10 +73,19 @@ typedef struct _socket_t /* Peer socket we are connected to. */ struct sockaddr_in peer; + /* Credentials, populated if CLIB_SOCKET_F_PASSCRED is set */ + pid_t pid; + uid_t uid; + gid_t gid; + clib_error_t *(*write_func) (struct _socket_t * sock); clib_error_t *(*read_func) (struct _socket_t * sock, int min_bytes); clib_error_t *(*close_func) (struct _socket_t * sock); - void *private_data; + clib_error_t *(*recvmsg_func) (struct _socket_t * s, void *msg, int msglen, + int fds[], int num_fds); + clib_error_t *(*sendmsg_func) (struct _socket_t * s, void *msg, int msglen, + int fds[], int num_fds); + uword private_data; } clib_socket_t; /* socket config format is host:port. @@ -89,7 +99,7 @@ clib_error_t *clib_socket_accept (clib_socket_t * server, always_inline uword clib_socket_is_server (clib_socket_t * sock) { - return (sock->flags & SOCKET_IS_SERVER) != 0; + return (sock->flags & CLIB_SOCKET_F_IS_SERVER) != 0; } always_inline uword @@ -98,10 +108,17 @@ clib_socket_is_client (clib_socket_t * s) return !clib_socket_is_server (s); } +always_inline uword +clib_socket_is_connected (clib_socket_t * sock) +{ + return sock->fd > 0; +} + + always_inline int clib_socket_rx_end_of_file (clib_socket_t * s) { - return s->flags & SOCKET_RX_END_OF_FILE; + return s->flags & CLIB_SOCKET_F_RX_END_OF_FILE; } always_inline void * @@ -130,6 +147,20 @@ clib_socket_rx (clib_socket_t * s, int n_bytes) return s->read_func (s, n_bytes); } +always_inline clib_error_t * +clib_socket_sendmsg (clib_socket_t * s, void *msg, int msglen, + int fds[], int num_fds) +{ + return s->sendmsg_func (s, msg, msglen, fds, num_fds); +} + +always_inline clib_error_t * +clib_socket_recvmsg (clib_socket_t * s, void *msg, int msglen, + int fds[], int num_fds) +{ + return s->recvmsg_func (s, msg, msglen, fds, num_fds); +} + always_inline void clib_socket_free (clib_socket_t * s) { diff --git a/src/vppinfra/test_socket.c b/src/vppinfra/test_socket.c index 0b05467a..2f25eccd 100644 --- a/src/vppinfra/test_socket.c +++ b/src/vppinfra/test_socket.c @@ -50,15 +50,15 @@ test_socket_main (unformat_input_t * input) clib_error_t *error; s->config = "localhost:22"; - s->flags = SOCKET_IS_CLIENT; + s->flags = CLIB_SOCKET_F_IS_CLIENT; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { if (unformat (input, "server %s %=", &config, - &s->flags, SOCKET_IS_SERVER)) + &s->flags, CLIB_SOCKET_F_IS_SERVER)) ; else if (unformat (input, "client %s %=", &config, - &s->flags, SOCKET_IS_CLIENT)) + &s->flags, CLIB_SOCKET_F_IS_CLIENT)) ; else { -- cgit 1.2.3-korg From 01914ce45729833cec88c65689de9a0336cd40cc Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 14 Sep 2017 19:04:50 +0200 Subject: vppinfra: add clib_mem_vm_ext_alloc function Change-Id: Iff33694fc42cc3bcc73cf1372339053a6365039c Signed-off-by: Damjan Marion --- src/plugins/dpdk/device/init.c | 6 +- src/plugins/memif/memif.c | 21 ++- src/vlib.am | 5 +- src/vlib/linux/pci.c | 25 ++-- src/vlib/linux/physmem.c | 192 ++++-------------------- src/vlib/linux/syscall.h | 58 -------- src/vlib/linux/sysfs.c | 250 ------------------------------- src/vlib/linux/sysfs.h | 44 ------ src/vlib/threads.c | 6 +- src/vlib/threads_cli.c | 6 +- src/vnet/devices/af_packet/af_packet.c | 4 +- src/vppinfra.am | 5 +- src/vppinfra/linux/mem.c | 260 +++++++++++++++++++++++++++++++++ src/vppinfra/linux/syscall.h | 56 +++++++ src/vppinfra/linux/sysfs.c | 250 +++++++++++++++++++++++++++++++ src/vppinfra/linux/sysfs.h | 46 ++++++ src/vppinfra/mem.h | 94 ++++++++++-- src/vppinfra/vm_linux_kernel.h | 78 ---------- src/vppinfra/vm_standalone.h | 74 ---------- src/vppinfra/vm_unix.h | 106 -------------- 20 files changed, 761 insertions(+), 825 deletions(-) delete mode 100644 src/vlib/linux/syscall.h delete mode 100644 src/vlib/linux/sysfs.c delete mode 100644 src/vlib/linux/sysfs.h create mode 100644 src/vppinfra/linux/mem.c create mode 100644 src/vppinfra/linux/syscall.h create mode 100644 src/vppinfra/linux/sysfs.c create mode 100644 src/vppinfra/linux/sysfs.h delete mode 100644 src/vppinfra/vm_linux_kernel.h delete mode 100644 src/vppinfra/vm_standalone.h delete mode 100644 src/vppinfra/vm_unix.h (limited to 'src/vlib') diff --git a/src/plugins/dpdk/device/init.c b/src/plugins/dpdk/device/init.c index 95176fb8..ee61f94e 100755 --- a/src/plugins/dpdk/device/init.c +++ b/src/plugins/dpdk/device/init.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include @@ -1040,7 +1040,7 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) mem = mem_by_socket[c]; page_size = 1024; - e = vlib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail); + e = clib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail); if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem) use_1g = 0; @@ -1049,7 +1049,7 @@ dpdk_config (vlib_main_t * vm, unformat_input_t * input) clib_error_free (e); page_size = 2; - e = vlib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail); + e = clib_sysfs_get_free_hugepages(c, page_size * 1024, &pages_avail); if (e != 0 || pages_avail < 0 || page_size * pages_avail < mem) use_2m = 0; diff --git a/src/plugins/memif/memif.c b/src/plugins/memif/memif.c index 8fec409a..6a609a57 100644 --- a/src/plugins/memif/memif.c +++ b/src/plugins/memif/memif.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include #include @@ -267,6 +267,8 @@ memif_init_regions_and_queues (memif_if_t * mif) int i, j; u64 buffer_offset; memif_region_t *r; + clib_mem_vm_alloc_t alloc = { 0 }; + clib_error_t *err; vec_validate_aligned (mif->regions, 0, CLIB_CACHE_LINE_BYTES); r = vec_elt_at_index (mif->regions, 0); @@ -279,18 +281,15 @@ memif_init_regions_and_queues (memif_if_t * mif) mif->run.buffer_size * (1 << mif->run.log2_ring_size) * (mif->run.num_s2m_rings + mif->run.num_m2s_rings); - if ((r->fd = memfd_create ("memif region 0", MFD_ALLOW_SEALING)) == -1) - return clib_error_return_unix (0, "memfd_create"); - - if ((fcntl (r->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1) - return clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)"); + alloc.name = "memif region"; + alloc.size = r->region_size; + alloc.flags = CLIB_MEM_VM_F_SHARED; - if ((ftruncate (r->fd, r->region_size)) == -1) - return clib_error_return_unix (0, "ftruncate"); + err = clib_mem_vm_ext_alloc (&alloc); + if (err) + return err; - if ((r->shm = mmap (NULL, r->region_size, PROT_READ | PROT_WRITE, - MAP_SHARED, r->fd, 0)) == MAP_FAILED) - return clib_error_return_unix (0, "mmap"); + r->fd = alloc.fd; for (i = 0; i < mif->run.num_s2m_rings; i++) { diff --git a/src/vlib.am b/src/vlib.am index 41d68690..067e4afc 100644 --- a/src/vlib.am +++ b/src/vlib.am @@ -13,7 +13,7 @@ lib_LTLIBRARIES += libvlib.la -libvlib_la_LIBADD = libvppinfra.la -ldl -lpthread -lnuma +libvlib_la_LIBADD = libvppinfra.la -ldl -lpthread libvlib_la_DEPENDENCIES = libvppinfra.la BUILT_SOURCES += vlib/config.h @@ -34,7 +34,6 @@ libvlib_la_SOURCES = \ vlib/init.c \ vlib/linux/pci.c \ vlib/linux/physmem.c \ - vlib/linux/sysfs.c \ vlib/main.c \ vlib/mc.c \ vlib/node.c \ @@ -60,8 +59,6 @@ nobase_include_HEADERS += \ vlib/global_funcs.h \ vlib/i2c.h \ vlib/init.h \ - vlib/linux/sysfs.h \ - vlib/linux/syscall.h \ vlib/main.h \ vlib/mc.h \ vlib/node_funcs.h \ diff --git a/src/vlib/linux/pci.c b/src/vlib/linux/pci.c index 4ce19190..790f168a 100644 --- a/src/vlib/linux/pci.c +++ b/src/vlib/linux/pci.c @@ -37,10 +37,11 @@ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include -#include #include #include @@ -104,7 +105,7 @@ vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name) format_vlib_pci_addr, &d->bus_address); s = format (s, "%v/driver%c", dev_dir_name, 0); - driver_name = vlib_sysfs_link_to_name ((char *) s); + driver_name = clib_sysfs_link_to_name ((char *) s); vec_reset_length (s); if (driver_name && @@ -183,32 +184,32 @@ vlib_pci_bind_to_uio (vlib_pci_device_t * d, char *uio_driver_name) vec_reset_length (s); s = format (s, "%v/driver/unbind%c", dev_dir_name, 0); - vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); + clib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); vec_reset_length (s); s = format (s, "%v/driver_override%c", dev_dir_name, 0); if (access ((char *) s, F_OK) == 0) { - vlib_sysfs_write ((char *) s, "%s", uio_driver_name); + clib_sysfs_write ((char *) s, "%s", uio_driver_name); clear_driver_override = 1; } else { vec_reset_length (s); s = format (s, "/sys/bus/pci/drivers/%s/new_id%c", uio_driver_name, 0); - vlib_sysfs_write ((char *) s, "0x%04x 0x%04x", d->vendor_id, + clib_sysfs_write ((char *) s, "0x%04x 0x%04x", d->vendor_id, d->device_id); } vec_reset_length (s); s = format (s, "/sys/bus/pci/drivers/%s/bind%c", uio_driver_name, 0); - vlib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); + clib_sysfs_write ((char *) s, "%U", format_vlib_pci_addr, &d->bus_address); vec_reset_length (s); if (clear_driver_override) { s = format (s, "%v/driver_override%c", dev_dir_name, 0); - vlib_sysfs_write ((char *) s, "%c", 0); + clib_sysfs_write ((char *) s, "%c", 0); vec_reset_length (s); } @@ -602,28 +603,28 @@ scan_device (void *arg, u8 * dev_dir_name, u8 * ignored) dev->numa_node = -1; vec_reset_length (f); f = format (f, "%v/numa_node%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "%u", &dev->numa_node); + clib_sysfs_read ((char *) f, "%u", &dev->numa_node); vec_reset_length (f); f = format (f, "%v/class%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "0x%x", &tmp); + clib_sysfs_read ((char *) f, "0x%x", &tmp); dev->device_class = tmp >> 8; vec_reset_length (f); f = format (f, "%v/vendor%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "0x%x", &tmp); + clib_sysfs_read ((char *) f, "0x%x", &tmp); dev->vendor_id = tmp; vec_reset_length (f); f = format (f, "%v/device%c", dev_dir_name, 0); - vlib_sysfs_read ((char *) f, "0x%x", &tmp); + clib_sysfs_read ((char *) f, "0x%x", &tmp); dev->device_id = tmp; error = init_device (vm, dev, &pdev); vec_reset_length (f); f = format (f, "%v/driver%c", dev_dir_name, 0); - dev->driver_name = vlib_sysfs_link_to_name ((char *) f); + dev->driver_name = clib_sysfs_link_to_name ((char *) f); done: vec_free (f); diff --git a/src/vlib/linux/physmem.c b/src/vlib/linux/physmem.c index d8c5dc9b..3cc42a06 100644 --- a/src/vlib/linux/physmem.c +++ b/src/vlib/linux/physmem.c @@ -43,14 +43,12 @@ #include #include #include -#include -#include +#include +#include #include #include #include -#include -#include static void * unix_physmem_alloc_aligned (vlib_main_t * vm, vlib_physmem_region_index_t idx, @@ -111,31 +109,6 @@ unix_physmem_free (vlib_main_t * vm, vlib_physmem_region_index_t idx, void *x) mheap_put (pr->heap, x - pr->heap); } -static u64 -get_page_paddr (int fd, uword addr) -{ - int pagesize = sysconf (_SC_PAGESIZE); - u64 seek, pagemap = 0; - - seek = ((u64) addr / pagesize) * sizeof (u64); - if (lseek (fd, seek, SEEK_SET) != seek) - { - clib_unix_warning ("lseek to 0x%llx", seek); - return 0; - } - if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap))) - { - clib_unix_warning ("read ptbits"); - return 0; - } - if ((pagemap & (1ULL << 63)) == 0) - return 0; - - pagemap &= pow2_mask (55); - - return pagemap * pagesize; -} - static clib_error_t * unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, u8 numa_node, u32 flags, @@ -144,13 +117,8 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, vlib_physmem_main_t *vpm = &vm->physmem_main; vlib_physmem_region_t *pr; clib_error_t *error = 0; - int pagemap_fd = -1; - u8 *mount_dir = 0; - u8 *filename = 0; - struct stat st; - int old_mpol; - int mmap_flags; - struct bitmask *old_mask = numa_allocate_nodemask (); + clib_mem_vm_alloc_t alloc = { 0 }; + if (geteuid () != 0 && (flags & VLIB_PHYSMEM_F_FAKE) == 0) return clib_error_return (0, "not allowed"); @@ -163,113 +131,32 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, goto error; } - pr->index = pr - vpm->regions; - pr->fd = -1; - pr->flags = flags; - - if (get_mempolicy (&old_mpol, old_mask->maskp, old_mask->size + 1, NULL, 0) - == -1) - { - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - error = clib_error_return_unix (0, "get_mempolicy"); - goto error; - } - else - old_mpol = -1; - } + alloc.name = name; + alloc.size = size; + alloc.numa_node = numa_node; + alloc.flags = CLIB_MEM_VM_F_SHARED; if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) { - if ((pagemap_fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1) - { - error = clib_error_return_unix (0, "open '/proc/self/pagemap'"); - goto error; - } - - mount_dir = format (0, "%s/physmem_region%d%c", - vlib_unix_get_runtime_dir (), pr->index, 0); - filename = format (0, "%s/mem%c", mount_dir, 0); - - unlink ((char *) mount_dir); - - error = vlib_unix_recursive_mkdir ((char *) mount_dir); - if (error) - goto error; - - if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL)) - { - error = clib_error_return_unix (0, "mount hugetlb directory '%s'", - mount_dir); - goto error; - } - - if ((pr->fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1) - { - error = clib_error_return_unix (0, "open"); - goto error; - } - - mmap_flags = MAP_SHARED | MAP_HUGETLB | MAP_LOCKED; + alloc.flags |= CLIB_MEM_VM_F_HUGETLB; + alloc.flags |= CLIB_MEM_VM_F_HUGETLB_PREALLOC; + alloc.flags |= CLIB_MEM_VM_F_NUMA_FORCE; } else { - if ((pr->fd = memfd_create (name, MFD_ALLOW_SEALING)) == -1) - return clib_error_return_unix (0, "memfd_create"); - - if ((fcntl (pr->fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1) - { - error = - clib_error_return_unix (0, "fcntl (F_ADD_SEALS, F_SEAL_SHRINK)"); - goto error; - } - mmap_flags = MAP_SHARED; - } - - if (fstat (pr->fd, &st)) - { - error = clib_error_return_unix (0, "fstat"); - goto error; - } - - pr->log2_page_size = min_log2 (st.st_blksize); - pr->n_pages = ((size - 1) >> pr->log2_page_size) + 1; - size = pr->n_pages * (1 << pr->log2_page_size); - - if ((ftruncate (pr->fd, size)) == -1) - { - error = clib_error_return_unix (0, "ftruncate length: %d", size); - goto error; - } - - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - error = vlib_sysfs_prealloc_hugepages (numa_node, - 1 << (pr->log2_page_size - 10), - pr->n_pages); - if (error) - goto error; - } - - if (old_mpol != -1) - numa_set_preferred (numa_node); - - pr->mem = mmap (0, size, (PROT_READ | PROT_WRITE), mmap_flags, pr->fd, 0); - - if (pr->mem == MAP_FAILED) - { - pr->mem = 0; - error = clib_error_return_unix (0, "mmap"); - goto error; + alloc.flags |= CLIB_MEM_VM_F_NUMA_PREFER; } - if (old_mpol != -1 && - set_mempolicy (old_mpol, old_mask->maskp, old_mask->size + 1) == -1) - { - error = clib_error_return_unix (0, "set_mempolicy"); - goto error; - } + error = clib_mem_vm_ext_alloc (&alloc); + if (error) + goto error; + pr->index = pr - vpm->regions; + pr->flags = flags; + pr->fd = alloc.fd; + pr->mem = alloc.addr; + pr->log2_page_size = alloc.log2_page_size; + pr->n_pages = alloc.n_pages; pr->size = pr->n_pages << pr->log2_page_size; pr->page_mask = (1 << pr->log2_page_size) - 1; pr->numa_node = numa_node; @@ -285,13 +172,14 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, move_pages (0, 1, &ptr, 0, &node, 0); if (numa_node != node) { - clib_warning - ("physmem page for region \'%s\' allocated on the wrong" - " numa node (requested %u actual %u)", pr->name, - pr->numa_node, node, i); + clib_warning ("physmem page for region \'%s\' allocated on the" + " wrong numa node (requested %u actual %u)", + pr->name, pr->numa_node, node, i); break; } } + pr->page_table = clib_mem_vm_get_paddr (pr->mem, pr->log2_page_size, + pr->n_pages); } if (flags & VLIB_PHYSMEM_F_INIT_MHEAP) @@ -309,41 +197,13 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, *idx = pr->index; - if ((flags & VLIB_PHYSMEM_F_FAKE) == 0) - { - int i; - for (i = 0; i < pr->n_pages; i++) - { - uword vaddr = - pointer_to_uword (pr->mem) + (((u64) i) << pr->log2_page_size); - u64 page_paddr = get_page_paddr (pagemap_fd, vaddr); - vec_add1 (pr->page_table, page_paddr); - } - } - goto done; error: - if (pr->fd > -1) - close (pr->fd); - - if (pr->mem) - munmap (pr->mem, size); - memset (pr, 0, sizeof (*pr)); pool_put (vpm->regions, pr); done: - if (mount_dir) - { - umount2 ((char *) mount_dir, MNT_DETACH); - rmdir ((char *) mount_dir); - vec_free (mount_dir); - } - numa_free_cpumask (old_mask); - vec_free (filename); - if (pagemap_fd > -1) - close (pagemap_fd); return error; } diff --git a/src/vlib/linux/syscall.h b/src/vlib/linux/syscall.h deleted file mode 100644 index 9e37997e..00000000 --- a/src/vlib/linux/syscall.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2017 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef included_linux_syscall_h -#define included_linux_syscall_h - -#ifndef __NR_memfd_create -#if defined __x86_64__ -#define __NR_memfd_create 319 -#elif defined __arm__ -#define __NR_memfd_create 385 -#elif defined __aarch64__ -#define __NR_memfd_create 279 -#else -#error "__NR_memfd_create unknown for this architecture" -#endif -#endif - -static inline int -memfd_create (const char *name, unsigned int flags) -{ - return syscall (__NR_memfd_create, name, flags); -} - -#ifndef F_LINUX_SPECIFIC_BASE -#define F_LINUX_SPECIFIC_BASE 1024 -#endif -#define MFD_ALLOW_SEALING 0x0002U -#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) -#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) - -#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ -#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ -#define F_SEAL_GROW 0x0004 /* prevent file from growing */ -#define F_SEAL_WRITE 0x0008 /* prevent writes */ - - -#endif /* included_linux_syscall_h */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vlib/linux/sysfs.c b/src/vlib/linux/sysfs.c deleted file mode 100644 index f92f9ef5..00000000 --- a/src/vlib/linux/sysfs.c +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Copyright (c) 2017 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include -#include -#include - -clib_error_t * -vlib_sysfs_write (char *file_name, char *fmt, ...) -{ - u8 *s; - int fd; - clib_error_t *error = 0; - - fd = open (file_name, O_WRONLY); - if (fd < 0) - return clib_error_return_unix (0, "open `%s'", file_name); - - va_list va; - va_start (va, fmt); - s = va_format (0, fmt, &va); - va_end (va); - - if (write (fd, s, vec_len (s)) < 0) - error = clib_error_return_unix (0, "write `%s'", file_name); - - vec_free (s); - close (fd); - return error; -} - -clib_error_t * -vlib_sysfs_read (char *file_name, char *fmt, ...) -{ - unformat_input_t input; - u8 *s = 0; - int fd; - ssize_t sz; - uword result; - - fd = open (file_name, O_RDONLY); - if (fd < 0) - return clib_error_return_unix (0, "open `%s'", file_name); - - vec_validate (s, 4095); - - sz = read (fd, s, vec_len (s)); - if (sz < 0) - { - close (fd); - vec_free (s); - return clib_error_return_unix (0, "read `%s'", file_name); - } - - _vec_len (s) = sz; - unformat_init_vector (&input, s); - - va_list va; - va_start (va, fmt); - result = va_unformat (&input, fmt, &va); - va_end (va); - - vec_free (s); - close (fd); - - if (result == 0) - return clib_error_return (0, "unformat error"); - - return 0; -} - -u8 * -vlib_sysfs_link_to_name (char *link) -{ - char *p, buffer[64]; - unformat_input_t in; - u8 *s = 0; - int r; - - r = readlink (link, buffer, sizeof (buffer) - 1); - - if (r < 0) - return 0; - - buffer[r] = 0; - p = strrchr (buffer, '/'); - - if (!p) - return 0; - - unformat_init_string (&in, p + 1, strlen (p + 1)); - if (unformat (&in, "%s", &s) != 1) - clib_unix_warning ("no string?"); - unformat_free (&in); - - return s; -} - -clib_error_t * -vlib_sysfs_set_nr_hugepages (unsigned int numa_node, int page_size, int nr) -{ - clib_error_t *error = 0; - struct stat sb; - u8 *p = 0; - - p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); - - if (stat ((char *) p, &sb) == 0) - { - if (S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' is not directory", p); - goto done; - } - } - else if (numa_node == 0) - { - vec_reset_length (p); - p = format (p, "/sys/kernel/mm%c", 0); - if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' does not exist or it is not " - "directory", p); - goto done; - } - } - else - { - error = clib_error_return (0, "'%s' does not exist", p); - goto done; - } - - _vec_len (p) -= 1; - p = format (p, "/hugepages/hugepages-%ukB/nr_hugepages%c", page_size, 0); - vlib_sysfs_write ((char *) p, "%d", nr); - -done: - vec_free (p); - return error; -} - - -static clib_error_t * -vlib_sysfs_get_xxx_hugepages (char *type, unsigned int numa_node, - int page_size, int *val) -{ - clib_error_t *error = 0; - struct stat sb; - u8 *p = 0; - - p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); - - if (stat ((char *) p, &sb) == 0) - { - if (S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' is not directory", p); - goto done; - } - } - else if (numa_node == 0) - { - vec_reset_length (p); - p = format (p, "/sys/kernel/mm%c", 0); - if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) - { - error = clib_error_return (0, "'%s' does not exist or it is not " - "directory", p); - goto done; - } - } - else - { - error = clib_error_return (0, "'%s' does not exist", p); - goto done; - } - - _vec_len (p) -= 1; - p = format (p, "/hugepages/hugepages-%ukB/%s_hugepages%c", page_size, - type, 0); - error = vlib_sysfs_read ((char *) p, "%d", val); - -done: - vec_free (p); - return error; -} - -clib_error_t * -vlib_sysfs_get_free_hugepages (unsigned int numa_node, int page_size, int *v) -{ - return vlib_sysfs_get_xxx_hugepages ("free", numa_node, page_size, v); -} - -clib_error_t * -vlib_sysfs_get_nr_hugepages (unsigned int numa_node, int page_size, int *v) -{ - return vlib_sysfs_get_xxx_hugepages ("nr", numa_node, page_size, v); -} - -clib_error_t * -vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, int page_size, - int *v) -{ - return vlib_sysfs_get_xxx_hugepages ("surplus", numa_node, page_size, v); -} - -clib_error_t * -vlib_sysfs_prealloc_hugepages (unsigned int numa_node, int page_size, int nr) -{ - clib_error_t *error = 0; - int n, needed; - error = vlib_sysfs_get_free_hugepages (numa_node, page_size, &n); - if (error) - return error; - needed = nr - n; - if (needed <= 0) - return 0; - - error = vlib_sysfs_get_nr_hugepages (numa_node, page_size, &n); - if (error) - return error; - clib_warning ("pre-allocating %u additional %uK hugepages on numa node %u", - needed, page_size, numa_node); - return vlib_sysfs_set_nr_hugepages (numa_node, page_size, n + needed); -} - - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vlib/linux/sysfs.h b/src/vlib/linux/sysfs.h deleted file mode 100644 index 14b71317..00000000 --- a/src/vlib/linux/sysfs.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2017 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef included_linux_sysfs_h -#define included_linux_sysfs_h - -clib_error_t *vlib_sysfs_write (char *file_name, char *fmt, ...); - -clib_error_t *vlib_sysfs_read (char *file_name, char *fmt, ...); - -u8 *vlib_sysfs_link_to_name (char *link); - -clib_error_t *vlib_sysfs_set_nr_hugepages (unsigned int numa_node, - int page_size, int nr); -clib_error_t *vlib_sysfs_get_nr_hugepages (unsigned int numa_node, - int page_size, int *v); -clib_error_t *vlib_sysfs_get_free_hugepages (unsigned int numa_node, - int page_size, int *v); -clib_error_t *vlib_sysfs_get_surplus_hugepages (unsigned int numa_node, - int page_size, int *v); -clib_error_t *vlib_sysfs_prealloc_hugepages (unsigned int numa_node, - int page_size, int nr); - -#endif /* included_linux_sysfs_h */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vlib/threads.c b/src/vlib/threads.c index 2d9ce84a..f9c7043c 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -289,7 +289,7 @@ sort_registrations_by_no_clone (void *a0, void *a1) } static uword * -vlib_sysfs_list_to_bitmap (char *filename) +clib_sysfs_list_to_bitmap (char *filename) { FILE *fp; uword *r = 0; @@ -331,9 +331,9 @@ vlib_thread_init (vlib_main_t * vm) /* get bitmaps of active cpu cores and sockets */ tm->cpu_core_bitmap = - vlib_sysfs_list_to_bitmap ("/sys/devices/system/cpu/online"); + clib_sysfs_list_to_bitmap ("/sys/devices/system/cpu/online"); tm->cpu_socket_bitmap = - vlib_sysfs_list_to_bitmap ("/sys/devices/system/node/online"); + clib_sysfs_list_to_bitmap ("/sys/devices/system/node/online"); avail_cpu = clib_bitmap_dup (tm->cpu_core_bitmap); diff --git a/src/vlib/threads_cli.c b/src/vlib/threads_cli.c index f8d5d8f9..02bdea5c 100644 --- a/src/vlib/threads_cli.c +++ b/src/vlib/threads_cli.c @@ -15,10 +15,10 @@ #define _GNU_SOURCE #include +#include #include #include -#include #include static u8 * @@ -98,14 +98,14 @@ show_threads_fn (vlib_main_t * vm, u8 *p = 0; p = format (p, "%s%u/topology/core_id%c", sys_cpu_path, lcore, 0); - vlib_sysfs_read ((char *) p, "%d", &core_id); + clib_sysfs_read ((char *) p, "%d", &core_id); vec_reset_length (p); p = format (p, "%s%u/topology/physical_package_id%c", sys_cpu_path, lcore, 0); - vlib_sysfs_read ((char *) p, "%d", &socket_id); + clib_sysfs_read ((char *) p, "%d", &socket_id); vec_free (p); line = format (line, "%-7u%-7u%-7u%", lcore, core_id, socket_id); diff --git a/src/vnet/devices/af_packet/af_packet.c b/src/vnet/devices/af_packet/af_packet.c index 62bb228f..32696014 100644 --- a/src/vnet/devices/af_packet/af_packet.c +++ b/src/vnet/devices/af_packet/af_packet.c @@ -24,9 +24,9 @@ #include #include +#include #include #include -#include #include #include @@ -75,7 +75,7 @@ af_packet_eth_flag_change (vnet_main_t * vnm, vnet_hw_interface_t * hi, { s = format (0, "/sys/class/net/%s/mtu%c", apif->host_if_name, 0); - error = vlib_sysfs_write ((char *) s, "%d", hi->max_packet_bytes); + error = clib_sysfs_write ((char *) s, "%d", hi->max_packet_bytes); vec_free (s); if (error) diff --git a/src/vppinfra.am b/src/vppinfra.am index a5769a0d..daca9954 100644 --- a/src/vppinfra.am +++ b/src/vppinfra.am @@ -188,6 +188,8 @@ nobase_include_HEADERS = \ vppinfra/graph.h \ vppinfra/hash.h \ vppinfra/heap.h \ + vppinfra/linux/sysfs.h \ + vppinfra/linux/syscall.h \ vppinfra/lock.h \ vppinfra/longjmp.h \ vppinfra/macros.h \ @@ -233,7 +235,6 @@ nobase_include_HEADERS = \ vppinfra/vector_neon.h \ vppinfra/vector_sse2.h \ vppinfra/valgrind.h \ - vppinfra/vm_unix.h \ vppinfra/xxhash.h \ vppinfra/xy.h \ vppinfra/zvec.h @@ -291,6 +292,8 @@ CLIB_CORE = \ libvppinfra_la_SOURCES = \ $(CLIB_CORE) \ vppinfra/elf_clib.c \ + vppinfra/linux/mem.c \ + vppinfra/linux/sysfs.c \ vppinfra/socket.c \ vppinfra/timer.c \ vppinfra/unix-formats.c \ diff --git a/src/vppinfra/linux/mem.c b/src/vppinfra/linux/mem.c new file mode 100644 index 00000000..665ddf61 --- /dev/null +++ b/src/vppinfra/linux/mem.c @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif + +#ifndef F_ADD_SEALS +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ +#endif + +int +clib_mem_vm_get_log2_page_size (int fd) +{ + struct stat st = { 0 }; + if (fstat (fd, &st)) + return 0; + return min_log2 (st.st_blksize); +} + +clib_error_t * +clib_mem_vm_ext_alloc (clib_mem_vm_alloc_t * a) +{ + int fd = -1; + clib_error_t *err = 0; + void *addr = 0; + u8 *filename = 0; + int mmap_flags = MAP_SHARED; + int log2_page_size; + int n_pages; + int old_mpol = -1; + u64 old_mask[16] = { 0 }; + + /* save old numa mem policy if needed */ + if (a->flags & (CLIB_MEM_VM_F_NUMA_PREFER | CLIB_MEM_VM_F_NUMA_FORCE)) + { + int rv; + rv = + get_mempolicy (&old_mpol, old_mask, sizeof (old_mask) * 8 + 1, 0, 0); + + if (rv == -1) + { + if ((a->flags & CLIB_MEM_VM_F_NUMA_FORCE) != 0) + { + err = clib_error_return_unix (0, "get_mempolicy"); + goto error; + } + else + old_mpol = -1; + } + } + + /* if we are creating shared segment, we need file descriptor */ + if (a->flags & CLIB_MEM_VM_F_SHARED) + { + /* if hugepages are needed we need to create mount point */ + if (a->flags & CLIB_MEM_VM_F_HUGETLB) + { + char *mount_dir; + char template[] = "/tmp/hugepage_mount.XXXXXX"; + + mount_dir = mkdtemp (template); + if (mount_dir == 0) + return clib_error_return_unix (0, "mkdtemp \'%s\'", template); + + if (mount ("none", (char *) mount_dir, "hugetlbfs", 0, NULL)) + { + err = clib_error_return_unix (0, "mount hugetlb directory '%s'", + mount_dir); + goto error; + } + + filename = format (0, "%s/%s%c", mount_dir, a->name, 0); + + if ((fd = open ((char *) filename, O_CREAT | O_RDWR, 0755)) == -1) + { + err = clib_error_return_unix (0, "open"); + goto error; + } + umount2 ((char *) mount_dir, MNT_DETACH); + rmdir ((char *) mount_dir); + mmap_flags |= MAP_LOCKED; + } + else + { + if ((fd = memfd_create (a->name, MFD_ALLOW_SEALING)) == -1) + { + err = clib_error_return_unix (0, "memfd_create"); + goto error; + } + + if ((fcntl (fd, F_ADD_SEALS, F_SEAL_SHRINK)) == -1) + { + err = clib_error_return_unix (0, "fcntl (F_ADD_SEALS)"); + goto error; + } + } + log2_page_size = clib_mem_vm_get_log2_page_size (fd); + } + else /* not CLIB_MEM_VM_F_SHARED */ + { + if (a->flags & CLIB_MEM_VM_F_HUGETLB) + { + mmap_flags |= MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS; + log2_page_size = 21; + } + else + { + mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS; + log2_page_size = min_log2 (sysconf (_SC_PAGESIZE)); + } + } + + n_pages = ((a->size - 1) >> log2_page_size) + 1; + + + if (a->flags & CLIB_MEM_VM_F_HUGETLB_PREALLOC) + { + err = clib_sysfs_prealloc_hugepages (a->numa_node, + 1 << (log2_page_size - 10), + n_pages); + if (err) + goto error; + + } + + if (fd != -1) + if ((ftruncate (fd, a->size)) == -1) + { + err = clib_error_return_unix (0, "ftruncate"); + goto error; + } + + if (old_mpol != -1) + { + int rv; + u64 mask[16] = { 0 }; + mask[0] = 1 << a->numa_node; + rv = set_mempolicy (MPOL_BIND, mask, sizeof (mask) * 8 + 1); + if (rv) + { + err = clib_error_return_unix (0, "set_mempolicy"); + goto error; + } + } + + addr = mmap (0, a->size, (PROT_READ | PROT_WRITE), mmap_flags, fd, 0); + if (addr == MAP_FAILED) + { + err = clib_error_return_unix (0, "mmap"); + goto error; + } + + /* re-apply ole numa memory policy */ + if (old_mpol != -1 && + set_mempolicy (old_mpol, old_mask, sizeof (old_mask) * 8 + 1) == -1) + { + err = clib_error_return_unix (0, "set_mempolicy"); + goto error; + } + + a->log2_page_size = log2_page_size; + a->n_pages = n_pages; + a->addr = addr; + a->fd = fd; + goto done; + +error: + if (fd != -1) + close (fd); + +done: + vec_free (filename); + return err; +} + +u64 * +clib_mem_vm_get_paddr (void *mem, int log2_page_size, int n_pages) +{ + int pagesize = sysconf (_SC_PAGESIZE); + int fd; + int i; + u64 *r = 0; + + if ((fd = open ((char *) "/proc/self/pagemap", O_RDONLY)) == -1) + return 0; + + for (i = 0; i < n_pages; i++) + { + u64 seek, pagemap = 0; + uword vaddr = pointer_to_uword (mem) + (((u64) i) << log2_page_size); + seek = ((u64) vaddr / pagesize) * sizeof (u64); + if (lseek (fd, seek, SEEK_SET) != seek) + goto done; + + if (read (fd, &pagemap, sizeof (pagemap)) != (sizeof (pagemap))) + goto done; + + if ((pagemap & (1ULL << 63)) == 0) + goto done; + + pagemap &= pow2_mask (55); + vec_add1 (r, pagemap * pagesize); + } + +done: + close (fd); + if (vec_len (r) != n_pages) + { + vec_free (r); + return 0; + } + return r; +} + + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vppinfra/linux/syscall.h b/src/vppinfra/linux/syscall.h new file mode 100644 index 00000000..f8ec5919 --- /dev/null +++ b/src/vppinfra/linux/syscall.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_linux_syscall_h +#define included_linux_syscall_h + +#include +#include + +static inline long +set_mempolicy (int mode, const unsigned long *nodemask, unsigned long maxnode) +{ + return syscall (__NR_set_mempolicy, mode, nodemask, maxnode); +} + +static inline int +get_mempolicy (int *mode, unsigned long *nodemask, unsigned long maxnode, + void *addr, unsigned long flags) +{ + return syscall (__NR_get_mempolicy, mode, nodemask, maxnode, addr, flags); +} + +static inline long +move_pages (int pid, unsigned long count, void **pages, const int *nodes, + int *status, int flags) +{ + return syscall (__NR_move_pages, pid, count, pages, nodes, status, flags); +} + +static inline int +memfd_create (const char *name, unsigned int flags) +{ + return syscall (__NR_memfd_create, name, flags); +} + +#endif /* included_linux_syscall_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vppinfra/linux/sysfs.c b/src/vppinfra/linux/sysfs.c new file mode 100644 index 00000000..5f611e6a --- /dev/null +++ b/src/vppinfra/linux/sysfs.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include + +clib_error_t * +clib_sysfs_write (char *file_name, char *fmt, ...) +{ + u8 *s; + int fd; + clib_error_t *error = 0; + + fd = open (file_name, O_WRONLY); + if (fd < 0) + return clib_error_return_unix (0, "open `%s'", file_name); + + va_list va; + va_start (va, fmt); + s = va_format (0, fmt, &va); + va_end (va); + + if (write (fd, s, vec_len (s)) < 0) + error = clib_error_return_unix (0, "write `%s'", file_name); + + vec_free (s); + close (fd); + return error; +} + +clib_error_t * +clib_sysfs_read (char *file_name, char *fmt, ...) +{ + unformat_input_t input; + u8 *s = 0; + int fd; + ssize_t sz; + uword result; + + fd = open (file_name, O_RDONLY); + if (fd < 0) + return clib_error_return_unix (0, "open `%s'", file_name); + + vec_validate (s, 4095); + + sz = read (fd, s, vec_len (s)); + if (sz < 0) + { + close (fd); + vec_free (s); + return clib_error_return_unix (0, "read `%s'", file_name); + } + + _vec_len (s) = sz; + unformat_init_vector (&input, s); + + va_list va; + va_start (va, fmt); + result = va_unformat (&input, fmt, &va); + va_end (va); + + vec_free (s); + close (fd); + + if (result == 0) + return clib_error_return (0, "unformat error"); + + return 0; +} + +u8 * +clib_sysfs_link_to_name (char *link) +{ + char *p, buffer[64]; + unformat_input_t in; + u8 *s = 0; + int r; + + r = readlink (link, buffer, sizeof (buffer) - 1); + + if (r < 0) + return 0; + + buffer[r] = 0; + p = strrchr (buffer, '/'); + + if (!p) + return 0; + + unformat_init_string (&in, p + 1, strlen (p + 1)); + if (unformat (&in, "%s", &s) != 1) + clib_unix_warning ("no string?"); + unformat_free (&in); + + return s; +} + +clib_error_t * +clib_sysfs_set_nr_hugepages (int numa_node, int page_size, int nr) +{ + clib_error_t *error = 0; + struct stat sb; + u8 *p = 0; + + p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); + + if (stat ((char *) p, &sb) == 0) + { + if (S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' is not directory", p); + goto done; + } + } + else if (numa_node == 0) + { + vec_reset_length (p); + p = format (p, "/sys/kernel/mm%c", 0); + if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' does not exist or it is not " + "directory", p); + goto done; + } + } + else + { + error = clib_error_return (0, "'%s' does not exist", p); + goto done; + } + + _vec_len (p) -= 1; + p = format (p, "/hugepages/hugepages-%ukB/nr_hugepages%c", page_size, 0); + clib_sysfs_write ((char *) p, "%d", nr); + +done: + vec_free (p); + return error; +} + + +static clib_error_t * +clib_sysfs_get_xxx_hugepages (char *type, int numa_node, + int page_size, int *val) +{ + clib_error_t *error = 0; + struct stat sb; + u8 *p = 0; + + p = format (p, "/sys/devices/system/node/node%u%c", numa_node, 0); + + if (stat ((char *) p, &sb) == 0) + { + if (S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' is not directory", p); + goto done; + } + } + else if (numa_node == 0) + { + vec_reset_length (p); + p = format (p, "/sys/kernel/mm%c", 0); + if (stat ((char *) p, &sb) < 0 || S_ISDIR (sb.st_mode) == 0) + { + error = clib_error_return (0, "'%s' does not exist or it is not " + "directory", p); + goto done; + } + } + else + { + error = clib_error_return (0, "'%s' does not exist", p); + goto done; + } + + _vec_len (p) -= 1; + p = format (p, "/hugepages/hugepages-%ukB/%s_hugepages%c", page_size, + type, 0); + error = clib_sysfs_read ((char *) p, "%d", val); + +done: + vec_free (p); + return error; +} + +clib_error_t * +clib_sysfs_get_free_hugepages (int numa_node, int page_size, int *v) +{ + return clib_sysfs_get_xxx_hugepages ("free", numa_node, page_size, v); +} + +clib_error_t * +clib_sysfs_get_nr_hugepages (int numa_node, int page_size, int *v) +{ + return clib_sysfs_get_xxx_hugepages ("nr", numa_node, page_size, v); +} + +clib_error_t * +clib_sysfs_get_surplus_hugepages (int numa_node, int page_size, int *v) +{ + return clib_sysfs_get_xxx_hugepages ("surplus", numa_node, page_size, v); +} + +clib_error_t * +clib_sysfs_prealloc_hugepages (int numa_node, int page_size, int nr) +{ + clib_error_t *error = 0; + int n, needed; + error = clib_sysfs_get_free_hugepages (numa_node, page_size, &n); + if (error) + return error; + needed = nr - n; + if (needed <= 0) + return 0; + + error = clib_sysfs_get_nr_hugepages (numa_node, page_size, &n); + if (error) + return error; + clib_warning ("pre-allocating %u additional %uK hugepages on numa node %u", + needed, page_size, numa_node); + return clib_sysfs_set_nr_hugepages (numa_node, page_size, n + needed); +} + + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vppinfra/linux/sysfs.h b/src/vppinfra/linux/sysfs.h new file mode 100644 index 00000000..6c80cf95 --- /dev/null +++ b/src/vppinfra/linux/sysfs.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2017 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef included_linux_sysfs_h +#define included_linux_sysfs_h + +#include + +clib_error_t *clib_sysfs_write (char *file_name, char *fmt, ...); + +clib_error_t *clib_sysfs_read (char *file_name, char *fmt, ...); + +u8 *clib_sysfs_link_to_name (char *link); + +clib_error_t *clib_sysfs_set_nr_hugepages (int numa_node, + int page_size, int nr); +clib_error_t *clib_sysfs_get_nr_hugepages (int numa_node, + int page_size, int *v); +clib_error_t *clib_sysfs_get_free_hugepages (int numa_node, + int page_size, int *v); +clib_error_t *clib_sysfs_get_surplus_hugepages (int numa_node, + int page_size, int *v); +clib_error_t *clib_sysfs_prealloc_hugepages (int numa_node, + int page_size, int nr); + +#endif /* included_linux_sysfs_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/vppinfra/mem.h b/src/vppinfra/mem.h index 63c5ac16..69ab8803 100644 --- a/src/vppinfra/mem.h +++ b/src/vppinfra/mem.h @@ -39,8 +39,11 @@ #define _included_clib_mem_h #include +#include +#include #include /* uword, etc */ +#include #include #include #include /* memcpy, memset */ @@ -264,19 +267,90 @@ void clib_mem_usage (clib_mem_usage_t * usage); u8 *format_clib_mem_usage (u8 * s, va_list * args); -/* Include appropriate VM functions depending on whether - we are compiling for linux kernel, for Unix or standalone. */ -#ifdef CLIB_LINUX_KERNEL -#include -#endif +/* Allocate virtual address space. */ +always_inline void * +clib_mem_vm_alloc (uword size) +{ + void *mmap_addr; + uword flags = MAP_PRIVATE; -#ifdef CLIB_UNIX -#include +#ifdef MAP_ANONYMOUS + flags |= MAP_ANONYMOUS; #endif -#ifdef CLIB_STANDALONE -#include -#endif + mmap_addr = mmap (0, size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (mmap_addr == (void *) -1) + mmap_addr = 0; + + return mmap_addr; +} + +always_inline void +clib_mem_vm_free (void *addr, uword size) +{ + munmap (addr, size); +} + +always_inline void * +clib_mem_vm_unmap (void *addr, uword size) +{ + void *mmap_addr; + uword flags = MAP_PRIVATE | MAP_FIXED; + + /* To unmap we "map" with no protection. If we actually called + munmap then other callers could steal the address space. By + changing to PROT_NONE the kernel can free up the pages which is + really what we want "unmap" to mean. */ + mmap_addr = mmap (addr, size, PROT_NONE, flags, -1, 0); + if (mmap_addr == (void *) -1) + mmap_addr = 0; + + return mmap_addr; +} + +always_inline void * +clib_mem_vm_map (void *addr, uword size) +{ + void *mmap_addr; + uword flags = MAP_PRIVATE | MAP_FIXED; + + mmap_addr = mmap (addr, size, (PROT_READ | PROT_WRITE), flags, -1, 0); + if (mmap_addr == (void *) -1) + mmap_addr = 0; + + return mmap_addr; +} + +typedef struct +{ +#define CLIB_MEM_VM_F_SHARED (1 << 0) +#define CLIB_MEM_VM_F_HUGETLB (1 << 1) +#define CLIB_MEM_VM_F_NUMA_PREFER (1 << 2) +#define CLIB_MEM_VM_F_NUMA_FORCE (1 << 3) +#define CLIB_MEM_VM_F_HUGETLB_PREALLOC (1 << 4) + u32 flags; /**< vm allocation flags: +
CLIB_MEM_VM_F_SHARED: request shared memory, file + destiptor will be provided on successful allocation. +
CLIB_MEM_VM_F_HUGETLB: request hugepages. +
CLIB_MEM_VM_F_NUMA_PREFER: numa_node field contains valid + numa node preference. +
CLIB_MEM_VM_F_NUMA_FORCE: fail if setting numa policy fails. +
CLIB_MEM_VM_F_HUGETLB_PREALLOC: pre-allocate hugepages if + number of available pages is not sufficient. + */ + char *name; /**< Name for memory allocation, set by caller. */ + uword size; /**< Allocation size, set by caller. */ + int numa_node; /**< numa node preference. Valid if CLIB_MEM_VM_F_NUMA_PREFER set. */ + void *addr; /**< Pointer to allocated memory, set on successful allocation. */ + int fd; /**< File desriptor, set on successful allocation if CLIB_MEM_VM_F_SHARED is set. */ + int log2_page_size; /* Page size in log2 format, set on successful allocation. */ + int n_pages; /* Number of pages. */ +} clib_mem_vm_alloc_t; + +clib_error_t *clib_mem_vm_ext_alloc (clib_mem_vm_alloc_t * a); +int clib_mem_vm_get_log2_page_size (int fd); +u64 *clib_mem_vm_get_paddr (void *mem, int log2_page_size, int n_pages); + #include /* clib_panic */ diff --git a/src/vppinfra/vm_linux_kernel.h b/src/vppinfra/vm_linux_kernel.h deleted file mode 100644 index fd9e6148..00000000 --- a/src/vppinfra/vm_linux_kernel.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - Copyright (c) 2001, 2002, 2003 Eliot Dresselhaus - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -#ifndef included_vm_linux_kernel_h -#define included_vm_linux_kernel_h - -#include -#include /* for GFP_* */ -#include /* for PAGE_KERNEL */ - -/* Allocate virtual address space. */ -always_inline void * -clib_mem_vm_alloc (uword size) -{ - return vmalloc (size); -} - -always_inline void -clib_mem_vm_free (void *addr, uword size) -{ - vfree (addr); -} - -always_inline void * -clib_mem_vm_unmap (void *addr, uword size) -{ - return 0; -} - -always_inline void * -clib_mem_vm_map (void *addr, uword size) -{ - return addr; -} - -#endif /* included_vm_linux_kernel_h */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vppinfra/vm_standalone.h b/src/vppinfra/vm_standalone.h deleted file mode 100644 index 2cd431bc..00000000 --- a/src/vppinfra/vm_standalone.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - Copyright (c) 2001, 2002, 2003 Eliot Dresselhaus - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -#ifndef included_vm_standalone_h -#define included_vm_standalone_h - -/* Stubs for standalone "system" which has no VM support. */ - -always_inline void * -clib_mem_vm_alloc (uword size) -{ - return 0; -} - -always_inline void -clib_mem_vm_free (void *addr, uword size) -{ -} - -always_inline void * -clib_mem_vm_unmap (void *addr, uword size) -{ - return 0; -} - -always_inline void * -clib_mem_vm_map (void *addr, uword size) -{ - return addr; -} - -#endif /* included_vm_standalone_h */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ diff --git a/src/vppinfra/vm_unix.h b/src/vppinfra/vm_unix.h deleted file mode 100644 index 07e86516..00000000 --- a/src/vppinfra/vm_unix.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2015 Cisco and/or its affiliates. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at: - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/* - Copyright (c) 2001, 2002, 2003 Eliot Dresselhaus - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -#ifndef included_vm_unix_h -#define included_vm_unix_h - -#include -#include - -/* Allocate virtual address space. */ -always_inline void * -clib_mem_vm_alloc (uword size) -{ - void *mmap_addr; - uword flags = MAP_PRIVATE; - -#ifdef MAP_ANONYMOUS - flags |= MAP_ANONYMOUS; -#endif - - mmap_addr = mmap (0, size, PROT_READ | PROT_WRITE, flags, -1, 0); - if (mmap_addr == (void *) -1) - mmap_addr = 0; - - return mmap_addr; -} - -always_inline void -clib_mem_vm_free (void *addr, uword size) -{ - munmap (addr, size); -} - -always_inline void * -clib_mem_vm_unmap (void *addr, uword size) -{ - void *mmap_addr; - uword flags = MAP_PRIVATE | MAP_FIXED; - - /* To unmap we "map" with no protection. If we actually called - munmap then other callers could steal the address space. By - changing to PROT_NONE the kernel can free up the pages which is - really what we want "unmap" to mean. */ - mmap_addr = mmap (addr, size, PROT_NONE, flags, -1, 0); - if (mmap_addr == (void *) -1) - mmap_addr = 0; - - return mmap_addr; -} - -always_inline void * -clib_mem_vm_map (void *addr, uword size) -{ - void *mmap_addr; - uword flags = MAP_PRIVATE | MAP_FIXED; - - mmap_addr = mmap (addr, size, (PROT_READ | PROT_WRITE), flags, -1, 0); - if (mmap_addr == (void *) -1) - mmap_addr = 0; - - return mmap_addr; -} - -#endif /* included_vm_unix_h */ - -/* - * fd.io coding-style-patch-verification: ON - * - * Local Variables: - * eval: (c-set-style "gnu") - * End: - */ -- cgit 1.2.3-korg From 2f9b0c05fca7ca829ea438da1d87e2bf93969500 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Mon, 11 Sep 2017 20:54:15 -0400 Subject: dpdk: cli to check for buffer leakage Use buffer pre_data and existing buffer trace trajectory code to find out dpdk buffer leakages. Change-Id: I26a5d8bd2f23d01cb6070ffc3ddcc6d3d863b575 Signed-off-by: Florin Coras --- src/plugins/dpdk/buffer.c | 63 +++++++++++++++++++++++++++++++++++++++- src/plugins/dpdk/device/cli.c | 53 +++++++++++++++++++++++++++++++++ src/plugins/dpdk/device/device.c | 6 ++++ src/plugins/dpdk/device/dpdk.h | 5 ++++ src/vlib/buffer.c | 2 +- src/vnet/ip/ip4_forward.c | 1 + src/vnet/tcp/tcp_output.c | 10 ++++--- 7 files changed, 134 insertions(+), 6 deletions(-) (limited to 'src/vlib') diff --git a/src/plugins/dpdk/buffer.c b/src/plugins/dpdk/buffer.c index 28af100a..e09d8019 100644 --- a/src/plugins/dpdk/buffer.c +++ b/src/plugins/dpdk/buffer.c @@ -340,7 +340,7 @@ vlib_buffer_free_inline (vlib_main_t * vm, vlib_buffer_t *b; b = vlib_get_buffer (vm, buffers[i]); - + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b); fl = vlib_buffer_get_buffer_free_list (vm, b, &fi); /* The only current use of this callback: multicast recycle */ @@ -493,6 +493,67 @@ buffer_state_validation_init (vlib_main_t * vm) VLIB_INIT_FUNCTION (buffer_state_validation_init); #endif +#if CLI_DEBUG +struct dpdk_validate_buf_result +{ + u32 invalid; + u32 uninitialized; +}; + +#define DPDK_TRAJECTORY_POISON 31 + +static void +dpdk_buffer_validate_trajectory (struct rte_mempool *mp, void *opaque, + void *obj, unsigned obj_idx) +{ + vlib_buffer_t *b; + struct dpdk_validate_buf_result *counter = opaque; + b = vlib_buffer_from_rte_mbuf ((struct rte_mbuf *) obj); + if (b->pre_data[0] != 0) + { + if (b->pre_data[0] == DPDK_TRAJECTORY_POISON) + counter->uninitialized++; + else + counter->invalid++; + } +} + +int +dpdk_buffer_validate_trajectory_all (u32 * uninitialized) +{ + dpdk_main_t *dm = &dpdk_main; + struct dpdk_validate_buf_result counter = { 0 }; + int i; + + for (i = 0; i < vec_len (dm->pktmbuf_pools); i++) + rte_mempool_obj_iter (dm->pktmbuf_pools[i], + dpdk_buffer_validate_trajectory, &counter); + if (uninitialized) + *uninitialized = counter.uninitialized; + return counter.invalid; +} + +static void +dpdk_buffer_poison_trajectory (struct rte_mempool *mp, void *opaque, + void *obj, unsigned obj_idx) +{ + vlib_buffer_t *b; + b = vlib_buffer_from_rte_mbuf ((struct rte_mbuf *) obj); + b->pre_data[0] = DPDK_TRAJECTORY_POISON; +} + +void +dpdk_buffer_poison_trajectory_all (void) +{ + dpdk_main_t *dm = &dpdk_main; + int i; + + for (i = 0; i < vec_len (dm->pktmbuf_pools); i++) + rte_mempool_obj_iter (dm->pktmbuf_pools[i], dpdk_buffer_poison_trajectory, + 0); +} +#endif + /* *INDENT-OFF* */ VLIB_BUFFER_REGISTER_CALLBACKS (dpdk, static) = { .vlib_buffer_alloc_cb = &dpdk_buffer_alloc, diff --git a/src/plugins/dpdk/device/cli.c b/src/plugins/dpdk/device/cli.c index aeeb772d..c9fcea5c 100644 --- a/src/plugins/dpdk/device/cli.c +++ b/src/plugins/dpdk/device/cli.c @@ -1885,6 +1885,59 @@ VLIB_CLI_COMMAND (show_vpe_version_command, static) = { }; /* *INDENT-ON* */ +#if CLI_DEBUG + +static clib_error_t * +dpdk_validate_buffers_fn (vlib_main_t * vm, unformat_input_t * input, + vlib_cli_command_t * cmd_arg) +{ + u32 n_invalid_bufs = 0, uninitialized = 0; + u32 is_poison = 0, is_test = 0; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "poison")) + is_poison = 1; + else if (unformat (input, "trajectory")) + is_test = 1; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + if (VLIB_BUFFER_TRACE_TRAJECTORY == 0) + { + vlib_cli_output (vm, "Trajectory not enabled. Recompile with " + "VLIB_BUFFER_TRACE_TRAJECTORY 1"); + return 0; + } + if (is_poison) + { + dpdk_buffer_poison_trajectory_all (); + } + if (is_test) + { + n_invalid_bufs = dpdk_buffer_validate_trajectory_all (&uninitialized); + if (!n_invalid_bufs) + vlib_cli_output (vm, "All buffers are valid %d uninitialized", + uninitialized); + else + vlib_cli_output (vm, "Found %d invalid buffers and %d uninitialized", + n_invalid_bufs, uninitialized); + } + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (test_dpdk_buffers_command, static) = +{ + .path = "test dpdk buffers", + .short_help = "test dpdk buffers [poison] [trajectory]", + .function = dpdk_validate_buffers_fn, +}; +/* *INDENT-ON* */ + +#endif + clib_error_t * dpdk_cli_init (vlib_main_t * vm) { diff --git a/src/plugins/dpdk/device/device.c b/src/plugins/dpdk/device/device.c index 97c13630..aa134327 100644 --- a/src/plugins/dpdk/device/device.c +++ b/src/plugins/dpdk/device/device.c @@ -462,6 +462,11 @@ dpdk_interface_tx (vlib_main_t * vm, or_flags = b0->flags | b1->flags | b2->flags | b3->flags; + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b1); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b2); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b3); + if (or_flags & VLIB_BUFFER_NEXT_PRESENT) { dpdk_validate_rte_mbuf (vm, b0, 1); @@ -556,6 +561,7 @@ dpdk_interface_tx (vlib_main_t * vm, from++; b0 = vlib_get_buffer (vm, bi0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); dpdk_validate_rte_mbuf (vm, b0, 1); diff --git a/src/plugins/dpdk/device/dpdk.h b/src/plugins/dpdk/device/dpdk.h index 849e687b..9762c713 100644 --- a/src/plugins/dpdk/device/dpdk.h +++ b/src/plugins/dpdk/device/dpdk.h @@ -467,6 +467,11 @@ admin_up_down_process (vlib_main_t * vm, clib_error_t *dpdk_buffer_pool_create (vlib_main_t * vm, unsigned num_mbufs, unsigned socket_id); +#if CLI_DEBUG +int dpdk_buffer_validate_trajectory_all (u32 * uninitialized); +void dpdk_buffer_poison_trajectory_all (void); +#endif + #endif /* __included_dpdk_h__ */ /* diff --git a/src/vlib/buffer.c b/src/vlib/buffer.c index a5ec0e0a..7399b618 100644 --- a/src/vlib/buffer.c +++ b/src/vlib/buffer.c @@ -686,7 +686,7 @@ vlib_buffer_free_inline (vlib_main_t * vm, u32 bi = buffers[i]; b = vlib_get_buffer (vm, bi); - + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b); fl = vlib_buffer_get_buffer_free_list (vm, b, &fi); /* The only current use of this callback: multicast recycle */ diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index b3de1201..c526003c 100755 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -2131,6 +2131,7 @@ ip4_arp_inline (vlib_main_t * vm, vlib_buffer_copy_trace_flag (vm, p0, bi0); b0 = vlib_get_buffer (vm, bi0); + VLIB_BUFFER_TRACE_TRAJECTORY_INIT (b0); vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0; vlib_buffer_advance (b0, -adj0->rewrite_header.data_bytes); diff --git a/src/vnet/tcp/tcp_output.c b/src/vnet/tcp/tcp_output.c index 9cb3e779..b843c926 100644 --- a/src/vnet/tcp/tcp_output.c +++ b/src/vnet/tcp/tcp_output.c @@ -439,14 +439,16 @@ tcp_init_mss (tcp_connection_t * tc) always_inline int tcp_alloc_tx_buffers (tcp_main_t * tm, u8 thread_index, u32 n_free_buffers) { + vlib_main_t *vm = vlib_get_main (); u32 current_length = vec_len (tm->tx_buffers[thread_index]); + u32 n_allocated; vec_validate (tm->tx_buffers[thread_index], current_length + n_free_buffers - 1); - _vec_len (tm->tx_buffers[thread_index]) = current_length - + vlib_buffer_alloc (vlib_get_main (), - &tm->tx_buffers[thread_index][current_length], - n_free_buffers); + n_allocated = + vlib_buffer_alloc (vm, &tm->tx_buffers[thread_index][current_length], + n_free_buffers); + _vec_len (tm->tx_buffers[thread_index]) = current_length + n_allocated; /* buffer shortage, report failure */ if (vec_len (tm->tx_buffers[thread_index]) == 0) { -- cgit 1.2.3-korg From b2bcad6238b7e8a669ae29c74079eb9bb9fbb694 Mon Sep 17 00:00:00 2001 From: Chris Luke Date: Mon, 18 Sep 2017 08:51:22 -0400 Subject: Fixes for issues Coverity has reported (VPP-972) 177117: fstat() returns -1 on error; the code is checking for any positive value instead 175142: final return could never be reached; simple refactoring 175235,175236: Warning suppressed with an explicit cast to (void) 174817: Final return couldn't be reached; is is_in_order is 0 then 'rv' is already returned above 172095,172093: If is_is_set does not get set to 1, then return 0 has already been invoked 174405: Re-kill this (nothing sets rv) 171136: Looks like a cmd line flag to set test_bytes was missing; added it, and refactored the argc/argv processing to avoid two other potential segv's 176813: Add range checking for term width/height. First stab at a reasonable range is 1-512 for both. 175350: Fix implicit casting in shift operation 174272: Not a c+p error; try using a coverity annotation to ignore it 174273,175320: Annotated FORWARD_NULL Change-Id: I58d0f860fc2209f59f8d1b6b344d631b8d429ace Signed-off-by: Chris Luke --- src/uri/sock_test_client.c | 1 + src/uri/uri_socket_test.c | 24 ++++++++++++++++++++++-- src/uri/vppcom.c | 8 ++++++-- src/vlib/linux/physmem.c | 2 +- src/vlib/unix/cli.c | 30 ++++++++++++++++++++++++++++++ src/vnet/mpls/mpls_api.c | 2 +- src/vnet/session/session.c | 5 +---- src/vnet/session/session_cli.c | 37 +++++++------------------------------ src/vnet/session/session_lookup.c | 8 ++++---- src/vppinfra/linux/mem.c | 2 +- 10 files changed, 74 insertions(+), 45 deletions(-) (limited to 'src/vlib') diff --git a/src/uri/sock_test_client.c b/src/uri/sock_test_client.c index ab8e5a0e..151c90b2 100644 --- a/src/uri/sock_test_client.c +++ b/src/uri/sock_test_client.c @@ -429,6 +429,7 @@ exit_client (void) tsock = &scm->test_socket[i]; tsock->cfg.test = SOCK_TEST_TYPE_EXIT; + /* coverity[COPY_PASTE_ERROR] */ if (ctrl->cfg.verbose) { printf ("\nCLIENT (fd %d): Sending exit cfg to server...\n", diff --git a/src/uri/uri_socket_test.c b/src/uri/uri_socket_test.c index 5f7084d5..4469b03d 100644 --- a/src/uri/uri_socket_test.c +++ b/src/uri/uri_socket_test.c @@ -36,8 +36,6 @@ main (int argc, char *argv[]) if (argc >= 3) { - bytes = ((long) atoi (argv[4])) << 20; - no_echo = atoi (argv[3]); portno = atoi (argv[2]); server = gethostbyname (argv[1]); if (server == NULL) @@ -45,6 +43,28 @@ main (int argc, char *argv[]) clib_unix_warning ("gethostbyname"); exit (1); } + + argc -= 3; + argv += 3; + + if (argc) + { + bytes = ((long) atoi (argv[0])) << 20; + argc--; + argv++; + } + if (argc) + { + no_echo = atoi (argv[0]); + argc--; + argv++; + } + if (argc) + { + test_bytes = atoi (argv[0]); + argc--; + argv++; + } } else { diff --git a/src/uri/vppcom.c b/src/uri/vppcom.c index 8a8a806c..c7ae0ea5 100644 --- a/src/uri/vppcom.c +++ b/src/uri/vppcom.c @@ -1478,7 +1478,7 @@ vppcom_cfg_read (char *conf_fname) while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { - unformat_user (input, unformat_line_input, line_input); + (void) unformat_user (input, unformat_line_input, line_input); unformat_skip_white_space (line_input); if (unformat (line_input, "vppcom {")) @@ -2359,12 +2359,14 @@ vppcom_select (unsigned long n_bits, unsigned long *read_map, clib_bitmap_get (vcm->ex_bitmap, session_index) && (rv < 0)) { // TBD: clib_warning + /* coverity[FORWARD_NULL] */ clib_bitmap_set_no_check (except_map, session_index, 1); bits_set++; } else if (rv > 0) { // TBD: clib_warning + /* coverity[FORWARD_NULL] */ clib_bitmap_set_no_check (read_map, session_index, 1); bits_set++; } @@ -2387,9 +2389,10 @@ vppcom_select (unsigned long n_bits, unsigned long *read_map, rv = vppcom_session_write_ready (session, session_index); clib_spinlock_unlock (&vcm->sessions_lockp); - if (rv > 0) + if (rv > 0 ) { // TBD: clib_warning + /* coverity[FORWARD_NULL] */ clib_bitmap_set_no_check (write_map, session_index, 1); bits_set++; } @@ -2415,6 +2418,7 @@ vppcom_select (unsigned long n_bits, unsigned long *read_map, if (rv < 0) { // TBD: clib_warning + /* coverity[FORWARD_NULL] */ clib_bitmap_set_no_check (except_map, session_index, 1); bits_set++; } diff --git a/src/vlib/linux/physmem.c b/src/vlib/linux/physmem.c index 3cc42a06..6d3f7c55 100644 --- a/src/vlib/linux/physmem.c +++ b/src/vlib/linux/physmem.c @@ -157,7 +157,7 @@ unix_physmem_region_alloc (vlib_main_t * vm, char *name, u32 size, pr->mem = alloc.addr; pr->log2_page_size = alloc.log2_page_size; pr->n_pages = alloc.n_pages; - pr->size = pr->n_pages << pr->log2_page_size; + pr->size = (u64) pr->n_pages << (u64) pr->log2_page_size; pr->page_mask = (1 << pr->log2_page_size) - 1; pr->numa_node = numa_node; pr->name = format (0, "%s", name); diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c index 1567cc2a..1624ce38 100644 --- a/src/vlib/unix/cli.c +++ b/src/vlib/unix/cli.c @@ -91,6 +91,15 @@ * protocol message. This is a saftey measure. */ #define UNIX_CLI_MAX_DEPTH_TELNET 24 +/** Minimum terminal width we will accept */ +#define UNIX_CLI_MIN_TERMINAL_WIDTH 1 +/** Maximum terminal width we will accept */ +#define UNIX_CLI_MAX_TERMINAL_WIDTH 512 +/** Minimum terminal height we will accept */ +#define UNIX_CLI_MIN_TERMINAL_HEIGHT 1 +/** Maximum terminal height we will accept */ +#define UNIX_CLI_MAX_TERMINAL_HEIGHT 512 + /** Unix standard in */ #define UNIX_CLI_STDIN_FD 0 @@ -1164,10 +1173,21 @@ unix_cli_process_telnet (unix_main_t * um, /* Window size */ if (i != 8) /* check message is correct size */ break; + cf->width = clib_net_to_host_u16 (*((u16 *) (input_vector + 3))); + if (cf->width > UNIX_CLI_MAX_TERMINAL_WIDTH) + cf->width = UNIX_CLI_MAX_TERMINAL_WIDTH; + if (cf->width < UNIX_CLI_MIN_TERMINAL_WIDTH) + cf->width = UNIX_CLI_MIN_TERMINAL_WIDTH; + cf->height = clib_net_to_host_u16 (*((u16 *) (input_vector + 5))); + if (cf->height > UNIX_CLI_MAX_TERMINAL_HEIGHT) + cf->height = UNIX_CLI_MAX_TERMINAL_HEIGHT; + if (cf->height < UNIX_CLI_MIN_TERMINAL_HEIGHT) + cf->height = UNIX_CLI_MIN_TERMINAL_HEIGHT; + /* reindex pager buffer */ unix_cli_pager_reindex (cf); /* redraw page */ @@ -2539,8 +2559,18 @@ unix_cli_resize_interrupt (int signum) /* We can't trust ws.XXX... */ return; } + cf->width = ws.ws_col; + if (cf->width > UNIX_CLI_MAX_TERMINAL_WIDTH) + cf->width = UNIX_CLI_MAX_TERMINAL_WIDTH; + if (cf->width < UNIX_CLI_MIN_TERMINAL_WIDTH) + cf->width = UNIX_CLI_MIN_TERMINAL_WIDTH; + cf->height = ws.ws_row; + if (cf->height > UNIX_CLI_MAX_TERMINAL_HEIGHT) + cf->height = UNIX_CLI_MAX_TERMINAL_HEIGHT; + if (cf->height < UNIX_CLI_MIN_TERMINAL_HEIGHT) + cf->height = UNIX_CLI_MIN_TERMINAL_HEIGHT; /* Reindex the pager buffer */ unix_cli_pager_reindex (cf); diff --git a/src/vnet/mpls/mpls_api.c b/src/vnet/mpls/mpls_api.c index 988c2c98..762c40ff 100644 --- a/src/vnet/mpls/mpls_api.c +++ b/src/vnet/mpls/mpls_api.c @@ -96,7 +96,7 @@ vl_api_mpls_table_add_del_t_handler (vl_api_mpls_table_add_del_t * mp) else mpls_table_delete (ntohl (mp->mt_table_id), 1); - rv = (rv == 0) ? vnm->api_errno : rv; + // NB: Nothing sets rv; none of the above returns an error REPLY_MACRO (VL_API_MPLS_TABLE_ADD_DEL_REPLY); } diff --git a/src/vnet/session/session.c b/src/vnet/session/session.c index 4544f9a0..792e6612 100644 --- a/src/vnet/session/session.c +++ b/src/vnet/session/session.c @@ -267,10 +267,7 @@ stream_session_enqueue_data (transport_connection_t * tc, vlib_buffer_t * b, } } - if (is_in_order) - return enqueued; - - return 0; + return enqueued; } /** Check if we have space in rx fifo to push more bytes */ diff --git a/src/vnet/session/session_cli.c b/src/vnet/session/session_cli.c index d9f516be..8c30a1df 100755 --- a/src/vnet/session/session_cli.c +++ b/src/vnet/session/session_cli.c @@ -127,13 +127,8 @@ unformat_stream_session_id (unformat_input_t * input, va_list * args) *is_ip4 = 0; tuple_is_set = 1; } - else - return 0; - - if (tuple_is_set) - return 1; - return 0; + return tuple_is_set; } uword @@ -144,21 +139,12 @@ unformat_stream_session (unformat_input_t * input, va_list * args) u8 proto = ~0; ip46_address_t lcl, rmt; u32 lcl_port = 0, rmt_port = 0; - u8 is_ip4 = 0, s_type = ~0, id_is_set = 0; + u8 is_ip4 = 0, s_type = ~0; - if (unformat (input, "%U", unformat_stream_session_id, &proto, &lcl, &rmt, - &lcl_port, &rmt_port, &is_ip4)) - { - id_is_set = 1; - } - else + if (!unformat (input, "%U", unformat_stream_session_id, &proto, &lcl, &rmt, + &lcl_port, &rmt_port, &is_ip4)) return 0; - if (!id_is_set) - { - return 0; - } - s_type = session_type_from_proto_and_ip (proto, is_ip4); if (is_ip4) s = stream_session_lookup4 (&lcl.ip4, &rmt.ip4, @@ -185,21 +171,12 @@ unformat_transport_connection (unformat_input_t * input, va_list * args) u8 proto = ~0; ip46_address_t lcl, rmt; u32 lcl_port = 0, rmt_port = 0; - u8 is_ip4 = 0, s_type = ~0, id_is_set = 0; + u8 is_ip4 = 0, s_type = ~0; - if (unformat (input, "%U", unformat_stream_session_id, &proto, &lcl, &rmt, - &lcl_port, &rmt_port, &is_ip4)) - { - id_is_set = 1; - } - else + if (!unformat (input, "%U", unformat_stream_session_id, &proto, &lcl, &rmt, + &lcl_port, &rmt_port, &is_ip4)) return 0; - if (!id_is_set) - { - return 0; - } - proto = (proto == (u8) ~ 0) ? suggested_proto : proto; if (proto == (u8) ~ 0) return 0; diff --git a/src/vnet/session/session_lookup.c b/src/vnet/session/session_lookup.c index 0f9abf9a..4487b1c3 100644 --- a/src/vnet/session/session_lookup.c +++ b/src/vnet/session/session_lookup.c @@ -233,15 +233,15 @@ stream_session_half_open_table_add (transport_connection_t * tc, u64 value) { make_v4_ss_kv_from_tc (&kv4, tc); kv4.value = value; - clib_bihash_add_del_16_8 (&sl->v4_half_open_hash, &kv4, - 1 /* is_add */ ); + (void) clib_bihash_add_del_16_8 (&sl->v4_half_open_hash, &kv4, + 1 /* is_add */ ); } else { make_v6_ss_kv_from_tc (&kv6, tc); kv6.value = value; - clib_bihash_add_del_48_8 (&sl->v6_half_open_hash, &kv6, - 1 /* is_add */ ); + (void) clib_bihash_add_del_48_8 (&sl->v6_half_open_hash, &kv6, + 1 /* is_add */ ); } } diff --git a/src/vppinfra/linux/mem.c b/src/vppinfra/linux/mem.c index 665ddf61..df46763a 100644 --- a/src/vppinfra/linux/mem.c +++ b/src/vppinfra/linux/mem.c @@ -49,7 +49,7 @@ int clib_mem_vm_get_log2_page_size (int fd) { struct stat st = { 0 }; - if (fstat (fd, &st)) + if (fstat (fd, &st) == -1) return 0; return min_log2 (st.st_blksize); } -- cgit 1.2.3-korg From 609707ea530de6a0f9fa989b8269b973dd89174e Mon Sep 17 00:00:00 2001 From: John Lo Date: Tue, 19 Sep 2017 21:45:10 -0400 Subject: Fix DHCP client so it works for worker threads Fix dhcp_client_for_us() function to utilize rpc_call_main_thread to call vlib_process_signal_event() to ensure proper handling irrespective of it being called in main thread or worker thread. Added ASSERT to vlib_process_sinal.. path to make sure it is called in main thread. Change-Id: I4109cc049d8e4225d896ce492ce201011dc9c911 Signed-off-by: John Lo --- src/vlib/node_funcs.h | 3 +++ src/vnet/dhcp/client.c | 16 ++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index 0059b9be..3ae4e541 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -851,6 +851,9 @@ vlib_process_signal_event_data (vlib_main_t * vm, vlib_process_t *p = vec_elt (nm->processes, n->runtime_index); uword *h, t; + /* Must be in main thread */ + ASSERT (vlib_get_thread_index () == 0); + h = hash_get (p->event_type_index_by_type_opaque, type_opaque); if (!h) { diff --git a/src/vnet/dhcp/client.c b/src/vnet/dhcp/client.c index dd5e99f2..5986438b 100644 --- a/src/vnet/dhcp/client.c +++ b/src/vnet/dhcp/client.c @@ -120,6 +120,17 @@ set_l2_rewrite (dhcp_client_main_t * dcm, dhcp_client_t * c) 0 /* broadcast */); } +void vl_api_rpc_call_main_thread (void *fp, u8 * data, u32 data_length); + +static void +dhcp_client_proc_callback (uword * client_index) +{ + vlib_main_t *vm = vlib_get_main (); + ASSERT (vlib_get_thread_index () == 0); + vlib_process_signal_event (vm, dhcp_client_process_node.index, + EVENT_DHCP_CLIENT_WAKEUP, *client_index); +} + /* * dhcp_client_for_us - server-to-client callback. * Called from proxy_node.c:dhcp_proxy_to_client_input(). @@ -251,8 +262,9 @@ int dhcp_client_for_us (u32 bi, vlib_buffer_t * b, c->retry_count = 0; c->next_transmit = 0; /* send right now... */ /* Poke the client process, which will send the request */ - vlib_process_signal_event (vm, dhcp_client_process_node.index, - EVENT_DHCP_CLIENT_WAKEUP, c - dcm->clients); + uword client_id = c - dcm->clients; + vl_api_rpc_call_main_thread (dhcp_client_proc_callback, + (u8 *) &client_id, sizeof (uword)); break; case DHCP_BOUND: -- cgit 1.2.3-korg From 561ae0a63516e092672532424904706f51eb2b84 Mon Sep 17 00:00:00 2001 From: Yoann Desmouceaux Date: Wed, 20 Sep 2017 10:08:28 +0200 Subject: CLI: fix segfault when browsing an empty history When one starts VPP, types a command, presses up rather than enter, then types a new command, unix_cli_line_process_one() segfaults. This is due to cf->cursor not being reset upon pressing up if the history is empty. Change-Id: Ie503f20a9cb551e735abb8b0f4feb8c0006d2b61 Signed-off-by: Yoann Desmouceaux --- src/vlib/unix/cli.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c index 1624ce38..0e035b13 100644 --- a/src/vlib/unix/cli.c +++ b/src/vlib/unix/cli.c @@ -1406,10 +1406,8 @@ unix_cli_line_process_one (unix_cli_main_t * cm, unix_vlib_cli_output_cooked (cf, uf, cf->current_command, vec_len (cf->current_command)); } - cf->cursor = vec_len (cf->current_command); - - break; } + cf->cursor = vec_len (cf->current_command); break; case UNIX_CLI_PARSE_ACTION_HOME: -- cgit 1.2.3-korg From 03add7f5b5e5351790187ea6d7e83803d5be2440 Mon Sep 17 00:00:00 2001 From: Chris Luke Date: Wed, 20 Sep 2017 23:31:24 -0400 Subject: vppctl,cli: Improve non-interactive vppctl (VPP-944) Short version: Make vppctl behave as expected when run from scripts, or without a controlling terminal, and especially when using it with VPP commands on its command line ("non-interactively"). In particular, prevent the welcome banner and VPP CLI prompt from being sent by VPP when being used in these ways. vppctl ------ - Improve vppctl's detection of non-interactive sessions. - Pass non-interactiveness in the terminal type telnet option as a value distinct from "dumb" (which means non-ANSI capable.) - Make tty setup handling more robust. - Only send non-interactive command once we've sent the terminal type, to ensure correct event sequence; we need the VPP cli session to be in line-by-line mode. - Ignore stdin when it looks something like /dev/null. - Skip NUL bytes received from VPP. VPP CLI ------- - Detect "non-interactive" terminal types and set session parameters accordingly. - Add an "interactive" flag that controls whether the welcome banner and CLI prompt are sent. - Detect if telnet options processing switched us into line mode and act accordingly for the rest of the current input buffer. This was causing the command string to be echoed by the CLI editor code. - For non-interactive sessions, send a NUL byte after the input buffer has been processed. This is because vppctl depends on seeing traffic before it will try to close the session; a command with no output would cause it to hang. NUL bytes are ignored by all decent terminals, but we have vppctl strip them out anyway. - Prevent certain commands from running in non-interactive sessions since they manipulate interactive-related features. - For interactive sessions, quench the prompt that prints on VPP shutdown. - Detect and handle socket errors in the CLI; sessions were leaking. - Pevent SIGPIPE from ever being raised; handle EPIPE instead. We don't need VPP to die just because a socket closed just before we try to write to it! - Add a command to dump a list of current CLI sessions; mostly this was to detect session leakage, but it may have some general utility. Change-Id: Ia147da013317180882c1d967b18eefb8519a55fb Signed-off-by: Chris Luke --- src/vlib/unix/cli.c | 359 +++++++++++++++++++++++++++++++++++++++++++-------- src/vpp/app/vppctl.c | 134 +++++++++++++------ 2 files changed, 404 insertions(+), 89 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c index 0e035b13..ffe5de6d 100644 --- a/src/vlib/unix/cli.c +++ b/src/vlib/unix/cli.c @@ -100,9 +100,6 @@ /** Maximum terminal height we will accept */ #define UNIX_CLI_MAX_TERMINAL_HEIGHT 512 -/** Unix standard in */ -#define UNIX_CLI_STDIN_FD 0 - /** A CLI banner line. */ typedef struct @@ -200,6 +197,18 @@ typedef struct /** Disable the pager? */ u8 no_pager; + /** Whether the session is interactive or not. + * Controls things like initial banner, the CLI prompt etc. */ + u8 is_interactive; + + /** Whether the session is attached to a socket. */ + u8 is_socket; + + /** If EPIPE has been detected, prevent further write-related + * activity on the descriptor. + */ + u8 has_epipe; + /** Pager buffer */ u8 **pager_vector; @@ -593,12 +602,33 @@ unix_vlib_cli_output_raw (unix_cli_file_t * cf, { int n = 0; + if (cf->has_epipe) /* don't try writing anything */ + return; + if (vec_len (cf->output_vector) == 0) - n = write (uf->file_descriptor, buffer, buffer_bytes); + { + if (cf->is_socket) + /* If it's a socket we use MSG_NOSIGNAL to prevent SIGPIPE */ + n = send (uf->file_descriptor, buffer, buffer_bytes, MSG_NOSIGNAL); + else + n = write (uf->file_descriptor, buffer, buffer_bytes); + } if (n < 0 && errno != EAGAIN) { - clib_unix_warning ("write"); + if (errno == EPIPE) + { + /* connection closed on us */ + unix_main_t *um = &unix_main; + cf->has_epipe = 1; + vlib_process_signal_event (um->vlib_main, cf->process_node_index, + UNIX_CLI_PROCESS_EVENT_QUIT, + uf->private_data); + } + else + { + clib_unix_warning ("write"); + } } else if ((word) n < (word) buffer_bytes) { @@ -659,7 +689,9 @@ unix_cli_cli_prompt (unix_cli_file_t * cf, clib_file_t * uf) { unix_cli_main_t *cm = &unix_cli_main; - unix_vlib_cli_output_raw (cf, uf, cm->cli_prompt, vec_len (cm->cli_prompt)); + if (cf->is_interactive) /* Only interactive sessions get a prompt */ + unix_vlib_cli_output_raw (cf, uf, cm->cli_prompt, + vec_len (cm->cli_prompt)); } /** @brief Output a pager prompt and show number of buffered lines */ @@ -1024,7 +1056,7 @@ unix_vlib_cli_output (uword cli_file_index, u8 * buffer, uword buffer_bytes) * terminal sequences; @c 0 otherwise. */ static u8 -unix_cli_terminal_type (u8 * term, uword len) +unix_cli_terminal_type_ansi (u8 * term, uword len) { /* This may later be better done as a hash of some sort. */ #define _(a) do { \ @@ -1042,6 +1074,45 @@ unix_cli_terminal_type (u8 * term, uword len) return 0; } +/** Identify whether a terminal type is non-interactive. + * + * Compares the string given in @c term with a list of terminal types known + * to be non-interactive, as send by tools such as @c vppctl . + * + * This list contains, for example, @c vppctl. + * + * @param term A string with a terminal type in it. + * @param len The length of the string in @c term. + * + * @return @c 1 if the terminal type is recognized as being non-interactive; + * @c 0 otherwise. + */ +static u8 +unix_cli_terminal_type_noninteractive (u8 * term, uword len) +{ + /* This may later be better done as a hash of some sort. */ +#define _(a) do { \ + if (strncasecmp(a, (char *)term, (size_t)len) == 0) return 1; \ + } while(0) + + _("vppctl"); +#undef _ + + return 0; +} + +/** Set a session to be non-interactive. */ +static void +unix_cli_set_session_noninteractive (unix_cli_file_t * cf) +{ + /* Non-interactive sessions don't get these */ + cf->is_interactive = 0; + cf->no_pager = 1; + cf->history_limit = 0; + cf->has_history = 0; + cf->line_mode = 1; +} + /** @brief Emit initial welcome banner and prompt on a connection. */ static void unix_cli_file_welcome (unix_cli_main_t * cm, unix_cli_file_t * cf) @@ -1052,6 +1123,12 @@ unix_cli_file_welcome (unix_cli_main_t * cm, unix_cli_file_t * cf) unix_cli_banner_t *banner; int i, len; + /* Mark the session as started if we get here */ + cf->started = 1; + + if (!(cf->is_interactive)) /* No banner for non-interactive sessions */ + return; + /* * Put the first bytes directly into the buffer so that further output is * queued until everything is ready. (oterwise initial prompt can appear @@ -1082,7 +1159,6 @@ unix_cli_file_welcome (unix_cli_main_t * cm, unix_cli_file_t * cf) /* Prompt. */ unix_cli_cli_prompt (cf, uf); - cf->started = 1; } /** @brief A failsafe triggered on a timer to ensure we send the prompt @@ -1160,9 +1236,20 @@ unix_cli_process_telnet (unix_main_t * um, case TELOPT_TTYPE: if (input_vector[3] != 0) break; - /* See if the terminal type is ANSI capable */ - cf->ansi_capable = - unix_cli_terminal_type (input_vector + 4, i - 5); + { + /* See if the the terminal type is recognized */ + u8 *term = input_vector + 4; + uword len = i - 5; + + /* See if the terminal type is ANSI capable */ + cf->ansi_capable = + unix_cli_terminal_type_ansi (term, len); + + /* See if the terminal type indicates non-interactive */ + if (unix_cli_terminal_type_noninteractive (term, len)) + unix_cli_set_session_noninteractive (cf); + } + /* If session not started, we can release the pause */ if (!cf->started) /* Send the welcome banner and initial prompt */ @@ -2122,10 +2209,12 @@ unix_cli_line_edit (unix_cli_main_t * cm, unix_main_t * um, vec_len (cf->input_vector) - i); if (matched < 0) { + /* There was a partial match which means we need more bytes + * than the input buffer currently has. + */ if (i) { - /* There was a partial match which means we need more bytes - * than the input buffer currently has. + /* * Since the bytes before here have been processed, shift * the remaining contents to the start of the input buffer. */ @@ -2136,6 +2225,16 @@ unix_cli_line_edit (unix_cli_main_t * cm, unix_main_t * um, break; default: + /* If telnet option processing switched us to line mode, get us + * out of here! + */ + if (cf->line_mode) + { + vec_delete (cf->input_vector, i, 0); + cf->current_command = cf->input_vector; + return 0; + } + /* process the action */ if (!unix_cli_line_process_one (cm, um, cf, uf, cf->input_vector[i], action)) @@ -2165,11 +2264,14 @@ unix_cli_process_input (unix_cli_main_t * cm, uword cli_file_index) unformat_input_t input; int vlib_parse_eval (u8 *); + cm->current_input_file_index = cli_file_index; + more: /* Try vlibplex first. Someday... */ if (0 && vlib_parse_eval (cf->input_vector) == 0) goto done; + if (cf->line_mode) { /* just treat whatever we got as a complete line of input */ @@ -2190,20 +2292,16 @@ more: lv = format (lv, "%U[%d]: %v", format_timeval, 0 /* current bat-time */ , 0 /* current bat-format */ , - cli_file_index, cf->input_vector); - { - int rv __attribute__ ((unused)) = - write (um->log_fd, lv, vec_len (lv)); - } + cli_file_index, cf->current_command); + int rv __attribute__ ((unused)) = write (um->log_fd, lv, vec_len (lv)); } - /* Copy our input command to a new string */ + /* Build an unformat structure around our command */ unformat_init_vector (&input, cf->current_command); /* Remove leading white space from input. */ (void) unformat (&input, ""); - cm->current_input_file_index = cli_file_index; cf->pager_start = 0; /* start a new pager session */ if (unformat_check_input (&input) != UNFORMAT_END_OF_INPUT) @@ -2222,9 +2320,14 @@ more: done: /* reset vector; we'll re-use it later */ if (cf->line_mode) - vec_reset_length (cf->input_vector); + { + vec_reset_length (cf->input_vector); + cf->current_command = 0; + } else - vec_reset_length (cf->current_command); + { + vec_reset_length (cf->current_command); + } if (cf->no_pager == 2) { @@ -2251,6 +2354,15 @@ done: /* Any residual data in the input vector? */ if (vec_len (cf->input_vector)) goto more; + + /* For non-interactive sessions send a NUL byte. + * Specifically this is because vppctl needs to see some traffic in + * order to move on to closing the session. Commands with no output + * would thus cause vppctl to hang indefinitely in non-interactive mode + * since there is also no prompt sent after the command completes. + */ + if (!cf->is_interactive) + unix_vlib_cli_output_raw (cf, uf, (u8 *) "\0", 1); } /** Destroy a CLI session. @@ -2270,7 +2382,7 @@ unix_cli_kill (unix_cli_main_t * cm, uword cli_file_index) uf = pool_elt_at_index (fm->file_pool, cf->clib_file_index); /* Quit/EOF on stdin means quit program. */ - if (uf->file_descriptor == UNIX_CLI_STDIN_FD) + if (uf->file_descriptor == STDIN_FILENO) clib_longjmp (&um->vlib_main->main_loop_exit, VLIB_MAIN_LOOP_EXIT_CLI); vec_free (cf->current_command); @@ -2342,11 +2454,30 @@ unix_cli_write_ready (clib_file_t * uf) cf = pool_elt_at_index (cm->cli_file_pool, uf->private_data); /* Flush output vector. */ - n = write (uf->file_descriptor, - cf->output_vector, vec_len (cf->output_vector)); + if (cf->is_socket) + /* If it's a socket we use MSG_NOSIGNAL to prevent SIGPIPE */ + n = send (uf->file_descriptor, + cf->output_vector, vec_len (cf->output_vector), MSG_NOSIGNAL); + else + n = write (uf->file_descriptor, + cf->output_vector, vec_len (cf->output_vector)); if (n < 0 && errno != EAGAIN) - return clib_error_return_unix (0, "write"); + { + if (errno == EPIPE) + { + /* connection closed on us */ + unix_main_t *um = &unix_main; + cf->has_epipe = 1; + vlib_process_signal_event (um->vlib_main, cf->process_node_index, + UNIX_CLI_PROCESS_EVENT_QUIT, + uf->private_data); + } + else + { + return clib_error_return_unix (0, "write"); + } + } else if (n > 0) unix_cli_del_pending_output (uf, cf, n); @@ -2393,6 +2524,24 @@ unix_cli_read_ready (clib_file_t * uf) return /* no error */ 0; } +/** Called when a CLI session file descriptor has an error condition. */ +static clib_error_t * +unix_cli_error_detected (clib_file_t * uf) +{ + unix_main_t *um = &unix_main; + unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf; + + cf = pool_elt_at_index (cm->cli_file_pool, uf->private_data); + cf->has_epipe = 1; /* prevent writes while the close is pending */ + vlib_process_signal_event (um->vlib_main, + cf->process_node_index, + UNIX_CLI_PROCESS_EVENT_QUIT, + /* event data */ uf->private_data); + + return /* no error */ 0; +} + /** Store a new CLI session. * @param name The name of the session. * @param fd The file descriptor for the session I/O. @@ -2443,6 +2592,7 @@ unix_cli_file_add (unix_cli_main_t * cm, char *name, int fd) template.read_function = unix_cli_read_ready; template.write_function = unix_cli_write_ready; + template.error_function = unix_cli_error_detected; template.file_descriptor = fd; template.private_data = cf - cm->cli_file_pool; @@ -2482,10 +2632,10 @@ unix_cli_listen_read_ready (clib_file_t * uf) cf_index = unix_cli_file_add (cm, client_name, client.fd); cf = pool_elt_at_index (cm->cli_file_pool, cf_index); + cf->is_socket = 1; /* No longer need CLIB version of socket. */ clib_socket_free (&client); - vec_free (client_name); /* if we're supposed to run telnet session in character mode (default) */ @@ -2512,6 +2662,9 @@ unix_cli_listen_read_ready (clib_file_t * uf) cf->history_limit = um->cli_history_limit; cf->has_history = cf->history_limit != 0; + /* This is an interactive session until we decide otherwise */ + cf->is_interactive = 1; + /* Make sure this session is in line mode */ cf->line_mode = 0; @@ -2521,9 +2674,8 @@ unix_cli_listen_read_ready (clib_file_t * uf) /* Setup the pager */ cf->no_pager = um->cli_no_pager; - uf = pool_elt_at_index (fm->file_pool, cf->clib_file_index); - /* Send the telnet options */ + uf = pool_elt_at_index (fm->file_pool, cf->clib_file_index); unix_vlib_cli_output_raw (cf, uf, charmode_option, ARRAY_LEN (charmode_option)); @@ -2550,7 +2702,7 @@ unix_cli_resize_interrupt (int signum) (void) signum; /* Terminal resized, fetch the new size */ - if (ioctl (UNIX_CLI_STDIN_FD, TIOCGWINSZ, &ws) < 0) + if (ioctl (STDIN_FILENO, TIOCGWINSZ, &ws) < 0) { /* "Should never happen..." */ clib_unix_warning ("TIOCGWINSZ"); @@ -2600,17 +2752,17 @@ unix_cli_config (vlib_main_t * vm, unformat_input_t * input) if (um->flags & UNIX_FLAG_INTERACTIVE) { /* Set stdin to be non-blocking. */ - if ((flags = fcntl (UNIX_CLI_STDIN_FD, F_GETFL, 0)) < 0) + if ((flags = fcntl (STDIN_FILENO, F_GETFL, 0)) < 0) flags = 0; - (void) fcntl (UNIX_CLI_STDIN_FD, F_SETFL, flags | O_NONBLOCK); + (void) fcntl (STDIN_FILENO, F_SETFL, flags | O_NONBLOCK); - cf_index = unix_cli_file_add (cm, "stdin", UNIX_CLI_STDIN_FD); + cf_index = unix_cli_file_add (cm, "stdin", STDIN_FILENO); cf = pool_elt_at_index (cm->cli_file_pool, cf_index); cm->stdin_cli_file_index = cf_index; /* If stdin is a tty and we are using chacracter mode, enable * history on the CLI and set the tty line discipline accordingly. */ - if (isatty (UNIX_CLI_STDIN_FD) && um->cli_line_mode == 0) + if (isatty (STDIN_FILENO) && um->cli_line_mode == 0) { /* Capture terminal resize events */ memset (&sa, 0, sizeof (sa)); @@ -2619,7 +2771,7 @@ unix_cli_config (vlib_main_t * vm, unformat_input_t * input) clib_panic ("sigaction"); /* Retrieve the current terminal size */ - ioctl (UNIX_CLI_STDIN_FD, TIOCGWINSZ, &ws); + ioctl (STDIN_FILENO, TIOCGWINSZ, &ws); cf->width = ws.ws_col; cf->height = ws.ws_row; @@ -2634,11 +2786,14 @@ unix_cli_config (vlib_main_t * vm, unformat_input_t * input) /* Setup the pager */ cf->no_pager = um->cli_no_pager; + /* This is an interactive session until we decide otherwise */ + cf->is_interactive = 1; + /* We're going to be in char by char mode */ cf->line_mode = 0; /* Save the original tty state so we can restore it later */ - tcgetattr (UNIX_CLI_STDIN_FD, &um->tio_stdin); + tcgetattr (STDIN_FILENO, &um->tio_stdin); um->tio_isset = 1; /* Tweak the tty settings */ @@ -2647,23 +2802,23 @@ unix_cli_config (vlib_main_t * vm, unformat_input_t * input) tio.c_lflag &= ~(ECHO | ICANON | IEXTEN); tio.c_cc[VMIN] = 1; /* 1 byte at a time */ tio.c_cc[VTIME] = 0; /* no timer */ - tcsetattr (UNIX_CLI_STDIN_FD, TCSAFLUSH, &tio); + tcsetattr (STDIN_FILENO, TCSAFLUSH, &tio); /* See if we can do ANSI/VT100 output */ term = (u8 *) getenv ("TERM"); if (term != NULL) - cf->ansi_capable = unix_cli_terminal_type (term, - strlen ((char *) - term)); + { + int len = strlen ((char *) term); + cf->ansi_capable = unix_cli_terminal_type_ansi (term, len); + if (unix_cli_terminal_type_noninteractive (term, len)) + unix_cli_set_session_noninteractive (cf); + } } else { notty: - /* No tty, so make sure these things are off */ - cf->no_pager = 1; - cf->history_limit = 0; - cf->has_history = 0; - cf->line_mode = 1; + /* No tty, so make sure the session doesn't have tty-like features */ + unix_cli_set_session_noninteractive (cf); } /* Send banner and initial prompt */ @@ -2726,8 +2881,8 @@ unix_cli_exit (vlib_main_t * vm) unix_main_t *um = &unix_main; /* If stdin is a tty and we saved the tty state, reset the tty state */ - if (isatty (UNIX_CLI_STDIN_FD) && um->tio_isset) - tcsetattr (UNIX_CLI_STDIN_FD, TCSAFLUSH, &um->tio_stdin); + if (isatty (STDIN_FILENO) && um->tio_isset) + tcsetattr (STDIN_FILENO, TCSAFLUSH, &um->tio_stdin); return 0; } @@ -2758,6 +2913,12 @@ unix_cli_quit (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { unix_cli_main_t *cm = &unix_cli_main; + unix_cli_file_t *cf = pool_elt_at_index (cm->cli_file_pool, + cm->current_input_file_index); + + /* Cosmetic: suppress the final prompt from appearing before we die */ + cf->is_interactive = 0; + cf->started = 1; vlib_process_signal_event (vm, vlib_current_process (vm), @@ -2940,6 +3101,9 @@ unix_cli_show_history (vlib_main_t * vm, cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index); + if (!cf->is_interactive) + return clib_error_return (0, "invalid for non-interactive sessions"); + if (cf->has_history && cf->history_limit) { i = 1 + cf->command_number - vec_len (cf->command_history); @@ -2985,6 +3149,8 @@ unix_cli_show_terminal (vlib_main_t * vm, vlib_cli_output (vm, "Terminal height: %d\n", cf->height); vlib_cli_output (vm, "ANSI capable: %s\n", cf->ansi_capable ? "yes" : "no"); + vlib_cli_output (vm, "Interactive: %s\n", + cf->is_interactive ? "yes" : "no"); vlib_cli_output (vm, "History enabled: %s%s\n", cf->has_history ? "yes" : "no", !cf->has_history || cf->history_limit ? "" : @@ -3017,6 +3183,7 @@ unix_cli_show_terminal (vlib_main_t * vm, * Terminal width: 123 * Terminal height: 48 * ANSI capable: yes + * Interactive: yes * History enabled: yes * History limit: 50 * Pager enabled: yes @@ -3032,6 +3199,87 @@ VLIB_CLI_COMMAND (cli_unix_cli_show_terminal, static) = { }; /* *INDENT-ON* */ +/** CLI command to display a list of CLI sessions. */ +static clib_error_t * +unix_cli_show_cli_sessions (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + //unix_main_t *um = &unix_main; + unix_cli_main_t *cm = &unix_cli_main; + clib_file_main_t *fm = &file_main; + unix_cli_file_t *cf; + clib_file_t *uf; + vlib_node_t *n; + + vlib_cli_output (vm, "%-5s %-5s %-20s %s", "PNI", "FD", "Name", "Flags"); + +#define fl(x, y) ( (x) ? toupper((y)) : tolower((y)) ) + /* *INDENT-OFF* */ + pool_foreach (cf, cm->cli_file_pool, ({ + uf = pool_elt_at_index (fm->file_pool, cf->clib_file_index); + n = vlib_get_node (vm, cf->process_node_index); + vlib_cli_output (vm, + "%-5d %-5d %-20v %c%c%c%c%c\n", + cf->process_node_index, + uf->file_descriptor, + n->name, + fl (cf->is_interactive, 'i'), + fl (cf->is_socket, 's'), + fl (cf->line_mode, 'l'), + fl (cf->has_epipe, 'p'), + fl (cf->ansi_capable, 'a')); + })); + /* *INDENT-ON* */ +#undef fl + + return 0; +} + +/*? + * Displays a summary of all the current CLI sessions. + * + * Typically used to diagnose connection issues with the CLI + * socket. + * + * @cliexpar + * @cliexstart{show cli-sessions} + * PNI FD Name Flags + * 343 0 unix-cli-stdin IslpA + * 344 7 unix-cli-local:20 ISlpA + * 346 8 unix-cli-local:21 iSLpa + * @cliexend + + * In this example we have the debug console of the running process + * on stdin/out, we have an interactive socket session and we also + * have a non-interactive socket session. + * + * Fields: + * + * - @em PNI: Process node index. + * - @em FD: Unix file descriptor. + * - @em Name: Name of the session. + * - @em Flags: Various flags that describe the state of the session. + * + * @em Flags have the following meanings; lower-case typically negates + * upper-case: + * + * - @em I Interactive session. + * - @em S Connected by socket. + * - @em s Not a socket, likely stdin. + * - @em L Line-by-line mode. + * - @em l Char-by-char mode. + * - @em P EPIPE detected on connection; it will close soon. + * - @em A ANSI-capable terminal. +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (cli_unix_cli_show_cli_sessions, static) = { + .path = "show cli-sessions", + .short_help = "Show current CLI sessions", + .function = unix_cli_show_cli_sessions, +}; +/* *INDENT-ON* */ + /** CLI command to set terminal pager settings. */ static clib_error_t * unix_cli_set_terminal_pager (vlib_main_t * vm, @@ -3044,11 +3292,14 @@ unix_cli_set_terminal_pager (vlib_main_t * vm, unformat_input_t _line_input, *line_input = &_line_input; clib_error_t *error = 0; + cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index); + + if (!cf->is_interactive) + return clib_error_return (0, "invalid for non-interactive sessions"); + if (!unformat_user (input, unformat_line_input, line_input)) return 0; - cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index); - while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { if (unformat (line_input, "on")) @@ -3100,11 +3351,14 @@ unix_cli_set_terminal_history (vlib_main_t * vm, u32 limit; clib_error_t *error = 0; + cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index); + + if (!cf->is_interactive) + return clib_error_return (0, "invalid for non-interactive sessions"); + if (!unformat_user (input, unformat_line_input, line_input)) return 0; - cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index); - while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { if (unformat (line_input, "on")) @@ -3162,6 +3416,9 @@ unix_cli_set_terminal_ansi (vlib_main_t * vm, cf = pool_elt_at_index (cm->cli_file_pool, cm->current_input_file_index); + if (!cf->is_interactive) + return clib_error_return (0, "invalid for non-interactive sessions"); + if (unformat (input, "on")) cf->ansi_capable = 1; else if (unformat (input, "off")) diff --git a/src/vpp/app/vppctl.c b/src/vpp/app/vppctl.c index 980936f1..f81a0ce9 100644 --- a/src/vpp/app/vppctl.c +++ b/src/vpp/app/vppctl.c @@ -41,12 +41,16 @@ volatile int window_resized = 0; struct termios orig_tio; static void -send_ttype (clib_socket_t * s, int is_dumb) +send_ttype (clib_socket_t * s, int is_interactive) { + char *term; + + term = is_interactive ? getenv ("TERM") : "vppctl"; + if (term == NULL) + term = "dumb"; + clib_socket_tx_add_formatted (s, "%c%c%c" "%c%s" "%c%c", - IAC, SB, TELOPT_TTYPE, - 0, is_dumb ? "dumb" : getenv ("TERM"), - IAC, SE); + IAC, SB, TELOPT_TTYPE, 0, term, IAC, SE); clib_socket_tx (s); } @@ -81,7 +85,8 @@ signal_handler_term (int signum) } static u8 * -process_input (u8 * str, clib_socket_t * s, int is_interactive) +process_input (u8 * str, clib_socket_t * s, int is_interactive, + int *sent_ttype) { int i = 0; @@ -104,7 +109,10 @@ process_input (u8 * str, clib_socket_t * s, int is_interactive) vec_free (sb); i += 2; if (opt == TELOPT_TTYPE) - send_ttype (s, !is_interactive); + { + send_ttype (s, is_interactive); + *sent_ttype = 1; + } else if (is_interactive && opt == TELOPT_NAWS) send_naws (s); } @@ -138,6 +146,8 @@ main (int argc, char *argv[]) u8 *str = 0; u8 *cmd = 0; int do_quit = 0; + int is_interactive = 0; + int sent_ttype = 0; clib_mem_init (0, 64ULL << 10); @@ -164,33 +174,47 @@ main (int argc, char *argv[]) if (error) goto done; - /* Capture terminal resize events */ - memset (&sa, 0, sizeof (struct sigaction)); - sa.sa_handler = signal_handler_winch; + is_interactive = isatty (STDIN_FILENO) && cmd == 0; - if (sigaction (SIGWINCH, &sa, 0) < 0) + if (is_interactive) { - error = clib_error_return_unix (0, "sigaction"); - goto done; - } + /* Capture terminal resize events */ + memset (&sa, 0, sizeof (struct sigaction)); + sa.sa_handler = signal_handler_winch; + if (sigaction (SIGWINCH, &sa, 0) < 0) + { + error = clib_error_return_unix (0, "sigaction"); + goto done; + } - sa.sa_handler = signal_handler_term; - if (sigaction (SIGTERM, &sa, 0) < 0) - { - error = clib_error_return_unix (0, "sigaction"); - goto done; - } + /* Capture SIGTERM to reset tty settings */ + sa.sa_handler = signal_handler_term; + if (sigaction (SIGTERM, &sa, 0) < 0) + { + error = clib_error_return_unix (0, "sigaction"); + goto done; + } - /* Save the original tty state so we can restore it later */ - tcgetattr (STDIN_FILENO, &orig_tio); + /* Save the original tty state so we can restore it later */ + if (tcgetattr (STDIN_FILENO, &orig_tio) < 0) + { + error = clib_error_return_unix (0, "tcgetattr"); + goto done; + } + + /* Tweak the tty settings */ + tio = orig_tio; + /* echo off, canonical mode off, ext'd input processing off */ + tio.c_lflag &= ~(ECHO | ICANON | IEXTEN); + tio.c_cc[VMIN] = 1; /* 1 byte at a time */ + tio.c_cc[VTIME] = 0; /* no timer */ - /* Tweak the tty settings */ - tio = orig_tio; - /* echo off, canonical mode off, ext'd input processing off */ - tio.c_lflag &= ~(ECHO | ICANON | IEXTEN); - tio.c_cc[VMIN] = 1; /* 1 byte at a time */ - tio.c_cc[VTIME] = 0; /* no timer */ - tcsetattr (STDIN_FILENO, TCSAFLUSH, &tio); + if (tcsetattr (STDIN_FILENO, TCSAFLUSH, &tio) < 0) + { + error = clib_error_return_unix (0, "tcsetattr"); + goto done; + } + } efd = epoll_create1 (0); @@ -199,8 +223,12 @@ main (int argc, char *argv[]) event.data.fd = STDIN_FILENO; if (epoll_ctl (efd, EPOLL_CTL_ADD, STDIN_FILENO, &event) != 0) { - error = clib_error_return_unix (0, "epoll_ctl[%d]", STDIN_FILENO); - goto done; + /* ignore EPERM; it means stdin is something like /dev/null */ + if (errno != EPERM) + { + error = clib_error_return_unix (0, "epoll_ctl[%d]", STDIN_FILENO); + goto done; + } } /* register socket */ @@ -240,6 +268,9 @@ main (int argc, char *argv[]) int n; char c[100]; + if (!sent_ttype) + continue; /* not ready for this yet */ + n = read (STDIN_FILENO, c, sizeof (c)); if (n > 0) { @@ -250,6 +281,8 @@ main (int argc, char *argv[]) } else if (n < 0) clib_warning ("read rv=%d", n); + else /* EOF */ + do_quit = 1; } else if (event.data.fd == s->fd) { @@ -260,27 +293,49 @@ main (int argc, char *argv[]) if (clib_socket_rx_end_of_file (s)) break; - str = process_input (str, s, cmd == 0); + str = process_input (str, s, is_interactive, &sent_ttype); if (vec_len (str) > 0) { - n = write (STDOUT_FILENO, str, vec_len (str)); - if (n < 0) + int len = vec_len (str); + u8 *p = str, *q = str; + + while (len) { - error = clib_error_return_unix (0, "write"); - goto done; + /* Search for and skip NUL bytes */ + while (q < (p + len) && *q) + q++; + + n = write (STDOUT_FILENO, p, q - p); + if (n < 0) + { + error = clib_error_return_unix (0, "write"); + goto done; + } + + while (q < (p + len) && !*q) + q++; + len -= q - p; + p = q; } + vec_reset_length (str); } if (do_quit) { - clib_socket_tx_add_formatted (s, "q\n"); + /* Ask the other end to close the connection */ + clib_socket_tx_add_formatted (s, "quit\n"); clib_socket_tx (s); do_quit = 0; } - if (cmd) + if (cmd && sent_ttype) { + /* We wait until after the TELNET TTYPE option has been sent. + * That is to make sure the session at the VPP end has switched + * to line-by-line mode, and thus avoid prompts and echoing. + * Note that it does also disable further TELNET option processing. + */ clib_socket_tx_add_formatted (s, "%s\n", cmd); clib_socket_tx (s); vec_free (cmd); @@ -302,12 +357,15 @@ done: if (efd > -1) close (efd); + if (is_interactive) + tcsetattr (STDIN_FILENO, TCSAFLUSH, &orig_tio); + if (error) { clib_error_report (error); return 1; } - tcsetattr (STDIN_FILENO, TCSAFLUSH, &orig_tio); + return 0; } -- cgit 1.2.3-korg From b66201f76143391cc9b81d9c419919b9fd914a24 Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Fri, 22 Sep 2017 13:34:39 -0400 Subject: Fix vpp "unix interactive" when running under emacs + gdb In this specific corner-case setup, ioctl (0, TIOCGWINSZ) returns window height = 0 and width = 0. Rather than declaring the terminal to be non-interactive, set the window size parameters to 80 x 24. Change-Id: If66f5f0883f1940518ec1c6e26228c9bb6f32852 Signed-off-by: Dave Barach --- src/vlib/unix/cli.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/unix/cli.c b/src/vlib/unix/cli.c index ffe5de6d..be3c813a 100644 --- a/src/vlib/unix/cli.c +++ b/src/vlib/unix/cli.c @@ -2776,8 +2776,14 @@ unix_cli_config (vlib_main_t * vm, unformat_input_t * input) cf->height = ws.ws_row; if (cf->width == 0 || cf->height == 0) - /* We have a tty, but no size. Stick to line mode. */ - goto notty; + { + /* + * We have a tty, but no size. Use defaults. + * vpp "unix interactive" inside emacs + gdb ends up here. + */ + cf->width = 80; + cf->height = 24; + } /* Setup the history */ cf->history_limit = um->cli_history_limit; @@ -2816,7 +2822,6 @@ unix_cli_config (vlib_main_t * vm, unformat_input_t * input) } else { - notty: /* No tty, so make sure the session doesn't have tty-like features */ unix_cli_set_session_noninteractive (cf); } -- cgit 1.2.3-korg From 69128d0209ba6108430dca9cc78ab36a9b1c793e Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Tue, 26 Sep 2017 10:54:34 -0400 Subject: Add thread-safe event signaller, use RPC where required Update ping code to use the new function Change-Id: Ieb753b23f8402cbe5667c22747896784c8ece937 Signed-off-by: Florin Coras Signed-off-by: Dave Barach --- src/vlib/node_funcs.h | 23 +++++++++++++++++++++++ src/vlib/threads.c | 24 +++++++++++++++++++++++- src/vlib/threads.h | 14 +++++++++++++- src/vlibmemory/memory_vlib.c | 13 ++++++++++++- src/vnet/ip/ping.c | 4 ++-- 5 files changed, 73 insertions(+), 5 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/node_funcs.h b/src/vlib/node_funcs.h index 3ae4e541..0734476c 100644 --- a/src/vlib/node_funcs.h +++ b/src/vlib/node_funcs.h @@ -965,6 +965,29 @@ vlib_process_signal_event_pointer (vlib_main_t * vm, d[0] = data; } +/** + * Signal event to process from any thread. + * + * When in doubt, use this. + */ +always_inline void +vlib_process_signal_event_mt (vlib_main_t * vm, + uword node_index, uword type_opaque, uword data) +{ + if (vlib_get_thread_index () != 0) + { + vlib_process_signal_event_mt_args_t args = { + .node_index = node_index, + .type_opaque = type_opaque, + .data = data, + }; + vlib_rpc_call_main_thread (vlib_process_signal_event_mt_helper, + (u8 *) & args, sizeof (args)); + } + else + vlib_process_signal_event (vm, node_index, type_opaque, data); +} + always_inline void vlib_process_signal_one_time_event (vlib_main_t * vm, uword node_index, diff --git a/src/vlib/threads.c b/src/vlib/threads.c index f9c7043c..be8daa64 100644 --- a/src/vlib/threads.c +++ b/src/vlib/threads.c @@ -1767,7 +1767,6 @@ vlib_frame_queue_main_init (u32 node_index, u32 frame_queue_nelts) return (fqm - tm->frame_queue_mains); } - int vlib_thread_cb_register (struct vlib_main_t *vm, vlib_thread_callbacks_t * cb) { @@ -1781,6 +1780,29 @@ vlib_thread_cb_register (struct vlib_main_t *vm, vlib_thread_callbacks_t * cb) return 0; } +void +vlib_process_signal_event_mt_helper (vlib_process_signal_event_mt_args_t * + args) +{ + ASSERT (vlib_get_thread_index () == 0); + vlib_process_signal_event (vlib_get_main (), args->node_index, + args->type_opaque, args->data); +} + +void *rpc_call_main_thread_cb_fn; + +void +vlib_rpc_call_main_thread (void *callback, u8 * args, u32 arg_size) +{ + if (rpc_call_main_thread_cb_fn) + { + void (*fp) (void *, u8 *, u32) = rpc_call_main_thread_cb_fn; + (*fp) (callback, args, arg_size); + } + else + clib_warning ("BUG: rpc_call_main_thread_cb_fn NULL!"); +} + clib_error_t * threads_init (vlib_main_t * vm) { diff --git a/src/vlib/threads.h b/src/vlib/threads.h index 72340ee1..8931584b 100644 --- a/src/vlib/threads.h +++ b/src/vlib/threads.h @@ -171,6 +171,13 @@ typedef struct frame_queue_nelt_counter_t *frame_queue_histogram; } vlib_frame_queue_main_t; +typedef struct +{ + uword node_index; + uword type_opaque; + uword data; +} vlib_process_signal_event_mt_args_t; + /* Called early, in thread 0's context */ clib_error_t *vlib_thread_init (vlib_main_t * vm); @@ -510,9 +517,14 @@ vlib_get_worker_handoff_queue_elt (u32 frame_queue_index, } u8 *vlib_thread_stack_init (uword thread_index); - int vlib_thread_cb_register (struct vlib_main_t *vm, vlib_thread_callbacks_t * cb); +extern void *rpc_call_main_thread_cb_fn; + +void +vlib_process_signal_event_mt_helper (vlib_process_signal_event_mt_args_t * + args); +void vlib_rpc_call_main_thread (void *function, u8 * args, u32 size); #endif /* included_vlib_threads_h */ diff --git a/src/vlibmemory/memory_vlib.c b/src/vlibmemory/memory_vlib.c index 77959e6d..d305ea61 100644 --- a/src/vlibmemory/memory_vlib.c +++ b/src/vlibmemory/memory_vlib.c @@ -1573,6 +1573,17 @@ _(RPC_CALL_REPLY,rpc_call_reply) #define foreach_plugin_trace_msg \ _(TRACE_PLUGIN_MSG_IDS,trace_plugin_msg_ids) +/* + * Set the rpc callback at our earliest possible convenience. + * This avoids ordering issues between thread_init() -> start_workers and + * an init function which we could define here. If we ever intend to use + * vlib all by itself, we can't create a link-time dependency on + * an init function here and a typical "call foo_init first" + * guitar lick. + */ + +extern void *rpc_call_main_thread_cb_fn; + static clib_error_t * rpc_api_hookup (vlib_main_t * vm) { @@ -1599,7 +1610,7 @@ rpc_api_hookup (vlib_main_t * vm) /* No reason to halt the parade to create a trace record... */ am->is_mp_safe[VL_API_TRACE_PLUGIN_MSG_IDS] = 1; - + rpc_call_main_thread_cb_fn = vl_api_rpc_call_main_thread; return 0; } diff --git a/src/vnet/ip/ping.c b/src/vnet/ip/ping.c index c847e696..0fa537f6 100755 --- a/src/vnet/ip/ping.c +++ b/src/vnet/ip/ping.c @@ -97,7 +97,7 @@ signal_ip46_icmp_reply_event (u8 event_type, vlib_buffer_t * b0) clib_memcpy (vnet_buffer (vlib_get_buffer (vm, bi0_copy))->unused, &nowts, sizeof (nowts)); - vlib_process_signal_event (vm, pr->cli_process_id, event_type, bi0_copy); + vlib_process_signal_event_mt (vm, pr->cli_process_id, event_type, bi0_copy); return 1; } @@ -646,7 +646,7 @@ run_ping_ip46_address (vlib_main_t * vm, u32 table_id, ip4_address_t * pa4, i = 1 + ping_repeat; break; } - vec_free(event_data); + vec_free (event_data); } } vlib_cli_output (vm, "\n"); -- cgit 1.2.3-korg From 84db40e02e74efabe094e0d46ed2fafd348d126d Mon Sep 17 00:00:00 2001 From: Dave Barach Date: Fri, 13 Oct 2017 19:16:56 -0400 Subject: VPP-1029: Don't call clib_longjmp(...) directly from the SIGTERM handler It's way too easy to imagine leaving a mutex or a spin-lock held in the /vpe-api shared-memory segment, or elsewhere. Set a volatile variable and check it in a safe place... Change-Id: I9d91c38cffeb921143c272162d055c9c24a6c312 Signed-off-by: Dave Barach (cherry picked from commit 903651caf320dfdaabd20a0e6f3cd0ffc843e02f) --- src/vlib/main.h | 5 +++++ src/vlib/unix/main.c | 7 +++---- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'src/vlib') diff --git a/src/vlib/main.h b/src/vlib/main.h index fb67334e..4288d6f0 100644 --- a/src/vlib/main.h +++ b/src/vlib/main.h @@ -85,6 +85,8 @@ typedef struct vlib_main_t /* Jump target to exit main loop with given code. */ u32 main_loop_exit_set; + /* Set e.g. in the SIGTERM signal handler, checked in a safe place... */ + volatile u32 main_loop_exit_now; clib_longjmp_t main_loop_exit; #define VLIB_MAIN_LOOP_EXIT_NONE 0 #define VLIB_MAIN_LOOP_EXIT_PANIC 1 @@ -340,6 +342,9 @@ vlib_increment_main_loop_counter (vlib_main_t * vm) vm->main_loop_nodes_processed = 0; vm->vector_counts_per_main_loop[i] = v; vm->node_counts_per_main_loop[i] = n; + + if (PREDICT_FALSE (vm->main_loop_exit_now)) + clib_longjmp (&vm->main_loop_exit, VLIB_MAIN_LOOP_EXIT_CLI); } always_inline void vlib_set_queue_signal_callback diff --git a/src/vlib/unix/main.c b/src/vlib/unix/main.c index ed0631ec..f286c870 100644 --- a/src/vlib/unix/main.c +++ b/src/vlib/unix/main.c @@ -75,7 +75,7 @@ VLIB_INIT_FUNCTION (unix_main_init); static void unix_signal_handler (int signum, siginfo_t * si, ucontext_t * uc) { - uword fatal; + uword fatal = 0; u8 *msg = 0; msg = format (msg, "received signal %U, PC %U", @@ -91,10 +91,9 @@ unix_signal_handler (int signum, siginfo_t * si, ucontext_t * uc) if (unix_main.vlib_main->main_loop_exit_set) { syslog (LOG_ERR | LOG_DAEMON, "received SIGTERM, exiting..."); - - clib_longjmp (&unix_main.vlib_main->main_loop_exit, - VLIB_MAIN_LOOP_EXIT_CLI); + unix_main.vlib_main->main_loop_exit_now = 1; } + break; /* fall through */ case SIGQUIT: case SIGINT: -- cgit 1.2.3-korg